aboutsummaryrefslogtreecommitdiffstats
path: root/storage/src
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@yahoo-inc.com>2016-06-15 23:09:44 +0200
committerJon Bratseth <bratseth@yahoo-inc.com>2016-06-15 23:09:44 +0200
commit72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /storage/src
Publish
Diffstat (limited to 'storage/src')
-rw-r--r--storage/src/.gitignore10
-rw-r--r--storage/src/Doxyfile994
-rw-r--r--storage/src/tests/.gitignore22
-rw-r--r--storage/src/tests/CMakeLists.txt22
-rw-r--r--storage/src/tests/bucketdb/.gitignore12
-rw-r--r--storage/src/tests/bucketdb/CMakeLists.txt14
-rw-r--r--storage/src/tests/bucketdb/bucketinfotest.cpp201
-rw-r--r--storage/src/tests/bucketdb/bucketmanagertest.cpp1323
-rw-r--r--storage/src/tests/bucketdb/distribution_hash_normalizer_test.cpp114
-rw-r--r--storage/src/tests/bucketdb/initializertest.cpp924
-rw-r--r--storage/src/tests/bucketdb/judyarraytest.cpp287
-rw-r--r--storage/src/tests/bucketdb/judymultimaptest.cpp172
-rw-r--r--storage/src/tests/bucketdb/lockablemaptest.cpp1262
-rw-r--r--storage/src/tests/bucketmover/CMakeLists.txt9
-rw-r--r--storage/src/tests/bucketmover/bucketmovertest.cpp190
-rw-r--r--storage/src/tests/bucketmover/htmltabletest.cpp100
-rw-r--r--storage/src/tests/common/.gitignore8
-rw-r--r--storage/src/tests/common/CMakeLists.txt12
-rw-r--r--storage/src/tests/common/dummystoragelink.cpp191
-rw-r--r--storage/src/tests/common/dummystoragelink.h121
-rw-r--r--storage/src/tests/common/hostreporter/CMakeLists.txt14
-rw-r--r--storage/src/tests/common/hostreporter/cpureportertest.cpp40
-rw-r--r--storage/src/tests/common/hostreporter/diskreportertest.cpp33
-rw-r--r--storage/src/tests/common/hostreporter/hostinfotest.cpp60
-rw-r--r--storage/src/tests/common/hostreporter/memreportertest.cpp44
-rw-r--r--storage/src/tests/common/hostreporter/networkreportertest.cpp40
-rw-r--r--storage/src/tests/common/hostreporter/util.cpp34
-rw-r--r--storage/src/tests/common/hostreporter/util.h16
-rw-r--r--storage/src/tests/common/hostreporter/versionreportertest.cpp39
-rw-r--r--storage/src/tests/common/metricstest.cpp393
-rw-r--r--storage/src/tests/common/storagelinktest.cpp57
-rw-r--r--storage/src/tests/common/storagelinktest.h46
-rw-r--r--storage/src/tests/common/testhelper.cpp209
-rw-r--r--storage/src/tests/common/testhelper.h58
-rw-r--r--storage/src/tests/common/testnodestateupdater.h50
-rw-r--r--storage/src/tests/common/teststorageapp.cpp292
-rw-r--r--storage/src/tests/common/teststorageapp.h161
-rw-r--r--storage/src/tests/config-doctypes.cfg158
-rw-r--r--storage/src/tests/config-document.cfg78
-rw-r--r--storage/src/tests/config-testdocman-document.cfg138
-rw-r--r--storage/src/tests/distributor/.gitignore8
-rw-r--r--storage/src/tests/distributor/CMakeLists.txt44
-rw-r--r--storage/src/tests/distributor/blockingoperationstartertest.cpp78
-rw-r--r--storage/src/tests/distributor/bucketdatabasetest.cpp550
-rw-r--r--storage/src/tests/distributor/bucketdatabasetest.h63
-rw-r--r--storage/src/tests/distributor/bucketdbmetricupdatertest.cpp361
-rw-r--r--storage/src/tests/distributor/bucketdbupdatertest.cpp2296
-rw-r--r--storage/src/tests/distributor/bucketgctimecalculatortest.cpp114
-rw-r--r--storage/src/tests/distributor/bucketstateoperationtest.cpp251
-rw-r--r--storage/src/tests/distributor/distributor_host_info_reporter_test.cpp225
-rw-r--r--storage/src/tests/distributor/distributortest.cpp691
-rw-r--r--storage/src/tests/distributor/distributortestutil.cpp298
-rw-r--r--storage/src/tests/distributor/distributortestutil.h200
-rw-r--r--storage/src/tests/distributor/externaloperationhandlertest.cpp176
-rw-r--r--storage/src/tests/distributor/garbagecollectiontest.cpp77
-rw-r--r--storage/src/tests/distributor/getoperationtest.cpp567
-rw-r--r--storage/src/tests/distributor/idealstatemanagertest.cpp268
-rw-r--r--storage/src/tests/distributor/joinbuckettest.cpp127
-rw-r--r--storage/src/tests/distributor/maintenancemocks.h123
-rw-r--r--storage/src/tests/distributor/maintenanceschedulertest.cpp108
-rw-r--r--storage/src/tests/distributor/mapbucketdatabasetest.cpp26
-rw-r--r--storage/src/tests/distributor/mergelimitertest.cpp161
-rw-r--r--storage/src/tests/distributor/mergeoperationtest.cpp430
-rw-r--r--storage/src/tests/distributor/messagesenderstub.cpp88
-rw-r--r--storage/src/tests/distributor/messagesenderstub.h71
-rw-r--r--storage/src/tests/distributor/nodeinfotest.cpp83
-rw-r--r--storage/src/tests/distributor/nodemaintenancestatstrackertest.cpp102
-rw-r--r--storage/src/tests/distributor/operationtargetresolvertest.cpp316
-rw-r--r--storage/src/tests/distributor/pendingmessagetrackertest.cpp674
-rw-r--r--storage/src/tests/distributor/putoperationtest.cpp704
-rw-r--r--storage/src/tests/distributor/removebucketoperationtest.cpp150
-rw-r--r--storage/src/tests/distributor/removelocationtest.cpp84
-rw-r--r--storage/src/tests/distributor/removeoperationtest.cpp203
-rw-r--r--storage/src/tests/distributor/simplebucketprioritydatabasetest.cpp143
-rw-r--r--storage/src/tests/distributor/simplemaintenancescannertest.cpp220
-rw-r--r--storage/src/tests/distributor/splitbuckettest.cpp353
-rw-r--r--storage/src/tests/distributor/statecheckerstest.cpp1838
-rw-r--r--storage/src/tests/distributor/statoperationtest.cpp115
-rw-r--r--storage/src/tests/distributor/statusreporterdelegatetest.cpp87
-rw-r--r--storage/src/tests/distributor/throttlingoperationstartertest.cpp142
-rw-r--r--storage/src/tests/distributor/twophaseupdateoperationtest.cpp1194
-rw-r--r--storage/src/tests/distributor/updateoperationtest.cpp210
-rw-r--r--storage/src/tests/distributor/visitoroperationtest.cpp1646
-rw-r--r--storage/src/tests/fastos.project.newcore80
-rw-r--r--storage/src/tests/frameworkimpl/memory/CMakeLists.txt8
-rw-r--r--storage/src/tests/frameworkimpl/memory/memorystatusviewertest.cpp168
-rw-r--r--storage/src/tests/frameworkimpl/status/CMakeLists.txt8
-rw-r--r--storage/src/tests/frameworkimpl/status/statustest.cpp222
-rw-r--r--storage/src/tests/persistence/.gitignore12
-rw-r--r--storage/src/tests/persistence/CMakeLists.txt19
-rw-r--r--storage/src/tests/persistence/bucketownershipnotifiertest.cpp162
-rw-r--r--storage/src/tests/persistence/diskmoveoperationhandlertest.cpp57
-rw-r--r--storage/src/tests/persistence/filestorage/.gitignore13
-rw-r--r--storage/src/tests/persistence/filestorage/CMakeLists.txt17
-rw-r--r--storage/src/tests/persistence/filestorage/deactivatebucketstest.cpp66
-rw-r--r--storage/src/tests/persistence/filestorage/deletebuckettest.cpp63
-rw-r--r--storage/src/tests/persistence/filestorage/filestormanagertest.cpp3150
-rw-r--r--storage/src/tests/persistence/filestorage/filestormodifiedbucketstest.cpp142
-rw-r--r--storage/src/tests/persistence/filestorage/filestortestfixture.cpp143
-rw-r--r--storage/src/tests/persistence/filestorage/filestortestfixture.h112
-rw-r--r--storage/src/tests/persistence/filestorage/forwardingmessagesender.h26
-rw-r--r--storage/src/tests/persistence/filestorage/mergeblockingtest.cpp239
-rw-r--r--storage/src/tests/persistence/filestorage/modifiedbucketcheckertest.cpp214
-rw-r--r--storage/src/tests/persistence/filestorage/operationabortingtest.cpp470
-rw-r--r--storage/src/tests/persistence/filestorage/sanitycheckeddeletetest.cpp78
-rw-r--r--storage/src/tests/persistence/filestorage/singlebucketjointest.cpp51
-rw-r--r--storage/src/tests/persistence/legacyoperationhandlertest.cpp190
-rw-r--r--storage/src/tests/persistence/mergehandlertest.cpp1494
-rw-r--r--storage/src/tests/persistence/persistenceproviderwrapper.cpp222
-rw-r--r--storage/src/tests/persistence/persistenceproviderwrapper.h153
-rw-r--r--storage/src/tests/persistence/persistencequeuetest.cpp103
-rw-r--r--storage/src/tests/persistence/persistencetestutils.cpp412
-rw-r--r--storage/src/tests/persistence/persistencetestutils.h214
-rw-r--r--storage/src/tests/persistence/persistencethread_splittest.cpp234
-rw-r--r--storage/src/tests/persistence/processalltest.cpp262
-rw-r--r--storage/src/tests/persistence/providershutdownwrappertest.cpp87
-rw-r--r--storage/src/tests/persistence/splitbitdetectortest.cpp363
-rw-r--r--storage/src/tests/persistence/testandsettest.cpp331
-rwxr-xr-xstorage/src/tests/pstack_testrunner14
-rw-r--r--storage/src/tests/serverapp/.gitignore8
-rw-r--r--storage/src/tests/storageserver/.gitignore13
-rw-r--r--storage/src/tests/storageserver/CMakeLists.txt17
-rw-r--r--storage/src/tests/storageserver/bouncertest.cpp285
-rw-r--r--storage/src/tests/storageserver/bucketintegritycheckertest.cpp302
-rw-r--r--storage/src/tests/storageserver/changedbucketownershiphandlertest.cpp648
-rw-r--r--storage/src/tests/storageserver/communicationmanagertest.cpp235
-rw-r--r--storage/src/tests/storageserver/documentapiconvertertest.cpp529
-rw-r--r--storage/src/tests/storageserver/dummystoragelink.cpp182
-rw-r--r--storage/src/tests/storageserver/dummystoragelink.h115
-rw-r--r--storage/src/tests/storageserver/mergethrottlertest.cpp1566
-rw-r--r--storage/src/tests/storageserver/priorityconvertertest.cpp104
-rw-r--r--storage/src/tests/storageserver/statemanagertest.cpp264
-rw-r--r--storage/src/tests/storageserver/statereportertest.cpp279
-rw-r--r--storage/src/tests/storageserver/testvisitormessagesession.cpp78
-rw-r--r--storage/src/tests/storageserver/testvisitormessagesession.h79
-rw-r--r--storage/src/tests/storageutil/.gitignore13
-rw-r--r--storage/src/tests/storageutil/CMakeLists.txt10
-rw-r--r--storage/src/tests/storageutil/charttest.cpp66
-rw-r--r--storage/src/tests/storageutil/functortest.cpp55
-rw-r--r--storage/src/tests/storageutil/palettetest.cpp33
-rw-r--r--storage/src/tests/storageutil/recordflatfiletest.cpp314
-rw-r--r--storage/src/tests/subscriptions/.gitignore8
-rw-r--r--storage/src/tests/systemtests/.gitignore0
-rw-r--r--storage/src/tests/testhelper.cpp175
-rw-r--r--storage/src/tests/testhelper.h58
-rw-r--r--storage/src/tests/testrunner.cpp15
-rw-r--r--storage/src/tests/visiting/.gitignore12
-rw-r--r--storage/src/tests/visiting/CMakeLists.txt11
-rw-r--r--storage/src/tests/visiting/commandqueuetest.cpp223
-rw-r--r--storage/src/tests/visiting/memory_bounded_trace_test.cpp131
-rw-r--r--storage/src/tests/visiting/visitormanagertest.cpp1172
-rw-r--r--storage/src/tests/visiting/visitortest.cpp1023
-rw-r--r--storage/src/versiontag.mak7
-rw-r--r--storage/src/vespa/storage/.gitignore9
-rw-r--r--storage/src/vespa/storage/CMakeLists.txt25
-rw-r--r--storage/src/vespa/storage/bucketdb/.gitignore11
-rw-r--r--storage/src/vespa/storage/bucketdb/CMakeLists.txt17
-rw-r--r--storage/src/vespa/storage/bucketdb/bucketmanager.cpp871
-rw-r--r--storage/src/vespa/storage/bucketdb/bucketmanager.h245
-rw-r--r--storage/src/vespa/storage/bucketdb/bucketmanagermetrics.h80
-rw-r--r--storage/src/vespa/storage/bucketdb/distrbucketdb.cpp44
-rw-r--r--storage/src/vespa/storage/bucketdb/distrbucketdb.h53
-rw-r--r--storage/src/vespa/storage/bucketdb/distribution_hash_normalizer.cpp205
-rw-r--r--storage/src/vespa/storage/bucketdb/distribution_hash_normalizer.h28
-rw-r--r--storage/src/vespa/storage/bucketdb/judyarray.cpp90
-rw-r--r--storage/src/vespa/storage/bucketdb/judyarray.h266
-rw-r--r--storage/src/vespa/storage/bucketdb/judymultimap.h561
-rw-r--r--storage/src/vespa/storage/bucketdb/lockablemap.h1067
-rw-r--r--storage/src/vespa/storage/bucketdb/minimumusedbitstracker.h43
-rw-r--r--storage/src/vespa/storage/bucketdb/stdmapwrapper.h94
-rw-r--r--storage/src/vespa/storage/bucketdb/stor-bucket-init.def35
-rw-r--r--storage/src/vespa/storage/bucketdb/stor-bucketdb.def9
-rw-r--r--storage/src/vespa/storage/bucketdb/storagebucketdbinitializer.cpp785
-rw-r--r--storage/src/vespa/storage/bucketdb/storagebucketdbinitializer.h229
-rw-r--r--storage/src/vespa/storage/bucketdb/storbucketdb.cpp66
-rw-r--r--storage/src/vespa/storage/bucketdb/storbucketdb.h83
-rw-r--r--storage/src/vespa/storage/bucketmover/CMakeLists.txt11
-rw-r--r--storage/src/vespa/storage/bucketmover/bucketmover.cpp541
-rw-r--r--storage/src/vespa/storage/bucketmover/bucketmover.h99
-rw-r--r--storage/src/vespa/storage/bucketmover/htmltable.h318
-rw-r--r--storage/src/vespa/storage/bucketmover/move.cpp45
-rw-r--r--storage/src/vespa/storage/bucketmover/move.h44
-rw-r--r--storage/src/vespa/storage/bucketmover/run.cpp250
-rw-r--r--storage/src/vespa/storage/bucketmover/run.h104
-rw-r--r--storage/src/vespa/storage/bucketmover/runstatistics.cpp197
-rw-r--r--storage/src/vespa/storage/bucketmover/runstatistics.h102
-rw-r--r--storage/src/vespa/storage/common/.gitignore8
-rw-r--r--storage/src/vespa/storage/common/CMakeLists.txt16
-rw-r--r--storage/src/vespa/storage/common/bucketmessages.h477
-rw-r--r--storage/src/vespa/storage/common/bucketoperationlogger.cpp331
-rw-r--r--storage/src/vespa/storage/common/bucketoperationlogger.h126
-rw-r--r--storage/src/vespa/storage/common/distributorcomponent.h125
-rw-r--r--storage/src/vespa/storage/common/doneinitializehandler.h21
-rw-r--r--storage/src/vespa/storage/common/hostreporter/CMakeLists.txt14
-rw-r--r--storage/src/vespa/storage/common/hostreporter/cpureporter.cpp150
-rw-r--r--storage/src/vespa/storage/common/hostreporter/cpureporter.h19
-rw-r--r--storage/src/vespa/storage/common/hostreporter/diskreporter.cpp68
-rw-r--r--storage/src/vespa/storage/common/hostreporter/diskreporter.h19
-rw-r--r--storage/src/vespa/storage/common/hostreporter/hostinfo.cpp29
-rw-r--r--storage/src/vespa/storage/common/hostreporter/hostinfo.h40
-rw-r--r--storage/src/vespa/storage/common/hostreporter/hostreporter.h17
-rw-r--r--storage/src/vespa/storage/common/hostreporter/kernelmetrictool.cpp75
-rw-r--r--storage/src/vespa/storage/common/hostreporter/kernelmetrictool.h30
-rw-r--r--storage/src/vespa/storage/common/hostreporter/memreporter.cpp72
-rw-r--r--storage/src/vespa/storage/common/hostreporter/memreporter.h19
-rw-r--r--storage/src/vespa/storage/common/hostreporter/networkreporter.cpp48
-rw-r--r--storage/src/vespa/storage/common/hostreporter/networkreporter.h19
-rw-r--r--storage/src/vespa/storage/common/hostreporter/versionreporter.cpp19
-rw-r--r--storage/src/vespa/storage/common/hostreporter/versionreporter.h20
-rw-r--r--storage/src/vespa/storage/common/messagebucketid.cpp100
-rw-r--r--storage/src/vespa/storage/common/messagebucketid.h22
-rw-r--r--storage/src/vespa/storage/common/messagesender.cpp20
-rw-r--r--storage/src/vespa/storage/common/messagesender.h44
-rw-r--r--storage/src/vespa/storage/common/nodestateupdater.h70
-rw-r--r--storage/src/vespa/storage/common/servicelayercomponent.cpp26
-rw-r--r--storage/src/vespa/storage/common/servicelayercomponent.h93
-rw-r--r--storage/src/vespa/storage/common/statusmessages.h99
-rw-r--r--storage/src/vespa/storage/common/statusmetricconsumer.cpp1081
-rw-r--r--storage/src/vespa/storage/common/statusmetricconsumer.h83
-rw-r--r--storage/src/vespa/storage/common/storagecomponent.cpp136
-rw-r--r--storage/src/vespa/storage/common/storagecomponent.h125
-rw-r--r--storage/src/vespa/storage/common/storagelink.cpp306
-rw-r--r--storage/src/vespa/storage/common/storagelink.h198
-rw-r--r--storage/src/vespa/storage/common/storagelinkqueued.cpp64
-rw-r--r--storage/src/vespa/storage/common/storagelinkqueued.h247
-rw-r--r--storage/src/vespa/storage/common/vectorprinter.h48
-rw-r--r--storage/src/vespa/storage/common/visitorfactory.h40
-rw-r--r--storage/src/vespa/storage/common/vtag.cpp81
-rw-r--r--storage/src/vespa/storage/common/vtag.h24
-rw-r--r--storage/src/vespa/storage/config/.gitignore11
-rw-r--r--storage/src/vespa/storage/config/CMakeLists.txt29
-rw-r--r--storage/src/vespa/storage/config/rpc-provider.def4
-rw-r--r--storage/src/vespa/storage/config/stor-bouncer.def30
-rw-r--r--storage/src/vespa/storage/config/stor-bucketmover.def37
-rw-r--r--storage/src/vespa/storage/config/stor-communicationmanager.def19
-rw-r--r--storage/src/vespa/storage/config/stor-distributormanager.def169
-rw-r--r--storage/src/vespa/storage/config/stor-integritychecker.def38
-rw-r--r--storage/src/vespa/storage/config/stor-messageforwarder.def4
-rw-r--r--storage/src/vespa/storage/config/stor-opslogger.def4
-rw-r--r--storage/src/vespa/storage/config/stor-prioritymapping.def20
-rw-r--r--storage/src/vespa/storage/config/stor-server.def78
-rw-r--r--storage/src/vespa/storage/config/stor-status.def4
-rw-r--r--storage/src/vespa/storage/config/stor-visitordispatcher.def13
-rw-r--r--storage/src/vespa/storage/distributor/.gitignore4
-rw-r--r--storage/src/vespa/storage/distributor/CMakeLists.txt35
-rw-r--r--storage/src/vespa/storage/distributor/activecopy.cpp182
-rw-r--r--storage/src/vespa/storage/distributor/activecopy.h44
-rw-r--r--storage/src/vespa/storage/distributor/blockingoperationstarter.cpp20
-rw-r--r--storage/src/vespa/storage/distributor/blockingoperationstarter.h34
-rw-r--r--storage/src/vespa/storage/distributor/bucketdb/CMakeLists.txt13
-rw-r--r--storage/src/vespa/storage/distributor/bucketdb/bucketcopy.cpp24
-rw-r--r--storage/src/vespa/storage/distributor/bucketdb/bucketcopy.h113
-rw-r--r--storage/src/vespa/storage/distributor/bucketdb/bucketdatabase.cpp56
-rw-r--r--storage/src/vespa/storage/distributor/bucketdb/bucketdatabase.h121
-rw-r--r--storage/src/vespa/storage/distributor/bucketdb/bucketdbmetricupdater.cpp139
-rw-r--r--storage/src/vespa/storage/distributor/bucketdb/bucketdbmetricupdater.h104
-rw-r--r--storage/src/vespa/storage/distributor/bucketdb/bucketinfo.cpp316
-rw-r--r--storage/src/vespa/storage/distributor/bucketdb/bucketinfo.h177
-rw-r--r--storage/src/vespa/storage/distributor/bucketdb/judybucketdatabase.cpp187
-rw-r--r--storage/src/vespa/storage/distributor/bucketdb/judybucketdatabase.h47
-rw-r--r--storage/src/vespa/storage/distributor/bucketdb/mapbucketdatabase.cpp515
-rw-r--r--storage/src/vespa/storage/distributor/bucketdb/mapbucketdatabase.h111
-rw-r--r--storage/src/vespa/storage/distributor/bucketdbupdater.cpp746
-rw-r--r--storage/src/vespa/storage/distributor/bucketdbupdater.h267
-rw-r--r--storage/src/vespa/storage/distributor/bucketgctimecalculator.cpp33
-rw-r--r--storage/src/vespa/storage/distributor/bucketgctimecalculator.h56
-rw-r--r--storage/src/vespa/storage/distributor/bucketlistmerger.cpp35
-rw-r--r--storage/src/vespa/storage/distributor/bucketlistmerger.h44
-rw-r--r--storage/src/vespa/storage/distributor/bucketownership.h49
-rw-r--r--storage/src/vespa/storage/distributor/clusterinformation.cpp56
-rw-r--r--storage/src/vespa/storage/distributor/clusterinformation.h50
-rw-r--r--storage/src/vespa/storage/distributor/common/.gitignore0
-rw-r--r--storage/src/vespa/storage/distributor/delegatedstatusrequest.h29
-rw-r--r--storage/src/vespa/storage/distributor/distributor.cpp759
-rw-r--r--storage/src/vespa/storage/distributor/distributor.h304
-rw-r--r--storage/src/vespa/storage/distributor/distributor_host_info_reporter.cpp103
-rw-r--r--storage/src/vespa/storage/distributor/distributor_host_info_reporter.h51
-rw-r--r--storage/src/vespa/storage/distributor/distributorcomponent.cpp356
-rw-r--r--storage/src/vespa/storage/distributor/distributorcomponent.h206
-rw-r--r--storage/src/vespa/storage/distributor/distributorconfiguration.cpp176
-rw-r--r--storage/src/vespa/storage/distributor/distributorconfiguration.h276
-rw-r--r--storage/src/vespa/storage/distributor/distributorinterface.h79
-rw-r--r--storage/src/vespa/storage/distributor/distributormessagesender.cpp33
-rw-r--r--storage/src/vespa/storage/distributor/distributormessagesender.h37
-rw-r--r--storage/src/vespa/storage/distributor/distributormetricsset.h145
-rw-r--r--storage/src/vespa/storage/distributor/externaloperationhandler.cpp227
-rw-r--r--storage/src/vespa/storage/distributor/externaloperationhandler.h56
-rw-r--r--storage/src/vespa/storage/distributor/idealstatemanager.cpp277
-rw-r--r--storage/src/vespa/storage/distributor/idealstatemanager.h147
-rw-r--r--storage/src/vespa/storage/distributor/idealstatemetricsset.h118
-rw-r--r--storage/src/vespa/storage/distributor/latency_statistics_provider.cpp29
-rw-r--r--storage/src/vespa/storage/distributor/latency_statistics_provider.h58
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/CMakeLists.txt12
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/bucketprioritydatabase.h74
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/maintenanceoperation.h28
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/maintenanceoperationgenerator.h46
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/maintenancepriority.h59
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/maintenancepriorityandtype.h36
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/maintenanceprioritygenerator.h24
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/maintenancescanner.h39
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/maintenancescheduler.cpp105
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/maintenancescheduler.h50
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp25
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h61
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/prioritizedbucket.cpp19
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/prioritizedbucket.h76
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/simplebucketprioritydatabase.cpp143
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/simplebucketprioritydatabase.h71
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/simplemaintenancescanner.cpp56
-rw-r--r--storage/src/vespa/storage/distributor/maintenance/simplemaintenancescanner.h67
-rw-r--r--storage/src/vespa/storage/distributor/maintenancebucket.h59
-rw-r--r--storage/src/vespa/storage/distributor/messageguard.h46
-rw-r--r--storage/src/vespa/storage/distributor/messagetracker.cpp53
-rw-r--r--storage/src/vespa/storage/distributor/messagetracker.h54
-rw-r--r--storage/src/vespa/storage/distributor/min_replica_provider.h26
-rw-r--r--storage/src/vespa/storage/distributor/nodeinfo.cpp85
-rw-r--r--storage/src/vespa/storage/distributor/nodeinfo.h50
-rw-r--r--storage/src/vespa/storage/distributor/operationowner.cpp88
-rw-r--r--storage/src/vespa/storage/distributor/operationowner.h110
-rw-r--r--storage/src/vespa/storage/distributor/operations/CMakeLists.txt8
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/CMakeLists.txt18
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/getoperation.cpp289
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/getoperation.h111
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/multioperationoperation.cpp246
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/multioperationoperation.h61
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/putoperation.cpp375
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/putoperation.h83
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/removelocationoperation.cpp118
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/removelocationoperation.h48
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/removeoperation.cpp103
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/removeoperation.h44
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/statbucketlistoperation.cpp65
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/statbucketlistoperation.h53
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/statbucketoperation.cpp107
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/statbucketoperation.h49
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp556
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.h136
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/updateoperation.cpp170
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/updateoperation.h68
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/visitoroperation.cpp1008
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/visitoroperation.h193
-rw-r--r--storage/src/vespa/storage/distributor/operations/external/visitororder.h83
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/CMakeLists.txt15
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/garbagecollectionoperation.cpp80
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/garbagecollectionoperation.h43
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/idealstateoperation.cpp252
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/idealstateoperation.h255
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/joinoperation.cpp158
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/joinoperation.h64
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.cpp178
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.h23
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergemetadata.h38
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp393
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h84
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/removebucketoperation.cpp124
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/removebucketoperation.h50
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/setbucketstateoperation.cpp125
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/setbucketstateoperation.h47
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/splitoperation.cpp179
-rw-r--r--storage/src/vespa/storage/distributor/operations/idealstate/splitoperation.h55
-rw-r--r--storage/src/vespa/storage/distributor/operations/operation.cpp55
-rw-r--r--storage/src/vespa/storage/distributor/operations/operation.h98
-rw-r--r--storage/src/vespa/storage/distributor/operationstarter.h23
-rw-r--r--storage/src/vespa/storage/distributor/operationtargetresolver.cpp237
-rw-r--r--storage/src/vespa/storage/distributor/operationtargetresolver.h75
-rw-r--r--storage/src/vespa/storage/distributor/operationtargetresolverimpl.cpp193
-rw-r--r--storage/src/vespa/storage/distributor/operationtargetresolverimpl.h115
-rw-r--r--storage/src/vespa/storage/distributor/pendingclusterstate.cpp600
-rw-r--r--storage/src/vespa/storage/distributor/pendingclusterstate.h287
-rw-r--r--storage/src/vespa/storage/distributor/pendingmessagetracker.cpp349
-rw-r--r--storage/src/vespa/storage/distributor/pendingmessagetracker.h255
-rw-r--r--storage/src/vespa/storage/distributor/persistencemessagetracker.cpp367
-rw-r--r--storage/src/vespa/storage/distributor/persistencemessagetracker.h119
-rw-r--r--storage/src/vespa/storage/distributor/sentmessagemap.cpp96
-rw-r--r--storage/src/vespa/storage/distributor/sentmessagemap.h45
-rw-r--r--storage/src/vespa/storage/distributor/simpleclusterinformation.h49
-rw-r--r--storage/src/vespa/storage/distributor/statechecker.cpp112
-rw-r--r--storage/src/vespa/storage/distributor/statechecker.h165
-rw-r--r--storage/src/vespa/storage/distributor/statecheckers.cpp1147
-rw-r--r--storage/src/vespa/storage/distributor/statecheckers.h139
-rw-r--r--storage/src/vespa/storage/distributor/statusdelegator.h18
-rw-r--r--storage/src/vespa/storage/distributor/statusreporterdelegate.cpp42
-rw-r--r--storage/src/vespa/storage/distributor/statusreporterdelegate.h30
-rw-r--r--storage/src/vespa/storage/distributor/throttlingoperationstarter.cpp45
-rw-r--r--storage/src/vespa/storage/distributor/throttlingoperationstarter.h100
-rw-r--r--storage/src/vespa/storage/distributor/visitormetricsset.h36
-rw-r--r--storage/src/vespa/storage/frameworkimpl/component/CMakeLists.txt10
-rw-r--r--storage/src/vespa/storage/frameworkimpl/component/distributorcomponentregisterimpl.cpp105
-rw-r--r--storage/src/vespa/storage/frameworkimpl/component/distributorcomponentregisterimpl.h59
-rw-r--r--storage/src/vespa/storage/frameworkimpl/component/servicelayercomponentregisterimpl.cpp43
-rw-r--r--storage/src/vespa/storage/frameworkimpl/component/servicelayercomponentregisterimpl.h45
-rw-r--r--storage/src/vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.cpp123
-rw-r--r--storage/src/vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h72
-rw-r--r--storage/src/vespa/storage/frameworkimpl/memory/CMakeLists.txt8
-rw-r--r--storage/src/vespa/storage/frameworkimpl/memory/memorysnapshotlist.h20
-rw-r--r--storage/src/vespa/storage/frameworkimpl/memory/memorystatusviewer.cpp661
-rw-r--r--storage/src/vespa/storage/frameworkimpl/memory/memorystatusviewer.h139
-rw-r--r--storage/src/vespa/storage/frameworkimpl/status/CMakeLists.txt8
-rw-r--r--storage/src/vespa/storage/frameworkimpl/status/statuswebserver.cpp357
-rw-r--r--storage/src/vespa/storage/frameworkimpl/status/statuswebserver.h84
-rw-r--r--storage/src/vespa/storage/frameworkimpl/thread/CMakeLists.txt9
-rw-r--r--storage/src/vespa/storage/frameworkimpl/thread/appkiller.cpp18
-rw-r--r--storage/src/vespa/storage/frameworkimpl/thread/appkiller.h29
-rw-r--r--storage/src/vespa/storage/frameworkimpl/thread/deadlockdetector.cpp341
-rw-r--r--storage/src/vespa/storage/frameworkimpl/thread/deadlockdetector.h100
-rw-r--r--storage/src/vespa/storage/persistence/.gitignore8
-rw-r--r--storage/src/vespa/storage/persistence/CMakeLists.txt19
-rw-r--r--storage/src/vespa/storage/persistence/bucketownershipnotifier.cpp165
-rw-r--r--storage/src/vespa/storage/persistence/bucketownershipnotifier.h94
-rw-r--r--storage/src/vespa/storage/persistence/bucketprocessor.cpp83
-rw-r--r--storage/src/vespa/storage/persistence/bucketprocessor.h31
-rw-r--r--storage/src/vespa/storage/persistence/diskmoveoperationhandler.cpp95
-rw-r--r--storage/src/vespa/storage/persistence/diskmoveoperationhandler.h24
-rw-r--r--storage/src/vespa/storage/persistence/diskthread.h77
-rw-r--r--storage/src/vespa/storage/persistence/fieldvisitor.cpp27
-rw-r--r--storage/src/vespa/storage/persistence/fieldvisitor.h60
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/.gitignore11
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/CMakeLists.txt12
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/debugverifications.h36
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/filestorhandler.cpp208
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/filestorhandler.h277
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/filestorhandlerimpl.cpp1388
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/filestorhandlerimpl.h362
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/filestormanager.cpp1081
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/filestormanager.h202
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/filestormetrics.h363
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/mergestatus.cpp109
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/mergestatus.h50
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/modifiedbucketchecker.cpp206
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/modifiedbucketchecker.h71
-rw-r--r--storage/src/vespa/storage/persistence/filestorage/pausehandler.h34
-rw-r--r--storage/src/vespa/storage/persistence/mergehandler.cpp1598
-rw-r--r--storage/src/vespa/storage/persistence/mergehandler.h103
-rw-r--r--storage/src/vespa/storage/persistence/messages.h424
-rw-r--r--storage/src/vespa/storage/persistence/persistencethread.cpp1265
-rw-r--r--storage/src/vespa/storage/persistence/persistencethread.h117
-rw-r--r--storage/src/vespa/storage/persistence/persistenceutil.cpp216
-rw-r--r--storage/src/vespa/storage/persistence/persistenceutil.h125
-rw-r--r--storage/src/vespa/storage/persistence/processallhandler.cpp136
-rw-r--r--storage/src/vespa/storage/persistence/processallhandler.h33
-rw-r--r--storage/src/vespa/storage/persistence/providershutdownwrapper.cpp207
-rw-r--r--storage/src/vespa/storage/persistence/providershutdownwrapper.h124
-rw-r--r--storage/src/vespa/storage/persistence/splitbitdetector.cpp262
-rw-r--r--storage/src/vespa/storage/persistence/splitbitdetector.h66
-rw-r--r--storage/src/vespa/storage/persistence/testandsethelper.cpp72
-rw-r--r--storage/src/vespa/storage/persistence/testandsethelper.h39
-rw-r--r--storage/src/vespa/storage/persistence/types.cpp12
-rw-r--r--storage/src/vespa/storage/persistence/types.h38
-rw-r--r--storage/src/vespa/storage/storageserver/.gitignore8
-rw-r--r--storage/src/vespa/storage/storageserver/CMakeLists.txt28
-rw-r--r--storage/src/vespa/storage/storageserver/applicationgenerationfetcher.h23
-rw-r--r--storage/src/vespa/storage/storageserver/bouncer.cpp295
-rw-r--r--storage/src/vespa/storage/storageserver/bouncer.h87
-rw-r--r--storage/src/vespa/storage/storageserver/bucketintegritychecker.cpp671
-rw-r--r--storage/src/vespa/storage/storageserver/bucketintegritychecker.h160
-rw-r--r--storage/src/vespa/storage/storageserver/changedbucketownershiphandler.cpp398
-rw-r--r--storage/src/vespa/storage/storageserver/changedbucketownershiphandler.h218
-rw-r--r--storage/src/vespa/storage/storageserver/communicationmanager.cpp850
-rw-r--r--storage/src/vespa/storage/storageserver/communicationmanager.h236
-rw-r--r--storage/src/vespa/storage/storageserver/communicationmanagermetrics.h56
-rw-r--r--storage/src/vespa/storage/storageserver/distributornode.cpp143
-rw-r--r--storage/src/vespa/storage/storageserver/distributornode.h66
-rw-r--r--storage/src/vespa/storage/storageserver/distributornodecontext.cpp16
-rw-r--r--storage/src/vespa/storage/storageserver/distributornodecontext.h48
-rw-r--r--storage/src/vespa/storage/storageserver/documentapiconverter.cpp516
-rw-r--r--storage/src/vespa/storage/storageserver/documentapiconverter.h41
-rw-r--r--storage/src/vespa/storage/storageserver/fnetlistener.cpp178
-rw-r--r--storage/src/vespa/storage/storageserver/fnetlistener.h42
-rw-r--r--storage/src/vespa/storage/storageserver/framework.cpp34
-rw-r--r--storage/src/vespa/storage/storageserver/framework.h69
-rw-r--r--storage/src/vespa/storage/storageserver/mergethrottler.cpp1225
-rw-r--r--storage/src/vespa/storage/storageserver/mergethrottler.h475
-rw-r--r--storage/src/vespa/storage/storageserver/messageallocationtypes.cpp90
-rw-r--r--storage/src/vespa/storage/storageserver/messageallocationtypes.h33
-rw-r--r--storage/src/vespa/storage/storageserver/messagedispatcher.cpp234
-rw-r--r--storage/src/vespa/storage/storageserver/messagedispatcher.h75
-rw-r--r--storage/src/vespa/storage/storageserver/messagesink.cpp87
-rw-r--r--storage/src/vespa/storage/storageserver/messagesink.h38
-rw-r--r--storage/src/vespa/storage/storageserver/opslogger.cpp142
-rw-r--r--storage/src/vespa/storage/storageserver/opslogger.h55
-rw-r--r--storage/src/vespa/storage/storageserver/priorityconverter.cpp81
-rw-r--r--storage/src/vespa/storage/storageserver/priorityconverter.h43
-rw-r--r--storage/src/vespa/storage/storageserver/prioritymapper.h43
-rw-r--r--storage/src/vespa/storage/storageserver/rpcrequestwrapper.cpp93
-rw-r--r--storage/src/vespa/storage/storageserver/rpcrequestwrapper.h70
-rw-r--r--storage/src/vespa/storage/storageserver/servicelayernode.cpp311
-rw-r--r--storage/src/vespa/storage/storageserver/servicelayernode.h76
-rw-r--r--storage/src/vespa/storage/storageserver/servicelayernodecontext.cpp16
-rw-r--r--storage/src/vespa/storage/storageserver/servicelayernodecontext.h47
-rw-r--r--storage/src/vespa/storage/storageserver/statemanager.cpp573
-rw-r--r--storage/src/vespa/storage/storageserver/statemanager.h146
-rw-r--r--storage/src/vespa/storage/storageserver/statereporter.cpp119
-rw-r--r--storage/src/vespa/storage/storageserver/statereporter.h65
-rw-r--r--storage/src/vespa/storage/storageserver/storagemetricsset.h116
-rw-r--r--storage/src/vespa/storage/storageserver/storagenode.cpp626
-rw-r--r--storage/src/vespa/storage/storageserver/storagenode.h195
-rw-r--r--storage/src/vespa/storage/storageserver/storagenodecontext.cpp33
-rw-r--r--storage/src/vespa/storage/storageserver/storagenodecontext.h69
-rw-r--r--storage/src/vespa/storage/storageutil/.gitignore4
-rw-r--r--storage/src/vespa/storage/storageutil/CMakeLists.txt11
-rw-r--r--storage/src/vespa/storage/storageutil/bloomfilter.cpp41
-rw-r--r--storage/src/vespa/storage/storageutil/bloomfilter.h118
-rw-r--r--storage/src/vespa/storage/storageutil/distributorstatecache.h58
-rw-r--r--storage/src/vespa/storage/storageutil/functor.h60
-rw-r--r--storage/src/vespa/storage/storageutil/graph.cpp201
-rw-r--r--storage/src/vespa/storage/storageutil/graph.h96
-rw-r--r--storage/src/vespa/storage/storageutil/log.h30
-rw-r--r--storage/src/vespa/storage/storageutil/palette.cpp111
-rw-r--r--storage/src/vespa/storage/storageutil/palette.h30
-rw-r--r--storage/src/vespa/storage/storageutil/piechart.cpp202
-rw-r--r--storage/src/vespa/storage/storageutil/piechart.h65
-rw-r--r--storage/src/vespa/storage/storageutil/recordflatfile.cpp155
-rw-r--r--storage/src/vespa/storage/storageutil/recordflatfile.h340
-rw-r--r--storage/src/vespa/storage/storageutil/resumeguard.h38
-rw-r--r--storage/src/vespa/storage/storageutil/utils.h90
-rw-r--r--storage/src/vespa/storage/subscriptions/.gitignore8
-rw-r--r--storage/src/vespa/storage/tools/.gitignore30
-rw-r--r--storage/src/vespa/storage/tools/CMakeLists.txt53
-rw-r--r--storage/src/vespa/storage/tools/analyzedistribution.cpp523
-rwxr-xr-xstorage/src/vespa/storage/tools/generate_distribution_doc.sh5
-rw-r--r--storage/src/vespa/storage/tools/generatedistributionbits.cpp264
-rw-r--r--storage/src/vespa/storage/tools/getidealstate.cpp199
-rw-r--r--storage/src/vespa/storage/tools/lib/.gitignore0
-rw-r--r--storage/src/vespa/storage/tools/statfs.cpp64
-rw-r--r--storage/src/vespa/storage/tools/storage-cmd.cpp126
-rw-r--r--storage/src/vespa/storage/tools/throttlingsim.cpp474
-rw-r--r--storage/src/vespa/storage/tools/throttlingsim.h150
-rw-r--r--storage/src/vespa/storage/visiting/.gitignore10
-rw-r--r--storage/src/vespa/storage/visiting/CMakeLists.txt19
-rw-r--r--storage/src/vespa/storage/visiting/commandqueue.h250
-rw-r--r--storage/src/vespa/storage/visiting/countvisitor.cpp117
-rw-r--r--storage/src/vespa/storage/visiting/countvisitor.h62
-rw-r--r--storage/src/vespa/storage/visiting/dumpvisitor.cpp134
-rw-r--r--storage/src/vespa/storage/visiting/dumpvisitor.h58
-rw-r--r--storage/src/vespa/storage/visiting/dumpvisitorsingle.cpp47
-rw-r--r--storage/src/vespa/storage/visiting/dumpvisitorsingle.h45
-rw-r--r--storage/src/vespa/storage/visiting/memory_bounded_trace.cpp71
-rw-r--r--storage/src/vespa/storage/visiting/memory_bounded_trace.h51
-rw-r--r--storage/src/vespa/storage/visiting/messagebusvisitormessagesession.h59
-rw-r--r--storage/src/vespa/storage/visiting/messages.h79
-rw-r--r--storage/src/vespa/storage/visiting/recoveryvisitor.cpp106
-rw-r--r--storage/src/vespa/storage/visiting/recoveryvisitor.h61
-rw-r--r--storage/src/vespa/storage/visiting/stor-visitor.def72
-rw-r--r--storage/src/vespa/storage/visiting/testvisitor.cpp84
-rw-r--r--storage/src/vespa/storage/visiting/testvisitor.h60
-rw-r--r--storage/src/vespa/storage/visiting/visitor.cpp1295
-rw-r--r--storage/src/vespa/storage/visiting/visitor.h584
-rw-r--r--storage/src/vespa/storage/visiting/visitorlibraries.cpp70
-rw-r--r--storage/src/vespa/storage/visiting/visitorlibraries.h39
-rw-r--r--storage/src/vespa/storage/visiting/visitormanager.cpp716
-rw-r--r--storage/src/vespa/storage/visiting/visitormanager.h184
-rw-r--r--storage/src/vespa/storage/visiting/visitormessagesession.h28
-rw-r--r--storage/src/vespa/storage/visiting/visitormessagesessionfactory.h25
-rw-r--r--storage/src/vespa/storage/visiting/visitormetrics.h76
-rw-r--r--storage/src/vespa/storage/visiting/visitorthread.cpp818
-rw-r--r--storage/src/vespa/storage/visiting/visitorthread.h152
-rw-r--r--storage/src/vespa/storage/visiting/visitorthreadmetrics.h108
557 files changed, 103102 insertions, 0 deletions
diff --git a/storage/src/.gitignore b/storage/src/.gitignore
new file mode 100644
index 00000000000..f7cecb195ca
--- /dev/null
+++ b/storage/src/.gitignore
@@ -0,0 +1,10 @@
+*.So
+*.lo
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile.ini
+config_command.sh
+project.dsw
+/storage.mak
diff --git a/storage/src/Doxyfile b/storage/src/Doxyfile
new file mode 100644
index 00000000000..d40aff6f46c
--- /dev/null
+++ b/storage/src/Doxyfile
@@ -0,0 +1,994 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+# Doxyfile 1.2.18
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project
+#
+# All text after a hash (#) is considered a comment and will be ignored
+# The format is:
+# TAG = value [value, ...]
+# For lists items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ")
+
+#---------------------------------------------------------------------------
+# General configuration options
+#---------------------------------------------------------------------------
+
+# The PROJECT_NAME tag is a single word (or a sequence of words surrounded
+# by quotes) that should identify the project.
+
+PROJECT_NAME = Storage
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = ../doc
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Brazilian, Catalan, Chinese, Chinese-Traditional, Croatian, Czech, Danish, Dutch,
+# Finnish, French, German, Greek, Hungarian, Italian, Japanese, Japanese-en
+# (Japanese with english messages), Korean, Norwegian, Polish, Portuguese,
+# Romanian, Russian, Serbian, Slovak, Slovene, Spanish, Swedish and Ukrainian.
+
+OUTPUT_LANGUAGE = English
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these class will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all inherited
+# members of a class in the documentation of that class as if those members were
+# ordinary class members. Constructors, destructors and assignment operators of
+# the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before files name in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. It is allowed to use relative paths in the argument list.
+
+STRIP_FROM_PATH =
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower case letters. If set to YES upper case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# users are adviced to set this option to NO.
+
+CASE_SENSE_NAMES = YES
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful is your file systems
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES = YES
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like the Qt-style comments (thus requiring an
+# explict @brief command for a brief description.
+
+JAVADOC_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the DETAILS_AT_TOP tag is set to YES then Doxygen
+# will output the detailed description near the top, like JavaDoc.
+# If set to NO, the detailed description appears after the member
+# documentation.
+
+DETAILS_AT_TOP = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# reimplements.
+
+INHERIT_DOCS = YES
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 4
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# This tag can be used to specify a number of aliases that acts
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES =
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or define consist of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and defines in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources
+# only. Doxygen will then generate output that is more tailored for C.
+# For instance some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = YES
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java sources
+# only. Doxygen will then generate output that is more tailored for Java.
+# For instance namespaces will be presented as packages, qualified scopes
+# will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text.
+
+WARN_FORMAT = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = storage
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh *.hxx *.hpp
+# *.h++ *.idl *.odl
+
+FILE_PATTERNS = *.h *.cpp
+
+# The RECURSIVE tag can be used to turn specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE = YES
+
+# The EXCLUDE tag can be used to specify files and/or directories that should
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+
+EXCLUDE =
+
+# The EXCLUDE_SYMLINKS tag can be used select whether or not files or directories
+# that are symbolic links (a Unix filesystem feature) are excluded from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories.
+
+EXCLUDE_PATTERNS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain image that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+
+INPUT_FILTER =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+
+SOURCE_BROWSER = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# If the REFERENCED_BY_RELATION tag is set to YES (the default)
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES (the default)
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = NO
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT = html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header.
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet
+
+HTML_STYLESHEET = ../cpp/vespa_link.css
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS = YES
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compressed HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output dir.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non empty doxygen will try to run
+# the html help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the Html help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index at
+# top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it.
+
+DISABLE_INDEX = NO
+
+# This tag can be used to set the number of enum values (range [1..20])
+# that doxygen will group on one line in the generated HTML documentation.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the GENERATE_TREEVIEW tag is set to YES, a side panel will be
+# generated containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript and frames is required (for instance Mozilla, Netscape 4.0+,
+# or Internet explorer 4.0+). Note that for large projects the tree generation
+# can take a very long time. In such cases it is better to disable this feature.
+# Windows users are probably better off using the HTML help feature.
+
+GENERATE_TREEVIEW = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be invoked. If left blank `latex' will be used as the default command name.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, a4wide, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4wide
+
+# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = NO
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode.
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimised for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load stylesheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assigments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_XML = NO
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF = NO
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_PREDEFINED tags.
+
+EXPAND_ONLY_PREDEF = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
+# in the INCLUDE_PATH (see below) will be search if a #include is found.
+
+SEARCH_INCLUDES = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed.
+
+PREDEFINED =
+
+# If the MACRO_EXPANSION and EXPAND_PREDEF_ONLY tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all function-like macros that are alone
+# on a line, have an all uppercase name, and do not end with a semicolon. Such
+# function macros are typically used for boiler-plate code, and will confuse the
+# parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::addtions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES tag can be used to specify one or more tagfiles.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate a inheritance diagram (in Html, RTF and LaTeX) for classes with base or
+# super classes. Setting the tag to NO turns the diagrams off. Note that this
+# option is superceded by the HAVE_DOT option below. This is only a fallback. It is
+# recommended to install and use dot, since it yield more powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT = NO
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# the CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH = YES
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = YES
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are png, jpg, or gif
+# If left blank png will be used.
+
+DOT_IMAGE_FORMAT = png
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found on the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The MAX_DOT_GRAPH_WIDTH tag can be used to set the maximum allowed width
+# (in pixels) of the graphs generated by dot. If a graph becomes larger than
+# this value, doxygen will try to truncate the graph, so that it fits within
+# the specified constraint. Beware that most browsers cannot cope with very
+# large images.
+
+MAX_DOT_GRAPH_WIDTH = 1024
+
+# The MAX_DOT_GRAPH_HEIGHT tag can be used to set the maximum allows height
+# (in pixels) of the graphs generated by dot. If a graph becomes larger than
+# this value, doxygen will try to truncate the graph, so that it fits within
+# the specified constraint. Beware that most browsers cannot cope with very
+# large images.
+
+MAX_DOT_GRAPH_HEIGHT = 1024
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermedate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
+
+#---------------------------------------------------------------------------
+# Configuration::addtions related to the search engine
+#---------------------------------------------------------------------------
+
+# The SEARCHENGINE tag specifies whether or not a search engine should be
+# used. If set to NO the values of all tags below this one will be ignored.
+
+SEARCHENGINE = NO
+
+# The CGI_NAME tag should be the name of the CGI script that
+# starts the search engine (doxysearch) with the correct parameters.
+# A script with this name will be generated by doxygen.
+
+CGI_NAME = search.cgi
+
+# The CGI_URL tag should be the absolute URL to the directory where the
+# cgi binaries are located. See the documentation of your http daemon for
+# details.
+
+CGI_URL =
+
+# The DOC_URL tag should be the absolute URL to the directory where the
+# documentation is located. If left blank the absolute path to the
+# documentation, with file:// prepended to it, will be used.
+
+DOC_URL =
+
+# The DOC_ABSPATH tag should be the absolute path to the directory where the
+# documentation is located. If left blank the directory on the local machine
+# will be used.
+
+DOC_ABSPATH =
+
+# The BIN_ABSPATH tag must point to the directory where the doxysearch binary
+# is installed.
+
+BIN_ABSPATH = /usr/local/bin/
+
+# The EXT_DOC_PATHS tag can be used to specify one or more paths to
+# documentation generated for other projects. This allows doxysearch to search
+# the documentation for these projects as well.
+
+EXT_DOC_PATHS =
diff --git a/storage/src/tests/.gitignore b/storage/src/tests/.gitignore
new file mode 100644
index 00000000000..9023e5da3b4
--- /dev/null
+++ b/storage/src/tests/.gitignore
@@ -0,0 +1,22 @@
+*.o
+*.lo
+.depend.NEW
+.depend
+.deps
+.libs
+.config.log
+Makefile
+testrunner
+vdsroot
+*.core
+state
+*.So
+test.vlog
+dirconfig.tmp
+.*.swp
+metricsreport.html
+piefile.html
+piefile-customcols.html
+palette.html
+use_new_storage_core
+storage_testrunner_app
diff --git a/storage/src/tests/CMakeLists.txt b/storage/src/tests/CMakeLists.txt
new file mode 100644
index 00000000000..894ea7b4d25
--- /dev/null
+++ b/storage/src/tests/CMakeLists.txt
@@ -0,0 +1,22 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(storage_testrunner_app
+ SOURCES
+ testrunner.cpp
+ DEPENDS
+ storage_teststorageserver
+ storage_testbucketmover
+ storage_teststorageutil
+ storage_testvisiting
+ storage_testbucketdb
+ storage_testcommon
+ storage_testhostreporter
+ storage_testdistributor
+ storage_testpersistence
+ storage_testfilestorage
+ storage_testmemory
+ storage_teststatus
+ storage
+ AFTER
+ storage_storageconfig
+)
+vespa_add_test(NAME storage_testrunner_app COMMAND storage_testrunner_app)
diff --git a/storage/src/tests/bucketdb/.gitignore b/storage/src/tests/bucketdb/.gitignore
new file mode 100644
index 00000000000..4e71c44a596
--- /dev/null
+++ b/storage/src/tests/bucketdb/.gitignore
@@ -0,0 +1,12 @@
+*.So
+*.core
+*.lo
+*.o
+.*.swp
+.config.log
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
+testrunner
diff --git a/storage/src/tests/bucketdb/CMakeLists.txt b/storage/src/tests/bucketdb/CMakeLists.txt
new file mode 100644
index 00000000000..95228966589
--- /dev/null
+++ b/storage/src/tests/bucketdb/CMakeLists.txt
@@ -0,0 +1,14 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_testbucketdb
+ SOURCES
+ initializertest.cpp
+ bucketmanagertest.cpp
+ judyarraytest.cpp
+ judymultimaptest.cpp
+ lockablemaptest.cpp
+ bucketinfotest.cpp
+ distribution_hash_normalizer_test.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/tests/bucketdb/bucketinfotest.cpp b/storage/src/tests/bucketdb/bucketinfotest.cpp
new file mode 100644
index 00000000000..eef4c6d7739
--- /dev/null
+++ b/storage/src/tests/bucketdb/bucketinfotest.cpp
@@ -0,0 +1,201 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <boost/assign.hpp>
+#include <boost/random.hpp>
+#include <cppunit/extensions/HelperMacros.h>
+#include <map>
+#include <vector>
+#include <vespa/vespalib/text/stringtokenizer.h>
+#include <vespa/storage/distributor/bucketdb/bucketinfo.h>
+
+namespace storage {
+
+namespace distributor {
+
+struct BucketInfoTest : public CppUnit::TestFixture {
+ void testBucketInfoEntriesWithNewestTimestampsAreKept();
+ void testOrder();
+ void testHasInvalidCopy();
+ void testAddNodeSetsTrustedWhenConsistent();
+ void testTrustedResetWhenCopiesBecomeInconsistent();
+ void testTrustedResetWhenTrustedCopiesGoOutOfSync();
+ void testTrustedNotResetWhenNonTrustedCopiesStillOutOfSync();
+
+ CPPUNIT_TEST_SUITE(BucketInfoTest);
+ CPPUNIT_TEST(testBucketInfoEntriesWithNewestTimestampsAreKept);
+ CPPUNIT_TEST(testOrder);
+ CPPUNIT_TEST(testHasInvalidCopy);
+ CPPUNIT_TEST(testAddNodeSetsTrustedWhenConsistent);
+ CPPUNIT_TEST_IGNORED(testTrustedResetWhenCopiesBecomeInconsistent);
+ CPPUNIT_TEST(testTrustedResetWhenTrustedCopiesGoOutOfSync);
+ CPPUNIT_TEST(testTrustedNotResetWhenNonTrustedCopiesStillOutOfSync);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(BucketInfoTest);
+
+BucketInfo
+getBucketInfo(std::string nodeList, std::string order) {
+ BucketInfo info;
+
+ std::vector<uint16_t> ordering;
+ {
+ vespalib::StringTokenizer tokenizer(order, ",");
+ for (uint32_t i = 0; i < tokenizer.size(); i++) {
+ ordering.push_back(atoi(tokenizer[i].c_str()));
+ }
+ }
+
+ vespalib::StringTokenizer tokenizer(nodeList, ",");
+ for (uint32_t i = 0; i < tokenizer.size(); i++) {
+ info.addNode(BucketCopy(0,
+ atoi(tokenizer[i].c_str()),
+ api::BucketInfo(1,1,1)),
+ ordering);
+ }
+
+ return info;
+}
+
+std::string
+nodeList(const BucketInfo& info) {
+ std::ostringstream ost;
+ for (uint32_t i = 0; i < info.getNodeCount(); i++) {
+ if (i != 0) {
+ ost << ",";
+ }
+ ost << (int)info.getNodeRef(i).getNode();
+ }
+ return ost.str();
+}
+
+// Since we keep bucket info in memory for a period of time before applying
+// to bucket db, we maintain timestamps to prevent external load happening
+// in the meantime from having their updates lost when we perform a batch
+// insert. This also applies for when we postpone db updates in persistence
+// message tracker until we've received a reply from all copies.
+void
+BucketInfoTest::testBucketInfoEntriesWithNewestTimestampsAreKept()
+{
+ BucketInfo bi;
+ std::vector<uint16_t> idealState;
+ idealState.push_back(0);
+
+ bi.addNode(BucketCopy(5, 0, api::BucketInfo(1,1,1)), idealState);
+ CPPUNIT_ASSERT_EQUAL(api::BucketInfo(1,1,1),
+ bi.getNode(0)->getBucketInfo());
+
+ bi.addNode(BucketCopy(5, 0, api::BucketInfo(2,2,2)), idealState);
+ CPPUNIT_ASSERT_EQUAL(api::BucketInfo(1,1,1),
+ bi.getNode(0)->getBucketInfo());
+
+ bi.addNode(BucketCopy(4, 0, api::BucketInfo(3,3,3)), idealState);
+ CPPUNIT_ASSERT_EQUAL(api::BucketInfo(1,1,1),
+ bi.getNode(0)->getBucketInfo());
+
+ bi.addNode(BucketCopy(7, 0, api::BucketInfo(4,4,4)), idealState);
+ CPPUNIT_ASSERT_EQUAL(api::BucketInfo(4,4,4),
+ bi.getNode(0)->getBucketInfo());
+
+ bi.addNode(BucketCopy(2, 1, api::BucketInfo(4,4,4)), idealState);
+ CPPUNIT_ASSERT_EQUAL(api::BucketInfo(4,4,4),
+ bi.getNode(1)->getBucketInfo());
+}
+
+void
+BucketInfoTest::testOrder() {
+
+ CPPUNIT_ASSERT_EQUAL(std::string("2,0,1"), nodeList(getBucketInfo("0,1,2", "2,0,1")));
+ CPPUNIT_ASSERT_EQUAL(std::string("2,0,1"), nodeList(getBucketInfo("1,0,2", "2,0,1")));
+ CPPUNIT_ASSERT_EQUAL(std::string("1,0,2"), nodeList(getBucketInfo("1,2,0", "1")));
+ CPPUNIT_ASSERT_EQUAL(std::string("2,1,0,3,4"), nodeList(getBucketInfo("0,1,2,3,4", "2,1")));
+}
+
+void
+BucketInfoTest::testHasInvalidCopy()
+{
+ std::vector<uint16_t> order;
+
+ BucketInfo info;
+ info.addNode(BucketCopy(0, 0, api::BucketInfo(10, 100, 1000)), order);
+ info.addNode(BucketCopy(0, 1, api::BucketInfo(10, 100, 1000)), order);
+ CPPUNIT_ASSERT(!info.hasInvalidCopy());
+
+ info.addNode(BucketCopy(0, 2, api::BucketInfo()), order);
+ CPPUNIT_ASSERT(info.hasInvalidCopy());
+
+}
+
+void
+BucketInfoTest::testAddNodeSetsTrustedWhenConsistent()
+{
+ std::vector<uint16_t> order;
+
+ {
+ BucketInfo info;
+ info.addNode(BucketCopy(0, 0, api::BucketInfo(0x1, 2, 144)).setTrusted(), order);
+ info.addNode(BucketCopy(0, 1, api::BucketInfo(0x1, 2, 144)), order);
+ CPPUNIT_ASSERT(info.getNode(1)->trusted());
+ }
+
+ {
+ BucketInfo info;
+ info.addNode(BucketCopy(0, 0, api::BucketInfo(0x1, 1, 2)).setTrusted(), order);
+ info.addNode(BucketCopy(0, 1, api::BucketInfo(0x2, 2, 3)), order);
+ info.addNode(BucketCopy(0, 2, api::BucketInfo(0x3, 3, 4)), order);
+
+ BucketCopy copy(1, 1, api::BucketInfo(0x1, 1, 2));
+ info.updateNode(copy);
+ CPPUNIT_ASSERT(info.getNode(1)->trusted());
+ CPPUNIT_ASSERT(!info.getNode(2)->trusted());
+ }
+}
+
+void
+BucketInfoTest::testTrustedResetWhenCopiesBecomeInconsistent()
+{
+ CPPUNIT_FAIL("TODO: test this!");
+}
+
+void
+BucketInfoTest::testTrustedResetWhenTrustedCopiesGoOutOfSync()
+{
+ std::vector<uint16_t> order;
+
+ BucketInfo info;
+ info.addNode(BucketCopy(0, 0, api::BucketInfo(10, 100, 1000)).setTrusted(), order);
+ info.addNode(BucketCopy(0, 1, api::BucketInfo(10, 100, 1000)), order);
+ CPPUNIT_ASSERT(info.getNode(0)->trusted());
+ CPPUNIT_ASSERT(info.getNode(1)->trusted());
+
+ info.updateNode(BucketCopy(0, 1, api::BucketInfo(20, 200, 2000)).setTrusted());
+ CPPUNIT_ASSERT(!info.getNode(0)->trusted());
+ CPPUNIT_ASSERT(!info.getNode(1)->trusted());
+}
+
+void
+BucketInfoTest::testTrustedNotResetWhenNonTrustedCopiesStillOutOfSync()
+{
+ std::vector<uint16_t> order;
+
+ BucketInfo info;
+ info.addNode(BucketCopy(0, 0, api::BucketInfo(10, 100, 1000)).setTrusted(), order);
+ info.addNode(BucketCopy(0, 1, api::BucketInfo(20, 200, 2000)), order);
+ info.addNode(BucketCopy(0, 2, api::BucketInfo(30, 300, 3000)), order);
+ CPPUNIT_ASSERT(info.getNode(0)->trusted());
+ CPPUNIT_ASSERT(!info.getNode(1)->trusted());
+ CPPUNIT_ASSERT(!info.getNode(2)->trusted());
+
+ info.updateNode(BucketCopy(0, 1, api::BucketInfo(21, 201, 2001)));
+
+ CPPUNIT_ASSERT(info.getNode(0)->trusted());
+ CPPUNIT_ASSERT(!info.getNode(1)->trusted());
+ CPPUNIT_ASSERT(!info.getNode(2)->trusted());
+}
+
+}
+
+} // storage
+
diff --git a/storage/src/tests/bucketdb/bucketmanagertest.cpp b/storage/src/tests/bucketdb/bucketmanagertest.cpp
new file mode 100644
index 00000000000..ee2e3f6ef7f
--- /dev/null
+++ b/storage/src/tests/bucketdb/bucketmanagertest.cpp
@@ -0,0 +1,1323 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+LOG_SETUP(".test.bucketdb.bucketmanager");
+
+#include <vespa/config/helper/configgetter.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <vespa/document/config/config-documenttypes.h>
+#include <vespa/document/datatype/documenttype.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/document/repo/documenttyperepo.h>
+#include <vespa/storage/bucketdb/bucketmanager.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/dummystoragelink.h>
+#include <tests/common/testhelper.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/vdslib/state/random.h>
+#include <vespa/vespalib/io/fileutil.h>
+#include <vespa/vespalib/stllike/string.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <thread>
+#include <future>
+
+using config::ConfigGetter;
+using document::DocumenttypesConfig;
+using config::FileSpec;
+using document::DocumentType;
+using document::DocumentTypeRepo;
+
+namespace storage {
+
+struct TestBucketInfo {
+ uint32_t crc;
+ uint32_t size;
+ uint32_t count;
+ uint32_t partition;
+
+ api::BucketInfo getInfo() const
+ { return api::BucketInfo(crc, count, size); }
+};
+
+std::ostream& operator<<(std::ostream& out, const TestBucketInfo& info) {
+ out << "TestBucketInfo(" << info.crc << ", " << info.size
+ << ", " << info.count << ", " << info.partition << ")";
+ return out;
+}
+
+class ConcurrentOperationFixture;
+struct TestParams;
+
+struct BucketManagerTest : public CppUnit::TestFixture {
+public:
+ CPPUNIT_TEST_SUITE(BucketManagerTest);
+ CPPUNIT_TEST(testRequestBucketInfoWithList);
+ CPPUNIT_TEST(testDistributionBitGenerationEmpty);
+ CPPUNIT_TEST(testDistributionBitChangeOnCreateBucket);
+ CPPUNIT_TEST(testMinUsedBitsFromComponentIsHonored);
+ CPPUNIT_TEST(testRemoveLastModifiedOK);
+ CPPUNIT_TEST(testRemoveLastModifiedFailed);
+ CPPUNIT_TEST(testSwallowNotifyBucketChangeReply);
+ CPPUNIT_TEST(testMetricsGeneration);
+ CPPUNIT_TEST(testSplitReplyOrderedAfterBucketReply);
+ CPPUNIT_TEST(testJoinReplyOrderedAfterBucketReply);
+ CPPUNIT_TEST(testDeleteReplyOrderedAfterBucketReply);
+ CPPUNIT_TEST(testOnlyEnqueueWhenProcessingRequest);
+ CPPUNIT_TEST(testOrderRepliesAfterBucketSpecificRequest);
+ CPPUNIT_TEST(testQueuedRepliesOnlyDispatchedWhenAllProcessingDone);
+ CPPUNIT_TEST(testMutationRepliesForSplitBucketAreEnqueued);
+ CPPUNIT_TEST(testMutationRepliesForDeletedBucketAreEnqueued);
+ CPPUNIT_TEST(testMutationRepliesForJoinedBucketAreEnqueued);
+ CPPUNIT_TEST(testConflictingPutRepliesAreEnqueued);
+ CPPUNIT_TEST(testConflictingUpdateRepliesAreEnqueued);
+ CPPUNIT_TEST(testRemappedMutationIsCheckedAgainstOriginalBucket);
+ CPPUNIT_TEST(testBucketConflictSetIsClearedBetweenBlockingRequests);
+ CPPUNIT_TEST(testConflictSetOnlyClearedAfterAllBucketRequestsDone);
+ CPPUNIT_TEST(testRejectRequestWithMismatchingDistributionHash);
+ CPPUNIT_TEST(testDbNotIteratedWhenAllRequestsRejected);
+ CPPUNIT_TEST(testReceivedDistributionHashIsNormalized);
+
+ // FIXME(vekterli): test is not deterministic and enjoys failing
+ // sporadically when running under Valgrind. See bug 5932891.
+ CPPUNIT_TEST_IGNORED(testRequestBucketInfoWithState);
+ CPPUNIT_TEST_SUITE_END();
+
+ std::unique_ptr<TestServiceLayerApp> _node;
+ std::unique_ptr<DummyStorageLink> _top;
+ BucketManager *_manager;
+ DummyStorageLink* _bottom;
+ FileStorManager* _filestorManager;
+ std::map<document::BucketId, TestBucketInfo> _bucketInfo;
+ uint32_t _emptyBuckets;
+ document::Document::SP _document;
+
+ void setupTestEnvironment(bool fakePersistenceLayer = true,
+ bool noDelete = false);
+ void addBucketsToDB(uint32_t count);
+ bool wasBlockedDueToLastModified(api::StorageMessage* msg,
+ uint64_t lastModified);
+ bool wasBlockedDueToLastModified(api::StorageMessage::SP msg);
+ void insertSingleBucket(const document::BucketId& bucket,
+ const api::BucketInfo& info);
+ void waitUntilRequestsAreProcessing(size_t nRequests = 1);
+ void doTestMutationOrdering(
+ ConcurrentOperationFixture& fixture,
+ const TestParams& params);
+ void doTestConflictingReplyIsEnqueued(
+ const document::BucketId& bucket,
+ const api::StorageCommand::SP& treeMutationCmd,
+ const api::MessageType& treeMutationReplyType);
+
+ void scheduleBucketInfoRequestWithConcurrentOps(
+ ConcurrentOperationFixture& fixture,
+ const document::BucketId& bucketForRemove,
+ const document::BucketId& bucketForSplit,
+ api::Timestamp mutationTimestamp);
+ void sendSingleBucketInfoRequest(const document::BucketId& id);
+ void assertRequestWithBadHashIsRejected(
+ ConcurrentOperationFixture& fixture);
+
+
+ void testRequestBucketInfoWithState();
+ void testRequestBucketInfoWithList();
+ void testDistributionBitGenerationEmpty();
+ void testDistributionBitChangeOnCreateBucket();
+ void testMinUsedBitsFromComponentIsHonored();
+
+ void testRemoveLastModifiedOK();
+ void testRemoveLastModifiedFailed();
+
+ void testSwallowNotifyBucketChangeReply();
+ void testMetricsGeneration();
+ void testSplitReplyOrderedAfterBucketReply();
+ void testJoinReplyOrderedAfterBucketReply();
+ void testDeleteReplyOrderedAfterBucketReply();
+ void testOnlyEnqueueWhenProcessingRequest();
+ void testOrderRepliesAfterBucketSpecificRequest();
+ void testQueuedRepliesOnlyDispatchedWhenAllProcessingDone();
+ void testMutationRepliesForSplitBucketAreEnqueued();
+ void testMutationRepliesForDeletedBucketAreEnqueued();
+ void testMutationRepliesForJoinedBucketAreEnqueued();
+ void testConflictingPutRepliesAreEnqueued();
+ void testConflictingUpdateRepliesAreEnqueued();
+ void testRemappedMutationIsCheckedAgainstOriginalBucket();
+ void testBucketConflictSetIsClearedBetweenBlockingRequests();
+ void testConflictSetOnlyClearedAfterAllBucketRequestsDone();
+ void testRejectRequestWithMismatchingDistributionHash();
+ void testDbNotIteratedWhenAllRequestsRejected();
+ void testReceivedDistributionHashIsNormalized();
+
+public:
+ static constexpr uint32_t DIR_SPREAD = 3;
+ static constexpr uint32_t MESSAGE_WAIT_TIME = 60*2;
+
+ void setUp() {
+ _emptyBuckets = 0;
+ }
+
+ void tearDown() {
+ }
+
+ friend class ConcurrentOperationFixture;
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(BucketManagerTest);
+
+#define ASSERT_DUMMYLINK_REPLY_COUNT(link, count) \
+ if (link->getNumReplies() != count) { \
+ std::ostringstream ost; \
+ ost << "Expected there to be " << count << " replies in link, but " \
+ << "found " << link->getNumReplies() << ":\n"; \
+ for (uint32_t i=0; i<link->getNumReplies(); ++i) { \
+ ost << link->getReply(i)->getType() << "\n"; \
+ } \
+ CPPUNIT_FAIL(ost.str()); \
+ }
+
+void BucketManagerTest::setupTestEnvironment(bool fakePersistenceLayer,
+ bool noDelete)
+{
+ if (!noDelete) {
+ assert(system("rm -rf vdsroot") == 0);
+ }
+ assert(system("mkdir -p vdsroot/disks/d0") == 0);
+ assert(system("mkdir -p vdsroot/disks/d1") == 0);
+ vdstestlib::DirConfig config(getStandardConfig(true));
+
+ DocumentTypeRepo::SP repo(new DocumentTypeRepo(
+ *ConfigGetter<DocumenttypesConfig>::getConfig("config-doctypes",
+ FileSpec("config-doctypes.cfg"))));
+ _top.reset(new DummyStorageLink);
+ _node.reset(new TestServiceLayerApp(
+ DiskCount(2), NodeIndex(0), config.getConfigId()));
+ _node->setTypeRepo(repo);
+ _node->setupDummyPersistence();
+ // Set up the 3 links
+ StorageLink::UP manager(new BucketManager("", _node->getComponentRegister()));
+ _manager = (BucketManager*) manager.get();
+ _top->push_back(std::move(manager));
+ if (fakePersistenceLayer) {
+ StorageLink::UP bottom(new DummyStorageLink);
+ _bottom = (DummyStorageLink*) bottom.get();
+ _top->push_back(std::move(bottom));
+ } else {
+ StorageLink::UP bottom(new FileStorManager(
+ config.getConfigId(), _node->getPartitions(),
+ _node->getPersistenceProvider(), _node->getComponentRegister()));
+ _filestorManager = (FileStorManager*) bottom.get();
+ _top->push_back(std::move(bottom));
+ }
+ // Generate a doc to use for testing..
+ const DocumentType &type(*_node->getTypeRepo()
+ ->getDocumentType("text/html"));
+ _document.reset(new document::Document(type, document::DocumentId(
+ document::DocIdString("test", "ntnu"))));
+}
+
+void BucketManagerTest::addBucketsToDB(uint32_t count)
+{
+ _bucketInfo.clear();
+ _emptyBuckets = 0;
+ lib::RandomGen randomizer(25423);
+ while (_bucketInfo.size() < count) {
+ document::BucketId id(16, randomizer.nextUint32());
+ id = id.stripUnused();
+ if (_bucketInfo.size() == 0) {
+ id = _node->getBucketIdFactory().getBucketId(
+ _document->getId()).stripUnused();
+ }
+ TestBucketInfo info;
+ info.crc = randomizer.nextUint32();
+ info.size = randomizer.nextUint32();
+ info.count = randomizer.nextUint32(1, 0xFFFF);
+
+ info.partition = _node->getPartition(id);
+ _bucketInfo[id] = info;
+ }
+
+ // Make sure we have at least one empty bucket
+ TestBucketInfo& info = (++_bucketInfo.begin())->second;
+ CPPUNIT_ASSERT(info.size != 0);
+ info.size = 0;
+ info.count = 0;
+ info.crc = 0;
+ ++_emptyBuckets;
+ for (std::map<document::BucketId, TestBucketInfo>::iterator it
+ = _bucketInfo.begin(); it != _bucketInfo.end(); ++it)
+ {
+ bucketdb::StorageBucketInfo entry;
+ entry.disk = it->second.partition;
+ entry.setBucketInfo(api::BucketInfo(it->second.crc,
+ it->second.count,
+ it->second.size));
+ _node->getStorageBucketDatabase().insert(it->first, entry, "foo");
+ }
+}
+
+bool
+BucketManagerTest::wasBlockedDueToLastModified(api::StorageMessage* msg,
+ uint64_t lastModified)
+{
+ setupTestEnvironment();
+ document::BucketId id(16, 1);
+ api::BucketInfo info(1, 2, 3);
+ info.setLastModified(api::Timestamp(1234));
+
+ {
+ bucketdb::StorageBucketInfo entry;
+ entry.setBucketInfo(info);
+ entry.disk = 0;
+ _node->getStorageBucketDatabase().insert(id, entry, "foo");
+ }
+
+ _top->open();
+
+ _top->sendDown(api::StorageMessage::SP(msg));
+ if (_top->getNumReplies() == 1) {
+ CPPUNIT_ASSERT_EQUAL(0, (int)_bottom->getNumCommands());
+ CPPUNIT_ASSERT(!static_cast<api::StorageReply&>(
+ *_top->getReply(0)).getResult().success());
+ return true;
+ } else {
+ CPPUNIT_ASSERT_EQUAL(0, (int)_top->getNumReplies());
+
+ // Check that bucket database now has the operation's timestamp as last modified.
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(id, "foo"));
+ CPPUNIT_ASSERT_EQUAL(lastModified, entry->info.getLastModified());
+ }
+
+ return false;
+ }
+}
+
+void BucketManagerTest::testRemoveLastModifiedOK()
+{
+ CPPUNIT_ASSERT(!wasBlockedDueToLastModified(
+ new api::RemoveCommand(document::BucketId(16, 1),
+ document::DocumentId("userdoc:m:1:foo"),
+ api::Timestamp(1235)),
+ 1235));
+}
+
+
+void BucketManagerTest::testRemoveLastModifiedFailed()
+{
+ CPPUNIT_ASSERT(wasBlockedDueToLastModified(
+ new api::RemoveCommand(document::BucketId(16, 1),
+ document::DocumentId("userdoc:m:1:foo"),
+ api::Timestamp(1233)),
+ 1233));
+}
+
+void BucketManagerTest::testDistributionBitGenerationEmpty()
+{
+ TestName("BucketManagerTest::testDistributionBitGenerationEmpty()");
+ setupTestEnvironment();
+ _manager->doneInit();
+ vespalib::Monitor l;
+ _manager->updateMetrics(BucketManager::MetricLockGuard(l));
+ CPPUNIT_ASSERT_EQUAL(58u, _node->getStateUpdater().getReportedNodeState()->getMinUsedBits());
+}
+
+void BucketManagerTest::testDistributionBitChangeOnCreateBucket()
+{
+ TestName("BucketManagerTest::testDistributionBitChangeOnCreateBucket()");
+ setupTestEnvironment();
+ addBucketsToDB(30);
+ _top->open();
+ _node->getDoneInitializeHandler().notifyDoneInitializing();
+ _manager->doneInit();
+ _manager->updateMinUsedBits();
+ CPPUNIT_ASSERT_EQUAL(16u, _node->getStateUpdater().getReportedNodeState()->getMinUsedBits());
+
+ std::shared_ptr<api::CreateBucketCommand> cmd(
+ new api::CreateBucketCommand(document::BucketId(4, 5678)));
+ _top->sendDown(cmd);
+ CPPUNIT_ASSERT_EQUAL(4u, _node->getStateUpdater().getReportedNodeState()->getMinUsedBits());
+}
+
+void BucketManagerTest::testMinUsedBitsFromComponentIsHonored()
+{
+ TestName("BucketManagerTest::testMinUsedBitsFromComponentIsHonored()");
+ setupTestEnvironment();
+ // Let these differ in order to test state update behavior.
+ _node->getComponentRegister().getMinUsedBitsTracker().setMinUsedBits(10);
+ lib::NodeState ns(
+ *_node->getStateUpdater().getReportedNodeState());
+ ns.setMinUsedBits(13);
+ _node->getStateUpdater().setReportedNodeState(ns);
+ addBucketsToDB(30);
+ _top->open();
+ // Don't update metrics, as these will always overwrite the min used bits
+ // if it differs from the db.
+
+ // 12 >= 10, so no update of reported state (left at 13; this should of
+ // course not happen in practice, but used for faking in the test)
+ std::shared_ptr<api::CreateBucketCommand> cmd(
+ new api::CreateBucketCommand(document::BucketId(12, 5678)));
+ _top->sendDown(cmd);
+ CPPUNIT_ASSERT_EQUAL(13u, _node->getStateUpdater().getReportedNodeState()->getMinUsedBits());
+}
+
+void BucketManagerTest::testRequestBucketInfoWithState()
+{
+ TestName("BucketManagerTest::testRequestBucketInfoWithState()");
+ // Test prior to building bucket cache
+ setupTestEnvironment();
+ addBucketsToDB(30);
+ /* Currently this is just queued up
+ {
+ std::shared_ptr<api::RequestBucketInfoCommand> cmd(
+ new api::RequestBucketInfoCommand(
+ 0, lib::ClusterState("distributor:3 .2.s:d storage:1")));
+ _top->sendDown(cmd);
+ _top->waitForMessages(1, 5);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, _top->getNumReplies());
+ std::shared_ptr<api::RequestBucketInfoReply> reply(
+ std::dynamic_pointer_cast<api::RequestBucketInfoReply>(
+ _top->getReply(0)));
+ _top->reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode(api::ReturnCode::NOT_READY),
+ reply->getResult());
+ } */
+ std::vector<lib::ClusterState> states;
+ states.push_back(lib::ClusterState("version:0"));
+ states.push_back(lib::ClusterState("version:1 distributor:1 storage:1"));
+ states.push_back(lib::ClusterState(
+ "version:2 distributor:3 .1.s:i .2.s:d storage:4"));
+ states.push_back(lib::ClusterState(
+ "version:3 distributor:3 .1.s:i .2.s:d storage:4 .3.s:d"));
+ states.push_back(lib::ClusterState(
+ "version:4 distributor:3 .1.s:i .2.s:d storage:4"));
+
+ _node->setClusterState(states.back());
+ for (uint32_t i=0; i<states.size(); ++i) {
+ api::SetSystemStateCommand::SP cmd(
+ new api::SetSystemStateCommand(states[i]));
+ _manager->onDown(cmd);
+ }
+
+ // Send a request bucket info command that will be outdated and failed.
+ std::shared_ptr<api::RequestBucketInfoCommand> cmd1(
+ new api::RequestBucketInfoCommand(0, states[1]));
+ // Send two request bucket info commands that will be processed together
+ // when the bucket manager is idle, as states are equivalent
+ std::shared_ptr<api::RequestBucketInfoCommand> cmd2(
+ new api::RequestBucketInfoCommand(0, states[2]));
+ std::shared_ptr<api::RequestBucketInfoCommand> cmd3(
+ new api::RequestBucketInfoCommand(0, states[3]));
+
+ // Tag server initialized before starting
+ _top->open();
+ _manager->startWorkerThread();
+ _node->getDoneInitializeHandler().notifyDoneInitializing();
+ _manager->doneInit();
+
+ LOG(info, "Sending 3 different request bucket info messages");
+ _top->sendDown(cmd1);
+ _top->sendDown(cmd2);
+ _top->sendDown(cmd3);
+
+ {
+ LOG(info, "Waiting for response from 3 request bucket info messages");
+ _top->waitForMessages(3, 5);
+ ASSERT_DUMMYLINK_REPLY_COUNT(_top, 3);
+ std::map<uint64_t, api::RequestBucketInfoReply::SP> replies;
+ for (uint32_t i=0; i<3; ++i) {
+ replies[_top->getReply(i)->getMsgId()]
+ = std::dynamic_pointer_cast<api::RequestBucketInfoReply>(
+ _top->getReply(i));
+ }
+ std::shared_ptr<api::RequestBucketInfoReply> reply1(
+ replies[cmd1->getMsgId()]);
+ std::shared_ptr<api::RequestBucketInfoReply> reply2(
+ replies[cmd2->getMsgId()]);
+ std::shared_ptr<api::RequestBucketInfoReply> reply3(
+ replies[cmd3->getMsgId()]);
+ _top->reset();
+ CPPUNIT_ASSERT(reply1.get());
+ CPPUNIT_ASSERT(reply2.get());
+ CPPUNIT_ASSERT(reply3.get());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode(api::ReturnCode::REJECTED,
+ "Ignoring bucket info request for cluster state version 1 as "
+ "versions from version 2 differs from this state."),
+ reply1->getResult());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode(api::ReturnCode::REJECTED,
+ "There is already a newer bucket info request for "
+ "this node from distributor 0"),
+ reply2->getResult());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode(api::ReturnCode::OK),
+ reply3->getResult());
+ api::RequestBucketInfoReply::Entry entry;
+
+ CPPUNIT_ASSERT_EQUAL((size_t) 18, reply3->getBucketInfo().size());
+ entry = api::RequestBucketInfoReply::Entry(
+ document::BucketId(16, 0xe8c8), api::BucketInfo(0x79d04f78, 11153, 1851385240u));
+ CPPUNIT_ASSERT_EQUAL(entry, reply3->getBucketInfo()[0]);
+ }
+}
+
+namespace {
+ struct PopenWrapper {
+ FILE* _file;
+ std::vector<char> _buffer;
+ uint32_t _index;
+ uint32_t _size;
+ bool _eof;
+
+ PopenWrapper(const std::string& cmd)
+ : _buffer(65536, '\0'), _index(0), _size(0), _eof(false)
+ {
+ _file = popen(cmd.c_str(), "r");
+ if (_file == 0) {
+ throw vespalib::Exception("Failed to run '" + cmd
+ + "' in popen: " + strerror(errno), VESPA_STRLOC);
+ }
+ }
+
+ const char* getNextLine() {
+ if (_eof && _size == 0) return 0;
+ // Check if we have a newline waiting
+ char* newline = strchr(&_buffer[_index], '\n');
+ // If not try to get one
+ if (_eof) {
+ newline = &_buffer[_index + _size];
+ } else if (newline == 0) {
+ // If we index is passed half the buffer, reposition
+ if (_index > _buffer.size() / 2) {
+ memcpy(&_buffer[0], &_buffer[_index], _size);
+ _index = 0;
+ }
+ // Verify we have space to write to
+ if (_index + _size >= _buffer.size()) {
+ throw vespalib::Exception("No newline could be find in "
+ "half the buffer size. Wrapper not designed to "
+ "handle that long lines (1)", VESPA_STRLOC);
+ }
+ // Fill up buffer
+ size_t bytesRead = fread(&_buffer[_index + _size],
+ 1, _buffer.size() - _index - _size - 1,
+ _file);
+ if (bytesRead == 0) {
+ if (!feof(_file)) {
+ throw vespalib::Exception("Failed to run fgets: "
+ + std::string(strerror(errno)), VESPA_STRLOC);
+ } else {
+ _eof = true;
+ }
+ } else {
+ _size += bytesRead;
+ }
+ newline = strchr(&_buffer[_index], '\n');
+ if (newline == 0) {
+ if (_eof) {
+ if (_size == 0) return 0;
+ } else {
+ throw vespalib::Exception("No newline could be find in "
+ "half the buffer size. Wrapper not designed to "
+ "handle that long lines (2)", VESPA_STRLOC);
+ }
+ }
+ }
+ *newline = '\0';
+ ++newline;
+ const char* line = &_buffer[_index];
+ uint32_t strlen = (newline - line);
+ _index += strlen;
+ _size -= strlen;
+ return line;
+ }
+ };
+}
+
+void BucketManagerTest::testRequestBucketInfoWithList()
+{
+ TestName("BucketManagerTest::testRequestBucketInfoWithList()");
+ setupTestEnvironment();
+ addBucketsToDB(30);
+ _top->open();
+ _node->getDoneInitializeHandler().notifyDoneInitializing();
+ _top->doneInit();
+ {
+ std::vector<document::BucketId> bids;
+ bids.push_back(document::BucketId(16, 0xe8c8));
+
+ std::shared_ptr<api::RequestBucketInfoCommand> cmd(
+ new api::RequestBucketInfoCommand(bids));
+
+ _top->sendDown(cmd);
+ _top->waitForMessages(1, 5);
+ ASSERT_DUMMYLINK_REPLY_COUNT(_top, 1);
+ std::shared_ptr<api::RequestBucketInfoReply> reply(
+ std::dynamic_pointer_cast<api::RequestBucketInfoReply>(
+ _top->getReply(0)));
+ _top->reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode(api::ReturnCode::OK),
+ reply->getResult());
+ if (reply->getBucketInfo().size() > 1) {
+ std::cerr << "Too many replies found\n";
+ for (uint32_t i=0; i<reply->getBucketInfo().size(); ++i) {
+ std::cerr << reply->getBucketInfo()[i] << "\n";
+ }
+ }
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, reply->getBucketInfo().size());
+ api::RequestBucketInfoReply::Entry entry(
+ document::BucketId(16, 0xe8c8),
+ api::BucketInfo(0x79d04f78, 11153, 1851385240u));
+ CPPUNIT_ASSERT_EQUAL(entry, reply->getBucketInfo()[0]);
+ }
+}
+
+void
+BucketManagerTest::testSwallowNotifyBucketChangeReply()
+{
+ TestName("BucketManagerTest::testSwallowNotifyBucketChangeReply()");
+ setupTestEnvironment();
+ addBucketsToDB(30);
+ _top->open();
+ _node->getDoneInitializeHandler().notifyDoneInitializing();
+ _top->doneInit();
+
+ api::NotifyBucketChangeCommand cmd(document::BucketId(1, 16),
+ api::BucketInfo());
+ std::shared_ptr<api::NotifyBucketChangeReply> reply(
+ new api::NotifyBucketChangeReply(cmd));
+
+ _top->sendDown(reply);
+ // Should not leave the bucket manager.
+ CPPUNIT_ASSERT_EQUAL(0, (int)_bottom->getNumCommands());
+}
+
+void
+BucketManagerTest::testMetricsGeneration()
+{
+ setupTestEnvironment();
+ _top->open();
+ // Add 3 buckets; 2 ready, 1 active. 300 docs total, 600 bytes total.
+ for (int i = 0; i < 3; ++i) {
+ bucketdb::StorageBucketInfo entry;
+ entry.disk = 0;
+ api::BucketInfo info(50, 100, 200);
+ if (i > 0) {
+ info.setReady();
+ if (i == 2) {
+ info.setActive();
+ }
+ }
+ entry.setBucketInfo(info);
+ _node->getStorageBucketDatabase().insert(document::BucketId(16, i),
+ entry, "foo");
+ }
+ _node->getDoneInitializeHandler().notifyDoneInitializing();
+ _top->doneInit();
+ vespalib::Monitor l;
+ _manager->updateMetrics(BucketManager::MetricLockGuard(l));
+
+ CPPUNIT_ASSERT_EQUAL(size_t(2), _manager->_metrics->disks.size());
+ const DataStoredMetrics& m(*_manager->_metrics->disks[0]);
+ CPPUNIT_ASSERT_EQUAL(int64_t(3), m.buckets.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(300), m.docs.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(600), m.bytes.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), m.active.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(2), m.ready.getLast());
+}
+
+void
+BucketManagerTest::insertSingleBucket(const document::BucketId& bucket,
+ const api::BucketInfo& info)
+{
+ bucketdb::StorageBucketInfo entry;
+ entry.disk = 0;
+ entry.setBucketInfo(info);
+ _node->getStorageBucketDatabase().insert(bucket, entry, "foo");
+}
+
+void
+BucketManagerTest::waitUntilRequestsAreProcessing(size_t nRequests)
+{
+ while (_manager->bucketInfoRequestsCurrentlyProcessing() != nRequests) {
+ std::this_thread::yield();
+ }
+}
+
+namespace {
+
+struct WithBuckets {
+ std::map<document::BucketId, api::BucketInfo> _bucketsAndInfo;
+
+ WithBuckets& add(const document::BucketId& id,
+ const api::BucketInfo& info)
+ {
+ _bucketsAndInfo[id] = info;
+ return *this;
+ }
+};
+
+} // anon ns
+
+class ConcurrentOperationFixture {
+public:
+ ConcurrentOperationFixture(BucketManagerTest& self)
+ : _self(self),
+ _state("distributor:1 storage:1")
+ {
+ _self.setupTestEnvironment();
+ _self._top->open();
+ _self._node->getDoneInitializeHandler().notifyDoneInitializing();
+ _self._manager->startWorkerThread();
+ _self._top->doneInit();
+
+ // Need a cluster state to work with initially, so that processing
+ // bucket requests can calculate a target distributor.
+ _self._node->setClusterState(_state);
+ _self._manager->onDown(
+ std::make_shared<api::SetSystemStateCommand>(_state));
+ }
+
+ void setUp(const WithBuckets& buckets) {
+ for (auto& b : buckets._bucketsAndInfo) {
+ _self.insertSingleBucket(b.first, b.second);
+ }
+ }
+
+ auto acquireBucketLock(const document::BucketId& bucket) {
+ return _self._node->getStorageBucketDatabase().get(bucket, "foo");
+ }
+
+ auto createRemoveCommand(const document::BucketId& bucket,
+ api::Timestamp timestamp = 123456) const
+ {
+ // Note: this is a dummy message; its contained document ID will not
+ // map to the provided bucket ID (at least it's extremely unlikely..)
+ return std::make_shared<api::RemoveCommand>(
+ bucket,
+ document::DocumentId("id:foo:testdoctype1::bar"),
+ timestamp);
+ }
+
+ auto createPutCommand(const document::BucketId& bucket) const {
+ auto doc = _self._node->getTestDocMan().createDocument(
+ "a foo walks into a bar", "id:foo:testdoctype1::bar1");
+ return std::make_shared<api::PutCommand>(
+ bucket, std::move(doc), api::Timestamp(123456));
+ }
+
+ auto createUpdateCommand(const document::BucketId& bucket) const {
+ auto update = std::make_shared<document::DocumentUpdate>(
+ *_self._node->getTestDocMan().getTypeRepo()
+ .getDocumentType("testdoctype1"),
+ document::DocumentId("id:foo:testdoctype1::bar2"));
+ return std::make_shared<api::UpdateCommand>(
+ bucket, update, api::Timestamp(123456));
+ }
+
+ auto createFullFetchCommand() const {
+ return std::make_shared<api::RequestBucketInfoCommand>(0, _state);
+ }
+
+ auto createFullFetchCommandWithHash(vespalib::stringref hash) const {
+ return std::make_shared<api::RequestBucketInfoCommand>(0, _state, hash);
+ }
+
+ auto acquireBucketLockAndSendInfoRequest(const document::BucketId& bucket) {
+ auto guard = acquireBucketLock(bucket);
+ // Send down processing command which will block.
+ _self._top->sendDown(createFullFetchCommand());
+ // Have to wait until worker thread has started chewing on request
+ // before we can continue, or we can end up in a race where processing
+ // does not start until _after_ we've sent up our bucket-deleting
+ // message. Since we hold a bucket lock, the below function can never
+ // transition false->true->false under our feet, only false->true.
+ _self.waitUntilRequestsAreProcessing(1);
+ return guard;
+ }
+
+ // Currently assumes there is only 1 command of cmd's message type in
+ // the bottom storage link.
+ void bounceWithReply(api::StorageCommand& cmd,
+ api::ReturnCode::Result code = api::ReturnCode::OK,
+ const document::BucketId& remapTo = document::BucketId())
+ {
+ _self._bottom->waitForMessages(1, BucketManagerTest::MESSAGE_WAIT_TIME);
+ // Bounce it back up with an implicitly OK status. This should cause the
+ // bucket manager to avoid reporting deleted buckets in its result set
+ // since these have been "tainted" by a concurrent removal.
+ std::unique_ptr<api::StorageReply> reply(cmd.makeReply());
+ if (remapTo.getRawId() != 0) {
+ dynamic_cast<api::BucketReply&>(*reply).remapBucketId(remapTo);
+ }
+ reply->setResult(code);
+ _self._bottom->getAndRemoveMessage(cmd.getType());
+ _self._bottom->sendUp(std::move(reply));
+ }
+
+ auto awaitAndGetReplies(size_t nReplies) {
+ _self._top->waitForMessages(
+ nReplies, BucketManagerTest::MESSAGE_WAIT_TIME);
+ return _self._top->getReplies();
+ }
+
+ void assertOrderedAfterBucketReply(size_t nBucketReplies,
+ const api::MessageType& msgType)
+ {
+ const size_t nTotal = nBucketReplies + 1;
+ auto replies = awaitAndGetReplies(nTotal);
+ CPPUNIT_ASSERT_EQUAL(nTotal, replies.size());
+ for (size_t i = 0; i < nBucketReplies; ++i) {
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::REQUESTBUCKETINFO_REPLY,
+ replies[i]->getType());
+ }
+ CPPUNIT_ASSERT_EQUAL(msgType, replies[nBucketReplies]->getType());
+ }
+
+ void assertReplyOrdering(
+ const std::vector<const api::MessageType*>& replyTypes)
+ {
+ auto replies = awaitAndGetReplies(replyTypes.size());
+ CPPUNIT_ASSERT_EQUAL(replyTypes.size(), replies.size());
+ for (size_t i = 0; i < replyTypes.size(); ++i) {
+ CPPUNIT_ASSERT_EQUAL(*replyTypes[i], replies[i]->getType());
+ }
+ }
+
+ void clearReceivedReplies() {
+ _self._top->getRepliesOnce();
+ }
+
+private:
+ BucketManagerTest& _self;
+ lib::ClusterState _state;
+};
+
+void
+BucketManagerTest::testSplitReplyOrderedAfterBucketReply()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucketA(17, 0);
+ document::BucketId bucketB(17, 1);
+ fixture.setUp(WithBuckets()
+ .add(bucketA, api::BucketInfo(50, 100, 200))
+ .add(bucketB, api::BucketInfo(100, 200, 400)));
+ auto guard = fixture.acquireBucketLockAndSendInfoRequest(bucketB);
+
+ // Split bucket A to model a concurrent modification to an already fetched
+ // bucket.
+ auto splitCmd = std::make_shared<api::SplitBucketCommand>(bucketA);
+ _top->sendDown(splitCmd);
+ fixture.bounceWithReply(*splitCmd);
+ // Let bucket manager breathe again.
+ guard.unlock();
+
+ fixture.assertOrderedAfterBucketReply(
+ 1, api::MessageType::SPLITBUCKET_REPLY);
+}
+
+void
+BucketManagerTest::testJoinReplyOrderedAfterBucketReply()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucketA(17, 0);
+ document::BucketId bucketB(17, 1 << 16);
+ document::BucketId parent(16, 0);
+ fixture.setUp(WithBuckets()
+ .add(bucketA, api::BucketInfo(50, 100, 200))
+ .add(bucketB, api::BucketInfo(100, 200, 400)));
+ auto guard = fixture.acquireBucketLockAndSendInfoRequest(bucketB);
+
+ auto joinCmd = std::make_shared<api::JoinBucketsCommand>(parent);
+ joinCmd->getSourceBuckets().assign({bucketA, bucketB});
+ _top->sendDown(joinCmd);
+ fixture.bounceWithReply(*joinCmd);
+
+ guard.unlock();
+ fixture.assertOrderedAfterBucketReply(
+ 1, api::MessageType::JOINBUCKETS_REPLY);
+}
+
+// Technically, deletes being ordered after bucket info replies won't help
+// correctness since buckets are removed from the distributor DB upon _sending_
+// the delete and not receiving it.
+void
+BucketManagerTest::testDeleteReplyOrderedAfterBucketReply()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucketA(17, 0);
+ document::BucketId bucketB(17, 1);
+ fixture.setUp(WithBuckets()
+ .add(bucketA, api::BucketInfo(50, 100, 200))
+ .add(bucketB, api::BucketInfo(100, 200, 400)));
+ auto guard = fixture.acquireBucketLockAndSendInfoRequest(bucketB);
+
+ auto deleteCmd = std::make_shared<api::DeleteBucketCommand>(bucketA);
+ _top->sendDown(deleteCmd);
+ fixture.bounceWithReply(*deleteCmd);
+
+ guard.unlock();
+
+ fixture.assertOrderedAfterBucketReply(
+ 1, api::MessageType::DELETEBUCKET_REPLY);
+}
+
+void
+BucketManagerTest::testOnlyEnqueueWhenProcessingRequest()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucketA(17, 0);
+ fixture.setUp(WithBuckets()
+ .add(bucketA, api::BucketInfo(50, 100, 200)));
+
+ // Process delete command _before_ processing bucket requests.
+ auto deleteCmd = std::make_shared<api::DeleteBucketCommand>(bucketA);
+ _top->sendDown(deleteCmd);
+ fixture.bounceWithReply(*deleteCmd);
+ // Should arrive happily on the top.
+ _top->waitForMessages(1, MESSAGE_WAIT_TIME);
+}
+
+// Bucket info requests that contain a specific set of buckets are handled
+// differently than full bucket info fetches and are not delegated to the
+// worker thread. We still require that any split/joins etc are ordered after
+// this reply if their reply is sent up concurrently.
+void
+BucketManagerTest::testOrderRepliesAfterBucketSpecificRequest()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucketA(17, 0);
+ fixture.setUp(WithBuckets()
+ .add(bucketA, api::BucketInfo(50, 100, 200)));
+
+ auto guard = fixture.acquireBucketLock(bucketA);
+
+ auto infoRoundtrip = std::async(std::launch::async, [&]() {
+ std::vector<document::BucketId> buckets{bucketA};
+ auto infoCmd = std::make_shared<api::RequestBucketInfoCommand>(buckets);
+ // Can't complete until `guard` has been unlocked.
+ _top->sendDown(infoCmd);
+ // Barrier: bucket reply and subsequent split reply
+ _top->waitForMessages(2, MESSAGE_WAIT_TIME);
+ });
+ waitUntilRequestsAreProcessing();
+ // Barrier: roundtrip thread now blocked. Send a split whose reply shall
+ // be enqueued since there's a RequestBucketInfo currently doing its thing.
+ auto splitCmd = std::make_shared<api::SplitBucketCommand>(bucketA);
+ _top->sendDown(splitCmd);
+ // Enqueuing happens synchronously in this thread, so no need for further
+ // synchronization.
+ fixture.bounceWithReply(*splitCmd);
+
+ guard.unlock();
+ infoRoundtrip.get();
+ // At this point, we know 2 messages are in the top queue since the
+ // async future guarantees this for completion.
+ fixture.assertOrderedAfterBucketReply(
+ 1, api::MessageType::SPLITBUCKET_REPLY);
+}
+
+// Test is similar to testOrderRepliesAfterBucketSpecificRequest, but has
+// two concurrent bucket info request processing instances going on; one in
+// the worker thread and one in the message chain itself. Since we only have
+// one queue, we must wait with dispatching replies until _all_ processing
+// has ceased.
+void
+BucketManagerTest::testQueuedRepliesOnlyDispatchedWhenAllProcessingDone()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucketA(17, 0);
+ fixture.setUp(WithBuckets()
+ .add(bucketA, api::BucketInfo(50, 100, 200)));
+
+ auto guard = fixture.acquireBucketLock(bucketA);
+
+ auto singleBucketInfo = std::async(std::launch::async, [&]() {
+ std::vector<document::BucketId> buckets{bucketA};
+ auto infoCmd = std::make_shared<api::RequestBucketInfoCommand>(buckets);
+ _top->sendDown(infoCmd);
+ _top->waitForMessages(3, MESSAGE_WAIT_TIME);
+ });
+ waitUntilRequestsAreProcessing(1);
+ auto fullFetch = std::async(std::launch::async, [&]() {
+ _top->sendDown(fixture.createFullFetchCommand());
+ _top->waitForMessages(3, MESSAGE_WAIT_TIME);
+ });
+ waitUntilRequestsAreProcessing(2);
+ auto splitCmd = std::make_shared<api::SplitBucketCommand>(bucketA);
+ _top->sendDown(splitCmd);
+ fixture.bounceWithReply(*splitCmd);
+
+ guard.unlock();
+ singleBucketInfo.get();
+ fullFetch.get();
+
+ fixture.assertOrderedAfterBucketReply(
+ 2, api::MessageType::SPLITBUCKET_REPLY);
+}
+
+// Hide boring, repetetive code to allow for chaining of setters (and auto-
+// generation of getters and member vars) behind a macro.
+#ifdef BUILDER_PARAM
+# error "Redefinition of existing macro `BUILDER_PARAM`"
+#endif
+#define BUILDER_PARAM(type, name) \
+ type _ ## name; \
+ auto& name(const type& name ## _) { _ ## name = name ## _; return *this; } \
+ const type & name() const { return _ ## name; }
+
+struct TestParams {
+ BUILDER_PARAM(document::BucketId, bucket);
+ BUILDER_PARAM(document::BucketId, remappedTo);
+ BUILDER_PARAM(api::StorageCommand::SP, documentMutation);
+ BUILDER_PARAM(api::StorageCommand::SP, treeMutation);
+ BUILDER_PARAM(std::vector<const api::MessageType*>, expectedOrdering);
+};
+
+void
+BucketManagerTest::doTestMutationOrdering(
+ ConcurrentOperationFixture& fixture,
+ const TestParams& params)
+{
+ fixture.setUp(WithBuckets()
+ .add(params.bucket(), api::BucketInfo(50, 100, 200)));
+ // Have to send down mutating command _before_ we take bucket lock, as the
+ // bucket manager acquires a lock for bucket on the way down in order to
+ // check the timestamp of the message vs the last modified timestamp of
+ // the bucket itself (offers some time travelling clock protection).
+ _top->sendDown(params.documentMutation());
+ auto guard = fixture.acquireBucketLockAndSendInfoRequest(params.bucket());
+
+ _top->sendDown(params.treeMutation());
+ // Unless "conflicting" mutation replies are enqueued after splits et al,
+ // they will bypass the lock and arrive in an inverse order of execution
+ // at the distributor. Note that we send replies in the opposite order their
+ // commands were sent down, but this is an artifact of ordering commands
+ // to avoid test deadlocks, and priorities may alter the execution order
+ // anyway. The important thing is that reply orders are not altered.
+ fixture.bounceWithReply(*params.treeMutation());
+ fixture.bounceWithReply(*params.documentMutation(),
+ api::ReturnCode::OK,
+ params.remappedTo());
+ guard.unlock();
+
+ fixture.assertReplyOrdering(params.expectedOrdering());
+}
+
+void
+BucketManagerTest::doTestConflictingReplyIsEnqueued(
+ const document::BucketId& bucket,
+ const api::StorageCommand::SP& treeMutationCmd,
+ const api::MessageType& treeMutationReplyType)
+{
+ ConcurrentOperationFixture fixture(*this);
+
+ // We don't check all combinations of document operation replies vs
+ // bucket operation replies, just RemoveReply vs all bucket ops.
+ auto params = TestParams()
+ .bucket(bucket)
+ .documentMutation(fixture.createRemoveCommand(bucket))
+ .treeMutation(treeMutationCmd)
+ .expectedOrdering({&api::MessageType::REQUESTBUCKETINFO_REPLY,
+ &treeMutationReplyType,
+ &api::MessageType::REMOVE_REPLY});
+
+ doTestMutationOrdering(fixture, params);
+}
+
+void
+BucketManagerTest::testMutationRepliesForSplitBucketAreEnqueued()
+{
+ document::BucketId bucket(17, 0);
+ doTestConflictingReplyIsEnqueued(
+ bucket,
+ std::make_shared<api::SplitBucketCommand>(bucket),
+ api::MessageType::SPLITBUCKET_REPLY);
+}
+
+void
+BucketManagerTest::testMutationRepliesForDeletedBucketAreEnqueued()
+{
+ document::BucketId bucket(17, 0);
+ doTestConflictingReplyIsEnqueued(
+ bucket,
+ std::make_shared<api::DeleteBucketCommand>(bucket),
+ api::MessageType::DELETEBUCKET_REPLY);
+}
+
+void
+BucketManagerTest::testMutationRepliesForJoinedBucketAreEnqueued()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucketA(17, 0);
+ document::BucketId bucketB(17, 1 << 16);
+ document::BucketId parent(16, 0);
+ // We only test for the parent bucket, since that's what queued operations
+ // will be remapped to after a successful join.
+ auto joinCmd = std::make_shared<api::JoinBucketsCommand>(parent);
+ joinCmd->getSourceBuckets().assign({bucketA, bucketB});
+
+ auto params = TestParams()
+ .bucket(parent)
+ .documentMutation(fixture.createRemoveCommand(parent))
+ .treeMutation(joinCmd)
+ .expectedOrdering({&api::MessageType::REQUESTBUCKETINFO_REPLY,
+ &api::MessageType::JOINBUCKETS_REPLY,
+ &api::MessageType::REMOVE_REPLY});
+
+ doTestMutationOrdering(fixture, params);
+}
+
+void
+BucketManagerTest::testConflictingPutRepliesAreEnqueued()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucket(17, 0);
+
+ auto params = TestParams()
+ .bucket(bucket)
+ .documentMutation(fixture.createPutCommand(bucket))
+ .treeMutation(std::make_shared<api::SplitBucketCommand>(bucket))
+ .expectedOrdering({&api::MessageType::REQUESTBUCKETINFO_REPLY,
+ &api::MessageType::SPLITBUCKET_REPLY,
+ &api::MessageType::PUT_REPLY});
+
+ doTestMutationOrdering(fixture, params);
+}
+
+void
+BucketManagerTest::testConflictingUpdateRepliesAreEnqueued()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucket(17, 0);
+
+ auto params = TestParams()
+ .bucket(bucket)
+ .documentMutation(fixture.createUpdateCommand(bucket))
+ .treeMutation(std::make_shared<api::SplitBucketCommand>(bucket))
+ .expectedOrdering({&api::MessageType::REQUESTBUCKETINFO_REPLY,
+ &api::MessageType::SPLITBUCKET_REPLY,
+ &api::MessageType::UPDATE_REPLY});
+
+ doTestMutationOrdering(fixture, params);
+}
+
+/**
+ * After a split or join, any messages bound for the original bucket(s) that
+ * are currently in the persistence queues will be remapped to the bucket
+ * resulting from the operation. We have to make sure remapped operations are
+ * enqueued as well.
+ */
+void
+BucketManagerTest::testRemappedMutationIsCheckedAgainstOriginalBucket()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucket(17, 0);
+ document::BucketId remappedToBucket(18, 0);
+
+ auto params = TestParams()
+ .bucket(bucket)
+ .documentMutation(fixture.createRemoveCommand(bucket))
+ .remappedTo(remappedToBucket)
+ .treeMutation(std::make_shared<api::SplitBucketCommand>(bucket))
+ .expectedOrdering({&api::MessageType::REQUESTBUCKETINFO_REPLY,
+ &api::MessageType::SPLITBUCKET_REPLY,
+ &api::MessageType::REMOVE_REPLY});
+
+ doTestMutationOrdering(fixture, params);
+}
+
+void
+BucketManagerTest::scheduleBucketInfoRequestWithConcurrentOps(
+ ConcurrentOperationFixture& fixture,
+ const document::BucketId& bucketForRemove,
+ const document::BucketId& bucketForSplit,
+ api::Timestamp mutationTimestamp)
+{
+ auto mutation(
+ fixture.createRemoveCommand(bucketForRemove, mutationTimestamp));
+ _top->sendDown(mutation);
+ auto guard = fixture.acquireBucketLockAndSendInfoRequest(
+ bucketForRemove);
+
+ auto conflictingOp(
+ std::make_shared<api::SplitBucketCommand>(bucketForSplit));
+ _top->sendDown(conflictingOp);
+ fixture.bounceWithReply(*conflictingOp);
+ fixture.bounceWithReply(*mutation);
+ guard.unlock();
+}
+
+void
+BucketManagerTest::testBucketConflictSetIsClearedBetweenBlockingRequests()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId firstConflictBucket(17, 0);
+ document::BucketId secondConflictBucket(18, 0);
+
+ fixture.setUp(WithBuckets()
+ .add(firstConflictBucket, api::BucketInfo(50, 100, 200))
+ .add(secondConflictBucket, api::BucketInfo(60, 200, 300)));
+
+ // Do a single round of starting and completing a request bucket info
+ // command with queueing and adding of `firstConflictBucket` to the set
+ // of conflicting buckets.
+ scheduleBucketInfoRequestWithConcurrentOps(
+ fixture, firstConflictBucket,
+ firstConflictBucket, api::Timestamp(1000));
+
+ // Barrier for completion of first round of replies. Subsequently remove
+ // all replies to get a clean slate.
+ fixture.awaitAndGetReplies(3);
+ fixture.clearReceivedReplies();
+
+ // Do a second round with a different bucket as the conflict. The
+ // mutation towards the first conflict bucket should now _not_ be queued
+ // as it was for an entirely different request bucket round.
+ scheduleBucketInfoRequestWithConcurrentOps(
+ fixture, firstConflictBucket,
+ secondConflictBucket, api::Timestamp(1001));
+
+ // Remove is not ordered after the split here since it should not be
+ // queued.
+ fixture.assertReplyOrdering({&api::MessageType::REMOVE_REPLY,
+ &api::MessageType::REQUESTBUCKETINFO_REPLY,
+ &api::MessageType::SPLITBUCKET_REPLY});
+}
+
+void
+BucketManagerTest::sendSingleBucketInfoRequest(const document::BucketId& id)
+{
+ std::vector<document::BucketId> buckets{id};
+ auto infoCmd = std::make_shared<api::RequestBucketInfoCommand>(buckets);
+ _top->sendDown(infoCmd);
+}
+
+void
+BucketManagerTest::testConflictSetOnlyClearedAfterAllBucketRequestsDone()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucketA(16, 0);
+ document::BucketId bucketB(16, 1);
+
+ fixture.setUp(WithBuckets()
+ .add(bucketA, api::BucketInfo(50, 100, 200))
+ .add(bucketB, api::BucketInfo(60, 200, 300)));
+
+ auto mutation = fixture.createRemoveCommand(bucketA);
+ _top->sendDown(mutation);
+
+ auto guardA = fixture.acquireBucketLock(bucketA);
+ auto guardB = fixture.acquireBucketLock(bucketB);
+
+ auto singleBucketInfoA = std::async(std::launch::async, [&]() {
+ sendSingleBucketInfoRequest(bucketA);
+ _top->waitForMessages(4, MESSAGE_WAIT_TIME);
+ });
+ waitUntilRequestsAreProcessing(1);
+ auto singleBucketInfoB = std::async(std::launch::async, [&]() {
+ sendSingleBucketInfoRequest(bucketB);
+ _top->waitForMessages(4, MESSAGE_WAIT_TIME);
+ });
+ // Barrier: after this point, both tasks are in the protected section.
+ // Neither async bucket info request can proceed as long as there are
+ // guards holding their desired bucket locks.
+ waitUntilRequestsAreProcessing(2);
+
+ auto conflictingOp = std::make_shared<api::SplitBucketCommand>(bucketA);
+ _top->sendDown(conflictingOp);
+ fixture.bounceWithReply(*conflictingOp);
+ // Releasing guard A (and allowing the request for A to go through) should
+ // _not_ clear the conflict set. I.e. if we send a mutation reply for a
+ // conflicted bucket up at this point, it should be enqueued after the
+ // split reply.
+ guardA.unlock();
+ _top->waitForMessages(1, MESSAGE_WAIT_TIME); // Completion barrier for A.
+ fixture.bounceWithReply(*mutation);
+ // Allow B to go through. This _should_ clear the conflict set and dequeue
+ // any conflicted mutations after their conflicting ops.
+ guardB.unlock();
+ singleBucketInfoA.get();
+ singleBucketInfoB.get();
+ // Note: request bucket info reply is dispatched up _before_ protected
+ // section guard goes out of scope, so reply is ordered before conflicts.
+ fixture.assertReplyOrdering({&api::MessageType::REQUESTBUCKETINFO_REPLY,
+ &api::MessageType::REQUESTBUCKETINFO_REPLY,
+ &api::MessageType::SPLITBUCKET_REPLY,
+ &api::MessageType::REMOVE_REPLY});
+}
+
+void
+BucketManagerTest::assertRequestWithBadHashIsRejected(
+ ConcurrentOperationFixture& fixture)
+{
+ // Test by default sets up 10 nodes in config. Pretend we only know of 3.
+ auto infoCmd = fixture.createFullFetchCommandWithHash("(0;0;1;2)");
+ _top->sendDown(infoCmd);
+ auto replies = fixture.awaitAndGetReplies(1);
+ auto& reply = dynamic_cast<api::RequestBucketInfoReply&>(*replies[0]);
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::REJECTED,
+ reply.getResult().getResult());
+}
+
+void
+BucketManagerTest::testRejectRequestWithMismatchingDistributionHash()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucket(17, 0);
+ fixture.setUp(WithBuckets().add(bucket, api::BucketInfo(50, 100, 200)));
+ assertRequestWithBadHashIsRejected(fixture);
+}
+
+void
+BucketManagerTest::testDbNotIteratedWhenAllRequestsRejected()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucket(17, 0);
+ fixture.setUp(WithBuckets().add(bucket, api::BucketInfo(50, 100, 200)));
+ auto guard = fixture.acquireBucketLock(bucket);
+ // We've got a bucket locked, so iff the manager actually starts processing
+ // buckets even though it has no requests active, it will stall while
+ // waiting for the lock to be released. When we then send down an additional
+ // bucket info request, this request will either be rejected immediately (if
+ // the db is NOT processed) or time out and fail the test.
+ assertRequestWithBadHashIsRejected(fixture);
+ fixture.clearReceivedReplies();
+
+ auto infoCmd = fixture.createFullFetchCommandWithHash("(0;0;1;2)");
+ _top->sendDown(infoCmd);
+ auto replies = fixture.awaitAndGetReplies(1);
+}
+
+/**
+ * Accept bucket info requests if their distribution hash is a valid permutation
+ * of our own config (i.e. they are set-wise identical even though the
+ * ordering of nodes may differ). See VESPA-1980 for context.
+ */
+void
+BucketManagerTest::testReceivedDistributionHashIsNormalized()
+{
+ ConcurrentOperationFixture fixture(*this);
+ document::BucketId bucket(17, 0);
+ fixture.setUp(WithBuckets().add(bucket, api::BucketInfo(50, 100, 200)));
+
+ // Test is configured with 10 nodes in increasing order. Jumble the order
+ // around.
+ auto infoCmd = fixture.createFullFetchCommandWithHash(
+ "(0;2;1;3;9;6;4;5;8;7;0)");
+ _top->sendDown(infoCmd);
+ auto replies = fixture.awaitAndGetReplies(1);
+ auto& reply = dynamic_cast<api::RequestBucketInfoReply&>(*replies[0]);
+ // Should NOT have been rejected despite hash not matching config order
+ // verbatim.
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::OK, reply.getResult().getResult());
+}
+
+} // storage
diff --git a/storage/src/tests/bucketdb/distribution_hash_normalizer_test.cpp b/storage/src/tests/bucketdb/distribution_hash_normalizer_test.cpp
new file mode 100644
index 00000000000..7734e1054ff
--- /dev/null
+++ b/storage/src/tests/bucketdb/distribution_hash_normalizer_test.cpp
@@ -0,0 +1,114 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/bucketdb/distribution_hash_normalizer.h>
+#include <string>
+
+namespace storage {
+
+using Normalizer = DistributionHashNormalizer;
+
+class DistributionHashNormalizerTest : public CppUnit::TestFixture {
+public:
+ CPPUNIT_TEST_SUITE(DistributionHashNormalizerTest);
+ CPPUNIT_TEST(orderNonHierarchicRootGroupNodesByDistributionKey);
+ CPPUNIT_TEST(mayHaveSameGroupIndexAsNodeIndex);
+ CPPUNIT_TEST(emitOptionalCapacityForRootGroup);
+ CPPUNIT_TEST(emitOptionalCapacityForSubGroups);
+ CPPUNIT_TEST(hierarchicGroupsAreOrderedByGroupIndex);
+ CPPUNIT_TEST(subgroupsOrderedOnEachNestingLevel);
+ CPPUNIT_TEST(distributionSpecIsCopiedVerbatim);
+ CPPUNIT_TEST(emptyInputYieldsEmptyOutput);
+ CPPUNIT_TEST(parseFailureReturnsInputVerbatim);
+ CPPUNIT_TEST_SUITE_END();
+
+ void orderNonHierarchicRootGroupNodesByDistributionKey();
+ void mayHaveSameGroupIndexAsNodeIndex();
+ void emitOptionalCapacityForRootGroup();
+ void emitOptionalCapacityForSubGroups();
+ void hierarchicGroupsAreOrderedByGroupIndex();
+ void subgroupsOrderedOnEachNestingLevel();
+ void distributionSpecIsCopiedVerbatim();
+ void emptyInputYieldsEmptyOutput();
+ void parseFailureReturnsInputVerbatim();
+
+private:
+ DistributionHashNormalizer _normalizer;
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(DistributionHashNormalizerTest);
+
+void
+DistributionHashNormalizerTest::orderNonHierarchicRootGroupNodesByDistributionKey()
+{
+ // Group index is first in list.
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("(1;0;2;3;4;7)"),
+ _normalizer.normalize("(1;4;7;2;0;3)"));
+}
+
+void
+DistributionHashNormalizerTest::mayHaveSameGroupIndexAsNodeIndex()
+{
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("(0;0;2;3;4;7)"),
+ _normalizer.normalize("(0;4;7;2;0;3)"));
+}
+
+void
+DistributionHashNormalizerTest::emitOptionalCapacityForRootGroup()
+{
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("(0c12.5;1;2;3;4;7)"),
+ _normalizer.normalize("(0c12.5;1;4;7;2;3)"));
+}
+
+void
+DistributionHashNormalizerTest::emitOptionalCapacityForSubGroups()
+{
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("(0d1|*(1c5.5;1)(2;2)(3c7;3))"),
+ _normalizer.normalize("(0d1|*(2;2)(1c5.5;1)(3c7;3))"));
+}
+
+void
+DistributionHashNormalizerTest::hierarchicGroupsAreOrderedByGroupIndex()
+{
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("(0d1|*(0;0)(1;1)(3;3))"),
+ _normalizer.normalize("(0d1|*(3;3)(1;1)(0;0))"));
+}
+
+void
+DistributionHashNormalizerTest::subgroupsOrderedOnEachNestingLevel()
+{
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("(0d1|*(1d3|*(2;2)(3;3))"
+ "(4;1)(7d2|*(5;5)(6;6)))"),
+ _normalizer.normalize("(0d1|*(7d2|*(6;6)(5;5))"
+ "(1d3|*(2;2)(3;3))(4;1))"));
+}
+
+void
+DistributionHashNormalizerTest::distributionSpecIsCopiedVerbatim()
+{
+ // Definitely don't want to do any ordering of the distribution spec.
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("(0d3|2|1|*(0;0)(1;1)(3;3))"),
+ _normalizer.normalize("(0d3|2|1|*(3;3)(1;1)(0;0))"));
+}
+
+void
+DistributionHashNormalizerTest::emptyInputYieldsEmptyOutput()
+{
+ // Technically a parse failure (only 4.2 has this behavior), but it's
+ // explicitly checked for in BucketManager, so let's test it explicitly
+ // here as well.
+ CPPUNIT_ASSERT_EQUAL(vespalib::string(""), _normalizer.normalize(""));
+}
+
+// In the (unlikely) case that the parser somehow fails to capture all possible
+// valid values of the distribution hash, fall back to returning the non-
+// normalized string. A log warning will also be emitted (though that's not
+// testable).
+void
+DistributionHashNormalizerTest::parseFailureReturnsInputVerbatim()
+{
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("onkel skrue"),
+ _normalizer.normalize("onkel skrue"));
+}
+
+} // storage
+
diff --git a/storage/src/tests/bucketdb/initializertest.cpp b/storage/src/tests/bucketdb/initializertest.cpp
new file mode 100644
index 00000000000..169150a7ff9
--- /dev/null
+++ b/storage/src/tests/bucketdb/initializertest.cpp
@@ -0,0 +1,924 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * Tests storage initialization without depending on persistence layer.
+ */
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/bucketdb/storagebucketdbinitializer.h>
+
+#include <vespa/document/base/testdocman.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/state.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/dummystoragelink.h>
+#include <tests/common/testhelper.h>
+#include <vespa/vdstestlib/cppunit/dirconfig.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+LOG_SETUP(".test.bucketdb.initializing");
+
+namespace storage {
+
+typedef uint16_t PartitionId;
+
+struct InitializerTest : public CppUnit::TestFixture {
+
+ class InitParams {
+ vdstestlib::DirConfig config;
+ bool configFinalized;
+
+ public:
+ uint32_t bucketBitsUsed;
+ NodeIndex nodeIndex;
+ NodeCount nodeCount;
+ Redundancy redundancy;
+ uint32_t docsPerDisk;
+ DiskCount diskCount;
+ std::set<uint32_t> disksDown;
+ bool bucketWrongDisk;
+ bool bucketMultipleDisks;
+ bool failingListRequest;
+ bool failingInfoRequest;
+
+ InitParams()
+ : config(getStandardConfig(true)),
+ configFinalized(false),
+ bucketBitsUsed(4),
+ nodeIndex(0),
+ nodeCount(10),
+ redundancy(2),
+ docsPerDisk(10),
+ diskCount(5),
+ bucketWrongDisk(false),
+ bucketMultipleDisks(false),
+ failingListRequest(false),
+ failingInfoRequest(false) {}
+
+ void setAllFailures() {
+ bucketWrongDisk = true;
+ bucketMultipleDisks = true;
+ failingListRequest = true;
+ failingInfoRequest = true;
+ }
+
+ vdstestlib::DirConfig& getConfig() {
+ if (!configFinalized) {
+ config.getConfig("stor-server")
+ .setValue("node_index", nodeIndex);
+ config.getConfig("stor-distribution")
+ .setValue("redundancy", redundancy);
+ configFinalized = true;
+ }
+ return config;
+ }
+
+ };
+
+ document::TestDocMan _docMan;
+
+ void testInitialization(InitParams& params);
+
+ /**
+ * Test that the status page can be shown during init without a deadlock
+ * or crash or anything. Don't validate much output, it might change.
+ */
+ void testStatusPage();
+
+ /** Test initializing with an empty node. */
+ void testInitEmptyNode() {
+ InitParams params;
+ params.docsPerDisk = 0;
+ testInitialization(params);
+ }
+ /** Test initializing with some data on single disk. */
+ void testInitSingleDisk() {
+ InitParams params;
+ params.diskCount = DiskCount(1);
+ testInitialization(params);
+ }
+ /** Test initializing with multiple disks. */
+ void testInitMultiDisk() {
+ InitParams params;
+ testInitialization(params);
+ }
+ /** Test initializing with one of the disks being bad. */
+ void testInitFailingMiddleDisk() {
+ InitParams params;
+ params.disksDown.insert(1);
+ testInitialization(params);
+ }
+ /** Test initializing with last disk being bad. */
+ void testInitFailingLastDisk() {
+ InitParams params;
+ params.disksDown.insert(params.diskCount - 1);
+ testInitialization(params);
+ }
+ /** Test initializing with bucket on wrong disk. */
+ void testInitBucketOnWrongDisk() {
+ InitParams params;
+ params.bucketWrongDisk = true;
+ params.bucketBitsUsed = 58;
+ testInitialization(params);
+ }
+ /** Test initializing with bucket on multiple disks. */
+ void testInitBucketOnMultipleDisks() {
+ InitParams params;
+ params.bucketMultipleDisks = true;
+ params.bucketBitsUsed = 58;
+ testInitialization(params);
+ }
+ /** Test initializing with failing list request. */
+ void testInitFailingListRequest() {
+ InitParams params;
+ params.failingListRequest = true;
+ testInitialization(params);
+ }
+ void testInitFailingInfoRequest() {
+ InitParams params;
+ params.failingInfoRequest = true;
+ testInitialization(params);
+ }
+ /** Test initializing with everything being wrong at once. */
+ void testAllFailures() {
+ InitParams params;
+ params.docsPerDisk = 100;
+ params.diskCount = DiskCount(10);
+ params.disksDown.insert(0);
+ params.disksDown.insert(2);
+ params.disksDown.insert(3);
+ params.disksDown.insert(9);
+ params.setAllFailures();
+ testInitialization(params);
+ }
+ void testCommandBlockingDuringInit();
+
+ void testBucketProgressCalculator();
+
+ void testBucketsInitializedByLoad();
+
+ CPPUNIT_TEST_SUITE(InitializerTest);
+ CPPUNIT_TEST(testInitEmptyNode);
+ CPPUNIT_TEST(testInitSingleDisk);
+ CPPUNIT_TEST(testInitMultiDisk);
+ CPPUNIT_TEST(testInitFailingMiddleDisk);
+ CPPUNIT_TEST(testInitFailingLastDisk);
+ CPPUNIT_TEST(testInitBucketOnWrongDisk);
+ //CPPUNIT_TEST(testInitBucketOnMultipleDisks);
+ //CPPUNIT_TEST(testStatusPage);
+ //CPPUNIT_TEST(testCommandBlockingDuringInit);
+ //CPPUNIT_TEST(testAllFailures);
+ CPPUNIT_TEST(testBucketProgressCalculator);
+ CPPUNIT_TEST(testBucketsInitializedByLoad);
+ CPPUNIT_TEST_SUITE_END();
+
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(InitializerTest);
+
+namespace {
+// Data kept on buckets we're using in test.
+struct BucketData {
+ api::BucketInfo info;
+
+ BucketData() : info(0, 0, 0, 0, 0) {
+ }
+
+ BucketData operator+(const BucketData& other) const {
+ BucketData copy;
+ copy.info.setDocumentCount(
+ info.getDocumentCount() + other.info.getDocumentCount());
+ copy.info.setTotalDocumentSize(
+ info.getTotalDocumentSize()
+ + other.info.getTotalDocumentSize());
+ copy.info.setChecksum(
+ info.getChecksum() * other.info.getChecksum());
+ return copy;
+ }
+};
+// Data reciding on one disk
+typedef std::map<document::BucketId, BucketData> DiskData;
+struct BucketInfoLogger {
+ std::map<PartitionId, DiskData>& map;
+
+ BucketInfoLogger(std::map<PartitionId, DiskData>& m)
+ : map(m) {}
+
+ StorBucketDatabase::Decision operator()(
+ uint64_t revBucket, StorBucketDatabase::Entry& entry)
+ {
+ document::BucketId bucket(
+ document::BucketId::keyToBucketId(revBucket));
+ CPPUNIT_ASSERT(bucket.getRawId() != 0);
+ CPPUNIT_ASSERT_MSG(
+ "Found invalid bucket in database: " + bucket.toString()
+ + " " + entry.getBucketInfo().toString(),
+ entry.getBucketInfo().valid());
+ DiskData& ddata(map[entry.disk]);
+ BucketData& bdata(ddata[bucket]);
+ bdata.info = entry.getBucketInfo();
+ return StorBucketDatabase::CONTINUE;
+ }
+};
+std::map<PartitionId, DiskData>
+createMapFromBucketDatabase(StorBucketDatabase& db) {
+ std::map<PartitionId, DiskData> result;
+ BucketInfoLogger infoLogger(result);
+ db.all(infoLogger, "createmap");
+ return result;
+}
+// Create data we want to have in this test
+std::map<PartitionId, DiskData>
+buildBucketInfo(const document::TestDocMan& docMan,
+ InitializerTest::InitParams& params)
+{
+ std::map<PartitionId, DiskData> result;
+ for (uint32_t i=0; i<params.diskCount; ++i) {
+ if (params.disksDown.find(i) == params.disksDown.end()) {
+ result[i];
+ }
+ }
+ lib::Distribution distribution(
+ lib::Distribution::getDefaultDistributionConfig(
+ params.redundancy, params.nodeCount));
+ document::BucketIdFactory bucketIdFactory;
+ lib::NodeState nodeState;
+ nodeState.setDiskCount(params.diskCount);
+
+ uint64_t totalDocs = params.docsPerDisk * params.diskCount;
+ for (uint32_t i=0, n=totalDocs; i<n; ++i) {
+ bool useWrongDisk = false;
+ if (i == 1 && params.bucketWrongDisk) {
+ useWrongDisk = true;
+ }
+ document::Document::SP doc(docMan.createRandomDocument(i));
+ if (i == 3 && params.bucketMultipleDisks) {
+ doc = docMan.createRandomDocument(i - 1);
+ useWrongDisk = true;
+ }
+ document::BucketId bid(bucketIdFactory.getBucketId(doc->getId()));
+ bid.setUsedBits(params.bucketBitsUsed);
+ bid = bid.stripUnused();
+ uint32_t partition(distribution.getIdealDisk(
+ nodeState, params.nodeIndex, bid,
+ lib::Distribution::IDEAL_DISK_EVEN_IF_DOWN));
+ if (params.disksDown.find(partition) != params.disksDown.end()) {
+ continue;
+ }
+ if (useWrongDisk) {
+ int correctPart = partition;
+ partition = (partition + 1) % params.diskCount;;
+ while (params.disksDown.find(partition) != params.disksDown.end()) {
+ partition = (partition + 1) % params.diskCount;;
+ }
+ LOG(info, "Putting bucket %s on wrong disk %u instead of %u",
+ bid.toString().c_str(), partition, correctPart);
+ }
+ LOG(info, "Putting bucket %s on disk %u",
+ bid.toString().c_str(), partition);
+ BucketData& data(result[partition][bid]);
+ data.info.setDocumentCount(data.info.getDocumentCount() + 1);
+ data.info.setTotalDocumentSize(
+ data.info.getTotalDocumentSize() + 100);
+ data.info.setChecksum(data.info.getChecksum() * 3);
+ }
+ return result;
+}
+void verifyEqual(std::map<PartitionId, DiskData>& org,
+ std::map<PartitionId, DiskData>& existing)
+{
+ uint32_t equalCount = 0;
+ std::map<PartitionId, DiskData>::const_iterator part1(org.begin());
+ std::map<PartitionId, DiskData>::const_iterator part2(existing.begin());
+ while (part1 != org.end() && part2 != existing.end()) {
+ if (part1->first < part2->first) {
+ if (!part1->second.empty()) {
+ std::ostringstream ost;
+ ost << "No data in partition " << part1->first << " found.";
+ CPPUNIT_FAIL(ost.str());
+ }
+ ++part1;
+ } else if (part1->first > part2->first) {
+ if (!part2->second.empty()) {
+ std::ostringstream ost;
+ ost << "Found data in partition " << part2->first
+ << " which should not exist.";
+ CPPUNIT_FAIL(ost.str());
+ }
+ ++part2;
+ } else {
+ DiskData::const_iterator bucket1(part1->second.begin());
+ DiskData::const_iterator bucket2(part2->second.begin());
+ while (bucket1 != part1->second.end()
+ && bucket2 != part2->second.end())
+ {
+ if (bucket1->first < bucket2->first) {
+ std::ostringstream ost;
+ ost << "No data in partition " << part1->first
+ << " for bucket " << bucket1->first << " found.";
+ CPPUNIT_FAIL(ost.str());
+ } else if (bucket1->first.getId() > bucket2->first.getId())
+ {
+ std::ostringstream ost;
+ ost << "Found data in partition " << part2->first
+ << " for bucket " << bucket2->first
+ << " which should not exist.";
+ CPPUNIT_FAIL(ost.str());
+ } else if (!(bucket1->second.info == bucket2->second.info)) {
+ std::ostringstream ost;
+ ost << "Bucket " << bucket1->first << " on partition "
+ << part1->first << " has bucket info "
+ << bucket2->second.info << " and not "
+ << bucket1->second.info << " as expected.";
+ CPPUNIT_FAIL(ost.str());
+ }
+ ++bucket1;
+ ++bucket2;
+ ++equalCount;
+ }
+ if (bucket1 != part1->second.end()) {
+ std::ostringstream ost;
+ ost << "No data in partition " << part1->first
+ << " for bucket " << bucket1->first << " found.";
+ CPPUNIT_FAIL(ost.str());
+ }
+ if (bucket2 != part2->second.end()) {
+ std::ostringstream ost;
+ ost << "Found data in partition " << part2->first
+ << " for bucket " << bucket2->first
+ << " which should not exist.";
+ CPPUNIT_FAIL(ost.str());
+ }
+ ++part1;
+ ++part2;
+ }
+ }
+ if (part1 != org.end() && !part1->second.empty()) {
+ std::ostringstream ost;
+ ost << "No data in partition " << part1->first << " found.";
+ CPPUNIT_FAIL(ost.str());
+ }
+ if (part2 != existing.end() && !part2->second.empty()) {
+ std::ostringstream ost;
+ ost << "Found data in partition " << part2->first
+ << " which should not exist.";
+ CPPUNIT_FAIL(ost.str());
+ }
+ //std::cerr << "\n " << equalCount << " buckets were matched. ";
+}
+
+struct MessageCallback
+{
+public:
+ virtual ~MessageCallback() {}
+ virtual void onMessage(const api::StorageMessage&) = 0;
+};
+
+struct FakePersistenceLayer : public StorageLink {
+ StorBucketDatabase& bucketDatabase;
+ std::map<PartitionId, DiskData>& data;
+ std::string firstFatal;
+ std::string fatalError;
+ MessageCallback* messageCallback;
+
+ FakePersistenceLayer(std::map<PartitionId, DiskData>& d,
+ StorBucketDatabase& db)
+ : StorageLink("fakepersistencelayer"),
+ bucketDatabase(db),
+ data(d),
+ messageCallback(0)
+ {
+ }
+
+ void fatal(vespalib::stringref error) {
+ fatalError = error;
+ if (firstFatal.empty()) firstFatal = fatalError;
+ }
+ const BucketData* getBucketData(PartitionId partition,
+ const document::BucketId& bucket,
+ vespalib::stringref opname)
+ {
+ std::map<PartitionId, DiskData>::const_iterator it(
+ data.find(partition));
+ if (it == data.end()) {
+ std::ostringstream ost;
+ ost << bucket << " is stated to be on partition " << partition
+ << " in operation " << opname << ", but we have no data for "
+ << "it there.";
+ fatal(ost.str());
+ } else {
+ DiskData::const_iterator it2(it->second.find(bucket));
+ if (it2 == it->second.end()) {
+ std::ostringstream ost;
+ ost << "Have no data for " << bucket << " on disk " << partition
+ << " in operation " << opname;
+ fatal(ost.str());
+ } else {
+ const BucketData& bucketData(it2->second);
+ return &bucketData;
+ }
+ }
+ return 0;
+ }
+ virtual bool onDown(const api::StorageMessage::SP& msg) {
+ fatalError = "";
+ if (messageCallback) {
+ messageCallback->onMessage(*msg);
+ }
+ if (msg->getType() == api::MessageType::INTERNAL) {
+ api::InternalCommand& cmd(
+ dynamic_cast<api::InternalCommand&>(*msg));
+ if (cmd.getType() == ReadBucketList::ID) {
+ ReadBucketList& rbl(dynamic_cast<ReadBucketList&>(cmd));
+ ReadBucketListReply::SP reply(new ReadBucketListReply(rbl));
+ std::map<PartitionId, DiskData>::const_iterator it(
+ data.find(rbl.getPartition()));
+ if (it == data.end()) {
+ std::ostringstream ost;
+ ost << "Got list request to partition "
+ << rbl.getPartition()
+ << " for which we should not get a request";
+ fatal(ost.str());
+ } else {
+ for (DiskData::const_iterator it2 = it->second.begin();
+ it2 != it->second.end(); ++it2)
+ {
+ reply->getBuckets().push_back(it2->first);
+ }
+ }
+ if (!fatalError.empty()) {
+ reply->setResult(api::ReturnCode(
+ api::ReturnCode::INTERNAL_FAILURE, fatalError));
+ }
+ sendUp(reply);
+ } else if (cmd.getType() == ReadBucketInfo::ID) {
+ ReadBucketInfo& rbi(dynamic_cast<ReadBucketInfo&>(cmd));
+ ReadBucketInfoReply::SP reply(new ReadBucketInfoReply(rbi));
+ StorBucketDatabase::WrappedEntry entry(
+ bucketDatabase.get(rbi.getBucketId(), "fakelayer"));
+ if (!entry.exist()) {
+ fatal("Bucket " + rbi.getBucketId().toString()
+ + " did not exist in bucket database but we got "
+ + "read bucket info request for it.");
+ } else {
+ const BucketData* bucketData(getBucketData(
+ entry->disk, rbi.getBucketId(), "readbucketinfo"));
+ if (bucketData != 0) {
+ entry->setBucketInfo(bucketData->info);
+ entry.write();
+ }
+ }
+ if (!fatalError.empty()) {
+ reply->setResult(api::ReturnCode(
+ api::ReturnCode::INTERNAL_FAILURE, fatalError));
+ }
+ sendUp(reply);
+ } else if (cmd.getType() == InternalBucketJoinCommand::ID) {
+ InternalBucketJoinCommand& ibj(
+ dynamic_cast<InternalBucketJoinCommand&>(cmd));
+ InternalBucketJoinReply::SP reply(
+ new InternalBucketJoinReply(ibj));
+ StorBucketDatabase::WrappedEntry entry(
+ bucketDatabase.get(ibj.getBucketId(), "fakelayer"));
+ if (!entry.exist()) {
+ fatal("Bucket " + ibj.getBucketId().toString()
+ + " did not exist in bucket database but we got "
+ + "read bucket info request for it.");
+ } else {
+ const BucketData* source(getBucketData(
+ ibj.getDiskOfInstanceToJoin(), ibj.getBucketId(),
+ "internaljoinsource"));
+ const BucketData* target(getBucketData(
+ ibj.getDiskOfInstanceToKeep(), ibj.getBucketId(),
+ "internaljointarget"));
+ if (source != 0 && target != 0) {
+ entry->setBucketInfo((*source + *target).info);
+ entry.write();
+ }
+ }
+ if (!fatalError.empty()) {
+ reply->setResult(api::ReturnCode(
+ api::ReturnCode::INTERNAL_FAILURE, fatalError));
+ }
+ sendUp(reply);
+ } else {
+ return false;
+ }
+ return true;
+ }
+ return false;
+ }
+};
+
+} // end of anonymous namespace
+
+#define CPPUNIT_ASSERT_METRIC_SET(x) \
+ CPPUNIT_ASSERT(initializer->getMetrics().x.getValue() > 0);
+
+void
+InitializerTest::testInitialization(InitParams& params)
+{
+ std::map<PartitionId, DiskData> data(buildBucketInfo(_docMan, params));
+
+ spi::PartitionStateList partitions(params.diskCount);
+ for (std::set<uint32_t>::const_iterator it = params.disksDown.begin();
+ it != params.disksDown.end(); ++it)
+ {
+ partitions[*it] = spi::PartitionState(
+ spi::PartitionState::DOWN, "Set down in test");
+ }
+ TestServiceLayerApp node(params.diskCount, params.nodeIndex,
+ params.getConfig().getConfigId());
+ DummyStorageLink top;
+ StorageBucketDBInitializer* initializer;
+ FakePersistenceLayer* bottom;
+ top.push_back(StorageLink::UP(initializer = new StorageBucketDBInitializer(
+ params.getConfig().getConfigId(),
+ partitions,
+ node.getDoneInitializeHandler(),
+ node.getComponentRegister())));
+ top.push_back(StorageLink::UP(bottom = new FakePersistenceLayer(
+ data, node.getStorageBucketDatabase())));
+
+ LOG(info, "STARTING INITIALIZATION");
+ top.open();
+
+ /*
+ FileChanger updater(config, nodeIndex, params, orgBucketDatabase);
+ if (params.bucketWrongDisk) updater.moveBucketWrongDisk();
+ if (params.bucketMultipleDisks) updater.copyBucketWrongDisk();
+ if (params.failingListRequest) {
+ updater.removeDirPermission(6, 'r');
+ updater.removeBucketsFromDBAtPath(6);
+ }
+ if (params.failingInfoRequest) {
+ updater.removeFilePermission();
+ orgBucketDatabase.erase(updater.getBucket(8));
+ }
+ */
+
+ node.waitUntilInitialized(initializer);
+
+ std::map<PartitionId, DiskData> initedBucketDatabase(
+ createMapFromBucketDatabase(node.getStorageBucketDatabase()));
+ verifyEqual(data, initedBucketDatabase);
+ /*
+ if (params.bucketWrongDisk) {
+ CPPUNIT_ASSERT_METRIC_SET(_wrongDisk);
+ }
+ if (params.bucketMultipleDisks) {
+ CPPUNIT_ASSERT_METRIC_SET(_joinedCount);
+ }
+ */
+}
+
+/*
+namespace {
+ enum State { LISTING, INFO, DONE };
+ void verifyStatusContent(StorageBucketDBInitializer& initializer,
+ State state)
+ {
+ std::ostringstream ost;
+ initializer.reportStatus(ost, framework::HttpUrlPath(""));
+ std::string status = ost.str();
+
+ if (state == LISTING) {
+ CPPUNIT_ASSERT_CONTAIN("List phase completed: false", status);
+ CPPUNIT_ASSERT_CONTAIN("Initialization completed: false", status);
+ } else if (state == INFO) {
+ CPPUNIT_ASSERT_CONTAIN("List phase completed: true", status);
+ CPPUNIT_ASSERT_CONTAIN("Initialization completed: false", status);
+ } else if (state == DONE) {
+ CPPUNIT_ASSERT_CONTAIN("List phase completed: true", status);
+ CPPUNIT_ASSERT_CONTAIN("Initialization completed: true", status);
+ }
+ }
+}
+
+void
+InitializerTest::testStatusPage()
+{
+ // Set up surrounding system to create a single bucket for us to
+ // do init on.
+ vdstestlib::DirConfig config(getStandardConfig(true));
+ uint16_t nodeIndex(
+ config.getConfig("stor-server").getValue("node_index", 0));
+ InitParams params;
+ params.docsPerDisk = 1;
+ params.diskCount = 1;
+ std::map<document::BucketId, api::BucketInfo> orgBucketDatabase(
+ buildBucketInfo(_docMan, config, nodeIndex, 1, 1, params.disksDown));
+ FileChanger updater(config, nodeIndex, params, orgBucketDatabase);
+
+ // Set up the initializer.
+ DummyStorageServer server(config.getConfigId());
+ DummyStorageLink top;
+ DummyStorageLink *bottom;
+ StorageBucketDBInitializer* initializer;
+ top.push_back(StorageLink::UP(initializer = new StorageBucketDBInitializer(
+ config.getConfigId(), server)));
+ top.push_back(StorageLink::UP(bottom = new DummyStorageLink));
+
+ // Grab bucket database lock for bucket to init to lock the initializer
+ // in the init stage
+ StorBucketDatabase::WrappedEntry entry(
+ server.getStorageBucketDatabase().get(
+ updater.getBucket(0), "testCommandBlocking",
+ StorBucketDatabase::LOCK_IF_NONEXISTING_AND_NOT_CREATING));
+ // Start the initializer
+ top.open();
+ bottom->waitForMessages(1, 30);
+ verifyStatusContent(*initializer, LISTING);
+ // Attempt to send put. Should be blocked
+ // Attempt to send request bucket info. Should be blocked.
+ // Attempt to send getNodeState. Should not be blocked.
+
+ // Unlock bucket in bucket database so listing step can complete.
+ // Await read info request being sent down.
+ entry.unlock();
+ bottom->waitForMessages(1, 30);
+ verifyStatusContent(*initializer, INFO);
+
+ ReadBucketInfo& cmd(dynamic_cast<ReadBucketInfo&>(*bottom->getCommand(0)));
+ ReadBucketInfoReply::SP reply(new ReadBucketInfoReply(cmd));
+ bottom->sendUp(reply);
+
+ node.waitUntilInitialized(initializer);
+ verifyStatusContent(*initializer, DONE);
+
+}
+
+#define ASSERT_BLOCKED(top, bottom, blocks) \
+ if (blocks) { \
+ top.waitForMessages(1, 30); \
+ CPPUNIT_ASSERT_EQUAL(size_t(1), top.getReplies().size()); \
+ CPPUNIT_ASSERT_EQUAL(size_t(0), bottom.getCommands().size()); \
+ api::StorageReply& reply(dynamic_cast<api::StorageReply&>( \
+ *top.getReply(0))); \
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::ABORTED, \
+ reply.getResult().getResult()); \
+ top.reset(); \
+ } else { \
+ bottom.waitForMessages(1, 30); \
+ CPPUNIT_ASSERT_EQUAL(size_t(0), top.getReplies().size()); \
+ CPPUNIT_ASSERT_EQUAL(size_t(1), bottom.getCommands().size()); \
+ api::StorageCommand& command(dynamic_cast<api::StorageCommand&>( \
+ *bottom.getCommand(0))); \
+ (void) command; \
+ bottom.reset(); \
+ }
+
+namespace {
+ void verifyBlockingOn(DummyStorageLink& top,
+ DummyStorageLink& bottom,
+ bool blockEnabled)
+ {
+ // Attempt to send get. Should be blocked if block enabled
+ {
+ api::GetCommand::SP cmd(new api::GetCommand(
+ document::BucketId(16, 4),
+ document::DocumentId("userdoc:ns:4:test"), true));
+ top.sendDown(cmd);
+ ASSERT_BLOCKED(top, bottom, blockEnabled);
+ }
+ // Attempt to send request bucket info. Should be blocked if enabled.
+ {
+ api::RequestBucketInfoCommand::SP cmd(
+ new api::RequestBucketInfoCommand(
+ 0, lib::ClusterState("")));
+ top.sendDown(cmd);
+ ASSERT_BLOCKED(top, bottom, blockEnabled);
+ }
+ // Attempt to send getNodeState. Should not be blocked.
+ {
+ api::GetNodeStateCommand::SP cmd(new api::GetNodeStateCommand(
+ lib::NodeState::UP(0)));
+ top.sendDown(cmd);
+ ASSERT_BLOCKED(top, bottom, false);
+ }
+ }
+}
+
+void
+InitializerTest::testCommandBlockingDuringInit()
+{
+ // Set up surrounding system to create a single bucket for us to
+ // do init on.
+ vdstestlib::DirConfig config(getStandardConfig(true));
+ uint16_t nodeIndex(
+ config.getConfig("stor-server").getValue("node_index", 0));
+ InitParams params;
+ params.docsPerDisk = 1;
+ params.diskCount = 1;
+ std::map<document::BucketId, api::BucketInfo> orgBucketDatabase(
+ buildBucketInfo(_docMan, config, nodeIndex, 1, 1, params.disksDown));
+ FileChanger updater(config, nodeIndex, params, orgBucketDatabase);
+
+ // Set up the initializer.
+ DummyStorageServer server(config.getConfigId());
+ DummyStorageLink top;
+ DummyStorageLink *bottom;
+ StorageBucketDBInitializer* initializer;
+ top.push_back(StorageLink::UP(initializer = new StorageBucketDBInitializer(
+ config.getConfigId(), server)));
+ top.push_back(StorageLink::UP(bottom = new DummyStorageLink));
+
+ // Grab bucket database lock for bucket to init to lock the initializer
+ // in the init stage
+ StorBucketDatabase::WrappedEntry entry(
+ server.getStorageBucketDatabase().get(
+ updater.getBucket(0), "testCommandBlocking",
+ StorBucketDatabase::LOCK_IF_NONEXISTING_AND_NOT_CREATING));
+ // Start the initializer
+ top.open();
+ verifyBlockingOn(top, *bottom, true);
+ // Attempt to send put. Should be blocked
+ // Attempt to send request bucket info. Should be blocked.
+ // Attempt to send getNodeState. Should not be blocked.
+
+ // Unlock bucket in bucket database so listing step can complete.
+ // Await read info request being sent down.
+ entry.unlock();
+ bottom->waitForMessages(1, 30);
+ dynamic_cast<ReadBucketInfo&>(*bottom->getCommand(0));
+ CPPUNIT_ASSERT(!server.isInitialized());
+ bottom->reset();
+
+ // Retry - Should now not block
+ verifyBlockingOn(top, *bottom, false);
+}
+*/
+
+void
+InitializerTest::testBucketProgressCalculator()
+{
+ using document::BucketId;
+ StorageBucketDBInitializer::BucketProgressCalculator calc;
+ // We consider the given bucket as not being completed, so progress
+ // will be _up to_, not _including_ the bucket. This means we can never
+ // reach 1.0, so progress completion must be handled by other logic!
+ CPPUNIT_ASSERT_EQUAL(0.0, calc.calculateProgress(BucketId(1, 0)));
+ CPPUNIT_ASSERT_EQUAL(0.0, calc.calculateProgress(BucketId(32, 0)));
+
+ CPPUNIT_ASSERT_EQUAL(0.5, calc.calculateProgress(BucketId(1, 1)));
+
+ CPPUNIT_ASSERT_EQUAL(0.25, calc.calculateProgress(BucketId(2, 2)));
+ CPPUNIT_ASSERT_EQUAL(0.5, calc.calculateProgress(BucketId(2, 1)));
+ CPPUNIT_ASSERT_EQUAL(0.75, calc.calculateProgress(BucketId(2, 3)));
+
+ CPPUNIT_ASSERT_EQUAL(0.875, calc.calculateProgress(BucketId(3, 7)));
+}
+
+struct DatabaseInsertCallback : MessageCallback
+{
+ DiskData& _data;
+ StorBucketDatabase& _database;
+ TestServiceLayerApp& _app;
+ const InitializerTest::InitParams& _params;
+ bool _invoked;
+ double _lastSeenProgress;
+ uint8_t _expectedReadBucketPriority;
+ std::ostringstream _errors;
+ DatabaseInsertCallback(DiskData& data,
+ StorBucketDatabase& db,
+ TestServiceLayerApp& app,
+ const InitializerTest::InitParams& params)
+ : _data(data),
+ _database(db),
+ _app(app),
+ _params(params),
+ _invoked(false),
+ _lastSeenProgress(0),
+ _expectedReadBucketPriority(255)
+ {}
+
+ void onMessage(const api::StorageMessage& msg)
+ {
+ // Always make sure we're not set as initialized while we're still
+ // processing messages! Also ensure progress never goes down.
+ lib::NodeState::CSP reportedState(
+ _app.getStateUpdater().getReportedNodeState());
+ double progress(reportedState->getInitProgress().getValue());
+ LOG(debug, "reported progress is now %g", progress);
+ // CppUnit exceptions are swallowed...
+ if (progress >= 1.0) {
+ _errors << "progress exceeded 1.0: " << progress << "\n";
+ }
+ if (progress < _lastSeenProgress) {
+ _errors << "progress went down! "
+ << _lastSeenProgress << " -> " << progress
+ << "\n";
+ }
+ // 16 bits is allowed before we have listed any buckets at all
+ // since we at that point have no idea and have not reported anything
+ // back to the fleetcontroller.
+ if (_params.bucketBitsUsed != reportedState->getMinUsedBits()
+ && !(reportedState->getMinUsedBits() == 16 && !_invoked))
+ {
+ _errors << "reported state contains wrong min used bits. "
+ << "expected " << _params.bucketBitsUsed
+ << ", but got " << reportedState->getMinUsedBits()
+ << "\n";
+ }
+ _lastSeenProgress = progress;
+ if (_invoked) {
+ return;
+ }
+
+ if (msg.getType() == api::MessageType::INTERNAL) {
+ const api::InternalCommand& cmd(
+ dynamic_cast<const api::InternalCommand&>(msg));
+ if (cmd.getType() == ReadBucketInfo::ID) {
+ if (cmd.getPriority() != _expectedReadBucketPriority) {
+ _errors << "expected ReadBucketInfo priority of "
+ << static_cast<int>(_expectedReadBucketPriority)
+ << ", was " << static_cast<int>(cmd.getPriority());
+ }
+ // As soon as we get the first ReadBucketInfo, we insert new buckets
+ // into the the bucket database in order to simulate external
+ // load init. Kinda hacky, but should work as long as initializer
+ // always does at least 1 extra iteration pass (which we use
+ // config overrides to ensure happens).
+ _invoked = true;
+ for (int i = 0; i < 4; ++i) {
+ document::BucketId bid(16 + i, 8); // not the first, nor the last bucket
+ BucketData d;
+ StorBucketDatabase::WrappedEntry entry(
+ _database.get(bid, "DatabaseInsertCallback::onMessage",
+ StorBucketDatabase::LOCK_IF_NONEXISTING_AND_NOT_CREATING));
+ if (entry.exist()) {
+ _errors << "db entry for " << bid << " already existed";
+ }
+ if (i < 5) {
+ d.info = api::BucketInfo(3+i, 4+i, 5+i, 6+i, 7+i);
+ }
+ _data[bid] = d;
+ entry->disk = 0;
+ entry->setBucketInfo(d.info);
+ entry.write();
+ }
+ }
+ }
+ }
+};
+
+void
+InitializerTest::testBucketsInitializedByLoad()
+{
+ InitParams params;
+ params.docsPerDisk = 100;
+ params.diskCount = DiskCount(1);
+ params.getConfig().getConfig("stor-bucket-init").setValue("max_pending_info_reads_per_disk", 1);
+ params.getConfig().getConfig("stor-bucket-init").setValue("min_pending_info_reads_per_disk", 1);
+ params.getConfig().getConfig("stor-bucket-init")
+ .setValue("info_read_priority", 231);
+
+ std::map<PartitionId, DiskData> data(buildBucketInfo(_docMan, params));
+
+ spi::PartitionStateList partitions(params.diskCount);
+ TestServiceLayerApp node(params.diskCount, params.nodeIndex,
+ params.getConfig().getConfigId());
+ DummyStorageLink top;
+ StorageBucketDBInitializer* initializer;
+ FakePersistenceLayer* bottom;
+ top.push_back(StorageLink::UP(initializer = new StorageBucketDBInitializer(
+ params.getConfig().getConfigId(),
+ partitions,
+ node.getDoneInitializeHandler(),
+ node.getComponentRegister())));
+ top.push_back(StorageLink::UP(bottom = new FakePersistenceLayer(
+ data, node.getStorageBucketDatabase())));
+
+ DatabaseInsertCallback callback(data[0], node.getStorageBucketDatabase(),
+ node, params);
+ callback._expectedReadBucketPriority = 231;
+
+ bottom->messageCallback = &callback;
+
+ top.open();
+
+ node.waitUntilInitialized(initializer);
+ // Must explicitly wait until initializer has closed to ensure node state
+ // has been set.
+ top.close();
+
+ CPPUNIT_ASSERT(callback._invoked);
+ CPPUNIT_ASSERT_EQUAL(std::string(), callback._errors.str());
+
+ std::map<PartitionId, DiskData> initedBucketDatabase(
+ createMapFromBucketDatabase(node.getStorageBucketDatabase()));
+ verifyEqual(data, initedBucketDatabase);
+
+ lib::NodeState::CSP reportedState(
+ node.getStateUpdater().getReportedNodeState());
+
+ double progress(reportedState->getInitProgress().getValue());
+ CPPUNIT_ASSERT(progress >= 1.0);
+ CPPUNIT_ASSERT(progress < 1.0001);
+
+ CPPUNIT_ASSERT_EQUAL(params.bucketBitsUsed,
+ reportedState->getMinUsedBits());
+}
+
+} // storage
diff --git a/storage/src/tests/bucketdb/judyarraytest.cpp b/storage/src/tests/bucketdb/judyarraytest.cpp
new file mode 100644
index 00000000000..235c0c9eb5c
--- /dev/null
+++ b/storage/src/tests/bucketdb/judyarraytest.cpp
@@ -0,0 +1,287 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/bucketdb/judyarray.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <boost/assign.hpp>
+#include <boost/random.hpp>
+#include <cppunit/extensions/HelperMacros.h>
+#include <map>
+#include <vector>
+
+namespace storage {
+
+struct JudyArrayTest : public CppUnit::TestFixture {
+ void testIterating();
+ void testDualArrayFunctions();
+ void testComparing();
+ void testSize();
+ void testStress();
+
+ CPPUNIT_TEST_SUITE(JudyArrayTest);
+ CPPUNIT_TEST(testIterating);
+ CPPUNIT_TEST(testDualArrayFunctions);
+ CPPUNIT_TEST(testSize);
+ CPPUNIT_TEST(testStress);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(JudyArrayTest);
+
+namespace {
+ std::vector<std::pair<JudyArray::key_type, JudyArray::data_type> >
+ getJudyArrayContents(const JudyArray& array) {
+ std::vector<std::pair<JudyArray::key_type, JudyArray::data_type> > vals;
+ for (JudyArray::const_iterator it = array.begin();
+ it != array.end(); ++it)
+ {
+ vals.push_back(std::make_pair(it.key(), it.value()));
+ }
+ return vals;
+ }
+}
+
+void
+JudyArrayTest::testIterating()
+{
+ JudyArray array;
+ // Test that things are sane for empty document
+ CPPUNIT_ASSERT_EQUAL(array.begin(), array.end());
+ // Add some values
+ using namespace boost::assign;
+ std::vector<std::pair<JudyArray::key_type, JudyArray::data_type> > values
+ = map_list_of(3,2)(5,12)(15,8)(13,10)(7,6)(9,4);
+ for (uint32_t i=0; i<values.size(); ++i) {
+ array.insert(values[i].first, values[i].second);
+ }
+ // Create expected result
+ std::sort(values.begin(), values.end());
+ // Test that we can iterate through const iterator
+ std::vector<std::pair<JudyArray::key_type, JudyArray::data_type> >
+ foundVals = getJudyArrayContents(array);
+ CPPUNIT_ASSERT_EQUAL(values, foundVals);
+
+ { // Test that both postfix operator work
+ JudyArray::iterator it = array.begin();
+ JudyArray::iterator it2 = it++;
+ CPPUNIT_ASSERT_EQUAL(JudyArray::value_type(values[0]), *it2);
+ CPPUNIT_ASSERT_EQUAL(JudyArray::value_type(values[1]), *it);
+
+ // And that iterator comparisons work
+ CPPUNIT_ASSERT_EQUAL(it2, array.begin());
+ CPPUNIT_ASSERT_EQUAL(it, ++array.begin());
+ CPPUNIT_ASSERT(!(it == it2));
+ CPPUNIT_ASSERT(it != it2);
+ }
+ { // Test that we can alter through non-const iterator
+ JudyArray::iterator it = array.begin();
+ ++it;
+ ++it;
+ it.setValue(20);
+ CPPUNIT_ASSERT_EQUAL((JudyArray::key_type) 7, it.key());
+ CPPUNIT_ASSERT_EQUAL((JudyArray::data_type) 20, array[7]);
+ it.remove();
+ CPPUNIT_ASSERT_EQUAL((JudyArray::size_type) 5,
+ getJudyArrayContents(array).size());
+ CPPUNIT_ASSERT_EQUAL(array.end(), array.find(7));
+ values.erase(values.begin() + 2);
+ CPPUNIT_ASSERT_EQUAL(values, getJudyArrayContents(array));
+ // And that we can continue iterating after removing.
+ ++it;
+ CPPUNIT_ASSERT_EQUAL((JudyArray::key_type) 9, it.key());
+ CPPUNIT_ASSERT_EQUAL((JudyArray::data_type) 4, array[9]);
+ }
+ { // Test printing of iterators
+ JudyArray::ConstIterator cit = array.begin();
+ CPPUNIT_ASSERT_MATCH_REGEX(
+ "^ConstIterator\\(Key: 3, Valp: 0x[0-9a-f]{1,16}, Val: 2\\)$",
+ cit.toString());
+ JudyArray::Iterator it = array.end();
+ CPPUNIT_ASSERT_MATCH_REGEX(
+ "^Iterator\\(Key: 0, Valp: 0\\)$",
+ it.toString());
+ }
+}
+
+void
+JudyArrayTest::testDualArrayFunctions()
+{
+ JudyArray array1;
+ JudyArray array2;
+ // Add values to array1
+ using namespace boost::assign;
+ std::vector<std::pair<JudyArray::key_type, JudyArray::data_type> > values1
+ = map_list_of(3,2)(5,12)(15,8)(13,10)(7,6)(9,4);
+ for (uint32_t i=0; i<values1.size(); ++i) {
+ array1.insert(values1[i].first, values1[i].second);
+ }
+ // Add values to array2
+ std::vector<std::pair<JudyArray::key_type, JudyArray::data_type> > values2
+ = map_list_of(4,5)(9,40);
+ for (uint32_t i=0; i<values2.size(); ++i) {
+ array2.insert(values2[i].first, values2[i].second);
+ }
+ // Create expected result
+ std::sort(values1.begin(), values1.end());
+ std::sort(values2.begin(), values2.end());
+
+ CPPUNIT_ASSERT_EQUAL(values1, getJudyArrayContents(array1));
+ CPPUNIT_ASSERT_EQUAL(values2, getJudyArrayContents(array2));
+ CPPUNIT_ASSERT(array1 > array2);
+ CPPUNIT_ASSERT(array1 != array2);
+ array1.swap(array2);
+ CPPUNIT_ASSERT_EQUAL(values1, getJudyArrayContents(array2));
+ CPPUNIT_ASSERT_EQUAL(values2, getJudyArrayContents(array1));
+ CPPUNIT_ASSERT(array1 < array2);
+ CPPUNIT_ASSERT(array1 != array2);
+
+ // Test some operators
+ JudyArray array3;
+ for (uint32_t i=0; i<values1.size(); ++i) {
+ array3.insert(values1[i].first, values1[i].second);
+ }
+ CPPUNIT_ASSERT(array1 != array3);
+ CPPUNIT_ASSERT_EQUAL(array2, array3);
+ CPPUNIT_ASSERT(array2 >= array3);
+ CPPUNIT_ASSERT(array2 <= array3);
+ CPPUNIT_ASSERT(!(array2 < array3));
+ CPPUNIT_ASSERT(!(array2 > array3));
+}
+
+void
+JudyArrayTest::testSize()
+{
+ JudyArray array;
+ CPPUNIT_ASSERT_EQUAL(array.begin(), array.end());
+ CPPUNIT_ASSERT(array.empty());
+ CPPUNIT_ASSERT_EQUAL((JudyArray::size_type) 0, array.size());
+ CPPUNIT_ASSERT_EQUAL((JudyArray::size_type) 0, array.getMemoryUsage());
+
+ // Test each method one can insert stuff into array
+ array.insert(4, 3);
+ CPPUNIT_ASSERT_EQUAL(getJudyArrayContents(array).size(), array.size());
+ array.insert(4, 7);
+ CPPUNIT_ASSERT_EQUAL(getJudyArrayContents(array).size(), array.size());
+ if (sizeof(JudyArray::size_type) == 4) {
+ CPPUNIT_ASSERT_EQUAL((JudyArray::size_type) 12, array.getMemoryUsage());
+ } else if (sizeof(JudyArray::size_type) == 8) {
+ CPPUNIT_ASSERT_EQUAL((JudyArray::size_type) 24, array.getMemoryUsage());
+ } else CPPUNIT_FAIL("Unknown size of type");
+
+ array[6] = 8;
+ CPPUNIT_ASSERT_EQUAL(getJudyArrayContents(array).size(), array.size());
+ array[6] = 10;
+ CPPUNIT_ASSERT_EQUAL(getJudyArrayContents(array).size(), array.size());
+ if (sizeof(JudyArray::size_type) == 4) {
+ CPPUNIT_ASSERT_EQUAL((JudyArray::size_type) 20, array.getMemoryUsage());
+ } else if (sizeof(JudyArray::size_type) == 8) {
+ CPPUNIT_ASSERT_EQUAL((JudyArray::size_type) 40, array.getMemoryUsage());
+ } else CPPUNIT_FAIL("Unknown size of type");
+
+ bool preExisted;
+ array.find(8, true, preExisted);
+ CPPUNIT_ASSERT_EQUAL(false, preExisted);
+ CPPUNIT_ASSERT_EQUAL(getJudyArrayContents(array).size(), array.size());
+ array.find(8, true, preExisted);
+ CPPUNIT_ASSERT_EQUAL(true, preExisted);
+ CPPUNIT_ASSERT_EQUAL(getJudyArrayContents(array).size(), array.size());
+ CPPUNIT_ASSERT_EQUAL((JudyArray::size_type) 3, array.size());
+ if (sizeof(JudyArray::size_type) == 4) {
+ CPPUNIT_ASSERT_EQUAL((JudyArray::size_type) 28, array.getMemoryUsage());
+ } else if (sizeof(JudyArray::size_type) == 8) {
+ CPPUNIT_ASSERT_EQUAL((JudyArray::size_type) 56, array.getMemoryUsage());
+ } else CPPUNIT_FAIL("Unknown size of type");
+
+ // Test each method one can remove stuff in array with
+ array.erase(8);
+ CPPUNIT_ASSERT_EQUAL(getJudyArrayContents(array).size(), array.size());
+ array.erase(8);
+ CPPUNIT_ASSERT_EQUAL(getJudyArrayContents(array).size(), array.size());
+ CPPUNIT_ASSERT_EQUAL((JudyArray::size_type) 2, array.size());
+ if (sizeof(JudyArray::size_type) == 4) {
+ CPPUNIT_ASSERT_EQUAL((JudyArray::size_type) 20, array.getMemoryUsage());
+ } else if (sizeof(JudyArray::size_type) == 8) {
+ CPPUNIT_ASSERT_EQUAL((JudyArray::size_type) 40, array.getMemoryUsage());
+ } else CPPUNIT_FAIL("Unknown size of type");
+}
+
+namespace {
+ template<typename T>
+ std::string toString(const T& m) {
+ std::cerr << "#";
+ std::ostringstream ost;
+ ost << m;
+ return ost.str();
+ }
+}
+
+void
+JudyArrayTest::testStress()
+{
+ // Do a lot of random stuff to both judy array and std::map. Ensure equal
+ // behaviour
+
+ JudyArray judyArray;
+ typedef std::map<JudyArray::key_type, JudyArray::data_type> StdMap;
+ StdMap stdMap;
+
+ boost::rand48 rnd(55);
+
+ for (uint32_t checkpoint=0; checkpoint<50; ++checkpoint) {
+ for (uint32_t opnr=0; opnr<500; ++opnr) {
+ int optype = rnd() % 100;
+ if (optype < 30) { // Insert
+ JudyArray::key_type key(rnd() % 500);
+ JudyArray::key_type value(rnd());
+ judyArray.insert(key, value);
+ stdMap[key] = value;
+ //std::pair<StdMap::iterator, bool> result
+ // = stdMap.insert(std::make_pair(key, value));
+ //if (!result.second) result.first->second = value;
+ } else if (optype < 50) { // operator[]
+ JudyArray::key_type key(rnd() % 500);
+ JudyArray::key_type value(rnd());
+ judyArray[key] = value;
+ stdMap[key] = value;
+ } else if (optype < 70) { // erase()
+ JudyArray::key_type key(rnd() % 500);
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ toString(judyArray) + toString(stdMap),
+ stdMap.erase(key), judyArray.erase(key));
+ } else if (optype < 75) { // size()
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ toString(judyArray) + toString(stdMap),
+ stdMap.size(), judyArray.size());
+ } else if (optype < 78) { // empty()
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ toString(judyArray) + toString(stdMap),
+ stdMap.empty(), judyArray.empty());
+ } else { // find()
+ JudyArray::key_type key(rnd() % 500);
+ JudyArray::iterator it = judyArray.find(key);
+ StdMap::iterator it2 = stdMap.find(key);
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ toString(judyArray) + toString(stdMap),
+ it2 == stdMap.end(), it == judyArray.end());
+ if (it != judyArray.end()) {
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ toString(judyArray) + toString(stdMap),
+ it.key(), it2->first);
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ toString(judyArray) + toString(stdMap),
+ it.value(), it2->second);
+ }
+ }
+ }
+ // Ensure judy array contents is equal to std::map's at this point
+ StdMap tmpMap;
+ for (JudyArray::const_iterator it = judyArray.begin();
+ it != judyArray.end(); ++it)
+ {
+ tmpMap[it.key()] = it.value();
+ }
+ CPPUNIT_ASSERT_EQUAL(stdMap, tmpMap);
+ }
+}
+
+} // storage
diff --git a/storage/src/tests/bucketdb/judymultimaptest.cpp b/storage/src/tests/bucketdb/judymultimaptest.cpp
new file mode 100644
index 00000000000..f63fad9aa06
--- /dev/null
+++ b/storage/src/tests/bucketdb/judymultimaptest.cpp
@@ -0,0 +1,172 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/bucketdb/judymultimap.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <boost/assign.hpp>
+#include <boost/random.hpp>
+#include <cppunit/extensions/HelperMacros.h>
+#include <map>
+#include <vector>
+
+namespace storage {
+
+struct JudyMultiMapTest : public CppUnit::TestFixture {
+ void testSimpleUsage();
+ void testIterator();
+
+ CPPUNIT_TEST_SUITE(JudyMultiMapTest);
+ CPPUNIT_TEST(testSimpleUsage);
+ CPPUNIT_TEST(testIterator);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(JudyMultiMapTest);
+
+namespace {
+ struct B;
+ struct C;
+
+ struct A {
+ int _val1;
+ int _val2;
+ int _val3;
+
+ A() {}
+ A(const B& b);
+ A(const C& c);
+ A(int val1, int val2, int val3)
+ : _val1(val1), _val2(val2), _val3(val3) {}
+
+ static bool mayContain(const A&) { return true; }
+
+ bool operator==(const A& a) const {
+ return (_val1 == a._val1 && _val2 == a._val2 && _val3 == a._val3);
+ }
+ };
+
+ struct B {
+ int _val1;
+ int _val2;
+
+ B() {}
+ B(const A& a) : _val1(a._val1), _val2(a._val2) {}
+ B(int val1, int val2) : _val1(val1), _val2(val2) {}
+
+ static bool mayContain(const A& a) { return (a._val3 == 0); }
+ };
+
+ struct C {
+ int _val1;
+
+ C() {}
+ C(const A& a) : _val1(a._val1) {}
+ C(int val1) : _val1(val1) {}
+
+ static bool mayContain(const A& a)
+ { return (a._val2 == 0 && a._val3 == 0); }
+ };
+
+ A::A(const B& b) : _val1(b._val1), _val2(b._val2), _val3(0) {}
+ A::A(const C& c) : _val1(c._val1), _val2(0), _val3(0) {}
+
+ std::ostream& operator<<(std::ostream& out, const A& a) {
+ return out << "A(" << a._val1 << ", " << a._val2 << ", "
+ << a._val3 << ")";
+ }
+ std::ostream& operator<<(std::ostream& out, const B& b) {
+ return out << "B(" << b._val1 << ", " << b._val2 << ")";
+ }
+ std::ostream& operator<<(std::ostream& out, const C& c) {
+ return out << "C(" << c._val1 << ")";
+ }
+}
+
+void
+JudyMultiMapTest::testSimpleUsage() {
+ typedef JudyMultiMap<C, B, A> MultiMap;
+ MultiMap multiMap;
+ // Do some insertions
+ bool preExisted;
+ CPPUNIT_ASSERT(multiMap.empty());
+ multiMap.insert(16, A(1, 2, 3), preExisted);
+ CPPUNIT_ASSERT_EQUAL(false, preExisted);
+ multiMap.insert(11, A(4, 6, 0), preExisted);
+ CPPUNIT_ASSERT_EQUAL(false, preExisted);
+ multiMap.insert(14, A(42, 0, 0), preExisted);
+ CPPUNIT_ASSERT_EQUAL(false, preExisted);
+ CPPUNIT_ASSERT_EQUAL_MSG(multiMap.toString(),
+ (MultiMap::size_type) 3, multiMap.size());
+
+ multiMap.insert(11, A(4, 7, 0), preExisted);
+ CPPUNIT_ASSERT_EQUAL(true, preExisted);
+ CPPUNIT_ASSERT_EQUAL((MultiMap::size_type) 3, multiMap.size());
+ CPPUNIT_ASSERT(!multiMap.empty());
+
+ // Access some elements
+ CPPUNIT_ASSERT_EQUAL(A(4, 7, 0), multiMap[11]);
+ CPPUNIT_ASSERT_EQUAL(A(1, 2, 3), multiMap[16]);
+ CPPUNIT_ASSERT_EQUAL(A(42,0, 0), multiMap[14]);
+
+ // Do removes
+ CPPUNIT_ASSERT(multiMap.erase(12) == 0);
+ CPPUNIT_ASSERT_EQUAL((MultiMap::size_type) 3, multiMap.size());
+
+ CPPUNIT_ASSERT(multiMap.erase(14) == 1);
+ CPPUNIT_ASSERT_EQUAL((MultiMap::size_type) 2, multiMap.size());
+
+ CPPUNIT_ASSERT(multiMap.erase(11) == 1);
+ CPPUNIT_ASSERT(multiMap.erase(16) == 1);
+ CPPUNIT_ASSERT_EQUAL((MultiMap::size_type) 0, multiMap.size());
+ CPPUNIT_ASSERT(multiMap.empty());
+}
+
+void
+JudyMultiMapTest::testIterator()
+{
+ typedef JudyMultiMap<C, B, A> MultiMap;
+ MultiMap multiMap;
+ bool preExisted;
+ // Do some insertions
+ multiMap.insert(16, A(1, 2, 3), preExisted);
+ multiMap.insert(11, A(4, 6, 0), preExisted);
+ multiMap.insert(14, A(42, 0, 0), preExisted);
+
+ MultiMap::Iterator iter = multiMap.begin();
+ CPPUNIT_ASSERT_EQUAL((uint64_t)11, (uint64_t)iter.key());
+ CPPUNIT_ASSERT_EQUAL(A(4, 6, 0), iter.value());
+ iter++;
+ CPPUNIT_ASSERT_EQUAL((uint64_t)14, (uint64_t)iter.key());
+ CPPUNIT_ASSERT_EQUAL(A(42, 0, 0), iter.value());
+ iter++;
+ CPPUNIT_ASSERT_EQUAL((uint64_t)16, (uint64_t)iter.key());
+ CPPUNIT_ASSERT_EQUAL(A(1, 2, 3), iter.value());
+ iter--;
+ CPPUNIT_ASSERT_EQUAL((uint64_t)14, (uint64_t)iter.key());
+ CPPUNIT_ASSERT_EQUAL(A(42, 0, 0), iter.value());
+ iter++;
+ CPPUNIT_ASSERT_EQUAL((uint64_t)16, (uint64_t)iter.key());
+ CPPUNIT_ASSERT_EQUAL(A(1, 2, 3), iter.value());
+ iter--;
+ iter--;
+ CPPUNIT_ASSERT_EQUAL((uint64_t)11,(uint64_t) iter.key());
+ CPPUNIT_ASSERT_EQUAL(A(4, 6, 0), iter.value());
+ iter++;
+ iter++;
+ iter++;
+ CPPUNIT_ASSERT_EQUAL(multiMap.end(), iter);
+ iter--;
+ CPPUNIT_ASSERT_EQUAL((uint64_t)16, (uint64_t)iter.key());
+ CPPUNIT_ASSERT_EQUAL(A(1, 2, 3), iter.value());
+ iter--;
+ CPPUNIT_ASSERT_EQUAL((uint64_t)14, (uint64_t)iter.key());
+ CPPUNIT_ASSERT_EQUAL(A(42, 0, 0), iter.value());
+ iter--;
+ CPPUNIT_ASSERT_EQUAL((uint64_t)11,(uint64_t) iter.key());
+ CPPUNIT_ASSERT_EQUAL(A(4, 6, 0), iter.value());
+
+
+}
+
+} // storage
+
diff --git a/storage/src/tests/bucketdb/lockablemaptest.cpp b/storage/src/tests/bucketdb/lockablemaptest.cpp
new file mode 100644
index 00000000000..0f35f51afbd
--- /dev/null
+++ b/storage/src/tests/bucketdb/lockablemaptest.cpp
@@ -0,0 +1,1262 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vespalib/util/document_runnable.h>
+#include <vespa/storage/bucketdb/judymultimap.h>
+#include <vespa/storage/bucketdb/lockablemap.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <cppunit/extensions/HelperMacros.h>
+
+namespace storage {
+
+struct LockableMapTest : public CppUnit::TestFixture {
+ void testSimpleUsage();
+ void testComparison();
+ void testIterating();
+ void testChunkedIterationIsTransparentAcrossChunkSizes();
+ void testCanAbortDuringChunkedIteration();
+ void testThreadSafetyStress();
+ void testFindBuckets();
+ void testFindBuckets2();
+ void testFindBuckets3();
+ void testFindBuckets4();
+ void testFindBuckets5();
+ void testFindBucketsSimple();
+ void testFindNoBuckets();
+ void testFindAll();
+ void testFindAll2();
+ void testFindAllUnusedBitIsSet();
+ void testFindAllInconsistentlySplit();
+ void testFindAllInconsistentlySplit2();
+ void testFindAllInconsistentlySplit3();
+ void testFindAllInconsistentlySplit4();
+ void testFindAllInconsistentlySplit5();
+ void testFindAllInconsistentlySplit6();
+ void testFindAllInconsistentBelow16Bits();
+ void testCreate();
+ void testCreate2();
+ void testCreate3();
+ void testCreate4();
+ void testCreate5();
+ void testCreate6();
+ void testCreateEmpty();
+ void testIsConsistent();
+
+ CPPUNIT_TEST_SUITE(LockableMapTest);
+ CPPUNIT_TEST(testSimpleUsage);
+ CPPUNIT_TEST(testComparison);
+ CPPUNIT_TEST(testIterating);
+ CPPUNIT_TEST(testChunkedIterationIsTransparentAcrossChunkSizes);
+ CPPUNIT_TEST(testCanAbortDuringChunkedIteration);
+ CPPUNIT_TEST(testThreadSafetyStress);
+ CPPUNIT_TEST(testFindBuckets);
+ CPPUNIT_TEST(testFindBuckets2);
+ CPPUNIT_TEST(testFindBuckets3);
+ CPPUNIT_TEST(testFindBuckets4);
+ CPPUNIT_TEST(testFindBuckets5);
+ CPPUNIT_TEST(testFindBucketsSimple);
+ CPPUNIT_TEST(testFindNoBuckets);
+ CPPUNIT_TEST(testFindAll);
+ CPPUNIT_TEST(testFindAll2);
+ CPPUNIT_TEST(testFindAllUnusedBitIsSet);
+ CPPUNIT_TEST(testFindAllInconsistentlySplit);
+ CPPUNIT_TEST(testFindAllInconsistentlySplit2);
+ CPPUNIT_TEST(testFindAllInconsistentlySplit3);
+ CPPUNIT_TEST(testFindAllInconsistentlySplit4);
+ CPPUNIT_TEST(testFindAllInconsistentlySplit5);
+ CPPUNIT_TEST(testFindAllInconsistentlySplit6);
+ CPPUNIT_TEST(testFindAllInconsistentBelow16Bits);
+ CPPUNIT_TEST(testCreate);
+ CPPUNIT_TEST(testCreate2);
+ CPPUNIT_TEST(testCreate3);
+ CPPUNIT_TEST(testCreate4);
+ CPPUNIT_TEST(testCreate5);
+ CPPUNIT_TEST(testCreate6);
+ CPPUNIT_TEST(testCreateEmpty);
+ CPPUNIT_TEST(testIsConsistent);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(LockableMapTest);
+
+namespace {
+ struct A : public boost::operators<A> {
+ int _val1;
+ int _val2;
+ int _val3;
+
+ A() : _val1(0), _val2(0), _val3(0) {}
+ A(int val1, int val2, int val3)
+ : _val1(val1), _val2(val2), _val3(val3) {}
+
+ static bool mayContain(const A&) { return true; }
+
+ bool operator==(const A& a) const {
+ return (_val1 == a._val1 && _val2 == a._val2 && _val3 == a._val3);
+ }
+ bool operator<(const A& a) const {
+ if (_val1 != a._val1) return (_val1 < a._val1);
+ if (_val2 != a._val2) return (_val2 < a._val2);
+ return (_val3 < a._val3);
+ }
+ };
+
+ std::ostream& operator<<(std::ostream& out, const A& a) {
+ return out << "A(" << a._val1 << ", " << a._val2 << ", "
+ << a._val3 << ")";
+ }
+
+ typedef LockableMap<JudyMultiMap<A> > Map;
+}
+
+void
+LockableMapTest::testSimpleUsage() {
+ // Tests insert, erase, size, empty, operator[]
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+ // Do some insertions
+ CPPUNIT_ASSERT(map.empty());
+ bool preExisted;
+ map.insert(16, A(1, 2, 3), "foo", preExisted);
+ CPPUNIT_ASSERT_EQUAL(false, preExisted);
+ map.insert(11, A(4, 6, 0), "foo", preExisted);
+ CPPUNIT_ASSERT_EQUAL(false, preExisted);
+ map.insert(14, A(42, 0, 0), "foo", preExisted);
+ CPPUNIT_ASSERT_EQUAL(false, preExisted);
+ CPPUNIT_ASSERT_EQUAL_MSG(map.toString(),
+ (Map::size_type) 3, map.size());
+
+ map.insert(11, A(4, 7, 0), "foo", preExisted);
+ CPPUNIT_ASSERT_EQUAL(true, preExisted);
+ CPPUNIT_ASSERT_EQUAL((Map::size_type) 3, map.size());
+ CPPUNIT_ASSERT(!map.empty());
+
+ // Access some elements
+ CPPUNIT_ASSERT_EQUAL(A(4, 7, 0), *map.get(11, "foo"));
+ CPPUNIT_ASSERT_EQUAL(A(1, 2, 3), *map.get(16, "foo"));
+ CPPUNIT_ASSERT_EQUAL(A(42,0, 0), *map.get(14, "foo"));
+
+ // Do removes
+ CPPUNIT_ASSERT(map.erase(12, "foo") == 0);
+ CPPUNIT_ASSERT_EQUAL((Map::size_type) 3, map.size());
+
+ CPPUNIT_ASSERT(map.erase(14, "foo") == 1);
+ CPPUNIT_ASSERT_EQUAL((Map::size_type) 2, map.size());
+
+ CPPUNIT_ASSERT(map.erase(11, "foo") == 1);
+ CPPUNIT_ASSERT(map.erase(16, "foo") == 1);
+ CPPUNIT_ASSERT_EQUAL((Map::size_type) 0, map.size());
+ CPPUNIT_ASSERT(map.empty());
+}
+
+void
+LockableMapTest::testComparison() {
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map1;
+ Map map2;
+ bool preExisted;
+
+ // Check empty state is correct
+ CPPUNIT_ASSERT_EQUAL(map1, map2);
+ CPPUNIT_ASSERT(map1 <= map2);
+ CPPUNIT_ASSERT(map1 >= map2);
+ CPPUNIT_ASSERT(!(map1 < map2));
+ CPPUNIT_ASSERT(!(map1 > map2));
+ CPPUNIT_ASSERT(!(map1 != map2));
+
+ // Check that different lengths are oki
+ map1.insert(4, A(1, 2, 3), "foo", preExisted);
+ CPPUNIT_ASSERT(!(map1 == map2));
+ CPPUNIT_ASSERT(!(map1 <= map2));
+ CPPUNIT_ASSERT(!(map1 < map2));
+ CPPUNIT_ASSERT(map1 >= map2);
+ CPPUNIT_ASSERT(map1 > map2);
+ CPPUNIT_ASSERT(map1 != map2);
+
+ // Check that equal elements are oki
+ map2.insert(4, A(1, 2, 3), "foo", preExisted);
+ CPPUNIT_ASSERT_EQUAL(map1, map2);
+ CPPUNIT_ASSERT(map1 <= map2);
+ CPPUNIT_ASSERT(map1 >= map2);
+ CPPUNIT_ASSERT(!(map1 < map2));
+ CPPUNIT_ASSERT(!(map1 > map2));
+ CPPUNIT_ASSERT(!(map1 != map2));
+
+ // Check that non-equal values are oki
+ map1.insert(6, A(1, 2, 6), "foo", preExisted);
+ map2.insert(6, A(1, 2, 3), "foo", preExisted);
+ CPPUNIT_ASSERT(!(map1 == map2));
+ CPPUNIT_ASSERT(!(map1 <= map2));
+ CPPUNIT_ASSERT(!(map1 < map2));
+ CPPUNIT_ASSERT(map1 >= map2);
+ CPPUNIT_ASSERT(map1 > map2);
+ CPPUNIT_ASSERT(map1 != map2);
+
+ // Check that non-equal keys are oki
+ map1.erase(6, "foo");
+ map1.insert(7, A(1, 2, 3), "foo", preExisted);
+ CPPUNIT_ASSERT(!(map1 == map2));
+ CPPUNIT_ASSERT(!(map1 <= map2));
+ CPPUNIT_ASSERT(!(map1 < map2));
+ CPPUNIT_ASSERT(map1 >= map2);
+ CPPUNIT_ASSERT(map1 > map2);
+ CPPUNIT_ASSERT(map1 != map2);
+}
+
+namespace {
+ struct NonConstProcessor {
+ Map::Decision operator()(int key, A& a) {
+ (void) key;
+ ++a._val2;
+ return Map::UPDATE;
+ }
+ };
+ struct EntryProcessor {
+ mutable uint32_t count;
+ mutable std::vector<std::string> log;
+ mutable std::vector<Map::Decision> behaviour;
+
+ EntryProcessor() : count(0), log(), behaviour() {}
+ EntryProcessor(const std::vector<Map::Decision>& decisions)
+ : count(0), log(), behaviour(decisions) {}
+
+ Map::Decision operator()(uint64_t key, A& a) const {
+ std::ostringstream ost;
+ ost << key << " - " << a;
+ log.push_back(ost.str());
+ Map::Decision d = Map::CONTINUE;
+ if (behaviour.size() > count) {
+ d = behaviour[count++];
+ }
+ if (d == Map::UPDATE) {
+ ++a._val3;
+ }
+ return d;
+ }
+
+ std::string toString() {
+ std::ostringstream ost;
+ for (uint32_t i=0; i<log.size(); ++i) ost << log[i] << "\n";
+ return ost.str();
+ }
+ };
+}
+
+void
+LockableMapTest::testIterating() {
+ Map map;
+ bool preExisted;
+ map.insert(16, A(1, 2, 3), "foo", preExisted);
+ map.insert(11, A(4, 6, 0), "foo", preExisted);
+ map.insert(14, A(42, 0, 0), "foo", preExisted);
+ // Test that we can use functor with non-const function
+ {
+ NonConstProcessor ncproc;
+ map.each(ncproc, "foo"); // Locking both for each element
+ CPPUNIT_ASSERT_EQUAL(A(4, 7, 0), *map.get(11, "foo"));
+ CPPUNIT_ASSERT_EQUAL(A(42,1, 0), *map.get(14, "foo"));
+ CPPUNIT_ASSERT_EQUAL(A(1, 3, 3), *map.get(16, "foo"));
+ map.all(ncproc, "foo"); // And for all
+ CPPUNIT_ASSERT_EQUAL(A(4, 8, 0), *map.get(11, "foo"));
+ CPPUNIT_ASSERT_EQUAL(A(42,2, 0), *map.get(14, "foo"));
+ CPPUNIT_ASSERT_EQUAL(A(1, 4, 3), *map.get(16, "foo"));
+ }
+ // Test that we can use const functors directly..
+ map.each(EntryProcessor(), "foo");
+
+ // Test iterator bounds
+ {
+ EntryProcessor proc;
+ map.each(proc, "foo", 11, 16);
+ std::string expected("11 - A(4, 8, 0)\n"
+ "14 - A(42, 2, 0)\n"
+ "16 - A(1, 4, 3)\n");
+ CPPUNIT_ASSERT_EQUAL(expected, proc.toString());
+
+ EntryProcessor proc2;
+ map.each(proc2, "foo", 12, 15);
+ expected = "14 - A(42, 2, 0)\n";
+ CPPUNIT_ASSERT_EQUAL(expected, proc2.toString());
+ }
+ // Test that we can abort iterating
+ {
+ std::vector<Map::Decision> decisions;
+ decisions.push_back(Map::CONTINUE);
+ decisions.push_back(Map::ABORT);
+ EntryProcessor proc(decisions);
+ map.each(proc, "foo");
+ std::string expected("11 - A(4, 8, 0)\n"
+ "14 - A(42, 2, 0)\n");
+ CPPUNIT_ASSERT_EQUAL(expected, proc.toString());
+ }
+ // Test that we can remove during iteration
+ {
+ std::vector<Map::Decision> decisions;
+ decisions.push_back(Map::CONTINUE);
+ decisions.push_back(Map::REMOVE);
+ EntryProcessor proc(decisions);
+ map.each(proc, "foo");
+ std::string expected("11 - A(4, 8, 0)\n"
+ "14 - A(42, 2, 0)\n"
+ "16 - A(1, 4, 3)\n");
+ CPPUNIT_ASSERT_EQUAL(expected, proc.toString());
+ CPPUNIT_ASSERT_EQUAL_MSG(map.toString(),
+ (Map::size_type) 2, map.size());
+ CPPUNIT_ASSERT_EQUAL(A(4, 8, 0), *map.get(11, "foo"));
+ CPPUNIT_ASSERT_EQUAL(A(1, 4, 3), *map.get(16, "foo"));
+ Map::WrappedEntry entry = map.get(14, "foo");
+ CPPUNIT_ASSERT(!entry.exist());
+ }
+}
+
+void
+LockableMapTest::testChunkedIterationIsTransparentAcrossChunkSizes()
+{
+ Map map;
+ bool preExisted;
+ map.insert(16, A(1, 2, 3), "foo", preExisted);
+ map.insert(11, A(4, 6, 0), "foo", preExisted);
+ map.insert(14, A(42, 0, 0), "foo", preExisted);
+ NonConstProcessor ncproc; // Increments 2nd value in all entries.
+ // chunkedAll with chunk size of 1
+ map.chunkedAll(ncproc, "foo", 1);
+ CPPUNIT_ASSERT_EQUAL(A(4, 7, 0), *map.get(11, "foo"));
+ CPPUNIT_ASSERT_EQUAL(A(42, 1, 0), *map.get(14, "foo"));
+ CPPUNIT_ASSERT_EQUAL(A(1, 3, 3), *map.get(16, "foo"));
+ // chunkedAll with chunk size larger than db size
+ map.chunkedAll(ncproc, "foo", 100);
+ CPPUNIT_ASSERT_EQUAL(A(4, 8, 0), *map.get(11, "foo"));
+ CPPUNIT_ASSERT_EQUAL(A(42, 2, 0), *map.get(14, "foo"));
+ CPPUNIT_ASSERT_EQUAL(A(1, 4, 3), *map.get(16, "foo"));
+}
+
+void
+LockableMapTest::testCanAbortDuringChunkedIteration()
+{
+ Map map;
+ bool preExisted;
+ map.insert(16, A(1, 2, 3), "foo", preExisted);
+ map.insert(11, A(4, 6, 0), "foo", preExisted);
+ map.insert(14, A(42, 0, 0), "foo", preExisted);
+
+ std::vector<Map::Decision> decisions;
+ decisions.push_back(Map::CONTINUE);
+ decisions.push_back(Map::ABORT);
+ EntryProcessor proc(decisions);
+ map.chunkedAll(proc, "foo", 100);
+ std::string expected("11 - A(4, 6, 0)\n"
+ "14 - A(42, 0, 0)\n");
+ CPPUNIT_ASSERT_EQUAL(expected, proc.toString());
+}
+
+namespace {
+ struct LoadGiver : public document::Runnable {
+ typedef std::shared_ptr<LoadGiver> SP;
+ Map& _map;
+ uint32_t _counter;
+
+ LoadGiver(Map& map) : _map(map), _counter(0) {}
+ };
+
+ struct InsertEraseLoadGiver : public LoadGiver {
+ InsertEraseLoadGiver(Map& map) : LoadGiver(map) {}
+
+ void run() {
+ // Screws up order of buckets by xor'ing with 12345.
+ // Only operate on last 32k super buckets.
+ while (running()) {
+ uint32_t bucket = ((_counter ^ 12345) % 0x8000) + 0x8000;
+ if (bucket % 7 < 3) {
+ bool preExisted;
+ _map.insert(bucket, A(bucket, 0, _counter), "foo",
+ preExisted);
+ }
+ if (bucket % 5 < 2) {
+ _map.erase(bucket, "foo");
+ }
+ ++_counter;
+ }
+ }
+ };
+
+ struct GetLoadGiver : public LoadGiver {
+ GetLoadGiver(Map& map) : LoadGiver(map) {}
+
+ void run() {
+ // It's legal to keep entries as long as you only request higher
+ // buckets. So, to test this, keep entries until you request one
+ // that is smaller than those stored.
+ std::vector<std::pair<uint32_t, Map::WrappedEntry> > stored;
+ while (running()) {
+ uint32_t bucket = (_counter ^ 52721) % 0x10000;
+ if (!stored.empty() && stored.back().first > bucket) {
+ stored.clear();
+ }
+ stored.push_back(std::pair<uint32_t, Map::WrappedEntry>(
+ bucket, _map.get(bucket, "foo", _counter % 3 == 0)));
+ ++_counter;
+ }
+ }
+ };
+
+ struct AllLoadGiver : public LoadGiver {
+ AllLoadGiver(Map& map) : LoadGiver(map) {}
+
+ void run() {
+ while (running()) {
+ _map.all(*this, "foo");
+ ++_counter;
+ }
+ }
+
+ Map::Decision operator()(int key, A& a) {
+ //std::cerr << (void*) this << " - " << key << "\n";
+ (void) key;
+ ++a._val2;
+ return Map::CONTINUE;
+ }
+ };
+
+ struct EachLoadGiver : public LoadGiver {
+ EachLoadGiver(Map& map) : LoadGiver(map) {}
+
+ void run() {
+ while (running()) {
+ _map.each(*this, "foo");
+ ++_counter;
+ }
+ }
+
+ Map::Decision operator()(int key, A& a) {
+ //std::cerr << (void*) this << " - " << key << "\n";
+ (void) key;
+ ++a._val2;
+ return Map::CONTINUE;
+ }
+ };
+
+ struct RandomRangeLoadGiver : public LoadGiver {
+ RandomRangeLoadGiver(Map& map) : LoadGiver(map) {}
+
+ void run() {
+ while (running()) {
+ uint32_t min = (_counter ^ 23426) % 0x10000;
+ uint32_t max = (_counter ^ 40612) % 0x10000;
+ if (min > max) {
+ uint32_t tmp = min;
+ min = max;
+ max = tmp;
+ }
+ if (_counter % 7 < 5) {
+ _map.each(*this, "foo", min, max);
+ } else {
+ _map.all(*this, "foo", min, max);
+ }
+ ++_counter;
+ }
+ }
+
+ Map::Decision operator()(int key, A& a) {
+ //std::cerr << ".";
+ (void) key;
+ ++a._val2;
+ return Map::CONTINUE;
+ }
+ };
+
+ struct GetNextLoadGiver : public LoadGiver {
+ GetNextLoadGiver(Map& map) : LoadGiver(map) {}
+
+ void run() {
+ while (running()) {
+ uint32_t bucket = (_counter ^ 60417) % 0xffff;
+ if (_counter % 7 < 5) {
+ _map.each(*this, "foo", bucket + 1, 0xffff);
+ } else {
+ _map.all(*this, "foo", bucket + 1, 0xffff);
+ }
+ ++_counter;
+ }
+ }
+
+ Map::Decision operator()(int key, A& a) {
+ //std::cerr << ".";
+ (void) key;
+ ++a._val2;
+ return Map::ABORT;
+ }
+ };
+}
+
+void
+LockableMapTest::testThreadSafetyStress() {
+ uint32_t duration = 2 * 1000;
+ std::cerr << "\nRunning LockableMap threadsafety test for "
+ << (duration / 1000) << " seconds.\n";
+ // Set up multiple threads going through the bucket database at the same
+ // time. Ensuring all works and there are no deadlocks.
+
+ // Initial database of 32k elements which should always be present.
+ // Next 32k elements may exist (loadgivers may erase and create them, "foo")
+ Map map;
+ for (uint32_t i=0; i<65536; ++i) {
+ bool preExisted;
+ map.insert(i, A(i, 0, i ^ 12345), "foo", preExisted);
+ }
+ std::vector<LoadGiver::SP> loadgivers;
+ for (uint32_t i=0; i<8; ++i) {
+ loadgivers.push_back(LoadGiver::SP(new InsertEraseLoadGiver(map)));
+ }
+ for (uint32_t i=0; i<2; ++i) {
+ loadgivers.push_back(LoadGiver::SP(new GetLoadGiver(map)));
+ }
+ for (uint32_t i=0; i<2; ++i) {
+ loadgivers.push_back(LoadGiver::SP(new AllLoadGiver(map)));
+ }
+ for (uint32_t i=0; i<2; ++i) {
+ loadgivers.push_back(LoadGiver::SP(new EachLoadGiver(map)));
+ }
+ for (uint32_t i=0; i<2; ++i) {
+ loadgivers.push_back(LoadGiver::SP(new RandomRangeLoadGiver(map)));
+ }
+ for (uint32_t i=0; i<2; ++i) {
+ loadgivers.push_back(LoadGiver::SP(new GetNextLoadGiver(map)));
+ }
+
+ FastOS_ThreadPool pool(128 * 1024);
+ for (uint32_t i=0; i<loadgivers.size(); ++i) {
+ CPPUNIT_ASSERT(loadgivers[i]->start(pool));
+ }
+ FastOS_Thread::Sleep(duration);
+ std::cerr << "Closing down test\n";
+ for (uint32_t i=0; i<loadgivers.size(); ++i) {
+ CPPUNIT_ASSERT(loadgivers[i]->stop());
+ }
+// FastOS_Thread::Sleep(duration);
+// std::cerr << "Didn't manage to shut down\n";
+// map._lockedKeys.print(std::cerr, true, "");
+
+ for (uint32_t i=0; i<loadgivers.size(); ++i) {
+ CPPUNIT_ASSERT(loadgivers[i]->join());
+ }
+ std::cerr << "Loadgiver counts:";
+ for (uint32_t i=0; i<loadgivers.size(); ++i) {
+ std::cerr << " " << loadgivers[i]->_counter;
+ }
+ std::cerr << "\nTest completed\n";
+}
+
+#if 0
+namespace {
+struct Hex {
+ document::BucketId::Type val;
+
+ Hex(document::BucketId::Type v) : val(v) {}
+ bool operator==(const Hex& h) const { return val == h.val; }
+};
+
+std::ostream& operator<<(std::ostream& out, const Hex& h) {
+ out << std::hex << h.val << std::dec;
+ return out;
+}
+
+void
+printBucket(const std::string s, const document::BucketId& b) {
+ std::cerr << s << "bucket=" << b << ", reversed=" << b.stripUnused().toKey() << ", hex=" << Hex(b.stripUnused().toKey()) << "\n";
+}
+
+void
+printBuckets(const std::map<document::BucketId, Map::WrappedEntry>& results) {
+ for (std::map<document::BucketId, Map::WrappedEntry>::const_iterator iter = results.begin();
+ iter != results.end();
+ iter++) {
+ printBucket("Returned ", iter->first);
+ }
+}
+
+}
+#endif
+
+void
+LockableMapTest::testFindBucketsSimple() {
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(17, 0x0ffff);
+ id1 = id1.stripUnused();
+
+ document::BucketId id2(18, 0x1ffff);
+ id2 = id2.stripUnused();
+
+ document::BucketId id3(18, 0x3ffff);
+ id3 = id3.stripUnused();
+
+ bool preExisted;
+ map.insert(id1.toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.toKey(), A(2,3,4), "foo", preExisted);
+ map.insert(id3.toKey(), A(3,4,5), "foo", preExisted);
+
+ document::BucketId id(22, 0xfffff);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getContained(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)1, results.size());
+ CPPUNIT_ASSERT_EQUAL(A(3,4,5), *results[id3]);
+#endif
+}
+
+void
+LockableMapTest::testFindBuckets() {
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(16, 0x0ffff);
+ document::BucketId id2(17, 0x0ffff);
+ document::BucketId id3(17, 0x1ffff);
+ document::BucketId id4(19, 0xfffff);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+ map.insert(id3.stripUnused().toKey(), A(3,4,5), "foo", preExisted);
+ map.insert(id4.stripUnused().toKey(), A(4,5,6), "foo", preExisted);
+
+ document::BucketId id(22, 0xfffff);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getContained(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)3, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(1,2,3), *results[id1.stripUnused()]);
+ CPPUNIT_ASSERT_EQUAL(A(4,5,6), *results[id4.stripUnused()]);
+ CPPUNIT_ASSERT_EQUAL(A(3,4,5), *results[id3.stripUnused()]);
+#endif
+}
+
+void
+LockableMapTest::testFindBuckets2() { // ticket 3121525
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(16, 0x0ffff);
+ document::BucketId id2(17, 0x0ffff);
+ document::BucketId id3(17, 0x1ffff);
+ document::BucketId id4(18, 0x1ffff);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+ map.insert(id3.stripUnused().toKey(), A(3,4,5), "foo", preExisted);
+ map.insert(id4.stripUnused().toKey(), A(4,5,6), "foo", preExisted);
+
+ document::BucketId id(22, 0x1ffff);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getContained(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)3, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(1,2,3), *results[id1.stripUnused()]);
+ CPPUNIT_ASSERT_EQUAL(A(4,5,6), *results[id4.stripUnused()]);
+ CPPUNIT_ASSERT_EQUAL(A(3,4,5), *results[id3.stripUnused()]);
+#endif
+}
+
+void
+LockableMapTest::testFindBuckets3() { // ticket 3121525
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(16, 0x0ffff);
+ document::BucketId id2(17, 0x0ffff);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+
+ document::BucketId id(22, 0x1ffff);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getContained(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)1, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(1,2,3), *results[id1.stripUnused()]);
+#endif
+}
+
+void
+LockableMapTest::testFindBuckets4() { // ticket 3121525
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(16, 0x0ffff);
+ document::BucketId id2(17, 0x0ffff);
+ document::BucketId id3(19, 0x1ffff);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+ map.insert(id3.stripUnused().toKey(), A(3,4,5), "foo", preExisted);
+
+ document::BucketId id(18, 0x1ffff);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getContained(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)1, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(1,2,3), *results[id1.stripUnused()]);
+#endif
+}
+
+void
+LockableMapTest::testFindBuckets5() { // ticket 3121525
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(16, 0x0ffff);
+ document::BucketId id2(17, 0x0ffff);
+ document::BucketId id3(19, 0x5ffff);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+ map.insert(id3.stripUnused().toKey(), A(3,4,5), "foo", preExisted);
+
+ document::BucketId id(18, 0x1ffff);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getContained(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)1, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(1,2,3), *results[id1.stripUnused()]);
+#endif
+}
+
+void
+LockableMapTest::testFindNoBuckets() {
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id(16, 0x0ffff);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getAll(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)0, results.size());
+#endif
+}
+
+void
+LockableMapTest::testFindAll() {
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(16, 0x0aaaa); // contains id2-id7
+ document::BucketId id2(17, 0x0aaaa); // contains id3-id4
+ document::BucketId id3(20, 0xcaaaa);
+ document::BucketId id4(20, 0xeaaaa);
+ document::BucketId id5(17, 0x1aaaa); // contains id6-id7
+ document::BucketId id6(20, 0xdaaaa);
+ document::BucketId id7(20, 0xfaaaa);
+ document::BucketId id8(20, 0xceaaa);
+ document::BucketId id9(17, 0x1ffff);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+ map.insert(id3.stripUnused().toKey(), A(3,4,5), "foo", preExisted);
+ map.insert(id4.stripUnused().toKey(), A(4,5,6), "foo", preExisted);
+ map.insert(id5.stripUnused().toKey(), A(5,6,7), "foo", preExisted);
+ map.insert(id6.stripUnused().toKey(), A(6,7,8), "foo", preExisted);
+ map.insert(id7.stripUnused().toKey(), A(7,8,9), "foo", preExisted);
+ map.insert(id8.stripUnused().toKey(), A(8,9,10), "foo", preExisted);
+ map.insert(id9.stripUnused().toKey(), A(9,10,11), "foo", preExisted);
+ //printBucket("Inserted ", id1);
+ //printBucket("Inserted ", id2);
+ //printBucket("Inserted ", id3);
+ //printBucket("Inserted ", id4);
+ //printBucket("Inserted ", id5);
+ //printBucket("Inserted ", id6);
+ //printBucket("Inserted ", id7);
+ //printBucket("Inserted ", id8);
+ //printBucket("Inserted ", id9);
+
+ document::BucketId id(17, 0x1aaaa);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getAll(id, "foo");
+
+ //std::cerr << "Done: getAll() for bucket " << id << "\n";
+ //printBuckets(results);
+
+ CPPUNIT_ASSERT_EQUAL((size_t)4, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(1,2,3), *results[id1.stripUnused()]); // super bucket
+ CPPUNIT_ASSERT_EQUAL(A(5,6,7), *results[id5.stripUnused()]); // most specific match (exact match)
+ CPPUNIT_ASSERT_EQUAL(A(6,7,8), *results[id6.stripUnused()]); // sub bucket
+ CPPUNIT_ASSERT_EQUAL(A(7,8,9), *results[id7.stripUnused()]); // sub bucket
+
+ id = document::BucketId(16, 0xffff);
+ results = map.getAll(id, "foo");
+
+ //std::cerr << "Done: getAll() for bucket " << id << "\n";
+ //printBuckets(results);
+
+ CPPUNIT_ASSERT_EQUAL((size_t)1, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(9,10,11), *results[id9.stripUnused()]); // sub bucket
+#endif
+}
+
+void
+LockableMapTest::testFindAll2() { // Ticket 3121525
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(17, 0x00001);
+ document::BucketId id2(17, 0x10001);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+
+ document::BucketId id(16, 0x00001);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getAll(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)2, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(1,2,3), *results[id1.stripUnused()]); // sub bucket
+ CPPUNIT_ASSERT_EQUAL(A(2,3,4), *results[id2.stripUnused()]); // sub bucket
+#endif
+}
+
+void
+LockableMapTest::testFindAllUnusedBitIsSet() { // ticket 2938896
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(24, 0x000dc7089);
+ document::BucketId id2(33, 0x0053c7089);
+ document::BucketId id3(33, 0x1053c7089);
+ document::BucketId id4(24, 0x000bc7089);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+ map.insert(id3.stripUnused().toKey(), A(3,4,5), "foo", preExisted);
+ map.insert(id4.stripUnused().toKey(), A(4,5,6), "foo", preExisted);
+
+ document::BucketId id(33, 0x1053c7089);
+ id.setUsedBits(32); // Bit 33 is set, but unused
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getAll(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)2, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(2,3,4), *results[id2.stripUnused()]); // sub bucket
+ CPPUNIT_ASSERT_EQUAL(A(3,4,5), *results[id3.stripUnused()]); // sub bucket
+#endif
+}
+
+void
+LockableMapTest::testFindAllInconsistentlySplit() { // Ticket 2938896
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(16, 0x00001); // contains id2-id3
+ document::BucketId id2(17, 0x00001);
+ document::BucketId id3(17, 0x10001);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+ map.insert(id3.stripUnused().toKey(), A(3,4,5), "foo", preExisted);
+
+ document::BucketId id(16, 0x00001);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getAll(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)3, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(1,2,3), *results[id1.stripUnused()]); // most specific match (exact match)
+ CPPUNIT_ASSERT_EQUAL(A(2,3,4), *results[id2.stripUnused()]); // sub bucket
+ CPPUNIT_ASSERT_EQUAL(A(3,4,5), *results[id3.stripUnused()]); // sub bucket
+#endif
+}
+
+void
+LockableMapTest::testFindAllInconsistentlySplit2() { // ticket 3121525
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(17, 0x10000);
+ document::BucketId id2(27, 0x007228034); // contains id3
+ document::BucketId id3(29, 0x007228034);
+ document::BucketId id4(17, 0x1ffff);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+ map.insert(id3.stripUnused().toKey(), A(3,4,5), "foo", preExisted);
+ map.insert(id4.stripUnused().toKey(), A(4,5,6), "foo", preExisted);
+
+ document::BucketId id(32, 0x027228034);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getAll(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)2, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(2,3,4), *results[id2.stripUnused()]); // super bucket
+ CPPUNIT_ASSERT_EQUAL(A(3,4,5), *results[id3.stripUnused()]); // most specific match (super bucket)
+#endif
+}
+
+void
+LockableMapTest::testFindAllInconsistentlySplit3() { // ticket 3121525
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(16, 0x0ffff); // contains id2
+ document::BucketId id2(17, 0x0ffff);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+
+ document::BucketId id(22, 0x1ffff);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getAll(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)1, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(1,2,3), *results[id1.stripUnused()]); // super bucket
+#endif
+}
+
+void
+LockableMapTest::testFindAllInconsistentlySplit4() { // ticket 3121525
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(16, 0x0ffff); // contains id2-id3
+ document::BucketId id2(17, 0x0ffff);
+ document::BucketId id3(19, 0x1ffff);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+ map.insert(id3.stripUnused().toKey(), A(3,4,5), "foo", preExisted);
+
+ document::BucketId id(18, 0x1ffff);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getAll(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)2, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(1,2,3), *results[id1.stripUnused()]); // super bucket
+ CPPUNIT_ASSERT_EQUAL(A(3,4,5), *results[id3.stripUnused()]); // sub bucket
+#endif
+}
+
+void
+LockableMapTest::testFindAllInconsistentlySplit5() { // ticket 3121525
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(16, 0x0ffff); // contains id2-id3
+ document::BucketId id2(17, 0x0ffff);
+ document::BucketId id3(19, 0x5ffff);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+ map.insert(id3.stripUnused().toKey(), A(3,4,5), "foo", preExisted);
+
+ document::BucketId id(18, 0x1ffff);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getAll(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)2, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(1,2,3), *results[id1.stripUnused()]); // super bucket
+ CPPUNIT_ASSERT_EQUAL(A(3,4,5), *results[id3.stripUnused()]); // sub bucket
+#endif
+}
+
+void
+LockableMapTest::testFindAllInconsistentlySplit6() {
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(16, 0x0ffff); // contains id2-id3
+ document::BucketId id2(18, 0x1ffff);
+ document::BucketId id3(19, 0x7ffff);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+ map.insert(id3.stripUnused().toKey(), A(3,4,5), "foo", preExisted);
+
+ document::BucketId id(18, 0x3ffff);
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getAll(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL((size_t)2, results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(1,2,3), *results[id1.stripUnused()]); // super bucket
+ CPPUNIT_ASSERT_EQUAL(A(3,4,5), *results[id3.stripUnused()]); // sub bucket
+}
+
+void
+LockableMapTest::testFindAllInconsistentBelow16Bits()
+{
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+
+ document::BucketId id1(1, 0x1); // contains id2-id3
+ document::BucketId id2(3, 0x1);
+ document::BucketId id3(4, 0xD);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ map.insert(id2.stripUnused().toKey(), A(2,3,4), "foo", preExisted);
+ map.insert(id3.stripUnused().toKey(), A(3,4,5), "foo", preExisted);
+
+ document::BucketId id(3, 0x5);
+
+ std::map<document::BucketId, Map::WrappedEntry> results =
+ map.getAll(id, "foo");
+
+ CPPUNIT_ASSERT_EQUAL(size_t(2), results.size());
+
+ CPPUNIT_ASSERT_EQUAL(A(1,2,3), *results[id1.stripUnused()]); // super bucket
+ CPPUNIT_ASSERT_EQUAL(A(3,4,5), *results[id3.stripUnused()]); // sub bucket
+}
+
+void
+LockableMapTest::testCreate() {
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+ {
+ document::BucketId id1(58, 0x43d6c878000004d2ull);
+
+ std::map<document::BucketId, Map::WrappedEntry> entries(
+ map.getContained(id1, "foo"));
+
+ CPPUNIT_ASSERT_EQUAL((size_t)0, entries.size());
+
+ Map::WrappedEntry entry = map.createAppropriateBucket(36, "", id1);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(36,0x8000004d2ull),
+ entry.getBucketId());
+ }
+ {
+ document::BucketId id1(58, 0x423bf1e0000004d2ull);
+
+ std::map<document::BucketId, Map::WrappedEntry> entries(
+ map.getContained(id1, "foo"));
+ CPPUNIT_ASSERT_EQUAL((size_t)0, entries.size());
+
+ Map::WrappedEntry entry = map.createAppropriateBucket(36, "", id1);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(36,0x0000004d2ull),
+ entry.getBucketId());
+ }
+
+ CPPUNIT_ASSERT_EQUAL((size_t)2, map.size());
+#endif
+}
+
+void
+LockableMapTest::testCreate2() {
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+ {
+ document::BucketId id1(58, 0xeaf77782000004d2);
+ Map::WrappedEntry entry(
+ map.get(id1.stripUnused().toKey(), "foo", true));
+ }
+ {
+ document::BucketId id1(58, 0x00000000000004d2);
+ std::map<document::BucketId, Map::WrappedEntry> entries(
+ map.getContained(id1, "foo"));
+
+ CPPUNIT_ASSERT_EQUAL((size_t)0, entries.size());
+
+ Map::WrappedEntry entry = map.createAppropriateBucket(16, "", id1);
+
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(34, 0x0000004d2ull),
+ entry.getBucketId());
+ }
+
+ CPPUNIT_ASSERT_EQUAL((size_t)2, map.size());
+#endif
+}
+
+void
+LockableMapTest::testCreate3() {
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+ {
+ document::BucketId id1(58, 0xeaf77780000004d2);
+ Map::WrappedEntry entry(
+ map.get(id1.stripUnused().toKey(), "foo", true));
+ }
+ {
+ document::BucketId id1(58, 0xeaf77782000004d2);
+ Map::WrappedEntry entry(
+ map.get(id1.stripUnused().toKey(), "foo", true));
+ }
+ {
+ document::BucketId id1(58, 0x00000000000004d2);
+ std::map<document::BucketId, Map::WrappedEntry> entries(
+ map.getContained(id1, "foo"));
+
+ CPPUNIT_ASSERT_EQUAL((size_t)0, entries.size());
+
+ Map::WrappedEntry entry = map.createAppropriateBucket(16, "", id1);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(40, 0x0000004d2ull),
+ entry.getBucketId());
+ }
+#endif
+}
+
+void
+LockableMapTest::testCreate4() {
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+ {
+ document::BucketId id1(16, 0x00000000000004d1);
+ Map::WrappedEntry entry(
+ map.get(id1.stripUnused().toKey(), "foo", true));
+ }
+ {
+ document::BucketId id1(40, 0x00000000000004d2);
+ Map::WrappedEntry entry(
+ map.get(id1.stripUnused().toKey(), "foo", true));
+ }
+ {
+ document::BucketId id1(58, 0x00000000010004d2);
+ Map::WrappedEntry entry = map.createAppropriateBucket(16, "", id1);
+
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(25, 0x0010004d2ull),
+ entry.getBucketId());
+ }
+#endif
+}
+
+void
+LockableMapTest::testCreate6() {
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+ {
+ document::BucketId id1(0x8c000000000004d2);
+ Map::WrappedEntry entry(
+ map.get(id1.stripUnused().toKey(), "foo", true));
+ }
+
+ {
+ document::BucketId id1(0xeb54b3ac000004d2);
+ Map::WrappedEntry entry(
+ map.get(id1.stripUnused().toKey(), "foo", true));
+ }
+
+ {
+ document::BucketId id1(0x88000002000004d2);
+ Map::WrappedEntry entry(
+ map.get(id1.stripUnused().toKey(), "foo", true));
+ }
+ {
+ document::BucketId id1(0x84000001000004d2);
+ Map::WrappedEntry entry(
+ map.get(id1.stripUnused().toKey(), "foo", true));
+ }
+ {
+ document::BucketId id1(0xe9944a44000004d2);
+ Map::WrappedEntry entry = map.createAppropriateBucket(16, "", id1);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(0x90000004000004d2),
+ entry.getBucketId());
+ }
+#endif
+}
+
+
+void
+LockableMapTest::testCreate5() {
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+ {
+ document::BucketId id1(58, 0xeaf77780000004d2);
+ Map::WrappedEntry entry(
+ map.get(id1.stripUnused().toKey(), "foo", true));
+ }
+ {
+ document::BucketId id1(40, 0x00000000000004d1);
+
+ Map::WrappedEntry entry(
+ map.get(id1.stripUnused().toKey(), "foo", true));
+ }
+ {
+ document::BucketId id1(58, 0x00000000010004d2);
+ Map::WrappedEntry entry = map.createAppropriateBucket(16, "", id1);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(25, 0x0010004d2ull),
+ entry.getBucketId());
+ }
+#endif
+}
+
+void
+LockableMapTest::testCreateEmpty() {
+#if __WORDSIZE == 64
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+ {
+ document::BucketId id1(58, 0x00000000010004d2);
+ Map::WrappedEntry entry = map.createAppropriateBucket(16, "", id1);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 0x0000004d2ull),
+ entry.getBucketId());
+ }
+#endif
+}
+
+void
+LockableMapTest::testIsConsistent()
+{
+ typedef LockableMap<JudyMultiMap<A> > Map;
+ Map map;
+ document::BucketId id1(16, 0x00001); // contains id2-id3
+ document::BucketId id2(17, 0x00001);
+
+ bool preExisted;
+ map.insert(id1.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ {
+ Map::WrappedEntry entry(
+ map.get(id1.stripUnused().toKey(), "foo", true));
+ CPPUNIT_ASSERT(map.isConsistent(entry));
+ }
+ map.insert(id2.stripUnused().toKey(), A(1,2,3), "foo", preExisted);
+ {
+ Map::WrappedEntry entry(
+ map.get(id1.stripUnused().toKey(), "foo", true));
+ CPPUNIT_ASSERT(!map.isConsistent(entry));
+ }
+}
+
+} // storage
diff --git a/storage/src/tests/bucketmover/CMakeLists.txt b/storage/src/tests/bucketmover/CMakeLists.txt
new file mode 100644
index 00000000000..2d02cdc4942
--- /dev/null
+++ b/storage/src/tests/bucketmover/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_testbucketmover
+ SOURCES
+ bucketmovertest.cpp
+ htmltabletest.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/tests/bucketmover/bucketmovertest.cpp b/storage/src/tests/bucketmover/bucketmovertest.cpp
new file mode 100644
index 00000000000..2720e6bac2a
--- /dev/null
+++ b/storage/src/tests/bucketmover/bucketmovertest.cpp
@@ -0,0 +1,190 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storage/bucketmover/bucketmover.h>
+#include <tests/common/dummystoragelink.h>
+#include <tests/common/testhelper.h>
+#include <tests/common/teststorageapp.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+
+bool debug = false;
+
+namespace storage {
+namespace bucketmover {
+
+struct BucketMoverTest : public CppUnit::TestFixture {
+public:
+ void setUp();
+ void tearDown();
+
+ void testNormalUsage();
+ void testMaxPending();
+ void testErrorHandling();
+
+ CPPUNIT_TEST_SUITE(BucketMoverTest);
+ CPPUNIT_TEST(testNormalUsage);
+ CPPUNIT_TEST(testMaxPending);
+ CPPUNIT_TEST(testErrorHandling);
+ CPPUNIT_TEST_SUITE_END();
+
+ std::unique_ptr<TestServiceLayerApp> _node;
+ std::unique_ptr<ServiceLayerComponent> _component;
+ std::unique_ptr<BucketMover> _bucketMover;
+ DummyStorageLink* after;
+
+private:
+ void addBucket(const document::BucketId& id, uint16_t idealDiff);
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(BucketMoverTest);
+
+void
+BucketMoverTest::tearDown()
+{
+ _node.reset(0);
+}
+
+void
+BucketMoverTest::setUp()
+{
+ try {
+ _node.reset(new TestServiceLayerApp(DiskCount(4)));
+ _node->setupDummyPersistence();
+ } catch (config::InvalidConfigException& e) {
+ fprintf(stderr, "%s\n", e.what());
+ }
+
+ _component.reset(new ServiceLayerComponent(_node->getComponentRegister(), "foo"));
+ _bucketMover.reset(new BucketMover("raw:", _node->getComponentRegister()));
+ after = new DummyStorageLink();
+ _bucketMover->push_back(StorageLink::UP(after));
+}
+
+void
+BucketMoverTest::addBucket(const document::BucketId& id,
+ uint16_t idealDiff)
+{
+ StorBucketDatabase::WrappedEntry entry(
+ _component->getBucketDatabase().get(
+ id,
+ "",
+ StorBucketDatabase::CREATE_IF_NONEXISTING));
+
+ entry->setBucketInfo(api::BucketInfo(1,1,1));
+
+ uint16_t idealDisk = _component->getIdealPartition(id);
+ entry->disk = (idealDisk + idealDiff) % _component->getDiskCount();
+ entry.write();
+}
+
+void
+BucketMoverTest::testNormalUsage()
+{
+ for (uint32_t i = 1; i < 4; ++i) {
+ addBucket(document::BucketId(16, i), 1);
+ }
+ for (uint32_t i = 4; i < 6; ++i) {
+ addBucket(document::BucketId(16, i), 0);
+ }
+
+ _bucketMover->open();
+ _bucketMover->tick();
+
+ std::vector<api::StorageMessage::SP> msgs = after->getCommandsOnce();
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketDiskMoveCommand("
+ "BucketId(0x4000000000000002), source 3, target 2)"),
+ msgs[0]->toString());
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketDiskMoveCommand("
+ "BucketId(0x4000000000000001), source 2, target 1)"),
+ msgs[1]->toString());
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketDiskMoveCommand("
+ "BucketId(0x4000000000000003), source 1, target 0)"),
+ msgs[2]->toString());
+
+ for (uint32_t i = 0; i < 2; ++i) {
+ after->sendUp(std::shared_ptr<api::StorageMessage>(
+ ((api::StorageCommand*)msgs[i].get())->
+ makeReply().release()));
+ }
+
+ _bucketMover->tick();
+ CPPUNIT_ASSERT_EQUAL(0, (int)after->getNumCommands());
+
+ _bucketMover->finishCurrentRun();
+}
+
+void
+BucketMoverTest::testMaxPending()
+{
+ for (uint32_t i = 1; i < 100; ++i) {
+ addBucket(document::BucketId(16, i), 1);
+ }
+ for (uint32_t i = 101; i < 200; ++i) {
+ addBucket(document::BucketId(16, i), 0);
+ }
+
+ _bucketMover->open();
+ _bucketMover->tick();
+
+ std::vector<api::StorageMessage::SP> msgs = after->getCommandsOnce();
+ // 5 is the max pending default config.
+ CPPUNIT_ASSERT_EQUAL(5, (int)msgs.size());
+
+ after->sendUp(std::shared_ptr<api::StorageMessage>(
+ ((api::StorageCommand*)msgs[3].get())->
+ makeReply().release()));
+
+ _bucketMover->tick();
+
+ std::vector<api::StorageMessage::SP> msgs2 = after->getCommandsOnce();
+ CPPUNIT_ASSERT_EQUAL(1, (int)msgs2.size());
+}
+
+void
+BucketMoverTest::testErrorHandling()
+{
+ for (uint32_t i = 1; i < 100; ++i) {
+ addBucket(document::BucketId(16, i), 1);
+ }
+ for (uint32_t i = 101; i < 200; ++i) {
+ addBucket(document::BucketId(16, i), 0);
+ }
+
+ _bucketMover->open();
+ _bucketMover->tick();
+
+ std::vector<api::StorageMessage::SP> msgs = after->getCommandsOnce();
+ // 5 is the max pending default config.
+ CPPUNIT_ASSERT_EQUAL(5, (int)msgs.size());
+
+ BucketDiskMoveCommand& cmd = static_cast<BucketDiskMoveCommand&>(*msgs[0]);
+ uint32_t targetDisk = cmd.getDstDisk();
+
+ std::unique_ptr<api::StorageReply> reply(cmd.makeReply().release());
+ reply->setResult(api::ReturnCode(api::ReturnCode::INTERNAL_FAILURE, "foobar"));
+ after->sendUp(std::shared_ptr<api::StorageMessage>(reply.release()));
+
+ for (uint32_t i = 1; i < msgs.size(); ++i) {
+ after->sendUp(std::shared_ptr<api::StorageMessage>(
+ ((api::StorageCommand*)msgs[i].get())->
+ makeReply().release()));
+ }
+
+ _bucketMover->tick();
+
+ std::vector<api::StorageMessage::SP> msgs2 = after->getCommandsOnce();
+ CPPUNIT_ASSERT_EQUAL(5, (int)msgs2.size());
+
+ for (uint32_t i = 0; i < msgs2.size(); ++i) {
+ BucketDiskMoveCommand& bdm = static_cast<BucketDiskMoveCommand&>(*msgs2[i]);
+ CPPUNIT_ASSERT(bdm.getDstDisk() != targetDisk);
+ }
+}
+
+} // bucketmover
+} // storage
diff --git a/storage/src/tests/bucketmover/htmltabletest.cpp b/storage/src/tests/bucketmover/htmltabletest.cpp
new file mode 100644
index 00000000000..98cf68d489a
--- /dev/null
+++ b/storage/src/tests/bucketmover/htmltabletest.cpp
@@ -0,0 +1,100 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/bucketmover/htmltable.h>
+#include <tests/common/testhelper.h>
+
+namespace storage {
+
+struct HtmlTableTest : public CppUnit::TestFixture {
+
+ void testPercentageColumn();
+ void testByteSizeColumn();
+
+ CPPUNIT_TEST_SUITE(HtmlTableTest);
+ CPPUNIT_TEST(testPercentageColumn);
+ CPPUNIT_TEST(testByteSizeColumn);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(HtmlTableTest);
+
+void HtmlTableTest::testPercentageColumn()
+{
+ // With total hardcoded to 100
+ {
+ HtmlTable table("disk");
+ PercentageColumn perc("fillrate", 100);
+ perc.addColorLimit(70, Column::LIGHT_GREEN);
+ perc.addColorLimit(85, Column::LIGHT_YELLOW);
+ perc.addColorLimit(100, Column::LIGHT_RED);
+ table.addColumn(perc);
+ table.addRow(0);
+ table.addRow(1);
+ table.addRow(2);
+ perc[0] = 30;
+ perc[1] = 80;
+ perc[2] = 100;
+ std::ostringstream ost;
+ table.print(ost);
+ std::string expected(
+"<table border=\"1\" cellpadding=\"2\" cellspacing=\"0\">\n"
+"<tr><th>disk</th><th>fillrate</th></tr>\n"
+"<tr><td>0</td><td bgcolor=\"#a0ffa0\" align=\"right\">30.00 %</td></tr>\n"
+"<tr><td>1</td><td bgcolor=\"#ffffa0\" align=\"right\">80.00 %</td></tr>\n"
+"<tr><td>2</td><td bgcolor=\"#ffa0a0\" align=\"right\">100.00 %</td></tr>\n"
+"</table>\n");
+ CPPUNIT_ASSERT_EQUAL(expected, ost.str());
+ }
+ // With automatically gathered total
+ {
+ HtmlTable table("disk");
+ PercentageColumn perc("fillrate");
+ table.addColumn(perc);
+ table.addRow(0);
+ table.addRow(1);
+ table.addRow(2);
+ perc[0] = 30;
+ perc[1] = 80;
+ perc[2] = 100;
+ std::ostringstream ost;
+ table.print(ost);
+ std::string expected(
+ "<table border=\"1\" cellpadding=\"2\" cellspacing=\"0\">\n"
+ "<tr><th>disk</th><th>fillrate</th></tr>\n"
+ "<tr><td>0</td><td align=\"right\">14.29 %</td></tr>\n"
+ "<tr><td>1</td><td align=\"right\">38.10 %</td></tr>\n"
+ "<tr><td>2</td><td align=\"right\">47.62 %</td></tr>\n"
+ "</table>\n");
+ CPPUNIT_ASSERT_EQUAL(expected, ost.str());
+ }
+}
+
+void HtmlTableTest::testByteSizeColumn()
+{
+ {
+ HtmlTable table("disk");
+ ByteSizeColumn size("size");
+ table.addColumn(size);
+ table.addRow(0);
+ table.addRow(1);
+ table.addRow(2);
+ // Biggest value enforce the denomination
+ size[0] = 42123;
+ size[1] = 124123151;
+ size[2] = 6131231;
+ std::ostringstream ost;
+ table.print(ost);
+ std::string expected(
+ "<table border=\"1\" cellpadding=\"2\" cellspacing=\"0\">\n"
+ "<tr><th>disk</th><th>size</th></tr>\n"
+ "<tr><td>0</td><td align=\"right\">0 MB</td></tr>\n"
+ "<tr><td>1</td><td align=\"right\">118 MB</td></tr>\n"
+ "<tr><td>2</td><td align=\"right\">5 MB</td></tr>\n"
+ "</table>\n");
+ CPPUNIT_ASSERT_EQUAL(expected, ost.str());
+ }
+
+}
+
+} // storage
diff --git a/storage/src/tests/common/.gitignore b/storage/src/tests/common/.gitignore
new file mode 100644
index 00000000000..333f254ba10
--- /dev/null
+++ b/storage/src/tests/common/.gitignore
@@ -0,0 +1,8 @@
+*.So
+*.lo
+.*.swp
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
diff --git a/storage/src/tests/common/CMakeLists.txt b/storage/src/tests/common/CMakeLists.txt
new file mode 100644
index 00000000000..309308473e1
--- /dev/null
+++ b/storage/src/tests/common/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_testcommon
+ SOURCES
+ dummystoragelink.cpp
+ testhelper.cpp
+ metricstest.cpp
+ storagelinktest.cpp
+ teststorageapp.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/tests/common/dummystoragelink.cpp b/storage/src/tests/common/dummystoragelink.cpp
new file mode 100644
index 00000000000..d05241cb5b5
--- /dev/null
+++ b/storage/src/tests/common/dummystoragelink.cpp
@@ -0,0 +1,191 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+#include <sys/time.h>
+#include "dummystoragelink.h"
+
+namespace storage {
+
+DummyStorageLink* DummyStorageLink::_last(0);
+
+DummyStorageLink::DummyStorageLink()
+ : StorageLink("Dummy storage link"),
+ _commands(),
+ _replies(),
+ _injected(),
+ _autoReply(false),
+ _useDispatch(false),
+ _ignore(false),
+ _waitMonitor()
+{
+ _last = this;
+}
+
+DummyStorageLink::~DummyStorageLink()
+{
+ // Often a chain with dummy link on top is deleted in unit tests.
+ // If they haven't been closed already, close them for a cleaner
+ // shutdown
+ if (getState() == OPENED) {
+ close();
+ flush();
+ }
+ closeNextLink();
+ reset();
+}
+
+bool
+DummyStorageLink::handleInjectedReply()
+{
+ vespalib::LockGuard guard(_lock);
+ if (!_injected.empty()) {
+ sendUp(*_injected.begin());
+ _injected.pop_front();
+ return true;
+ }
+ return false;
+}
+
+bool DummyStorageLink::onDown(const api::StorageMessage::SP& cmd)
+{
+ if (_ignore) {
+ return false;
+ }
+ bool injected = handleInjectedReply();
+ if (!injected && _autoReply) {
+ if (!cmd->getType().isReply()) {
+ std::shared_ptr<api::StorageReply> reply(
+ std::dynamic_pointer_cast<api::StorageCommand>(cmd)
+ ->makeReply().release());
+ reply->setResult(api::ReturnCode(
+ api::ReturnCode::OK, "Automatically generated reply"));
+ sendUp(reply);
+ }
+ }
+ if (isBottom()) {
+ vespalib::MonitorGuard lock(_waitMonitor);
+ {
+ vespalib::LockGuard guard(_lock);
+ _commands.push_back(cmd);
+ }
+ lock.broadcast();
+ return true;
+ }
+ return StorageLink::onDown(cmd);
+}
+
+bool DummyStorageLink::onUp(const api::StorageMessage::SP& reply) {
+ if (isTop()) {
+ vespalib::MonitorGuard lock(_waitMonitor);
+ {
+ vespalib::LockGuard guard(_lock);
+ _replies.push_back(reply);
+ }
+ lock.broadcast();
+ return true;
+ }
+ return StorageLink::onUp(reply);
+
+}
+
+void DummyStorageLink::injectReply(api::StorageReply* reply)
+{
+ assert(reply);
+ vespalib::LockGuard guard(_lock);
+ _injected.push_back(std::shared_ptr<api::StorageReply>(reply));
+}
+
+void DummyStorageLink::reset() {
+ vespalib::MonitorGuard lock(_waitMonitor);
+ vespalib::LockGuard guard(_lock);
+ _commands.clear();
+ _replies.clear();
+ _injected.clear();
+}
+
+void DummyStorageLink::waitForMessages(unsigned int msgCount, int timeout)
+{
+ framework::defaultimplementation::RealClock clock;
+ framework::MilliSecTime endTime(
+ clock.getTimeInMillis() + framework::MilliSecTime(timeout * 1000));
+ vespalib::MonitorGuard lock(_waitMonitor);
+ while (_commands.size() + _replies.size() < msgCount) {
+ if (timeout != 0 && clock.getTimeInMillis() > endTime) {
+ std::ostringstream ost;
+ ost << "Timed out waiting for " << msgCount << " messages to "
+ << "arrive in dummy storage link. Only "
+ << (_commands.size() + _replies.size()) << " messages seen "
+ << "after timout of " << timeout << " seconds was reached.";
+ throw vespalib::IllegalStateException(ost.str(), VESPA_STRLOC);
+ }
+ if (timeout >= 0) {
+ lock.wait((endTime - clock.getTimeInMillis()).getTime());
+ } else {
+ lock.wait();
+ }
+ }
+}
+
+void DummyStorageLink::waitForMessage(const api::MessageType& type, int timeout)
+{
+ framework::defaultimplementation::RealClock clock;
+ framework::MilliSecTime endTime(
+ clock.getTimeInMillis() + framework::MilliSecTime(timeout * 1000));
+ vespalib::MonitorGuard lock(_waitMonitor);
+ while (true) {
+ for (uint32_t i=0; i<_commands.size(); ++i) {
+ if (_commands[i]->getType() == type) return;
+ }
+ for (uint32_t i=0; i<_replies.size(); ++i) {
+ if (_replies[i]->getType() == type) return;
+ }
+ if (timeout != 0 && clock.getTimeInMillis() > endTime) {
+ std::ostringstream ost;
+ ost << "Timed out waiting for " << type << " message to "
+ << "arrive in dummy storage link. Only "
+ << (_commands.size() + _replies.size()) << " messages seen "
+ << "after timout of " << timeout << " seconds was reached.";
+ if (_commands.size() == 1) {
+ ost << " Found command of type " << _commands[0]->getType();
+ }
+ if (_replies.size() == 1) {
+ ost << " Found command of type " << _replies[0]->getType();
+ }
+ throw vespalib::IllegalStateException(ost.str(), VESPA_STRLOC);
+ }
+ if (timeout >= 0) {
+ lock.wait((endTime - clock.getTimeInMillis()).getTime());
+ } else {
+ lock.wait();
+ }
+ }
+}
+
+api::StorageMessage::SP
+DummyStorageLink::getAndRemoveMessage(const api::MessageType& type)
+{
+ vespalib::MonitorGuard lock(_waitMonitor);
+ for (std::vector<api::StorageMessage::SP>::iterator it = _commands.begin();
+ it != _commands.end(); ++it)
+ {
+ if ((*it)->getType() == type) {
+ api::StorageMessage::SP result(*it);
+ _commands.erase(it);
+ return result;
+ }
+ }
+ for (std::vector<api::StorageMessage::SP>::iterator it = _replies.begin();
+ it != _replies.end(); ++it)
+ {
+ if ((*it)->getType() == type) {
+ api::StorageMessage::SP result(*it);
+ _replies.erase(it);
+ return result;
+ }
+ }
+ std::ostringstream ost;
+ ost << "No message of type " << type << " found.";
+ throw vespalib::IllegalStateException(ost.str(), VESPA_STRLOC);
+}
+
+} // storage
diff --git a/storage/src/tests/common/dummystoragelink.h b/storage/src/tests/common/dummystoragelink.h
new file mode 100644
index 00000000000..072d961cbc0
--- /dev/null
+++ b/storage/src/tests/common/dummystoragelink.h
@@ -0,0 +1,121 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/util/sync.h>
+#include <list>
+#include <sstream>
+#include <vespa/storageapi/messageapi/storagecommand.h>
+#include <string>
+#include <vector>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storageapi/message/internal.h>
+
+class FastOS_ThreadPool;
+
+namespace storage {
+
+class DummyStorageLink : public StorageLink {
+
+ mutable vespalib::Lock _lock; // to protect below containers:
+ std::vector<api::StorageMessage::SP> _commands;
+ std::vector<api::StorageMessage::SP> _replies;
+ std::list<api::StorageMessage::SP> _injected;
+
+ bool _autoReply;
+ bool _useDispatch;
+ bool _ignore;
+ static DummyStorageLink* _last;
+ vespalib::Monitor _waitMonitor;
+
+public:
+ DummyStorageLink();
+ ~DummyStorageLink();
+
+ bool onDown(const api::StorageMessage::SP&);
+ bool onUp(const api::StorageMessage::SP&);
+
+ void addOnTopOfChain(StorageLink& link) {
+ link.addTestLinkOnTop(this);
+ }
+
+ void print(std::ostream& ost, bool verbose, const std::string& indent) const
+ {
+ (void) verbose;
+ ost << indent << "DummyStorageLink("
+ << "autoreply = " << (_autoReply ? "on" : "off")
+ << ", dispatch = " << (_useDispatch ? "on" : "off")
+ << ", " << _commands.size() << " commands"
+ << ", " << _replies.size() << " replies";
+ if (_injected.size() > 0)
+ ost << ", " << _injected.size() << " injected";
+ ost << ")";
+ }
+
+ void injectReply(api::StorageReply* reply);
+ void reset();
+ void setAutoreply(bool autoReply) { _autoReply = autoReply; }
+ void setIgnore(bool ignore) { _ignore = ignore; }
+ // Timeout is given in seconds
+ void waitForMessages(unsigned int msgCount = 1, int timeout = -1);
+ // Wait for a single message of a given type
+ void waitForMessage(const api::MessageType&, int timeout = -1);
+
+ api::StorageMessage::SP getCommand(size_t i) const {
+ vespalib::LockGuard guard(_lock);
+ api::StorageMessage::SP ret = _commands[i];
+ return ret;
+ }
+ api::StorageMessage::SP getReply(size_t i) const {
+ vespalib::LockGuard guard(_lock);
+ api::StorageMessage::SP ret = _replies[i];
+ return ret;
+ }
+ size_t getNumCommands() const {
+ vespalib::LockGuard guard(_lock);
+ return _commands.size();
+ }
+ size_t getNumReplies() const {
+ vespalib::LockGuard guard(_lock);
+ return _replies.size();
+ }
+
+ const std::vector<api::StorageMessage::SP>& getCommands() const
+ { return _commands; }
+ const std::vector<api::StorageMessage::SP>& getReplies() const
+ { return _replies; }
+
+ std::vector<api::StorageMessage::SP> getCommandsOnce() {
+ vespalib::MonitorGuard lock(_waitMonitor);
+ std::vector<api::StorageMessage::SP> retval;
+ {
+ vespalib::LockGuard guard(_lock);
+ retval.swap(_commands);
+ }
+ return retval;
+ }
+
+ std::vector<api::StorageMessage::SP> getRepliesOnce() {
+ vespalib::MonitorGuard lock(_waitMonitor);
+ std::vector<api::StorageMessage::SP> retval;
+ {
+ vespalib::LockGuard guard(_lock);
+ retval.swap(_replies);
+ }
+ return retval;
+ }
+
+ api::StorageMessage::SP getAndRemoveMessage(const api::MessageType&);
+
+ static DummyStorageLink* getLast() { return _last; }
+private:
+ /**
+ * Auto-reply with an injected message if one is available and return
+ * whether such an injection took place.
+ */
+ bool handleInjectedReply();
+};
+
+}
+
diff --git a/storage/src/tests/common/hostreporter/CMakeLists.txt b/storage/src/tests/common/hostreporter/CMakeLists.txt
new file mode 100644
index 00000000000..f0cb197c5e2
--- /dev/null
+++ b/storage/src/tests/common/hostreporter/CMakeLists.txt
@@ -0,0 +1,14 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_testhostreporter
+ SOURCES
+ cpureportertest.cpp
+ memreportertest.cpp
+ networkreportertest.cpp
+ versionreportertest.cpp
+ diskreportertest.cpp
+ util.cpp
+ hostinfotest.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/tests/common/hostreporter/cpureportertest.cpp b/storage/src/tests/common/hostreporter/cpureportertest.cpp
new file mode 100644
index 00000000000..56a929c3aff
--- /dev/null
+++ b/storage/src/tests/common/hostreporter/cpureportertest.cpp
@@ -0,0 +1,40 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/common/hostreporter/cpureporter.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/util/jsonstream.h>
+#include "util.h"
+
+LOG_SETUP(".test.cpureporter");
+
+namespace storage {
+namespace {
+using Object = vespalib::JsonStream::Object;
+using End = vespalib::JsonStream::End;
+}
+
+struct CpuReporterTest : public CppUnit::TestFixture
+{
+ void testCpuReporter();
+
+ CPPUNIT_TEST_SUITE(CpuReporterTest);
+ CPPUNIT_TEST(testCpuReporter);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(CpuReporterTest);
+
+void
+CpuReporterTest::testCpuReporter()
+{
+ CpuReporter cpuReporter;
+ vespalib::Slime slime;
+ util::reporterToSlime(cpuReporter, slime);
+ CPPUNIT_ASSERT(1.0 <= slime.get()["cpu"]["context switches"].asDouble());
+ CPPUNIT_ASSERT(1.0 <= slime.get()["cpu"]["cputotal"]["user"].asDouble());
+ CPPUNIT_ASSERT(1.0 <= slime.get()["cpu"]["cputotal"]["user"].asDouble());
+ CPPUNIT_ASSERT(1.0 <= slime.get()["cpu"]["cputotal"]["user"].asDouble());
+}
+} // storage
diff --git a/storage/src/tests/common/hostreporter/diskreportertest.cpp b/storage/src/tests/common/hostreporter/diskreportertest.cpp
new file mode 100644
index 00000000000..158a77c2e7e
--- /dev/null
+++ b/storage/src/tests/common/hostreporter/diskreportertest.cpp
@@ -0,0 +1,33 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/common/hostreporter/diskreporter.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/util/jsonstream.h>
+#include "util.h"
+
+LOG_SETUP(".test.diskreporter");
+
+namespace storage {
+
+struct DiskReporterTest : public CppUnit::TestFixture
+{
+ void testDiskReporter();
+
+ CPPUNIT_TEST_SUITE(DiskReporterTest);
+ CPPUNIT_TEST(testDiskReporter);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(DiskReporterTest);
+
+void
+DiskReporterTest::testDiskReporter()
+{
+ DiskReporter diskReporter;
+ vespalib::Slime slime;
+ util::reporterToSlime(diskReporter, slime);
+ CPPUNIT_ASSERT(0 < slime.get()["disk"].toString().size());
+}
+} // storage
diff --git a/storage/src/tests/common/hostreporter/hostinfotest.cpp b/storage/src/tests/common/hostreporter/hostinfotest.cpp
new file mode 100644
index 00000000000..99954c19840
--- /dev/null
+++ b/storage/src/tests/common/hostreporter/hostinfotest.cpp
@@ -0,0 +1,60 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/common/hostreporter/hostinfo.h>
+#include <vespa/storage/common/hostreporter/hostreporter.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/util/jsonstream.h>
+#include "util.h"
+
+LOG_SETUP(".test.hostinforeporter");
+
+namespace storage {
+namespace {
+using Object = vespalib::JsonStream::Object;
+using End = vespalib::JsonStream::End;
+using JsonFormat = vespalib::slime::JsonFormat;
+using Memory = vespalib::slime::Memory;
+
+class DummyReporter: public HostReporter {
+public:
+ void report(vespalib::JsonStream& jsonreport) override {
+ jsonreport << "dummy" << Object() << "foo" << "bar" << End();
+ }
+};
+}
+
+struct HostInfoReporterTest : public CppUnit::TestFixture
+{
+ void testHostInfoReporter();
+
+ CPPUNIT_TEST_SUITE(HostInfoReporterTest);
+ CPPUNIT_TEST(testHostInfoReporter);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(HostInfoReporterTest);
+
+void
+HostInfoReporterTest::testHostInfoReporter()
+{
+ HostInfo hostinfo;
+ DummyReporter dummyReporter;
+ hostinfo.registerReporter(&dummyReporter);
+ vespalib::asciistream json;
+ vespalib::JsonStream stream(json, true);
+
+ stream << Object();
+ hostinfo.printReport(stream);
+ stream << End();
+
+ std::string jsonData = json.str();
+ vespalib::Slime slime;
+ JsonFormat::decode(Memory(jsonData), slime);
+ CPPUNIT_ASSERT(slime.get()["dummy"]["foo"].asString() == "bar");
+ CPPUNIT_ASSERT(0 < slime.get()["network"]["lo"]["input"]["packets"].asLong());
+ CPPUNIT_ASSERT(1.0 <= slime.get()["cpu"]["context switches"].asDouble());
+}
+} // storage
+
diff --git a/storage/src/tests/common/hostreporter/memreportertest.cpp b/storage/src/tests/common/hostreporter/memreportertest.cpp
new file mode 100644
index 00000000000..3eedfd48a3c
--- /dev/null
+++ b/storage/src/tests/common/hostreporter/memreportertest.cpp
@@ -0,0 +1,44 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/common/hostreporter/memreporter.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/util/jsonstream.h>
+#include "util.h"
+
+LOG_SETUP(".test.memreporter");
+
+namespace storage {
+namespace {
+using Object = vespalib::JsonStream::Object;
+using End = vespalib::JsonStream::End;
+}
+
+struct MemReporterTest : public CppUnit::TestFixture
+{
+ void testMemReporter();
+
+ CPPUNIT_TEST_SUITE(MemReporterTest);
+ CPPUNIT_TEST(testMemReporter);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(MemReporterTest);
+
+void
+MemReporterTest::testMemReporter()
+{
+ MemReporter memReporter;
+ vespalib::Slime slime;
+ util::reporterToSlime(memReporter, slime);
+ CPPUNIT_ASSERT(0 < slime.get()["memory"]["total memory"].asLong());
+ CPPUNIT_ASSERT(0 < slime.get()["memory"]["free memory"].asLong());
+ CPPUNIT_ASSERT(0 < slime.get()["memory"]["disk cache"].asLong());
+ CPPUNIT_ASSERT(0 < slime.get()["memory"]["active memory"].asLong());
+ CPPUNIT_ASSERT(0 < slime.get()["memory"]["inactive memory"].asLong());
+ CPPUNIT_ASSERT(0 <= slime.get()["memory"]["swap total"].asLong());
+ CPPUNIT_ASSERT(0 <= slime.get()["memory"]["swap free"].asLong());
+ CPPUNIT_ASSERT(0 < slime.get()["memory"]["dirty"].asLong());
+}
+} // storage
diff --git a/storage/src/tests/common/hostreporter/networkreportertest.cpp b/storage/src/tests/common/hostreporter/networkreportertest.cpp
new file mode 100644
index 00000000000..cba5717adce
--- /dev/null
+++ b/storage/src/tests/common/hostreporter/networkreportertest.cpp
@@ -0,0 +1,40 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/common/hostreporter/networkreporter.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/util/jsonstream.h>
+#include "util.h"
+
+LOG_SETUP(".test.networkreporter");
+
+namespace storage {
+namespace {
+using Object = vespalib::JsonStream::Object;
+using End = vespalib::JsonStream::End;
+}
+
+struct NetworkReporterTest : public CppUnit::TestFixture
+{
+ void testNetworkReporter();
+
+ CPPUNIT_TEST_SUITE(NetworkReporterTest);
+ CPPUNIT_TEST(testNetworkReporter);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(NetworkReporterTest);
+
+void
+NetworkReporterTest::testNetworkReporter()
+{
+ NetworkReporter networkReporter;
+ vespalib::Slime slime;
+ util::reporterToSlime(networkReporter, slime);
+ CPPUNIT_ASSERT(0 < slime.get()["network"]["lo"]["input"]["bytes"].asLong());
+ CPPUNIT_ASSERT(0 < slime.get()["network"]["lo"]["input"]["packets"].asLong());
+ CPPUNIT_ASSERT(0 < slime.get()["network"]["lo"]["output"]["bytes"].asLong());
+ CPPUNIT_ASSERT(0 < slime.get()["network"]["lo"]["output"]["packets"].asLong());
+}
+} // storage
diff --git a/storage/src/tests/common/hostreporter/util.cpp b/storage/src/tests/common/hostreporter/util.cpp
new file mode 100644
index 00000000000..37d5803070d
--- /dev/null
+++ b/storage/src/tests/common/hostreporter/util.cpp
@@ -0,0 +1,34 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include "util.h"
+#include <vespa/storage/common/hostreporter/hostreporter.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/vespalib/util/jsonstream.h>
+
+namespace storage {
+namespace util {
+namespace {
+using Object = vespalib::JsonStream::Object;
+using End = vespalib::JsonStream::End;
+using JsonFormat = vespalib::slime::JsonFormat;
+using Memory = vespalib::slime::Memory;
+}
+
+void
+reporterToSlime(HostReporter &hostReporter, vespalib::Slime &slime) {
+ vespalib::asciistream json;
+ vespalib::JsonStream stream(json, true);
+
+ stream << Object();
+ hostReporter.report(stream);
+ stream << End();
+ std::string jsonData = json.str();
+ size_t parsedSize = JsonFormat::decode(Memory(jsonData), slime);
+
+ if (jsonData.size() != parsedSize) {
+ CPPUNIT_FAIL("Sizes of jsonData mismatched, probably not json:\n" + jsonData);
+ }
+}
+}
+}
diff --git a/storage/src/tests/common/hostreporter/util.h b/storage/src/tests/common/hostreporter/util.h
new file mode 100644
index 00000000000..e7fcf418bd3
--- /dev/null
+++ b/storage/src/tests/common/hostreporter/util.h
@@ -0,0 +1,16 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#ifndef VESPA_STORAGE_COMMON_UTIL
+#define VESPA_STORAGE_COMMON_UTIL
+
+#include <vespa/storage/common/hostreporter/hostreporter.h>
+#include <vespa/vespalib/data/slime/slime.h>
+
+namespace storage {
+namespace util {
+
+void
+reporterToSlime(HostReporter &hostReporter, vespalib::Slime &slime);
+}
+}
+
+#endif // VESPA_STORAGE_COMMON_UTIL
diff --git a/storage/src/tests/common/hostreporter/versionreportertest.cpp b/storage/src/tests/common/hostreporter/versionreportertest.cpp
new file mode 100644
index 00000000000..43c6e64b0de
--- /dev/null
+++ b/storage/src/tests/common/hostreporter/versionreportertest.cpp
@@ -0,0 +1,39 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/common/hostreporter/versionreporter.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/util/jsonstream.h>
+#include "util.h"
+
+LOG_SETUP(".test.versionreporter");
+
+namespace storage {
+namespace {
+using Object = vespalib::JsonStream::Object;
+using End = vespalib::JsonStream::End;
+}
+
+struct VersionReporterTest : public CppUnit::TestFixture
+{
+ void testVersionReporter();
+
+ CPPUNIT_TEST_SUITE(VersionReporterTest);
+ CPPUNIT_TEST(testVersionReporter);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(VersionReporterTest);
+
+void
+VersionReporterTest::testVersionReporter()
+{
+ VersionReporter versionReporter;
+ vespalib::Slime slime;
+ util::reporterToSlime(versionReporter, slime);
+ std::string version = slime.get()["vtag"]["version"].asString().make_string().c_str();
+ CPPUNIT_ASSERT(version.length() > 2);
+ CPPUNIT_ASSERT(version.find(".") > 0);
+}
+} // storage
diff --git a/storage/src/tests/common/metricstest.cpp b/storage/src/tests/common/metricstest.cpp
new file mode 100644
index 00000000000..e06b2183380
--- /dev/null
+++ b/storage/src/tests/common/metricstest.cpp
@@ -0,0 +1,393 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/log/log.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageframework/defaultimplementation/clock/fakeclock.h>
+#include <vespa/storage/bucketdb/bucketmanager.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storage/common/statusmetricconsumer.h>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+#include <vespa/storage/visiting/visitormetrics.h>
+#include <vespa/documentapi/loadtypes/loadtype.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/testhelper.h>
+#include <tests/common/dummystoragelink.h>
+#include <iostream>
+#include <string>
+#include <chrono>
+
+LOG_SETUP(".test.metrics");
+
+namespace storage {
+
+struct MetricsTest : public CppUnit::TestFixture {
+ FastOS_ThreadPool _threadPool;
+ framework::defaultimplementation::FakeClock* _clock;
+ std::unique_ptr<TestServiceLayerApp> _node;
+ std::unique_ptr<DummyStorageLink> _top;
+ std::unique_ptr<StatusMetricConsumer> _metricsConsumer;
+ std::unique_ptr<vdstestlib::DirConfig> _config;
+ std::unique_ptr<metrics::MetricSet> _topSet;
+ std::unique_ptr<metrics::MetricManager> _metricManager;
+ std::shared_ptr<FileStorMetrics> _filestorMetrics;
+ std::shared_ptr<BucketManagerMetrics> _bucketManagerMetrics;
+ std::shared_ptr<VisitorMetrics> _visitorMetrics;
+
+ void createSnapshotForPeriod(std::chrono::seconds secs);
+ void assertMetricLastValue(const std::string& name,
+ int interval,
+ uint64_t expected);
+
+ MetricsTest();
+
+ void setUp();
+ void tearDown();
+ void runLoad(uint32_t count = 1);
+ void createFakeLoad();
+
+ void testFileStorMetrics();
+ void testSnapshotPresenting();
+ void testHtmlMetricsReport();
+ void testCurrentGaugeValuesOverrideSnapshotValues();
+ void testVerboseReportIncludesNonSetMetricsEvenAfterSnapshot();
+
+ CPPUNIT_TEST_SUITE(MetricsTest);
+ CPPUNIT_TEST(testFileStorMetrics);
+ CPPUNIT_TEST(testSnapshotPresenting);
+ CPPUNIT_TEST(testHtmlMetricsReport);
+ CPPUNIT_TEST(testCurrentGaugeValuesOverrideSnapshotValues);
+ CPPUNIT_TEST(testVerboseReportIncludesNonSetMetricsEvenAfterSnapshot);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(MetricsTest);
+
+namespace {
+ struct MetricClock : public metrics::MetricManager::Timer
+ {
+ framework::Clock& _clock;
+ MetricClock(framework::Clock& c) : _clock(c) {}
+ virtual time_t getTime() const
+ { return _clock.getTimeInSeconds().getTime(); }
+ virtual time_t getTimeInMilliSecs() const
+ { return _clock.getTimeInMillis().getTime(); }
+ };
+}
+
+MetricsTest::MetricsTest()
+ : _threadPool(256*1024),
+ _clock(0),
+ _top(),
+ _metricsConsumer()
+{
+}
+
+void MetricsTest::setUp() {
+ assert(system("rm -rf vdsroot") == 0);
+ _config.reset(new vdstestlib::DirConfig(getStandardConfig(true)));
+ try {
+ _node.reset(new TestServiceLayerApp(DiskCount(4), NodeIndex(0),
+ _config->getConfigId()));
+ _node->setupDummyPersistence();
+ _clock = &_node->getClock();
+ _clock->setAbsoluteTimeInSeconds(1000000);
+ _top.reset(new DummyStorageLink);
+ } catch (config::InvalidConfigException& e) {
+ fprintf(stderr, "%s\n", e.what());
+ }
+ _metricManager.reset(new metrics::MetricManager(
+ std::unique_ptr<metrics::MetricManager::Timer>(
+ new MetricClock(*_clock))));
+ _topSet.reset(new metrics::MetricSet("vds", "", ""));
+ {
+ metrics::MetricLockGuard guard(_metricManager->getMetricLock());
+ _metricManager->registerMetric(guard, *_topSet);
+ }
+
+ _metricsConsumer.reset(new StatusMetricConsumer(
+ _node->getComponentRegister(),
+ *_metricManager,
+ "status"));
+
+ uint16_t diskCount = _node->getPartitions().size();
+ documentapi::LoadTypeSet::SP loadTypes(_node->getLoadTypes());
+
+ _filestorMetrics.reset(new FileStorMetrics(
+ _node->getLoadTypes()->getMetricLoadTypes()));
+ _filestorMetrics->initDiskMetrics(
+ diskCount, loadTypes->getMetricLoadTypes(), 1);
+ _topSet->registerMetric(*_filestorMetrics);
+
+ _bucketManagerMetrics.reset(new BucketManagerMetrics);
+ _bucketManagerMetrics->setDisks(diskCount);
+ _topSet->registerMetric(*_bucketManagerMetrics);
+
+ _visitorMetrics.reset(new VisitorMetrics);
+ _visitorMetrics->initThreads(4, loadTypes->getMetricLoadTypes());
+ _topSet->registerMetric(*_visitorMetrics);
+ _metricManager->init(_config->getConfigId(), _node->getThreadPool());
+}
+
+void MetricsTest::tearDown() {
+ _metricManager->stop();
+ _metricsConsumer.reset(0);
+ _topSet.reset(0);
+ _metricManager.reset(0);
+ _top.reset(0);
+ _node.reset(0);
+ _config.reset(0);
+ _filestorMetrics.reset();
+ _bucketManagerMetrics.reset();
+ _visitorMetrics.reset();
+}
+
+void MetricsTest::createFakeLoad()
+{
+ _clock->addSecondsToTime(1);
+ _metricManager->timeChangedNotification();
+ uint32_t n = 5;
+ for (uint32_t i=0; i<_bucketManagerMetrics->disks.size(); ++i) {
+ DataStoredMetrics& metrics(*_bucketManagerMetrics->disks[i]);
+ metrics.docs.inc(10 * n);
+ metrics.bytes.inc(10240 * n);
+ }
+ _filestorMetrics->directoryEvents.inc(5);
+ _filestorMetrics->partitionEvents.inc(4);
+ _filestorMetrics->diskEvents.inc(3);
+ for (uint32_t i=0; i<_filestorMetrics->disks.size(); ++i) {
+ FileStorDiskMetrics& disk(*_filestorMetrics->disks[i]);
+ disk.queueSize.addValue(4 * n);
+ disk.averageQueueWaitingTime[documentapi::LoadType::DEFAULT].addValue(10 * n);
+ disk.pendingMerges.addValue(4 * n);
+ for (uint32_t j=0; j<disk.threads.size(); ++j) {
+ FileStorThreadMetrics& thread(*disk.threads[j]);
+ thread.operations.inc(120 * n);
+ thread.failedOperations.inc(2 * n);
+
+ using documentapi::LoadType;
+
+ thread.put[LoadType::DEFAULT].count.inc(10 * n);
+ thread.put[LoadType::DEFAULT].latency.addValue(5 * n);
+ thread.get[LoadType::DEFAULT].count.inc(12 * n);
+ thread.get[LoadType::DEFAULT].notFound.inc(2 * n);
+ thread.get[LoadType::DEFAULT].latency.addValue(3 * n);
+ thread.remove[LoadType::DEFAULT].count.inc(6 * n);
+ thread.remove[LoadType::DEFAULT].notFound.inc(1 * n);
+ thread.remove[LoadType::DEFAULT].latency.addValue(2 * n);
+ thread.update[LoadType::DEFAULT].count.inc(2 * n);
+ thread.update[LoadType::DEFAULT].notFound.inc(1 * n);
+ thread.update[LoadType::DEFAULT].latencyRead.addValue(2 * n);
+ thread.update[LoadType::DEFAULT].latency.addValue(7 * n);
+ thread.revert[LoadType::DEFAULT].count.inc(2 * n);
+ thread.revert[LoadType::DEFAULT].notFound.inc(n / 2);
+ thread.revert[LoadType::DEFAULT].latency.addValue(2 * n);
+ thread.visit[LoadType::DEFAULT].count.inc(6 * n);
+
+ thread.deleteBuckets.count.inc(1 * n);
+ thread.repairs.count.inc(3 * n);
+ thread.repairFixed.inc(1 * n);
+ thread.splitBuckets.count.inc(20 * n);
+ thread.movedBuckets.count.inc(1 * n);
+ thread.readBucketInfo.count.inc(2 * n);
+ thread.internalJoin.count.inc(3 * n);
+
+ thread.mergeBuckets.count.inc(2 * n);
+ thread.bytesMerged.inc(1000 * n);
+ thread.getBucketDiff.count.inc(4 * n);
+ thread.getBucketDiffReply.inc(4 * n);
+ thread.applyBucketDiff.count.inc(4 * n);
+ thread.applyBucketDiffReply.inc(4 * n);
+ thread.mergeLatencyTotal.addValue(300 * n);
+ thread.mergeMetadataReadLatency.addValue(20 * n);
+ thread.mergeDataReadLatency.addValue(40 * n);
+ thread.mergeDataWriteLatency.addValue(50 * n);
+ thread.mergeAverageDataReceivedNeeded.addValue(0.8);
+ }
+ }
+ for (uint32_t i=0; i<_visitorMetrics->threads.size(); ++i) {
+ VisitorThreadMetrics& thread(*_visitorMetrics->threads[i]);
+ thread.queueSize.addValue(2);
+ thread.averageQueueWaitingTime[documentapi::LoadType::DEFAULT].addValue(10);
+ thread.averageVisitorLifeTime[documentapi::LoadType::DEFAULT].addValue(1000);
+ thread.createdVisitors[documentapi::LoadType::DEFAULT].inc(5 * n);
+ thread.abortedVisitors[documentapi::LoadType::DEFAULT].inc(1 * n);
+ thread.completedVisitors[documentapi::LoadType::DEFAULT].inc(4 * n);
+ thread.failedVisitors[documentapi::LoadType::DEFAULT].inc(2 * n);
+ }
+ _clock->addSecondsToTime(60);
+ _metricManager->timeChangedNotification();
+ while (uint64_t(_metricManager->getLastProcessedTime())
+ < _clock->getTimeInSeconds().getTime())
+ {
+ FastOS_Thread::Sleep(5);
+ _metricManager->timeChangedNotification();
+ }
+}
+
+void MetricsTest::testFileStorMetrics() {
+ createFakeLoad();
+ std::ostringstream ost;
+ framework::HttpUrlPath path("metrics?interval=-1&format=text");
+ bool retVal = _metricsConsumer->reportStatus(ost, path);
+ CPPUNIT_ASSERT_MESSAGE("_metricsConsumer->reportStatus failed", retVal);
+ std::string s = ost.str();
+ CPPUNIT_ASSERT_MESSAGE("No get statistics in:\n" + s,
+ s.find("vds.filestor.alldisks.allthreads.get.sum.count count=240") != std::string::npos);
+ CPPUNIT_ASSERT_MESSAGE("No put statistics in:\n" + s,
+ s.find("vds.filestor.alldisks.allthreads.put.sum.count count=200") != std::string::npos);
+ CPPUNIT_ASSERT_MESSAGE("No remove statistics in:\n" + s,
+ s.find("vds.filestor.alldisks.allthreads.remove.sum.count count=120") != std::string::npos);
+ CPPUNIT_ASSERT_MESSAGE("No removenotfound stats in:\n" + s,
+ s.find("vds.filestor.alldisks.allthreads.remove.sum.not_found count=20") != std::string::npos);
+}
+
+#define ASSERT_METRIC(interval, metric, count) \
+{ \
+ std::ostringstream pathost; \
+ pathost << "metrics?interval=" << interval << "&format=text"; \
+ std::ostringstream ost;\
+ framework::HttpUrlPath path(pathost.str()); \
+ bool retVal = _metricsConsumer->reportStatus(ost, path); \
+ CPPUNIT_ASSERT_MESSAGE("_metricsConsumer->reportStatus failed", retVal); \
+ std::string s = ost.str(); \
+ if (count == -1) { \
+ CPPUNIT_ASSERT_MESSAGE(std::string("Metric ") + metric + " was set", \
+ s.find(metric) == std::string::npos); \
+ } else { \
+ std::ostringstream valueost; \
+ valueost << metric << " count=" << count; \
+ CPPUNIT_ASSERT_MESSAGE("Did not find value " + valueost.str() \
+ + " in metric dump " + s, \
+ s.find(valueost.str()) != std::string::npos); \
+ } \
+}
+
+void MetricsTest::testSnapshotPresenting() {
+ FileStorDiskMetrics& disk0(*_filestorMetrics->disks[0]);
+ FileStorThreadMetrics& thread0(*disk0.threads[0]);
+
+ LOG(info, "Adding to get metric");
+
+ using documentapi::LoadType;
+ thread0.get[LoadType::DEFAULT].count.inc(1);
+
+ LOG(info, "Waiting for 5 minute snapshot to be taken");
+ // Wait until active metrics have been added to 5 min snapshot and reset
+ for (uint32_t i=0; i<6; ++i) {
+ _clock->addSecondsToTime(60);
+ _metricManager->timeChangedNotification();
+ while (
+ uint64_t(_metricManager->getLastProcessedTime())
+ < _clock->getTimeInSeconds().getTime())
+ {
+ FastOS_Thread::Sleep(1);
+ }
+ }
+ LOG(info, "5 minute snapshot should have been taken. Adding put count");
+
+ thread0.put[LoadType::DEFAULT].count.inc(1);
+
+ // Verify that active metrics have set put count but not get count
+ ASSERT_METRIC(-2, "vds.filestor.alldisks.allthreads.put.sum.count", 1);
+ ASSERT_METRIC(-2, "vds.filestor.alldisks.allthreads.get.sum.count", -1);
+
+ // Verify that 5 min metrics have set get count but not put count
+ ASSERT_METRIC(300, "vds.filestor.alldisks.allthreads.put.sum.count", -1);
+ ASSERT_METRIC(300, "vds.filestor.alldisks.allthreads.get.sum.count", 1);
+
+ // Verify that the total metrics is equal to 5 minute
+ ASSERT_METRIC(0, "vds.filestor.alldisks.allthreads.put.sum.count", -1);
+ ASSERT_METRIC(0, "vds.filestor.alldisks.allthreads.get.sum.count", 1);
+
+ // Verify that total + active have set both
+ ASSERT_METRIC(-1, "vds.filestor.alldisks.allthreads.put.sum.count", 1);
+ ASSERT_METRIC(-1, "vds.filestor.alldisks.allthreads.get.sum.count", 1);
+}
+
+void MetricsTest::testHtmlMetricsReport() {
+ createFakeLoad();
+ _clock->addSecondsToTime(6 * 60);
+ _metricManager->timeChangedNotification();
+ _metricsConsumer->waitUntilTimeProcessed(_clock->getTimeInSeconds());
+ createFakeLoad();
+ std::ostringstream ost;
+ framework::HttpUrlPath path("metrics?interval=300&format=html");
+ bool retVal = _metricsConsumer->reportStatus(ost, path);
+ CPPUNIT_ASSERT_MESSAGE("_metricsConsumer->reportStatus failed", retVal);
+ std::string s = ost.str();
+ // Not actually testing against content. Better to manually verify that
+ // HTML look sane after changes.
+ //std::cerr << s << "\n";
+ {
+ std::ofstream out("metricsreport.html");
+ out << s;
+ out.close();
+ }
+}
+
+void
+MetricsTest::assertMetricLastValue(const std::string& name,
+ int interval,
+ uint64_t expected)
+{
+ std::ostringstream path;
+ path << "metrics?interval=" << interval
+ << "&format=text&pattern=" << name
+ << "&verbosity=2";
+ std::ostringstream report;
+ framework::HttpUrlPath uri(path.str());
+ CPPUNIT_ASSERT(_metricsConsumer->reportStatus(report, uri));
+ std::ostringstream expectedSubstr;
+ expectedSubstr << " last=" << expected;
+ auto str = report.str();
+ CPPUNIT_ASSERT_MESSAGE("Did not find value " + expectedSubstr.str()
+ + " in metric dump " + str,
+ str.find(expectedSubstr.str()) != std::string::npos);
+}
+
+using namespace std::chrono_literals;
+
+void
+MetricsTest::createSnapshotForPeriod(std::chrono::seconds secs)
+{
+ _clock->addSecondsToTime(secs.count());
+ _metricManager->timeChangedNotification();
+ while (uint64_t(_metricManager->getLastProcessedTime())
+ < _clock->getTimeInSeconds().getTime())
+ {
+ std::this_thread::sleep_for(100ms);
+ }
+}
+
+void
+MetricsTest::testCurrentGaugeValuesOverrideSnapshotValues()
+{
+ auto& metrics(*_bucketManagerMetrics->disks[0]);
+ metrics.docs.set(1000);
+ // Take a 5 minute snapshot of active metrics (1000 docs).
+ createSnapshotForPeriod(5min);
+ metrics.docs.set(2000);
+ // Active metrics are now 2000 docs. Asking for metric snapshots with
+ // an interval of -1 implies that the _active_ metric values should
+ // be added to the total snapshot, which in the case of gauge metrics
+ // only makes sense if the _active_ gauge value gets reported back.
+ // In this case it means we should observe 2000 docs, not 1000.
+ assertMetricLastValue("vds.datastored.alldisks.docs", -1, 2000);
+}
+
+void
+MetricsTest::testVerboseReportIncludesNonSetMetricsEvenAfterSnapshot()
+{
+ createSnapshotForPeriod(5min);
+ // When using verbosity=2 (which is what the system test framework invokes),
+ // all metrics should be included regardless of whether they've been set or
+ // not. In this case, the bytes gauge metric has not been set explicitly
+ // but should be reported as zero.
+ assertMetricLastValue("vds.datastored.alldisks.bytes", -1, 0);
+}
+
+} // storage
diff --git a/storage/src/tests/common/storagelinktest.cpp b/storage/src/tests/common/storagelinktest.cpp
new file mode 100644
index 00000000000..34b774ac424
--- /dev/null
+++ b/storage/src/tests/common/storagelinktest.cpp
@@ -0,0 +1,57 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <tests/common/storagelinktest.h>
+#include <iostream>
+#include <string>
+#include <vespa/storageapi/message/stat.h>
+
+namespace storage {
+
+CPPUNIT_TEST_SUITE_REGISTRATION(StorageLinkTest);
+
+StorageLinkTest::StorageLinkTest()
+ : _threadPool(1024),
+ _feeder(),
+ _middle(0),
+ _replier(0) {}
+
+void StorageLinkTest::setUp() {
+ _feeder.reset(new DummyStorageLink());
+ _middle = new DummyStorageLink();
+ _replier = new DummyStorageLink();
+ _feeder->push_back(StorageLink::UP(_middle));
+ _feeder->push_back(StorageLink::UP(_replier));
+ _replier->setAutoreply(true);
+}
+
+void StorageLinkTest::testPrinting() {
+ std::ostringstream actual;
+ actual << *_feeder;
+ std::string expected =
+"StorageChain(3)\n"
+" DummyStorageLink(autoreply = off, dispatch = off, 0 commands, 0 replies)\n"
+" DummyStorageLink(autoreply = off, dispatch = off, 0 commands, 0 replies)\n"
+" DummyStorageLink(autoreply = on, dispatch = off, 0 commands, 0 replies)";
+
+ CPPUNIT_ASSERT_EQUAL(expected, actual.str());
+}
+
+void StorageLinkTest::testNotImplemented() {
+ _feeder->open();
+ // Test that a message that nobody handles fails with NOT_IMPLEMENTED
+ _replier->setIgnore(true);
+ _feeder->sendDown(api::StorageCommand::SP(
+ new api::StatBucketCommand(document::BucketId(0), "")));
+ _feeder->close();
+ _feeder->flush();
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, _feeder->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(
+ dynamic_cast<api::StatBucketReply&>(
+ *_feeder->getReply(0)).getResult(),
+ api::ReturnCode(api::ReturnCode::NOT_IMPLEMENTED, "Statbucket"));
+ _feeder->reset();
+ _replier->setIgnore(false);
+}
+
+} // storage
diff --git a/storage/src/tests/common/storagelinktest.h b/storage/src/tests/common/storagelinktest.h
new file mode 100644
index 00000000000..efeebb1146e
--- /dev/null
+++ b/storage/src/tests/common/storagelinktest.h
@@ -0,0 +1,46 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <cppunit/extensions/HelperMacros.h>
+#include <vespa/fastos/fastos.h>
+#include <tests/common/dummystoragelink.h>
+
+namespace storage {
+
+struct StorageLinkTest : public CppUnit::TestFixture {
+ FastOS_ThreadPool _threadPool;
+ std::unique_ptr<DummyStorageLink> _feeder;
+ DummyStorageLink* _middle;
+ DummyStorageLink* _replier;
+
+ StorageLinkTest();
+
+ void setUp();
+
+ void testPrinting();
+ void testNotImplemented();
+
+ static bool callOnUp(StorageLink& link,
+ const api::StorageMessage::SP& msg)
+ {
+ return link.onUp(msg);
+ }
+ static bool callOnDown(StorageLink& link,
+ const api::StorageMessage::SP& msg)
+ {
+ return link.onDown(msg);
+ }
+ static void callOnFlush(StorageLink& link, bool downwards)
+ {
+ link.onFlush(downwards);
+ }
+
+ CPPUNIT_TEST_SUITE(StorageLinkTest);
+ CPPUNIT_TEST(testPrinting);
+ CPPUNIT_TEST(testNotImplemented);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+}
+
diff --git a/storage/src/tests/common/testhelper.cpp b/storage/src/tests/common/testhelper.cpp
new file mode 100644
index 00000000000..b8b42124d39
--- /dev/null
+++ b/storage/src/tests/common/testhelper.cpp
@@ -0,0 +1,209 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <tests/common/testhelper.h>
+
+#include <vespa/log/log.h>
+#include <vespa/vespalib/io/fileutil.h>
+
+LOG_SETUP(".testhelper");
+
+namespace storage {
+
+namespace {
+ bool useNewStorageCore() {
+ if ( // Unit test directory
+ vespalib::fileExists("use_new_storage_core") ||
+ // src/cpp directory
+ vespalib::fileExists("../use_new_storage_core") ||
+ // Top build directory where storage-HEAD remains
+ vespalib::fileExists("../../../../use_new_storage_core"))
+ {
+ std::cerr << "Using new storage core for unit tests\n";
+ return true;
+ }
+ return false;
+ }
+ bool newStorageCore(useNewStorageCore());
+}
+
+void addStorageDistributionConfig(vdstestlib::DirConfig& dc)
+{
+ vdstestlib::DirConfig::Config* config;
+ config = &dc.getConfig("stor-distribution", true);
+ config->clear();
+ config->set("group[1]");
+ config->set("group[0].name", "invalid");
+ config->set("group[0].index", "invalid");
+ config->set("group[0].nodes[50]");
+ config->set("redundancy", "2");
+
+ for (uint32_t i = 0; i < 50; i++) {
+ std::ostringstream key; key << "group[0].nodes[" << i << "].index";
+ std::ostringstream val; val << i;
+ config->set(key.str(), val.str());
+ }
+}
+
+vdstestlib::DirConfig getStandardConfig(bool storagenode) {
+ std::string clusterName("storage");
+ vdstestlib::DirConfig dc;
+ vdstestlib::DirConfig::Config* config;
+ config = &dc.addConfig("fleetcontroller");
+ config->set("cluster_name", clusterName);
+ config->set("index", "0");
+ config->set("zookeeper_server", "\"\"");
+ config->set("total_distributor_count", "10");
+ config->set("total_storage_count", "10");
+ config = &dc.addConfig("upgrading");
+ config = &dc.addConfig("load-type");
+ config->set("type[10]");
+ config->set("type[0].id", "1");
+ config->set("type[0].name", "\"maintenance.inconsistent.join\"");
+ config->set("type[0].priority", "\"high_3\"");
+ config->set("type[1].id", "2");
+ config->set("type[1].name", "\"maintenance.inconsistent.split\"");
+ config->set("type[1].priority", "\"normal_1\"");
+ config->set("type[2].id", "3");
+ config->set("type[2].name", "\"maintenance.active.incorrectamount\"");
+ config->set("type[2].priority", "\"normal_2\"");
+ config->set("type[3].id", "4");
+ config->set("type[3].name", "\"maintenance.active.wrongcopy\"");
+ config->set("type[3].priority", "\"normal_3\"");
+ config->set("type[4].id", "5");
+ config->set("type[4].name", "\"maintenance.size.split\"");
+ config->set("type[4].priority", "\"normal_4\"");
+ config->set("type[5].id", "6");
+ config->set("type[5].name", "\"maintenance.size.join\"");
+ config->set("type[5].priority", "\"normal_5\"");
+ config->set("type[6].id", "7");
+ config->set("type[6].name", "\"maintenance.merge.toofewcopies\"");
+ config->set("type[6].priority", "\"normal_6\"");
+ config->set("type[7].id", "8");
+ config->set("type[7].name", "\"maintenance.merge.toomanycopies\"");
+ config->set("type[7].priority", "\"low_1\"");
+ config->set("type[8].id", "9");
+ config->set("type[8].name", "\"maintenance.merge.outofsync\"");
+ config->set("type[8].priority", "\"low_2\"");
+ config->set("type[9].id", "10");
+ config->set("type[9].name", "\"maintenance.move\"");
+ config->set("type[9].priority", "\"low_3\"");
+ config = &dc.addConfig("bucket");
+ config = &dc.addConfig("messagebus");
+ config = &dc.addConfig("stor-prioritymapping");
+ config = &dc.addConfig("stor-bucketdbupdater");
+ config = &dc.addConfig("stor-bucket-init");
+ config = &dc.addConfig("metricsmanager");
+ config->set("consumer[2]");
+ config->set("consumer[0].name", "\"status\"");
+ config->set("consumer[0].addedmetrics[1]");
+ config->set("consumer[0].addedmetrics[0]", "\"*\"");
+ config->set("consumer[1].name", "\"statereporter\"");
+ config->set("consumer[1].addedmetrics[1]");
+ config->set("consumer[1].addedmetrics[0]", "\"*\"");
+ config = &dc.addConfig("stor-communicationmanager");
+ config->set("rpcport", "0");
+ config->set("mbusport", "0");
+ config = &dc.addConfig("stor-bucketdb");
+ config->set("chunklevel", "0");
+ config = &dc.addConfig("stor-distributormanager");
+ config->set("splitcount", "1000");
+ config->set("splitsize", "10000000");
+ config->set("joincount", "500");
+ config->set("joinsize", "5000000");
+ config = &dc.addConfig("stor-opslogger");
+ config = &dc.addConfig("persistence");
+ config->set("abort_operations_with_changed_bucket_ownership", "true");
+ config = &dc.addConfig("stor-filestor");
+ // Easier to see what goes wrong with only 1 thread per disk.
+ config->set("minimum_file_meta_slots", "2");
+ config->set("minimum_file_header_block_size", "368");
+ config->set("minimum_file_size", "4096");
+ config->set("threads[1]");
+ config->set("threads[0].lowestpri 255");
+ config->set("dir_spread", "4");
+ config->set("dir_levels", "0");
+ config->set("use_new_core", newStorageCore ? "true" : "false");
+ config->set("maximum_versions_of_single_document_stored", "0");
+ //config->set("enable_slotfile_cache", "false");
+ // Unit tests typically use fake low time values, so don't complain
+ // about them or compact/delete them by default. Override in tests testing that
+ // behavior
+ config->set("time_future_limit", "5");
+ config->set("time_past_limit", "2000000000");
+ config->set("keep_remove_time_period", "2000000000");
+ config->set("revert_time_period", "2000000000");
+ // Don't want test to call exit()
+ config->set("fail_disk_after_error_count", "0");
+ config = &dc.addConfig("stor-bouncer");
+ config = &dc.addConfig("stor-integritychecker");
+ config = &dc.addConfig("stor-bucketmover");
+ config = &dc.addConfig("stor-messageforwarder");
+ config = &dc.addConfig("stor-server");
+ config->set("cluster_name", clusterName);
+ config->set("enable_dead_lock_detector", "false");
+ config->set("enable_dead_lock_detector_warnings", "false");
+ config->set("max_merges_per_node", "25");
+ config->set("max_merge_queue_size", "20");
+ config->set("root_folder",
+ (storagenode ? "vdsroot" : "vdsroot.distributor"));
+ config->set("is_distributor",
+ (storagenode ? "false" : "true"));
+ config = &dc.addConfig("stor-devices");
+ config->set("root_folder",
+ (storagenode ? "vdsroot" : "vdsroot.distributor"));
+ config = &dc.addConfig("stor-status");
+ config->set("httpport", "0");
+ config = &dc.addConfig("stor-visitor");
+ config->set("defaultdocblocksize", "8192");
+ // By default, need "old" behaviour of maxconcurrent
+ config->set("maxconcurrentvisitors_fixed", "4");
+ config->set("maxconcurrentvisitors_variable", "0");
+ config = &dc.addConfig("stor-visitordispatcher");
+ addFileConfig(dc, "documenttypes", "config-doctypes.cfg");
+ addStorageDistributionConfig(dc);
+ return dc;
+}
+
+void addSlobrokConfig(vdstestlib::DirConfig& dc,
+ const mbus::Slobrok& slobrok)
+{
+ std::ostringstream ost;
+ ost << "tcp/localhost:" << slobrok.port();
+ vdstestlib::DirConfig::Config* config;
+ config = &dc.getConfig("slobroks", true);
+ config->clear();
+ config->set("slobrok[1]");
+ config->set("slobrok[0].connectionspec", ost.str());
+}
+
+void addFileConfig(vdstestlib::DirConfig& dc,
+ const std::string& configDefName,
+ const std::string& fileName)
+{
+ vdstestlib::DirConfig::Config* config;
+ config = &dc.getConfig(configDefName, true);
+ config->clear();
+ std::ifstream in(fileName.c_str());
+ std::string line;
+ while (std::getline(in, line, '\n')) {
+ std::string::size_type pos = line.find(' ');
+ if (pos == std::string::npos) {
+ config->set(line);
+ } else {
+ config->set(line.substr(0, pos), line.substr(pos + 1));
+ }
+ }
+ in.close();
+}
+
+TestName::TestName(const std::string& n)
+ : name(n)
+{
+ LOG(debug, "Starting test %s", name.c_str());
+}
+
+TestName::~TestName() {
+ LOG(debug, "Done with test %s", name.c_str());
+}
+
+} // storage
diff --git a/storage/src/tests/common/testhelper.h b/storage/src/tests/common/testhelper.h
new file mode 100644
index 00000000000..be2c3e7ec66
--- /dev/null
+++ b/storage/src/tests/common/testhelper.h
@@ -0,0 +1,58 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+#include <vespa/vdstestlib/cppunit/dirconfig.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+
+#include <fstream>
+#include <vespa/fastos/fastos.h>
+#include <vespa/messagebus/testlib/slobrok.h>
+#include <sstream>
+
+#define ASSERT_REPLY_COUNT(count, dummylink) \
+ { \
+ std::ostringstream msgost; \
+ if ((dummylink).getNumReplies() != count) { \
+ for (uint32_t ijx=0; ijx<(dummylink).getNumReplies(); ++ijx) { \
+ msgost << (dummylink).getReply(ijx)->toString(true) << "\n"; \
+ } \
+ } \
+ CPPUNIT_ASSERT_EQUAL_MSG(msgost.str(), size_t(count), \
+ (dummylink).getNumReplies()); \
+ }
+#define ASSERT_COMMAND_COUNT(count, dummylink) \
+ { \
+ std::ostringstream msgost; \
+ if ((dummylink).getNumCommands() != count) { \
+ for (uint32_t ijx=0; ijx<(dummylink).getNumCommands(); ++ijx) { \
+ msgost << (dummylink).getCommand(ijx)->toString(true) << "\n"; \
+ } \
+ } \
+ CPPUNIT_ASSERT_EQUAL_MSG(msgost.str(), size_t(count), \
+ (dummylink).getNumCommands()); \
+ }
+
+namespace storage {
+
+void addFileConfig(vdstestlib::DirConfig& dc,
+ const std::string& configDefName,
+ const std::string& fileName);
+
+
+void addStorageDistributionConfig(vdstestlib::DirConfig& dc);
+
+vdstestlib::DirConfig getStandardConfig(bool storagenode);
+
+void addSlobrokConfig(vdstestlib::DirConfig& dc,
+ const mbus::Slobrok& slobrok);
+
+// Class used to print start and end of test. Enable debug when you want to see
+// which test creates what output or where we get stuck
+struct TestName {
+ std::string name;
+ TestName(const std::string& n);
+ ~TestName();
+};
+
+} // storage
+
diff --git a/storage/src/tests/common/testnodestateupdater.h b/storage/src/tests/common/testnodestateupdater.h
new file mode 100644
index 00000000000..9f5b2d8ba51
--- /dev/null
+++ b/storage/src/tests/common/testnodestateupdater.h
@@ -0,0 +1,50 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::TestNodeStateUpdater
+ * \ingroup common
+ *
+ * \brief Test implementation of the node state updater.
+ */
+
+#pragma once
+
+#include <vespa/storage/common/nodestateupdater.h>
+
+namespace storage {
+
+struct TestNodeStateUpdater : public NodeStateUpdater
+{
+ lib::NodeState::CSP _reported;
+ lib::NodeState::CSP _current;
+ lib::ClusterState::CSP _cluster;
+ std::vector<StateListener*> _listeners;
+
+public:
+ TestNodeStateUpdater(const lib::NodeType& type) {
+ _reported.reset(new lib::NodeState(type, lib::State::UP));
+ _current.reset(new lib::NodeState(type, lib::State::UP));
+ }
+
+ lib::NodeState::CSP getReportedNodeState() const { return _reported; }
+ lib::NodeState::CSP getCurrentNodeState() const { return _current; }
+ lib::ClusterState::CSP getSystemState() const { return _cluster; }
+ void addStateListener(StateListener& s) {
+ _listeners.push_back(&s);
+ }
+ void removeStateListener(StateListener&) {}
+ Lock::SP grabStateChangeLock() { return Lock::SP(new Lock); }
+ void setReportedNodeState(const lib::NodeState& state)
+ { _reported.reset(new lib::NodeState(state)); }
+ void setCurrentNodeState(const lib::NodeState& state)
+ { _current.reset(new lib::NodeState(state)); }
+
+ void setClusterState(lib::ClusterState::CSP c) {
+ _cluster = c;
+ for (uint32_t i = 0; i < _listeners.size(); ++i) {
+ _listeners[i]->handleNewState();
+ }
+ }
+};
+
+} // storage
+
diff --git a/storage/src/tests/common/teststorageapp.cpp b/storage/src/tests/common/teststorageapp.cpp
new file mode 100644
index 00000000000..eb4c1c41c78
--- /dev/null
+++ b/storage/src/tests/common/teststorageapp.cpp
@@ -0,0 +1,292 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <tests/common/teststorageapp.h>
+
+#include <vespa/log/log.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <vespa/storage/bucketdb/storagebucketdbinitializer.h>
+#include <vespa/storage/config/config-stor-server.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+#include <vespa/storageframework/defaultimplementation/memory/nomemorymanager.h>
+#include <vespa/config-fleetcontroller.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/vespalib/io/fileutil.h>
+
+LOG_SETUP(".test.servicelayerapp");
+
+using storage::framework::defaultimplementation::ComponentRegisterImpl;
+
+namespace storage {
+
+namespace {
+ template<typename T>
+ struct ConfigReader : public T::Subscriber,
+ public T
+ {
+ ConfigReader(const std::string& configId) {
+ T::subscribe(configId, *this);
+ }
+ void configure(const T& c) { dynamic_cast<T&>(*this) = c; }
+ };
+}
+
+TestStorageApp::TestStorageApp(StorageComponentRegisterImpl::UP compReg,
+ const lib::NodeType& type, NodeIndex index,
+ vespalib::stringref configId)
+ : TestComponentRegister(ComponentRegisterImpl::UP(std::move(compReg))),
+ _compReg(dynamic_cast<StorageComponentRegisterImpl&>(
+ TestComponentRegister::getComponentRegister())),
+ _docMan(),
+ _nodeStateUpdater(type),
+ _configId(configId),
+ _initialized(false)
+{
+ // Use config to adjust values
+ vespalib::string clusterName = "mycluster";
+ uint32_t redundancy = 2;
+ uint32_t nodeCount = 10;
+ documentapi::LoadTypeSet::SP loadTypes;
+ if (!configId.empty()) {
+ config::ConfigUri uri(configId);
+ std::unique_ptr<vespa::config::content::core::StorServerConfig> serverConfig = config::ConfigGetter<vespa::config::content::core::StorServerConfig>::getConfig(uri.getConfigId(), uri.getContext());
+ clusterName = serverConfig->clusterName;
+ if (index == 0xffff) index = serverConfig->nodeIndex;
+ redundancy = config::ConfigGetter<vespa::config::content::StorDistributionConfig>::getConfig(uri.getConfigId(), uri.getContext())->redundancy;
+ nodeCount = config::ConfigGetter<vespa::config::content::FleetcontrollerConfig>::getConfig(uri.getConfigId(), uri.getContext())->totalStorageCount;
+ _compReg.setPriorityConfig(
+ *config::ConfigGetter<StorageComponent::PriorityConfig>
+ ::getConfig(uri.getConfigId(), uri.getContext()));
+ loadTypes.reset(new documentapi::LoadTypeSet(
+ *config::ConfigGetter<vespa::config::content::LoadTypeConfig>
+ ::getConfig(uri.getConfigId(), uri.getContext())));
+ } else {
+ if (index == 0xffff) index = 0;
+ loadTypes.reset(new documentapi::LoadTypeSet);
+ }
+ if (index >= nodeCount) nodeCount = index + 1;
+ if (redundancy > nodeCount) redundancy = nodeCount;
+
+ _compReg.setNodeInfo(clusterName, type, index);
+ _compReg.setNodeStateUpdater(_nodeStateUpdater);
+ _compReg.setDocumentTypeRepo(_docMan.getTypeRepoSP());
+ _compReg.setLoadTypes(loadTypes);
+ _compReg.setBucketIdFactory(document::BucketIdFactory());
+ lib::Distribution::SP distr(new lib::Distribution(
+ lib::Distribution::getDefaultDistributionConfig(
+ redundancy, nodeCount)));
+ _compReg.setDistribution(distr);
+}
+
+void
+TestStorageApp::setDistribution(Redundancy redundancy, NodeCount nodeCount)
+{
+ lib::Distribution::SP distr(new lib::Distribution(
+ lib::Distribution::getDefaultDistributionConfig(
+ redundancy, nodeCount)));
+ _compReg.setDistribution(distr);
+}
+
+void
+TestStorageApp::setTypeRepo(document::DocumentTypeRepo::SP repo)
+{
+ _compReg.setDocumentTypeRepo(repo);
+}
+
+void
+TestStorageApp::setClusterState(const lib::ClusterState& c)
+{
+ _nodeStateUpdater.setClusterState(
+ lib::ClusterState::CSP(new lib::ClusterState(c)));
+}
+
+void
+TestStorageApp::waitUntilInitialized(
+ StorageBucketDBInitializer* initializer, framework::SecondTime timeout)
+{
+ // Always use real clock for wait timeouts. Component clock may be faked
+ // in tests
+ framework::defaultimplementation::RealClock clock;
+ framework::MilliSecTime endTime(
+ clock.getTimeInMillis() + timeout.getMillis());
+ while (!isInitialized()) {
+ FastOS_Thread::Sleep(1);
+ framework::MilliSecTime currentTime(clock.getTimeInMillis());
+ if (currentTime > endTime) {
+ std::ostringstream error;
+ error << "Failed to initialize service layer within timeout of "
+ << timeout << " seconds.";
+ if (initializer != 0) {
+ error << " ";
+ initializer->reportStatus(error, framework::HttpUrlPath(""));
+ LOG(error, "%s", error.str().c_str());
+ CPPUNIT_FAIL(error.str().c_str());
+ }
+ }
+ }
+}
+
+namespace {
+ NodeIndex getIndexFromConfig(vespalib::stringref configId) {
+ if (!configId.empty()) {
+ config::ConfigUri uri(configId);
+ return NodeIndex(
+ config::ConfigGetter<vespa::config::content::core::StorServerConfig>::getConfig(uri.getConfigId(), uri.getContext())->nodeIndex);
+ }
+ return NodeIndex(0);
+ }
+}
+
+TestServiceLayerApp::TestServiceLayerApp(vespalib::stringref configId)
+ : TestStorageApp(
+ StorageComponentRegisterImpl::UP(
+ new ServiceLayerComponentRegisterImpl),
+ lib::NodeType::STORAGE, getIndexFromConfig(configId), configId),
+ _compReg(dynamic_cast<ServiceLayerComponentRegisterImpl&>(
+ TestStorageApp::getComponentRegister())),
+ _persistenceProvider(),
+ _partitions(1)
+{
+ _compReg.setDiskCount(1);
+ lib::NodeState ns(*_nodeStateUpdater.getReportedNodeState());
+ ns.setDiskCount(1);
+ _nodeStateUpdater.setReportedNodeState(ns);
+}
+
+TestServiceLayerApp::TestServiceLayerApp(DiskCount dc, NodeIndex index,
+ vespalib::stringref configId)
+ : TestStorageApp(
+ StorageComponentRegisterImpl::UP(
+ new ServiceLayerComponentRegisterImpl),
+ lib::NodeType::STORAGE, index, configId),
+ _compReg(dynamic_cast<ServiceLayerComponentRegisterImpl&>(
+ TestStorageApp::getComponentRegister())),
+ _persistenceProvider(),
+ _partitions(dc)
+{
+ _compReg.setDiskCount(dc);
+ lib::NodeState ns(*_nodeStateUpdater.getReportedNodeState());
+ ns.setDiskCount(dc);
+ _nodeStateUpdater.setReportedNodeState(ns);
+ // Tests should know how many disks they want to use. If testing auto
+ // detection, you should not need this utility.
+ CPPUNIT_ASSERT(dc > 0);
+}
+
+void
+TestServiceLayerApp::setupDummyPersistence()
+{
+ spi::PersistenceProvider::UP provider(new spi::dummy::DummyPersistence(
+ getTypeRepo(), _compReg.getDiskCount()));
+ setPersistenceProvider(std::move(provider));
+}
+
+void
+TestServiceLayerApp::setPersistenceProvider(
+ spi::PersistenceProvider::UP provider)
+{
+ _partitions = provider->getPartitionStates().getList();
+ CPPUNIT_ASSERT_EQUAL(spi::PartitionId(_compReg.getDiskCount()),
+ _partitions.size());
+ _persistenceProvider = std::move(provider);
+}
+
+spi::PersistenceProvider&
+TestServiceLayerApp::getPersistenceProvider()
+{
+ if (_persistenceProvider.get() == 0) {
+ throw vespalib::IllegalStateException(
+ "Persistence provider requested but not initialized.",
+ VESPA_STRLOC);
+ }
+ return *_persistenceProvider;
+}
+
+spi::PartitionStateList&
+TestServiceLayerApp::getPartitions()
+{
+ if (_persistenceProvider.get() == 0) {
+ throw vespalib::IllegalStateException(
+ "Partition list requested but not initialized.",
+ VESPA_STRLOC);
+ }
+ return _partitions;
+}
+
+uint16_t
+TestServiceLayerApp::getPartition(const document::BucketId& bucket)
+{
+ lib::NodeState state(lib::NodeType::STORAGE, lib::State::UP);
+ state.setDiskCount(_compReg.getDiskCount());
+ return getDistribution()->getIdealDisk(
+ state, _compReg.getIndex(), bucket.stripUnused(),
+ lib::Distribution::IDEAL_DISK_EVEN_IF_DOWN);
+}
+
+namespace {
+ template<typename T>
+ const T getConfig(vespalib::stringref configId) {
+ config::ConfigUri uri(configId);
+ return *config::ConfigGetter<T>::getConfig(
+ uri.getConfigId(), uri.getContext());
+ }
+}
+
+void
+TestDistributorApp::configure(vespalib::stringref id)
+{
+ if (id.empty()) return;
+ DistributorConfig dc(getConfig<vespa::config::content::core::StorDistributormanagerConfig>(id));
+ _compReg.setDistributorConfig(dc);
+ VisitorConfig vc(getConfig<vespa::config::content::core::StorVisitordispatcherConfig>(id));
+ _compReg.setVisitorConfig(vc);
+}
+
+TestDistributorApp::TestDistributorApp(vespalib::stringref configId)
+ : TestStorageApp(
+ StorageComponentRegisterImpl::UP(
+ new DistributorComponentRegisterImpl),
+ lib::NodeType::DISTRIBUTOR, getIndexFromConfig(configId), configId),
+ _compReg(dynamic_cast<DistributorComponentRegisterImpl&>(
+ TestStorageApp::getComponentRegister())),
+ _lastUniqueTimestampRequested(0),
+ _uniqueTimestampCounter(0)
+{
+ _compReg.setTimeCalculator(*this);
+ configure(configId);
+}
+
+TestDistributorApp::TestDistributorApp(NodeIndex index,
+ vespalib::stringref configId)
+ : TestStorageApp(
+ StorageComponentRegisterImpl::UP(new StorageComponentRegisterImpl),
+ lib::NodeType::DISTRIBUTOR, index, configId),
+ _compReg(dynamic_cast<DistributorComponentRegisterImpl&>(
+ TestStorageApp::getComponentRegister())),
+ _lastUniqueTimestampRequested(0),
+ _uniqueTimestampCounter(0)
+{
+ _compReg.setTimeCalculator(*this);
+ configure(configId);
+}
+
+api::Timestamp
+TestDistributorApp::getUniqueTimestamp()
+{
+ vespalib::Lock lock(_accessLock);
+ uint64_t timeNow(getClock().getTimeInSeconds().getTime());
+ if (timeNow == _lastUniqueTimestampRequested) {
+ ++_uniqueTimestampCounter;
+ } else {
+ if (timeNow < _lastUniqueTimestampRequested) {
+ LOG(error, "Time has moved backwards, from %" PRIu64 " to %" PRIu64 ".",
+ _lastUniqueTimestampRequested, timeNow);
+ }
+ _lastUniqueTimestampRequested = timeNow;
+ _uniqueTimestampCounter = 0;
+ }
+
+ return _lastUniqueTimestampRequested * 1000000ll + _uniqueTimestampCounter;
+}
+
+} // storage
diff --git a/storage/src/tests/common/teststorageapp.h b/storage/src/tests/common/teststorageapp.h
new file mode 100644
index 00000000000..e7da9178743
--- /dev/null
+++ b/storage/src/tests/common/teststorageapp.h
@@ -0,0 +1,161 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::TestServiceLayerApp
+ * \ingroup common
+ *
+ * \brief Helper class for tests involving service layer.
+ *
+ * Some components need some dependencies injected in order to work correctly.
+ * This test class simplifies the process of creating these dependencies.
+ *
+ * Note that the interface between this class and the test class should be as
+ * clean as possible, such that we can change as little as possible when
+ * refactoring later. Also, advanced functionality should not be generated in
+ * here, but rather fixed by tests themselves. Functionality here should be
+ * needed by many tests, and we should avoid instantiating complex instances
+ * here that several tests
+ */
+#pragma once
+
+#include <vespa/document/base/testdocman.h>
+#include <vespa/persistence/spi/persistenceprovider.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/common/doneinitializehandler.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/storageserver/framework.h>
+#include <vespa/storage/frameworkimpl/component/distributorcomponentregisterimpl.h>
+#include <vespa/storage/frameworkimpl/component/servicelayercomponentregisterimpl.h>
+#include <vespa/storageframework/generic/memory/memorymanagerinterface.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+#include <vespa/storageframework/defaultimplementation/component/testcomponentregister.h>
+#include <tests/common/testnodestateupdater.h>
+
+namespace storage {
+
+class StorageBucketDBInitializer;
+
+DEFINE_PRIMITIVE_WRAPPER(uint16_t, DiskCount);
+DEFINE_PRIMITIVE_WRAPPER(uint16_t, NodeIndex);
+DEFINE_PRIMITIVE_WRAPPER(uint16_t, NodeCount);
+DEFINE_PRIMITIVE_WRAPPER(uint16_t, Redundancy);
+
+class TestStorageApp
+ : public framework::defaultimplementation::TestComponentRegister,
+ private DoneInitializeHandler
+{
+ StorageComponentRegisterImpl& _compReg;
+
+protected:
+ document::TestDocMan _docMan;
+ TestNodeStateUpdater _nodeStateUpdater;
+ vespalib::string _configId;
+ bool _initialized;
+
+public:
+ /**
+ * Set up a storage application. If node index is not set, it will be
+ * fetched from config if config id is given, otherwise it is set to 0.
+ * If configId is given, some critical values are taken from config.
+ * (node count, redundancy, node index etc). If configId set not set these
+ * will just have some default values. A non-default node index will
+ * override config, but be careful with this, as components may fetch index
+ * from config themselves.
+ */
+ TestStorageApp(StorageComponentRegisterImpl::UP compReg,
+ const lib::NodeType&, NodeIndex = NodeIndex(0xffff),
+ vespalib::stringref configId = "");
+
+ // Set functions, to be able to modify content while running.
+ void setDistribution(Redundancy, NodeCount);
+ void setTypeRepo(document::DocumentTypeRepo::SP repo);
+ void setClusterState(const lib::ClusterState&);
+
+ // Utility functions for getting a hold of currently used bits. Practical
+ // to avoid adding extra components in the tests.
+ StorageComponentRegisterImpl& getComponentRegister() { return _compReg; }
+ document::TestDocMan& getTestDocMan() { return _docMan; }
+ document::DocumentTypeRepo::SP getTypeRepo()
+ { return _compReg.getTypeRepo(); }
+ const document::BucketIdFactory& getBucketIdFactory()
+ { return _compReg.getBucketIdFactory(); }
+ TestNodeStateUpdater& getStateUpdater() { return _nodeStateUpdater; }
+ documentapi::LoadTypeSet::SP getLoadTypes()
+ { return _compReg.getLoadTypes(); }
+ lib::Distribution::SP getDistribution()
+ { return _compReg.getDistribution(); }
+ TestNodeStateUpdater& getNodeStateUpdater() { return _nodeStateUpdater; }
+ uint16_t getIndex() const { return _compReg.getIndex(); }
+
+ // The storage app also implements the done initializer interface, so it can
+ // be sent to components needing this.
+ DoneInitializeHandler& getDoneInitializeHandler() { return *this; }
+ virtual void notifyDoneInitializing() { _initialized = true; }
+ bool isInitialized() const { return _initialized; }
+ void waitUntilInitialized(
+ StorageBucketDBInitializer* initializer = 0,
+ framework::SecondTime timeout = framework::SecondTime(30));
+
+private:
+ // Storage server interface implementation (until we can remove it)
+ virtual api::Timestamp getUniqueTimestamp() { assert(0); throw; }
+ virtual StorBucketDatabase& getStorageBucketDatabase() { assert(0); throw; }
+ virtual distributor::BucketDatabase& getBucketDatabase() { assert(0); throw; }
+ virtual uint16_t getDiskCount() const { assert(0); throw; }
+};
+
+class TestServiceLayerApp : public TestStorageApp
+{
+ ServiceLayerComponentRegisterImpl& _compReg;
+ spi::PersistenceProvider::UP _persistenceProvider;
+ spi::PartitionStateList _partitions;
+
+public:
+ TestServiceLayerApp(vespalib::stringref configId = "");
+ TestServiceLayerApp(DiskCount diskCount, NodeIndex = NodeIndex(0xffff),
+ vespalib::stringref configId = "");
+
+ void setupDummyPersistence();
+ void setPersistenceProvider(spi::PersistenceProvider::UP);
+
+ ServiceLayerComponentRegisterImpl& getComponentRegister()
+ { return _compReg; }
+
+ spi::PersistenceProvider& getPersistenceProvider();
+ spi::PartitionStateList& getPartitions();
+
+ uint16_t getPartition(const document::BucketId&);
+
+ virtual StorBucketDatabase& getStorageBucketDatabase()
+ { return _compReg.getBucketDatabase(); }
+
+private:
+ // For storage server interface implementation we'll get rid of soon.
+ // Use getPartitions().size() instead.
+ virtual uint16_t getDiskCount() const { return _compReg.getDiskCount(); }
+};
+
+class TestDistributorApp : public TestStorageApp,
+ public UniqueTimeCalculator
+{
+ DistributorComponentRegisterImpl& _compReg;
+ vespalib::Lock _accessLock;
+ uint64_t _lastUniqueTimestampRequested;
+ uint32_t _uniqueTimestampCounter;
+
+ void configure(vespalib::stringref configId);
+
+public:
+ TestDistributorApp(vespalib::stringref configId = "");
+ TestDistributorApp(NodeIndex index, vespalib::stringref configId = "");
+
+ DistributorComponentRegisterImpl& getComponentRegister()
+ { return _compReg; }
+ virtual distributor::BucketDatabase& getBucketDatabase()
+ { return _compReg.getBucketDatabase(); }
+
+ virtual api::Timestamp getUniqueTimestamp();
+};
+
+} // storageo
+
diff --git a/storage/src/tests/config-doctypes.cfg b/storage/src/tests/config-doctypes.cfg
new file mode 100644
index 00000000000..f41593ebfc3
--- /dev/null
+++ b/storage/src/tests/config-doctypes.cfg
@@ -0,0 +1,158 @@
+enablecompression false
+documenttype[3]
+documenttype[0].id -519202262
+documenttype[0].name "text/plain"
+documenttype[0].version 0
+documenttype[0].headerstruct 160469461
+documenttype[0].bodystruct 749465898
+documenttype[0].inherits[0]
+documenttype[0].datatype[2]
+documenttype[0].datatype[0].id 160469461
+documenttype[0].datatype[0].type STRUCT
+documenttype[0].datatype[0].array.element.id 0
+documenttype[0].datatype[0].map.key.id 0
+documenttype[0].datatype[0].map.value.id 0
+documenttype[0].datatype[0].wset.key.id 0
+documenttype[0].datatype[0].wset.createifnonexistent false
+documenttype[0].datatype[0].wset.removeifzero false
+documenttype[0].datatype[0].annotationref.annotation.id 0
+documenttype[0].datatype[0].sstruct.name "text/plain.header"
+documenttype[0].datatype[0].sstruct.version 0
+documenttype[0].datatype[0].sstruct.compression.type NONE
+documenttype[0].datatype[0].sstruct.compression.level 0
+documenttype[0].datatype[0].sstruct.compression.threshold 90
+documenttype[0].datatype[0].sstruct.compression.minsize 0
+documenttype[0].datatype[0].sstruct.field[3]
+documenttype[0].datatype[0].sstruct.field[0].name "author"
+documenttype[0].datatype[0].sstruct.field[0].id 644499292
+documenttype[0].datatype[0].sstruct.field[0].id_v6 177126295
+documenttype[0].datatype[0].sstruct.field[0].datatype 2
+documenttype[0].datatype[0].sstruct.field[1].name "date"
+documenttype[0].datatype[0].sstruct.field[1].id 491786523
+documenttype[0].datatype[0].sstruct.field[1].id_v6 916979460
+documenttype[0].datatype[0].sstruct.field[1].datatype 0
+documenttype[0].datatype[0].sstruct.field[2].name "subject"
+documenttype[0].datatype[0].sstruct.field[2].id 1797950813
+documenttype[0].datatype[0].sstruct.field[2].id_v6 943449689
+documenttype[0].datatype[0].sstruct.field[2].datatype 2
+documenttype[0].datatype[1].id 749465898
+documenttype[0].datatype[1].type STRUCT
+documenttype[0].datatype[1].array.element.id 0
+documenttype[0].datatype[1].map.key.id 0
+documenttype[0].datatype[1].map.value.id 0
+documenttype[0].datatype[1].wset.key.id 0
+documenttype[0].datatype[1].wset.createifnonexistent false
+documenttype[0].datatype[1].wset.removeifzero false
+documenttype[0].datatype[1].annotationref.annotation.id 0
+documenttype[0].datatype[1].sstruct.name "text/plain.body"
+documenttype[0].datatype[1].sstruct.version 0
+documenttype[0].datatype[1].sstruct.compression.type NONE
+documenttype[0].datatype[1].sstruct.compression.level 0
+documenttype[0].datatype[1].sstruct.compression.threshold 90
+documenttype[0].datatype[1].sstruct.compression.minsize 0
+documenttype[0].datatype[1].sstruct.field[1]
+documenttype[0].datatype[1].sstruct.field[0].name "content"
+documenttype[0].datatype[1].sstruct.field[0].id 1721764358
+documenttype[0].datatype[1].sstruct.field[0].id_v6 1751481844
+documenttype[0].datatype[1].sstruct.field[0].datatype 3
+documenttype[0].annotationtype[0]
+documenttype[1].id -653677105
+documenttype[1].name "text/html"
+documenttype[1].version 0
+documenttype[1].headerstruct 143329936
+documenttype[1].bodystruct 1473469605
+documenttype[1].inherits[0]
+documenttype[1].datatype[2]
+documenttype[1].datatype[0].id 143329936
+documenttype[1].datatype[0].type STRUCT
+documenttype[1].datatype[0].array.element.id 0
+documenttype[1].datatype[0].map.key.id 0
+documenttype[1].datatype[0].map.value.id 0
+documenttype[1].datatype[0].wset.key.id 0
+documenttype[1].datatype[0].wset.createifnonexistent false
+documenttype[1].datatype[0].wset.removeifzero false
+documenttype[1].datatype[0].annotationref.annotation.id 0
+documenttype[1].datatype[0].sstruct.name "text/html.header"
+documenttype[1].datatype[0].sstruct.version 0
+documenttype[1].datatype[0].sstruct.compression.type NONE
+documenttype[1].datatype[0].sstruct.compression.level 0
+documenttype[1].datatype[0].sstruct.compression.threshold 90
+documenttype[1].datatype[0].sstruct.compression.minsize 0
+documenttype[1].datatype[0].sstruct.field[3]
+documenttype[1].datatype[0].sstruct.field[0].name "author"
+documenttype[1].datatype[0].sstruct.field[0].id 644499292
+documenttype[1].datatype[0].sstruct.field[0].id_v6 177126295
+documenttype[1].datatype[0].sstruct.field[0].datatype 2
+documenttype[1].datatype[0].sstruct.field[1].name "date"
+documenttype[1].datatype[0].sstruct.field[1].id 491786523
+documenttype[1].datatype[0].sstruct.field[1].id_v6 916979460
+documenttype[1].datatype[0].sstruct.field[1].datatype 0
+documenttype[1].datatype[0].sstruct.field[2].name "subject"
+documenttype[1].datatype[0].sstruct.field[2].id 1797950813
+documenttype[1].datatype[0].sstruct.field[2].id_v6 943449689
+documenttype[1].datatype[0].sstruct.field[2].datatype 2
+documenttype[1].datatype[1].id 1473469605
+documenttype[1].datatype[1].type STRUCT
+documenttype[1].datatype[1].array.element.id 0
+documenttype[1].datatype[1].map.key.id 0
+documenttype[1].datatype[1].map.value.id 0
+documenttype[1].datatype[1].wset.key.id 0
+documenttype[1].datatype[1].wset.createifnonexistent false
+documenttype[1].datatype[1].wset.removeifzero false
+documenttype[1].datatype[1].annotationref.annotation.id 0
+documenttype[1].datatype[1].sstruct.name "text/html.body"
+documenttype[1].datatype[1].sstruct.version 0
+documenttype[1].datatype[1].sstruct.compression.type NONE
+documenttype[1].datatype[1].sstruct.compression.level 0
+documenttype[1].datatype[1].sstruct.compression.threshold 90
+documenttype[1].datatype[1].sstruct.compression.minsize 0
+documenttype[1].datatype[1].sstruct.field[1]
+documenttype[1].datatype[1].sstruct.field[0].name "content"
+documenttype[1].datatype[1].sstruct.field[0].id 1721764358
+documenttype[1].datatype[1].sstruct.field[0].id_v6 1751481844
+documenttype[1].datatype[1].sstruct.field[0].datatype 3
+documenttype[1].annotationtype[0]
+documenttype[2].id 238423572
+documenttype[2].name "testdoctype1"
+documenttype[2].version 1
+documenttype[2].headerstruct -226322995
+documenttype[2].bodystruct -1016297758
+documenttype[2].inherits[0]
+documenttype[2].datatype[2]
+documenttype[2].datatype[0].id -226322995
+documenttype[2].datatype[0].type STRUCT
+documenttype[2].datatype[0].array.element.id 0
+documenttype[2].datatype[0].map.key.id 0
+documenttype[2].datatype[0].map.value.id 0
+documenttype[2].datatype[0].wset.key.id 0
+documenttype[2].datatype[0].wset.createifnonexistent false
+documenttype[2].datatype[0].wset.removeifzero false
+documenttype[2].datatype[0].annotationref.annotation.id 0
+documenttype[2].datatype[0].sstruct.name "testdoctype1.header"
+documenttype[2].datatype[0].sstruct.version 1
+documenttype[2].datatype[0].sstruct.compression.type NONE
+documenttype[2].datatype[0].sstruct.compression.level 0
+documenttype[2].datatype[0].sstruct.compression.threshold 90
+documenttype[2].datatype[0].sstruct.compression.minsize 0
+documenttype[2].datatype[0].sstruct.field[0]
+documenttype[2].datatype[1].id -1016297758
+documenttype[2].datatype[1].type STRUCT
+documenttype[2].datatype[1].array.element.id 0
+documenttype[2].datatype[1].map.key.id 0
+documenttype[2].datatype[1].map.value.id 0
+documenttype[2].datatype[1].wset.key.id 0
+documenttype[2].datatype[1].wset.createifnonexistent false
+documenttype[2].datatype[1].wset.removeifzero false
+documenttype[2].datatype[1].annotationref.annotation.id 0
+documenttype[2].datatype[1].sstruct.name "testdoctype1.body"
+documenttype[2].datatype[1].sstruct.version 1
+documenttype[2].datatype[1].sstruct.compression.type NONE
+documenttype[2].datatype[1].sstruct.compression.level 0
+documenttype[2].datatype[1].sstruct.compression.threshold 90
+documenttype[2].datatype[1].sstruct.compression.minsize 0
+documenttype[2].datatype[1].sstruct.field[1]
+documenttype[2].datatype[1].sstruct.field[0].name "content"
+documenttype[2].datatype[1].sstruct.field[0].id 5
+documenttype[2].datatype[1].sstruct.field[0].id_v6 5
+documenttype[2].datatype[1].sstruct.field[0].datatype 2
+documenttype[2].annotationtype[0]
diff --git a/storage/src/tests/config-document.cfg b/storage/src/tests/config-document.cfg
new file mode 100644
index 00000000000..0ec7e881ddf
--- /dev/null
+++ b/storage/src/tests/config-document.cfg
@@ -0,0 +1,78 @@
+enablecompression false
+datatype[6]
+datatype[0].id 143329936
+datatype[0].arraytype[0]
+datatype[0].weightedsettype[0]
+datatype[0].structtype[1]
+datatype[0].structtype[0].name text/html.header
+datatype[0].structtype[0].version 0
+datatype[0].structtype[0].field[3]
+datatype[0].structtype[0].field[0].name author
+datatype[0].structtype[0].field[0].id[0]
+datatype[0].structtype[0].field[0].datatype 2
+datatype[0].structtype[0].field[1].name subject
+datatype[0].structtype[0].field[1].id[0]
+datatype[0].structtype[0].field[1].datatype 2
+datatype[0].structtype[0].field[2].name date
+datatype[0].structtype[0].field[2].id[0]
+datatype[0].structtype[0].field[2].datatype 0
+datatype[0].documenttype[0]
+datatype[1].id 1473469605
+datatype[1].arraytype[0]
+datatype[1].weightedsettype[0]
+datatype[1].structtype[1]
+datatype[1].structtype[0].name text/html.body
+datatype[1].structtype[0].version 0
+datatype[1].structtype[0].field[1]
+datatype[1].structtype[0].field[0].name content
+datatype[1].structtype[0].field[0].id[0]
+datatype[1].structtype[0].field[0].datatype 3
+datatype[1].documenttype[0]
+datatype[2].id -653677105
+datatype[2].arraytype[0]
+datatype[2].weightedsettype[0]
+datatype[2].structtype[0]
+datatype[2].documenttype[1]
+datatype[2].documenttype[0].name text/html
+datatype[2].documenttype[0].version 0
+datatype[2].documenttype[0].inherits[0]
+datatype[2].documenttype[0].headerstruct 143329936
+datatype[2].documenttype[0].bodystruct 1473469605
+datatype[3].id 160469461
+datatype[3].arraytype[0]
+datatype[3].weightedsettype[0]
+datatype[3].structtype[1]
+datatype[3].structtype[0].name text/plain.header
+datatype[3].structtype[0].version 0
+datatype[3].structtype[0].field[3]
+datatype[3].structtype[0].field[0].name author
+datatype[3].structtype[0].field[0].id[0]
+datatype[3].structtype[0].field[0].datatype 2
+datatype[3].structtype[0].field[1].name subject
+datatype[3].structtype[0].field[1].id[0]
+datatype[3].structtype[0].field[1].datatype 2
+datatype[3].structtype[0].field[2].name date
+datatype[3].structtype[0].field[2].id[0]
+datatype[3].structtype[0].field[2].datatype 0
+datatype[3].documenttype[0]
+datatype[4].id 749465898
+datatype[4].arraytype[0]
+datatype[4].weightedsettype[0]
+datatype[4].structtype[1]
+datatype[4].structtype[0].name text/plain.body
+datatype[4].structtype[0].version 0
+datatype[4].structtype[0].field[1]
+datatype[4].structtype[0].field[0].name content
+datatype[4].structtype[0].field[0].id[0]
+datatype[4].structtype[0].field[0].datatype 3
+datatype[4].documenttype[0]
+datatype[5].id -519202262
+datatype[5].arraytype[0]
+datatype[5].weightedsettype[0]
+datatype[5].structtype[0]
+datatype[5].documenttype[1]
+datatype[5].documenttype[0].name text/plain
+datatype[5].documenttype[0].version 0
+datatype[5].documenttype[0].inherits[0]
+datatype[5].documenttype[0].headerstruct 160469461
+datatype[5].documenttype[0].bodystruct 749465898
diff --git a/storage/src/tests/config-testdocman-document.cfg b/storage/src/tests/config-testdocman-document.cfg
new file mode 100644
index 00000000000..c4bf43d9e37
--- /dev/null
+++ b/storage/src/tests/config-testdocman-document.cfg
@@ -0,0 +1,138 @@
+datatype[14]
+datatype[0].id 1001
+datatype[0].arraytype[1]
+datatype[0].arraytype[0].datatype 2
+datatype[1].id 2001
+datatype[1].weightedsettype[1]
+datatype[1].weightedsettype[0].datatype 2
+datatype[1].weightedsettype[0].createifnonexistant false
+datatype[1].weightedsettype[0].removeifzero false
+datatype[2].id -2092985851
+datatype[2].structtype[1]
+datatype[2].structtype[0].name mystruct
+datatype[2].structtype[0].version 2
+datatype[2].structtype[0].field[2]
+datatype[2].structtype[0].field[0].name key
+datatype[2].structtype[0].field[0].id[1]
+datatype[2].structtype[0].field[0].id[0].id 1
+datatype[2].structtype[0].field[0].datatype 0
+datatype[2].structtype[0].field[1].name value
+datatype[2].structtype[0].field[1].id[1]
+datatype[2].structtype[0].field[1].id[0].id 2
+datatype[2].structtype[0].field[1].datatype 2
+datatype[3].id -1244861287
+datatype[3].arraytype[1]
+datatype[3].arraytype[0].datatype 3
+datatype[4].id 759956026
+datatype[4].arraytype[1]
+datatype[4].arraytype[0].datatype -2092985851
+datatype[5].id -226322995
+datatype[5].structtype[1]
+datatype[5].structtype[0].name testdoctype1.header
+datatype[5].structtype[0].version 1
+datatype[5].structtype[0].field[9]
+datatype[5].structtype[0].field[0].name headerval
+datatype[5].structtype[0].field[0].id[1]
+datatype[5].structtype[0].field[0].id[0].id 2
+datatype[5].structtype[0].field[0].datatype 0
+datatype[5].structtype[0].field[1].name hfloatval
+datatype[5].structtype[0].field[1].id[1]
+datatype[5].structtype[0].field[1].id[0].id 3
+datatype[5].structtype[0].field[1].datatype 1
+datatype[5].structtype[0].field[2].name hstringval
+datatype[5].structtype[0].field[2].id[1]
+datatype[5].structtype[0].field[2].id[0].id 4
+datatype[5].structtype[0].field[2].datatype 2
+datatype[5].structtype[0].field[3].name mystruct
+datatype[5].structtype[0].field[3].id[1]
+datatype[5].structtype[0].field[3].id[0].id 513
+datatype[5].structtype[0].field[3].datatype -2092985851
+datatype[5].structtype[0].field[4].name stringweightedset
+datatype[5].structtype[0].field[4].id[1]
+datatype[5].structtype[0].field[4].id[0].id 7
+datatype[5].structtype[0].field[4].datatype 2001
+datatype[5].structtype[0].field[5].name stringweightedset2
+datatype[5].structtype[0].field[5].id[1]
+datatype[5].structtype[0].field[5].id[0].id 8
+datatype[5].structtype[0].field[5].datatype 18
+datatype[5].structtype[0].field[6].name tags
+datatype[5].structtype[0].field[6].id[1]
+datatype[5].structtype[0].field[6].id[0].id 6
+datatype[5].structtype[0].field[6].datatype 1001
+datatype[5].structtype[0].field[7].name title
+datatype[5].structtype[0].field[7].id[1]
+datatype[5].structtype[0].field[7].id[0].id 12
+datatype[5].structtype[0].field[7].datatype 2
+datatype[5].structtype[0].field[8].name headerlongval
+datatype[5].structtype[0].field[8].id[1]
+datatype[5].structtype[0].field[8].id[0].id 9999
+datatype[5].structtype[0].field[8].datatype 4
+datatype[6].id -1016297758
+datatype[6].structtype[1]
+datatype[6].structtype[0].name testdoctype1.body
+datatype[6].structtype[0].version 1
+datatype[6].structtype[0].field[3]
+datatype[6].structtype[0].field[0].name content
+datatype[6].structtype[0].field[0].id[1]
+datatype[6].structtype[0].field[0].id[0].id 5
+datatype[6].structtype[0].field[0].datatype 2
+datatype[6].structtype[0].field[1].name rawarray
+datatype[6].structtype[0].field[1].id[1]
+datatype[6].structtype[0].field[1].id[0].id 10
+datatype[6].structtype[0].field[1].datatype -1244861287
+datatype[6].structtype[0].field[2].name structarray
+datatype[6].structtype[0].field[2].id[1]
+datatype[6].structtype[0].field[2].id[0].id 7123
+datatype[6].structtype[0].field[2].datatype 759956026
+datatype[7].id 238423572
+datatype[7].documenttype[1]
+datatype[7].documenttype[0].name testdoctype1
+datatype[7].documenttype[0].version 1
+datatype[7].documenttype[0].headerstruct -226322995
+datatype[7].documenttype[0].bodystruct -1016297758
+datatype[8].id -422836500
+datatype[8].structtype[1]
+datatype[8].structtype[0].name testdoctype2.header
+datatype[8].structtype[0].version 1
+datatype[8].structtype[0].field[1]
+datatype[8].structtype[0].field[0].name onlyinchild
+datatype[8].structtype[0].field[0].id[1]
+datatype[8].structtype[0].field[0].id[0].id 9
+datatype[8].structtype[0].field[0].datatype 0
+datatype[9].id 726512577
+datatype[9].structtype[1]
+datatype[9].structtype[0].name testdoctype2.body
+datatype[9].structtype[0].version 1
+datatype[9].structtype[0].field[0]
+datatype[10].id 238424533
+datatype[10].documenttype[1]
+datatype[10].documenttype[0].name testdoctype2
+datatype[10].documenttype[0].version 1
+datatype[10].documenttype[0].inherits[1]
+datatype[10].documenttype[0].inherits[0].name testdoctype1
+datatype[10].documenttype[0].inherits[0].version 1
+datatype[10].documenttype[0].headerstruct -422836500
+datatype[10].documenttype[0].bodystruct 726512577
+datatype[11].id -1301366770
+datatype[11].structtype[1]
+datatype[11].structtype[0].name _test_doctype3_.header
+datatype[11].structtype[0].version 1
+datatype[11].structtype[0].field[1]
+datatype[11].structtype[0].field[0].name _only_in_child_
+datatype[11].structtype[0].field[0].id[1]
+datatype[11].structtype[0].field[0].id[0].id 9
+datatype[11].structtype[0].field[0].datatype 0
+datatype[12].id 1422804323
+datatype[12].structtype[1]
+datatype[12].structtype[0].name _test_doctype3_.body
+datatype[12].structtype[0].version 1
+datatype[12].structtype[0].field[0]
+datatype[13].id 1088783091
+datatype[13].documenttype[1]
+datatype[13].documenttype[0].name _test_doctype3_
+datatype[13].documenttype[0].version 1
+datatype[13].documenttype[0].inherits[1]
+datatype[13].documenttype[0].inherits[0].name testdoctype1
+datatype[13].documenttype[0].inherits[0].version 1
+datatype[13].documenttype[0].headerstruct -1301366770
+datatype[13].documenttype[0].bodystruct 1422804323
diff --git a/storage/src/tests/distributor/.gitignore b/storage/src/tests/distributor/.gitignore
new file mode 100644
index 00000000000..333f254ba10
--- /dev/null
+++ b/storage/src/tests/distributor/.gitignore
@@ -0,0 +1,8 @@
+*.So
+*.lo
+.*.swp
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
diff --git a/storage/src/tests/distributor/CMakeLists.txt b/storage/src/tests/distributor/CMakeLists.txt
new file mode 100644
index 00000000000..6c6ba62ba6e
--- /dev/null
+++ b/storage/src/tests/distributor/CMakeLists.txt
@@ -0,0 +1,44 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_testdistributor
+ SOURCES
+ mergelimitertest.cpp
+ bucketdatabasetest.cpp
+ messagesenderstub.cpp
+ externaloperationhandlertest.cpp
+ getoperationtest.cpp
+ idealstatemanagertest.cpp
+ putoperationtest.cpp
+ removeoperationtest.cpp
+ removebucketoperationtest.cpp
+ mergeoperationtest.cpp
+ splitbuckettest.cpp
+ joinbuckettest.cpp
+ visitoroperationtest.cpp
+ twophaseupdateoperationtest.cpp
+ removelocationtest.cpp
+ bucketdbupdatertest.cpp
+ statoperationtest.cpp
+ pendingmessagetrackertest.cpp
+ distributortestutil.cpp
+ simplebucketprioritydatabasetest.cpp
+ simplemaintenancescannertest.cpp
+ maintenanceschedulertest.cpp
+ throttlingoperationstartertest.cpp
+ blockingoperationstartertest.cpp
+ nodeinfotest.cpp
+ updateoperationtest.cpp
+ bucketstateoperationtest.cpp
+ distributortest.cpp
+ mapbucketdatabasetest.cpp
+ operationtargetresolvertest.cpp
+ garbagecollectiontest.cpp
+ statecheckerstest.cpp
+ statusreporterdelegatetest.cpp
+ bucketdbmetricupdatertest.cpp
+ bucketgctimecalculatortest.cpp
+ nodemaintenancestatstrackertest.cpp
+ distributor_host_info_reporter_test.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/tests/distributor/blockingoperationstartertest.cpp b/storage/src/tests/distributor/blockingoperationstartertest.cpp
new file mode 100644
index 00000000000..ee0058643d9
--- /dev/null
+++ b/storage/src/tests/distributor/blockingoperationstartertest.cpp
@@ -0,0 +1,78 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <string>
+#include <sstream>
+#include <memory>
+#include <vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h>
+#include <vespa/storage/distributor/blockingoperationstarter.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+#include <tests/distributor/maintenancemocks.h>
+
+namespace storage {
+
+namespace distributor {
+
+using document::BucketId;
+
+class BlockingOperationStarterTest : public CppUnit::TestFixture {
+ CPPUNIT_TEST_SUITE(BlockingOperationStarterTest);
+ CPPUNIT_TEST(testOperationNotBlockedWhenNoMessagesPending);
+ CPPUNIT_TEST(testOperationBlockedWhenMessagesPending);
+ CPPUNIT_TEST_SUITE_END();
+
+ std::shared_ptr<Operation> createMockOperation() {
+ return std::shared_ptr<Operation>(new MockOperation(BucketId(16, 1)));
+ }
+ std::shared_ptr<Operation> createBlockingMockOperation() {
+ std::shared_ptr<MockOperation> op(new MockOperation(BucketId(16, 1)));
+ op->setShouldBlock(true);
+ return op;
+ }
+
+ framework::defaultimplementation::FakeClock _clock;
+ std::unique_ptr<MockOperationStarter> _starterImpl;
+ std::unique_ptr<StorageComponentRegisterImpl> _compReg;
+ std::unique_ptr<PendingMessageTracker> _messageTracker;
+ std::unique_ptr<BlockingOperationStarter> _operationStarter;
+
+public:
+ void testOperationNotBlockedWhenNoMessagesPending();
+ void testOperationBlockedWhenMessagesPending();
+
+ void setUp();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(BlockingOperationStarterTest);
+
+void
+BlockingOperationStarterTest::setUp()
+{
+ _starterImpl.reset(new MockOperationStarter());
+ _compReg.reset(new StorageComponentRegisterImpl());
+ _compReg->setClock(_clock);
+ _clock.setAbsoluteTimeInSeconds(1);
+ _messageTracker.reset(new PendingMessageTracker(*_compReg));
+ _operationStarter.reset(new BlockingOperationStarter(*_messageTracker, *_starterImpl));
+}
+
+void
+BlockingOperationStarterTest::testOperationNotBlockedWhenNoMessagesPending()
+{
+ CPPUNIT_ASSERT(_operationStarter->start(createMockOperation(),
+ OperationStarter::Priority(0)));
+ CPPUNIT_ASSERT_EQUAL(std::string("BucketId(0x4000000000000001), pri 0\n"),
+ _starterImpl->toString());
+}
+
+void
+BlockingOperationStarterTest::testOperationBlockedWhenMessagesPending()
+{
+ // start should return true but not forward message to underlying starter.
+ CPPUNIT_ASSERT(_operationStarter->start(createBlockingMockOperation(),
+ OperationStarter::Priority(0)));
+ CPPUNIT_ASSERT_EQUAL(std::string(""), _starterImpl->toString());
+}
+
+}
+}
diff --git a/storage/src/tests/distributor/bucketdatabasetest.cpp b/storage/src/tests/distributor/bucketdatabasetest.cpp
new file mode 100644
index 00000000000..011b02c8f89
--- /dev/null
+++ b/storage/src/tests/distributor/bucketdatabasetest.cpp
@@ -0,0 +1,550 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <tests/distributor/bucketdatabasetest.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+#include <iostream>
+#include <fstream>
+#include <iomanip>
+
+namespace storage {
+namespace distributor {
+
+using document::BucketId;
+
+void
+BucketDatabaseTest::setUp()
+{
+ db().clear();
+}
+
+namespace {
+ BucketCopy BC(uint32_t nodeIdx) {
+ return BucketCopy(0, nodeIdx, api::BucketInfo());
+ }
+
+ BucketInfo BI(uint32_t nodeIdx) {
+ BucketInfo bi;
+ bi.addNode(BC(nodeIdx), toVector<uint16_t>(0));
+ return bi;
+ }
+}
+
+void
+BucketDatabaseTest::testClear() {
+ db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1)));
+ db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2)));
+ db().clear();
+ CPPUNIT_ASSERT_EQUAL(size_t(0), db().size());
+}
+
+void
+BucketDatabaseTest::testUpdateGetAndRemove() {
+ // Do some insertions
+ CPPUNIT_ASSERT_EQUAL(0, (int)db().size());
+ db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1)));
+ db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2)));
+ db().update(BucketDatabase::Entry(document::BucketId(16, 42), BI(3)));
+ CPPUNIT_ASSERT_EQUAL(3, (int)db().size());
+
+ db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(4)));
+ CPPUNIT_ASSERT_EQUAL(3, (int)db().size());
+
+ // Access some elements
+ CPPUNIT_ASSERT_EQUAL(BI(4), db().get(document::BucketId(16, 11)).getBucketInfo());
+ CPPUNIT_ASSERT_EQUAL(BI(1), db().get(document::BucketId(16, 16)).getBucketInfo());
+ CPPUNIT_ASSERT_EQUAL(BI(3), db().get(document::BucketId(16, 42)).getBucketInfo());
+
+ // Do removes
+ db().remove(document::BucketId(16, 12));
+
+ CPPUNIT_ASSERT_EQUAL(3, (int)db().size());
+
+ db().remove(document::BucketId(16, 11));
+
+ CPPUNIT_ASSERT_EQUAL(2, (int)db().size());
+
+ db().remove(document::BucketId(16, 16));
+ db().remove(document::BucketId(16, 42));
+
+ CPPUNIT_ASSERT_EQUAL(0, (int)db().size());
+}
+
+namespace {
+
+struct ModifyProcessor : public BucketDatabase::MutableEntryProcessor
+{
+ bool process(BucketDatabase::Entry& e) {
+ if (e.getBucketId() == document::BucketId(16, 0x0b)) {
+ e.getBucketInfo() = BI(7);
+ } else if (e.getBucketId() == document::BucketId(16, 0x2a)) {
+ e->clear();
+ e->addNode(BC(4), toVector<uint16_t>(0));
+ e->addNode(BC(5), toVector<uint16_t>(0));
+ }
+
+ return true;
+ }
+};
+
+struct ListAllProcessor : public BucketDatabase::EntryProcessor
+{
+ std::ostringstream ost;
+
+ bool process(const BucketDatabase::Entry& e) {
+ ost << e << "\n";
+ return true;
+ }
+};
+
+struct DummyProcessor : public BucketDatabase::EntryProcessor
+{
+ std::ostringstream ost;
+
+ bool process(const BucketDatabase::Entry&) {
+ return true;
+ }
+};
+
+
+struct StoppingProcessor : public BucketDatabase::EntryProcessor
+{
+ std::ostringstream ost;
+
+ bool process(const BucketDatabase::Entry& e) {
+ ost << e << "\n";
+
+ if (e.getBucketId() == document::BucketId(16, 0x2a)) {
+ return false;
+ }
+
+ return true;
+ }
+};
+
+}
+
+void
+BucketDatabaseTest::testIterating() {
+ // Do some insertions
+ db().update(BucketDatabase::Entry(document::BucketId(16, 0x10), BI(1)));
+ db().update(BucketDatabase::Entry(document::BucketId(16, 0x0b), BI(2)));
+ db().update(BucketDatabase::Entry(document::BucketId(16, 0x2a), BI(3)));
+
+ {
+ ListAllProcessor proc;
+ db().forEach(proc, document::BucketId());
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "BucketId(0x4000000000000010) : "
+ "node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false)\n"
+ "BucketId(0x400000000000002a) : "
+ "node(idx=3,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false)\n"
+ "BucketId(0x400000000000000b) : "
+ "node(idx=2,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false)\n"),
+ proc.ost.str());
+ }
+
+ {
+ ListAllProcessor proc;
+ db().forEach(proc, document::BucketId(16, 0x2a));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "BucketId(0x400000000000000b) : "
+ "node(idx=2,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false)\n"),
+ proc.ost.str());
+ }
+
+ {
+ StoppingProcessor proc;
+ db().forEach(proc, document::BucketId());
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "BucketId(0x4000000000000010) : "
+ "node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false)\n"
+ "BucketId(0x400000000000002a) : "
+ "node(idx=3,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false)\n"),
+ proc.ost.str());
+ }
+
+ {
+ ModifyProcessor alterProc;
+ db().forEach(alterProc, document::BucketId());
+ // Verify content after altering
+ ListAllProcessor proc;
+ db().forEach(proc);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "BucketId(0x4000000000000010) : "
+ "node(idx=1,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false)\n"
+ "BucketId(0x400000000000002a) : "
+ "node(idx=4,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false), "
+ "node(idx=5,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false)\n"
+ "BucketId(0x400000000000000b) : "
+ "node(idx=7,crc=0x0,docs=0/0,bytes=1/1,trusted=false,active=false)\n"),
+ proc.ost.str());
+ }
+}
+
+std::string
+BucketDatabaseTest::doFindParents(const std::vector<document::BucketId>& ids,
+ const document::BucketId& searchId)
+{
+ db().clear();
+
+ for (uint32_t i = 0; i < ids.size(); ++i) {
+ db().update(BucketDatabase::Entry(ids[i], BI(i)));
+ }
+
+ std::vector<BucketDatabase::Entry> entries;
+ db().getParents(searchId, entries);
+
+ std::ostringstream ost;
+ for (uint32_t i = 0; i < ids.size(); ++i) {
+ if (std::find(entries.begin(), entries.end(),
+ BucketDatabase::Entry(ids[i], BI(i))) != entries.end()) {
+ if (!ost.str().empty()) {
+ ost << ",";
+ }
+ ost << i;
+ }
+ }
+
+ return ost.str();
+}
+
+void
+BucketDatabaseTest::testFindParents() {
+ // test what parents in the DB (specified in vector) are parents of the
+ // specified bucket. Result is a list of indexes into the vector.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("2"),
+ doFindParents(toVector(document::BucketId(17, 0x0ffff),
+ document::BucketId(18, 0x1ffff),
+ document::BucketId(18, 0x3ffff)),
+ document::BucketId(22, 0xfffff)));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,2,3"),
+ doFindParents(toVector(document::BucketId(16, 0x0ffff),
+ document::BucketId(17, 0x0ffff),
+ document::BucketId(17, 0x1ffff),
+ document::BucketId(19, 0xfffff)),
+ document::BucketId(22, 0xfffff)));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,2,3"),
+ doFindParents(toVector(document::BucketId(16, 0x0ffff),
+ document::BucketId(17, 0x0ffff),
+ document::BucketId(17, 0x1ffff),
+ document::BucketId(18, 0x1ffff)),
+ document::BucketId(22, 0x1ffff)));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0"),
+ doFindParents(toVector(document::BucketId(16, 0x0ffff),
+ document::BucketId(17, 0x0ffff)),
+ document::BucketId(22, 0x1ffff)));
+
+ CPPUNIT_ASSERT_EQUAL( // ticket 3121525
+ std::string("0"),
+ doFindParents(toVector(document::BucketId(16, 0x0ffff),
+ document::BucketId(17, 0x0ffff),
+ document::BucketId(19, 0x1ffff)),
+ document::BucketId(18, 0x1ffff)));
+
+ CPPUNIT_ASSERT_EQUAL( // ticket 3121525
+ std::string("0"),
+ doFindParents(toVector(document::BucketId(16, 0x0ffff),
+ document::BucketId(17, 0x0ffff),
+ document::BucketId(19, 0x5ffff)),
+ document::BucketId(18, 0x1ffff)));
+}
+
+std::string
+BucketDatabaseTest::doFindAll(const std::vector<document::BucketId>& ids,
+ const document::BucketId& searchId)
+{
+ db().clear();
+
+ for (uint32_t i = 0; i < ids.size(); ++i) {
+ db().update(BucketDatabase::Entry(ids[i], BI(i)));
+ }
+
+ std::vector<BucketDatabase::Entry> entries;
+ db().getAll(searchId, entries);
+
+ std::ostringstream ost;
+ for (uint32_t i = 0; i < ids.size(); ++i) {
+ if (std::find(entries.begin(), entries.end(),
+ BucketDatabase::Entry(ids[i], BI(i))) != entries.end()) {
+ if (!ost.str().empty()) {
+ ost << ",";
+ }
+ ost << i;
+ }
+ }
+
+ return ost.str();
+}
+
+void
+BucketDatabaseTest::testFindAll()
+{
+ std::vector<document::BucketId> buckets;
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ doFindAll(buckets, document::BucketId(18, 0x1ffff)));
+
+ buckets.push_back(document::BucketId(16, 0x0aaaa)); // contains bucket 2-7
+ buckets.push_back(document::BucketId(17, 0x0aaaa)); // contains bucket 3-4
+ buckets.push_back(document::BucketId(20, 0xcaaaa));
+ buckets.push_back(document::BucketId(20, 0xeaaaa));
+ buckets.push_back(document::BucketId(17, 0x1aaaa)); // contains bucket 6-7
+ buckets.push_back(document::BucketId(20, 0xdaaaa));
+ buckets.push_back(document::BucketId(20, 0xfaaaa));
+ buckets.push_back(document::BucketId(20, 0xceaaa));
+ buckets.push_back(document::BucketId(17, 0x1ffff));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,4,5,6"),
+ doFindAll(buckets, document::BucketId(17, 0x1aaaa)));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("8"),
+ doFindAll(buckets, document::BucketId(16, 0xffff)));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,1"),
+ doFindAll(toVector(document::BucketId(17, 0x00001),
+ document::BucketId(17, 0x10001)),
+ document::BucketId(16, 0x00001)));
+
+ document::BucketId id(33, 0x1053c7089); // Bit 32 is set, but unused.
+ id.setUsedBits(32);
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("1,2"),
+ doFindAll(toVector(document::BucketId(24, 0x000dc7089),
+ document::BucketId(33, 0x0053c7089),
+ document::BucketId(33, 0x1053c7089),
+ document::BucketId(24, 0x000bc7089)),
+ id));
+
+ CPPUNIT_ASSERT_EQUAL( // Inconsistent split
+ std::string("0,1,2"),
+ doFindAll(toVector(
+ document::BucketId(16, 0x00001), // contains 2-3
+ document::BucketId(17, 0x00001),
+ document::BucketId(17, 0x10001)),
+ document::BucketId(16, 0x00001)));
+
+ CPPUNIT_ASSERT_EQUAL( // Inconsistent split
+ std::string("1,2"),
+ doFindAll(toVector(
+ document::BucketId(17, 0x10000),
+ document::BucketId(27, 0x007228034), // contains 3
+ document::BucketId(29, 0x007228034),
+ document::BucketId(17, 0x1ffff)),
+ document::BucketId(32, 0x027228034)));
+
+ CPPUNIT_ASSERT_EQUAL( // Inconsistent split
+ std::string("0"),
+ doFindAll(toVector(
+ document::BucketId(16, 0x0ffff),
+ document::BucketId(17, 0x0ffff)),
+ document::BucketId(22, 0x1ffff)));
+
+ CPPUNIT_ASSERT_EQUAL( // Inconsistent split
+ std::string("0,2"),
+ doFindAll(toVector(
+ document::BucketId(16, 0x0ffff),
+ document::BucketId(17, 0x0ffff),
+ document::BucketId(19, 0x1ffff)),
+ document::BucketId(18, 0x1ffff)));
+
+ CPPUNIT_ASSERT_EQUAL( // Inconsistent split, ticket 3121525
+ std::string("0,2"),
+ doFindAll(toVector(
+ document::BucketId(16, 0x0ffff),
+ document::BucketId(17, 0x0ffff),
+ document::BucketId(19, 0x5ffff)),
+ document::BucketId(18, 0x1ffff)));
+}
+
+document::BucketId
+BucketDatabaseTest::doCreate(const std::vector<document::BucketId>& ids,
+ uint32_t minBits,
+ const document::BucketId& wantedId)
+{
+ db().clear();
+
+ for (uint32_t i = 0; i < ids.size(); ++i) {
+ db().update(BucketDatabase::Entry(ids[i], BI(i)));
+ }
+
+ BucketDatabase::Entry entry = db().createAppropriateBucket(minBits, wantedId);
+ return entry.getBucketId();
+}
+
+void
+BucketDatabaseTest::testCreateAppropriateBucket() {
+ // Use min split bits when no relevant bucket exist.
+ CPPUNIT_ASSERT_EQUAL(
+ document::BucketId(36,0x0000004d2),
+ doCreate(toVector(document::BucketId(58, 0x43d6c878000004d2ull)), 36,
+ document::BucketId(58, 0x423bf1e0000004d2ull)));
+ // New bucket has bits in common with existing bucket.
+ // Create bucket with min amount of bits while not being overlapping
+ CPPUNIT_ASSERT_EQUAL(
+ document::BucketId(34,0x0000004d2),
+ doCreate(toVector(document::BucketId(58, 0xeaf77782000004d2)),
+ 16,
+ document::BucketId(58, 0x00000000000004d2)));
+ // Create sibling of existing bucket with most LSB bits in common.
+ CPPUNIT_ASSERT_EQUAL(
+ document::BucketId(40, 0x0000004d2),
+ doCreate(toVector(document::BucketId(58, 0xeaf77780000004d2),
+ document::BucketId(58, 0xeaf77782000004d2)),
+ 16,
+ document::BucketId(58, 0x00000000000004d2)));
+ // Create sibling of existing bucket with most LSB bits in common.
+ CPPUNIT_ASSERT_EQUAL(
+ document::BucketId(25, 0x0010004d2),
+ doCreate(toVector(document::BucketId(16, 0x00000000000004d1),
+ document::BucketId(40, 0x00000000000004d2)),
+ 16,
+ document::BucketId(58, 0x00000000010004d2)));
+
+ CPPUNIT_ASSERT_EQUAL(
+ document::BucketId(36, 0x10000004000004d2),
+ doCreate(toVector(document::BucketId(0x8c000000000004d2),
+ document::BucketId(0xeb54b3ac000004d2),
+ document::BucketId(0x88000002000004d2),
+ document::BucketId(0x84000001000004d2)),
+ 16,
+ document::BucketId(58, 0x1944a44000004d2)));
+ CPPUNIT_ASSERT_EQUAL(
+ document::BucketId(25, 0x0010004d2),
+ doCreate(toVector(document::BucketId(58, 0xeaf77780000004d2),
+ document::BucketId(40, 0x00000000000004d1)),
+ 16,
+ document::BucketId(58,0x00000000010004d2)));
+ // Test empty bucket database case. (Use min split bits)
+ std::vector<document::BucketId> buckets;
+ CPPUNIT_ASSERT_EQUAL(
+ document::BucketId(16, 0x0000004d2ull),
+ doCreate(buckets, 16,
+ document::BucketId(58, 0x00000000010004d2)));
+}
+
+void
+BucketDatabaseTest::testGetNext()
+{
+ db().clear();
+ db().update(BucketDatabase::Entry(document::BucketId(16, 16), BI(1)));
+ db().update(BucketDatabase::Entry(document::BucketId(16, 11), BI(2)));
+ db().update(BucketDatabase::Entry(document::BucketId(16, 42), BI(3)));
+
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 16),
+ db().getNext(document::BucketId()).getBucketId());
+
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 42),
+ db().getNext(document::BucketId(16, 16)).getBucketId());
+
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 11),
+ db().getNext(document::BucketId(16, 42)).getBucketId());
+}
+
+void
+BucketDatabaseTest::doTestUpperBound(const UBoundFunc& f)
+{
+ db().clear();
+ // Tree is rooted at the LSB bit, so the following buckets are in iteration
+ // order based on the reverse of their "normal" bitstring:
+ // 0010:3
+ db().update(BucketDatabase::Entry(document::BucketId(3, 4), BI(2)));
+ // 1000:3
+ db().update(BucketDatabase::Entry(document::BucketId(3, 1), BI(2)));
+ // 1001:4
+ db().update(BucketDatabase::Entry(document::BucketId(4, 9), BI(1)));
+ // 10010:5
+ db().update(BucketDatabase::Entry(document::BucketId(5, 9), BI(1)));
+ // 1100:3
+ db().update(BucketDatabase::Entry(document::BucketId(3, 3), BI(3)));
+
+ // 0000:0 (default constructed) has ubound of 0010:3
+ CPPUNIT_ASSERT_EQUAL(BucketId(3, 4), f(db(), BucketId()));
+ // 0011:4 has ubound of 1000:3
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(3, 1), f(db(), BucketId(4, 12)));
+ // 1000:1 has ubound of 1000:3
+ CPPUNIT_ASSERT_EQUAL(BucketId(3, 4), f(db(), BucketId(1, 0)));
+ CPPUNIT_ASSERT_EQUAL(BucketId(3, 1), f(db(), BucketId(3, 4)));
+ CPPUNIT_ASSERT_EQUAL(BucketId(4, 9), f(db(), BucketId(3, 1)));
+ CPPUNIT_ASSERT_EQUAL(BucketId(5, 9), f(db(), BucketId(4, 9)));
+ CPPUNIT_ASSERT_EQUAL(BucketId(3, 3), f(db(), BucketId(5, 9)));
+ // 100101:6 does not exist, should also return 1100:3
+ CPPUNIT_ASSERT_EQUAL(BucketId(3, 3), f(db(), BucketId(6, 41)));
+
+ // Test extremes.
+ db().clear();
+ db().update(BucketDatabase::Entry(document::BucketId(8, 0), BI(2)));
+ db().update(BucketDatabase::Entry(document::BucketId(8, 0xff), BI(2)));
+
+ CPPUNIT_ASSERT_EQUAL(BucketId(8, 0), f(db(), BucketId()));
+ CPPUNIT_ASSERT_EQUAL(BucketId(8, 0xff), f(db(), BucketId(8, 0)));
+}
+
+void
+BucketDatabaseTest::testUpperBoundReturnsNextInOrderGreaterBucket()
+{
+ doTestUpperBound([](const BucketDatabase& bucketDb,
+ const document::BucketId& id)
+ {
+ return bucketDb.upperBound(id).getBucketId();
+ });
+}
+
+void
+BucketDatabaseTest::testGetNextReturnsUpperBoundBucket()
+{
+ // getNext() would generally be implemented in terms of upperBound(), but
+ // make sure it conforms to the same contract in case this changes.
+ doTestUpperBound([](const BucketDatabase& bucketDb,
+ const document::BucketId& id)
+ {
+ return bucketDb.getNext(id).getBucketId();
+ });
+}
+
+void
+BucketDatabaseTest::testChildCount()
+{
+ db().clear();
+ // Empty tree; inserts cannot create inconsistencies.
+ CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1)));
+
+ // Same bucket; cannot be inconsistent with itself.
+ db().update(BucketDatabase::Entry(document::BucketId(3, 1), BI(1)));
+ CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1)));
+
+ // (2, 1) has one subtree.
+ CPPUNIT_ASSERT_EQUAL(1u, db().childCount(BucketId(2, 1)));
+
+ // Bucket exists in another subtree from (1, 1); inconsistency would
+ // result if we tried inserting it.
+ db().update(BucketDatabase::Entry(document::BucketId(3, 3), BI(2)));
+ CPPUNIT_ASSERT_EQUAL(2u, db().childCount(BucketId(1, 1)));
+
+ // Inner node with 1 subtree.
+ CPPUNIT_ASSERT_EQUAL(1u, db().childCount(BucketId(2, 3)));
+
+ // Leaves have no subtrees.
+ CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 1)));
+ CPPUNIT_ASSERT_EQUAL(0u, db().childCount(BucketId(3, 5)));
+}
+
+}
+} // storage
diff --git a/storage/src/tests/distributor/bucketdatabasetest.h b/storage/src/tests/distributor/bucketdatabasetest.h
new file mode 100644
index 00000000000..1eb8bf86add
--- /dev/null
+++ b/storage/src/tests/distributor/bucketdatabasetest.h
@@ -0,0 +1,63 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vespalib/util/document_runnable.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <vespa/storage/distributor/bucketdb/judybucketdatabase.h>
+#include <vespa/storage/storageutil/utils.h>
+
+#define SETUP_DATABASE_TESTS() \
+ CPPUNIT_TEST(testUpdateGetAndRemove); \
+ CPPUNIT_TEST(testClear); \
+ CPPUNIT_TEST(testIterating); \
+ CPPUNIT_TEST(testFindParents); \
+ CPPUNIT_TEST(testFindAll); \
+ CPPUNIT_TEST(testCreateAppropriateBucket); \
+ CPPUNIT_TEST(testGetNext); \
+ CPPUNIT_TEST(testGetNextReturnsUpperBoundBucket); \
+ CPPUNIT_TEST(testUpperBoundReturnsNextInOrderGreaterBucket); \
+ CPPUNIT_TEST(testChildCount);
+
+namespace storage {
+namespace distributor {
+
+struct BucketDatabaseTest : public CppUnit::TestFixture {
+ void setUp();
+
+ void testUpdateGetAndRemove();
+ void testClear();
+ void testIterating();
+ void testFindParents();
+ void testFindAll();
+ void testCreateAppropriateBucket();
+ void testGetNext();
+ void testGetNextReturnsUpperBoundBucket();
+ void testUpperBoundReturnsNextInOrderGreaterBucket();
+ void testChildCount();
+
+ void testBenchmark();
+
+ std::string doFindParents(const std::vector<document::BucketId>& ids,
+ const document::BucketId& searchId);
+ std::string doFindAll(const std::vector<document::BucketId>& ids,
+ const document::BucketId& searchId);
+ document::BucketId doCreate(const std::vector<document::BucketId>& ids,
+ uint32_t minBits,
+ const document::BucketId& wantedId);
+
+ virtual BucketDatabase& db() = 0;
+
+private:
+ using UBoundFunc = std::function<
+ document::BucketId(const BucketDatabase&,
+ const document::BucketId&)>;
+
+ void doTestUpperBound(const UBoundFunc& f);
+};
+
+}
+
+}
+
diff --git a/storage/src/tests/distributor/bucketdbmetricupdatertest.cpp b/storage/src/tests/distributor/bucketdbmetricupdatertest.cpp
new file mode 100644
index 00000000000..6aa9ef3a844
--- /dev/null
+++ b/storage/src/tests/distributor/bucketdbmetricupdatertest.cpp
@@ -0,0 +1,361 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <string>
+#include <sstream>
+#include <vespa/storage/distributor/bucketdb/bucketdbmetricupdater.h>
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storage/distributor/idealstatemetricsset.h>
+#include <vespa/storage/config/config-stor-distributormanager.h>
+
+namespace storage {
+namespace distributor {
+
+using document::BucketId;
+
+class BucketDBMetricUpdaterTest : public CppUnit::TestFixture {
+ CPPUNIT_TEST_SUITE(BucketDBMetricUpdaterTest);
+ CPPUNIT_TEST(testDocAndByteCountsAreUpdated);
+ CPPUNIT_TEST(testBucketsWithTooFewAndTooManyCopies);
+ CPPUNIT_TEST(testBucketsWithVaryingTrustedness);
+ CPPUNIT_TEST(testPickCountsFromTrustedCopy);
+ CPPUNIT_TEST(testPickLargestCopyIfNoTrusted);
+ CPPUNIT_TEST(testCompleteRoundClearsWorkingState);
+ CPPUNIT_TEST(testMinBucketReplicaTrackedAndReportedPerNode);
+ CPPUNIT_TEST(nonTrustedReplicasAlsoCountedInModeAny);
+ CPPUNIT_TEST(minimumReplicaCountReturnedForNodeInModeAny);
+ CPPUNIT_TEST_SUITE_END();
+
+ void visitBucketWith2Copies1Trusted(BucketDBMetricUpdater& metricUpdater);
+ void visitBucketWith2CopiesBothTrusted(
+ BucketDBMetricUpdater& metricUpdater);
+ void visitBucketWith1Copy(BucketDBMetricUpdater& metricUpdater);
+
+
+ using NodeToReplicasMap = std::unordered_map<uint16_t, uint32_t>;
+ NodeToReplicasMap replicaStatsOf(BucketDBMetricUpdater& metricUpdater);
+
+ metrics::LoadTypeSet _loadTypes;
+public:
+ BucketDBMetricUpdaterTest();
+
+ void testDocAndByteCountsAreUpdated();
+ void testBucketsWithTooFewAndTooManyCopies();
+ void testBucketsWithVaryingTrustedness();
+ void testPickCountsFromTrustedCopy();
+ void testPickLargestCopyIfNoTrusted();
+ void testCompleteRoundClearsWorkingState();
+ void testMinBucketReplicaTrackedAndReportedPerNode();
+ void nonTrustedReplicasAlsoCountedInModeAny();
+ void minimumReplicaCountReturnedForNodeInModeAny();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(BucketDBMetricUpdaterTest);
+
+BucketDBMetricUpdaterTest::BucketDBMetricUpdaterTest()
+{
+ _loadTypes.push_back(metrics::LoadType(0, "foo"));
+}
+
+namespace {
+
+void addNode(BucketInfo& info, uint16_t node, uint32_t crc) {
+ auto apiInfo = api::BucketInfo(crc, crc + 1, crc + 2);
+ std::vector<uint16_t> order;
+ info.addNode(BucketCopy(1234, node, apiInfo), order);
+}
+
+typedef bool Trusted;
+
+BucketInfo
+makeInfo(uint32_t copy0Crc)
+{
+ BucketInfo info;
+ addNode(info, 0, copy0Crc);
+ return info;
+}
+
+BucketInfo
+makeInfo(uint32_t copy0Crc, uint32_t copy1Crc)
+{
+ BucketInfo info;
+ addNode(info, 0, copy0Crc);
+ addNode(info, 1, copy1Crc);
+ return info;
+}
+
+} // anonymous namespace
+
+void
+BucketDBMetricUpdaterTest::testDocAndByteCountsAreUpdated()
+{
+ BucketDBMetricUpdater metricUpdater;
+ IdealStateMetricSet ims;
+ DistributorMetricSet dms(_loadTypes);
+
+ CPPUNIT_ASSERT_EQUAL(false, metricUpdater.hasCompletedRound());
+
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+ metricUpdater.completeRound(false);
+
+ CPPUNIT_ASSERT_EQUAL(true, metricUpdater.hasCompletedRound());
+
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), dms.docsStored.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), dms.bytesStored.getLast());
+ {
+ BucketDatabase::Entry e(document::BucketId(16, 1), makeInfo(10));
+ metricUpdater.visit(e, 1);
+ }
+
+ metricUpdater.completeRound(false);
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+
+ CPPUNIT_ASSERT_EQUAL(true, metricUpdater.hasCompletedRound());
+
+ CPPUNIT_ASSERT_EQUAL(int64_t(11), dms.docsStored.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(12), dms.bytesStored.getLast());
+
+ {
+ BucketDatabase::Entry e(document::BucketId(16, 1), makeInfo(20));
+ metricUpdater.visit(e, 1);
+ }
+
+ metricUpdater.completeRound(false);
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+
+ CPPUNIT_ASSERT_EQUAL(int64_t(32), dms.docsStored.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(34), dms.bytesStored.getLast());
+}
+
+void
+BucketDBMetricUpdaterTest::testBucketsWithTooFewAndTooManyCopies()
+{
+ BucketDBMetricUpdater metricUpdater;
+ IdealStateMetricSet ims;
+ DistributorMetricSet dms(_loadTypes);
+
+ metricUpdater.completeRound();
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), ims.buckets_toofewcopies.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), ims.buckets_toomanycopies.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), ims.buckets.getLast());
+
+ // 1 copy too little
+ {
+ BucketDatabase::Entry e(document::BucketId(16, 1), makeInfo(10));
+ metricUpdater.visit(e, 2);
+ }
+ metricUpdater.completeRound(false);
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), ims.buckets_toofewcopies.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), ims.buckets_toomanycopies.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), ims.buckets.getLast());
+
+ // 1 copy too many
+ {
+ BucketDatabase::Entry e(document::BucketId(16, 1), makeInfo(40, 40));
+ metricUpdater.visit(e, 1);
+ }
+ metricUpdater.completeRound(false);
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), ims.buckets_toofewcopies.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), ims.buckets_toomanycopies.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(2), ims.buckets.getLast());
+
+ // Right amount of copies, just inc bucket counter.
+ {
+ BucketDatabase::Entry e(document::BucketId(16, 1), makeInfo(40, 40));
+ metricUpdater.visit(e, 2);
+ }
+ metricUpdater.completeRound(false);
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), ims.buckets_toofewcopies.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), ims.buckets_toomanycopies.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(3), ims.buckets.getLast());
+}
+
+void
+BucketDBMetricUpdaterTest::testBucketsWithVaryingTrustedness()
+{
+ BucketDBMetricUpdater metricUpdater;
+ IdealStateMetricSet ims;
+ DistributorMetricSet dms(_loadTypes);
+
+ metricUpdater.completeRound(false);
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), ims.buckets_notrusted.getLast());
+ // Has only trusted (implicit for first added)
+ {
+ BucketDatabase::Entry e(document::BucketId(16, 1), makeInfo(100));
+ metricUpdater.visit(e, 2);
+ }
+ metricUpdater.completeRound(false);
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), ims.buckets_notrusted.getLast());
+ // Has at least one trusted (implicit for first added)
+ {
+ BucketDatabase::Entry e(document::BucketId(16, 2), makeInfo(100, 200));
+ metricUpdater.visit(e, 2);
+ }
+ metricUpdater.completeRound(false);
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), ims.buckets_notrusted.getLast());
+ // Has no trusted
+ {
+ BucketInfo info(makeInfo(100, 200));
+ info.resetTrusted();
+ BucketDatabase::Entry e(document::BucketId(16, 3), info);
+ metricUpdater.visit(e, 2);
+ }
+ metricUpdater.completeRound(false);
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), ims.buckets_notrusted.getLast());
+}
+
+void
+BucketDBMetricUpdaterTest::testPickCountsFromTrustedCopy()
+{
+ BucketDBMetricUpdater metricUpdater;
+ IdealStateMetricSet ims;
+ DistributorMetricSet dms(_loadTypes);
+
+ // First copy added is implicitly trusted, but it is not the largest.
+ BucketDatabase::Entry e(document::BucketId(16, 2), makeInfo(100, 200));
+ metricUpdater.visit(e, 2);
+ metricUpdater.completeRound(false);
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+
+ CPPUNIT_ASSERT_EQUAL(int64_t(101), dms.docsStored.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(102), dms.bytesStored.getLast());
+}
+
+void
+BucketDBMetricUpdaterTest::testPickLargestCopyIfNoTrusted()
+{
+ BucketDBMetricUpdater metricUpdater;
+ IdealStateMetricSet ims;
+ DistributorMetricSet dms(_loadTypes);
+
+ // No trusted copies, so must pick second copy.
+ BucketInfo info(makeInfo(100, 200));
+ info.resetTrusted();
+ BucketDatabase::Entry e(document::BucketId(16, 2), info);
+ metricUpdater.visit(e, 2);
+ metricUpdater.completeRound(false);
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+
+ CPPUNIT_ASSERT_EQUAL(int64_t(201), dms.docsStored.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(202), dms.bytesStored.getLast());
+}
+
+void
+BucketDBMetricUpdaterTest::testCompleteRoundClearsWorkingState()
+{
+ BucketDBMetricUpdater metricUpdater;
+ IdealStateMetricSet ims;
+ DistributorMetricSet dms(_loadTypes);
+
+ {
+ BucketDatabase::Entry e(document::BucketId(16, 1), makeInfo(10));
+ metricUpdater.visit(e, 1);
+ }
+ metricUpdater.completeRound();
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+
+ CPPUNIT_ASSERT_EQUAL(int64_t(11), dms.docsStored.getLast());
+ // Completing the round again with no visits having been done will
+ // propagate an empty working state to the complete state.
+ metricUpdater.completeRound();
+ metricUpdater.getLastCompleteStats().propagateMetrics(ims, dms);
+
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), dms.docsStored.getLast());
+}
+
+// Replicas on nodes 0 and 1.
+void
+BucketDBMetricUpdaterTest::visitBucketWith2Copies1Trusted(
+ BucketDBMetricUpdater& metricUpdater)
+{
+ BucketInfo info;
+ addNode(info, 0, 100);
+ addNode(info, 1, 101); // Note different checksums => #trusted = 1
+ BucketDatabase::Entry e(document::BucketId(16, 1), info);
+ metricUpdater.visit(e, 2);
+}
+
+// Replicas on nodes 0 and 2.
+void
+BucketDBMetricUpdaterTest::visitBucketWith2CopiesBothTrusted(
+ BucketDBMetricUpdater& metricUpdater)
+{
+ BucketInfo info;
+ addNode(info, 0, 200);
+ addNode(info, 2, 200);
+ BucketDatabase::Entry e(document::BucketId(16, 2), info);
+ metricUpdater.visit(e, 2);
+}
+
+// Single replica on node 2.
+void
+BucketDBMetricUpdaterTest::visitBucketWith1Copy(
+ BucketDBMetricUpdater& metricUpdater)
+{
+ BucketInfo info;
+ addNode(info, 2, 100);
+ BucketDatabase::Entry e(document::BucketId(16, 1), info);
+ metricUpdater.visit(e, 2);
+}
+
+BucketDBMetricUpdaterTest::NodeToReplicasMap
+BucketDBMetricUpdaterTest::replicaStatsOf(BucketDBMetricUpdater& metricUpdater)
+{
+ metricUpdater.completeRound(true);
+ return metricUpdater.getLastCompleteStats()._minBucketReplica;
+}
+
+void BucketDBMetricUpdaterTest::testMinBucketReplicaTrackedAndReportedPerNode()
+{
+ BucketDBMetricUpdater metricUpdater;
+
+ // Node 0 and 1 should have min replica 1, while node 2 should have min
+ // replica 2.
+ visitBucketWith2Copies1Trusted(metricUpdater);
+ visitBucketWith2CopiesBothTrusted(metricUpdater);
+
+ CPPUNIT_ASSERT_EQUAL(NodeToReplicasMap({{0, 1}, {1, 1}, {2, 2}}),
+ replicaStatsOf(metricUpdater));
+}
+
+void
+BucketDBMetricUpdaterTest::nonTrustedReplicasAlsoCountedInModeAny()
+{
+ BucketDBMetricUpdater metricUpdater;
+ using CountingMode = BucketDBMetricUpdater::ReplicaCountingMode;
+ metricUpdater.setMinimumReplicaCountingMode(CountingMode::ANY);
+ visitBucketWith2Copies1Trusted(metricUpdater);
+ visitBucketWith2CopiesBothTrusted(metricUpdater);
+
+ CPPUNIT_ASSERT_EQUAL(NodeToReplicasMap({{0, 2}, {1, 2}, {2, 2}}),
+ replicaStatsOf(metricUpdater));
+}
+
+void
+BucketDBMetricUpdaterTest::minimumReplicaCountReturnedForNodeInModeAny()
+{
+ BucketDBMetricUpdater metricUpdater;
+ using CountingMode = BucketDBMetricUpdater::ReplicaCountingMode;
+ metricUpdater.setMinimumReplicaCountingMode(CountingMode::ANY);
+ visitBucketWith2CopiesBothTrusted(metricUpdater);
+ visitBucketWith1Copy(metricUpdater);
+
+ // Node 2 has a bucket with only 1 replica.
+ CPPUNIT_ASSERT_EQUAL(NodeToReplicasMap({{0, 2}, {2, 1}}),
+ replicaStatsOf(metricUpdater));
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/bucketdbupdatertest.cpp b/storage/src/tests/distributor/bucketdbupdatertest.cpp
new file mode 100644
index 00000000000..a1c933d2606
--- /dev/null
+++ b/storage/src/tests/distributor/bucketdbupdatertest.cpp
@@ -0,0 +1,2296 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storage/distributor/bucketdbupdater.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/vdslib/state/random.h>
+#include <vespa/vespalib/io/fileutil.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storage/distributor/pendingclusterstate.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/vespalib/text/stringtokenizer.h>
+#include <vespa/storage/storageutil/distributorstatecache.h>
+#include <tests/distributor/distributortestutil.h>
+#include <tests/distributor/messagesenderstub.h>
+#include <vespa/storage/distributor/simpleclusterinformation.h>
+
+#include <iostream>
+#include <fstream>
+#include <string>
+
+using namespace storage::api;
+using namespace storage::lib;
+
+namespace storage {
+namespace distributor {
+
+class BucketDBUpdaterTest : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+ CPPUNIT_TEST_SUITE(BucketDBUpdaterTest);
+ CPPUNIT_TEST(testNormalUsage); // Make sure that bucketdbupdater sends requests to nodes, send responses back for 3 nodes, check that bucketdb is in correct state
+ CPPUNIT_TEST(testDistributorChange);
+ CPPUNIT_TEST(testDistributorChangeWithGrouping);
+ CPPUNIT_TEST(testNormalUsageInitializing); // Check that we send request bucket info when storage node is initializing, and send another when it's up.
+ CPPUNIT_TEST(testFailedRequestBucketInfo);
+ CPPUNIT_TEST(testBitChange); // Check what happens when distribution bits change
+ CPPUNIT_TEST(testNodeDown);
+ CPPUNIT_TEST(testStorageNodeInMaintenanceClearsBucketsForNode);
+ CPPUNIT_TEST(testNodeDownCopiesGetInSync);
+ CPPUNIT_TEST(testDownWhileInit);
+ CPPUNIT_TEST(testInitializingWhileRecheck);
+ CPPUNIT_TEST(testRecheckNode);
+ CPPUNIT_TEST(testRecheckNodeWithFailure);
+ CPPUNIT_TEST(testNotifyBucketChange);
+ CPPUNIT_TEST(testNotifyBucketChangeFromNodeDown);
+ CPPUNIT_TEST(testNotifyChangeWithPendingStateQueuesBucketInfoRequests);
+ CPPUNIT_TEST(testMergeReply);
+ CPPUNIT_TEST(testMergeReplyNodeDown);
+ CPPUNIT_TEST(testMergeReplyNodeDownAfterRequestSent);
+ CPPUNIT_TEST(testFlush);
+ CPPUNIT_TEST(testPendingClusterStateSendMessages);
+ CPPUNIT_TEST(testPendingClusterStateReceive);
+ CPPUNIT_TEST(testPendingClusterStateMerge);
+ CPPUNIT_TEST(testPendingClusterStateMergeReplicaChanged);
+ CPPUNIT_TEST(testPendingClusterStateWithGroupDown);
+ CPPUNIT_TEST(testPendingClusterStateWithGroupDownAndNoHandover);
+ CPPUNIT_TEST(testNoDbResurrectionForBucketNotOwnedInCurrentState);
+ CPPUNIT_TEST(testNoDbResurrectionForBucketNotOwnedInPendingState);
+ CPPUNIT_TEST(testClusterStateAlwaysSendsFullFetchWhenDistributionChangePending);
+ CPPUNIT_TEST(testChangedDistributionConfigTriggersRecoveryMode);
+ CPPUNIT_TEST(testNewlyAddedBucketsHaveCurrentTimeAsGcTimestamp);
+ CPPUNIT_TEST(testNewerMutationsNotOverwrittenByEarlierBucketFetch);
+ CPPUNIT_TEST(preemptedDistrChangeCarriesNodeSetOverToNextStateFetch);
+ CPPUNIT_TEST(preemptedStorChangeCarriesNodeSetOverToNextStateFetch);
+ CPPUNIT_TEST(preemptedStorageNodeDownMustBeReFetched);
+ CPPUNIT_TEST(outdatedNodeSetClearedAfterSuccessfulStateCompletion);
+ CPPUNIT_TEST(doNotSendToPreemptedNodeNowInDownState);
+ CPPUNIT_TEST(doNotSendToPreemptedNodeNotPartOfNewState);
+ CPPUNIT_TEST_DISABLED(clusterConfigDownsizeOnlySendsToAvailableNodes);
+ CPPUNIT_TEST(changedDiskSetTriggersReFetch);
+ CPPUNIT_TEST(nodeMissingFromConfigIsTreatedAsNeedingOwnershipTransfer);
+ CPPUNIT_TEST_SUITE_END();
+
+protected:
+ void testNormalUsage();
+ void testDistributorChange();
+ void testDistributorChangeWithGrouping();
+ void testNormalUsageInitializing();
+ void testFailedRequestBucketInfo();
+ void testNoResponses();
+ void testBitChange();
+ void testInconsistentChecksum();
+ void testAddEmptyNode();
+ void testNodeDown();
+ void testStorageNodeInMaintenanceClearsBucketsForNode();
+ void testNodeDownCopiesGetInSync();
+ void testDownWhileInit();
+ void testInitializingWhileRecheck();
+ void testRecheckNode();
+ void testRecheckNodeWithFailure();
+ void testNotifyBucketChange();
+ void testNotifyBucketChangeFromNodeDown();
+ void testNotifyChangeWithPendingStateQueuesBucketInfoRequests();
+ void testMergeReply();
+ void testMergeReplyNodeDown();
+ void testMergeReplyNodeDownAfterRequestSent();
+ void testFlush();
+ void testPendingClusterStateSendMessages();
+ void testPendingClusterStateReceive();
+ void testPendingClusterStateMerge();
+ void testPendingClusterStateMergeReplicaChanged();
+ void testPendingClusterStateWithGroupDown();
+ void testPendingClusterStateWithGroupDownAndNoHandover();
+ void testNoDbResurrectionForBucketNotOwnedInCurrentState();
+ void testNoDbResurrectionForBucketNotOwnedInPendingState();
+ void testClusterStateAlwaysSendsFullFetchWhenDistributionChangePending();
+ void testChangedDistributionConfigTriggersRecoveryMode();
+ void testNewlyAddedBucketsHaveCurrentTimeAsGcTimestamp();
+ void testNewerMutationsNotOverwrittenByEarlierBucketFetch();
+ void preemptedDistrChangeCarriesNodeSetOverToNextStateFetch();
+ void preemptedStorChangeCarriesNodeSetOverToNextStateFetch();
+ void preemptedStorageNodeDownMustBeReFetched();
+ void outdatedNodeSetClearedAfterSuccessfulStateCompletion();
+ void doNotSendToPreemptedNodeNowInDownState();
+ void doNotSendToPreemptedNodeNotPartOfNewState();
+ void clusterConfigDownsizeOnlySendsToAvailableNodes();
+ void changedDiskSetTriggersReFetch();
+ void nodeMissingFromConfigIsTreatedAsNeedingOwnershipTransfer();
+
+ bool bucketExistsThatHasNode(int bucketCount, uint16_t node) const;
+
+ ClusterInformation::CSP createClusterInfo(const std::string& clusterState) {
+ ClusterInformation::CSP clusterInfo(
+ new SimpleClusterInformation(
+ getBucketDBUpdater().getDistributorComponent().getIndex(),
+ getBucketDBUpdater().getDistributorComponent().getDistribution(),
+ lib::ClusterState(clusterState),
+ "ui"));
+ return clusterInfo;
+ }
+
+public:
+ void setUp() {
+ createLinks();
+ };
+
+ void tearDown() {
+ close();
+ }
+
+ std::shared_ptr<RequestBucketInfoReply> getFakeBucketReply(
+ const lib::ClusterState& state,
+ RequestBucketInfoCommand& cmd,
+ int storageIndex,
+ int bucketCount,
+ int invalidBucketCount = 0)
+ {
+ RequestBucketInfoReply* sreply = new RequestBucketInfoReply(cmd);
+ sreply->setAddress(storageAddress(storageIndex));
+
+ api::RequestBucketInfoReply::EntryVector &vec = sreply->getBucketInfo();
+
+ for (int i=0; i<bucketCount + invalidBucketCount; i++) {
+ if (!getBucketDBUpdater().getDistributorComponent()
+ .ownsBucketInState(state, document::BucketId(16, i))) {
+ continue;
+ }
+
+ std::vector<uint16_t> nodes;
+ getBucketDBUpdater().getDistributorComponent()
+ .getDistribution().getIdealNodes(
+ lib::NodeType::STORAGE,
+ state,
+ document::BucketId(16, i),
+ nodes);
+
+ for (uint32_t j=0; j<nodes.size(); j++) {
+ if (nodes[j] == storageIndex) {
+ if (i >= bucketCount) {
+ vec.push_back(api::RequestBucketInfoReply::Entry(
+ document::BucketId(16, i),
+ api::BucketInfo()));
+ } else {
+ vec.push_back(api::RequestBucketInfoReply::Entry(
+ document::BucketId(16, i),
+ api::BucketInfo(10,1,1)));
+ }
+ }
+ }
+ }
+
+ return std::shared_ptr<api::RequestBucketInfoReply>(sreply);
+ }
+
+ void fakeBucketReply(
+ const lib::ClusterState& state,
+ RequestBucketInfoCommand& cmd,
+ int storageIndex,
+ int bucketCount,
+ int invalidBucketCount = 0)
+ {
+ getBucketDBUpdater().onRequestBucketInfoReply(
+ getFakeBucketReply(state,
+ cmd,
+ storageIndex,
+ bucketCount,
+ invalidBucketCount));
+ }
+
+ void sendFakeReplyForSingleBucketRequest(
+ const api::RequestBucketInfoCommand& rbi)
+ {
+ CPPUNIT_ASSERT_EQUAL(size_t(1), rbi.getBuckets().size());
+ const document::BucketId& bucket(rbi.getBuckets()[0]);
+
+ std::shared_ptr<api::RequestBucketInfoReply> reply(
+ new api::RequestBucketInfoReply(rbi));
+ reply->getBucketInfo().push_back(
+ api::RequestBucketInfoReply::Entry(bucket,
+ api::BucketInfo(20, 10, 12, 50, 60, true, true)));
+ getBucketDBUpdater().onRequestBucketInfoReply(reply);
+ }
+
+ std::string verifyBucket(document::BucketId id, const lib::ClusterState& state) {
+ BucketDatabase::Entry entry = getBucketDatabase().get(id);
+ if (!entry.valid()) {
+ return vespalib::make_string("%s doesn't exist in DB",
+ id.toString().c_str());
+ }
+
+ std::vector<uint16_t> nodes;
+ getBucketDBUpdater().getDistributorComponent().getDistribution().getIdealNodes(
+ lib::NodeType::STORAGE,
+ state,
+ document::BucketId(id),
+ nodes);
+
+ if (nodes.size() != entry->getNodeCount()) {
+ return vespalib::make_string("Bucket Id %s has %d nodes in "
+ "ideal state, but has only %d in DB",
+ id.toString().c_str(),
+ (int)nodes.size(),
+ (int)entry->getNodeCount());
+ }
+
+ for (uint32_t i = 0; i<nodes.size(); i++) {
+ bool found = false;
+
+ for (uint32_t j = 0; j<entry->getNodeCount(); j++) {
+ if (nodes[i] == entry->getNodeRef(j).getNode()) {
+ found = true;
+ }
+ }
+
+ if (!found) {
+ return vespalib::make_string(
+ "Bucket Id %s has no copy from node %d",
+ id.toString().c_str(),
+ nodes[i]);
+ }
+ }
+
+ return "";
+ }
+
+
+ void verifyInvalid(document::BucketId id, int storageNode) {
+ BucketDatabase::Entry entry = getBucketDatabase().get(id);
+
+ CPPUNIT_ASSERT(entry.valid());
+
+ bool found = false;
+ for (uint32_t j = 0; j<entry->getNodeCount(); j++) {
+ if (entry->getNodeRef(j).getNode() == storageNode) {
+ CPPUNIT_ASSERT(!entry->getNodeRef(j).valid());
+ found = true;
+ }
+ }
+
+ CPPUNIT_ASSERT(found);
+ }
+
+ struct OrderByIncreasingNodeIndex {
+ template <typename T>
+ bool operator()(const T& lhs, const T& rhs) {
+ return (lhs->getAddress()->getIndex()
+ < rhs->getAddress()->getIndex());
+ }
+ };
+
+ void sortSentMessagesByIndex(MessageSenderStub& sender,
+ size_t sortFromOffset = 0)
+ {
+ std::sort(sender.commands.begin() + sortFromOffset,
+ sender.commands.end(),
+ OrderByIncreasingNodeIndex());
+ }
+
+ void setSystemState(const lib::ClusterState& state) {
+ const size_t sizeBeforeState = _sender.commands.size();
+ getBucketDBUpdater().onSetSystemState(
+ std::shared_ptr<api::SetSystemStateCommand>(
+ new api::SetSystemStateCommand(state)));
+ // A lot of test logic has the assumption that all messages sent as a
+ // result of cluster state changes will be in increasing index order
+ // (for simplicity, not because this is required for correctness).
+ // Only sort the messages that arrived as a result of the state, don't
+ // jumble the sorting with any existing messages.
+ sortSentMessagesByIndex(_sender, sizeBeforeState);
+ }
+
+ void setAndEnableClusterState(const lib::ClusterState& state,
+ uint32_t expectedMsgs,
+ uint32_t nBuckets) {
+ _sender.clear();
+ setSystemState(state);
+ CPPUNIT_ASSERT_EQUAL(size_t(expectedMsgs), _sender.commands.size());
+
+ for (uint32_t i = 0; i < _sender.commands.size(); i++) {
+ CPPUNIT_ASSERT(_sender.commands[i]->getType() ==
+ MessageType::REQUESTBUCKETINFO);
+
+ const api::StorageMessageAddress& address(
+ *_sender.commands[i]->getAddress());
+ fakeBucketReply(
+ state,
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[i]),
+ address.getIndex(),
+ nBuckets);
+ }
+ }
+
+
+ void setStorageNodes(uint32_t numStorageNodes) {
+ _sender.clear();
+
+ lib::ClusterState newState(
+ vespalib::make_string("distributor:1 storage:%d", numStorageNodes));
+
+ setSystemState(newState);
+
+ for (uint32_t i=0; i<numStorageNodes; i++) {
+ CPPUNIT_ASSERT(_sender.commands[i]->getType() ==
+ MessageType::REQUESTBUCKETINFO);
+
+ const api::StorageMessageAddress *address = _sender.commands[i]->getAddress();
+ CPPUNIT_ASSERT_EQUAL(i, (uint32_t)address->getIndex());
+ }
+ }
+
+ void initializeNodesAndBuckets(uint32_t numStorageNodes,
+ uint32_t numBuckets)
+ {
+ setStorageNodes(numStorageNodes);
+
+ vespalib::string state(vespalib::make_string(
+ "distributor:1 storage:%d", numStorageNodes));
+ lib::ClusterState newState(state);
+
+ for (uint32_t i=0; i<numStorageNodes; i++) {
+ fakeBucketReply(newState,
+ *((RequestBucketInfoCommand*)_sender.commands[i].get()),
+ i,
+ numBuckets);
+ }
+ assertCorrectBuckets(numBuckets, state);
+ }
+
+ bool bucketHasNode(document::BucketId id, uint16_t node) const {
+ BucketDatabase::Entry entry = getBucket(id);
+ CPPUNIT_ASSERT(entry.valid());
+
+ for (uint32_t j=0; j<entry->getNodeCount(); j++) {
+ if (entry->getNodeRef(j).getNode() == node) {
+ return true;
+ }
+ }
+
+ return false;
+ }
+
+ api::StorageMessageAddress storageAddress(uint16_t node) {
+ return api::StorageMessageAddress("storage", lib::NodeType::STORAGE, node);
+ }
+
+ std::string getSentNodes(const std::string& oldClusterState,
+ const std::string& newClusterState);
+
+ std::string getSentNodesDistributionChanged(
+ const std::string& oldClusterState);
+
+ std::vector<uint16_t> getSentNodesWithPreemption(
+ const std::string& oldClusterState,
+ uint32_t expectedOldStateMessages,
+ const std::string& preemptedClusterState,
+ const std::string& newClusterState);
+
+ std::vector<uint16_t> getSendSet() const;
+
+ std::string mergeBucketLists(
+ const lib::ClusterState& oldState,
+ const std::string& existingData,
+ const lib::ClusterState& newState,
+ const std::string& newData,
+ bool includeBucketInfo = false);
+
+ std::string mergeBucketLists(
+ const std::string& existingData,
+ const std::string& newData,
+ bool includeBucketInfo = false);
+
+ void assertCorrectBuckets(int numBuckets, const std::string& stateStr) {
+ lib::ClusterState state(stateStr);
+ for (int i=0; i<numBuckets; i++) {
+ CPPUNIT_ASSERT_EQUAL(
+ getIdealStr(document::BucketId(16, i), state),
+ getNodes(document::BucketId(16, i)));
+ }
+ }
+
+ void setDistribution(const std::string& distConfig) {
+ lib::Distribution* distribution = new lib::Distribution(distConfig);
+ _node->getComponentRegister().setDistribution(
+ lib::Distribution::SP(distribution));
+ }
+
+ std::string getDistConfig6Nodes3Groups() const {
+ return ("redundancy 2\n"
+ "group[3]\n"
+ "group[0].name \"invalid\"\n"
+ "group[0].index \"invalid\"\n"
+ "group[0].partitions 1|*\n"
+ "group[0].nodes[0]\n"
+ "group[1].name rack0\n"
+ "group[1].index 0\n"
+ "group[1].nodes[3]\n"
+ "group[1].nodes[0].index 0\n"
+ "group[1].nodes[1].index 1\n"
+ "group[1].nodes[2].index 2\n"
+ "group[2].name rack1\n"
+ "group[2].index 1\n"
+ "group[2].nodes[3]\n"
+ "group[2].nodes[0].index 3\n"
+ "group[2].nodes[1].index 4\n"
+ "group[2].nodes[2].index 5\n");
+ }
+
+ std::string getDistConfig6Nodes4Groups() const {
+ return ("redundancy 2\n"
+ "group[4]\n"
+ "group[0].name \"invalid\"\n"
+ "group[0].index \"invalid\"\n"
+ "group[0].partitions 1|*\n"
+ "group[0].nodes[0]\n"
+ "group[1].name rack0\n"
+ "group[1].index 0\n"
+ "group[1].nodes[2]\n"
+ "group[1].nodes[0].index 0\n"
+ "group[1].nodes[1].index 1\n"
+ "group[2].name rack1\n"
+ "group[2].index 1\n"
+ "group[2].nodes[2]\n"
+ "group[2].nodes[0].index 2\n"
+ "group[2].nodes[1].index 3\n"
+ "group[3].name rack2\n"
+ "group[3].index 2\n"
+ "group[3].nodes[2]\n"
+ "group[3].nodes[0].index 4\n"
+ "group[3].nodes[1].index 5\n");
+ }
+
+ std::string getDistConfig3Nodes1Group() const {
+ return ("redundancy 2\n"
+ "group[2]\n"
+ "group[0].name \"invalid\"\n"
+ "group[0].index \"invalid\"\n"
+ "group[0].partitions 1|*\n"
+ "group[0].nodes[0]\n"
+ "group[1].name rack0\n"
+ "group[1].index 0\n"
+ "group[1].nodes[3]\n"
+ "group[1].nodes[0].index 0\n"
+ "group[1].nodes[1].index 1\n"
+ "group[1].nodes[2].index 2\n");
+ }
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(BucketDBUpdaterTest);
+
+void
+BucketDBUpdaterTest::testNormalUsage()
+{
+ setSystemState(lib::ClusterState("distributor:2 .0.s:i .1.s:i storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(size_t(3), _sender.commands.size());
+
+ // Ensure distribution hash is set correctly
+ CPPUNIT_ASSERT_EQUAL(
+ getBucketDBUpdater().getDistributorComponent().getDistribution()
+ .getNodeGraph().getDistributionConfigHash(),
+ dynamic_cast<const RequestBucketInfoCommand&>(
+ *_sender.commands[0]).getDistributionHash());
+
+ fakeBucketReply(
+ lib::ClusterState("distributor:2 .0.s:i .1.s:i storage:3"),
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[0]),
+ 0, 10);
+
+ _sender.clear();
+
+ // Optimization for not refetching unneeded data after cluster state
+ // change is only implemented after completion of previous cluster state
+ setSystemState(lib::ClusterState("distributor:2 .0.s:i storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(size_t(3), _sender.commands.size());
+ // Expect reply of first set SystemState request.
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.replies.size());
+
+ for (uint32_t i = 0; i < 3; ++i) {
+ fakeBucketReply(
+ lib::ClusterState("distributor:2 .0.s:i .1.s:i storage:3"),
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[i]),
+ i, 10);
+ }
+
+ assertCorrectBuckets(10, "distributor:2 storage:3");
+}
+
+void
+BucketDBUpdaterTest::testDistributorChange()
+{
+ int numBuckets = 100;
+
+ // First sends request
+ setSystemState(lib::ClusterState("distributor:2 .0.s:i .1.s:i storage:3"));
+ CPPUNIT_ASSERT_EQUAL(size_t(3), _sender.commands.size());
+ for (uint32_t i = 0; i < 3; ++i) {
+ fakeBucketReply(
+ lib::ClusterState("distributor:2 .0.s:i .1.s:i storage:3"),
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[i]),
+ i, numBuckets);
+ }
+ _sender.clear();
+
+ // No change from initializing to up (when done with last job)
+ setSystemState(lib::ClusterState("distributor:2 storage:3"));
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _sender.commands.size());
+ _sender.clear();
+
+ // Adding node. No new read requests, but buckets thrown
+ setSystemState(lib::ClusterState("distributor:3 storage:3"));
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _sender.commands.size());
+ assertCorrectBuckets(numBuckets, "distributor:3 storage:3");
+ _sender.clear();
+
+ // Removing distributor. Need to refetch new data from all nodes.
+ setSystemState(lib::ClusterState("distributor:2 storage:3"));
+ CPPUNIT_ASSERT_EQUAL(size_t(3), _sender.commands.size());
+ for (uint32_t i = 0; i < 3; ++i) {
+ fakeBucketReply(
+ lib::ClusterState("distributor:2 storage:3"),
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[i]),
+ i, numBuckets);
+ }
+ _sender.clear();
+ assertCorrectBuckets(numBuckets, "distributor:2 storage:3");
+}
+
+void
+BucketDBUpdaterTest::testDistributorChangeWithGrouping()
+{
+ std::string distConfig(getDistConfig6Nodes3Groups());
+ setDistribution(distConfig);
+ _distributor->enableNextDistribution();
+ int numBuckets = 100;
+
+ setSystemState(lib::ClusterState("distributor:6 storage:6"));
+ CPPUNIT_ASSERT_EQUAL(size_t(6), _sender.commands.size());
+ for (uint32_t i = 0; i < 6; ++i) {
+ fakeBucketReply(
+ lib::ClusterState("distributor:6 storage:6"),
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[i]),
+ i, numBuckets);
+ }
+ _sender.clear();
+
+ // Distributor going down in other group, no change
+ setSystemState(lib::ClusterState("distributor:6 .5.s:d storage:6"));
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _sender.commands.size());
+ _sender.clear();
+
+ setSystemState(lib::ClusterState("distributor:6 storage:6"));
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _sender.commands.size());
+ assertCorrectBuckets(numBuckets, "distributor:6 storage:6");
+ _sender.clear();
+
+ // Unchanged grouping cause no change.
+ setDistribution(distConfig);
+ _distributor->storageDistributionChanged();
+ _distributor->enableNextDistribution();
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _sender.commands.size());
+
+ // Changed grouping cause change
+ setDistribution(getDistConfig6Nodes4Groups());
+ _distributor->storageDistributionChanged();
+ _distributor->enableNextDistribution();
+
+ CPPUNIT_ASSERT_EQUAL(size_t(6), _sender.commands.size());
+}
+
+void
+BucketDBUpdaterTest::testNormalUsageInitializing()
+{
+ setSystemState(lib::ClusterState("distributor:1 .0.s:i storage:1 .0.s:i"));
+
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.commands.size());
+
+ // Not yet passing on system state.
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _senderDown.commands.size());
+
+ fakeBucketReply(lib::ClusterState("distributor:1 .0.s:i storage:1"),
+ *((RequestBucketInfoCommand*)_sender.commands[0].get()),
+ 0,
+ 10,
+ 10);
+
+ assertCorrectBuckets(10, "distributor:1 storage:1");
+
+ for (int i=10; i<20; i++) {
+ verifyInvalid(document::BucketId(16, i), 0);
+ }
+
+ // Pass on cluster state and recheck buckets now.
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _senderDown.commands.size());
+
+ _sender.clear();
+ _senderDown.clear();
+
+ setSystemState(lib::ClusterState("distributor:1 .0.s:i storage:1"));
+
+ // Send a new request bucket info up.
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.commands.size());
+
+ fakeBucketReply(lib::ClusterState("distributor:1 .0.s:i storage:1"),
+ *((RequestBucketInfoCommand*)_sender.commands[0].get()),
+ 0,
+ 20);
+
+ // Pass on cluster state and recheck buckets now.
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _senderDown.commands.size());
+
+ assertCorrectBuckets(20, "distributor:1 storage:1");
+}
+
+void
+BucketDBUpdaterTest::testFailedRequestBucketInfo()
+{
+ setSystemState(lib::ClusterState("distributor:1 .0.s:i storage:1"));
+
+ // 2 messages sent up: 1 to the nodes, and one reply to the setsystemstate.
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.commands.size());
+
+ {
+ std::shared_ptr<api::RequestBucketInfoReply> reply =
+ getFakeBucketReply(lib::ClusterState("distributor:1 .0.s:i storage:1"),
+ *((RequestBucketInfoCommand*)_sender.commands[0].get()),
+ 0,
+ 10);
+
+ reply->setResult(api::ReturnCode::NOT_CONNECTED);
+ getBucketDBUpdater().onRequestBucketInfoReply(reply);
+ // Trigger that delayed message is sent
+ getClock().addSecondsToTime(10);
+ getBucketDBUpdater().resendDelayedMessages();
+ }
+
+ // Should be resent.
+ CPPUNIT_ASSERT_EQUAL(std::string("Request bucket info,"
+ "Request bucket info"),
+ _sender.getCommands());
+
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _senderDown.commands.size());
+
+ fakeBucketReply(lib::ClusterState("distributor:1 .0.s:i storage:1"),
+ *((RequestBucketInfoCommand*)_sender.commands[1].get()),
+ 0,
+ 10);
+
+ for (int i=0; i<10; i++) {
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ verifyBucket(document::BucketId(16, i),
+ lib::ClusterState("distributor:1 storage:1")));
+ }
+
+ // Set system state should now be passed on
+ CPPUNIT_ASSERT_EQUAL(std::string("Set system state"),
+ _senderDown.getCommands());
+}
+
+void
+BucketDBUpdaterTest::testDownWhileInit()
+{
+ setStorageNodes(3);
+
+ fakeBucketReply(lib::ClusterState("distributor:1 storage:3"),
+ *((RequestBucketInfoCommand*)_sender.commands[0].get()),
+ 0,
+ 5);
+
+ setSystemState(lib::ClusterState("distributor:1 storage:3 .1.s:d"));
+
+ fakeBucketReply(lib::ClusterState("distributor:1 storage:3"),
+ *((RequestBucketInfoCommand*)_sender.commands[2].get()),
+ 2,
+ 5);
+
+ fakeBucketReply(lib::ClusterState("distributor:1 storage:3"),
+ *((RequestBucketInfoCommand*)_sender.commands[1].get()),
+ 1,
+ 5);
+}
+
+bool
+BucketDBUpdaterTest::bucketExistsThatHasNode(int bucketCount, uint16_t node) const
+{
+ for (int i=1; i<bucketCount; i++) {
+ if (bucketHasNode(document::BucketId(16, i), node)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void
+BucketDBUpdaterTest::testNodeDown()
+{
+ setStorageNodes(3);
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:3"));
+
+ for (int i=1; i<100; i++) {
+ addIdealNodes(document::BucketId(16, i));
+ }
+
+ CPPUNIT_ASSERT(bucketExistsThatHasNode(100, 1));
+
+ setSystemState(lib::ClusterState("distributor:1 storage:3 .1.s:d"));
+
+ CPPUNIT_ASSERT(!bucketExistsThatHasNode(100, 1));
+}
+
+void
+BucketDBUpdaterTest::testStorageNodeInMaintenanceClearsBucketsForNode()
+{
+ setStorageNodes(3);
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:3"));
+
+ for (int i=1; i<100; i++) {
+ addIdealNodes(document::BucketId(16, i));
+ }
+
+ CPPUNIT_ASSERT(bucketExistsThatHasNode(100, 1));
+
+ setSystemState(lib::ClusterState("distributor:1 storage:3 .1.s:m"));
+
+ CPPUNIT_ASSERT(!bucketExistsThatHasNode(100, 1));
+}
+
+void
+BucketDBUpdaterTest::testNodeDownCopiesGetInSync()
+{
+ setStorageNodes(3);
+
+ lib::ClusterState systemState("distributor:1 storage:3");
+ document::BucketId bid(16, 1);
+
+ addNodesToBucketDB(bid, "0=3,1=2,2=3");
+
+ setSystemState(lib::ClusterState("distributor:1 storage:3 .1.s:d"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001) : "
+ "node(idx=0,crc=0x3,docs=3/3,bytes=3/3,trusted=true,active=false), "
+ "node(idx=2,crc=0x3,docs=3/3,bytes=3/3,trusted=true,active=false)"),
+ dumpBucket(bid));
+}
+
+void
+BucketDBUpdaterTest::testInitializingWhileRecheck()
+{
+ lib::ClusterState systemState("distributor:1 storage:2 .0.s:i .0.i:0.1");
+ setSystemState(systemState);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(2), _sender.commands.size());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _senderDown.commands.size());
+
+ getBucketDBUpdater().recheckBucketInfo(1, document::BucketId(16, 3));
+
+ for (int i=0; i<2; i++) {
+ fakeBucketReply(systemState,
+ *((RequestBucketInfoCommand*)_sender.commands[i].get()),
+ i,
+ 100);
+ }
+
+ // Now we can pass on system state.
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _senderDown.commands.size());
+
+ CPPUNIT_ASSERT_EQUAL(MessageType::SETSYSTEMSTATE,
+ _senderDown.commands[0]->getType());
+}
+
+void
+BucketDBUpdaterTest::testBitChange()
+{
+
+ std::vector<document::BucketId> bucketlist;
+
+ {
+ setSystemState(lib::ClusterState("bits:14 storage:1 distributor:2"));
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)_sender.commands.size());
+
+ CPPUNIT_ASSERT(_sender.commands[0]->getType() == MessageType::REQUESTBUCKETINFO);
+
+ RequestBucketInfoReply* sreply =
+ new RequestBucketInfoReply(*((RequestBucketInfoCommand*)_sender.commands[0].get()));
+ sreply->setAddress(storageAddress(0));
+ api::RequestBucketInfoReply::EntryVector &vec = sreply->getBucketInfo();
+
+
+ int cnt=0;
+ for (int i=0; cnt < 2; i++) {
+ lib::Distribution distribution = getBucketDBUpdater().getDistributorComponent()
+ .getDistribution();
+ std::vector<uint16_t> distributors;
+ if (distribution.getIdealDistributorNode(
+ lib::ClusterState("redundancy:1 bits:14 storage:1 distributor:2"),
+ document::BucketId(16, i))
+ == 0)
+ {
+ vec.push_back(api::RequestBucketInfoReply::Entry(
+ document::BucketId(16, i),
+ api::BucketInfo(10,1,1)));
+
+ bucketlist.push_back(document::BucketId(16, i));
+ cnt++;
+ }
+ }
+
+ getBucketDBUpdater().onRequestBucketInfoReply(std::shared_ptr<RequestBucketInfoReply>(sreply));
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001) : "
+ "node(idx=0,crc=0xa,docs=1/1,bytes=1/1,trusted=true,active=false)"),
+ dumpBucket(bucketlist[0]));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000002) : "
+ "node(idx=0,crc=0xa,docs=1/1,bytes=1/1,trusted=true,active=false)"),
+ dumpBucket(bucketlist[1]));
+
+ {
+ _sender.clear();
+ setSystemState(lib::ClusterState("bits:16 storage:1 distributor:2"));
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)_sender.commands.size());
+
+ CPPUNIT_ASSERT(_sender.commands[0]->getType() == MessageType::REQUESTBUCKETINFO);
+
+ RequestBucketInfoReply* sreply =
+ new RequestBucketInfoReply(
+ *((RequestBucketInfoCommand*)_sender.commands[0].get()));
+ sreply->setAddress(storageAddress(0));
+ sreply->setResult(api::ReturnCode::OK);
+ api::RequestBucketInfoReply::EntryVector &vec = sreply->getBucketInfo();
+
+ for (uint32_t i = 0; i < 3; ++i) {
+ vec.push_back(api::RequestBucketInfoReply::Entry(
+ document::BucketId(16, i),
+ api::BucketInfo(10,1,1)));
+ }
+
+ vec.push_back(api::RequestBucketInfoReply::Entry(
+ document::BucketId(16, 4),
+ api::BucketInfo(10,1,1)));
+
+ getBucketDBUpdater().onRequestBucketInfoReply(
+ std::shared_ptr<RequestBucketInfoReply>(sreply));
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000000) : "
+ "node(idx=0,crc=0xa,docs=1/1,bytes=1/1,trusted=true,active=false)"),
+ dumpBucket(document::BucketId(16, 0)));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001) : "
+ "node(idx=0,crc=0xa,docs=1/1,bytes=1/1,trusted=true,active=false)"),
+ dumpBucket(document::BucketId(16, 1)));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000002) : "
+ "node(idx=0,crc=0xa,docs=1/1,bytes=1/1,trusted=true,active=false)"),
+ dumpBucket(document::BucketId(16, 2)));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000004) : "
+ "node(idx=0,crc=0xa,docs=1/1,bytes=1/1,trusted=true,active=false)"),
+ dumpBucket(document::BucketId(16, 4)));
+
+ {
+ _sender.clear();
+ setSystemState(lib::ClusterState("storage:1 distributor:2 .1.s:i"));
+ }
+
+ {
+ _sender.clear();
+ setSystemState(lib::ClusterState("storage:1 distributor:2"));
+ }
+};
+
+void
+BucketDBUpdaterTest::testRecheckNodeWithFailure()
+{
+ initializeNodesAndBuckets(3, 5);
+
+ _sender.clear();
+
+ getBucketDBUpdater().recheckBucketInfo(1, document::BucketId(16, 3));
+
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.commands.size());
+
+
+ uint16_t index = 0;
+ {
+ api::RequestBucketInfoCommand& rbi(
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[0]));
+ CPPUNIT_ASSERT_EQUAL(size_t(1), rbi.getBuckets().size());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 3), rbi.getBuckets()[0]);
+ auto reply(std::make_shared<api::RequestBucketInfoReply>(rbi));
+
+ const api::StorageMessageAddress *address = _sender.commands[0]->getAddress();
+ index = address->getIndex();
+
+ reply->setResult(api::ReturnCode::NOT_CONNECTED);
+ getBucketDBUpdater().onRequestBucketInfoReply(reply);
+ // Trigger that delayed message is sent
+ getClock().addSecondsToTime(10);
+ getBucketDBUpdater().resendDelayedMessages();
+ }
+
+ CPPUNIT_ASSERT_EQUAL(size_t(2), _sender.commands.size());
+
+ setSystemState(
+ lib::ClusterState(vespalib::make_string("distributor:1 storage:3 .%d.s:d", index)));
+
+ // Recheck bucket.
+ {
+ api::RequestBucketInfoCommand& rbi(dynamic_cast<RequestBucketInfoCommand&>
+ (*_sender.commands[1]));
+ CPPUNIT_ASSERT_EQUAL(size_t(1), rbi.getBuckets().size());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 3), rbi.getBuckets()[0]);
+ auto reply(std::make_shared<api::RequestBucketInfoReply>(rbi));
+ reply->setResult(api::ReturnCode::NOT_CONNECTED);
+ getBucketDBUpdater().onRequestBucketInfoReply(reply);
+ }
+
+ // Should not retry since node is down.
+ CPPUNIT_ASSERT_EQUAL(size_t(2), _sender.commands.size());
+}
+
+void
+BucketDBUpdaterTest::testRecheckNode()
+{
+ initializeNodesAndBuckets(3, 5);
+
+ _sender.clear();
+
+ getBucketDBUpdater().recheckBucketInfo(1, document::BucketId(16, 3));
+
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.commands.size());
+
+ api::RequestBucketInfoCommand& rbi(
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[0]));
+ CPPUNIT_ASSERT_EQUAL(size_t(1), rbi.getBuckets().size());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 3), rbi.getBuckets()[0]);
+
+ auto reply(std::make_shared<api::RequestBucketInfoReply>(rbi));
+ reply->getBucketInfo().push_back(
+ api::RequestBucketInfoReply::Entry(document::BucketId(16, 3),
+ api::BucketInfo(20, 10, 12, 50, 60, true, true)));
+ getBucketDBUpdater().onRequestBucketInfoReply(reply);
+
+ lib::ClusterState state("distributor:1 storage:3");
+ for (uint32_t i = 0; i < 3; i++) {
+ CPPUNIT_ASSERT_EQUAL(
+ getIdealStr(document::BucketId(16, i), state),
+ getNodes(document::BucketId(16, i)));
+ }
+
+ for (uint32_t i = 4; i < 5; i++) {
+ CPPUNIT_ASSERT_EQUAL(
+ getIdealStr(document::BucketId(16, i), state),
+ getNodes(document::BucketId(16, i)));
+ }
+
+ BucketDatabase::Entry entry = getBucketDatabase().get(document::BucketId(16, 3));
+ CPPUNIT_ASSERT(entry.valid());
+
+ const BucketCopy* copy = entry->getNode(1);
+ CPPUNIT_ASSERT(copy != 0);
+ CPPUNIT_ASSERT_EQUAL(api::BucketInfo(20,10,12, 50, 60, true, true),
+ copy->getBucketInfo());
+}
+
+void
+BucketDBUpdaterTest::testNotifyBucketChange()
+{
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:1"));
+
+ addNodesToBucketDB(document::BucketId(16, 1), "0=1234");
+ _sender.replies.clear();
+
+ {
+ api::BucketInfo info(1, 2, 3, 4, 5, true, true);
+ auto cmd(std::make_shared<api::NotifyBucketChangeCommand>(
+ document::BucketId(16, 1), info));
+ cmd->setSourceIndex(0);
+ getBucketDBUpdater().onNotifyBucketChange(cmd);
+ }
+
+ {
+ api::BucketInfo info(10, 11, 12, 13, 14, false, false);
+ auto cmd(std::make_shared<api::NotifyBucketChangeCommand>(
+ document::BucketId(16, 2), info));
+ cmd->setSourceIndex(0);
+ getBucketDBUpdater().onNotifyBucketChange(cmd);
+ }
+
+ // Must receive reply
+ CPPUNIT_ASSERT_EQUAL(size_t(2), _sender.replies.size());
+
+ for (int i = 0; i < 2; ++i) {
+ CPPUNIT_ASSERT_EQUAL(MessageType::NOTIFYBUCKETCHANGE_REPLY,
+ _sender.replies[i]->getType());
+ }
+
+ // No database update until request bucket info replies have been received.
+ CPPUNIT_ASSERT_EQUAL(std::string("BucketId(0x4000000000000001) : "
+ "node(idx=0,crc=0x4d2,docs=1234/1234,bytes=1234/1234,"
+ "trusted=false,active=false)"),
+ dumpBucket(document::BucketId(16, 1)));
+ CPPUNIT_ASSERT_EQUAL(std::string("NONEXISTING"),
+ dumpBucket(document::BucketId(16, 2)));
+
+ CPPUNIT_ASSERT_EQUAL(size_t(2), _sender.commands.size());
+
+ std::vector<api::BucketInfo> infos;
+ infos.push_back(api::BucketInfo(4567, 200, 2000, 400, 4000, true, true));
+ infos.push_back(api::BucketInfo(8999, 300, 3000, 500, 5000, false, false));
+
+ for (int i = 0; i < 2; ++i) {
+ api::RequestBucketInfoCommand& rbi(
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[i]));
+ CPPUNIT_ASSERT_EQUAL(size_t(1), rbi.getBuckets().size());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, i + 1), rbi.getBuckets()[0]);
+
+ auto reply(std::make_shared<api::RequestBucketInfoReply>(rbi));
+ reply->getBucketInfo().push_back(
+ api::RequestBucketInfoReply::Entry(document::BucketId(16, i + 1),
+ infos[i]));
+ getBucketDBUpdater().onRequestBucketInfoReply(reply);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001) : "
+ "node(idx=0,crc=0x11d7,docs=200/400,bytes=2000/4000,trusted=true,active=true)"),
+ dumpBucket(document::BucketId(16, 1)));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000002) : "
+ "node(idx=0,crc=0x2327,docs=300/500,bytes=3000/5000,trusted=true,active=false)"),
+ dumpBucket(document::BucketId(16, 2)));
+
+}
+
+void
+BucketDBUpdaterTest::testNotifyBucketChangeFromNodeDown()
+{
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:2"));
+
+ addNodesToBucketDB(document::BucketId(16, 1), "1=1234");
+
+ _sender.replies.clear();
+
+ {
+ api::BucketInfo info(8999, 300, 3000, 500, 5000, false, false);
+ auto cmd(std::make_shared<api::NotifyBucketChangeCommand>(
+ document::BucketId(16, 1), info));
+ cmd->setSourceIndex(0);
+ getBucketDBUpdater().onNotifyBucketChange(cmd);
+ }
+ // Enable here to avoid having request bucket info be silently swallowed
+ // (sendRequestBucketInfo drops message if node is down).
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:2 .0.s:d"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001) : "
+ "node(idx=1,crc=0x4d2,docs=1234/1234,bytes=1234/1234,trusted=false,active=false)"),
+ dumpBucket(document::BucketId(16, 1)));
+
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.replies.size());
+ CPPUNIT_ASSERT_EQUAL(MessageType::NOTIFYBUCKETCHANGE_REPLY,
+ _sender.replies[0]->getType());
+
+ // Currently, this pending operation will be auto-flushed when the cluster state
+ // changes so the behavior is still correct. Keep this test around to prevent
+ // regressions here.
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.commands.size());
+ api::RequestBucketInfoCommand& rbi(
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[0]));
+ CPPUNIT_ASSERT_EQUAL(size_t(1), rbi.getBuckets().size());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 1), rbi.getBuckets()[0]);
+
+ auto reply(std::make_shared<api::RequestBucketInfoReply>(rbi));
+ reply->getBucketInfo().push_back(
+ api::RequestBucketInfoReply::Entry(
+ document::BucketId(16, 1),
+ api::BucketInfo(8999, 300, 3000, 500, 5000, false, false)));
+ getBucketDBUpdater().onRequestBucketInfoReply(reply);
+
+ // No change
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001) : "
+ "node(idx=1,crc=0x4d2,docs=1234/1234,bytes=1234/1234,trusted=false,active=false)"),
+ dumpBucket(document::BucketId(16, 1)));
+}
+
+/**
+ * Test that NotifyBucketChange received while there's a pending cluster state
+ * waits until the cluster state has been enabled as current before it sends off
+ * the single bucket info requests. This is to prevent a race condition where
+ * the replies to bucket info requests for buckets that would be owned by the
+ * distributor in the pending state but not by the current state would be
+ * discarded when attempted inserted into the bucket database.
+ */
+void
+BucketDBUpdaterTest::testNotifyChangeWithPendingStateQueuesBucketInfoRequests()
+{
+ setSystemState(lib::ClusterState("distributor:1 storage:1"));
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.commands.size());
+
+ {
+ api::BucketInfo info(8999, 300, 3000, 500, 5000, false, false);
+ auto cmd(std::make_shared<api::NotifyBucketChangeCommand>(
+ document::BucketId(16, 1), info));
+ cmd->setSourceIndex(0);
+ getBucketDBUpdater().onNotifyBucketChange(cmd);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.commands.size());
+
+ fakeBucketReply(
+ lib::ClusterState("distributor:1 storage:1"),
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[0]),
+ 0, 10);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(2), _sender.commands.size());
+
+ {
+ api::RequestBucketInfoCommand& rbi(
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[1]));
+ CPPUNIT_ASSERT_EQUAL(size_t(1), rbi.getBuckets().size());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 1), rbi.getBuckets()[0]);
+ }
+ _sender.clear();
+
+ // Queue must be cleared once pending state is enabled.
+ {
+ lib::ClusterState state("distributor:1 storage:2");
+ uint32_t expectedMsgs = 1, dummyBucketsToReturn = 1;
+ setAndEnableClusterState(state, expectedMsgs, dummyBucketsToReturn);
+ }
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.commands.size());
+ {
+ api::RequestBucketInfoCommand& rbi(
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[0]));
+ CPPUNIT_ASSERT_EQUAL(size_t(0), rbi.getBuckets().size());
+ }
+}
+
+void
+BucketDBUpdaterTest::testMergeReply()
+{
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:3"));
+
+ addNodesToBucketDB(document::BucketId(16, 1234),
+ "0=1234,1=1234,2=1234");
+
+ std::vector<api::MergeBucketCommand::Node> nodes;
+ nodes.push_back(api::MergeBucketCommand::Node(0));
+ nodes.push_back(api::MergeBucketCommand::Node(1));
+ nodes.push_back(api::MergeBucketCommand::Node(2));
+
+ api::MergeBucketCommand cmd(document::BucketId(16, 1234), nodes, 0);
+
+ auto reply(std::make_shared<api::MergeBucketReply>(cmd));
+
+ _sender.clear();
+ getBucketDBUpdater().onMergeBucketReply(reply);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(3), _sender.commands.size());
+
+ for (uint32_t i = 0; i < 3; i++) {
+ std::shared_ptr<api::RequestBucketInfoCommand>
+ req(std::dynamic_pointer_cast<api::RequestBucketInfoCommand>(
+ _sender.commands[i]));
+
+ CPPUNIT_ASSERT(req.get());
+ CPPUNIT_ASSERT_EQUAL(size_t(1), req->getBuckets().size());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 1234), req->getBuckets()[0]);
+
+ auto reqreply(std::make_shared<api::RequestBucketInfoReply>(*req));
+ reqreply->getBucketInfo().push_back(
+ api::RequestBucketInfoReply::Entry(document::BucketId(16, 1234),
+ api::BucketInfo(10 * (i + 1), 100 * (i +1), 1000 * (i+1))));
+
+ getBucketDBUpdater().onRequestBucketInfoReply(reqreply);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x40000000000004d2) : "
+ "node(idx=0,crc=0xa,docs=100/100,bytes=1000/1000,trusted=false,active=false), "
+ "node(idx=1,crc=0x14,docs=200/200,bytes=2000/2000,trusted=false,active=false), "
+ "node(idx=2,crc=0x1e,docs=300/300,bytes=3000/3000,trusted=false,active=false)"),
+ dumpBucket(document::BucketId(16, 1234)));
+};
+
+void
+BucketDBUpdaterTest::testMergeReplyNodeDown()
+{
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:3"));
+ std::vector<api::MergeBucketCommand::Node> nodes;
+
+ addNodesToBucketDB(document::BucketId(16, 1234), "0=1234,1=1234,2=1234");
+
+ for (uint32_t i = 0; i < 3; ++i) {
+ nodes.push_back(api::MergeBucketCommand::Node(i));
+ }
+
+ api::MergeBucketCommand cmd(document::BucketId(16, 1234), nodes, 0);
+
+ auto reply(std::make_shared<api::MergeBucketReply>(cmd));
+
+ setSystemState(lib::ClusterState("distributor:1 storage:2"));
+
+ _sender.clear();
+ getBucketDBUpdater().onMergeBucketReply(reply);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(2), _sender.commands.size());
+
+ for (uint32_t i = 0; i < 2; i++) {
+ std::shared_ptr<api::RequestBucketInfoCommand> req(
+ std::dynamic_pointer_cast<api::RequestBucketInfoCommand>(
+ _sender.commands[i]));
+
+ CPPUNIT_ASSERT(req.get());
+ CPPUNIT_ASSERT_EQUAL(size_t(1), req->getBuckets().size());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 1234), req->getBuckets()[0]);
+
+ auto reqreply(std::make_shared<api::RequestBucketInfoReply>(*req));
+ reqreply->getBucketInfo().push_back(
+ api::RequestBucketInfoReply::Entry(
+ document::BucketId(16, 1234),
+ api::BucketInfo(10 * (i + 1), 100 * (i +1), 1000 * (i+1))));
+ getBucketDBUpdater().onRequestBucketInfoReply(reqreply);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x40000000000004d2) : "
+ "node(idx=0,crc=0xa,docs=100/100,bytes=1000/1000,trusted=false,active=false), "
+ "node(idx=1,crc=0x14,docs=200/200,bytes=2000/2000,trusted=false,active=false)"),
+ dumpBucket(document::BucketId(16, 1234)));
+};
+
+void
+BucketDBUpdaterTest::testMergeReplyNodeDownAfterRequestSent()
+{
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:3"));
+ std::vector<api::MergeBucketCommand::Node> nodes;
+
+ addNodesToBucketDB(document::BucketId(16, 1234), "0=1234,1=1234,2=1234");
+
+ for (uint32_t i = 0; i < 3; ++i) {
+ nodes.push_back(api::MergeBucketCommand::Node(i));
+ }
+
+ api::MergeBucketCommand cmd(document::BucketId(16, 1234), nodes, 0);
+
+ auto reply(std::make_shared<api::MergeBucketReply>(cmd));
+
+ _sender.clear();
+ getBucketDBUpdater().onMergeBucketReply(reply);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(3), _sender.commands.size());
+
+ setSystemState(lib::ClusterState("distributor:1 storage:2"));
+
+ for (uint32_t i = 0; i < 3; i++) {
+ std::shared_ptr<api::RequestBucketInfoCommand> req(
+ std::dynamic_pointer_cast<api::RequestBucketInfoCommand>(
+ _sender.commands[i]));
+
+ CPPUNIT_ASSERT(req.get());
+ CPPUNIT_ASSERT_EQUAL(size_t(1), req->getBuckets().size());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 1234), req->getBuckets()[0]);
+
+ auto reqreply(std::make_shared<api::RequestBucketInfoReply>(*req));
+ reqreply->getBucketInfo().push_back(
+ api::RequestBucketInfoReply::Entry(
+ document::BucketId(16, 1234),
+ api::BucketInfo(10 * (i + 1), 100 * (i +1), 1000 * (i+1))));
+ getBucketDBUpdater().onRequestBucketInfoReply(reqreply);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x40000000000004d2) : "
+ "node(idx=0,crc=0xa,docs=100/100,bytes=1000/1000,trusted=false,active=false), "
+ "node(idx=1,crc=0x14,docs=200/200,bytes=2000/2000,trusted=false,active=false)"),
+ dumpBucket(document::BucketId(16, 1234)));
+};
+
+
+void
+BucketDBUpdaterTest::testFlush()
+{
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:3"));
+ _sender.clear();
+
+ addNodesToBucketDB(document::BucketId(16, 1234), "0=1234,1=1234,2=1234");
+
+ std::vector<api::MergeBucketCommand::Node> nodes;
+ for (uint32_t i = 0; i < 3; ++i) {
+ nodes.push_back(api::MergeBucketCommand::Node(i));
+ }
+
+ api::MergeBucketCommand cmd(document::BucketId(16, 1234),
+ nodes,
+ 0);
+
+ auto reply(std::make_shared<api::MergeBucketReply>(cmd));
+
+ _sender.clear();
+ getBucketDBUpdater().onMergeBucketReply(reply);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(3), _sender.commands.size());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _senderDown.replies.size());
+
+ getBucketDBUpdater().flush();
+ // Flushing should drop all merge bucket replies
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _senderDown.commands.size());
+}
+
+std::string
+BucketDBUpdaterTest::getSentNodes(
+ const std::string& oldClusterState,
+ const std::string& newClusterState)
+{
+ MessageSenderStub sender;
+
+ std::shared_ptr<api::SetSystemStateCommand> cmd(
+ new api::SetSystemStateCommand(
+ lib::ClusterState(newClusterState)));
+
+ framework::defaultimplementation::FakeClock clock;
+ ClusterInformation::CSP clusterInfo(createClusterInfo(oldClusterState));
+
+ std::unordered_set<uint16_t> outdatedNodes;
+ std::unique_ptr<PendingClusterState> state(
+ PendingClusterState::createForClusterStateChange(
+ clock, clusterInfo, sender, cmd, outdatedNodes,
+ api::Timestamp(1)));
+
+ sortSentMessagesByIndex(sender);
+
+ std::ostringstream ost;
+ for (uint32_t i = 0; i < sender.commands.size(); i++) {
+ RequestBucketInfoCommand* req =
+ dynamic_cast<RequestBucketInfoCommand*>(sender.commands[i].get());
+
+ if (i > 0) {
+ ost << ",";
+ }
+
+ ost << req->getAddress()->getIndex();
+ }
+
+ return ost.str();
+}
+
+std::string
+BucketDBUpdaterTest::getSentNodesDistributionChanged(
+ const std::string& oldClusterState)
+{
+ MessageSenderStub sender;
+
+ framework::defaultimplementation::FakeClock clock;
+ ClusterInformation::CSP clusterInfo(createClusterInfo(oldClusterState));
+ std::unique_ptr<PendingClusterState> state(
+ PendingClusterState::createForDistributionChange(
+ clock, clusterInfo, sender, api::Timestamp(1)));
+
+ sortSentMessagesByIndex(sender);
+
+ std::ostringstream ost;
+ for (uint32_t i = 0; i < sender.commands.size(); i++) {
+ RequestBucketInfoCommand* req =
+ dynamic_cast<RequestBucketInfoCommand*>(sender.commands[i].get());
+
+ if (i > 0) {
+ ost << ",";
+ }
+
+ ost << req->getAddress()->getIndex();
+ }
+
+ return ost.str();
+}
+
+void
+BucketDBUpdaterTest::testPendingClusterStateSendMessages()
+{
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,1,2"),
+ getSentNodes("cluster:d",
+ "distributor:1 storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,1"),
+ getSentNodes("cluster:d",
+ "distributor:1 storage:3 .2.s:m"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("2"),
+ getSentNodes("distributor:1 storage:2",
+ "distributor:1 storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("2,3,4,5"),
+ getSentNodes("distributor:1 storage:2",
+ "distributor:1 storage:6"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,1,2"),
+ getSentNodes("distributor:4 storage:3",
+ "distributor:3 storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,1,2,3"),
+ getSentNodes("distributor:4 storage:3",
+ "distributor:4 .2.s:d storage:4"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ getSentNodes("distributor:4 storage:3",
+ "distributor:4 .0.s:d storage:4"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ getSentNodes("distributor:3 storage:3",
+ "distributor:4 storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("2"),
+ getSentNodes("distributor:3 storage:3 .2.s:i",
+ "distributor:3 storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("1"),
+ getSentNodes("distributor:3 storage:3 .1.s:d",
+ "distributor:3 storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("1,2,4"),
+ getSentNodes("distributor:3 storage:4 .1.s:d .2.s:i",
+ "distributor:3 storage:5"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ getSentNodes("distributor:1 storage:3",
+ "cluster:d"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ getSentNodes("distributor:1 storage:3",
+ "distributor:1 storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ getSentNodes("distributor:1 storage:3",
+ "cluster:d distributor:1 storage:6"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ getSentNodes("distributor:3 storage:3",
+ "distributor:3 .2.s:m storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,1,2"),
+ getSentNodes("distributor:3 .2.s:m storage:3",
+ "distributor:3 .2.s:d storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ getSentNodes("distributor:3 .2.s:m storage:3",
+ "distributor:3 storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,1,2"),
+ getSentNodesDistributionChanged("distributor:3 storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,1"),
+ getSentNodes("distributor:10 storage:2",
+ "distributor:10 .1.s:d storage:2"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("1"),
+ getSentNodes("distributor:2 storage:2",
+ "distributor:2 storage:2 .1.d:3 .1.d.1.s:d"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("1"),
+ getSentNodes("distributor:2 storage:2 .1.s:d",
+ "distributor:2 storage:2 .1.d:3 .1.d.1.s:d"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ getSentNodes("distributor:2 storage:2",
+ "distributor:3 .2.s:i storage:2"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,1,2"),
+ getSentNodes("distributor:3 storage:3",
+ "distributor:3 .2.s:s storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ getSentNodes("distributor:3 .2.s:s storage:3",
+ "distributor:3 .2.s:d storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("1"),
+ getSentNodes("distributor:3 storage:3 .1.s:m",
+ "distributor:3 storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ getSentNodes("distributor:3 storage:3",
+ "distributor:3 storage:3 .1.s:m"));
+};
+
+void
+BucketDBUpdaterTest::testPendingClusterStateReceive()
+{
+ MessageSenderStub sender;
+
+ auto cmd(std::make_shared<api::SetSystemStateCommand>(
+ lib::ClusterState("distributor:1 storage:3")));
+
+ framework::defaultimplementation::FakeClock clock;
+ ClusterInformation::CSP clusterInfo(createClusterInfo("cluster:d"));
+ std::unordered_set<uint16_t> outdatedNodes;
+ std::unique_ptr<PendingClusterState> state(
+ PendingClusterState::createForClusterStateChange(
+ clock, clusterInfo, sender, cmd, outdatedNodes,
+ api::Timestamp(1)));
+
+ CPPUNIT_ASSERT_EQUAL(3, (int)sender.commands.size());
+
+ sortSentMessagesByIndex(sender);
+
+ std::ostringstream ost;
+ for (uint32_t i = 0; i < sender.commands.size(); i++) {
+ RequestBucketInfoCommand* req =
+ dynamic_cast<RequestBucketInfoCommand*>(sender.commands[i].get());
+
+ RequestBucketInfoReply* rep =
+ new RequestBucketInfoReply(*req);
+
+ rep->getBucketInfo().push_back(
+ RequestBucketInfoReply::Entry(
+ document::BucketId(16, i),
+ api::BucketInfo(i, i, i, i, i)));
+
+ CPPUNIT_ASSERT(
+ state->onRequestBucketInfoReply(
+ std::shared_ptr<api::RequestBucketInfoReply>(rep)));
+
+ CPPUNIT_ASSERT_EQUAL(i == sender.commands.size() - 1 ? true : false,
+ state->done());
+ }
+
+ CPPUNIT_ASSERT_EQUAL(3, (int)state->results().size());
+}
+
+void
+BucketDBUpdaterTest::testPendingClusterStateWithGroupDown()
+{
+ std::string config(getDistConfig6Nodes4Groups());
+ config += "distributor_auto_ownership_transfer_on_whole_group_down true\n";
+ setDistribution(config);
+
+ // Group config has nodes {0, 1}, {2, 3}, {4, 5}
+ // We're node index 0.
+
+ // Entire group 1 goes down. Must refetch from all nodes.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,1,2,3,4,5"),
+ getSentNodes("distributor:6 storage:6",
+ "distributor:6 .2.s:d .3.s:d storage:6"));
+
+ // But don't fetch if not the entire group is down.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ getSentNodes("distributor:6 storage:6",
+ "distributor:6 .2.s:d storage:6"));
+}
+
+void
+BucketDBUpdaterTest::testPendingClusterStateWithGroupDownAndNoHandover()
+{
+ std::string config(getDistConfig6Nodes4Groups());
+ config += "distributor_auto_ownership_transfer_on_whole_group_down false\n";
+ setDistribution(config);
+
+ // Group is down, but config says to not do anything about it.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(""),
+ getSentNodes("distributor:6 storage:6",
+ "distributor:6 .2.s:d .3.s:d storage:6"));
+}
+
+void
+parseInputData(const std::string& data,
+ uint64_t timestamp,
+ PendingClusterState& state,
+ bool includeBucketInfo)
+{
+ vespalib::StringTokenizer tokenizer(data, "|");
+ for (uint32_t i = 0; i < tokenizer.size(); i++) {
+ vespalib::StringTokenizer tok2(tokenizer[i], ":");
+
+ uint16_t node = atoi(tok2[0].c_str());
+
+ state.setNodeReplied(node);
+
+ vespalib::StringTokenizer tok3(tok2[1], ",");
+ for (uint32_t j = 0; j < tok3.size(); j++) {
+ if (includeBucketInfo) {
+ vespalib::StringTokenizer tok4(tok3[j], "/");
+
+ state.addNodeInfo(
+ document::BucketId(16, atoi(tok4[0].c_str())),
+ BucketCopy(
+ timestamp,
+ node,
+ api::BucketInfo(
+ atoi(tok4[1].c_str()),
+ atoi(tok4[2].c_str()),
+ atoi(tok4[3].c_str()),
+ atoi(tok4[2].c_str()),
+ atoi(tok4[3].c_str()))));
+ } else {
+ state.addNodeInfo(
+ document::BucketId(16, atoi(tok3[j].c_str())),
+ BucketCopy(timestamp,
+ node,
+ api::BucketInfo(3, 3, 3, 3, 3)));
+ }
+ }
+ }
+}
+
+struct BucketDumper : public BucketDatabase::EntryProcessor
+{
+ std::ostringstream ost;
+ bool _includeBucketInfo;
+
+ BucketDumper(bool includeBucketInfo)
+ : _includeBucketInfo(includeBucketInfo)
+ {
+ }
+
+ bool process(const BucketDatabase::Entry& e) {
+ document::BucketId bucketId(e.getBucketId());
+
+ ost << (uint32_t)bucketId.getRawId() << ":";
+ for (uint32_t i = 0; i < e->getNodeCount(); ++i) {
+ if (i > 0) {
+ ost << ",";
+ }
+ const BucketCopy& copy(e->getNodeRef(i));
+ ost << copy.getNode();
+ if (_includeBucketInfo) {
+ ost << "/" << copy.getChecksum()
+ << "/" << copy.getDocumentCount()
+ << "/" << copy.getTotalDocumentSize()
+ << "/" << (copy.trusted() ? "t" : "u");
+ }
+ }
+ ost << "|";
+ return true;
+ }
+};
+
+std::string
+BucketDBUpdaterTest::mergeBucketLists(
+ const lib::ClusterState& oldState,
+ const std::string& existingData,
+ const lib::ClusterState& newState,
+ const std::string& newData,
+ bool includeBucketInfo)
+{
+ framework::defaultimplementation::FakeClock clock;
+ framework::MilliSecTimer timer(clock);
+
+ MessageSenderStub sender;
+ std::unordered_set<uint16_t> outdatedNodes;
+
+ {
+ auto cmd(std::make_shared<api::SetSystemStateCommand>(oldState));
+
+ api::Timestamp beforeTime(1);
+
+ ClusterInformation::CSP clusterInfo(createClusterInfo("cluster:d"));
+ std::unique_ptr<PendingClusterState> state(
+ PendingClusterState::createForClusterStateChange(
+ clock, clusterInfo, sender, cmd, outdatedNodes,
+ beforeTime));
+
+ parseInputData(existingData, beforeTime, *state, includeBucketInfo);
+ state->mergeInto(getBucketDBUpdater().getDistributorComponent().getBucketDatabase());
+ }
+
+ BucketDumper dumper_tmp(true);
+ getBucketDatabase().forEach(dumper_tmp);
+
+ {
+ auto cmd(std::make_shared<api::SetSystemStateCommand>(
+ lib::ClusterState(newState)));
+
+ api::Timestamp afterTime(2);
+
+ ClusterInformation::CSP clusterInfo(createClusterInfo(oldState.toString()));
+ std::unique_ptr<PendingClusterState> state(
+ PendingClusterState::createForClusterStateChange(
+ clock, clusterInfo, sender, cmd, outdatedNodes,
+ afterTime));
+
+ parseInputData(newData, afterTime, *state, includeBucketInfo);
+ state->mergeInto(getBucketDBUpdater().getDistributorComponent()
+ .getBucketDatabase());
+ }
+
+ BucketDumper dumper(includeBucketInfo);
+ getBucketDBUpdater().getDistributorComponent()
+ .getBucketDatabase().forEach(dumper);
+ getBucketDBUpdater().getDistributorComponent()
+ .getBucketDatabase().clear();
+ return dumper.ost.str();
+}
+
+std::string
+BucketDBUpdaterTest::mergeBucketLists(const std::string& existingData,
+ const std::string& newData,
+ bool includeBucketInfo)
+{
+ return mergeBucketLists(
+ lib::ClusterState("distributor:1 storage:3"),
+ existingData,
+ lib::ClusterState("distributor:1 storage:3"),
+ newData,
+ includeBucketInfo);
+}
+
+void
+BucketDBUpdaterTest::testPendingClusterStateMerge()
+{
+ // Simple initializing case - ask all nodes for info
+ CPPUNIT_ASSERT_EQUAL(
+ // Result is on the form: [bucket w/o count bits]:[node indexes]|..
+ std::string("4:0,1|2:0,1|6:1,2|1:0,2|5:2,0|3:2,1|"),
+ // Input is on the form: [node]:[bucket w/o count bits]|...
+ mergeBucketLists(
+ "",
+ "0:1,2,4,5|1:2,3,4,6|2:1,3,5,6"));
+
+ // Node came up with fewer buckets (lost disk)
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("4:1|2:0,1|6:1,2|1:2,0|5:2|3:2,1|"),
+ mergeBucketLists(
+ lib::ClusterState("distributor:1 storage:3"),
+ "0:1,2,4,5|1:2,3,4,6|2:1,3,5,6",
+ lib::ClusterState("distributor:1 storage:3 .0.d:3 .0.d.1.s:d"),
+ "0:1,2")
+ );
+
+ // New node came up
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("4:0,1|2:0,1|6:1,2,3|1:0,2,3|5:2,0,3|3:2,1,3|"),
+ mergeBucketLists(
+ "0:1,2,4,5|1:2,3,4,6|2:1,3,5,6",
+ "3:1,3,5,6"));
+
+ // Node came up with some buckets removed and some added
+ // Buckets that were removed should not be removed as the node
+ // didn't lose a disk.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("8:0|4:0,1|2:0,1|6:1,0,2|1:0,2|5:2,0|3:2,1|"),
+ mergeBucketLists(
+ "0:1,2,4,5|1:2,3,4,6|2:1,3,5,6",
+ "0:1,2,6,8"));
+
+ // Node came up with no buckets
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("4:1|2:1|6:1,2|1:2|5:2|3:2,1|"),
+ mergeBucketLists(
+ lib::ClusterState("distributor:1 storage:3"),
+ "0:1,2,4,5|1:2,3,4,6|2:1,3,5,6",
+ lib::ClusterState("distributor:1 storage:3 .0.d:3 .0.d.1.s:d"),
+ "0:")
+ );
+
+ // One node lost a disk, another was just reasked (distributor
+ // change)
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("2:0,1|6:1,2|1:2,0|5:2|3:2,1|"),
+ mergeBucketLists(
+ lib::ClusterState("distributor:1 storage:3"),
+ "0:1,2,4,5|1:2,3,6|2:1,3,5,6",
+ lib::ClusterState("distributor:1 storage:3 .0.d:3 .0.d.1.s:d"),
+ "0:1,2|1:2,3")
+ );
+
+ // Bucket info format is "bucketid/checksum/count/size"
+ // Node went from initializing to up and invalid bucket went to empty.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("2:0/0/0/0/t|"),
+ mergeBucketLists(
+ "0:2/0/0/1",
+ "0:2/0/0/0",
+ true));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("5:1/2/3/4/u,0/0/0/0/u|"),
+ mergeBucketLists("", "0:5/0/0/0|1:5/2/3/4", true));
+}
+
+void
+BucketDBUpdaterTest::testPendingClusterStateMergeReplicaChanged()
+{
+ // Node went from initializing to up and non-invalid bucket changed.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("2:0/2/3/4/t|3:0/2/4/6/t|"),
+ mergeBucketLists(
+ lib::ClusterState("distributor:1 storage:1 .0.s:i"),
+ "0:2/1/2/3,3/2/4/6",
+ lib::ClusterState("distributor:1 storage:1"),
+ "0:2/2/3/4,3/2/4/6",
+ true));
+}
+
+void
+BucketDBUpdaterTest::testNoDbResurrectionForBucketNotOwnedInCurrentState()
+{
+ document::BucketId bucket(16, 3);
+ lib::ClusterState stateBefore("distributor:1 storage:1");
+ {
+ uint32_t expectedMsgs = 1, dummyBucketsToReturn = 1;
+ setAndEnableClusterState(stateBefore, expectedMsgs, dummyBucketsToReturn);
+ }
+ _sender.clear();
+
+ getBucketDBUpdater().recheckBucketInfo(0, bucket);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.commands.size());
+ std::shared_ptr<api::RequestBucketInfoCommand> rbi(
+ std::dynamic_pointer_cast<RequestBucketInfoCommand>(
+ _sender.commands[0]));
+
+ lib::ClusterState stateAfter("distributor:3 storage:3");
+
+ {
+ uint32_t expectedMsgs = 2, dummyBucketsToReturn = 1;
+ setAndEnableClusterState(stateAfter, expectedMsgs, dummyBucketsToReturn);
+ }
+ CPPUNIT_ASSERT(!getBucketDBUpdater().getDistributorComponent()
+ .ownsBucketInCurrentState(bucket));
+
+ sendFakeReplyForSingleBucketRequest(*rbi);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("NONEXISTING"), dumpBucket(bucket));
+}
+
+void
+BucketDBUpdaterTest::testNoDbResurrectionForBucketNotOwnedInPendingState()
+{
+ document::BucketId bucket(16, 3);
+ lib::ClusterState stateBefore("distributor:1 storage:1");
+ {
+ uint32_t expectedMsgs = 1, dummyBucketsToReturn = 1;
+ setAndEnableClusterState(stateBefore, expectedMsgs, dummyBucketsToReturn);
+ }
+ _sender.clear();
+
+ getBucketDBUpdater().recheckBucketInfo(0, bucket);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.commands.size());
+ std::shared_ptr<api::RequestBucketInfoCommand> rbi(
+ std::dynamic_pointer_cast<RequestBucketInfoCommand>(
+ _sender.commands[0]));
+
+ lib::ClusterState stateAfter("distributor:3 storage:3");
+ // Set, but _don't_ enable cluster state. We want it to be pending.
+ setSystemState(stateAfter);
+ CPPUNIT_ASSERT(getBucketDBUpdater().getDistributorComponent()
+ .ownsBucketInCurrentState(bucket));
+ CPPUNIT_ASSERT(!getBucketDBUpdater()
+ .checkOwnershipInPendingState(bucket).isOwned());
+
+ sendFakeReplyForSingleBucketRequest(*rbi);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("NONEXISTING"), dumpBucket(bucket));
+}
+
+/*
+ * If we get a distribution config change, it's important that cluster states that
+ * arrive after this--but _before_ the pending cluster state has finished--must trigger
+ * a full bucket info fetch no matter what the cluster state change was! Otherwise, we
+ * will with a high likelihood end up not getting the complete view of the buckets in
+ * the cluster.
+ */
+void
+BucketDBUpdaterTest::testClusterStateAlwaysSendsFullFetchWhenDistributionChangePending()
+{
+ lib::ClusterState stateBefore("distributor:6 storage:6");
+ {
+ uint32_t expectedMsgs = 6, dummyBucketsToReturn = 1;
+ setAndEnableClusterState(stateBefore, expectedMsgs, dummyBucketsToReturn);
+ }
+ _sender.clear();
+ std::string distConfig(getDistConfig6Nodes3Groups());
+ {
+ _node->getComponentRegister().setDistribution(
+ std::make_shared<lib::Distribution>(distConfig));
+ _distributor->storageDistributionChanged();
+ _distributor->enableNextDistribution();
+ }
+ sortSentMessagesByIndex(_sender);
+ CPPUNIT_ASSERT_EQUAL(size_t(6), _sender.commands.size());
+ // Suddenly, a wild cluster state change appears! Even though this state
+ // does not in itself imply any bucket changes, it will still overwrite the
+ // pending cluster state and thus its state of pending bucket info requests.
+ setSystemState(lib::ClusterState("distributor:6 .2.t:12345 storage:6"));
+
+ CPPUNIT_ASSERT_EQUAL(size_t(12), _sender.commands.size());
+
+ // Send replies for first 6 (outdated requests).
+ int numBuckets = 10;
+ for (uint32_t i = 0; i < 6; ++i) {
+ fakeBucketReply(
+ lib::ClusterState("distributor:6 storage:6"),
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[i]),
+ i, numBuckets);
+ }
+ // No change from these.
+ assertCorrectBuckets(1, "distributor:6 storage:6");
+
+ // Send for current pending.
+ for (uint32_t i = 0; i < 6; ++i) {
+ fakeBucketReply(
+ lib::ClusterState("distributor:6 .2.t:12345 storage:6"),
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[i + 6]),
+ i, numBuckets);
+ }
+ assertCorrectBuckets(numBuckets, "distributor:6 storage:6");
+ _sender.clear();
+
+ // No more pending global fetch; this should be a no-op state.
+ setSystemState(lib::ClusterState("distributor:6 .3.t:12345 storage:6"));
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _sender.commands.size());
+}
+
+void
+BucketDBUpdaterTest::testChangedDistributionConfigTriggersRecoveryMode()
+{
+ setAndEnableClusterState(lib::ClusterState("distributor:6 storage:6"), 6, 20);
+ _sender.clear();
+ // First cluster state; implicit scan of all buckets which does not
+ // use normal recovery mode ticking-path.
+ CPPUNIT_ASSERT(!_distributor->isInRecoveryMode());
+
+ std::string distConfig(getDistConfig6Nodes4Groups());
+ _node->getComponentRegister().setDistribution(
+ std::make_shared<lib::Distribution>(distConfig));
+ _distributor->storageDistributionChanged();
+ _distributor->enableNextDistribution();
+ sortSentMessagesByIndex(_sender);
+ // No replies received yet, still no recovery mode.
+ CPPUNIT_ASSERT(!_distributor->isInRecoveryMode());
+
+ CPPUNIT_ASSERT_EQUAL(size_t(6), _sender.commands.size());
+ uint32_t numBuckets = 10;
+ for (uint32_t i = 0; i < 6; ++i) {
+ fakeBucketReply(
+ lib::ClusterState("distributor:6 storage:6"),
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[i]),
+ i, numBuckets);
+ }
+
+ // Pending cluster state (i.e. distribution) has been enabled, which should
+ // cause recovery mode to be entered.
+ CPPUNIT_ASSERT(_distributor->isInRecoveryMode());
+}
+
+void
+BucketDBUpdaterTest::testNewlyAddedBucketsHaveCurrentTimeAsGcTimestamp()
+{
+ getClock().setAbsoluteTimeInSeconds(101234);
+ lib::ClusterState stateBefore("distributor:1 storage:1");
+ {
+ uint32_t expectedMsgs = 1, dummyBucketsToReturn = 1;
+ setAndEnableClusterState(stateBefore, expectedMsgs, dummyBucketsToReturn);
+ }
+
+ // setAndEnableClusterState adds n buckets with id (16, i)
+ document::BucketId bucket(16, 0);
+ BucketDatabase::Entry e(getBucket(bucket));
+ CPPUNIT_ASSERT(e.valid());
+ CPPUNIT_ASSERT_EQUAL(uint32_t(101234), e->getLastGarbageCollectionTime());
+}
+
+void
+BucketDBUpdaterTest::testNewerMutationsNotOverwrittenByEarlierBucketFetch()
+{
+ {
+ lib::ClusterState stateBefore("distributor:1 storage:1 .0.s:i");
+ uint32_t expectedMsgs = 1, dummyBucketsToReturn = 0;
+ // This step is required to make the distributor ready for accepting
+ // the below explicit database insertion towards node 0.
+ setAndEnableClusterState(stateBefore, expectedMsgs,
+ dummyBucketsToReturn);
+ }
+ _sender.clear();
+ getClock().setAbsoluteTimeInSeconds(1000);
+ lib::ClusterState state("distributor:1 storage:1");
+ setSystemState(state);
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.commands.size());
+
+ // Before replying with the bucket info, simulate the arrival of a mutation
+ // reply that alters the state of the bucket with information that will be
+ // more recent that what is returned by the bucket info. This information
+ // must not be lost when the bucket info is later merged into the database.
+ document::BucketId bucket(16, 1);
+ constexpr uint64_t insertionTimestamp = 1001ULL * 1000000;
+ api::BucketInfo wantedInfo(5, 6, 7);
+ getBucketDBUpdater().getDistributorComponent().updateBucketDatabase(
+ bucket,
+ BucketCopy(insertionTimestamp, 0, wantedInfo),
+ DatabaseUpdate::CREATE_IF_NONEXISTING);
+
+ getClock().setAbsoluteTimeInSeconds(1002);
+ constexpr uint32_t bucketsReturned = 10; // Buckets (16, 0) ... (16, 9)
+ // Return bucket information which on the timeline might originate from
+ // anywhere between [1000, 1002]. Our assumption is that any mutations
+ // taking place after t=1000 must have its reply received and processed
+ // by this distributor and timestamped strictly higher than t=1000 (modulo
+ // clock skew, of course, but that is outside the scope of this). A mutation
+ // happening before t=1000 but receiving a reply at t>1000 does not affect
+ // correctness, as this should contain the same bucket info as that
+ // contained in the full bucket reply and the DB update is thus idempotent.
+ fakeBucketReply(
+ state,
+ dynamic_cast<RequestBucketInfoCommand&>(*_sender.commands[0]),
+ 0,
+ bucketsReturned);
+
+ BucketDatabase::Entry e(getBucket(bucket));
+ CPPUNIT_ASSERT_EQUAL(uint32_t(1), e->getNodeCount());
+ CPPUNIT_ASSERT_EQUAL(wantedInfo, e->getNodeRef(0).getBucketInfo());
+
+}
+
+std::vector<uint16_t>
+BucketDBUpdaterTest::getSendSet() const
+{
+ std::vector<uint16_t> nodes;
+ std::transform(_sender.commands.begin(),
+ _sender.commands.end(),
+ std::back_inserter(nodes),
+ [](auto& cmd)
+ {
+ auto& req(dynamic_cast<const api::RequestBucketInfoCommand&>(*cmd));
+ return req.getAddress()->getIndex();
+ });
+ return nodes;
+}
+
+std::vector<uint16_t>
+BucketDBUpdaterTest::getSentNodesWithPreemption(
+ const std::string& oldClusterState,
+ uint32_t expectedOldStateMessages,
+ const std::string& preemptedClusterState,
+ const std::string& newClusterState)
+{
+ lib::ClusterState stateBefore(oldClusterState);
+ uint32_t dummyBucketsToReturn = 10;
+ setAndEnableClusterState(lib::ClusterState(oldClusterState),
+ expectedOldStateMessages,
+ dummyBucketsToReturn);
+ _sender.clear();
+
+ setSystemState(lib::ClusterState(preemptedClusterState));
+ _sender.clear();
+ // Do not allow the pending state to become the active state; trigger a
+ // new transition without ACKing the info requests first. This will
+ // overwrite the pending state entirely.
+ setSystemState(lib::ClusterState(newClusterState));
+ return getSendSet();
+}
+
+using nodeVec = std::vector<uint16_t>;
+
+/*
+ * If we don't carry over the set of nodes that we need to fetch from,
+ * a naive comparison between the active state and the new state will
+ * make it appear to the distributor that nothing has changed, as any
+ * database modifications caused by intermediate states will not be
+ * accounted for (basically the ABA problem in a distributed setting).
+ */
+void
+BucketDBUpdaterTest::preemptedDistrChangeCarriesNodeSetOverToNextStateFetch()
+{
+ CPPUNIT_ASSERT_EQUAL(
+ (nodeVec{0, 1, 2, 3, 4, 5}),
+ getSentNodesWithPreemption("version:1 distributor:6 storage:6", 6,
+ "version:2 distributor:6 .5.s:d storage:6",
+ "version:3 distributor:6 storage:6"));
+}
+
+void
+BucketDBUpdaterTest::preemptedStorChangeCarriesNodeSetOverToNextStateFetch()
+{
+ CPPUNIT_ASSERT_EQUAL(
+ (nodeVec{2, 3}),
+ getSentNodesWithPreemption(
+ "version:1 distributor:6 storage:6 .2.s:d", 5,
+ "version:2 distributor:6 storage:6 .2.s:d .3.s:d",
+ "version:3 distributor:6 storage:6"));
+}
+
+void
+BucketDBUpdaterTest::preemptedStorageNodeDownMustBeReFetched()
+{
+ CPPUNIT_ASSERT_EQUAL(
+ (nodeVec{2}),
+ getSentNodesWithPreemption(
+ "version:1 distributor:6 storage:6", 6,
+ "version:2 distributor:6 storage:6 .2.s:d",
+ "version:3 distributor:6 storage:6"));
+}
+
+void
+BucketDBUpdaterTest::doNotSendToPreemptedNodeNowInDownState()
+{
+ CPPUNIT_ASSERT_EQUAL(
+ nodeVec{},
+ getSentNodesWithPreemption(
+ "version:1 distributor:6 storage:6 .2.s:d", 5,
+ "version:2 distributor:6 storage:6", // Sends to 2.
+ "version:3 distributor:6 storage:6 .2.s:d")); // 2 down again.
+}
+
+void
+BucketDBUpdaterTest::doNotSendToPreemptedNodeNotPartOfNewState()
+{
+ // Even though 100 nodes are preempted, not all of these should be part
+ // of the request afterwards when only 6 are part of the state.
+ CPPUNIT_ASSERT_EQUAL(
+ (nodeVec{0, 1, 2, 3, 4, 5}),
+ getSentNodesWithPreemption(
+ "version:1 distributor:6 storage:100", 100,
+ "version:2 distributor:5 .4.s:d storage:100",
+ "version:3 distributor:6 storage:6"));
+}
+
+void
+BucketDBUpdaterTest::outdatedNodeSetClearedAfterSuccessfulStateCompletion()
+{
+ lib::ClusterState stateBefore(
+ "version:1 distributor:6 storage:6 .1.t:1234");
+ uint32_t expectedMsgs = 6, dummyBucketsToReturn = 10;
+ setAndEnableClusterState(stateBefore, expectedMsgs, dummyBucketsToReturn);
+ _sender.clear();
+ // New cluster state that should not by itself trigger any new fetches,
+ // unless outdated node set is somehow not cleared after an enabled
+ // (completed) cluster state has been set.
+ lib::ClusterState stateAfter("version:3 distributor:6 storage:6");
+ setSystemState(stateAfter);
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _sender.commands.size());
+}
+
+// XXX test currently disabled since distribution config currently isn't used
+// at all in order to deduce the set of nodes to send to. This might not matter
+// in practice since it is assumed that the cluster state matching the new
+// distribution config will follow very shortly after the config has been
+// applied to the node. The new cluster state will then send out requests to
+// the correct node set.
+void
+BucketDBUpdaterTest::clusterConfigDownsizeOnlySendsToAvailableNodes()
+{
+ uint32_t expectedMsgs = 6, dummyBucketsToReturn = 20;
+ setAndEnableClusterState(lib::ClusterState("distributor:6 storage:6"),
+ expectedMsgs, dummyBucketsToReturn);
+ _sender.clear();
+
+ // Intentionally trigger a racing config change which arrives before the
+ // new cluster state representing it.
+ std::string distConfig(getDistConfig3Nodes1Group());
+ _node->getComponentRegister().setDistribution(
+ std::make_shared<lib::Distribution>(distConfig));
+ _distributor->storageDistributionChanged();
+ _distributor->enableNextDistribution();
+ sortSentMessagesByIndex(_sender);
+
+ CPPUNIT_ASSERT_EQUAL((nodeVec{0, 1, 2}), getSendSet());
+}
+
+void
+BucketDBUpdaterTest::changedDiskSetTriggersReFetch()
+{
+ // Same number of online disks, but the set of disks has changed.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("1"),
+ getSentNodes("distributor:2 storage:2 .1.d:3 .1.d.2.s:d",
+ "distributor:2 storage:2 .1.d:3 .1.d.1.s:d"));
+}
+
+/**
+ * Test scenario where a cluster is downsized by removing a subset of the nodes
+ * from the distribution configuration. The system must be able to deal with
+ * a scenario where the set of nodes between two cluster states across a config
+ * change may differ.
+ *
+ * See VESPA-790 for details.
+ */
+void
+BucketDBUpdaterTest::nodeMissingFromConfigIsTreatedAsNeedingOwnershipTransfer()
+{
+ uint32_t expectedMsgs = 3, dummyBucketsToReturn = 1;
+ setAndEnableClusterState(lib::ClusterState("distributor:3 storage:3"),
+ expectedMsgs, dummyBucketsToReturn);
+ _sender.clear();
+
+ // Cluster goes from {0, 1, 2} -> {0, 1}. This leaves us with a config
+ // that does not contain node 2 while the _active_ cluster state still
+ // contains this node.
+ const char* downsizeCfg =
+ "redundancy 2\n"
+ "distributor_auto_ownership_transfer_on_whole_group_down true\n"
+ "group[2]\n"
+ "group[0].name \"invalid\"\n"
+ "group[0].index \"invalid\"\n"
+ "group[0].partitions 1|*\n"
+ "group[0].nodes[0]\n"
+ "group[1].name rack0\n"
+ "group[1].index 0\n"
+ "group[1].nodes[2]\n"
+ "group[1].nodes[0].index 0\n"
+ "group[1].nodes[1].index 1\n";
+
+ _node->getComponentRegister().setDistribution(
+ std::make_shared<lib::Distribution>(downsizeCfg));
+ _distributor->storageDistributionChanged();
+ _distributor->enableNextDistribution();
+ sortSentMessagesByIndex(_sender);
+ _sender.clear();
+
+ // Attempt to apply state with {0, 1} set. This will compare the new state
+ // with the previous state, which still has node 2.
+ expectedMsgs = 2;
+ setAndEnableClusterState(lib::ClusterState("distributor:2 storage:2"),
+ expectedMsgs, dummyBucketsToReturn);
+
+ CPPUNIT_ASSERT_EQUAL((nodeVec{0, 1}), getSendSet());
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/bucketgctimecalculatortest.cpp b/storage/src/tests/distributor/bucketgctimecalculatortest.cpp
new file mode 100644
index 00000000000..39bef3ec395
--- /dev/null
+++ b/storage/src/tests/distributor/bucketgctimecalculatortest.cpp
@@ -0,0 +1,114 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <chrono>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/distributor/bucketgctimecalculator.h>
+
+namespace storage {
+namespace distributor {
+
+struct MockBucketIdHasher : public BucketGcTimeCalculator::BucketIdHasher
+{
+ size_t nextGeneratedHash {0};
+
+ size_t doHash(const document::BucketId&) const override {
+ return nextGeneratedHash;
+ }
+};
+
+struct BucketGcTimeCalculatorTest : public CppUnit::TestFixture
+{
+ void noGcIfAlreadyCheckedAfterStartPoint();
+ void gcIfNotRunInCurrentPeriodAndCheckPeriodPassed();
+ void noGcIfNotRunInCurrentPeriodAndCheckPeriodNotPassed();
+ void noGcIfCheckIntervalIsZero();
+ void identityHasherReturnsBucketId();
+
+ BucketGcTimeCalculatorTest();
+
+ CPPUNIT_TEST_SUITE(BucketGcTimeCalculatorTest);
+ CPPUNIT_TEST(noGcIfAlreadyCheckedAfterStartPoint);
+ CPPUNIT_TEST(gcIfNotRunInCurrentPeriodAndCheckPeriodPassed);
+ CPPUNIT_TEST(noGcIfNotRunInCurrentPeriodAndCheckPeriodNotPassed);
+ CPPUNIT_TEST(noGcIfCheckIntervalIsZero);
+ CPPUNIT_TEST(identityHasherReturnsBucketId);
+ CPPUNIT_TEST_SUITE_END();
+
+private:
+ // Ease of reading aliases
+ using CurrentTime = std::chrono::seconds;
+ using LastRunAt = std::chrono::seconds;
+
+ MockBucketIdHasher hasher;
+ std::chrono::seconds checkInterval;
+ BucketGcTimeCalculator calc;
+ document::BucketId b;
+};
+
+BucketGcTimeCalculatorTest::BucketGcTimeCalculatorTest()
+ : checkInterval(1000),
+ calc(hasher, checkInterval),
+ b(16, 1)
+{
+ hasher.nextGeneratedHash = 500;
+}
+
+CPPUNIT_TEST_SUITE_REGISTRATION(BucketGcTimeCalculatorTest);
+
+void
+BucketGcTimeCalculatorTest::noGcIfAlreadyCheckedAfterStartPoint()
+{
+ // Note: LastRun(0) is considered to be within the current period.
+ CPPUNIT_ASSERT(!calc.shouldGc(b, CurrentTime(0), LastRunAt(0)));
+ CPPUNIT_ASSERT(!calc.shouldGc(b, CurrentTime(499), LastRunAt(0)));
+ CPPUNIT_ASSERT(!calc.shouldGc(b, CurrentTime(999), LastRunAt(500)));
+
+ CPPUNIT_ASSERT(!calc.shouldGc(b, CurrentTime(1000), LastRunAt(1000)));
+ CPPUNIT_ASSERT(!calc.shouldGc(b, CurrentTime(1234), LastRunAt(1100)));
+ CPPUNIT_ASSERT(!calc.shouldGc(b, CurrentTime(1600), LastRunAt(1500)));
+}
+
+void
+BucketGcTimeCalculatorTest::gcIfNotRunInCurrentPeriodAndCheckPeriodPassed()
+{
+ CPPUNIT_ASSERT(calc.shouldGc(b, CurrentTime(500), LastRunAt(0)));
+ CPPUNIT_ASSERT(calc.shouldGc(b, CurrentTime(1600), LastRunAt(500)));
+ // Note: this may look wrong, but is correct since GC should have been
+ // scheduled _after_ 1499 so this is most likely the case where a bucket
+ // has been added to the database at this point in time. Not treating
+ // this as a valid GC scenario would mean newly added buckets would have to
+ // wait until the next period to be considered. If the period is long and
+ // the system is unstable (causing many bucket handoffs), we'd risk not
+ // being able to scheduled many buckets at all.
+ CPPUNIT_ASSERT(calc.shouldGc(b, CurrentTime(1600), LastRunAt(1499)));
+
+ CPPUNIT_ASSERT(calc.shouldGc(b, CurrentTime(2000), LastRunAt(500)));
+ CPPUNIT_ASSERT(calc.shouldGc(b, CurrentTime(2600), LastRunAt(1500)));
+}
+
+void
+BucketGcTimeCalculatorTest::noGcIfNotRunInCurrentPeriodAndCheckPeriodNotPassed()
+{
+ CPPUNIT_ASSERT(!calc.shouldGc(b, CurrentTime(1000), LastRunAt(500)));
+}
+
+void
+BucketGcTimeCalculatorTest::noGcIfCheckIntervalIsZero()
+{
+ BucketGcTimeCalculator calc2(hasher, std::chrono::seconds(0));
+ CPPUNIT_ASSERT(!calc2.shouldGc(b, CurrentTime(5000), LastRunAt(0)));
+}
+
+void
+BucketGcTimeCalculatorTest::identityHasherReturnsBucketId()
+{
+ BucketGcTimeCalculator::BucketIdIdentityHasher hasher2;
+ document::BucketId bucket(36, 1234);
+
+ CPPUNIT_ASSERT_EQUAL(bucket.getId(), hasher2.hash(bucket));
+}
+
+} // distributor
+} // storage
+
diff --git a/storage/src/tests/distributor/bucketstateoperationtest.cpp b/storage/src/tests/distributor/bucketstateoperationtest.cpp
new file mode 100644
index 00000000000..1477f1d6ed0
--- /dev/null
+++ b/storage/src/tests/distributor/bucketstateoperationtest.cpp
@@ -0,0 +1,251 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <tests/distributor/distributortestutil.h>
+#include <vespa/storage/distributor/operations/idealstate/setbucketstateoperation.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+
+namespace storage {
+
+namespace distributor {
+
+class BucketStateOperationTest : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+ CPPUNIT_TEST_SUITE(BucketStateOperationTest);
+ CPPUNIT_TEST(testActiveStateSupportedInBucketDb);
+ CPPUNIT_TEST(testActivateSingleNode);
+ CPPUNIT_TEST(testActivateAndDeactivateNodes);
+ CPPUNIT_TEST(testDoNotDeactivateIfActivateFails);
+ CPPUNIT_TEST(testBucketDbNotUpdatedOnFailure);
+ CPPUNIT_TEST_SUITE_END();
+
+private:
+ void testActiveStateSupportedInBucketDb();
+ void testActivateSingleNode();
+ void testActivateAndDeactivateNodes();
+ void testDoNotDeactivateIfActivateFails();
+ void testBucketDbNotUpdatedOnFailure();
+
+public:
+ void setUp()
+ {
+ createLinks();
+ }
+
+ void tearDown()
+ {
+ close();
+ }
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(BucketStateOperationTest);
+
+void
+BucketStateOperationTest::testActiveStateSupportedInBucketDb()
+{
+ document::BucketId bid(16, 1);
+ insertBucketInfo(bid, 0, 0xabc, 10, 1100, true, true);
+
+ BucketDatabase::Entry entry = getBucket(bid);
+ CPPUNIT_ASSERT(entry.valid());
+ CPPUNIT_ASSERT(entry->getNode(0)->active());
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("node(idx=0,crc=0xabc,docs=10/10,bytes=1100/1100,"
+ "trusted=true,active=true)"),
+ entry->getNode(0)->toString());
+}
+
+void
+BucketStateOperationTest::testActivateSingleNode()
+{
+ document::BucketId bid(16, 1);
+ insertBucketInfo(bid, 0, 0xabc, 10, 1100, true, false);
+
+ BucketAndNodes bucketAndNodes(bid, toVector<uint16_t>(0));
+ std::vector<uint16_t> active;
+ active.push_back(0);
+ SetBucketStateOperation op("storage", bucketAndNodes, active);
+
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL((size_t)1, _sender.commands.size());
+
+ std::shared_ptr<api::StorageCommand> msg = _sender.commands[0];
+ CPPUNIT_ASSERT(msg->getType() == api::MessageType::SETBUCKETSTATE);
+ CPPUNIT_ASSERT_EQUAL(
+ api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 0).toString(),
+ msg->getAddress()->toString());
+
+ const api::SetBucketStateCommand& cmd(
+ dynamic_cast<const api::SetBucketStateCommand&>(*msg));
+ CPPUNIT_ASSERT_EQUAL(bid, cmd.getBucketId());
+ CPPUNIT_ASSERT_EQUAL(api::SetBucketStateCommand::ACTIVE, cmd.getState());
+
+ std::shared_ptr<api::StorageReply> reply(msg->makeReply().release());
+ op.receive(_sender, reply);
+
+ BucketDatabase::Entry entry = getBucket(bid);
+ CPPUNIT_ASSERT(entry.valid());
+ CPPUNIT_ASSERT(entry->getNodeRef(0).active());
+
+ CPPUNIT_ASSERT(op.ok());
+
+ // TODO: check that it's done
+}
+
+void
+BucketStateOperationTest::testActivateAndDeactivateNodes()
+{
+ document::BucketId bid(16, 1);
+ insertBucketInfo(bid, 0, 0xabc, 10, 1100, false, true);
+ insertBucketInfo(bid, 1, 0xdef, 15, 1500, false, false);
+
+ BucketAndNodes bucketAndNodes(bid, toVector<uint16_t>(0, 1));
+ std::vector<uint16_t> active;
+ active.push_back(1);
+ SetBucketStateOperation op("storage", bucketAndNodes, active);
+
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL((size_t)1, _sender.commands.size());
+ {
+ std::shared_ptr<api::StorageCommand> msg = _sender.commands[0];
+ CPPUNIT_ASSERT(msg->getType() == api::MessageType::SETBUCKETSTATE);
+ CPPUNIT_ASSERT_EQUAL(
+ api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 1).toString(),
+ msg->getAddress()->toString());
+
+ const api::SetBucketStateCommand& cmd(
+ dynamic_cast<const api::SetBucketStateCommand&>(*msg));
+ CPPUNIT_ASSERT_EQUAL(bid, cmd.getBucketId());
+ CPPUNIT_ASSERT_EQUAL(api::SetBucketStateCommand::ACTIVE, cmd.getState());
+
+ std::shared_ptr<api::StorageReply> reply(msg->makeReply().release());
+ op.receive(_sender, reply);
+ }
+
+ CPPUNIT_ASSERT_EQUAL((size_t)2, _sender.commands.size());
+ {
+ std::shared_ptr<api::StorageCommand> msg = _sender.commands[1];
+ CPPUNIT_ASSERT(msg->getType() == api::MessageType::SETBUCKETSTATE);
+ CPPUNIT_ASSERT_EQUAL(
+ api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 0).toString(),
+ msg->getAddress()->toString());
+
+ const api::SetBucketStateCommand& cmd(
+ dynamic_cast<const api::SetBucketStateCommand&>(*msg));
+ CPPUNIT_ASSERT_EQUAL(bid, cmd.getBucketId());
+ CPPUNIT_ASSERT_EQUAL(api::SetBucketStateCommand::INACTIVE, cmd.getState());
+
+ std::shared_ptr<api::StorageReply> reply(msg->makeReply().release());
+ op.receive(_sender, reply);
+ }
+
+ BucketDatabase::Entry entry = getBucket(bid);
+ CPPUNIT_ASSERT(entry.valid());
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("node(idx=0,crc=0xabc,docs=10/10,bytes=1100/1100,"
+ "trusted=true,active=false)"),
+ entry->getNodeRef(0).toString());
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("node(idx=1,crc=0xdef,docs=15/15,bytes=1500/1500,"
+ "trusted=false,active=true)"),
+ entry->getNodeRef(1).toString());
+
+ CPPUNIT_ASSERT(op.ok());
+}
+
+void
+BucketStateOperationTest::testDoNotDeactivateIfActivateFails()
+{
+ document::BucketId bid(16, 1);
+ insertBucketInfo(bid, 0, 0xabc, 10, 1100, false, true);
+ insertBucketInfo(bid, 1, 0xdef, 15, 1500, false, false);
+
+ BucketAndNodes bucketAndNodes(bid, toVector<uint16_t>(0, 1));
+ std::vector<uint16_t> active;
+ active.push_back(1);
+ SetBucketStateOperation op("storage", bucketAndNodes, active);
+
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL((size_t)1, _sender.commands.size());
+ {
+ std::shared_ptr<api::StorageCommand> msg = _sender.commands[0];
+ CPPUNIT_ASSERT(msg->getType() == api::MessageType::SETBUCKETSTATE);
+ CPPUNIT_ASSERT_EQUAL(
+ api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 1).toString(),
+ msg->getAddress()->toString());
+
+ const api::SetBucketStateCommand& cmd(
+ dynamic_cast<const api::SetBucketStateCommand&>(*msg));
+ CPPUNIT_ASSERT_EQUAL(bid, cmd.getBucketId());
+ CPPUNIT_ASSERT_EQUAL(api::SetBucketStateCommand::ACTIVE, cmd.getState());
+
+ std::shared_ptr<api::StorageReply> reply(msg->makeReply().release());
+ reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED, "aaarg!"));
+ op.receive(_sender, reply);
+ }
+
+ CPPUNIT_ASSERT_EQUAL((size_t)1, _sender.commands.size());
+
+ BucketDatabase::Entry entry = getBucket(bid);
+ CPPUNIT_ASSERT(entry.valid());
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("node(idx=0,crc=0xabc,docs=10/10,bytes=1100/1100,"
+ "trusted=true,active=true)"),
+ entry->getNodeRef(0).toString());
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("node(idx=1,crc=0xdef,docs=15/15,bytes=1500/1500,"
+ "trusted=false,active=false)"),
+ entry->getNodeRef(1).toString());
+
+ CPPUNIT_ASSERT(!op.ok());
+}
+
+void
+BucketStateOperationTest::testBucketDbNotUpdatedOnFailure()
+{
+ document::BucketId bid(16, 1);
+ insertBucketInfo(bid, 0, 0xabc, 10, 1100, true, false);
+
+ BucketAndNodes bucketAndNodes(bid, toVector<uint16_t>(0));
+ std::vector<uint16_t> active;
+ active.push_back(0);
+ SetBucketStateOperation op("storage", bucketAndNodes, active);
+
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL((size_t)1, _sender.commands.size());
+
+ std::shared_ptr<api::StorageCommand> msg = _sender.commands[0];
+ CPPUNIT_ASSERT(msg->getType() == api::MessageType::SETBUCKETSTATE);
+ CPPUNIT_ASSERT_EQUAL(
+ api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 0).toString(),
+ msg->getAddress()->toString());
+
+ std::shared_ptr<api::StorageReply> reply(msg->makeReply().release());
+ reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED, "aaarg!"));
+ op.receive(_sender, reply);
+
+ BucketDatabase::Entry entry = getBucket(bid);
+ CPPUNIT_ASSERT(entry.valid());
+ // Should not be updated
+ CPPUNIT_ASSERT(!entry->getNodeRef(0).active());
+
+ CPPUNIT_ASSERT(!op.ok());
+}
+
+} // namespace distributor
+
+} // namespace storage
diff --git a/storage/src/tests/distributor/distributor_host_info_reporter_test.cpp b/storage/src/tests/distributor/distributor_host_info_reporter_test.cpp
new file mode 100644
index 00000000000..65ccc65bdcf
--- /dev/null
+++ b/storage/src/tests/distributor/distributor_host_info_reporter_test.cpp
@@ -0,0 +1,225 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/distributor/distributor_host_info_reporter.h>
+#include <vespa/storage/distributor/latency_statistics_provider.h>
+#include <vespa/storage/distributor/min_replica_provider.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/io/fileutil.h>
+#include <vespa/vespalib/util/jsonstream.h>
+#include <tests/common/hostreporter/util.h>
+
+namespace storage {
+namespace distributor {
+
+using End = vespalib::JsonStream::End;
+using File = vespalib::File;
+using Object = vespalib::JsonStream::Object;
+
+class DistributorHostInfoReporterTest : public CppUnit::TestFixture
+{
+ CPPUNIT_TEST_SUITE(DistributorHostInfoReporterTest);
+ CPPUNIT_TEST(hostInfoWithPutLatenciesOnly);
+ CPPUNIT_TEST(hostInfoAllInfo);
+ CPPUNIT_TEST(generateExampleJson);
+ CPPUNIT_TEST(noReportGeneratedIfDisabled);
+ CPPUNIT_TEST_SUITE_END();
+
+ void hostInfoWithPutLatenciesOnly();
+ void hostInfoAllInfo();
+ void verifyReportedNodeLatencies(
+ const vespalib::Slime& root,
+ uint16_t node,
+ int64_t latencySum,
+ int64_t count);
+ void generateExampleJson();
+ void noReportGeneratedIfDisabled();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(DistributorHostInfoReporterTest);
+
+using ms = std::chrono::milliseconds;
+
+namespace {
+
+OperationStats
+makeOpStats(std::chrono::milliseconds totalLatency, uint64_t numRequests)
+{
+ OperationStats stats;
+ stats.totalLatency = totalLatency;
+ stats.numRequests = numRequests;
+ return stats;
+}
+
+// My kingdom for GoogleMock!
+struct MockedLatencyStatisticsProvider : LatencyStatisticsProvider
+{
+ NodeStatsSnapshot returnedSnapshot;
+
+ NodeStatsSnapshot doGetLatencyStatistics() const {
+ return returnedSnapshot;
+ }
+};
+
+struct MockedMinReplicaProvider : MinReplicaProvider
+{
+ std::unordered_map<uint16_t, uint32_t> minReplica;
+
+ std::unordered_map<uint16_t, uint32_t> getMinReplica() const override {
+ return minReplica;
+ }
+};
+
+const vespalib::slime::Inspector&
+getNode(const vespalib::Slime& root, uint16_t nodeIndex)
+{
+ auto& storage_nodes = root.get()["distributor"]["storage-nodes"];
+ const size_t n = storage_nodes.entries();
+ for (size_t i = 0; i < n; ++i) {
+ if (storage_nodes[i]["node-index"].asLong() == nodeIndex) {
+ return storage_nodes[i];
+ }
+ }
+ throw std::runtime_error("No node found with index "
+ + std::to_string(nodeIndex));
+}
+
+int
+getMinReplica(const vespalib::Slime& root, uint16_t nodeIndex)
+{
+ return getNode(root, nodeIndex)["min-current-replication-factor"].asLong();
+}
+
+const vespalib::slime::Inspector&
+getLatenciesForNode(const vespalib::Slime& root, uint16_t nodeIndex)
+{
+ return getNode(root, nodeIndex)["ops-latency"];
+}
+
+} // anon ns
+
+void
+DistributorHostInfoReporterTest::verifyReportedNodeLatencies(
+ const vespalib::Slime& root,
+ uint16_t node,
+ int64_t latencySum,
+ int64_t count)
+{
+ auto& latencies = getLatenciesForNode(root, node);
+ CPPUNIT_ASSERT_EQUAL(latencySum,
+ latencies["put"]["latency-ms-sum"].asLong());
+ CPPUNIT_ASSERT_EQUAL(count, latencies["put"]["count"].asLong());
+}
+
+void
+DistributorHostInfoReporterTest::hostInfoWithPutLatenciesOnly()
+{
+ MockedLatencyStatisticsProvider latencyStatsProvider;
+ MockedMinReplicaProvider minReplicaProvider;
+ DistributorHostInfoReporter reporter(latencyStatsProvider,
+ minReplicaProvider);
+
+ NodeStatsSnapshot snapshot;
+ snapshot.nodeToStats[0] = { makeOpStats(ms(10000), 3) };
+ snapshot.nodeToStats[5] = { makeOpStats(ms(25000), 7) };
+
+ latencyStatsProvider.returnedSnapshot = snapshot;
+
+ vespalib::Slime root;
+ util::reporterToSlime(reporter, root);
+ verifyReportedNodeLatencies(root, 0, 10000, 3);
+ verifyReportedNodeLatencies(root, 5, 25000, 7);
+}
+
+void
+DistributorHostInfoReporterTest::hostInfoAllInfo()
+{
+ MockedLatencyStatisticsProvider latencyStatsProvider;
+ MockedMinReplicaProvider minReplicaProvider;
+ DistributorHostInfoReporter reporter(latencyStatsProvider,
+ minReplicaProvider);
+
+ NodeStatsSnapshot latencySnapshot;
+ latencySnapshot.nodeToStats[0] = { makeOpStats(ms(10000), 3) };
+ latencySnapshot.nodeToStats[5] = { makeOpStats(ms(25000), 7) };
+ latencyStatsProvider.returnedSnapshot = latencySnapshot;
+
+ std::unordered_map<uint16_t, uint32_t> minReplica;
+ minReplica[0] = 2;
+ minReplica[5] = 9;
+ minReplicaProvider.minReplica = minReplica;
+
+ vespalib::Slime root;
+ util::reporterToSlime(reporter, root);
+ verifyReportedNodeLatencies(root, 0, 10000, 3);
+ verifyReportedNodeLatencies(root, 5, 25000, 7);
+
+ CPPUNIT_ASSERT_EQUAL(2, getMinReplica(root, 0));
+ CPPUNIT_ASSERT_EQUAL(9, getMinReplica(root, 5));
+}
+
+void
+DistributorHostInfoReporterTest::generateExampleJson()
+{
+ MockedLatencyStatisticsProvider latencyStatsProvider;
+ MockedMinReplicaProvider minReplicaProvider;
+ DistributorHostInfoReporter reporter(latencyStatsProvider,
+ minReplicaProvider);
+
+ NodeStatsSnapshot snapshot;
+ snapshot.nodeToStats[0] = { makeOpStats(ms(10000), 3) };
+ snapshot.nodeToStats[5] = { makeOpStats(ms(25000), 7) };
+ latencyStatsProvider.returnedSnapshot = snapshot;
+
+ std::unordered_map<uint16_t, uint32_t> minReplica;
+ minReplica[0] = 2;
+ minReplica[5] = 9;
+ minReplicaProvider.minReplica = minReplica;
+
+ vespalib::asciistream json;
+ vespalib::JsonStream stream(json, true);
+
+ stream << Object();
+ reporter.report(stream);
+ stream << End();
+ stream.finalize();
+
+ std::string jsonString = json.str();
+
+ std::string path = "../../../protocols/getnodestate/distributor.json";
+ std::string goldenString = File::readAll(path);
+
+ vespalib::slime::Memory goldenMemory(goldenString);
+ vespalib::Slime goldenSlime;
+ vespalib::slime::JsonFormat::decode(goldenMemory, goldenSlime);
+
+ vespalib::slime::Memory jsonMemory(jsonString);
+ vespalib::Slime jsonSlime;
+ vespalib::slime::JsonFormat::decode(jsonMemory, jsonSlime);
+
+ CPPUNIT_ASSERT_EQUAL(goldenSlime, jsonSlime);
+}
+
+void
+DistributorHostInfoReporterTest::noReportGeneratedIfDisabled()
+{
+ MockedLatencyStatisticsProvider latencyStatsProvider;
+ MockedMinReplicaProvider minReplicaProvider;
+ DistributorHostInfoReporter reporter(latencyStatsProvider,
+ minReplicaProvider);
+ reporter.enableReporting(false);
+
+ NodeStatsSnapshot snapshot;
+ snapshot.nodeToStats[0] = { makeOpStats(ms(10000), 3) };
+ snapshot.nodeToStats[5] = { makeOpStats(ms(25000), 7) };
+
+ latencyStatsProvider.returnedSnapshot = snapshot;
+
+ vespalib::Slime root;
+ util::reporterToSlime(reporter, root);
+ CPPUNIT_ASSERT_EQUAL(size_t(0), root.get().children());
+}
+
+} // distributor
+} // storage
+
diff --git a/storage/src/tests/distributor/distributortest.cpp b/storage/src/tests/distributor/distributortest.cpp
new file mode 100644
index 00000000000..b51c8dd3873
--- /dev/null
+++ b/storage/src/tests/distributor/distributortest.cpp
@@ -0,0 +1,691 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <boost/assign/std/vector.hpp> // for 'operator+=()'
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/idealstatemetricsset.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <vespa/storageapi/message/removelocation.h>
+#include <vespa/storageframework/defaultimplementation/thread/threadpoolimpl.h>
+#include <tests/distributor/distributortestutil.h>
+#include <vespa/storage/config/config-stor-distributormanager.h>
+#include <tests/common/dummystoragelink.h>
+
+namespace storage {
+
+namespace distributor {
+
+class Distributor_Test : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+ CPPUNIT_TEST_SUITE(Distributor_Test);
+ CPPUNIT_TEST(testOperationGeneration);
+ CPPUNIT_TEST(testOperationsGeneratedAndStartedWithoutDuplicates);
+ CPPUNIT_TEST(testRecoveryModeOnClusterStateChange);
+ CPPUNIT_TEST(testOperationsAreThrottled);
+ CPPUNIT_TEST_IGNORED(testRecoveryModeEntryResetsScanner);
+ CPPUNIT_TEST_IGNORED(testReprioritizeBucketOnMaintenanceReply);
+ CPPUNIT_TEST(testHandleUnknownMaintenanceReply);
+ CPPUNIT_TEST(testContainsTimeStatement);
+ CPPUNIT_TEST(testUpdateBucketDatabase);
+ CPPUNIT_TEST(testTickProcessesStatusRequests);
+ CPPUNIT_TEST(testMetricUpdateHookUpdatesPendingMaintenanceMetrics);
+ CPPUNIT_TEST(testPriorityConfigIsPropagatedToDistributorConfiguration);
+ CPPUNIT_TEST(testNoDbResurrectionForBucketNotOwnedInPendingState);
+ CPPUNIT_TEST(testAddedDbBucketsWithoutGcTimestampImplicitlyGetCurrentTime);
+ CPPUNIT_TEST(mergeStatsAreAccumulatedDuringDatabaseIteration);
+ CPPUNIT_TEST(statsGeneratedForPreemptedOperations);
+ CPPUNIT_TEST(hostInfoReporterConfigIsPropagatedToReporter);
+ CPPUNIT_TEST(replicaCountingModeIsConfiguredToTrustedByDefault);
+ CPPUNIT_TEST(replicaCountingModeConfigIsPropagatedToMetricUpdater);
+ CPPUNIT_TEST(bucketActivationIsEnabledByDefault);
+ CPPUNIT_TEST(bucketActivationConfigIsPropagatedToDistributorConfiguration);
+ CPPUNIT_TEST_SUITE_END();
+
+protected:
+ void testOperationGeneration();
+ void testOperationsGeneratedAndStartedWithoutDuplicates();
+ void testRecoveryModeOnClusterStateChange();
+ void testOperationsAreThrottled();
+ void testRecoveryModeEntryResetsScanner();
+ void testReprioritizeBucketOnMaintenanceReply();
+ void testHandleUnknownMaintenanceReply();
+ void testContainsTimeStatement();
+ void testUpdateBucketDatabase();
+ void testTickProcessesStatusRequests();
+ void testMetricUpdateHookUpdatesPendingMaintenanceMetrics();
+ void testPriorityConfigIsPropagatedToDistributorConfiguration();
+ void testNoDbResurrectionForBucketNotOwnedInPendingState();
+ void testAddedDbBucketsWithoutGcTimestampImplicitlyGetCurrentTime();
+ void mergeStatsAreAccumulatedDuringDatabaseIteration();
+ void statsGeneratedForPreemptedOperations();
+ void hostInfoReporterConfigIsPropagatedToReporter();
+ void replicaCountingModeIsConfiguredToTrustedByDefault();
+ void replicaCountingModeConfigIsPropagatedToMetricUpdater();
+ void bucketActivationIsEnabledByDefault();
+ void bucketActivationConfigIsPropagatedToDistributorConfiguration();
+
+public:
+ void setUp() {
+ createLinks();
+ };
+
+ void tearDown() {
+ close();
+ }
+
+private:
+ // Simple type aliases to make interfacing with certain utility functions
+ // easier. Note that this is only for readability and does not provide any
+ // added type safety.
+ using NodeCount = int;
+ using Redundancy = int;
+
+ using ConfigBuilder = vespa::config::content::core::StorDistributormanagerConfigBuilder;
+
+ void configureDistributor(const ConfigBuilder& config) {
+ getConfig().configure(config);
+ _distributor->enableNextConfig();
+ }
+
+ auto currentReplicaCountingMode() const noexcept {
+ return _distributor->_bucketDBMetricUpdater
+ .getMinimumReplicaCountingMode();
+ }
+
+ std::string testOp(api::StorageMessage* msg)
+ {
+ api::StorageMessage::SP msgPtr(msg);
+ _distributor->handleMessage(msgPtr);
+
+ std::string tmp = _sender.getCommands();
+ _sender.clear();
+ return tmp;
+ }
+
+ void tickDistributorNTimes(uint32_t n) {
+ for (uint32_t i = 0; i < n; ++i) {
+ tick();
+ }
+ }
+
+ typedef bool ResetTrusted;
+
+ std::string updateBucketDB(const std::string& firstState,
+ const std::string& secondState,
+ bool resetTrusted = false)
+ {
+ std::vector<std::string> states(toVector<std::string>(firstState, secondState));
+
+ for (uint32_t i = 0; i < states.size(); ++i) {
+ std::vector<uint16_t> removedNodes;
+ std::vector<BucketCopy> changedNodes;
+
+ vespalib::StringTokenizer tokenizer(states[i], ",");
+ for (uint32_t j = 0; j < tokenizer.size(); ++j) {
+ vespalib::StringTokenizer tokenizer2(tokenizer[j], ":");
+
+ bool trusted = false;
+ if (tokenizer2.size() > 2) {
+ trusted = true;
+ }
+
+ uint16_t node = atoi(tokenizer2[0].c_str());
+ if (tokenizer2[1] == "r") {
+ removedNodes.push_back(node);
+ } else {
+ uint32_t checksum = atoi(tokenizer2[1].c_str());
+ changedNodes.push_back(
+ BucketCopy(
+ i + 1,
+ node,
+ api::BucketInfo(
+ checksum,
+ checksum / 2,
+ checksum / 4)).setTrusted(trusted));
+ }
+ }
+
+ getExternalOperationHandler().removeNodesFromDB(document::BucketId(16, 1), removedNodes);
+
+ uint32_t flags(DatabaseUpdate::CREATE_IF_NONEXISTING
+ | (resetTrusted ? DatabaseUpdate::RESET_TRUSTED : 0));
+
+ getExternalOperationHandler().updateBucketDatabase(document::BucketId(16, 1),
+ changedNodes,
+ flags);
+ }
+
+ std::string retVal = dumpBucket(document::BucketId(16, 1));
+ getBucketDatabase().clear();
+ return retVal;
+ }
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(Distributor_Test);
+
+void
+Distributor_Test::testOperationGeneration()
+{
+ setupDistributor(Redundancy(1), NodeCount(1), "storage:1 distributor:1");
+
+ document::BucketId bid;
+ addNodesToBucketDB(document::BucketId(16, 1), "0=1/1/1/t");
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Remove"),
+ testOp(new api::RemoveCommand(
+ bid,
+ document::DocumentId("userdoc:m:1:foo"),
+ api::Timestamp(1234))));
+
+ api::CreateVisitorCommand* cmd = new api::CreateVisitorCommand("foo", "bar", "");
+ cmd->addBucketToBeVisited(document::BucketId(16, 1));
+ cmd->addBucketToBeVisited(document::BucketId());
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create"), testOp(cmd));
+}
+
+void
+Distributor_Test::testOperationsGeneratedAndStartedWithoutDuplicates()
+{
+ setupDistributor(Redundancy(1), NodeCount(1), "storage:1 distributor:1");
+
+ for (uint32_t i = 0; i < 6; ++i) {
+ addNodesToBucketDB(document::BucketId(16, i), "0=1");
+ }
+
+ tickDistributorNTimes(20);
+
+ CPPUNIT_ASSERT(!tick());
+
+ CPPUNIT_ASSERT_EQUAL(6, (int)_sender.commands.size());
+}
+
+void
+Distributor_Test::testRecoveryModeOnClusterStateChange()
+{
+ setupDistributor(Redundancy(1), NodeCount(2),
+ "storage:1 .0.s:d distributor:1");
+ _distributor->enableClusterState(
+ lib::ClusterState("storage:1 distributor:1"));
+
+ CPPUNIT_ASSERT(_distributor->isInRecoveryMode());
+ for (uint32_t i = 0; i < 3; ++i) {
+ addNodesToBucketDB(document::BucketId(16, i), "0=1");
+ }
+ for (int i = 0; i < 3; ++i) {
+ tick();
+ CPPUNIT_ASSERT(_distributor->isInRecoveryMode());
+ }
+ tick();
+ CPPUNIT_ASSERT(!_distributor->isInRecoveryMode());
+
+ _distributor->enableClusterState(lib::ClusterState("storage:2 distributor:1"));
+ CPPUNIT_ASSERT(_distributor->isInRecoveryMode());
+}
+
+void
+Distributor_Test::testOperationsAreThrottled()
+{
+ setupDistributor(Redundancy(1), NodeCount(1), "storage:1 distributor:1");
+ getConfig().setMinPendingMaintenanceOps(1);
+ getConfig().setMaxPendingMaintenanceOps(1);
+
+ for (uint32_t i = 0; i < 6; ++i) {
+ addNodesToBucketDB(document::BucketId(16, i), "0=1");
+ }
+ tickDistributorNTimes(20);
+ CPPUNIT_ASSERT_EQUAL(1, (int)_sender.commands.size());
+}
+
+void
+Distributor_Test::testRecoveryModeEntryResetsScanner()
+{
+ CPPUNIT_FAIL("TODO: refactor so this can be mocked and tested easily");
+}
+
+void
+Distributor_Test::testReprioritizeBucketOnMaintenanceReply()
+{
+ CPPUNIT_FAIL("TODO: refactor so this can be mocked and tested easily");
+}
+
+void
+Distributor_Test::testHandleUnknownMaintenanceReply()
+{
+ setupDistributor(Redundancy(1), NodeCount(1), "storage:1 distributor:1");
+
+ {
+ api::SplitBucketCommand::SP cmd(
+ new api::SplitBucketCommand(document::BucketId(16, 1234)));
+ api::SplitBucketReply::SP reply(new api::SplitBucketReply(*cmd));
+
+ CPPUNIT_ASSERT(_distributor->handleReply(reply));
+ }
+
+ {
+ // RemoveLocationReply must be treated as a maintenance reply since
+ // it's what GC is currently built around.
+ auto cmd = std::make_shared<api::RemoveLocationCommand>(
+ "false", document::BucketId(30, 1234));
+ auto reply = std::shared_ptr<api::StorageReply>(cmd->makeReply());
+ CPPUNIT_ASSERT(_distributor->handleReply(reply));
+ }
+}
+
+void
+Distributor_Test::testContainsTimeStatement()
+{
+ setupDistributor(Redundancy(1), NodeCount(1), "storage:1 distributor:1");
+
+ CPPUNIT_ASSERT_EQUAL(false, getConfig().containsTimeStatement(""));
+ CPPUNIT_ASSERT_EQUAL(false, getConfig().containsTimeStatement("testdoctype1"));
+ CPPUNIT_ASSERT_EQUAL(false, getConfig().containsTimeStatement("testdoctype1.headerfield > 42"));
+ CPPUNIT_ASSERT_EQUAL(true, getConfig().containsTimeStatement("testdoctype1.headerfield > now()"));
+ CPPUNIT_ASSERT_EQUAL(true, getConfig().containsTimeStatement("testdoctype1.headerfield > now() - 3600"));
+ CPPUNIT_ASSERT_EQUAL(true, getConfig().containsTimeStatement("testdoctype1.headerfield == now() - 3600"));
+}
+
+void
+Distributor_Test::testUpdateBucketDatabase()
+{
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:3"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001) : "
+ "node(idx=0,crc=0x1c8,docs=228/228,bytes=114/114,trusted=true,active=false), "
+ "node(idx=1,crc=0x1c8,docs=228/228,bytes=114/114,trusted=true,active=false)"
+ ),
+ updateBucketDB("0:456,1:456,2:789", "2:r"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001) : "
+ "node(idx=0,crc=0x1c8,docs=228/228,bytes=114/114,trusted=true,active=false), "
+ "node(idx=2,crc=0x1c8,docs=228/228,bytes=114/114,trusted=true,active=false), "
+ "node(idx=1,crc=0x1c8,docs=228/228,bytes=114/114,trusted=true,active=false)"
+ ),
+ updateBucketDB("0:456,1:456", "2:456"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001) : "
+ "node(idx=0,crc=0x315,docs=394/394,bytes=197/197,trusted=false,active=false), "
+ "node(idx=2,crc=0x14d,docs=166/166,bytes=83/83,trusted=false,active=false), "
+ "node(idx=1,crc=0x34a,docs=421/421,bytes=210/210,trusted=false,active=false)"
+ ),
+ updateBucketDB("0:456:t,1:456:t,2:123", "0:789,1:842,2:333"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001) : "
+ "node(idx=0,crc=0x315,docs=394/394,bytes=197/197,trusted=true,active=false), "
+ "node(idx=2,crc=0x14d,docs=166/166,bytes=83/83,trusted=false,active=false), "
+ "node(idx=1,crc=0x315,docs=394/394,bytes=197/197,trusted=true,active=false)"
+ ),
+ updateBucketDB("0:456:t,1:456:t,2:123", "0:789,1:789,2:333"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001) : "
+ "node(idx=2,crc=0x14d,docs=166/166,bytes=83/83,trusted=true,active=false)"),
+ updateBucketDB("0:456:t,1:456:t", "0:r,1:r,2:333"));
+
+ // Copies are in sync so should still be trusted even if explicitly reset.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001) : "
+ "node(idx=0,crc=0x1c8,docs=228/228,bytes=114/114,trusted=true,active=false), "
+ "node(idx=2,crc=0x1c8,docs=228/228,bytes=114/114,trusted=true,active=false), "
+ "node(idx=1,crc=0x1c8,docs=228/228,bytes=114/114,trusted=true,active=false)"
+ ),
+ updateBucketDB("0:456,1:456", "2:456", ResetTrusted(true)));
+
+ // When resetting, first inserted copy should not end up as implicitly trusted.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001) : "
+ "node(idx=0,crc=0x1c8,docs=228/228,bytes=114/114,trusted=false,active=false), "
+ "node(idx=2,crc=0x14d,docs=166/166,bytes=83/83,trusted=false,active=false)"
+ ),
+ updateBucketDB("0:456",
+ "2:333",
+ ResetTrusted(true)));
+}
+
+namespace {
+
+using namespace framework::defaultimplementation;
+
+class StatusRequestThread : public framework::Runnable
+{
+ StatusReporterDelegate& _reporter;
+ std::string _result;
+public:
+ StatusRequestThread(StatusReporterDelegate& reporter)
+ : _reporter(reporter)
+ {}
+ void run(framework::ThreadHandle&) {
+ framework::HttpUrlPath path("/distributor?page=buckets");
+ std::ostringstream stream;
+ _reporter.reportStatus(stream, path);
+ _result = stream.str();
+ }
+
+ std::string getResult() const {
+ return _result;
+ }
+};
+
+}
+
+void
+Distributor_Test::testTickProcessesStatusRequests()
+{
+ setupDistributor(Redundancy(1), NodeCount(1), "storage:1 distributor:1");
+
+ addNodesToBucketDB(document::BucketId(16, 1), "0=1/1/1/t");
+
+ // Must go via delegate since reportStatus is now just a rendering
+ // function and not a request enqueuer (see Distributor::handleStatusRequest).
+ StatusRequestThread thread(_distributor->_distributorStatusDelegate);
+ FakeClock clock;
+ ThreadPoolImpl pool(clock);
+
+ uint64_t tickWaitMs = 5;
+ uint64_t tickMaxProcessTime = 5000;
+ int ticksBeforeWait = 1;
+ framework::Thread::UP tp(pool.startThread(
+ thread, "statustest", tickWaitMs, tickMaxProcessTime, ticksBeforeWait));
+
+ while (true) {
+ FastOS_Thread::Sleep(1);
+ framework::TickingLockGuard guard(
+ _distributor->_threadPool.freezeCriticalTicks());
+ if (!_distributor->_statusToDo.empty()) break;
+
+ }
+ CPPUNIT_ASSERT(tick());
+
+ tp->interruptAndJoin(0);
+
+ CPPUNIT_ASSERT_CONTAIN("BucketId(0x4000000000000001)", thread.getResult());
+}
+
+void
+Distributor_Test::testMetricUpdateHookUpdatesPendingMaintenanceMetrics()
+{
+ setupDistributor(Redundancy(2), NodeCount(2), "storage:2 distributor:1");
+ // To ensure we count all operations, not just those fitting within the
+ // pending window.
+ getConfig().setMinPendingMaintenanceOps(1);
+ getConfig().setMaxPendingMaintenanceOps(1);
+
+ // 1 bucket must be merged, 1 must be split, 1 should be activated.
+ addNodesToBucketDB(document::BucketId(16, 1), "0=1/1/1/t/a,1=2/2/2");
+ addNodesToBucketDB(document::BucketId(16, 2),
+ "0=100/10000000/200000/t/a,1=100/10000000/200000/t");
+ addNodesToBucketDB(document::BucketId(16, 3),
+ "0=200/300/400/t,1=200/300/400/t");
+
+ // Go many full scanner rounds to check that metrics are set, not
+ // added to existing.
+ tickDistributorNTimes(50);
+
+ // By this point, no hook has been called so the metrics have not been
+ // set.
+ typedef MaintenanceOperation MO;
+ {
+ const IdealStateMetricSet& metrics(getIdealStateManager().getMetrics());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0),
+ metrics.operations[MO::MERGE_BUCKET]
+ ->pending.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), metrics.operations[MO::SPLIT_BUCKET]
+ ->pending.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0),
+ metrics.operations[MO::SET_BUCKET_STATE]
+ ->pending.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), metrics.operations[MO::DELETE_BUCKET]
+ ->pending.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), metrics.operations[MO::JOIN_BUCKET]
+ ->pending.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0),
+ metrics.operations[MO::GARBAGE_COLLECTION]
+ ->pending.getLast());
+ }
+
+ // Force trigger update hook
+ vespalib::Monitor l;
+ _distributor->_metricUpdateHook.updateMetrics(metrics::MetricLockGuard(l));
+ // Metrics should now be updated to the last complete working state
+ {
+ const IdealStateMetricSet& metrics(getIdealStateManager().getMetrics());
+ CPPUNIT_ASSERT_EQUAL(int64_t(1),
+ metrics.operations[MO::MERGE_BUCKET]
+ ->pending.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(1), metrics.operations[MO::SPLIT_BUCKET]
+ ->pending.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(1),
+ metrics.operations[MO::SET_BUCKET_STATE]
+ ->pending.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), metrics.operations[MO::DELETE_BUCKET]
+ ->pending.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0), metrics.operations[MO::JOIN_BUCKET]
+ ->pending.getLast());
+ CPPUNIT_ASSERT_EQUAL(int64_t(0),
+ metrics.operations[MO::GARBAGE_COLLECTION]
+ ->pending.getLast());
+ }
+}
+
+void
+Distributor_Test::testPriorityConfigIsPropagatedToDistributorConfiguration()
+{
+ using namespace vespa::config::content::core;
+ using ConfigBuilder = StorDistributormanagerConfigBuilder;
+
+ setupDistributor(Redundancy(2), NodeCount(2), "storage:2 distributor:1");
+
+ ConfigBuilder builder;
+ builder.priorityMergeMoveToIdealNode = 1;
+ builder.priorityMergeOutOfSyncCopies = 2;
+ builder.priorityMergeTooFewCopies = 3;
+ builder.priorityActivateNoExistingActive = 4;
+ builder.priorityActivateWithExistingActive = 5;
+ builder.priorityDeleteBucketCopy = 6;
+ builder.priorityJoinBuckets = 7;
+ builder.prioritySplitDistributionBits = 8;
+ builder.prioritySplitLargeBucket = 9;
+ builder.prioritySplitInconsistentBucket = 10;
+ builder.priorityGarbageCollection = 11;
+
+ getConfig().configure(builder);
+
+ const DistributorConfiguration::MaintenancePriorities& mp(
+ getConfig().getMaintenancePriorities());
+ CPPUNIT_ASSERT_EQUAL(1, static_cast<int>(mp.mergeMoveToIdealNode));
+ CPPUNIT_ASSERT_EQUAL(2, static_cast<int>(mp.mergeOutOfSyncCopies));
+ CPPUNIT_ASSERT_EQUAL(3, static_cast<int>(mp.mergeTooFewCopies));
+ CPPUNIT_ASSERT_EQUAL(4, static_cast<int>(mp.activateNoExistingActive));
+ CPPUNIT_ASSERT_EQUAL(5, static_cast<int>(mp.activateWithExistingActive));
+ CPPUNIT_ASSERT_EQUAL(6, static_cast<int>(mp.deleteBucketCopy));
+ CPPUNIT_ASSERT_EQUAL(7, static_cast<int>(mp.joinBuckets));
+ CPPUNIT_ASSERT_EQUAL(8, static_cast<int>(mp.splitDistributionBits));
+ CPPUNIT_ASSERT_EQUAL(9, static_cast<int>(mp.splitLargeBucket));
+ CPPUNIT_ASSERT_EQUAL(10, static_cast<int>(mp.splitInconsistentBucket));
+ CPPUNIT_ASSERT_EQUAL(11, static_cast<int>(mp.garbageCollection));
+}
+
+void
+Distributor_Test::testNoDbResurrectionForBucketNotOwnedInPendingState()
+{
+ setupDistributor(Redundancy(1), NodeCount(10), "storage:2 distributor:2");
+ lib::ClusterState newState("storage:10 distributor:10");
+ auto stateCmd = std::make_shared<api::SetSystemStateCommand>(newState);
+ // Force newState into being the pending state. According to the initial
+ // state we own the bucket, but according to the pending state, we do
+ // not. This must be handled correctly by the database update code.
+ getBucketDBUpdater().onSetSystemState(stateCmd);
+
+ document::BucketId nonOwnedBucket(16, 3);
+ CPPUNIT_ASSERT(!getBucketDBUpdater()
+ .checkOwnershipInPendingState(nonOwnedBucket).isOwned());
+ CPPUNIT_ASSERT(!getBucketDBUpdater().getDistributorComponent()
+ .checkOwnershipInPendingAndCurrentState(nonOwnedBucket)
+ .isOwned());
+
+ std::vector<BucketCopy> copies;
+ copies.emplace_back(1234, 0, api::BucketInfo(0x567, 1, 2));
+ getExternalOperationHandler().updateBucketDatabase(nonOwnedBucket, copies,
+ DatabaseUpdate::CREATE_IF_NONEXISTING);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("NONEXISTING"),
+ dumpBucket(nonOwnedBucket));
+}
+
+void
+Distributor_Test::testAddedDbBucketsWithoutGcTimestampImplicitlyGetCurrentTime()
+{
+ setupDistributor(Redundancy(1), NodeCount(10), "storage:2 distributor:2");
+ getClock().setAbsoluteTimeInSeconds(101234);
+ document::BucketId bucket(16, 7654);
+
+ std::vector<BucketCopy> copies;
+ copies.emplace_back(1234, 0, api::BucketInfo(0x567, 1, 2));
+ getExternalOperationHandler().updateBucketDatabase(bucket, copies,
+ DatabaseUpdate::CREATE_IF_NONEXISTING);
+ BucketDatabase::Entry e(getBucket(bucket));
+ CPPUNIT_ASSERT_EQUAL(uint32_t(101234), e->getLastGarbageCollectionTime());
+}
+
+
+void
+Distributor_Test::mergeStatsAreAccumulatedDuringDatabaseIteration()
+{
+ setupDistributor(Redundancy(2), NodeCount(3), "storage:3 distributor:1");
+ // Copies out of sync. Not possible for distributor to _reliably_ tell
+ // which direction(s) data will flow, so for simplicity assume that we
+ // must sync both copies.
+ // Note that we mark certain copies as active to prevent the bucketstate
+ // checker from pre-empting the merges.
+ // -> syncing[0] += 1, syncing[2] += 1
+ addNodesToBucketDB(document::BucketId(16, 1), "0=1/1/1/t/a,2=2/2/2");
+ // Must add missing node 2 for bucket
+ // -> copyingOut[0] += 1, copyingIn[2] += 1
+ addNodesToBucketDB(document::BucketId(16, 2), "0=1/1/1/t/a");
+ // Moving from non-ideal node 1 to ideal node 2. Both nodes 0 and 1 will
+ // be involved in this merge, but only node 1 will be tagged as source only
+ // (i.e. to be deleted after the merge is completed).
+ // -> copyingOut[0] += 1, movingOut[1] += 1, copyingIn[2] += 1
+ addNodesToBucketDB(document::BucketId(16, 3), "0=2/2/2/t/a,1=2/2/2/t");
+
+ // Go many full scanner rounds to check that stats are set, not
+ // added to existing.
+ tickDistributorNTimes(50);
+
+ const auto& stats(_distributor->_maintenanceStats);
+ {
+ NodeMaintenanceStats wanted;
+ wanted.syncing = 1;
+ wanted.copyingOut = 2;
+ CPPUNIT_ASSERT_EQUAL(wanted, stats.perNodeStats.forNode(0));
+ }
+ {
+ NodeMaintenanceStats wanted;
+ wanted.movingOut = 1;
+ CPPUNIT_ASSERT_EQUAL(wanted, stats.perNodeStats.forNode(1));
+ }
+ {
+ NodeMaintenanceStats wanted;
+ wanted.syncing = 1;
+ wanted.copyingIn = 2;
+ CPPUNIT_ASSERT_EQUAL(wanted, stats.perNodeStats.forNode(2));
+ }
+}
+
+/**
+ * Since maintenance operations are prioritized differently, activation
+ * pre-empts merging and other ops. If this also implies pre-empting running
+ * their state checkers at all, we won't get any statistics from any other
+ * operations for the bucket.
+ */
+void
+Distributor_Test::statsGeneratedForPreemptedOperations()
+{
+ setupDistributor(Redundancy(2), NodeCount(2), "storage:2 distributor:1");
+ // For this test it suffices to have a single bucket with multiple aspects
+ // wrong about it. In this case, let a bucket be both out of sync _and_
+ // missing an active copy. This _should_ give a statistic with both nodes 0
+ // and 1 requiring a sync. If instead merge stats generation is preempted
+ // by activation, we'll see no merge stats at all.
+ addNodesToBucketDB(document::BucketId(16, 1), "0=1/1/1,1=2/2/2");
+ tickDistributorNTimes(50);
+ const auto& stats(_distributor->_maintenanceStats);
+ {
+ NodeMaintenanceStats wanted;
+ wanted.syncing = 1;
+ CPPUNIT_ASSERT_EQUAL(wanted, stats.perNodeStats.forNode(0));
+ }
+ {
+ NodeMaintenanceStats wanted;
+ wanted.syncing = 1;
+ CPPUNIT_ASSERT_EQUAL(wanted, stats.perNodeStats.forNode(1));
+ }
+}
+
+void
+Distributor_Test::hostInfoReporterConfigIsPropagatedToReporter()
+{
+ setupDistributor(Redundancy(2), NodeCount(2), "storage:2 distributor:1");
+
+ // Default is enabled=true.
+ CPPUNIT_ASSERT(_distributor->_hostInfoReporter.isReportingEnabled());
+
+ ConfigBuilder builder;
+ builder.enableHostInfoReporting = false;
+ configureDistributor(builder);
+
+ CPPUNIT_ASSERT(!_distributor->_hostInfoReporter.isReportingEnabled());
+}
+
+void
+Distributor_Test::replicaCountingModeIsConfiguredToTrustedByDefault()
+{
+ setupDistributor(Redundancy(2), NodeCount(2), "storage:2 distributor:1");
+ CPPUNIT_ASSERT_EQUAL(ConfigBuilder::TRUSTED, currentReplicaCountingMode());
+}
+
+void
+Distributor_Test::replicaCountingModeConfigIsPropagatedToMetricUpdater()
+{
+ setupDistributor(Redundancy(2), NodeCount(2), "storage:2 distributor:1");
+ ConfigBuilder builder;
+ builder.minimumReplicaCountingMode = ConfigBuilder::ANY;
+ configureDistributor(builder);
+ CPPUNIT_ASSERT_EQUAL(ConfigBuilder::ANY, currentReplicaCountingMode());
+}
+
+void
+Distributor_Test::bucketActivationIsEnabledByDefault()
+{
+ setupDistributor(Redundancy(2), NodeCount(2), "storage:2 distributor:1");
+ CPPUNIT_ASSERT(getConfig().isBucketActivationDisabled() == false);
+}
+
+void
+Distributor_Test::bucketActivationConfigIsPropagatedToDistributorConfiguration()
+{
+ using namespace vespa::config::content::core;
+ using ConfigBuilder = StorDistributormanagerConfigBuilder;
+
+ setupDistributor(Redundancy(2), NodeCount(2), "storage:2 distributor:1");
+
+ ConfigBuilder builder;
+ builder.disableBucketActivation = true;
+ getConfig().configure(builder);
+
+ CPPUNIT_ASSERT(getConfig().isBucketActivationDisabled());
+}
+
+}
+
+}
diff --git a/storage/src/tests/distributor/distributortestutil.cpp b/storage/src/tests/distributor/distributortestutil.cpp
new file mode 100644
index 00000000000..c2d878a253d
--- /dev/null
+++ b/storage/src/tests/distributor/distributortestutil.cpp
@@ -0,0 +1,298 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <ctype.h>
+#include <vespa/document/base/testdocman.h>
+#include <vespa/storageframework/defaultimplementation/memory/nomemorymanager.h>
+#include <tests/distributor/distributortestutil.h>
+#include <vespa/vespalib/text/stringtokenizer.h>
+
+namespace storage {
+
+namespace distributor {
+
+void
+DistributorTestUtil::createLinks()
+{
+ _node.reset(new TestDistributorApp(_config.getConfigId()));
+ _threadPool = framework::TickingThreadPool::createDefault("distributor");
+ _distributor.reset(new Distributor(
+ _node->getComponentRegister(),
+ *_threadPool,
+ *this,
+ true,
+ _hostInfo,
+ &_messageSender));
+ _component.reset(new storage::DistributorComponent(_node->getComponentRegister(), "distrtestutil"));
+};
+
+void
+DistributorTestUtil::setupDistributor(int redundancy,
+ int nodeCount,
+ const std::string& systemState,
+ uint32_t earlyReturn,
+ bool requirePrimaryToBeWritten)
+{
+ lib::Distribution::DistributionConfig config(
+ lib::Distribution::getDefaultDistributionConfig(
+ redundancy, nodeCount));
+ config.redundancy = redundancy;
+ config.initialRedundancy = earlyReturn;
+ config.ensurePrimaryPersisted = requirePrimaryToBeWritten;
+ lib::Distribution* distribution = new lib::Distribution(config);
+ _node->getComponentRegister().setDistribution(
+ lib::Distribution::SP(distribution));
+ _distributor->enableClusterState(lib::ClusterState(systemState));
+}
+
+void
+DistributorTestUtil::setRedundancy(uint32_t redundancy)
+{
+ _node->getComponentRegister().setDistribution(lib::Distribution::SP(
+ new lib::Distribution(
+ lib::Distribution::getDefaultDistributionConfig(
+ redundancy, 100))));
+ _distributor->storageDistributionChanged();
+}
+
+void
+DistributorTestUtil::setTypeRepo(const document::DocumentTypeRepo::SP &repo)
+{
+ _node->getComponentRegister().setDocumentTypeRepo(repo);
+}
+
+void
+DistributorTestUtil::close()
+{
+ _component.reset(0);
+ if (_distributor.get()) {
+ _distributor->onClose();
+ }
+ _sender.clear();
+ _node.reset(0);
+ _config = getStandardConfig(false);
+}
+
+namespace {
+ std::string dumpVector(const std::vector<uint16_t>& vec) {
+ std::ostringstream ost;
+ for (uint32_t i = 0; i < vec.size(); ++i) {
+ if (i != 0) {
+ ost << ",";
+ }
+ ost << vec[i];
+ }
+ return ost.str();
+ }
+}
+
+std::string
+DistributorTestUtil::getNodes(document::BucketId id)
+{
+ BucketDatabase::Entry entry = getBucket(id);
+
+ if (!entry.valid()) {
+ return id.toString();
+ } else {
+ std::vector<uint16_t> nodes = entry->getNodes();
+ std::sort(nodes.begin(), nodes.end());
+
+ std::ostringstream ost;
+ ost << id << ": " << dumpVector(nodes);
+ return ost.str();
+ }
+}
+
+std::string
+DistributorTestUtil::getIdealStr(document::BucketId id, const lib::ClusterState& state)
+{
+ if (!getExternalOperationHandler().ownsBucketInState(state, id)) {
+ return id.toString();
+ }
+
+ std::vector<uint16_t> nodes;
+ _component->getDistribution()->getIdealNodes(lib::NodeType::STORAGE,
+ state,
+ id,
+ nodes);
+ std::sort(nodes.begin(), nodes.end());
+ std::ostringstream ost;
+ ost << id << ": " << dumpVector(nodes);
+ return ost.str();
+}
+
+void
+DistributorTestUtil::addIdealNodes(const lib::ClusterState& state,
+ const document::BucketId& id)
+{
+ BucketDatabase::Entry entry = getBucket(id);
+
+ if (!entry.valid()) {
+ entry = BucketDatabase::Entry(id);
+ }
+
+ std::vector<uint16_t> res;
+ assert(_component.get());
+ _component->getDistribution()->getIdealNodes(lib::NodeType::STORAGE,
+ state,
+ id,
+ res);
+
+ for (uint32_t i = 0; i < res.size(); ++i) {
+ if (state.getNodeState(lib::Node(lib::NodeType::STORAGE, res[i])).getState() !=
+ lib::State::MAINTENANCE)
+ {
+ entry->addNode(BucketCopy(0, res[i], api::BucketInfo(1,1,1)),
+ toVector<uint16_t>(0));
+ }
+ }
+
+ getBucketDatabase().update(entry);
+}
+
+void
+DistributorTestUtil::addNodesToBucketDB(const document::BucketId& id,
+ const std::string& nodeStr)
+{
+ BucketDatabase::Entry entry = getBucket(id);
+
+ if (!entry.valid()) {
+ entry = BucketDatabase::Entry(id);
+ }
+
+ entry->clear();
+
+ vespalib::StringTokenizer tokenizer(nodeStr, ",");
+ for (uint32_t i = 0; i < tokenizer.size(); ++i) {
+ vespalib::StringTokenizer tok2(tokenizer[i], "=");
+ vespalib::StringTokenizer tok3(tok2[1], "/");
+
+ api::BucketInfo info(atoi(tok3[0].c_str()),
+ atoi(tok3.size() > 1 ? tok3[1].c_str() : tok3[0].c_str()),
+ atoi(tok3.size() > 2 ? tok3[2].c_str() : tok3[0].c_str()));
+
+ size_t flagsIdx = 3;
+
+ // Meta info override? For simplicity, require both meta count and size
+ if (tok3.size() > 4 && (!tok3[3].empty() && isdigit(tok3[3][0]))) {
+ info.setMetaCount(atoi(tok3[3].c_str()));
+ info.setUsedFileSize(atoi(tok3[4].c_str()));
+ flagsIdx = 5;
+ }
+
+ if ((tok3.size() > flagsIdx + 1) && tok3[flagsIdx + 1] == "a") {
+ info.setActive();
+ } else {
+ info.setActive(false);
+ }
+ if ((tok3.size() > flagsIdx + 2) && tok3[flagsIdx + 2] == "r") {
+ info.setReady();
+ } else {
+ info.setReady(false);
+ }
+
+ uint16_t idx = atoi(tok2[0].c_str());
+ BucketCopy node(
+ 0,
+ idx,
+ info);
+
+ // Allow user to manually override trusted and active.
+ if (tok3.size() > flagsIdx && tok3[flagsIdx] == "t") {
+ node.setTrusted();
+ }
+
+ entry->addNodeManual(node);
+ }
+
+ getBucketDatabase().update(entry);
+}
+
+void
+DistributorTestUtil::removeFromBucketDB(const document::BucketId& id)
+{
+ getBucketDatabase().remove(id);
+}
+
+void
+DistributorTestUtil::addIdealNodes(const document::BucketId& id)
+{
+ addIdealNodes(getExternalOperationHandler().getClusterState(), id);
+}
+
+void
+DistributorTestUtil::insertBucketInfo(document::BucketId id,
+ uint16_t node,
+ uint32_t checksum,
+ uint32_t count,
+ uint32_t size,
+ bool trusted,
+ bool active)
+{
+ api::BucketInfo info(checksum, count, size);
+ insertBucketInfo(id, node, info, trusted, active);
+}
+
+void
+DistributorTestUtil::insertBucketInfo(document::BucketId id,
+ uint16_t node,
+ const api::BucketInfo& info,
+ bool trusted,
+ bool active)
+{
+ BucketDatabase::Entry entry = getBucketDatabase().get(id);
+ if (!entry.valid()) {
+ entry = BucketDatabase::Entry(id, BucketInfo());
+ }
+
+ api::BucketInfo info2(info);
+ if (active) {
+ info2.setActive();
+ }
+ BucketCopy copy(getExternalOperationHandler().getUniqueTimestamp(), node, info2);
+
+ entry->addNode(copy.setTrusted(trusted), toVector<uint16_t>(0));
+
+ getBucketDatabase().update(entry);
+}
+
+std::string
+DistributorTestUtil::dumpBucket(const document::BucketId& bid)
+{
+ return getBucketDatabase().get(bid).toString();
+}
+
+void
+DistributorTestUtil::sendReply(Operation& op,
+ int idx,
+ api::ReturnCode::Result result)
+{
+ if (idx == -1) {
+ idx = _sender.commands.size() - 1;
+ }
+ assert(idx >= 0 && idx < static_cast<int>(_sender.commands.size()));
+
+ std::shared_ptr<api::StorageCommand> cmd = _sender.commands[idx];
+ api::StorageReply::SP reply(cmd->makeReply().release());
+ reply->setResult(result);
+ op.receive(_sender, reply);
+}
+
+BucketDatabase::Entry
+DistributorTestUtil::getBucket(const document::BucketId& bId) const
+{
+ return _distributor->getBucketDatabase().get(bId);
+}
+
+void
+DistributorTestUtil::disableBucketActivationInConfig(bool disable)
+{
+ vespa::config::content::core::StorDistributormanagerConfigBuilder config;
+ config.disableBucketActivation = disable;
+ getConfig().configure(config);
+}
+
+}
+
+}
+
+
diff --git a/storage/src/tests/distributor/distributortestutil.h b/storage/src/tests/distributor/distributortestutil.h
new file mode 100644
index 00000000000..43b56859d0d
--- /dev/null
+++ b/storage/src/tests/distributor/distributortestutil.h
@@ -0,0 +1,200 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storageframework/defaultimplementation/clock/fakeclock.h>
+#include <vespa/storage/common/hostreporter/hostinfo.h>
+#include <vespa/storage/distributor/distributor.h>
+#include <vespa/storage/frameworkimpl/component/distributorcomponentregisterimpl.h>
+#include <vespa/storage/storageutil/utils.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/distributor/messagesenderstub.h>
+#include <vespa/storageapi/message/state.h>
+#include <tests/common/testhelper.h>
+
+namespace storage {
+
+namespace distributor {
+
+class DistributorTestUtil : private DoneInitializeHandler
+{
+public:
+ DistributorTestUtil()
+ : _messageSender(_sender, _senderDown)
+ {
+ _config = getStandardConfig(false);
+ }
+ virtual ~DistributorTestUtil() {};
+
+ /**
+ * Sets up the storage link chain.
+ */
+ void createLinks();
+ void setTypeRepo(const document::DocumentTypeRepo::SP &repo);
+
+ void close();
+
+ /**
+ * Returns a string with the nodes currently stored in the bucket
+ * database for the given bucket.
+ */
+ std::string getNodes(document::BucketId id);
+
+ /**
+ * Returns a string with the ideal state nodes for the given bucket.
+ */
+ std::string getIdealStr(document::BucketId id, const lib::ClusterState& state);
+
+ /**
+ * Adds the ideal nodes for the given bucket and the given cluster state
+ * to the bucket database.
+ */
+ void addIdealNodes(const lib::ClusterState& state, const document::BucketId& id);
+
+ /**
+ * Adds all the ideal nodes for the given bucket to the bucket database.
+ */
+ void addIdealNodes(const document::BucketId& id);
+
+ /**
+ * Parses the given string to a set of node => bucket info data,
+ * and inserts them as nodes in the given bucket.
+ * Format:
+ * "node1=checksum/docs/size,node2=checksum/docs/size"
+ */
+ void addNodesToBucketDB(const document::BucketId& id, const std::string& nodeStr);
+
+ /**
+ * Removes the given bucket from the bucket database.
+ */
+ void removeFromBucketDB(const document::BucketId& id);
+
+ /**
+ * Inserts the given bucket information for the given bucket and node in
+ * the bucket database.
+ */
+ void insertBucketInfo(document::BucketId id,
+ uint16_t node,
+ uint32_t checksum,
+ uint32_t count,
+ uint32_t size,
+ bool trusted = false,
+ bool active = false);
+
+ /**
+ * Inserts the given bucket information for the given bucket and node in
+ * the bucket database.
+ */
+ void insertBucketInfo(document::BucketId id,
+ uint16_t node,
+ const api::BucketInfo& info,
+ bool trusted = false,
+ bool active = false);
+
+ std::string dumpBucket(const document::BucketId& bucket);
+
+ /**
+ * Replies to message idx sent upwards with the given result code.
+ * If idx = -1, replies to the last command received upwards.
+ */
+ void sendReply(Operation& op,
+ int idx = -1,
+ api::ReturnCode::Result result = api::ReturnCode::OK);
+
+ BucketDBUpdater& getBucketDBUpdater() {
+ return _distributor->_bucketDBUpdater;
+ }
+ IdealStateManager& getIdealStateManager() {
+ return _distributor->_idealStateManager;
+ }
+ ExternalOperationHandler& getExternalOperationHandler() {
+ return _distributor->_externalOperationHandler;
+ }
+
+ Distributor& getDistributor() {
+ return *_distributor;
+ }
+
+ bool tick() {
+ framework::ThreadWaitInfo res(
+ framework::ThreadWaitInfo::NO_MORE_CRITICAL_WORK_KNOWN);
+ {
+ framework::TickingLockGuard lock(
+ _distributor->_threadPool.freezeCriticalTicks());
+ res.merge(_distributor->doCriticalTick(0));
+ }
+ res.merge(_distributor->doNonCriticalTick(0));
+ return !res.waitWanted();
+ }
+
+ DistributorConfiguration& getConfig() {
+ return const_cast<DistributorConfiguration&>(_distributor->getConfig());
+ }
+
+ vdstestlib::DirConfig& getDirConfig() {
+ return _config;
+ }
+
+ BucketDatabase& getBucketDatabase() { return _distributor->getBucketDatabase(); }
+
+ framework::defaultimplementation::FakeClock& getClock() { return _node->getClock(); }
+ DistributorComponentRegister& getComponentRegister() { return _node->getComponentRegister(); }
+ DistributorComponentRegisterImpl& getComponentRegisterImpl() { return _node->getComponentRegister(); }
+
+ StorageComponent& getComponent() {
+ if (_component.get() == 0) {
+ _component.reset(new storage::DistributorComponent(
+ _node->getComponentRegister(), "distributor_test_utils"));
+ }
+ return *_component;
+ }
+
+ void setupDistributor(int redundancy,
+ int nodeCount,
+ const std::string& systemState,
+ uint32_t earlyReturn = false,
+ bool requirePrimaryToBeWritten = true);
+
+ void setRedundancy(uint32_t redundancy);
+
+ virtual void notifyDoneInitializing() {}
+
+ // Must implement this for storage server interface for now
+ virtual api::Timestamp getUniqueTimestamp() {
+ return _component->getUniqueTimestamp();
+ }
+
+ void disableBucketActivationInConfig(bool disable);
+
+ BucketDatabase::Entry getBucket(const document::BucketId& bId) const;
+
+protected:
+ vdstestlib::DirConfig _config;
+ std::unique_ptr<TestDistributorApp> _node;
+ framework::TickingThreadPool::UP _threadPool;
+ std::unique_ptr<Distributor> _distributor;
+ std::unique_ptr<storage::DistributorComponent> _component;
+ MessageSenderStub _sender;
+ MessageSenderStub _senderDown;
+ HostInfo _hostInfo;
+
+ struct MessageSenderImpl : public ChainedMessageSender {
+ MessageSenderStub& _sender;
+ MessageSenderStub& _senderDown;
+ MessageSenderImpl(MessageSenderStub& up, MessageSenderStub& down)
+ : _sender(up), _senderDown(down) {}
+
+ void sendUp(const std::shared_ptr<api::StorageMessage>& msg) {
+ _sender.send(msg);
+ }
+ void sendDown(const std::shared_ptr<api::StorageMessage>& msg) {
+ _senderDown.send(msg);
+ }
+ };
+ MessageSenderImpl _messageSender;
+};
+
+}
+
+}
+
diff --git a/storage/src/tests/distributor/externaloperationhandlertest.cpp b/storage/src/tests/distributor/externaloperationhandlertest.cpp
new file mode 100644
index 00000000000..ce8149b4bac
--- /dev/null
+++ b/storage/src/tests/distributor/externaloperationhandlertest.cpp
@@ -0,0 +1,176 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+
+#include <tests/distributor/distributortestutil.h>
+#include <vespa/storage/distributor/externaloperationhandler.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+namespace storage {
+namespace distributor {
+
+class ExternalOperationHandlerTest : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+ CPPUNIT_TEST_SUITE(ExternalOperationHandlerTest);
+ CPPUNIT_TEST(testBucketSplitMask);
+ CPPUNIT_TEST(testOperationRejectedOnWrongDistribution);
+ CPPUNIT_TEST(testOperationRejectedOnPendingWrongDistribution);
+ CPPUNIT_TEST_SUITE_END();
+
+ document::BucketId findNonOwnedUserBucketInState(vespalib::stringref state);
+ document::BucketId findOwned1stNotOwned2ndInStates(
+ vespalib::stringref state1,
+ vespalib::stringref state2);
+
+ std::shared_ptr<api::StorageMessage> makeGetCommandForUser(uint64_t id);
+
+protected:
+ void testBucketSplitMask();
+ void testOperationRejectedOnWrongDistribution();
+ void testOperationRejectedOnPendingWrongDistribution();
+
+public:
+ void tearDown() {
+ close();
+ }
+
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(ExternalOperationHandlerTest);
+
+void
+ExternalOperationHandlerTest::testBucketSplitMask()
+{
+ {
+ createLinks();
+ getDirConfig().getConfig("stor-distributormanager").set("minsplitcount", "16");
+
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 0xffff),
+ getExternalOperationHandler().getBucketId(document::DocumentId(
+ vespalib::make_string("userdoc:ns:%d::", 0xffff))
+ ).stripUnused());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 0),
+ getExternalOperationHandler().getBucketId(document::DocumentId(
+ vespalib::make_string("userdoc:ns:%d::", 0x10000))
+ ).stripUnused());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 0xffff),
+ getExternalOperationHandler().getBucketId(document::DocumentId(
+ vespalib::make_string("userdoc:ns:%d::", 0xffff))
+ ).stripUnused());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 0x100),
+ getExternalOperationHandler().getBucketId(document::DocumentId(
+ vespalib::make_string("userdoc:ns:%d::", 0x100))
+ ).stripUnused());
+ close();
+ }
+ {
+ getDirConfig().getConfig("stor-distributormanager").set("minsplitcount", "20");
+ createLinks();
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(20, 0x11111),
+ getExternalOperationHandler().getBucketId(document::DocumentId(
+ vespalib::make_string("userdoc:ns:%d::", 0x111111))
+ ).stripUnused());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(20, 0x22222),
+ getExternalOperationHandler().getBucketId(document::DocumentId(
+ vespalib::make_string("userdoc:ns:%d::", 0x222222))
+ ).stripUnused());
+ }
+}
+
+document::BucketId
+ExternalOperationHandlerTest::findNonOwnedUserBucketInState(
+ vespalib::stringref statestr)
+{
+ lib::ClusterState state(statestr);
+ for (uint64_t i = 1; i < 1000; ++i) {
+ document::BucketId bucket(32, i);
+ if (!getExternalOperationHandler().ownsBucketInState(state, bucket)) {
+ return bucket;
+ }
+ }
+ throw std::runtime_error("no appropriate bucket found");
+}
+
+document::BucketId
+ExternalOperationHandlerTest::findOwned1stNotOwned2ndInStates(
+ vespalib::stringref statestr1,
+ vespalib::stringref statestr2)
+{
+ lib::ClusterState state1(statestr1);
+ lib::ClusterState state2(statestr2);
+ for (uint64_t i = 1; i < 1000; ++i) {
+ document::BucketId bucket(32, i);
+ if (getExternalOperationHandler().ownsBucketInState(state1, bucket)
+ && !getExternalOperationHandler().ownsBucketInState(state2, bucket))
+ {
+ return bucket;
+ }
+ }
+ throw std::runtime_error("no appropriate bucket found");
+}
+
+std::shared_ptr<api::StorageMessage>
+ExternalOperationHandlerTest::makeGetCommandForUser(uint64_t id)
+{
+ document::DocumentId docId(document::UserDocIdString("userdoc:foo:" + vespalib::make_string("%lu", id) + ":bar"));
+ std::shared_ptr<api::StorageMessage> cmd(
+ new api::GetCommand(document::BucketId(0), docId, "[all]"));
+ return cmd;
+}
+
+void
+ExternalOperationHandlerTest::testOperationRejectedOnWrongDistribution()
+{
+ createLinks();
+ std::string state("distributor:2 storage:2");
+ setupDistributor(1, 2, state);
+
+ document::BucketId bucket(findNonOwnedUserBucketInState(state));
+ auto cmd = makeGetCommandForUser(bucket.withoutCountBits());
+
+ Operation::SP genOp;
+ CPPUNIT_ASSERT(getExternalOperationHandler().handleMessage(cmd, genOp));
+ CPPUNIT_ASSERT(!genOp.get());
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.replies.size());
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("ReturnCode(WRONG_DISTRIBUTION, "
+ "distributor:2 storage:2)"),
+ _sender.replies[0]->getResult().toString());
+}
+
+void
+ExternalOperationHandlerTest::testOperationRejectedOnPendingWrongDistribution()
+{
+ createLinks();
+ std::string current("distributor:2 storage:2");
+ std::string pending("distributor:3 storage:3");
+ setupDistributor(1, 3, current);
+
+ document::BucketId b(findOwned1stNotOwned2ndInStates(current, pending));
+
+ // Trigger pending cluster state
+ auto stateCmd = std::make_shared<api::SetSystemStateCommand>(
+ lib::ClusterState(pending));
+ getBucketDBUpdater().onSetSystemState(stateCmd);
+
+ auto cmd = makeGetCommandForUser(b.withoutCountBits());
+
+ Operation::SP genOp;
+ CPPUNIT_ASSERT(getExternalOperationHandler().handleMessage(cmd, genOp));
+ CPPUNIT_ASSERT(!genOp.get());
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.replies.size());
+ // Fail back with _pending_ cluster state so client can start trying
+ // correct distributor immediately. If that distributor has not yet
+ // completed processing its pending cluster state, it'll return the
+ // old (current) cluster state, causing the client to bounce between
+ // the two until the pending states have been resolved. This is pretty
+ // much inevitable with the current design.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("ReturnCode(WRONG_DISTRIBUTION, "
+ "distributor:3 storage:3)"),
+ _sender.replies[0]->getResult().toString());
+}
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/garbagecollectiontest.cpp b/storage/src/tests/distributor/garbagecollectiontest.cpp
new file mode 100644
index 00000000000..399222f0e34
--- /dev/null
+++ b/storage/src/tests/distributor/garbagecollectiontest.cpp
@@ -0,0 +1,77 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <vespa/storageapi/message/removelocation.h>
+#include <vespa/storage/distributor/operations/idealstate/garbagecollectionoperation.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <tests/distributor/distributortestutil.h>
+
+namespace storage {
+namespace distributor {
+
+class GarbageCollectionOperationTest : public CppUnit::TestFixture, public DistributorTestUtil
+{
+ CPPUNIT_TEST_SUITE(GarbageCollectionOperationTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST_SUITE_END();
+
+protected:
+ void testSimple();
+
+public:
+ void setUp() {
+ createLinks();
+ };
+
+ void tearDown() {
+ close();
+ }
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(GarbageCollectionOperationTest);
+
+void
+GarbageCollectionOperationTest::testSimple()
+{
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:2"));
+ addNodesToBucketDB(document::BucketId(16, 1), "0=250/50/300,1=250/50/300");
+ getConfig().setGarbageCollection("music.date < 34", 3600);
+
+ GarbageCollectionOperation op("storage",
+ BucketAndNodes(document::BucketId(16, 1),
+ toVector<uint16_t>(0, 1)));
+
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL((size_t)2, _sender.commands.size());
+
+ getClock().setAbsoluteTimeInSeconds(34);
+
+ for (uint32_t i = 0; i < 2; ++i) {
+ std::shared_ptr<api::StorageCommand> msg = _sender.commands[i];
+ CPPUNIT_ASSERT(msg->getType() == api::MessageType::REMOVELOCATION);
+
+ api::RemoveLocationCommand* tmp = (api::RemoveLocationCommand*)msg.get();
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("music.date < 34"),
+ tmp->getDocumentSelection());
+
+ std::shared_ptr<api::StorageReply> reply(tmp->makeReply().release());
+ api::RemoveLocationReply* sreply = (api::RemoveLocationReply*)reply.get();
+ sreply->setBucketInfo(api::BucketInfo(666, 90, 500));
+
+ op.receive(_sender, reply);
+ }
+
+ BucketDatabase::Entry entry = getBucket(document::BucketId(16, 1));
+ CPPUNIT_ASSERT(entry.valid());
+ CPPUNIT_ASSERT_EQUAL(2, (int)entry->getNodeCount());
+ CPPUNIT_ASSERT_EQUAL(34, (int)entry->getLastGarbageCollectionTime());
+ CPPUNIT_ASSERT_EQUAL(api::BucketInfo(666, 90, 500),
+ entry->getNodeRef(0).getBucketInfo());
+ CPPUNIT_ASSERT_EQUAL(api::BucketInfo(666, 90, 500),
+ entry->getNodeRef(1).getBucketInfo());
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/getoperationtest.cpp b/storage/src/tests/distributor/getoperationtest.cpp
new file mode 100644
index 00000000000..12853be2e42
--- /dev/null
+++ b/storage/src/tests/distributor/getoperationtest.cpp
@@ -0,0 +1,567 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+
+#include <vespa/config/helper/configgetter.h>
+#include <vespa/document/config/config-documenttypes.h>
+#include <vespa/document/repo/documenttyperepo.h>
+#include <vespa/storage/distributor/externaloperationhandler.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <tests/distributor/distributortestutil.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <vespa/storage/distributor/operations/external/getoperation.h>
+
+using std::shared_ptr;
+using config::ConfigGetter;
+using document::DocumenttypesConfig;
+using config::FileSpec;
+
+namespace storage {
+namespace distributor {
+
+class GetOperationTest : public CppUnit::TestFixture, public DistributorTestUtil {
+ CPPUNIT_TEST_SUITE(GetOperationTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST(testNotFound);
+ CPPUNIT_TEST(testResendOnStorageFailure);
+ CPPUNIT_TEST(testResendOnStorageFailureAllFail);
+ CPPUNIT_TEST(testSendToIdealCopyIfBucketInSync);
+ CPPUNIT_TEST(testReturnNotFoundWhenBucketNotInDb);
+ CPPUNIT_TEST(testAskAllNodesIfBucketIsInconsistent);
+ CPPUNIT_TEST(testSendToAllInvalidNodesWhenInconsistent);
+ CPPUNIT_TEST(testAskTrustedNodeIfBucketIsInconsistent);
+ CPPUNIT_TEST(testInconsistentSplit); // Test that we ask all nodes if a bucket is inconsistent.
+ CPPUNIT_TEST(testSendToAllInvalidCopies);
+ CPPUNIT_TEST(testMultiInconsistentBucket);
+ CPPUNIT_TEST(testMultiInconsistentBucketFail);
+ CPPUNIT_TEST(testMultiInconsistentBucketNotFound);
+ CPPUNIT_TEST(testMultiInconsistentBucketNotFoundDeleted);
+ CPPUNIT_TEST(testMultipleCopiesWithFailureOnLocalNode);
+ CPPUNIT_TEST(canGetDocumentsWhenAllReplicaNodesRetired);
+ CPPUNIT_TEST_SUITE_END();
+
+ document::DocumentTypeRepo::SP _repo;
+
+public:
+ document::DocumentId docId;
+ document::BucketId bucketId;
+ std::unique_ptr<Operation> op;
+
+ void setUp() {
+ _repo.reset(
+ new document::DocumentTypeRepo(*ConfigGetter<DocumenttypesConfig>::
+ getConfig("config-doctypes", FileSpec("config-doctypes.cfg"))));
+ createLinks();
+
+ docId = document::DocumentId(document::DocIdString("test", "uri"));
+ bucketId = getExternalOperationHandler().getBucketId(docId);
+ };
+
+ void tearDown() {
+ close();
+ op.reset();
+ }
+
+ void sendGet() {
+ std::shared_ptr<api::GetCommand> msg(
+ new api::GetCommand(document::BucketId(0), docId, "[all]"));
+
+ op.reset(new GetOperation(getExternalOperationHandler(),
+ msg,
+ getDistributor().getMetrics().
+ gets[msg->getLoadType()]));
+ op->start(_sender, framework::MilliSecTime(0));
+ }
+
+ void sendReply(uint32_t idx,
+ api::ReturnCode::Result result,
+ std::string authorVal, uint32_t timestamp)
+ {
+ if (idx == (uint32_t)-1) {
+ idx = _sender.commands.size() - 1;
+ }
+
+ std::shared_ptr<api::StorageCommand> msg2 = _sender.commands[idx];
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::GET, msg2->getType());
+
+ api::GetCommand* tmp = static_cast<api::GetCommand*>(msg2.get());
+ document::Document::SP doc;
+
+ if (authorVal.length()) {
+ const document::DocumentType* type(_repo->getDocumentType("text/html"));
+ doc = document::Document::SP(
+ new document::Document(*type, docId));
+
+ doc->setValue(doc->getField("author"),
+ document::StringFieldValue(authorVal));
+ }
+
+ api::GetReply* reply = new api::GetReply(*tmp, doc, timestamp);
+ reply->setResult(result);
+
+ op->receive(_sender, std::shared_ptr<api::StorageReply>(reply));
+ }
+
+ void replyWithFailure() {
+ sendReply(-1, api::ReturnCode::IO_FAILURE, "", 0);
+ }
+
+ void replyWithNotFound() {
+ sendReply(-1, api::ReturnCode::OK, "", 0);
+ }
+
+ void replyWithDocument() {
+ sendReply(-1, api::ReturnCode::OK, "foo", 100);
+ }
+
+ std::string getLastReplyAuthor() {
+ api::StorageMessage& msg = *_sender.replies[_sender.replies.size() - 1];
+
+ if (msg.getType() == api::MessageType::GET_REPLY) {
+ document::Document::SP doc(
+ dynamic_cast<api::GetReply&>(msg).getDocument());
+
+ return doc->getValue(doc->getField("author"))->toString();
+ } else {
+ std::ostringstream ost;
+ ost << "Last reply was not a GET reply, but " << msg;
+ return ost.str();
+ }
+ }
+
+ void setClusterState(const std::string& clusterState) {
+ _distributor->enableClusterState(lib::ClusterState(clusterState));
+ }
+
+ void testSimple();
+ void testReturnNotFoundWhenBucketNotInDb();
+ void testNotFound();
+ void testResendOnStorageFailure();
+ void testResendOnStorageFailureAllFail();
+ void testSendToIdealCopyIfBucketInSync();
+ void testAskAllNodesIfBucketIsInconsistent();
+ void testSendToAllInvalidNodesWhenInconsistent();
+ void testAskTrustedNodeIfBucketIsInconsistent();
+ void testInconsistentSplit();
+ void testMultiInconsistentBucket();
+ void testMultiInconsistentBucketFail();
+ void testMultiInconsistentBucketNotFound();
+ void testMultiInconsistentBucketNotFoundDeleted();
+ void testSendToAllInvalidCopies();
+ void testMultipleCopiesWithFailureOnLocalNode();
+ void canGetDocumentsWhenAllReplicaNodesRetired();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(GetOperationTest);
+
+void
+GetOperationTest::testSimple()
+{
+ setClusterState("distributor:1 storage:2");
+
+ addNodesToBucketDB(bucketId, "0=4,1=4");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get => 0"),
+ _sender.getCommands(true));
+
+ replyWithDocument();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 100) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+GetOperationTest::testAskTrustedNodeIfBucketIsInconsistent()
+{
+ setClusterState("distributor:1 storage:4");
+
+ addNodesToBucketDB(bucketId, "0=100/3/10,1=200/4/12/t");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Get => 1"),
+ _sender.getCommands(true));
+
+ replyWithDocument();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 100) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+GetOperationTest::testAskAllNodesIfBucketIsInconsistent()
+{
+ setClusterState("distributor:1 storage:4");
+
+ addNodesToBucketDB(bucketId, "0=100/3/10,1=200/4/12");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get => 0,Get => 1"),
+ _sender.getCommands(true));
+
+ sendReply(0, api::ReturnCode::OK, "newauthor", 2);
+ sendReply(1, api::ReturnCode::OK, "oldauthor", 1);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 2) ReturnCode(NONE)"),
+ _sender.getLastReply());
+
+ CPPUNIT_ASSERT_EQUAL(std::string("newauthor"), getLastReplyAuthor());
+}
+
+
+void
+GetOperationTest::testSendToAllInvalidCopies()
+{
+ setClusterState("distributor:1 storage:4");
+
+ addNodesToBucketDB(bucketId, "2=0/0/1,3=0/0/1");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get => 2,Get => 3"),
+ _sender.getCommands(true));
+
+ sendReply(0, api::ReturnCode::OK, "newauthor", 2);
+ sendReply(1, api::ReturnCode::OK, "oldauthor", 1);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 2) ReturnCode(NONE)"),
+ _sender.getLastReply());
+
+ CPPUNIT_ASSERT_EQUAL(std::string("newauthor"), getLastReplyAuthor());
+}
+
+void
+GetOperationTest::testSendToAllInvalidNodesWhenInconsistent()
+{
+ setClusterState("distributor:1 storage:4");
+
+ addNodesToBucketDB(bucketId, "0=100,1=200,2=0/0/1,3=0/0/1");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get => 2,Get => 3,Get => 0,Get => 1"),
+ _sender.getCommands(true));
+
+ sendReply(0, api::ReturnCode::OK, "newauthor", 2);
+ sendReply(1, api::ReturnCode::OK, "oldauthor", 1);
+ sendReply(2, api::ReturnCode::OK, "oldauthor", 1);
+ sendReply(3, api::ReturnCode::OK, "oldauthor", 1);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 2) ReturnCode(NONE)"),
+ _sender.getLastReply());
+
+ CPPUNIT_ASSERT_EQUAL(std::string("newauthor"), getLastReplyAuthor());
+}
+
+void
+GetOperationTest::testInconsistentSplit()
+{
+ setClusterState("distributor:1 storage:4");
+
+ addNodesToBucketDB(document::BucketId(16, 0x2a52), "0=100");
+ addNodesToBucketDB(document::BucketId(17, 0x2a52), "1=200");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get => 0,Get => 1"),
+ _sender.getCommands(true));
+
+ sendReply(0, api::ReturnCode::OK, "newauthor", 2);
+ sendReply(1, api::ReturnCode::OK, "oldauthor", 1);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 2) ReturnCode(NONE)"),
+ _sender.getLastReply());
+
+ CPPUNIT_ASSERT_EQUAL(std::string("newauthor"), getLastReplyAuthor());
+}
+
+
+void
+GetOperationTest::testMultiInconsistentBucketNotFound()
+{
+ setClusterState("distributor:1 storage:4");
+
+ addNodesToBucketDB(bucketId, "0=100,2=100,1=200,3=200");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get => 0,Get => 1"),
+ _sender.getCommands(true));
+
+ sendReply(0, api::ReturnCode::OK, "newauthor", 2);
+ sendReply(1, api::ReturnCode::OK, "", 0);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 2) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+GetOperationTest::testMultiInconsistentBucketNotFoundDeleted()
+{
+ setClusterState("distributor:1 storage:4");
+
+ addNodesToBucketDB(bucketId, "0=100,2=100,1=200,3=200");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get => 0,Get => 1"),
+ _sender.getCommands(true));
+
+ sendReply(0, api::ReturnCode::OK, "newauthor", 2);
+ // This signifies that the latest change was that the document was deleted
+ // at timestamp 3.
+ sendReply(1, api::ReturnCode::OK, "", 3);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 3) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+GetOperationTest::testMultiInconsistentBucket()
+{
+ setClusterState("distributor:1 storage:4");
+
+ addNodesToBucketDB(bucketId, "0=100,2=100,1=200,3=200");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get => 0,Get => 1"),
+ _sender.getCommands(true));
+
+ sendReply(0, api::ReturnCode::OK, "newauthor", 2);
+ sendReply(1, api::ReturnCode::OK, "oldauthor", 1);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 2) ReturnCode(NONE)"),
+ _sender.getLastReply());
+
+ CPPUNIT_ASSERT_EQUAL(std::string("newauthor"), getLastReplyAuthor());
+}
+
+void
+GetOperationTest::testMultiInconsistentBucketFail()
+{
+ setClusterState("distributor:1 storage:4");
+
+ addNodesToBucketDB(bucketId, "0=100,2=100,1=200,3=200");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get => 0,Get => 1"),
+ _sender.getCommands(true));
+
+ sendReply(0, api::ReturnCode::OK, "newauthor", 1);
+ sendReply(1, api::ReturnCode::DISK_FAILURE, "", 0);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get(BucketId(0x4000000000002a52), doc:test:uri) => 3"),
+ _sender.getLastCommand());
+
+ replyWithDocument();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 100) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+
+void
+GetOperationTest::testReturnNotFoundWhenBucketNotInDb()
+{
+ setClusterState("distributor:1 storage:1");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 0) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+GetOperationTest::testNotFound()
+{
+ setClusterState("distributor:1 storage:1");
+
+ addNodesToBucketDB(bucketId, "0=100");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get(BucketId(0x4000000000002a52), doc:test:uri) => 0"),
+ _sender.getLastCommand());
+
+ replyWithNotFound();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 0) ReturnCode(NONE)"),
+ _sender.getLastReply());
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)(getDistributor().
+ getMetrics().gets[documentapi::LoadType::DEFAULT].
+ failures.notfound.getValue()));
+}
+
+void
+GetOperationTest::testResendOnStorageFailure()
+{
+ setClusterState("distributor:1 storage:3");
+
+ // Add two nodes that are not trusted. GET should retry each one of them
+ // if one fails.
+ addNodesToBucketDB(bucketId, "1=100,2=100");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get(BucketId(0x4000000000002a52), doc:test:uri) => 1"),
+ _sender.getLastCommand());
+
+ replyWithFailure();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get(BucketId(0x4000000000002a52), doc:test:uri) => 2"),
+ _sender.getLastCommand());
+
+ replyWithDocument();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 100) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+GetOperationTest::testResendOnStorageFailureAllFail()
+{
+ setClusterState("distributor:1 storage:3");
+
+ // Add two nodes that are not trusted. GET should retry each one of them
+ // if one fails.
+ addNodesToBucketDB(bucketId, "1=100,2=100");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get(BucketId(0x4000000000002a52), doc:test:uri) => 1"),
+ _sender.getLastCommand());
+
+ replyWithFailure();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get(BucketId(0x4000000000002a52), doc:test:uri) => 2"),
+ _sender.getLastCommand());
+
+ replyWithFailure();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 0) ReturnCode(IO_FAILURE)"),
+ _sender.getLastReply());
+}
+
+void
+GetOperationTest::testSendToIdealCopyIfBucketInSync()
+{
+ setClusterState("distributor:1 storage:4");
+
+ addNodesToBucketDB(bucketId, "1=100,2=100,3=100");
+
+ sendGet();
+
+ // Should always send to node 1 (follow bucket db order)
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get(BucketId(0x4000000000002a52), doc:test:uri) => 1"),
+ _sender.getLastCommand());
+
+ replyWithDocument();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 100) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+GetOperationTest::testMultipleCopiesWithFailureOnLocalNode()
+{
+ setClusterState("distributor:1 storage:4");
+
+ // Node 0 is local copy to distributor 0 and will be preferred when
+ // sending initially.
+ addNodesToBucketDB(document::BucketId(16, 0x2a52), "2=100,0=100");
+
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get => 0"),
+ _sender.getCommands(true));
+
+ // Fail local node; no reply must be sent yet since we've got more nodes
+ // to try.
+ sendReply(0, api::ReturnCode::TIMEOUT, "", 0);
+
+ // Retry with remaining copy on node 2.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get => 0,Get => 2"),
+ _sender.getCommands(true));
+
+ sendReply(1, api::ReturnCode::OK, "newestauthor", 3);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("GetReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 3) ReturnCode(NONE)"),
+ _sender.getLastReply());
+
+ CPPUNIT_ASSERT_EQUAL(std::string("newestauthor"), getLastReplyAuthor());
+}
+
+void
+GetOperationTest::canGetDocumentsWhenAllReplicaNodesRetired()
+{
+ setClusterState("distributor:1 storage:2 .0.s:r .1.s:r");
+ addNodesToBucketDB(bucketId, "0=4,1=4");
+ sendGet();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get => 0"),
+ _sender.getCommands(true));
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/idealstatemanagertest.cpp b/storage/src/tests/distributor/idealstatemanagertest.cpp
new file mode 100644
index 00000000000..9c97a2ba967
--- /dev/null
+++ b/storage/src/tests/distributor/idealstatemanagertest.cpp
@@ -0,0 +1,268 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storage/distributor/bucketdbupdater.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storage/distributor/operations/idealstate/mergeoperation.h>
+#include <vespa/storage/distributor/operations/idealstate/removebucketoperation.h>
+#include <vespa/storage/distributor/operations/idealstate/setbucketstateoperation.h>
+#include <vespa/storage/distributor/operations/idealstate/splitoperation.h>
+#include <vespa/storageapi/message/stat.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storage/storageutil/utils.h>
+#include <tests/distributor/distributortestutil.h>
+#include <vespa/storage/distributor/statecheckers.h>
+#include <vespa/storageapi/message/state.h>
+
+namespace storage {
+namespace distributor {
+
+class IdealStateManagerTest : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+public:
+ IdealStateManagerTest() {}
+ void setUp() {
+ createLinks();
+ };
+
+ void tearDown() {
+ close();
+ }
+
+ void testSibling();
+ void testClearActiveOnNodeDown();
+ void testRecheckWhenActive();
+ void testRecheckWhenPending();
+ void testOpsGenerationBusy();
+ void testStatusPage();
+ void testDisabledStateChecker();
+ void testBlockIdealStateOpsOnFullRequestBucketInfo();
+ void testBlockCheckForAllOperationsToSpecificBucket();
+
+ void setSystemState(const lib::ClusterState& systemState) {
+ _distributor->enableClusterState(systemState);
+ }
+
+ CPPUNIT_TEST_SUITE(IdealStateManagerTest);
+ CPPUNIT_TEST(testSibling);
+ CPPUNIT_TEST(testClearActiveOnNodeDown);
+ CPPUNIT_TEST(testRecheckWhenActive);
+ CPPUNIT_TEST(testStatusPage);
+ CPPUNIT_TEST(testDisabledStateChecker);
+ CPPUNIT_TEST(testBlockIdealStateOpsOnFullRequestBucketInfo);
+ CPPUNIT_TEST(testBlockCheckForAllOperationsToSpecificBucket);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(IdealStateManagerTest);
+
+void
+IdealStateManagerTest::testSibling()
+{
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(1,1),
+ getIdealStateManager().getDistributorComponent()
+ .getSibling(document::BucketId(1, 0)));
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(1,0),
+ getIdealStateManager().getDistributorComponent()
+ .getSibling(document::BucketId(1, 1)));
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(2,3),
+ getIdealStateManager().getDistributorComponent()
+ .getSibling(document::BucketId(2, 1)));
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(2,1),
+ getIdealStateManager().getDistributorComponent()
+ .getSibling(document::BucketId(2, 3)));
+}
+
+void
+IdealStateManagerTest::testStatusPage() {
+ close();
+ getDirConfig().getConfig("stor-distributormanager").set("splitsize", "100");
+ getDirConfig().getConfig("stor-distributormanager").set("splitcount", "1000000");
+ getDirConfig().getConfig("stor-distributormanager").set("joinsize", "0");
+ getDirConfig().getConfig("stor-distributormanager").set("joincount", "0");
+ createLinks();
+ setupDistributor(1, 1, "distributor:1 storage:1");
+
+ insertBucketInfo(document::BucketId(16, 5), 0, 0xff, 100, 200, true, true);
+ insertBucketInfo(document::BucketId(16, 2), 0, 0xff, 10, 10, true, true);
+
+ std::ostringstream ost;
+ getIdealStateManager().getBucketStatus(ost);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("BucketId(0x4000000000000002) : [node(idx=0,crc=0xff,docs=10/10,bytes=10/10,trusted=true,active=true)]<br>\n"
+ "<b>BucketId(0x4000000000000005):</b> <i> : split: [Splitting bucket because its maximum size (200 b, 100 docs, 100 meta, 200 b total) is "
+ "higher than the configured limit of (100, 1000000)]</i> [node(idx=0,crc=0xff,docs=100/100,bytes=200/200,trusted=true,"
+ "active=true)]<br>\n"),
+ ost.str());
+}
+
+void
+IdealStateManagerTest::testDisabledStateChecker() {
+ setupDistributor(1, 1, "distributor:1 storage:1");
+
+ getConfig().setSplitSize(100);
+ getConfig().setSplitCount(1000000);
+ getConfig().disableStateChecker("SplitBucket");
+
+ insertBucketInfo(document::BucketId(16, 5), 0, 0xff, 100, 200, true, true);
+ insertBucketInfo(document::BucketId(16, 2), 0, 0xff, 10, 10, true, true);
+
+ std::ostringstream ost;
+ getIdealStateManager().getBucketStatus(ost);
+
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "BucketId(0x4000000000000002) : [node(idx=0,crc=0xff,docs=10/10,bytes=10/10,trusted=true,active=true)]<br>\n"
+ "<b>BucketId(0x4000000000000005):</b> <i> : split: [Splitting bucket because its maximum size (200 b, 100 docs, 100 meta, 200 b total) is "
+ "higher than the configured limit of (100, 1000000)]</i> [node(idx=0,crc=0xff,docs=100/100,bytes=200/200,trusted=true,"
+ "active=true)]<br>\n"),
+ ost.str());
+
+ tick();
+ CPPUNIT_ASSERT_EQUAL(std::string(""),
+ _distributor->getActiveIdealStateOperations());
+
+}
+
+void
+IdealStateManagerTest::testClearActiveOnNodeDown()
+{
+ setSystemState(lib::ClusterState("distributor:1 storage:3"));
+ for (int i = 1; i < 4; i++) {
+ insertBucketInfo(document::BucketId(16, i), 0, 0xff, 100, 200);
+ insertBucketInfo(document::BucketId(16, i), 1, 0xffe, 1020, 2300);
+ insertBucketInfo(document::BucketId(16, i), 2, 0xfff, 1030, 2400);
+ }
+
+ tick();
+
+ // Start all three operations.
+ for (uint32_t i = 0; i < 3; ++i) {
+ tick();
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("setbucketstate to [0] BucketId(0x4000000000000001) (pri 100)\n"
+ "setbucketstate to [0] BucketId(0x4000000000000002) (pri 100)\n"
+ "setbucketstate to [0] BucketId(0x4000000000000003) (pri 100)\n"),
+ _distributor->getActiveIdealStateOperations());
+
+ setSystemState(lib::ClusterState("distributor:1 storage:3 .0.s:d"));
+
+ CPPUNIT_ASSERT_EQUAL(std::string(""),
+ _distributor->getActiveIdealStateOperations());
+ CPPUNIT_ASSERT_EQUAL(uint32_t(0),
+ _distributor->getPendingMessageTracker()
+ .getNodeInfo().getPendingCount(0));
+}
+
+void
+IdealStateManagerTest::testRecheckWhenActive()
+{
+ for (uint32_t j = 0; j < 3; j++) {
+ insertBucketInfo(document::BucketId(16, 1), j, 0xff - j, 100, 200);
+ }
+
+ setSystemState(lib::ClusterState("distributor:1 storage:3"));
+
+ tick();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("setbucketstate to [0] BucketId(0x4000000000000001) (pri 100)\n"),
+ _distributor->getActiveIdealStateOperations());
+
+ tick();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("setbucketstate to [0] BucketId(0x4000000000000001) (pri 100)\n"),
+ _distributor->getActiveIdealStateOperations());
+
+ tick();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("setbucketstate to [0] BucketId(0x4000000000000001) (pri 100)\n"),
+ _distributor->getActiveIdealStateOperations());
+}
+
+void
+IdealStateManagerTest::testBlockIdealStateOpsOnFullRequestBucketInfo()
+{
+ setupDistributor(2, 10, "distributor:1 storage:2");
+
+ framework::defaultimplementation::FakeClock clock;
+ PendingMessageTracker tracker(_node->getComponentRegister());
+
+ document::BucketId bid(16, 1234);
+ std::vector<document::BucketId> buckets;
+
+ // RequestBucketInfoCommand does not have a specific bucketid since it's
+ // sent to the entire node. It will then use a null bucketid.
+ {
+ std::shared_ptr<api::RequestBucketInfoCommand> msg(
+ new api::RequestBucketInfoCommand(buckets));
+ msg->setAddress(
+ api::StorageMessageAddress("storage", lib::NodeType::STORAGE, 4));
+ tracker.insert(msg);
+ }
+
+ {
+ RemoveBucketOperation op("storage",
+ BucketAndNodes(bid, toVector<uint16_t>(3, 4)));
+ CPPUNIT_ASSERT(op.isBlocked(tracker));
+ }
+
+ {
+ // Don't trigger on requests to other nodes.
+ RemoveBucketOperation op("storage",
+ BucketAndNodes(bid, toVector<uint16_t>(3, 5)));
+ CPPUNIT_ASSERT(!op.isBlocked(tracker));
+ }
+
+ // Don't block on null-bucket messages that aren't RequestBucketInfo.
+ {
+ std::shared_ptr<api::CreateVisitorCommand> msg(
+ new api::CreateVisitorCommand("foo", "bar", "baz"));
+ msg->setAddress(
+ api::StorageMessageAddress("storage", lib::NodeType::STORAGE, 7));
+ tracker.insert(msg);
+ }
+
+ {
+ RemoveBucketOperation op("storage",
+ BucketAndNodes(bid, toVector<uint16_t>(7)));
+ CPPUNIT_ASSERT(!op.isBlocked(tracker));
+ }
+}
+
+void
+IdealStateManagerTest::testBlockCheckForAllOperationsToSpecificBucket()
+{
+ setupDistributor(2, 10, "distributor:1 storage:2");
+ framework::defaultimplementation::FakeClock clock;
+ PendingMessageTracker tracker(_node->getComponentRegister());
+ document::BucketId bid(16, 1234);
+
+ {
+ auto msg = std::make_shared<api::JoinBucketsCommand>(bid);
+ msg->setAddress(
+ api::StorageMessageAddress("storage", lib::NodeType::STORAGE, 4));
+ tracker.insert(msg);
+ }
+ {
+ RemoveBucketOperation op("storage",
+ BucketAndNodes(bid, toVector<uint16_t>(7)));
+ // Not blocked for exact node match.
+ CPPUNIT_ASSERT(!op.checkBlock(bid, tracker));
+ // But blocked for bucket match!
+ CPPUNIT_ASSERT(op.checkBlockForAllNodes(bid, tracker));
+ }
+}
+
+} // distributor
+} // storage
+
diff --git a/storage/src/tests/distributor/joinbuckettest.cpp b/storage/src/tests/distributor/joinbuckettest.cpp
new file mode 100644
index 00000000000..ec7e3aaac32
--- /dev/null
+++ b/storage/src/tests/distributor/joinbuckettest.cpp
@@ -0,0 +1,127 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storage/distributor/operations/idealstate/joinoperation.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <tests/distributor/distributortestutil.h>
+
+namespace storage {
+namespace distributor {
+
+class JoinOperationTest : public CppUnit::TestFixture, public DistributorTestUtil
+{
+ CPPUNIT_TEST_SUITE(JoinOperationTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST(sendSparseJoinsToNodesWithoutBothSourceBuckets);
+ CPPUNIT_TEST_SUITE_END();
+
+ void checkSourceBucketsAndSendReply(
+ JoinOperation& op,
+ size_t msgIndex,
+ const std::vector<document::BucketId>& wantedIds);
+
+protected:
+ void testSimple();
+ void sendSparseJoinsToNodesWithoutBothSourceBuckets();
+
+public:
+ void setUp() {
+ createLinks();
+ };
+
+ void tearDown() {
+ close();
+ }
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(JoinOperationTest);
+
+void
+JoinOperationTest::testSimple()
+{
+ getConfig().setJoinCount(100);
+ getConfig().setJoinSize(1000);
+
+ addNodesToBucketDB(document::BucketId(33, 1), "0=250/50/300");
+ addNodesToBucketDB(document::BucketId(33, 0x100000001), "0=300/40/200");
+
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:1"));
+
+ JoinOperation op("storage",
+ BucketAndNodes(document::BucketId(32, 0),
+ toVector<uint16_t>(0)),
+ toVector(document::BucketId(33, 1),
+ document::BucketId(33, 0x100000001)));
+
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ checkSourceBucketsAndSendReply(op, 0, {{33, 1}, {33, 0x100000001}});
+
+ CPPUNIT_ASSERT(!getBucket(document::BucketId(33, 0x100000001)).valid());
+ CPPUNIT_ASSERT(!getBucket(document::BucketId(33, 1)).valid());
+
+ BucketDatabase::Entry entry = getBucket(document::BucketId(32, 0));
+ CPPUNIT_ASSERT(entry.valid());
+ CPPUNIT_ASSERT_EQUAL((uint16_t)0, entry->getNodeRef(0).getNode());
+ CPPUNIT_ASSERT_EQUAL(api::BucketInfo(666, 90, 500),
+ entry->getNodeRef(0).getBucketInfo());
+}
+
+void
+JoinOperationTest::checkSourceBucketsAndSendReply(
+ JoinOperation& op,
+ size_t msgIndex,
+ const std::vector<document::BucketId>& wantedIds)
+{
+ CPPUNIT_ASSERT(_sender.commands.size() > msgIndex);
+
+ std::shared_ptr<api::StorageCommand> msg(_sender.commands[msgIndex]);
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::JOINBUCKETS, msg->getType());
+
+ api::JoinBucketsCommand& joinCmd(
+ dynamic_cast<api::JoinBucketsCommand&>(*msg));
+ CPPUNIT_ASSERT_EQUAL(wantedIds, joinCmd.getSourceBuckets());
+
+ std::shared_ptr<api::StorageReply> reply(joinCmd.makeReply());
+ api::JoinBucketsReply& sreply(
+ dynamic_cast<api::JoinBucketsReply&>(*reply));
+ sreply.setBucketInfo(api::BucketInfo(666, 90, 500));
+
+ op.receive(_sender, reply);
+}
+
+/**
+ * If the set of buckets kept on nodes is disjoint, send sparse joins (same
+ * bucket id used as both source buckets) for those nodes having only one of
+ * the buckets.
+ */
+void
+JoinOperationTest::sendSparseJoinsToNodesWithoutBothSourceBuckets()
+{
+ getConfig().setJoinCount(100);
+ getConfig().setJoinSize(1000);
+
+ addNodesToBucketDB(document::BucketId(33, 1), "0=250/50/300,1=250/50/300");
+ addNodesToBucketDB(document::BucketId(33, 0x100000001), "0=300/40/200");
+
+ _distributor->enableClusterState(
+ lib::ClusterState("distributor:1 storage:2"));
+
+ JoinOperation op("storage",
+ BucketAndNodes(document::BucketId(32, 0),
+ toVector<uint16_t>(0, 1)),
+ toVector(document::BucketId(33, 1),
+ document::BucketId(33, 0x100000001)));
+
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ checkSourceBucketsAndSendReply(op, 0, {{33, 1}, {33, 0x100000001}});
+ checkSourceBucketsAndSendReply(op, 1, {{33, 1}, {33, 1}});
+}
+
+}
+
+}
diff --git a/storage/src/tests/distributor/maintenancemocks.h b/storage/src/tests/distributor/maintenancemocks.h
new file mode 100644
index 00000000000..923f7edec2b
--- /dev/null
+++ b/storage/src/tests/distributor/maintenancemocks.h
@@ -0,0 +1,123 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <sstream>
+#include <vespa/storage/distributor/maintenance/maintenanceprioritygenerator.h>
+#include <vespa/storage/distributor/maintenance/maintenanceoperationgenerator.h>
+#include <vespa/storage/distributor/operationstarter.h>
+#include <vespa/storage/distributor/operations/operation.h>
+#include <vespa/storageframework/defaultimplementation/clock/fakeclock.h>
+
+namespace storage {
+namespace distributor {
+
+class MockMaintenancePriorityGenerator
+ : public MaintenancePriorityGenerator
+{
+ MaintenancePriorityAndType prioritize(
+ const document::BucketId&,
+ NodeMaintenanceStatsTracker& stats) const
+ {
+ stats.incMovingOut(1);
+ stats.incCopyingIn(2);
+ return MaintenancePriorityAndType(
+ MaintenancePriority(MaintenancePriority::VERY_HIGH),
+ MaintenanceOperation::MERGE_BUCKET);
+ }
+};
+
+
+class MockOperation : public MaintenanceOperation
+{
+ document::BucketId _bucketId;
+ std::string _reason;
+ bool _shouldBlock;
+public:
+ MockOperation(const document::BucketId& bucketId)
+ : _bucketId(bucketId),
+ _shouldBlock(false)
+ {}
+
+ std::string toString() const {
+ return _bucketId.toString();
+ }
+
+ void onClose(DistributorMessageSender&) {
+ }
+ const char* getName() const {
+ return "MockOperation";
+ }
+ virtual const std::string& getDetailedReason() const {
+ return _reason;
+ }
+ void onStart(DistributorMessageSender&) {
+ }
+ void onReceive(DistributorMessageSender&, const std::shared_ptr<api::StorageReply>&) {
+ }
+ bool isBlocked(const PendingMessageTracker&) const {
+ return _shouldBlock;
+ }
+ void setShouldBlock(bool shouldBlock) {
+ _shouldBlock = shouldBlock;
+ }
+};
+
+class MockMaintenanceOperationGenerator
+ : public MaintenanceOperationGenerator
+{
+public:
+ MaintenanceOperation::SP generate(const document::BucketId& id) const {
+ return MaintenanceOperation::SP(new MockOperation(id));
+ }
+
+ std::vector<MaintenanceOperation::SP> generateAll(
+ const document::BucketId& id,
+ NodeMaintenanceStatsTracker& tracker) const
+ {
+ (void) tracker;
+ std::vector<MaintenanceOperation::SP> ret;
+ ret.push_back(MaintenanceOperation::SP(new MockOperation(id)));
+ return ret;
+ }
+
+};
+
+class MockOperationStarter
+ : public OperationStarter
+{
+ std::ostringstream _started;
+ std::vector<Operation::SP> _operations;
+ bool _shouldStart;
+public:
+ MockOperationStarter()
+ : _shouldStart(true)
+ {}
+
+ bool start(const std::shared_ptr<Operation>& operation,
+ Priority priority)
+ {
+ if (_shouldStart) {
+ _started << operation->toString()
+ << ", pri " << static_cast<int>(priority)
+ << "\n";
+ _operations.push_back(operation);
+ }
+ return _shouldStart;
+ }
+
+ void setShouldStartOperations(bool shouldStart) {
+ _shouldStart = shouldStart;
+ }
+
+ std::vector<Operation::SP>& getOperations() {
+ return _operations;
+ }
+
+ std::string toString() const {
+ return _started.str();
+ }
+};
+
+}
+}
+
diff --git a/storage/src/tests/distributor/maintenanceschedulertest.cpp b/storage/src/tests/distributor/maintenanceschedulertest.cpp
new file mode 100644
index 00000000000..4316bfd137c
--- /dev/null
+++ b/storage/src/tests/distributor/maintenanceschedulertest.cpp
@@ -0,0 +1,108 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <string>
+#include <sstream>
+#include <memory>
+#include <vespa/storage/distributor/maintenance/simplebucketprioritydatabase.h>
+#include <vespa/storage/distributor/maintenance/maintenancescheduler.h>
+#include <vespa/storage/distributor/bucketdb/mapbucketdatabase.h>
+#include <tests/distributor/maintenancemocks.h>
+
+namespace storage {
+
+namespace distributor {
+
+using document::BucketId;
+typedef MaintenancePriority Priority;
+typedef MaintenanceScheduler::WaitTimeMs WaitTimeMs;
+
+class MaintenanceSchedulerTest : public CppUnit::TestFixture {
+ CPPUNIT_TEST_SUITE(MaintenanceSchedulerTest);
+ CPPUNIT_TEST(testPriorityClearedAfterScheduled);
+ CPPUNIT_TEST(testOperationIsScheduled);
+ CPPUNIT_TEST(testNoOperationsToSchedule);
+ CPPUNIT_TEST(testSuppressLowPrioritiesInEmergencyMode);
+ CPPUNIT_TEST(testPriorityNotClearedIfOperationNotStarted);
+ CPPUNIT_TEST_SUITE_END();
+
+ std::unique_ptr<SimpleBucketPriorityDatabase> _priorityDb;
+ std::unique_ptr<MockMaintenanceOperationGenerator> _operationGenerator;
+ std::unique_ptr<MockOperationStarter> _operationStarter;
+ std::unique_ptr<MaintenanceScheduler> _scheduler;
+
+ void addBucketToDb(int bucketNum);
+public:
+ void testPriorityClearedAfterScheduled();
+ void testOperationIsScheduled();
+ void testNoOperationsToSchedule();
+ void testSuppressLowPrioritiesInEmergencyMode();
+ void testPriorityNotClearedIfOperationNotStarted();
+
+ void setUp();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(MaintenanceSchedulerTest);
+
+void
+MaintenanceSchedulerTest::setUp()
+{
+ _priorityDb.reset(new SimpleBucketPriorityDatabase());
+ _operationGenerator.reset(new MockMaintenanceOperationGenerator());
+ _operationStarter.reset(new MockOperationStarter());
+ _scheduler.reset(new MaintenanceScheduler(*_operationGenerator,
+ *_priorityDb,
+ *_operationStarter));
+}
+
+void
+MaintenanceSchedulerTest::testPriorityClearedAfterScheduled()
+{
+ _priorityDb->setPriority(PrioritizedBucket(BucketId(16, 1), Priority::VERY_HIGH));
+ _scheduler->tick(MaintenanceScheduler::NORMAL_SCHEDULING_MODE);
+ CPPUNIT_ASSERT_EQUAL(std::string(), _priorityDb->toString());
+}
+
+void
+MaintenanceSchedulerTest::testOperationIsScheduled()
+{
+ _priorityDb->setPriority(PrioritizedBucket(BucketId(16, 1), Priority::MEDIUM));
+ _scheduler->tick(MaintenanceScheduler::NORMAL_SCHEDULING_MODE);
+ CPPUNIT_ASSERT_EQUAL(std::string("BucketId(0x4000000000000001), pri 100\n"),
+ _operationStarter->toString());
+}
+
+void
+MaintenanceSchedulerTest::testNoOperationsToSchedule()
+{
+ WaitTimeMs waitMs(_scheduler->tick(MaintenanceScheduler::NORMAL_SCHEDULING_MODE));
+ CPPUNIT_ASSERT_EQUAL(WaitTimeMs(1), waitMs);
+ CPPUNIT_ASSERT_EQUAL(std::string(), _operationStarter->toString());
+}
+
+void
+MaintenanceSchedulerTest::testSuppressLowPrioritiesInEmergencyMode()
+{
+ _priorityDb->setPriority(PrioritizedBucket(BucketId(16, 1), Priority::HIGH));
+ _priorityDb->setPriority(PrioritizedBucket(BucketId(16, 2), Priority::VERY_HIGH));
+ CPPUNIT_ASSERT_EQUAL(WaitTimeMs(0), _scheduler->tick(MaintenanceScheduler::RECOVERY_SCHEDULING_MODE));
+ CPPUNIT_ASSERT_EQUAL(WaitTimeMs(1), _scheduler->tick(MaintenanceScheduler::RECOVERY_SCHEDULING_MODE));
+ CPPUNIT_ASSERT_EQUAL(std::string("BucketId(0x4000000000000002), pri 0\n"),
+ _operationStarter->toString());
+ CPPUNIT_ASSERT_EQUAL(std::string("PrioritizedBucket(BucketId(0x4000000000000001), pri HIGH)\n"),
+ _priorityDb->toString());
+}
+
+void
+MaintenanceSchedulerTest::testPriorityNotClearedIfOperationNotStarted()
+{
+ _priorityDb->setPriority(PrioritizedBucket(BucketId(16, 1), Priority::HIGH));
+ _operationStarter->setShouldStartOperations(false);
+ WaitTimeMs waitMs(_scheduler->tick(MaintenanceScheduler::NORMAL_SCHEDULING_MODE));
+ CPPUNIT_ASSERT_EQUAL(WaitTimeMs(1), waitMs);
+ CPPUNIT_ASSERT_EQUAL(std::string("PrioritizedBucket(BucketId(0x4000000000000001), pri HIGH)\n"),
+ _priorityDb->toString());
+}
+
+}
+}
diff --git a/storage/src/tests/distributor/mapbucketdatabasetest.cpp b/storage/src/tests/distributor/mapbucketdatabasetest.cpp
new file mode 100644
index 00000000000..ab8e5add65f
--- /dev/null
+++ b/storage/src/tests/distributor/mapbucketdatabasetest.cpp
@@ -0,0 +1,26 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/vespalib/util/document_runnable.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <vespa/storage/distributor/bucketdb/mapbucketdatabase.h>
+#include <vespa/storage/storageutil/utils.h>
+#include <tests/distributor/bucketdatabasetest.h>
+
+namespace storage {
+namespace distributor {
+
+struct MapBucketDatabaseTest : public BucketDatabaseTest {
+ MapBucketDatabase _db;
+
+ virtual BucketDatabase& db() { return _db; };
+
+ CPPUNIT_TEST_SUITE(MapBucketDatabaseTest);
+ SETUP_DATABASE_TESTS();
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(MapBucketDatabaseTest);
+
+}
+}
diff --git a/storage/src/tests/distributor/mergelimitertest.cpp b/storage/src/tests/distributor/mergelimitertest.cpp
new file mode 100644
index 00000000000..fd86e071579
--- /dev/null
+++ b/storage/src/tests/distributor/mergelimitertest.cpp
@@ -0,0 +1,161 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/util/linkedptr.h>
+#include <vespa/storage/distributor/operations/idealstate/mergelimiter.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+namespace storage {
+namespace distributor {
+
+struct MergeLimiterTest : public CppUnit::TestFixture
+{
+ void testKeepsAllBelowLimit();
+ void testLessThanMaxUntrusted();
+ void testMoreThanMaxUntrusted();
+ void testAllUntrustedLessThanMaxVariants();
+ void testAllUntrustedMoreThanMaxVariants();
+ void testSourceOnlyLast();
+
+ CPPUNIT_TEST_SUITE(MergeLimiterTest);
+ CPPUNIT_TEST(testKeepsAllBelowLimit);
+ CPPUNIT_TEST(testLessThanMaxUntrusted);
+ CPPUNIT_TEST(testMoreThanMaxUntrusted);
+ CPPUNIT_TEST(testAllUntrustedLessThanMaxVariants);
+ CPPUNIT_TEST(testAllUntrustedMoreThanMaxVariants);
+ CPPUNIT_TEST(testSourceOnlyLast);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(MergeLimiterTest);
+
+namespace {
+ typedef vespalib::LinkedPtr<BucketCopy> BucketCopyPtr;
+ std::vector<BucketCopyPtr> _bucketDatabase;
+
+ struct NodeFactory {
+ std::vector<MergeMetaData> _nodes;
+
+ NodeFactory& add(int index, int crc) {
+ _bucketDatabase.push_back(BucketCopyPtr(
+ new BucketCopy(0, index, api::BucketInfo(crc, 5, 10))));
+ _nodes.push_back(MergeMetaData(index, *_bucketDatabase.back()));
+ return *this;
+ }
+ NodeFactory& addTrusted(int index, int crc) {
+ add(index, crc);
+ _bucketDatabase.back()->setTrusted(true);
+ return *this;
+ }
+ NodeFactory& setSourceOnly() {
+ _nodes.back()._sourceOnly = true;
+ return *this;
+ }
+
+ operator const MergeLimiter::NodeArray&() const { return _nodes; }
+ };
+
+ #define ASSERT_LIMIT(maxNodes, nodes, result) \
+ { \
+ MergeLimiter limiter(maxNodes); \
+ limiter.limitMergeToMaxNodes(nodes); \
+ std::ostringstream actual; \
+ for (uint32_t i=0; i<nodes.size(); ++i) { \
+ if (i != 0) actual << ","; \
+ actual << nodes[i]._nodeIndex; \
+ if (nodes[i]._sourceOnly) actual << 's'; \
+ } \
+ CPPUNIT_ASSERT_EQUAL(std::string(result), actual.str()); \
+ }
+}
+
+// If there is <= max nodes, then none should be removed.
+void
+MergeLimiterTest::testKeepsAllBelowLimit()
+{
+ MergeLimiter::NodeArray nodes(NodeFactory()
+ .addTrusted(3, 0x4)
+ .addTrusted(5, 0x4)
+ .add(9, 0x6)
+ .add(2, 0x6)
+ .add(4, 0x5));
+
+ ASSERT_LIMIT(8, nodes, "3,5,9,2,4");
+}
+// If less than max nodes is untrusted, merge all untrusted copies with a
+// trusted one. (Optionally with extra trusted copies if there is space)
+void
+MergeLimiterTest::testLessThanMaxUntrusted()
+{
+ MergeLimiter::NodeArray nodes(NodeFactory()
+ .addTrusted(3, 0x4)
+ .addTrusted(5, 0x4)
+ .add(9, 0x6)
+ .add(2, 0x6)
+ .add(4, 0x5));
+ ASSERT_LIMIT(4, nodes, "2,4,9,5");
+}
+// With more than max untrusted, just merge one trusted with as many untrusted
+// that fits.
+void
+MergeLimiterTest::testMoreThanMaxUntrusted()
+{
+ MergeLimiter::NodeArray nodes(NodeFactory()
+ .addTrusted(3, 0x4)
+ .addTrusted(5, 0x4)
+ .add(9, 0x6)
+ .add(2, 0x6)
+ .add(13, 0x9)
+ .add(1, 0x7)
+ .add(4, 0x5));
+ ASSERT_LIMIT(4, nodes, "2,13,1,5");
+}
+// With nothing trusted. If there is <= max different variants (checksums),
+// merge one of each variant. After this merge, all these nodes can be set
+// trusted. (Except for any source only ones)
+void
+MergeLimiterTest::testAllUntrustedLessThanMaxVariants()
+{
+ MergeLimiter::NodeArray nodes(NodeFactory()
+ .add(3, 0x4)
+ .add(5, 0x4)
+ .add(9, 0x6)
+ .add(2, 0x6)
+ .add(13, 0x3)
+ .add(1, 0x3)
+ .add(4, 0x3));
+ ASSERT_LIMIT(4, nodes, "5,2,4,3");
+}
+// With nothing trusted and more than max variants, we just have to merge one
+// of each variant until we end up with less than max variants.
+void
+MergeLimiterTest::testAllUntrustedMoreThanMaxVariants()
+{
+ MergeLimiter::NodeArray nodes(NodeFactory()
+ .add(3, 0x4)
+ .add(5, 0x5)
+ .add(9, 0x6)
+ .add(2, 0x6)
+ .add(13, 0x3)
+ .add(1, 0x9)
+ .add(4, 0x8));
+ ASSERT_LIMIT(4, nodes, "3,5,2,13");
+}
+
+// With more than max untrusted, just merge one trusted with as many untrusted
+// that fits.
+void
+MergeLimiterTest::testSourceOnlyLast()
+{
+ MergeLimiter::NodeArray nodes(NodeFactory()
+ .addTrusted(3, 0x4)
+ .addTrusted(5, 0x4).setSourceOnly()
+ .add(9, 0x6)
+ .add(2, 0x6).setSourceOnly()
+ .add(13, 0x9)
+ .add(1, 0x7)
+ .add(4, 0x5));
+ ASSERT_LIMIT(4, nodes, "13,1,2s,5s");
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/mergeoperationtest.cpp b/storage/src/tests/distributor/mergeoperationtest.cpp
new file mode 100644
index 00000000000..a2373731bc3
--- /dev/null
+++ b/storage/src/tests/distributor/mergeoperationtest.cpp
@@ -0,0 +1,430 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <boost/lexical_cast.hpp>
+#include <cppunit/extensions/HelperMacros.h>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storage/distributor/operations/idealstate/mergeoperation.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storage/distributor/bucketdbupdater.h>
+#include <tests/distributor/distributortestutil.h>
+#include <vespa/vespalib/text/stringtokenizer.h>
+
+using std::shared_ptr;
+
+namespace storage {
+namespace distributor {
+
+class MergeOperationTest : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+ CPPUNIT_TEST_SUITE(MergeOperationTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST(testFailIfSourceOnlyCopiesChanged);
+ CPPUNIT_TEST(testGenerateNodeList);
+ CPPUNIT_TEST(doNotRemoveCopiesWithPendingMessages);
+ CPPUNIT_TEST(testDoNotRemoveActiveSourceOnlyCopies);
+ CPPUNIT_TEST(testMarkRedundantTrustedCopiesAsSourceOnly);
+ CPPUNIT_TEST(onlyMarkRedundantRetiredReplicasAsSourceOnly);
+ CPPUNIT_TEST_SUITE_END();
+
+ std::unique_ptr<PendingMessageTracker> _pendingTracker;
+
+protected:
+ void testSimple();
+ void testFailIfSourceOnlyCopiesChanged();
+ void testGenerateNodeList();
+ void doNotRemoveCopiesWithPendingMessages();
+ void testDoNotRemoveActiveSourceOnlyCopies();
+ void testMarkRedundantTrustedCopiesAsSourceOnly();
+ void onlyMarkRedundantRetiredReplicasAsSourceOnly();
+
+public:
+ void setUp() {
+ createLinks();
+ _pendingTracker.reset(new PendingMessageTracker(getComponentRegister()));
+ _sender.setPendingMessageTracker(*_pendingTracker);
+ }
+
+ void tearDown() {
+ close();
+ }
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(MergeOperationTest);
+
+void
+MergeOperationTest::testSimple()
+{
+ getClock().setAbsoluteTimeInSeconds(10);
+
+ addNodesToBucketDB(document::BucketId(16, 1),
+ "0=10/1/1/t,"
+ "1=20/1/1,"
+ "2=10/1/1/t");
+
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:3"));
+
+ MergeOperation op(BucketAndNodes(document::BucketId(16, 1),
+ toVector<uint16_t>(0, 1, 2)));
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "MergeBucketCommand(BucketId(0x4000000000000001), to time 10000000, "
+ "cluster state version: 0, nodes: [0, 2, 1 (source only)], chain: [], "
+ "reasons to start: ) => 0"),
+ _sender.getLastCommand(true));
+
+ sendReply(op);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("DeleteBucketCommand(BucketId(0x4000000000000001)) "
+ "Reasons to start: => 1"),
+ _sender.getLastCommand(true));
+
+}
+
+void
+MergeOperationTest::testFailIfSourceOnlyCopiesChanged()
+{
+ getClock().setAbsoluteTimeInSeconds(10);
+
+ addNodesToBucketDB(document::BucketId(16, 1),
+ "0=10/1/1/t,"
+ "1=20/1/1,"
+ "2=10/1/1/t");
+
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:3"));
+
+ MergeOperation op(BucketAndNodes(document::BucketId(16, 1),
+ toVector<uint16_t>(0, 1, 2)));
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ std::string merge("MergeBucketCommand(BucketId(0x4000000000000001), to time 10000000, "
+ "cluster state version: 0, nodes: [0, 2, 1 (source only)], chain: [], "
+ "reasons to start: ) => 0");
+
+ CPPUNIT_ASSERT_EQUAL(merge, _sender.getLastCommand(true));
+ {
+ const api::MergeBucketCommand& cmd(
+ dynamic_cast<api::MergeBucketCommand&>(*_sender.commands[0]));
+ CPPUNIT_ASSERT_EQUAL(uint16_t(0), cmd.getSourceIndex());
+ }
+
+ // Source-only copy changed during merge
+ addNodesToBucketDB(document::BucketId(16, 1),
+ "0=10/1/1/t,"
+ "1=40/1/1,"
+ "2=10/1/1/t");
+ sendReply(op);
+ // Should not be a remove here!
+ CPPUNIT_ASSERT_EQUAL(merge, _sender.getLastCommand(true));
+ CPPUNIT_ASSERT(!op.ok());
+}
+
+namespace {
+std::string getNodeList(std::string state, uint32_t redundancy, std::string existing) {
+ lib::Distribution distribution(
+ lib::Distribution::getDefaultDistributionConfig(redundancy));
+ lib::ClusterState clusterState(state);
+ vespalib::StringTokenizer st(existing, ",");
+ std::vector<BucketCopy> bucketDB(st.size());
+ for (uint32_t i = 0; i < st.size(); i++) {
+ std::string num = st[i];
+ size_t pos = num.find('t');
+ bool trusted = false;
+
+ if (pos != std::string::npos) {
+ num.erase(pos);
+ trusted = true;
+ }
+ bucketDB[i] = BucketCopy(0, atoi(num.c_str()),
+ api::BucketInfo(1, 2, 3));
+ bucketDB[i].setTrusted(trusted);
+ }
+ std::vector<MergeMetaData> nodes(st.size());
+ for (uint32_t i = 0; i < st.size(); i++) {
+ nodes[i] = MergeMetaData(bucketDB[i].getNode(), bucketDB[i]);
+ }
+ MergeLimiter limiter(16);
+ MergeOperation::generateSortedNodeList(distribution, clusterState,
+ document::BucketId(32, 1),
+ limiter, nodes);
+ std::ostringstream actual;
+ for (uint32_t i = 0; i < nodes.size(); i++) {
+ if (i != 0) {
+ actual << ",";
+ }
+ actual << nodes[i]._nodeIndex;
+ if (nodes[i]._sourceOnly) {
+ actual << "s";
+ }
+ }
+ return actual.str();
+}
+}
+
+void
+MergeOperationTest::testGenerateNodeList()
+{
+ // If this fails, the distribution has changed and the rest of the test will
+ // likely fail
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,9,2,1,4"),
+ getNodeList("storage:10", 10, "0,1,2,3,4,5,6,7,8,9"));
+
+ // Nodes that are initializing should be treated as up
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7s,6s"),
+ getNodeList("storage:10 .3.s:i .5.s:i", 2, "7,6,3,5")); // Ideal: 3,5
+
+ // Order is given by ideal state algorithm, not order of storagenodes in bucket db
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7"),
+ getNodeList("storage:10", 3, "3,7,5"));
+
+ // Node not in ideal state will be used if not enough nodes in ideal state
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,7,6"),
+ getNodeList("storage:10", 3, "3,7,6"));
+
+ // Nodes not in ideal state will be included as source only after redundancy
+ // is reached
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,8s"),
+ getNodeList("storage:10", 3, "3,5,7,8"));
+
+ // Need at least redundancy copies that are not source only
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,8,9s"),
+ getNodeList("storage:10", 3, "3,5,8,9"));
+
+ // Order is given by storagenodes in bucket db
+ // when no nodes are in ideal state
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("4,1,2"),
+ getNodeList("storage:10", 3, "4,1,2"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,0s,1s,2s,4s,5s,6s,7s,8s,9s"),
+ getNodeList("storage:10", 1, "0,1,2,3,4,5,6,7,8,9"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,0s,1s,2s,4s,6s,7s,8s,9s"),
+ getNodeList("storage:10", 2, "0,1,2,3,4,5,6,7,8,9"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,0s,1s,2s,4s,6s,8s,9s"),
+ getNodeList("storage:10", 3, "0,1,2,3,4,5,6,7,8,9"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,0s,1s,2s,4s,8s,9s"),
+ getNodeList("storage:10", 4, "0,1,2,3,4,5,6,7,8,9"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0s,1s,2s,4s,9s"),
+ getNodeList("storage:10", 5, "0,1,2,3,4,5,6,7,8,9"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,1s,2s,4s,9s"),
+ getNodeList("storage:10", 6, "0,1,2,3,4,5,6,7,8,9"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,9,1s,2s,4s"),
+ getNodeList("storage:10", 7, "0,1,2,3,4,5,6,7,8,9"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,9,2,1s,4s"),
+ getNodeList("storage:10", 8, "0,1,2,3,4,5,6,7,8,9"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,9,2,1,4s"),
+ getNodeList("storage:10", 9, "0,1,2,3,4,5,6,7,8,9"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,9,2,1,4"),
+ getNodeList("storage:10", 10, "0,1,2,3,4,5,6,7,8,9"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,9s,8s,7s,6s,5s,4s,2s,1s,0s"),
+ getNodeList("storage:10", 1, "9,8,7,6,5,4,3,2,1,0"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,9s,8s,7s,6s,4s,2s,1s,0s"),
+ getNodeList("storage:10", 2, "9,8,7,6,5,4,3,2,1,0"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,9s,8s,6s,4s,2s,1s,0s"),
+ getNodeList("storage:10", 3, "9,8,7,6,5,4,3,2,1,0"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,9s,8s,4s,2s,1s,0s"),
+ getNodeList("storage:10", 4, "9,8,7,6,5,4,3,2,1,0"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,9s,4s,2s,1s,0s"),
+ getNodeList("storage:10", 5, "9,8,7,6,5,4,3,2,1,0"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,9s,4s,2s,1s"),
+ getNodeList("storage:10", 6, "9,8,7,6,5,4,3,2,1,0"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,9,4s,2s,1s"),
+ getNodeList("storage:10", 7, "9,8,7,6,5,4,3,2,1,0"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,9,2,4s,1s"),
+ getNodeList("storage:10", 8, "9,8,7,6,5,4,3,2,1,0"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,9,2,1,4s"),
+ getNodeList("storage:10", 9, "9,8,7,6,5,4,3,2,1,0"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,9,2,1,4"),
+ getNodeList("storage:10", 10, "9,8,7,6,5,4,3,2,1,0"));
+
+ // Trusted copies should not be source only.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,9,1,2,4s"),
+ getNodeList("storage:10", 7, "0,1t,2t,3,4,5,6,7,8,9"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,9,2,1s,4s"),
+ getNodeList("storage:10", 7, "0,1,2t,3,4,5,6,7,8,9"));
+
+ // Retired nodes are not in ideal state
+ // Ideal: 5,7
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,2,3s"),
+ getNodeList("storage:10 .3.s:r", 2, "0,2,3"));
+ // Ideal: 5,7,6
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,2,3"),
+ getNodeList("storage:10 .3.s:r", 3, "0,2,3"));
+}
+
+void
+MergeOperationTest::doNotRemoveCopiesWithPendingMessages()
+{
+ document::BucketId bucket(16, 1);
+
+ getClock().setAbsoluteTimeInSeconds(10);
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:3"));
+ addNodesToBucketDB(bucket,
+ "0=10/1/1/t,"
+ "1=20/1/1,"
+ "2=10/1/1/t");
+
+ MergeOperation op(BucketAndNodes(bucket,
+ toVector<uint16_t>(0, 1, 2)));
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ std::string merge("MergeBucketCommand(BucketId(0x4000000000000001), to time 10000000, "
+ "cluster state version: 0, nodes: [0, 2, 1 (source only)], chain: [], "
+ "reasons to start: ) => 0");
+
+ CPPUNIT_ASSERT_EQUAL(merge, _sender.getLastCommand(true));
+
+ // Suddenly a wild operation appears to the source only copy!
+ // Removes are blocked by all and any operation types, so can just choose
+ // at will.
+ api::StorageMessage::SP msg(
+ new api::SetBucketStateCommand(bucket, api::SetBucketStateCommand::ACTIVE));
+ msg->setAddress(api::StorageMessageAddress("storage", lib::NodeType::STORAGE, 1));
+ _pendingTracker->insert(msg);
+
+
+ sendReply(op);
+ // Should not be a remove here!
+ CPPUNIT_ASSERT_EQUAL(merge, _sender.getLastCommand(true));
+ CPPUNIT_ASSERT(!op.ok());
+}
+
+void
+MergeOperationTest::testDoNotRemoveActiveSourceOnlyCopies()
+{
+ getClock().setAbsoluteTimeInSeconds(10);
+
+ addNodesToBucketDB(document::BucketId(16, 1),
+ "0=10/1/1/t,"
+ "1=20/1/1/u/a,"
+ "2=10/1/1/t");
+
+ _distributor->enableClusterState(
+ lib::ClusterState("distributor:1 storage:3"));
+ MergeOperation op(BucketAndNodes(document::BucketId(16, 1),
+ toVector<uint16_t>(0, 1, 2)));
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ std::string merge(
+ "MergeBucketCommand(BucketId(0x4000000000000001), to time "
+ "10000000, cluster state version: 0, nodes: [0, 2, 1 "
+ "(source only)], chain: [], reasons to start: ) => 0");
+ CPPUNIT_ASSERT_EQUAL(merge, _sender.getLastCommand(true));
+
+ sendReply(op);
+ // No DeleteBucket shall have been sent
+ CPPUNIT_ASSERT_EQUAL(merge, _sender.getLastCommand(true));
+}
+
+void
+MergeOperationTest::testMarkRedundantTrustedCopiesAsSourceOnly()
+{
+ // This test uses the same distribution as testGenerateNodeList(), i.e.
+ // an ideal state sequence of [3, 5, 7, 6, 8, 0, 9, 2, 1, 4]
+
+ // 3 redundancy, 5 trusted -> 2 trusted source only.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6s,8s"),
+ getNodeList("storage:10", 3, "3t,5t,7t,6t,8t"));
+
+ // 3 redundancy, 4 trusted -> 1 trusted source only.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6s,8s"),
+ getNodeList("storage:10", 3, "3t,5t,7t,6t,8"));
+
+ // 3 redundancy, 3 trusted -> 0 trusted source only, 2 non-trusted sources.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6s,8s"),
+ getNodeList("storage:10", 3, "3t,5t,7t,6,8"));
+
+ // 3 redundancy, 4 trusted -> 1 source only trusted.
+ // We allow marking a trusted, non-ideal copy as source even when we don't
+ // have #redundancy trusted _ideal_ copies, as long as we're left with >=
+ // #redundancy trusted copies in total.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8s"),
+ getNodeList("storage:10", 3, "3t,5t,7,6t,8t"));
+
+ // Not sufficient number of trusted copies to mark any as source only.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8"),
+ getNodeList("storage:10", 3, "3t,5,7,6t,8t"));
+
+ // Same as above, with all trusted copies being non-ideal.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8"),
+ getNodeList("storage:10", 3, "3,5,7,6t,8t"));
+
+ // #redundancy of trusted, but none are ideal. Non-ideal trusted should
+ // not be marked as source only (though we can mark non-trusted non-ideal
+ // node as source only).
+ // Note the node reordering since trusted are added before the rest.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,8,0,9,6s"),
+ getNodeList("storage:10", 3, "3,5,7,6,8t,0t,9t"));
+
+ // But allow for removing excess trusted, non-ideal copies.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("3,5,7,6,8,0,9s"),
+ getNodeList("storage:10", 3, "3,5,7,6t,8t,0t,9t"));
+}
+
+void
+MergeOperationTest::onlyMarkRedundantRetiredReplicasAsSourceOnly()
+{
+ // No nodes in ideal state and all nodes are retired. With redundancy of 2
+ // we can only mark the last replica in the DB as source-only. Retired
+ // nodes are meant as source-only due to being migrated away from, but
+ // source-only nodes will have their replica removed after a successful
+ // merge, which we cannot allow to happen here.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("0,1,2s"),
+ getNodeList("storage:3 .0.s.:r .1.s:r .2.s:r", 2, "1,0,2"));
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/messagesenderstub.cpp b/storage/src/tests/distributor/messagesenderstub.cpp
new file mode 100644
index 00000000000..88210a94848
--- /dev/null
+++ b/storage/src/tests/distributor/messagesenderstub.cpp
@@ -0,0 +1,88 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <tests/distributor/messagesenderstub.h>
+#include <tests/distributor/distributortestutil.h>
+
+namespace storage {
+
+std::string
+MessageSenderStub::getLastCommand(bool verbose) const
+{
+ if (commands.empty()) {
+ throw std::logic_error("Expected command where there was none");
+ }
+ return dumpMessage(*commands[commands.size() - 1],
+ true,
+ verbose);
+
+}
+
+std::string
+MessageSenderStub::dumpMessage(const api::StorageMessage& msg,
+ bool includeAddress,
+ bool verbose) const
+{
+ std::ostringstream ost;
+
+ if (verbose) {
+ ost << msg;
+ } else {
+ ost << msg.getType().getName();
+ }
+
+ if (includeAddress && msg.getAddress()) {
+ ost << " => " << msg.getAddress()->getIndex();
+ }
+ if (verbose && msg.getType().isReply()) {
+ ost << " " << dynamic_cast<const api::StorageReply&>(msg).getResult();
+ }
+
+ return ost.str();
+}
+
+std::string
+MessageSenderStub::getCommands(bool includeAddress, bool verbose, uint32_t fromIdx) const
+{
+ std::ostringstream ost;
+
+ for (uint32_t i = fromIdx; i < commands.size(); i++) {
+ if (i != fromIdx) {
+ ost << ",";
+ }
+
+ ost << dumpMessage(*commands[i], includeAddress, verbose);
+ }
+
+ return ost.str();
+}
+
+std::string
+MessageSenderStub::getLastReply(bool verbose) const
+{
+ if (replies.empty()) {
+ throw std::logic_error("Expected reply where there was none");
+ }
+
+ return dumpMessage(*replies.back(),
+ true,
+ verbose);
+
+}
+
+std::string
+MessageSenderStub::getReplies(bool includeAddress, bool verbose) const
+{
+ std::ostringstream ost;
+ for (uint32_t i = 0; i < replies.size(); i++) {
+ if (i != 0) {
+ ost << ",";
+ }
+
+ ost << dumpMessage(*replies[i], includeAddress, verbose);
+ }
+
+ return ost.str();
+}
+
+}
+
diff --git a/storage/src/tests/distributor/messagesenderstub.h b/storage/src/tests/distributor/messagesenderstub.h
new file mode 100644
index 00000000000..d70c5355868
--- /dev/null
+++ b/storage/src/tests/distributor/messagesenderstub.h
@@ -0,0 +1,71 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/distributor.h>
+#include <vespa/storage/distributor/distributormessagesender.h>
+
+namespace storage {
+
+struct MessageSenderStub : distributor::DistributorMessageSender
+{
+ std::vector<std::shared_ptr<api::StorageCommand> > commands;
+ std::vector<std::shared_ptr<api::StorageReply> > replies;
+
+ MessageSenderStub()
+ : _clusterName("storage"),
+ _pendingMessageTracker(0)
+ {}
+
+ void clear() {
+ commands.clear();
+ replies.clear();
+ }
+
+ virtual void sendCommand(const std::shared_ptr<api::StorageCommand>& cmd)
+ {
+ commands.push_back(cmd);
+ }
+
+ virtual void sendReply(const std::shared_ptr<api::StorageReply>& reply)
+ {
+ replies.push_back(reply);
+ }
+
+ std::string getLastCommand(bool verbose = true) const;
+
+ std::string getCommands(bool includeAddress = false,
+ bool verbose = false,
+ uint32_t fromIndex = 0) const;
+
+ std::string getLastReply(bool verbose = true) const;
+
+ std::string getReplies(bool includeAddress = false,
+ bool verbose = false) const;
+
+ std::string dumpMessage(const api::StorageMessage& msg,
+ bool includeAddress,
+ bool verbose) const;
+
+ virtual int getDistributorIndex() const {
+ return 0;
+ }
+
+ virtual const std::string& getClusterName() const {
+ return _clusterName;
+ }
+
+ virtual const distributor::PendingMessageTracker& getPendingMessageTracker() const {
+ assert(_pendingMessageTracker);
+ return *_pendingMessageTracker;
+ }
+
+ void setPendingMessageTracker(distributor::PendingMessageTracker& tracker) {
+ _pendingMessageTracker = &tracker;
+ }
+private:
+ std::string _clusterName;
+ distributor::PendingMessageTracker* _pendingMessageTracker;
+};
+
+}
+
diff --git a/storage/src/tests/distributor/nodeinfotest.cpp b/storage/src/tests/distributor/nodeinfotest.cpp
new file mode 100644
index 00000000000..883e6ba7668
--- /dev/null
+++ b/storage/src/tests/distributor/nodeinfotest.cpp
@@ -0,0 +1,83 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageframework/defaultimplementation/clock/fakeclock.h>
+#include <vespa/storage/distributor/bucketdbupdater.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/vdslib/state/random.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storage/distributor/pendingclusterstate.h>
+#include <vespa/vespalib/text/stringtokenizer.h>
+#include <vespa/storage/distributor/nodeinfo.h>
+
+#include <iostream>
+#include <fstream>
+#include <string>
+
+namespace storage {
+namespace distributor {
+
+class NodeInfoTest : public CppUnit::TestFixture {
+ CPPUNIT_TEST_SUITE(NodeInfoTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST_SUITE_END();
+public:
+ void testSimple();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(NodeInfoTest);
+
+void
+NodeInfoTest::testSimple()
+{
+ framework::defaultimplementation::FakeClock clock;
+ NodeInfo info(clock);
+
+ CPPUNIT_ASSERT_EQUAL(0, (int)info.getPendingCount(3));
+ CPPUNIT_ASSERT_EQUAL(0, (int)info.getPendingCount(9));
+
+ info.incPending(3);
+ info.incPending(3);
+ info.incPending(3);
+ info.incPending(3);
+ info.decPending(3);
+ info.decPending(4);
+ info.incPending(7);
+ info.incPending(4);
+ info.decPending(3);
+
+ CPPUNIT_ASSERT_EQUAL(2, (int)info.getPendingCount(3));
+ CPPUNIT_ASSERT_EQUAL(1, (int)info.getPendingCount(4));
+ CPPUNIT_ASSERT_EQUAL(1, (int)info.getPendingCount(7));
+ CPPUNIT_ASSERT_EQUAL(0, (int)info.getPendingCount(5));
+
+ info.setBusy(5);
+ clock.addSecondsToTime(10);
+ info.setBusy(1);
+ clock.addSecondsToTime(20);
+ info.setBusy(42);
+
+ CPPUNIT_ASSERT_EQUAL(true, info.isBusy(5));
+ CPPUNIT_ASSERT_EQUAL(true, info.isBusy(1));
+ CPPUNIT_ASSERT_EQUAL(true, info.isBusy(42));
+ CPPUNIT_ASSERT_EQUAL(false, info.isBusy(7));
+
+ clock.addSecondsToTime(42);
+
+ CPPUNIT_ASSERT_EQUAL(false, info.isBusy(5));
+ CPPUNIT_ASSERT_EQUAL(false, info.isBusy(1));
+ CPPUNIT_ASSERT_EQUAL(true, info.isBusy(42));
+ CPPUNIT_ASSERT_EQUAL(false, info.isBusy(7));
+
+}
+
+}
+
+}
diff --git a/storage/src/tests/distributor/nodemaintenancestatstrackertest.cpp b/storage/src/tests/distributor/nodemaintenancestatstrackertest.cpp
new file mode 100644
index 00000000000..f1c177e7939
--- /dev/null
+++ b/storage/src/tests/distributor/nodemaintenancestatstrackertest.cpp
@@ -0,0 +1,102 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+#include <vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h>
+
+namespace storage {
+namespace distributor {
+
+class NodeMaintenanceStatsTrackerTest : public CppUnit::TestFixture
+{
+ CPPUNIT_TEST_SUITE(NodeMaintenanceStatsTrackerTest);
+ CPPUNIT_TEST(emptyStatsInstancesAreEqual);
+ CPPUNIT_TEST(statsFieldsAffectEqualityComparison);
+ CPPUNIT_TEST(requestingNonExistingNodeGivesEmptyStats);
+ CPPUNIT_TEST(statsAreTrackedPerNode);
+ CPPUNIT_TEST_SUITE_END();
+
+ void emptyStatsInstancesAreEqual();
+ void statsFieldsAffectEqualityComparison();
+ void requestingNonExistingNodeGivesEmptyStats();
+ void statsAreTrackedPerNode();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(NodeMaintenanceStatsTrackerTest);
+
+void
+NodeMaintenanceStatsTrackerTest::emptyStatsInstancesAreEqual()
+{
+ NodeMaintenanceStats a;
+ NodeMaintenanceStats b;
+ CPPUNIT_ASSERT_EQUAL(a, b);
+}
+
+void
+NodeMaintenanceStatsTrackerTest::statsFieldsAffectEqualityComparison()
+{
+ NodeMaintenanceStats a;
+ NodeMaintenanceStats b;
+
+ a.movingOut = 1;
+ CPPUNIT_ASSERT(!(a == b));
+ b.movingOut = 1;
+ CPPUNIT_ASSERT(a == b);
+
+ a.syncing = 1;
+ CPPUNIT_ASSERT(!(a == b));
+ b.syncing = 1;
+ CPPUNIT_ASSERT(a == b);
+
+ a.copyingIn = 1;
+ CPPUNIT_ASSERT(!(a == b));
+ b.copyingIn = 1;
+ CPPUNIT_ASSERT(a == b);
+
+ a.copyingOut = 1;
+ CPPUNIT_ASSERT(!(a == b));
+ b.copyingOut = 1;
+ CPPUNIT_ASSERT(a == b);
+}
+
+void
+NodeMaintenanceStatsTrackerTest::requestingNonExistingNodeGivesEmptyStats()
+{
+ NodeMaintenanceStatsTracker tracker;
+ NodeMaintenanceStats wanted;
+ CPPUNIT_ASSERT_EQUAL(wanted, tracker.forNode(0));
+}
+
+void
+NodeMaintenanceStatsTrackerTest::statsAreTrackedPerNode()
+{
+ NodeMaintenanceStatsTracker tracker;
+ NodeMaintenanceStats wanted;
+
+ tracker.incMovingOut(0);
+ wanted.movingOut = 1;
+ CPPUNIT_ASSERT_EQUAL(wanted, tracker.forNode(0));
+ wanted.movingOut = 0;
+ CPPUNIT_ASSERT_EQUAL(wanted, tracker.forNode(1));
+
+ tracker.incMovingOut(0);
+ wanted.movingOut = 2;
+ CPPUNIT_ASSERT_EQUAL(wanted, tracker.forNode(0));
+
+ tracker.incMovingOut(1);
+ wanted.movingOut = 1;
+ CPPUNIT_ASSERT_EQUAL(wanted, tracker.forNode(1));
+
+ tracker.incSyncing(1);
+ tracker.incCopyingIn(1);
+ tracker.incCopyingOut(1);
+ wanted.syncing = 1;
+ wanted.copyingIn = 1;
+ wanted.copyingOut = 1;
+ CPPUNIT_ASSERT_EQUAL(wanted, tracker.forNode(1));
+}
+
+} // distributor
+} // storage
+
diff --git a/storage/src/tests/distributor/operationtargetresolvertest.cpp b/storage/src/tests/distributor/operationtargetresolvertest.cpp
new file mode 100644
index 00000000000..5b23d3a7a9e
--- /dev/null
+++ b/storage/src/tests/distributor/operationtargetresolvertest.cpp
@@ -0,0 +1,316 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <vespa/config/helper/configgetter.h>
+#include <vespa/document/config/config-documenttypes.h>
+#include <vespa/document/repo/documenttyperepo.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/state.h>
+#include <tests/distributor/distributortestutil.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/vdslib/distribution/idealnodecalculatorimpl.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/distributor/operationtargetresolverimpl.h>
+
+using document::BucketId;
+
+namespace storage {
+namespace distributor {
+
+struct OperationTargetResolverTest : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+
+ document::DocumentTypeRepo::SP _repo;
+ const document::DocumentType* _html_type;
+ std::unique_ptr<Operation> op;
+
+ void testSimple();
+ void testMultipleNodes();
+ void testChooseIdealStateWhenManyCopies();
+ void testChooseHighestSplitBucket();
+ void testChooseHighestSplitBucketPerNode();
+ void testChooseHighestSplitBucketWithTrusted();
+ void testInconsistentBucketsAreNotExplicitlyCreated();
+ void testNoTrustedOrIdealStateCopyAvailable();
+ void testCreateMissingCopies();
+ void testNoExistingCopies();
+ void testCountMaintenanceNodesAsDown();
+ void testResolvingDoesNotMutateDatabase();
+ void testTrustedOverIdealState();
+
+ BucketInstanceList getInstances(const BucketId& bid,
+ bool stripToRedundancy);
+
+ void setUp() {
+ _repo.reset(new document::DocumentTypeRepo(
+ *config::ConfigGetter<document::DocumenttypesConfig>::getConfig(
+ "config-doctypes", config::FileSpec("config-doctypes.cfg"))));
+ _html_type = _repo->getDocumentType("text/html");
+ createLinks();
+ };
+
+ void tearDown() {
+ close();
+ }
+
+ CPPUNIT_TEST_SUITE(OperationTargetResolverTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST(testMultipleNodes);
+ CPPUNIT_TEST(testChooseIdealStateWhenManyCopies);
+ CPPUNIT_TEST(testChooseHighestSplitBucket);
+ CPPUNIT_TEST(testChooseHighestSplitBucketPerNode);
+ CPPUNIT_TEST(testChooseHighestSplitBucketWithTrusted);
+ CPPUNIT_TEST(testNoTrustedOrIdealStateCopyAvailable);
+ CPPUNIT_TEST(testInconsistentBucketsAreNotExplicitlyCreated);
+ CPPUNIT_TEST(testCreateMissingCopies);
+ CPPUNIT_TEST(testNoExistingCopies);
+ CPPUNIT_TEST(testCountMaintenanceNodesAsDown);
+ CPPUNIT_TEST(testResolvingDoesNotMutateDatabase);
+ CPPUNIT_TEST(testTrustedOverIdealState);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(OperationTargetResolverTest);
+
+namespace {
+
+ // Create assertion that makes it easy to write tests, and report correct
+ // line for problem at command line
+#define ASSERT_THAT(id) \
+ { \
+ struct MyAsserter : public Asserter { \
+ void assertEqualMsg(std::string t1, OperationTargetList t2, \
+ OperationTargetList t3) { \
+ CPPUNIT_ASSERT_EQUAL_MSG(t1, t2, t3); \
+ } \
+ }; \
+ _asserters.push_back(new MyAsserter); \
+ } \
+ TestTargets::createTest(id, *this, *_asserters.back())
+
+ struct Asserter {
+ virtual ~Asserter() {}
+ virtual void assertEqualMsg(std::string t1,
+ OperationTargetList t2,
+ OperationTargetList t3) = 0;
+ };
+ std::vector<Asserter*> _asserters;
+ struct TestTargets {
+ const BucketId& _id;
+ OperationTargetList _expected;
+ OperationTargetResolverTest& _test;
+ Asserter& _asserter;
+
+ TestTargets(const BucketId& id,
+ OperationTargetResolverTest& test,
+ Asserter& asserter)
+ : _id(id), _test(test), _asserter(asserter) {}
+
+ ~TestTargets() {
+ BucketInstanceList result(_test.getInstances(_id, true));
+ BucketInstanceList all(_test.getInstances(_id, false));
+ _asserter.assertEqualMsg(
+ all.toString(), _expected, result.createTargets());
+ delete _asserters.back();
+ _asserters.pop_back();
+ }
+
+ TestTargets& sendsTo(const BucketId& id, uint16_t node) {
+ _expected.push_back(OperationTarget(
+ id, lib::Node(lib::NodeType::STORAGE, node), false));
+ return *this;
+ }
+ TestTargets& createsAt(const BucketId& id, uint16_t node) {
+ _expected.push_back(OperationTarget(
+ id, lib::Node(lib::NodeType::STORAGE, node), true));
+ return *this;
+ }
+
+ static TestTargets createTest(const BucketId& id,
+ OperationTargetResolverTest& test,
+ Asserter& asserter)
+ {
+ return TestTargets(id, test, asserter);
+ }
+ };
+
+
+} // anonymous
+
+BucketInstanceList
+OperationTargetResolverTest::getInstances(const BucketId& id,
+ bool stripToRedundancy)
+{
+ lib::IdealNodeCalculatorImpl idealNodeCalc;
+ idealNodeCalc.setDistribution(getExternalOperationHandler().getDistribution());
+ idealNodeCalc.setClusterState(getExternalOperationHandler().getClusterState());
+ OperationTargetResolverImpl resolver(
+ getExternalOperationHandler().getBucketDatabase(), idealNodeCalc, 16,
+ getExternalOperationHandler().getDistribution().getRedundancy());
+ if (stripToRedundancy) {
+ return resolver.getInstances(OperationTargetResolver::PUT, id);
+ } else {
+ return resolver.getAllInstances(OperationTargetResolver::PUT, id);
+ }
+}
+
+/*
+ * Test basic case with no inconsistencies
+ */
+void
+OperationTargetResolverTest::testSimple()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ addNodesToBucketDB(BucketId(16, 0), "0=0,1=0");
+
+ ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(16, 0), 1)
+ .sendsTo(BucketId(16, 0), 0);
+}
+
+void
+OperationTargetResolverTest::testMultipleNodes()
+{
+ setupDistributor(1, 2, "storage:2 distributor:1");
+
+ for (int i = 0; i < 100; ++i) {
+ addNodesToBucketDB(BucketId(16, i), "0=0,1=0");
+
+ lib::IdealNodeCalculatorImpl idealNodeCalc;
+ idealNodeCalc.setDistribution(getExternalOperationHandler().getDistribution());
+ idealNodeCalc.setClusterState(getExternalOperationHandler().getClusterState());
+ lib::IdealNodeList idealNodes(
+ idealNodeCalc.getIdealStorageNodes(BucketId(16, i)));
+ uint16_t expectedNode = idealNodes[0].getIndex();
+ ASSERT_THAT(BucketId(32, i)).sendsTo(BucketId(16, i), expectedNode);
+ }
+}
+
+void
+OperationTargetResolverTest::testChooseIdealStateWhenManyCopies()
+{
+ setupDistributor(2, 4, "storage:4 distributor:1");
+ addNodesToBucketDB(BucketId(16, 0), "0=0,1=0,2=0,3=0"); // ideal nodes: 1, 3
+ ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(16, 0), 1)
+ .sendsTo(BucketId(16, 0), 3);
+}
+
+void
+OperationTargetResolverTest::testTrustedOverIdealState()
+{
+ setupDistributor(2, 4, "storage:4 distributor:1");
+ addNodesToBucketDB(BucketId(16, 0), "0=0/0/0/t,1=0,2=0/0/0/t,3=0");
+ // ideal nodes: 1, 3
+ ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(16, 0), 0)
+ .sendsTo(BucketId(16, 0), 2);
+}
+
+void
+OperationTargetResolverTest::testChooseHighestSplitBucket()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ // 0, 1 are both in ideal state for both buckets.
+ addNodesToBucketDB(BucketId(16, 0), "0=0,1=0");
+ addNodesToBucketDB(BucketId(17, 0), "0=0,1=0");
+ ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(17, 0), 1)
+ .sendsTo(BucketId(17, 0), 0);
+}
+
+void
+OperationTargetResolverTest::testChooseHighestSplitBucketPerNode()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ addNodesToBucketDB(BucketId(16, 0), "1=0");
+ addNodesToBucketDB(BucketId(17, 0), "0=0");
+ ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(17, 0), 0)
+ .sendsTo(BucketId(16, 0), 1);
+}
+
+void
+OperationTargetResolverTest::testChooseHighestSplitBucketWithTrusted()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ // Unfinished split scenario: split done on 0, not on 1.
+ // Copy on 1 is only remaining for (16, 0), so always trusted.
+ addNodesToBucketDB(BucketId(16, 0), "1=1/2/3/t");
+ addNodesToBucketDB(BucketId(17, 0), "0=2/3/4/t");
+ addNodesToBucketDB(BucketId(17, 1ULL << 16), "0=3/4/5/t");
+ ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(17, 0), 0)
+ .sendsTo(BucketId(16, 0), 1);
+}
+
+void
+OperationTargetResolverTest::testInconsistentBucketsAreNotExplicitlyCreated()
+{
+ setupDistributor(2, 2, "bits:8 storage:2 distributor:1");
+ addNodesToBucketDB(BucketId(15, 0), "1=9/9/9/t");
+ addNodesToBucketDB(BucketId(16, 1 << 15), "0=9/9/9/t");
+ // (32, 0) belongs in (16, 0) subtree, but it does not exist. We cannot
+ // create a bucket on (15, 0) node 0 since that will explicitly introduce
+ // an inconsistent bucket in its local state. Note that we still _send_ to
+ // the inconsistent (15, 0) bucket since it already exists and will be
+ // split out very soon anyway. This is predominantly to avoid making things
+ // even worse than they are and to avoid the edge case in bug 7296087.
+ ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(15, 0), 1)
+ .createsAt(BucketId(16, 0), 0);
+}
+
+void
+OperationTargetResolverTest::testNoTrustedOrIdealStateCopyAvailable()
+{
+ setupDistributor(2, 4, "storage:4 distributor:1");
+ addNodesToBucketDB(BucketId(16, 0), "0=0,2=0");
+ addNodesToBucketDB(BucketId(18, 0), "0=0"); // ideal nodes: 1, 3
+ ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(18, 0), 0)
+ .sendsTo(BucketId(16, 0), 2);
+}
+
+void
+OperationTargetResolverTest::testCreateMissingCopies()
+{
+ setupDistributor(4, 10, "storage:10 distributor:1");
+ addNodesToBucketDB(BucketId(16, 0), "6=0");
+ addNodesToBucketDB(BucketId(18, 0), "4=0"); // ideal nodes: 6, 8, 7, 1
+
+ ASSERT_THAT(BucketId(32, 0)).sendsTo(BucketId(18, 0), 4)
+ .sendsTo(BucketId(16, 0), 6)
+ .createsAt(BucketId(18, 0), 8)
+ .createsAt(BucketId(18, 0), 7);
+}
+
+void
+OperationTargetResolverTest::testNoExistingCopies()
+{
+ setupDistributor(2, 5, "storage:5 distributor:1");
+
+ ASSERT_THAT(BucketId(32, 0)).createsAt(BucketId(16, 0), 1)
+ .createsAt(BucketId(16, 0), 3);
+}
+
+void
+OperationTargetResolverTest::testCountMaintenanceNodesAsDown()
+{
+ setupDistributor(2, 5, "storage:5 .1.s:m distributor:1");
+
+ ASSERT_THAT(BucketId(32, 0)).createsAt(BucketId(16, 0), 3)
+ .createsAt(BucketId(16, 0), 2);
+}
+
+void
+OperationTargetResolverTest::testResolvingDoesNotMutateDatabase()
+{
+ setupDistributor(2, 5, "storage:5 distributor:1");
+
+ ASSERT_THAT(BucketId(32, 0)).createsAt(BucketId(16, 0), 1)
+ .createsAt(BucketId(16, 0), 3);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("NONEXISTING"),
+ dumpBucket(BucketId(0x4000000000000000)));
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/pendingmessagetrackertest.cpp b/storage/src/tests/distributor/pendingmessagetrackertest.cpp
new file mode 100644
index 00000000000..f69525836be
--- /dev/null
+++ b/storage/src/tests/distributor/pendingmessagetrackertest.cpp
@@ -0,0 +1,674 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+
+#include <vespa/document/base/testdocman.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+#include <vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageframework/defaultimplementation/clock/fakeclock.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/vdslib/state/random.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <fstream>
+#include <sstream>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <iterator>
+
+namespace storage {
+namespace distributor {
+
+// Workaround typedef for not (yet) running with --std=c++14 which supports
+// user defined literals. Once we do, replace ms(123) with 123ms.
+using ms = std::chrono::milliseconds;
+
+class PendingMessageTrackerCallback_Test : public CppUnit::TestFixture {
+ CPPUNIT_TEST_SUITE(PendingMessageTrackerCallback_Test);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST(testMultipleMessages);
+ CPPUNIT_TEST(testStartPage);
+ CPPUNIT_TEST(testGetPendingMessageTypes);
+ CPPUNIT_TEST(testHasPendingMessage);
+ CPPUNIT_TEST(testGetAllMessagesForSingleBucket);
+ CPPUNIT_TEST(nodeStatsCanBeOutputStreamed);
+ CPPUNIT_TEST(totalPutLatencyIsInitiallyZero);
+ CPPUNIT_TEST(statsNotAlteredBeforeReplyReceived);
+ CPPUNIT_TEST(totalPutLatencyIsTrackedForSingleRequest);
+ CPPUNIT_TEST(statsAreTrackedSeparatelyPerNode);
+ CPPUNIT_TEST(onlyPutMessagesAreTracked);
+ CPPUNIT_TEST(totalPutLatencyIsAggregatedAcrossRequests);
+ CPPUNIT_TEST(clearingMessagesDoesNotAffectStats);
+ CPPUNIT_TEST(timeTravellingClockLatenciesNotRegistered);
+ CPPUNIT_TEST(statsSnapshotIncludesAllNodes);
+ CPPUNIT_TEST(latencyProviderForwardsToImplementation);
+ CPPUNIT_TEST_SUITE_END();
+
+public:
+ void testSimple();
+ void testMultipleMessages();
+ void testStartPage();
+ void testGetPendingMessageTypes();
+ void testHasPendingMessage();
+ void testGetAllMessagesForSingleBucket();
+ void nodeStatsCanBeOutputStreamed();
+ void totalPutLatencyIsInitiallyZero();
+ void statsNotAlteredBeforeReplyReceived();
+ void totalPutLatencyIsTrackedForSingleRequest();
+ void statsAreTrackedSeparatelyPerNode();
+ void onlyPutMessagesAreTracked();
+ void totalPutLatencyIsAggregatedAcrossRequests();
+ void clearingMessagesDoesNotAffectStats();
+ void timeTravellingClockLatenciesNotRegistered();
+ void statsSnapshotIncludesAllNodes();
+ void latencyProviderForwardsToImplementation();
+
+private:
+ void insertMessages(PendingMessageTracker& tracker);
+
+ OperationStats makeOpStats(std::chrono::milliseconds totalLatency,
+ uint64_t numRequests) const
+ {
+ OperationStats stats;
+ stats.totalLatency = totalLatency;
+ stats.numRequests = numRequests;
+ return stats;
+ }
+};
+
+bool
+operator==(const OperationStats& a, const OperationStats& b)
+{
+ return (a.totalLatency == b.totalLatency
+ && a.numRequests == b.numRequests);
+}
+
+namespace {
+
+class RequestBuilder {
+ uint16_t _toNode;
+ std::chrono::milliseconds _atTime;
+public:
+ RequestBuilder()
+ : _toNode(0),
+ _atTime()
+ {
+ }
+
+ RequestBuilder& atTime(std::chrono::milliseconds t) {
+ _atTime = t;
+ return *this;
+ }
+
+ RequestBuilder& toNode(uint16_t node) {
+ _toNode = node;
+ return *this;
+ }
+
+ uint16_t toNode() const { return _toNode; }
+ std::chrono::milliseconds atTime() const { return _atTime; }
+};
+
+class Fixture
+{
+ StorageComponentRegisterImpl _compReg;
+ framework::defaultimplementation::FakeClock _clock;
+ std::unique_ptr<PendingMessageTracker> _tracker;
+ document::TestDocMan _testDocMan;
+public:
+
+ Fixture()
+ : _compReg(),
+ _clock(),
+ _tracker(),
+ _testDocMan()
+ {
+ _compReg.setClock(_clock);
+ _clock.setAbsoluteTimeInSeconds(1);
+ // Have to set clock in compReg before constructing tracker, or it'll
+ // flip out and die on an explicit nullptr check.
+ _tracker = std::unique_ptr<PendingMessageTracker>(
+ new PendingMessageTracker(_compReg));
+ }
+
+ std::shared_ptr<api::PutCommand> sendPut(const RequestBuilder& builder) {
+ assignMockedTime(builder.atTime());
+ auto put = createPutToNode(builder.toNode());
+ _tracker->insert(put);
+ return put;
+ }
+
+ void sendPutReply(api::PutCommand& putCmd,
+ const RequestBuilder& builder)
+ {
+ assignMockedTime(builder.atTime());
+ auto putReply = putCmd.makeReply();
+ _tracker->reply(*putReply);
+ }
+
+ std::shared_ptr<api::RemoveCommand> sendRemove(
+ const RequestBuilder& builder)
+ {
+ assignMockedTime(builder.atTime());
+ auto remove = createRemoveToNode(builder.toNode());
+ _tracker->insert(remove);
+ return remove;
+ }
+
+ void sendRemoveReply(api::RemoveCommand& removeCmd,
+ const RequestBuilder& builder)
+ {
+ assignMockedTime(builder.atTime());
+ auto removeReply = removeCmd.makeReply();
+ _tracker->reply(*removeReply);
+ }
+
+ void sendPutAndReplyWithLatency(uint16_t node,
+ std::chrono::milliseconds latency)
+ {
+ auto put = sendPut(RequestBuilder().atTime(ms(1000)).toNode(node));
+ sendPutReply(*put, RequestBuilder().atTime(ms(1000) + latency));
+ }
+
+ OperationStats getNodePutOperationStats(uint16_t node) {
+ return _tracker->getNodeStats(node).puts;
+ }
+
+ PendingMessageTracker& tracker() { return *_tracker; }
+
+private:
+ std::string createDummyIdString(const document::BucketId& bucket) const {
+ std::ostringstream id;
+ id << "id:foo:testdoctype1:n=" << bucket.getId() << ":foo";
+ return id.str();
+ }
+
+ document::Document::SP createDummyDocumentForBucket(
+ const document::BucketId& bucket) const
+ {
+ return _testDocMan.createDocument("foobar",
+ createDummyIdString(bucket));
+ }
+
+ api::StorageMessageAddress makeStorageAddress(uint16_t node) const {
+ return {"storage", lib::NodeType::STORAGE, node};
+ }
+
+ std::shared_ptr<api::PutCommand> createPutToNode(uint16_t node) const {
+ document::BucketId bucket(16, 1234);
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket,
+ createDummyDocumentForBucket(bucket),
+ api::Timestamp(123456)));
+ cmd->setAddress(makeStorageAddress(node));
+ return cmd;
+ }
+
+ std::shared_ptr<api::RemoveCommand> createRemoveToNode(
+ uint16_t node) const
+ {
+ document::BucketId bucket(16, 1234);
+ std::shared_ptr<api::RemoveCommand> cmd(
+ new api::RemoveCommand(bucket,
+ document::DocumentId(
+ createDummyIdString(bucket)),
+ api::Timestamp(123456)));
+ cmd->setAddress(makeStorageAddress(node));
+ return cmd;
+ }
+
+ void assignMockedTime(std::chrono::milliseconds time) {
+ _clock.setAbsoluteTimeInMicroSeconds(time.count() * 1000);
+ }
+};
+
+
+}
+
+CPPUNIT_TEST_SUITE_REGISTRATION(PendingMessageTrackerCallback_Test);
+
+void
+PendingMessageTrackerCallback_Test::testSimple()
+{
+ StorageComponentRegisterImpl compReg;
+ framework::defaultimplementation::FakeClock clock;
+ compReg.setClock(clock);
+ clock.setAbsoluteTimeInSeconds(1);
+ PendingMessageTracker tracker(compReg);
+
+ std::shared_ptr<api::RemoveCommand> remove(
+ new api::RemoveCommand(
+ document::BucketId(16, 1234),
+ document::DocumentId("userdoc:footype:1234:foo"), 1001));
+ remove->setAddress(
+ api::StorageMessageAddress("storage", lib::NodeType::STORAGE, 0));
+ tracker.insert(remove);
+
+ {
+ std::ostringstream ost;
+ tracker.reportStatus(ost, framework::HttpUrlPath("/pendingmessages?order=bucket"));
+
+ CPPUNIT_ASSERT_CONTAIN(
+ std::string(
+ "<b>BucketId(0x40000000000004d2)</b>\n"
+ "<ul>\n"
+ "<li><i>Node 0</i>: <b>1970-01-01 00:00:01</b> "
+ "Remove(BucketId(0x40000000000004d2), "
+ "userdoc:footype:1234:foo, timestamp 1001)</li>\n"
+ "</ul>\n"),
+ ost.str());
+ }
+
+ api::RemoveReply reply(*remove);
+ tracker.reply(reply);
+
+ {
+ std::ostringstream ost;
+ tracker.reportStatus(ost, framework::HttpUrlPath("/pendingmessages?order=bucket"));
+
+ CPPUNIT_ASSERT_MSG(ost.str(),
+ ost.str().find("doc:") == std::string::npos);
+ }
+}
+
+void
+PendingMessageTrackerCallback_Test::insertMessages(PendingMessageTracker& tracker)
+{
+ for (uint32_t i = 0; i < 4; i++) {
+ std::ostringstream ost;
+ ost << "userdoc:footype:1234:" << i;
+ std::shared_ptr<api::RemoveCommand> remove(
+ new api::RemoveCommand(
+ document::BucketId(16, 1234),
+ document::DocumentId(ost.str()), 1000 + i));
+ remove->setAddress(
+ api::StorageMessageAddress("storage",
+ lib::NodeType::STORAGE, i % 2));
+ tracker.insert(remove);
+ }
+
+ for (uint32_t i = 0; i < 4; i++) {
+ std::ostringstream ost;
+ ost << "userdoc:footype:4567:" << i;
+ std::shared_ptr<api::RemoveCommand> remove(new api::RemoveCommand(document::BucketId(16, 4567), document::DocumentId(ost.str()), 2000 + i));
+ remove->setAddress(api::StorageMessageAddress("storage", lib::NodeType::STORAGE, i % 2));
+ tracker.insert(remove);
+ }
+}
+
+void
+PendingMessageTrackerCallback_Test::testStartPage()
+{
+ StorageComponentRegisterImpl compReg;
+ framework::defaultimplementation::FakeClock clock;
+ compReg.setClock(clock);
+ PendingMessageTracker tracker(compReg);
+
+ {
+ std::ostringstream ost;
+ tracker.reportStatus(ost, framework::HttpUrlPath("/pendingmessages"));
+
+ CPPUNIT_ASSERT_CONTAIN(
+ std::string(
+ "<h1>Pending messages to storage nodes</h1>\n"
+ "View:\n"
+ "<ul>\n"
+ "<li><a href=\"?order=bucket\">Group by bucket</a></li>"
+ "<li><a href=\"?order=node\">Group by node</a></li>"),
+ ost.str());
+
+ }
+}
+
+void
+PendingMessageTrackerCallback_Test::testMultipleMessages()
+{
+ StorageComponentRegisterImpl compReg;
+ framework::defaultimplementation::FakeClock clock;
+ compReg.setClock(clock);
+ clock.setAbsoluteTimeInSeconds(1);
+ PendingMessageTracker tracker(compReg);
+
+ insertMessages(tracker);
+
+ {
+ std::ostringstream ost;
+ tracker.reportStatus(ost, framework::HttpUrlPath("/pendingmessages?order=bucket"));
+
+ CPPUNIT_ASSERT_CONTAIN(
+ std::string(
+ "<b>BucketId(0x40000000000004d2)</b>\n"
+ "<ul>\n"
+ "<li><i>Node 0</i>: <b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000004d2), userdoc:footype:1234:0, timestamp 1000)</li>\n"
+ "<li><i>Node 0</i>: <b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000004d2), userdoc:footype:1234:2, timestamp 1002)</li>\n"
+ "<li><i>Node 1</i>: <b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000004d2), userdoc:footype:1234:1, timestamp 1001)</li>\n"
+ "<li><i>Node 1</i>: <b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000004d2), userdoc:footype:1234:3, timestamp 1003)</li>\n"
+ "</ul>\n"
+ "<b>BucketId(0x40000000000011d7)</b>\n"
+ "<ul>\n"
+ "<li><i>Node 0</i>: <b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000011d7), userdoc:footype:4567:0, timestamp 2000)</li>\n"
+ "<li><i>Node 0</i>: <b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000011d7), userdoc:footype:4567:2, timestamp 2002)</li>\n"
+ "<li><i>Node 1</i>: <b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000011d7), userdoc:footype:4567:1, timestamp 2001)</li>\n"
+ "<li><i>Node 1</i>: <b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000011d7), userdoc:footype:4567:3, timestamp 2003)</li>\n"
+ "</ul>\n"),
+ ost.str());
+ }
+
+ {
+ std::ostringstream ost;
+ tracker.reportStatus(ost, framework::HttpUrlPath("/pendingmessages?order=node"));
+
+ CPPUNIT_ASSERT_CONTAIN(std::string(
+ "<b>Node 0 (pending count: 4)</b>\n"
+ "<ul>\n"
+ "<li><b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000004d2), userdoc:footype:1234:0, timestamp 1000)</li>\n"
+ "<li><b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000004d2), userdoc:footype:1234:2, timestamp 1002)</li>\n"
+ "<li><b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000011d7), userdoc:footype:4567:0, timestamp 2000)</li>\n"
+ "<li><b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000011d7), userdoc:footype:4567:2, timestamp 2002)</li>\n"
+ "</ul>\n"
+ "<b>Node 1 (pending count: 4)</b>\n"
+ "<ul>\n"
+ "<li><b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000004d2), userdoc:footype:1234:1, timestamp 1001)</li>\n"
+ "<li><b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000004d2), userdoc:footype:1234:3, timestamp 1003)</li>\n"
+ "<li><b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000011d7), userdoc:footype:4567:1, timestamp 2001)</li>\n"
+ "<li><b>1970-01-01 00:00:01</b> Remove(BucketId(0x40000000000011d7), userdoc:footype:4567:3, timestamp 2003)</li>\n"
+ "</ul>\n"
+ ), ost.str());
+ }
+}
+
+namespace {
+
+template <typename T>
+std::string setToString(const std::set<T>& s)
+{
+ std::ostringstream ost;
+ ost << '{';
+ for (typename std::set<T>::const_iterator i(s.begin()), e(s.end());
+ i != e; ++i)
+ {
+ if (i != s.begin()) {
+ ost << ',';
+ }
+ ost << *i;
+ }
+ ost << '}';
+ return ost.str();
+}
+
+}
+
+namespace {
+
+class TestChecker : public PendingMessageTracker::Checker
+{
+public:
+ uint8_t pri;
+
+ TestChecker() : pri(UINT8_MAX) {}
+
+ bool check(uint32_t msgType, uint16_t node, uint8_t p) {
+ (void) node;
+ if (msgType == api::MessageType::REMOVE_ID) {
+ pri = p;
+ return false;
+ }
+
+ return true;
+ }
+};
+
+
+}
+
+void
+PendingMessageTrackerCallback_Test::testGetPendingMessageTypes()
+{
+ StorageComponentRegisterImpl compReg;
+ framework::defaultimplementation::FakeClock clock;
+ compReg.setClock(clock);
+ clock.setAbsoluteTimeInSeconds(1);
+ PendingMessageTracker tracker(compReg);
+ document::BucketId bid(16, 1234);
+
+ std::shared_ptr<api::RemoveCommand> remove(
+ new api::RemoveCommand(
+ bid,
+ document::DocumentId("userdoc:footype:1234:foo"), 1001));
+ remove->setAddress(
+ api::StorageMessageAddress("storage", lib::NodeType::STORAGE, 0));
+ tracker.insert(remove);
+
+ {
+ TestChecker checker;
+ tracker.checkPendingMessages(0, bid, checker);
+ CPPUNIT_ASSERT_EQUAL(127, (int)checker.pri);
+ }
+
+ {
+ TestChecker checker;
+ tracker.checkPendingMessages(0, document::BucketId(16, 1235), checker);
+ CPPUNIT_ASSERT_EQUAL(255, (int)checker.pri);
+ }
+
+ {
+ TestChecker checker;
+ tracker.checkPendingMessages(1, bid, checker);
+ CPPUNIT_ASSERT_EQUAL(255, (int)checker.pri);
+ }
+}
+
+void
+PendingMessageTrackerCallback_Test::testHasPendingMessage()
+{
+ StorageComponentRegisterImpl compReg;
+ framework::defaultimplementation::FakeClock clock;
+ compReg.setClock(clock);
+ clock.setAbsoluteTimeInSeconds(1);
+ PendingMessageTracker tracker(compReg);
+ document::BucketId bid(16, 1234);
+
+ CPPUNIT_ASSERT(!tracker.hasPendingMessage(1, bid, api::MessageType::REMOVE_ID));
+
+ {
+ std::shared_ptr<api::RemoveCommand> remove(
+ new api::RemoveCommand(
+ bid,
+ document::DocumentId("userdoc:footype:1234:foo"), 1001));
+ remove->setAddress(
+ api::StorageMessageAddress("storage", lib::NodeType::STORAGE, 1));
+ tracker.insert(remove);
+ }
+
+ CPPUNIT_ASSERT(tracker.hasPendingMessage(1, bid, api::MessageType::REMOVE_ID));
+ CPPUNIT_ASSERT(!tracker.hasPendingMessage(0, bid, api::MessageType::REMOVE_ID));
+ CPPUNIT_ASSERT(!tracker.hasPendingMessage(2, bid, api::MessageType::REMOVE_ID));
+ CPPUNIT_ASSERT(!tracker.hasPendingMessage(1,
+ document::BucketId(16, 1233),
+ api::MessageType::REMOVE_ID));
+ CPPUNIT_ASSERT(!tracker.hasPendingMessage(1, bid, api::MessageType::DELETEBUCKET_ID));
+}
+
+namespace {
+
+class OperationEnumerator : public PendingMessageTracker::Checker
+{
+ std::ostringstream ss;
+public:
+ bool check(uint32_t msgType, uint16_t node, uint8_t p) override {
+ (void) p;
+ ss << api::MessageType::get(static_cast<api::MessageType::Id>(msgType))
+ .getName()
+ << " -> " << node
+ << "\n";
+
+ return true;
+ }
+
+ std::string str() const { return ss.str(); }
+};
+
+} // anon ns
+
+void
+PendingMessageTrackerCallback_Test::testGetAllMessagesForSingleBucket()
+{
+ StorageComponentRegisterImpl compReg;
+ framework::defaultimplementation::FakeClock clock;
+ compReg.setClock(clock);
+ clock.setAbsoluteTimeInSeconds(1);
+ PendingMessageTracker tracker(compReg);
+
+ insertMessages(tracker);
+
+ {
+ OperationEnumerator enumerator;
+ tracker.checkPendingMessages(document::BucketId(16, 1234), enumerator);
+ CPPUNIT_ASSERT_EQUAL(std::string("Remove -> 0\n"
+ "Remove -> 0\n"
+ "Remove -> 1\n"
+ "Remove -> 1\n"),
+ enumerator.str());
+ }
+ {
+ OperationEnumerator enumerator;
+ tracker.checkPendingMessages(document::BucketId(16, 9876), enumerator);
+ CPPUNIT_ASSERT_EQUAL(std::string(""), enumerator.str());
+ }
+}
+
+void
+PendingMessageTrackerCallback_Test::nodeStatsCanBeOutputStreamed()
+{
+ NodeStats stats;
+ stats.puts = makeOpStats(ms(56789), 10);
+
+ std::ostringstream os;
+ os << stats;
+ std::string expected(
+ "NodeStats(puts=OperationStats("
+ "totalLatency=56789ms, "
+ "numRequests=10))");
+ CPPUNIT_ASSERT_EQUAL(expected, os.str());
+}
+
+void
+PendingMessageTrackerCallback_Test::totalPutLatencyIsInitiallyZero()
+{
+ Fixture fixture;
+ CPPUNIT_ASSERT_EQUAL(makeOpStats(ms(0), 0),
+ fixture.getNodePutOperationStats(0));
+}
+
+void
+PendingMessageTrackerCallback_Test::statsNotAlteredBeforeReplyReceived()
+{
+ Fixture fixture;
+ fixture.sendPut(RequestBuilder().atTime(ms(1000)).toNode(0));
+ CPPUNIT_ASSERT_EQUAL(makeOpStats(ms(0), 0),
+ fixture.getNodePutOperationStats(0));
+}
+
+void
+PendingMessageTrackerCallback_Test::totalPutLatencyIsTrackedForSingleRequest()
+{
+ Fixture fixture;
+ fixture.sendPutAndReplyWithLatency(0, ms(500));
+
+ CPPUNIT_ASSERT_EQUAL(makeOpStats(ms(500), 1),
+ fixture.getNodePutOperationStats(0));
+}
+
+void
+PendingMessageTrackerCallback_Test::statsAreTrackedSeparatelyPerNode()
+{
+ Fixture fixture;
+ fixture.sendPutAndReplyWithLatency(0, ms(500));
+ fixture.sendPutAndReplyWithLatency(1, ms(600));
+
+ CPPUNIT_ASSERT_EQUAL(makeOpStats(ms(500), 1),
+ fixture.getNodePutOperationStats(0));
+ CPPUNIT_ASSERT_EQUAL(makeOpStats(ms(600), 1),
+ fixture.getNodePutOperationStats(1));
+}
+
+// Necessarily, this test will have to be altered when we add tracking of
+// other message types as well.
+void
+PendingMessageTrackerCallback_Test::onlyPutMessagesAreTracked()
+{
+ Fixture fixture;
+ auto remove = fixture.sendRemove(
+ RequestBuilder().atTime(ms(1000)).toNode(0));
+ fixture.sendRemoveReply(*remove, RequestBuilder().atTime(ms(2000)));
+ CPPUNIT_ASSERT_EQUAL(makeOpStats(ms(0), 0),
+ fixture.getNodePutOperationStats(0));
+}
+
+void
+PendingMessageTrackerCallback_Test::totalPutLatencyIsAggregatedAcrossRequests()
+{
+ Fixture fixture;
+ // Model 2 concurrent puts to node 0.
+ fixture.sendPutAndReplyWithLatency(0, ms(500));
+ fixture.sendPutAndReplyWithLatency(0, ms(600));
+ CPPUNIT_ASSERT_EQUAL(makeOpStats(ms(1100), 2),
+ fixture.getNodePutOperationStats(0));
+}
+
+void
+PendingMessageTrackerCallback_Test::clearingMessagesDoesNotAffectStats()
+{
+ Fixture fixture;
+ fixture.sendPutAndReplyWithLatency(2, ms(2000));
+ fixture.tracker().clearMessagesForNode(2);
+ CPPUNIT_ASSERT_EQUAL(makeOpStats(ms(2000), 1),
+ fixture.getNodePutOperationStats(2));
+}
+
+void
+PendingMessageTrackerCallback_Test::timeTravellingClockLatenciesNotRegistered()
+{
+ Fixture fixture;
+ auto put = fixture.sendPut(RequestBuilder().atTime(ms(1000)).toNode(0));
+ fixture.sendPutReply(*put, RequestBuilder().atTime(ms(999)));
+ // Latency increase of zero, but we do count the request itself.
+ CPPUNIT_ASSERT_EQUAL(makeOpStats(ms(0), 1),
+ fixture.getNodePutOperationStats(0));
+}
+
+void
+PendingMessageTrackerCallback_Test::statsSnapshotIncludesAllNodes()
+{
+ Fixture fixture;
+ fixture.sendPutAndReplyWithLatency(0, ms(500));
+ fixture.sendPutAndReplyWithLatency(1, ms(600));
+
+ NodeStatsSnapshot snapshot = fixture.tracker().getLatencyStatistics();
+
+ CPPUNIT_ASSERT_EQUAL(size_t(2), snapshot.nodeToStats.size());
+ CPPUNIT_ASSERT_EQUAL(makeOpStats(ms(500), 1),
+ snapshot.nodeToStats[0].puts);
+ CPPUNIT_ASSERT_EQUAL(makeOpStats(ms(600), 1),
+ snapshot.nodeToStats[1].puts);
+}
+
+void
+PendingMessageTrackerCallback_Test::latencyProviderForwardsToImplementation()
+{
+ Fixture fixture;
+ fixture.sendPutAndReplyWithLatency(0, ms(500));
+
+ LatencyStatisticsProvider& provider(
+ fixture.tracker().getLatencyStatisticsProvider());
+ NodeStatsSnapshot snapshot = provider.getLatencyStatistics();
+
+ CPPUNIT_ASSERT_EQUAL(size_t(1), snapshot.nodeToStats.size());
+ CPPUNIT_ASSERT_EQUAL(makeOpStats(ms(500), 1),
+ snapshot.nodeToStats[0].puts);
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/putoperationtest.cpp b/storage/src/tests/distributor/putoperationtest.cpp
new file mode 100644
index 00000000000..011b34cd1e3
--- /dev/null
+++ b/storage/src/tests/distributor/putoperationtest.cpp
@@ -0,0 +1,704 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+
+#include <vespa/config/helper/configgetter.h>
+#include <vespa/document/config/config-documenttypes.h>
+#include <vespa/document/repo/documenttyperepo.h>
+#include <vespa/storage/distributor/operations/external/putoperation.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/state.h>
+#include <tests/distributor/distributortestutil.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/vespalib/text/stringtokenizer.h>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+
+using std::shared_ptr;
+using config::ConfigGetter;
+using document::DocumenttypesConfig;
+using config::FileSpec;
+using vespalib::string;
+using namespace document;
+using namespace storage;
+using namespace storage::api;
+using namespace storage::lib;
+using namespace std::literals::string_literals;
+
+namespace storage {
+
+namespace distributor {
+
+class PutOperationTest : public CppUnit::TestFixture,
+ public DistributorTestUtil {
+ CPPUNIT_TEST_SUITE(PutOperationTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST(testBucketDatabaseGetsSpecialEntryWhenCreateBucketSent);
+ CPPUNIT_TEST(testSendInlineSplitBeforePutIfBucketTooLarge);
+ CPPUNIT_TEST(testDoNotSendInlineSplitIfNotConfigured);
+ CPPUNIT_TEST(testNodeRemovedOnReply);
+ CPPUNIT_TEST(testDoNotSendCreateBucketIfAlreadyPending);
+ CPPUNIT_TEST(testMultipleCopies);
+ CPPUNIT_TEST(testMultipleCopiesEarlyReturnPrimaryNotRequired);
+ CPPUNIT_TEST(testMultipleCopiesEarlyReturnPrimaryRequired);
+ CPPUNIT_TEST(testMultipleCopiesEarlyReturnPrimaryRequiredNotDone);
+ CPPUNIT_TEST_IGNORED(testDoNotRevertOnFailureAfterEarlyReturn);
+ CPPUNIT_TEST(testStorageFailed);
+ CPPUNIT_TEST(testRevertSuccessfulCopiesWhenOneFails);
+ CPPUNIT_TEST(testNoRevertIfRevertDisabled);
+ CPPUNIT_TEST(testNoStorageNodes);
+ CPPUNIT_TEST(testUpdateCorrectBucketOnRemappedPut);
+ CPPUNIT_TEST(testTargetNodes);
+ CPPUNIT_TEST(testDoNotResurrectDownedNodesInBucketDB);
+ CPPUNIT_TEST(sendToRetiredNodesIfNoUpNodesAvailable);
+ CPPUNIT_TEST(replicaImplicitlyActivatedWhenActivationIsNotDisabled);
+ CPPUNIT_TEST(replicaNotImplicitlyActivatedWhenActivationIsDisabled);
+ CPPUNIT_TEST_SUITE_END();
+
+ DocumentTypeRepo::SP _repo;
+ const DocumentType* _html_type;
+ std::unique_ptr<Operation> op;
+
+protected:
+ void testSimple();
+ void testBucketDatabaseGetsSpecialEntryWhenCreateBucketSent();
+ void testSendInlineSplitBeforePutIfBucketTooLarge();
+ void testDoNotSendInlineSplitIfNotConfigured();
+ void testNodeRemovedOnReply();
+ void testDoNotSendCreateBucketIfAlreadyPending();
+ void testStorageFailed();
+ void testNoReply();
+ void testMultipleCopies();
+ void testRevertSuccessfulCopiesWhenOneFails();
+ void testNoRevertIfRevertDisabled();
+ void testInconsistentChecksum();
+ void testNoStorageNodes();
+ void testMultipleCopiesEarlyReturnPrimaryNotRequired();
+ void testMultipleCopiesEarlyReturnPrimaryRequired();
+ void testMultipleCopiesEarlyReturnPrimaryRequiredNotDone();
+ void testDoNotRevertOnFailureAfterEarlyReturn();
+ void testUpdateCorrectBucketOnRemappedPut();
+ void testBucketNotFound();
+ void testTargetNodes();
+ void testDoNotResurrectDownedNodesInBucketDB();
+ void sendToRetiredNodesIfNoUpNodesAvailable();
+ void replicaImplicitlyActivatedWhenActivationIsNotDisabled();
+ void replicaNotImplicitlyActivatedWhenActivationIsDisabled();
+
+ void doTestCreationWithBucketActivationDisabled(bool disabled);
+
+public:
+ void setUp() {
+ _repo.reset(
+ new DocumentTypeRepo(*ConfigGetter<DocumenttypesConfig>
+ ::getConfig("config-doctypes", FileSpec("config-doctypes.cfg"))));
+ _html_type = _repo->getDocumentType("text/html");
+ createLinks();
+ };
+
+ void tearDown() {
+ close();
+ }
+
+ document::BucketId createAndSendSampleDocument(uint32_t timeout);
+ std::string getNodes(const std::string& infoString);
+
+ void sendReply(int idx = -1,
+ api::ReturnCode::Result result
+ = api::ReturnCode::OK,
+ api::BucketInfo info = api::BucketInfo(1,2,3,4,5))
+ {
+ CPPUNIT_ASSERT(!_sender.commands.empty());
+ if (idx == -1) {
+ idx = _sender.commands.size() - 1;
+ } else if (static_cast<size_t>(idx) >= _sender.commands.size()) {
+ throw std::logic_error("Specified message index is greater "
+ "than number of received messages");
+ }
+
+ std::shared_ptr<api::StorageCommand> msg = _sender.commands[idx];
+ api::StorageReply::SP reply(msg->makeReply().release());
+ dynamic_cast<api::BucketInfoReply*>(reply.get())->setBucketInfo(info);
+ reply->setResult(result);
+
+ op->receive(_sender, reply);
+ }
+
+ void sendPut(std::shared_ptr<api::PutCommand> msg) {
+ op.reset(new PutOperation(getExternalOperationHandler(),
+ msg,
+ getDistributor().getMetrics().
+ puts[msg->getLoadType()]));
+ op->start(_sender, framework::MilliSecTime(0));
+ }
+
+ Document::SP createDummyDocument(const char* ns,
+ const char* id) const
+ {
+ return Document::SP(
+ new Document(*_html_type,
+ DocumentId(DocIdString(ns, id))));
+
+ }
+
+ std::shared_ptr<api::PutCommand> createPut(
+ const Document::SP doc) const
+ {
+ return std::shared_ptr<api::PutCommand>(
+ new api::PutCommand(document::BucketId(0), doc, 100));
+ }
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(PutOperationTest);
+
+document::BucketId
+PutOperationTest::createAndSendSampleDocument(uint32_t timeout) {
+ Document::SP
+ doc(new Document(*_html_type,
+ DocumentId(DocIdString("test", "test"))));
+
+ document::BucketId id = getExternalOperationHandler().getBucketId(doc->getId());
+ addIdealNodes(id);
+
+ std::shared_ptr<api::PutCommand> msg(
+ new api::PutCommand(document::BucketId(0),
+ doc,
+ 0));
+ msg->setTimestamp(100);
+ msg->setPriority(128);
+ msg->setTimeout(timeout);
+ sendPut(msg);
+ return id;
+}
+
+namespace {
+
+typedef int Redundancy;
+typedef int NodeCount;
+typedef uint32_t ReturnAfter;
+typedef bool RequirePrimaryWritten;
+
+}
+
+void
+PutOperationTest::testSimple()
+{
+ setupDistributor(1, 1, "storage:1 distributor:1");
+ createAndSendSampleDocument(180);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Put(BucketId(0x4000000000008b13), "
+ "doc:test:test, timestamp 100, size 33) => 0"),
+ _sender.getCommands(true, true));
+
+ sendReply();
+
+ CPPUNIT_ASSERT_EQUAL(std::string("PutReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 100) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+PutOperationTest::testBucketDatabaseGetsSpecialEntryWhenCreateBucketSent()
+{
+ setupDistributor(2, 1, "storage:1 distributor:1");
+
+ Document::SP doc(createDummyDocument("test", "test"));
+ document::BucketId bucketId(getExternalOperationHandler().getBucketId(doc->getId()));
+ sendPut(createPut(doc));
+
+ // Database updated before CreateBucket is sent
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000008b13) : "
+ "node(idx=0,crc=0x1,docs=0/0,bytes=0/0,trusted=true,active=true)"),
+ dumpBucket(getExternalOperationHandler().getBucketId(doc->getId())));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Create bucket => 0,Put => 0"),
+ _sender.getCommands(true));
+}
+
+void
+PutOperationTest::testSendInlineSplitBeforePutIfBucketTooLarge()
+{
+ setupDistributor(1, 1, "storage:1 distributor:1");
+ getConfig().setSplitCount(1024);
+ getConfig().setSplitSize(1000000);
+
+ addNodesToBucketDB(document::BucketId(0x4000000000002a52), "0=10000/10000/10000/t");
+
+ sendPut(createPut(createDummyDocument("test", "uri")));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("SplitBucketCommand(BucketId(0x4000000000002a52)Max doc count: "
+ "1024, Max total doc size: 1000000) Reasons to start: "
+ "[Splitting bucket because its maximum size (10000 b, 10000 docs, 10000 meta, 10000 b total) is "
+ "higher than the configured limit of (1000000, 1024)] => 0,"
+ "Put(BucketId(0x4000000000002a52), doc:test:uri, timestamp 100, "
+ "size 32) => 0"),
+ _sender.getCommands(true, true));
+}
+
+void
+PutOperationTest::testDoNotSendInlineSplitIfNotConfigured()
+{
+ setupDistributor(1, 1, "storage:1 distributor:1");
+ getConfig().setSplitCount(1024);
+ getConfig().setDoInlineSplit(false);
+
+ addNodesToBucketDB(document::BucketId(0x4000000000002a52), "0=10000/10000/10000/t");
+
+ sendPut(createPut(createDummyDocument("test", "uri")));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "Put(BucketId(0x4000000000002a52), doc:test:uri, timestamp 100, "
+ "size 32) => 0"),
+ _sender.getCommands(true, true));
+}
+
+void
+PutOperationTest::testNodeRemovedOnReply()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ createAndSendSampleDocument(180);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Put(BucketId(0x4000000000008b13), "
+ "doc:test:test, timestamp 100, size 33) => 1,"
+ "Put(BucketId(0x4000000000008b13), "
+ "doc:test:test, timestamp 100, size 33) => 0"),
+ _sender.getCommands(true, true));
+
+ getExternalOperationHandler().removeNodeFromDB(document::BucketId(16, 0x8b13), 0);
+
+ sendReply(0);
+ sendReply(1);
+
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "PutReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 100) ReturnCode(BUCKET_DELETED, "
+ "BucketId(0x4000000000008b13) was deleted from nodes [0] "
+ "after message was sent but before it was done. "
+ "Sent to [1,0])"),
+ _sender.getLastReply());
+}
+
+void
+PutOperationTest::testStorageFailed()
+{
+ setupDistributor(2, 1, "storage:1 distributor:1");
+
+ createAndSendSampleDocument(180);
+
+ sendReply(-1, api::ReturnCode::INTERNAL_FAILURE);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("PutReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 100) ReturnCode(INTERNAL_FAILURE)"),
+ _sender.getLastReply(true));
+}
+
+void
+PutOperationTest::testMultipleCopies()
+{
+ setupDistributor(3, 4, "storage:4 distributor:1");
+
+ Document::SP doc(createDummyDocument("test", "test"));
+ sendPut(createPut(doc));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Create bucket => 3,Create bucket => 1,"
+ "Create bucket => 0,Put => 3,Put => 1,Put => 0"),
+ _sender.getCommands(true));
+
+ for (uint32_t i = 0; i < 6; i++) {
+ sendReply(i);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("PutReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 100) ReturnCode(NONE)"),
+ _sender.getLastReply(true));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000008b13) : "
+ "node(idx=3,crc=0x1,docs=2/4,bytes=3/5,trusted=true,active=false), "
+ "node(idx=1,crc=0x1,docs=2/4,bytes=3/5,trusted=true,active=false), "
+ "node(idx=0,crc=0x1,docs=2/4,bytes=3/5,trusted=true,active=false)"),
+ dumpBucket(getExternalOperationHandler().getBucketId(doc->getId())));
+}
+
+
+void
+PutOperationTest::testMultipleCopiesEarlyReturnPrimaryRequired()
+{
+ setupDistributor(3, 4, "storage:4 distributor:1", 2, true);
+
+ sendPut(createPut(createDummyDocument("test", "test")));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Create bucket => 3,Create bucket => 1,"
+ "Create bucket => 0,Put => 3,Put => 1,Put => 0"),
+ _sender.getCommands(true));
+
+ // Reply to 2 CreateBucket, including primary
+ for (uint32_t i = 0; i < 2; i++) {
+ sendReply(i);
+ }
+ // Reply to 2 puts, including primary
+ for (uint32_t i = 0; i < 2; i++) {
+ sendReply(3 + i);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "PutReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 100) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+PutOperationTest::testMultipleCopiesEarlyReturnPrimaryNotRequired()
+{
+ setupDistributor(3, 4, "storage:4 distributor:1", 2, false);
+
+ sendPut(createPut(createDummyDocument("test", "test")));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Create bucket => 3,Create bucket => 1,"
+ "Create bucket => 0,Put => 3,Put => 1,Put => 0"),
+ _sender.getCommands(true));
+
+ // Reply only to 2 nodes (but not the primary)
+ for (uint32_t i = 1; i < 3; i++) {
+ sendReply(i); // CreateBucket
+ }
+ for (uint32_t i = 1; i < 3; i++) {
+ sendReply(3 + i); // Put
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("PutReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 100) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+PutOperationTest::testMultipleCopiesEarlyReturnPrimaryRequiredNotDone()
+{
+ setupDistributor(3, 4, "storage:4 distributor:1", 2, true);
+
+ sendPut(createPut(createDummyDocument("test", "test")));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Create bucket => 3,Create bucket => 1,"
+ "Create bucket => 0,Put => 3,Put => 1,Put => 0"),
+ _sender.getCommands(true));
+
+ // Reply only to 2 nodes (but not the primary)
+ sendReply(1);
+ sendReply(2);
+ sendReply(4);
+ sendReply(5);
+
+ CPPUNIT_ASSERT_EQUAL(0, (int)_sender.replies.size());
+}
+
+void
+PutOperationTest::testDoNotRevertOnFailureAfterEarlyReturn()
+{
+ setupDistributor(Redundancy(3),NodeCount(4), "storage:4 distributor:1",
+ ReturnAfter(2), RequirePrimaryWritten(false));
+
+ sendPut(createPut(createDummyDocument("test", "test")));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Create bucket => 3,Create bucket => 1,"
+ "Create bucket => 0,Put => 3,Put => 1,Put => 0"),
+ _sender.getCommands(true));
+
+ for (uint32_t i = 0; i < 3; i++) {
+ sendReply(i); // CreateBucket
+ }
+ for (uint32_t i = 0; i < 2; i++) {
+ sendReply(3 + i); // Put
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "PutReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 100) ReturnCode(NONE)"),
+ _sender.getLastReply());
+
+ sendReply(5, api::ReturnCode::INTERNAL_FAILURE);
+ // Should not be any revert commands sent
+ CPPUNIT_ASSERT_EQUAL(std::string("Create bucket => 3,Create bucket => 1,"
+ "Create bucket => 0,Put => 3,Put => 1,Put => 0"),
+ _sender.getCommands(true));
+}
+
+void
+PutOperationTest::testRevertSuccessfulCopiesWhenOneFails()
+{
+ setupDistributor(3, 4, "storage:4 distributor:1");
+
+ createAndSendSampleDocument(180);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Put => 3,Put => 1,Put => 0"),
+ _sender.getCommands(true));
+
+ for (uint32_t i = 0; i < 2; i++) {
+ sendReply(i);
+ }
+
+ sendReply(2, api::ReturnCode::INTERNAL_FAILURE);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("PutReply(doc:test:test, "
+ "BucketId(0x0000000000000000), timestamp 100) "
+ "ReturnCode(INTERNAL_FAILURE)"),
+ _sender.getLastReply(true));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Revert => 3,Revert => 1"),
+ _sender.getCommands(true, false, 3));
+}
+
+void
+PutOperationTest::testNoRevertIfRevertDisabled()
+{
+ close();
+ getDirConfig().getConfig("stor-distributormanager")
+ .set("enable_revert", "false");
+ setUp();
+ setupDistributor(3, 4, "storage:4 distributor:1");
+
+ createAndSendSampleDocument(180);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Put => 3,Put => 1,Put => 0"),
+ _sender.getCommands(true));
+
+ for (uint32_t i = 0; i < 2; i++) {
+ sendReply(i);
+ }
+
+ sendReply(2, api::ReturnCode::INTERNAL_FAILURE);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("PutReply(doc:test:test, "
+ "BucketId(0x0000000000000000), timestamp 100) "
+ "ReturnCode(INTERNAL_FAILURE)"),
+ _sender.getLastReply(true));
+
+ CPPUNIT_ASSERT_EQUAL(std::string(""),
+ _sender.getCommands(true, false, 3));
+}
+
+void
+PutOperationTest::testDoNotSendCreateBucketIfAlreadyPending()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+
+ Document::SP doc(createDummyDocument("test", "uri"));
+ sendPut(createPut(doc));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Create bucket => 1,Create bucket => 0,"
+ "Put => 1,Put => 0"),
+ _sender.getCommands(true));
+
+ // Manually shove sent messages into pending message tracker, since
+ // this isn't done automatically.
+ for (size_t i = 0; i < _sender.commands.size(); ++i) {
+ getExternalOperationHandler().getDistributor().getPendingMessageTracker()
+ .insert(_sender.commands[i]);
+ }
+
+ sendPut(createPut(doc));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Create bucket => 1,Create bucket => 0,"
+ "Put => 1,Put => 0,"
+ "Put => 1,Put => 0"),
+ _sender.getCommands(true));
+}
+
+void
+PutOperationTest::testNoStorageNodes()
+{
+ setupDistributor(2, 1, "storage:0 distributor:1");
+ createAndSendSampleDocument(180);
+ CPPUNIT_ASSERT_EQUAL(std::string("PutReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 100) ReturnCode(NOT_CONNECTED, "
+ "Can't store document: No storage nodes available)"),
+ _sender.getLastReply(true));
+}
+
+void
+PutOperationTest::testUpdateCorrectBucketOnRemappedPut()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+
+ Document::SP doc(new Document(*_html_type, DocumentId(
+ UserDocIdString("userdoc:test:13:uri"))));
+
+ addNodesToBucketDB(document::BucketId(16,13), "0=0,1=0");
+
+ sendPut(createPut(doc));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Put => 0,Put => 1"),
+ _sender.getCommands(true));
+
+ {
+ std::shared_ptr<api::StorageCommand> msg2 = _sender.commands[0];
+ std::shared_ptr<api::StorageReply> reply(msg2->makeReply().release());
+ PutReply* sreply = (PutReply*)reply.get();
+ sreply->remapBucketId(document::BucketId(17, 13));
+ sreply->setBucketInfo(api::BucketInfo(1,2,3,4,5));
+ op->receive(_sender, reply);
+ }
+
+ sendReply(1);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("PutReply(userdoc:test:13:uri, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 100) ReturnCode(NONE)"),
+ _sender.getLastReply());
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x440000000000000d) : "
+ "node(idx=0,crc=0x1,docs=2/4,bytes=3/5,trusted=true,active=false)"),
+ dumpBucket(document::BucketId(17, 13)));
+}
+
+BucketInfo
+parseBucketInfoString(const std::string& nodeList) {
+ vespalib::StringTokenizer tokenizer(nodeList, ",");
+
+ BucketInfo entry;
+ for (uint32_t i = 0; i < tokenizer.size(); i++) {
+ vespalib::StringTokenizer tokenizer2(tokenizer[i], "-");
+ int node = atoi(tokenizer2[0].c_str());
+ int size = atoi(tokenizer2[1].c_str());
+ bool trusted = (tokenizer2[2] == "true");
+
+ entry.addNode(BucketCopy(0,
+ node,
+ api::BucketInfo(size, size * 1000, size * 2000))
+ .setTrusted(trusted),
+ toVector<uint16_t>(0));
+ }
+
+ return entry;
+}
+
+std::string
+PutOperationTest::getNodes(const std::string& infoString) {
+ Document::SP doc(createDummyDocument("test", "uri"));
+ document::BucketId bid(getExternalOperationHandler().getBucketId(doc->getId()));
+
+ BucketInfo entry = parseBucketInfoString(infoString);
+
+ std::ostringstream ost;
+
+ std::vector<uint16_t> targetNodes;
+ std::vector<uint16_t> createNodes;
+ PutOperation::getTargetNodes(getExternalOperationHandler().getIdealNodes(bid),
+ targetNodes, createNodes, entry, 2);
+
+ ost << "target( ";
+ for (uint32_t i = 0; i < targetNodes.size(); i++) {
+ ost << targetNodes[i] << " ";
+ }
+ ost << ") create( ";
+ for (uint32_t i = 0; i < createNodes.size(); i++) {
+ ost << createNodes[i] << " ";
+ }
+ ost << ")";
+
+ return ost.str();
+}
+
+void
+PutOperationTest::testTargetNodes()
+{
+ setupDistributor(2, 6, "storage:6 distributor:1");
+
+ // Ideal state of bucket is 1,3.
+ CPPUNIT_ASSERT_EQUAL(std::string("target( 1 3 ) create( 1 3 )"), getNodes(""));
+ CPPUNIT_ASSERT_EQUAL(std::string("target( 1 3 ) create( 3 )"), getNodes("1-1-true"));
+ CPPUNIT_ASSERT_EQUAL(std::string("target( 1 3 ) create( 3 )"), getNodes("1-1-false"));
+ CPPUNIT_ASSERT_EQUAL(std::string("target( 3 4 5 ) create( )"), getNodes("3-1-true,4-1-true,5-1-true"));
+ CPPUNIT_ASSERT_EQUAL(std::string("target( 3 4 ) create( )"), getNodes("3-2-true,4-2-true,5-1-false"));
+ CPPUNIT_ASSERT_EQUAL(std::string("target( 1 3 4 ) create( )"), getNodes("3-2-true,4-2-true,1-1-false"));
+ CPPUNIT_ASSERT_EQUAL(std::string("target( 4 5 ) create( )"), getNodes("4-2-false,5-1-false"));
+ CPPUNIT_ASSERT_EQUAL(std::string("target( 1 4 ) create( 1 )"), getNodes("4-1-true"));
+}
+
+void
+PutOperationTest::testDoNotResurrectDownedNodesInBucketDB()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+
+ Document::SP doc(createDummyDocument("test", "uri"));
+ document::BucketId bId = getExternalOperationHandler().getBucketId(doc->getId());
+
+ addNodesToBucketDB(bId, "0=1/2/3/t,1=1/2/3/t");
+
+ sendPut(createPut(doc));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Put => 1,Put => 0"),
+ _sender.getCommands(true));
+
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:2 .1.s:d"));
+ addNodesToBucketDB(bId, "0=1/2/3/t"); // This will actually remove node #1.
+
+ sendReply(0, api::ReturnCode::OK, api::BucketInfo(9,9,9));
+ sendReply(1, api::ReturnCode::OK, api::BucketInfo(5,6,7));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000002a52) : "
+ "node(idx=0,crc=0x5,docs=6/6,bytes=7/7,trusted=true,active=false)"),
+ dumpBucket(getExternalOperationHandler().getBucketId(doc->getId())));
+}
+
+void
+PutOperationTest::sendToRetiredNodesIfNoUpNodesAvailable()
+{
+ setupDistributor(Redundancy(2), NodeCount(2),
+ "distributor:1 storage:2 .0.s:r .1.s:r");
+ Document::SP doc(createDummyDocument("test", "uri"));
+ document::BucketId bucket(
+ getExternalOperationHandler().getBucketId(doc->getId()));
+ addNodesToBucketDB(bucket, "0=1/2/3/t,1=1/2/3/t");
+
+ sendPut(createPut(doc));
+
+ CPPUNIT_ASSERT_EQUAL("Put => 0,Put => 1"s,
+ _sender.getCommands(true));
+}
+
+void
+PutOperationTest::doTestCreationWithBucketActivationDisabled(bool disabled)
+{
+ setupDistributor(Redundancy(2), NodeCount(2), "distributor:1 storage:1");
+ disableBucketActivationInConfig(disabled);
+
+ Document::SP doc(createDummyDocument("test", "uri"));
+ sendPut(createPut(doc));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Create bucket => 0,Put => 0"),
+ _sender.getCommands(true));
+ auto cmd = _sender.commands[0];
+ auto createCmd = std::dynamic_pointer_cast<api::CreateBucketCommand>(cmd);
+ CPPUNIT_ASSERT(createCmd.get() != nullptr);
+ // There's only 1 content node, so if activation were not disabled, it
+ // should always be activated.
+ CPPUNIT_ASSERT_EQUAL(!disabled, createCmd->getActive());
+}
+
+void
+PutOperationTest::replicaImplicitlyActivatedWhenActivationIsNotDisabled()
+{
+ doTestCreationWithBucketActivationDisabled(false);
+}
+
+void
+PutOperationTest::replicaNotImplicitlyActivatedWhenActivationIsDisabled()
+{
+ doTestCreationWithBucketActivationDisabled(true);
+}
+
+}
+
+}
diff --git a/storage/src/tests/distributor/removebucketoperationtest.cpp b/storage/src/tests/distributor/removebucketoperationtest.cpp
new file mode 100644
index 00000000000..aeceefa15a0
--- /dev/null
+++ b/storage/src/tests/distributor/removebucketoperationtest.cpp
@@ -0,0 +1,150 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storage/distributor/operations/idealstate/removebucketoperation.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+#include <vespa/storage/storageutil/utils.h>
+#include <tests/distributor/distributortestutil.h>
+
+namespace storage {
+namespace distributor {
+
+class RemoveBucketOperationTest : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+ CPPUNIT_TEST_SUITE(RemoveBucketOperationTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST(testBucketInfoMismatchFailure);
+ CPPUNIT_TEST(testFailWithInvalidBucketInfo);
+ CPPUNIT_TEST_SUITE_END();
+
+protected:
+ void testSimple();
+ void testBucketInfoMismatchFailure();
+ void testFailWithInvalidBucketInfo();
+
+public:
+ void setUp() {
+ createLinks();
+ };
+
+ void tearDown() {
+ close();
+ }
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(RemoveBucketOperationTest);
+
+void
+RemoveBucketOperationTest::testSimple()
+{
+ addNodesToBucketDB(document::BucketId(16, 1),
+ "0=10/100/1/t,"
+ "1=10/100/1/t,"
+ "2=10/100/1/t");
+ setRedundancy(1);
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:3"));
+
+ RemoveBucketOperation op("storage",
+ BucketAndNodes(document::BucketId(16, 1),
+ toVector<uint16_t>(1,2)));
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Delete bucket => 1,"
+ "Delete bucket => 2"),
+ _sender.getCommands(true));
+
+ sendReply(op, 0);
+ sendReply(op, 1);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "BucketId(0x4000000000000001) : "
+ "node(idx=0,crc=0xa,docs=100/100,bytes=1/1,trusted=true,active=false)"),
+ dumpBucket(document::BucketId(16, 1)));
+}
+
+/**
+ * Test that receiving a DeleteBucket failure from a storage node that sends
+ * back actual bucket info reinserts that bucket info into the distributor
+ * bucket database.
+ */
+void
+RemoveBucketOperationTest::testBucketInfoMismatchFailure()
+{
+ addNodesToBucketDB(document::BucketId(16, 1), "1=0/0/0/t");
+
+ getComponentRegisterImpl().setDistribution(std::shared_ptr<lib::Distribution>(
+ new lib::Distribution(
+ lib::Distribution::getDefaultDistributionConfig(1, 10))));
+
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:2"));
+
+ RemoveBucketOperation op("storage",
+ BucketAndNodes(document::BucketId(16, 1),
+ toVector<uint16_t>(1)));
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Delete bucket => 1"),
+ _sender.getCommands(true));
+
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, _sender.commands.size());
+ std::shared_ptr<api::StorageCommand> msg2 = _sender.commands[0];
+ std::shared_ptr<api::StorageReply> reply(msg2->makeReply().release());
+ dynamic_cast<api::DeleteBucketReply&>(*reply).setBucketInfo(
+ api::BucketInfo(10, 100, 1));
+ reply->setResult(api::ReturnCode::REJECTED);
+ op.receive(_sender, reply);
+
+ // RemoveBucketOperation should reinsert bucketinfo into database
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "BucketId(0x4000000000000001) : "
+ "node(idx=1,crc=0xa,docs=100/100,bytes=1/1,trusted=true,active=false)"),
+ dumpBucket(document::BucketId(16, 1)));
+}
+
+/**
+ * Test that receiving a DeleteBucket failure from a storage node that does
+ * not include valid BucketInfo in its reply does not reinsert the bucket
+ * into the distributor.
+ */
+void
+RemoveBucketOperationTest::testFailWithInvalidBucketInfo()
+{
+ addNodesToBucketDB(document::BucketId(16, 1), "1=0/0/0/t");
+
+ getComponentRegisterImpl().setDistribution(std::shared_ptr<lib::Distribution>(
+ new lib::Distribution(
+ lib::Distribution::getDefaultDistributionConfig(1, 10))));
+
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:2"));
+
+ RemoveBucketOperation op("storage",
+ BucketAndNodes(document::BucketId(16, 1),
+ toVector<uint16_t>(1)));
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Delete bucket => 1"),
+ _sender.getCommands(true));
+
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, _sender.commands.size());
+ std::shared_ptr<api::StorageCommand> msg2 = _sender.commands[0];
+ std::shared_ptr<api::StorageReply> reply(msg2->makeReply().release());
+ reply->setResult(api::ReturnCode::ABORTED);
+ op.receive(_sender, reply);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("NONEXISTING"),
+ dumpBucket(document::BucketId(16, 1)));
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/removelocationtest.cpp b/storage/src/tests/distributor/removelocationtest.cpp
new file mode 100644
index 00000000000..7a1bba86303
--- /dev/null
+++ b/storage/src/tests/distributor/removelocationtest.cpp
@@ -0,0 +1,84 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storageapi/message/removelocation.h>
+#include <vespa/storage/distributor/operations/external/removelocationoperation.h>
+#include <tests/distributor/distributortestutil.h>
+
+namespace storage {
+namespace distributor {
+
+class RemoveLocationOperationTest : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+ CPPUNIT_TEST_SUITE(RemoveLocationOperationTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST_SUITE_END();
+
+protected:
+ void testSimple();
+
+public:
+ std::unique_ptr<RemoveLocationOperation> op;
+
+ void setUp() {
+ createLinks();
+ };
+
+ void tearDown() {
+ close();
+ }
+
+ void sendRemoveLocation(const std::string& selection) {
+ std::shared_ptr<api::RemoveLocationCommand> msg(
+ new api::RemoveLocationCommand(selection, document::BucketId(0)));
+
+ op.reset(new RemoveLocationOperation(getExternalOperationHandler(),
+ msg,
+ getDistributor().getMetrics().
+ removelocations[msg->getLoadType()]));
+
+ op->start(_sender, framework::MilliSecTime(0));
+ }
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(RemoveLocationOperationTest);
+
+void
+RemoveLocationOperationTest::testSimple()
+{
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:3"));
+
+ addNodesToBucketDB(document::BucketId(34, 0x000001234), "0=1,1=1");
+ addNodesToBucketDB(document::BucketId(34, 0x100001234), "0=1,2=1");
+ addNodesToBucketDB(document::BucketId(34, 0x200001234), "0=1,2=1");
+ addNodesToBucketDB(document::BucketId(34, 0x300001234), "1=1,2=1");
+
+ sendRemoveLocation("id.user=4660");
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Remove selection(id.user=4660): BucketInfoCommand() => 0,"
+ "Remove selection(id.user=4660): BucketInfoCommand() => 1,"
+ "Remove selection(id.user=4660): BucketInfoCommand() => 0,"
+ "Remove selection(id.user=4660): BucketInfoCommand() => 2,"
+ "Remove selection(id.user=4660): BucketInfoCommand() => 0,"
+ "Remove selection(id.user=4660): BucketInfoCommand() => 2,"
+ "Remove selection(id.user=4660): BucketInfoCommand() => 1,"
+ "Remove selection(id.user=4660): BucketInfoCommand() => 2"),
+ _sender.getCommands(true, true));
+
+ for (uint32_t i = 0; i < 8; ++i) {
+ sendReply(*op, i);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketInfoReply(BucketInfo(invalid)) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/removeoperationtest.cpp b/storage/src/tests/distributor/removeoperationtest.cpp
new file mode 100644
index 00000000000..7907541a7c7
--- /dev/null
+++ b/storage/src/tests/distributor/removeoperationtest.cpp
@@ -0,0 +1,203 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storage/distributor/externaloperationhandler.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <tests/distributor/distributortestutil.h>
+#include <vespa/storage/distributor/operations/external/removeoperation.h>
+
+namespace storage {
+namespace distributor {
+
+class RemoveOperationTest : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+ CPPUNIT_TEST_SUITE(RemoveOperationTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST(testNotFound);
+ CPPUNIT_TEST(testStorageFailure);
+ CPPUNIT_TEST(testNotInDB);
+ CPPUNIT_TEST(testMultipleCopies);
+ CPPUNIT_TEST(canSendRemoveWhenAllReplicaNodesRetired);
+ CPPUNIT_TEST_SUITE_END();
+
+protected:
+ void testSimple();
+ void testNotFound();
+ void testStorageFailure();
+ void testNoReply();
+ void testNotInDB();
+ void testMultipleCopies();
+ void testRevert();
+ void canSendRemoveWhenAllReplicaNodesRetired();
+
+public:
+ document::DocumentId docId;
+ document::BucketId bucketId;
+ std::unique_ptr<RemoveOperation> op;
+
+ void setUp() {
+ createLinks();
+
+ docId = document::DocumentId(document::DocIdString("test", "uri"));
+ bucketId = getExternalOperationHandler().getBucketId(docId);
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:4"));
+ };
+
+ void tearDown() {
+ close();
+ }
+
+ void sendRemove(document::DocumentId dId) {
+ std::shared_ptr<api::RemoveCommand> msg(
+ new api::RemoveCommand(document::BucketId(0), dId, 100));
+
+ op.reset(new RemoveOperation(getExternalOperationHandler(),
+ msg,
+ getDistributor().getMetrics().
+ removes[msg->getLoadType()]));
+
+ op->start(_sender, framework::MilliSecTime(0));
+ }
+
+ void replyToMessage(RemoveOperation& callback,
+ uint32_t index,
+ uint64_t oldTimestamp)
+ {
+ if (index == (uint32_t)-1) {
+ index = _sender.commands.size() - 1;
+ }
+
+ std::shared_ptr<api::StorageMessage> msg2 = _sender.commands[index];
+ api::RemoveCommand* removec = dynamic_cast<api::RemoveCommand*>(msg2.get());
+ std::unique_ptr<api::StorageReply> reply(removec->makeReply());
+ api::RemoveReply* removeR = static_cast<api::RemoveReply*>(reply.get());
+ removeR->setOldTimestamp(oldTimestamp);
+ callback.onReceive(_sender,
+ std::shared_ptr<api::StorageReply>(reply.release()));
+ }
+
+ void sendRemove() {
+ sendRemove(docId);
+ }
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(RemoveOperationTest);
+
+void
+RemoveOperationTest::testSimple()
+{
+ addNodesToBucketDB(bucketId, "1=0");
+
+ sendRemove();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Remove(BucketId(0x4000000000002a52), doc:test:uri, "
+ "timestamp 100) => 1"),
+ _sender.getLastCommand());
+
+ replyToMessage(*op, -1, 34);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("RemoveReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 100, removed doc from 34) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+RemoveOperationTest::testNotFound()
+{
+ addNodesToBucketDB(bucketId, "1=0");
+
+ sendRemove();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Remove(BucketId(0x4000000000002a52), doc:test:uri, "
+ "timestamp 100) => 1"),
+ _sender.getLastCommand());
+
+ replyToMessage(*op, -1, 0);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("RemoveReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 100, not found) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+RemoveOperationTest::testStorageFailure()
+{
+ addNodesToBucketDB(bucketId, "1=0");
+
+ sendRemove();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Remove(BucketId(0x4000000000002a52), doc:test:uri, "
+ "timestamp 100) => 1"),
+ _sender.getLastCommand());
+
+ sendReply(*op, -1, api::ReturnCode::INTERNAL_FAILURE);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("RemoveReply(BucketId(0x0000000000000000), doc:test:uri, "
+ "timestamp 100, not found) ReturnCode(INTERNAL_FAILURE)"),
+ _sender.getLastReply());
+}
+
+void
+RemoveOperationTest::testNotInDB()
+{
+ sendRemove();
+
+ CPPUNIT_ASSERT_EQUAL(std::string("RemoveReply(BucketId(0x0000000000000000), "
+ "doc:test:uri, timestamp 100, not found) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+RemoveOperationTest::testMultipleCopies()
+{
+ addNodesToBucketDB(bucketId, "1=0, 2=0, 3=0");
+
+ sendRemove();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Remove(BucketId(0x4000000000002a52), doc:test:uri, "
+ "timestamp 100) => 1,"
+ "Remove(BucketId(0x4000000000002a52), doc:test:uri, "
+ "timestamp 100) => 2,"
+ "Remove(BucketId(0x4000000000002a52), doc:test:uri, "
+ "timestamp 100) => 3"),
+ _sender.getCommands(true, true));
+
+ replyToMessage(*op, 0, 34);
+ replyToMessage(*op, 1, 34);
+ replyToMessage(*op, 2, 75);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("RemoveReply(BucketId(0x0000000000000000), "
+ "doc:test:uri, timestamp 100, removed doc from 75) ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+RemoveOperationTest::canSendRemoveWhenAllReplicaNodesRetired()
+{
+ _distributor->enableClusterState(
+ lib::ClusterState("distributor:1 storage:1 .0.s:r"));
+ addNodesToBucketDB(bucketId, "0=123");
+ sendRemove();
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Remove(BucketId(0x4000000000002a52), doc:test:uri, "
+ "timestamp 100) => 0"),
+ _sender.getLastCommand());
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/simplebucketprioritydatabasetest.cpp b/storage/src/tests/distributor/simplebucketprioritydatabasetest.cpp
new file mode 100644
index 00000000000..a066649477c
--- /dev/null
+++ b/storage/src/tests/distributor/simplebucketprioritydatabasetest.cpp
@@ -0,0 +1,143 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <string>
+#include <vespa/storage/distributor/maintenance/simplebucketprioritydatabase.h>
+
+namespace storage {
+
+namespace distributor {
+
+using document::BucketId;
+typedef MaintenancePriority Priority;
+
+class SimpleBucketPriorityDatabaseTest : public CppUnit::TestFixture {
+ CPPUNIT_TEST_SUITE(SimpleBucketPriorityDatabaseTest);
+ CPPUNIT_TEST(testIteratorRangeIsEqualOnEmptyDatabase);
+ CPPUNIT_TEST(testCanGetPrioritizedBucket);
+ CPPUNIT_TEST(testIterateOverMultiplePriorities);
+ CPPUNIT_TEST(testMultipleSetPriorityForOneBucket);
+ CPPUNIT_TEST(testIterateOverMultipleBucketsWithMultiplePriorities);
+ CPPUNIT_TEST(testNoMaintenanceNeededClearsBucketFromDatabase);
+ CPPUNIT_TEST_SUITE_END();
+
+ typedef SimpleBucketPriorityDatabase::const_iterator const_iterator;
+
+public:
+ void testIteratorRangeIsEqualOnEmptyDatabase();
+ void testCanGetPrioritizedBucket();
+ void testIterateOverMultiplePriorities();
+ void testMultipleSetPriorityForOneBucket();
+ void testIterateOverMultipleBucketsWithMultiplePriorities();
+ void testNoMaintenanceNeededClearsBucketFromDatabase();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(SimpleBucketPriorityDatabaseTest);
+
+void
+SimpleBucketPriorityDatabaseTest::testIteratorRangeIsEqualOnEmptyDatabase()
+{
+ SimpleBucketPriorityDatabase queue;
+ const_iterator begin(queue.begin());
+ const_iterator end(queue.end());
+
+ CPPUNIT_ASSERT(begin == end);
+ CPPUNIT_ASSERT(begin == begin);
+ CPPUNIT_ASSERT(end == end);
+}
+
+void
+SimpleBucketPriorityDatabaseTest::testCanGetPrioritizedBucket()
+{
+ SimpleBucketPriorityDatabase queue;
+
+ PrioritizedBucket lowPriBucket(BucketId(16, 1234), Priority::VERY_LOW);
+ queue.setPriority(lowPriBucket);
+
+ PrioritizedBucket highest(*queue.begin());
+ CPPUNIT_ASSERT_EQUAL(lowPriBucket, highest);
+}
+
+void
+SimpleBucketPriorityDatabaseTest::testIterateOverMultiplePriorities()
+{
+ SimpleBucketPriorityDatabase queue;
+
+ PrioritizedBucket lowPriBucket(BucketId(16, 1234), Priority::LOW);
+ PrioritizedBucket highPriBucket(BucketId(16, 4321), Priority::HIGH);
+ queue.setPriority(lowPriBucket);
+ queue.setPriority(highPriBucket);
+
+ const_iterator iter(queue.begin());
+ CPPUNIT_ASSERT_EQUAL(highPriBucket, *iter);
+ ++iter;
+ CPPUNIT_ASSERT(iter != queue.end());
+ CPPUNIT_ASSERT_EQUAL(lowPriBucket, *iter);
+ ++iter;
+ CPPUNIT_ASSERT(iter == queue.end());
+}
+
+void
+SimpleBucketPriorityDatabaseTest::testMultipleSetPriorityForOneBucket()
+{
+ SimpleBucketPriorityDatabase queue;
+
+ PrioritizedBucket lowPriBucket(BucketId(16, 1234), Priority::LOW);
+ PrioritizedBucket highPriBucket(BucketId(16, 1234), Priority::HIGH);
+
+ queue.setPriority(lowPriBucket);
+ queue.setPriority(highPriBucket);
+
+ const_iterator iter(queue.begin());
+ CPPUNIT_ASSERT_EQUAL(highPriBucket, *iter);
+ ++iter;
+ CPPUNIT_ASSERT(iter == queue.end());
+}
+
+void
+SimpleBucketPriorityDatabaseTest::testNoMaintenanceNeededClearsBucketFromDatabase()
+{
+ SimpleBucketPriorityDatabase queue;
+
+ PrioritizedBucket highPriBucket(BucketId(16, 1234), Priority::HIGH);
+ PrioritizedBucket noPriBucket(BucketId(16, 1234),
+ Priority::NO_MAINTENANCE_NEEDED);
+ queue.setPriority(highPriBucket);
+ queue.setPriority(noPriBucket);
+
+ const_iterator iter(queue.begin());
+ CPPUNIT_ASSERT(iter == queue.end());
+}
+
+void
+SimpleBucketPriorityDatabaseTest::testIterateOverMultipleBucketsWithMultiplePriorities()
+{
+ SimpleBucketPriorityDatabase queue;
+
+ PrioritizedBucket lowPriBucket1(BucketId(16, 1), Priority::LOW);
+ PrioritizedBucket lowPriBucket2(BucketId(16, 2), Priority::LOW);
+ PrioritizedBucket mediumPriBucket(BucketId(16, 3), Priority::MEDIUM);
+ PrioritizedBucket highPriBucket1(BucketId(16, 4), Priority::HIGH);
+ PrioritizedBucket highPriBucket2(BucketId(16, 5), Priority::HIGH);
+
+ queue.setPriority(highPriBucket1);
+ queue.setPriority(lowPriBucket2);
+ queue.setPriority(mediumPriBucket);
+ queue.setPriority(highPriBucket2);
+ queue.setPriority(lowPriBucket1);
+
+ const_iterator iter(queue.begin());
+ PrioritizedBucket lastBucket(BucketId(), Priority::PRIORITY_LIMIT);
+ for (int i = 0; i < 5; ++i) {
+ CPPUNIT_ASSERT(iter != queue.end());
+ CPPUNIT_ASSERT(!iter->moreImportantThan(lastBucket));
+ lastBucket = *iter;
+ ++iter;
+ }
+ CPPUNIT_ASSERT(iter == queue.end());
+}
+
+}
+}
+
diff --git a/storage/src/tests/distributor/simplemaintenancescannertest.cpp b/storage/src/tests/distributor/simplemaintenancescannertest.cpp
new file mode 100644
index 00000000000..512a10bbd9a
--- /dev/null
+++ b/storage/src/tests/distributor/simplemaintenancescannertest.cpp
@@ -0,0 +1,220 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/distributor/maintenance/simplemaintenancescanner.h>
+#include <vespa/storage/distributor/maintenance/simplebucketprioritydatabase.h>
+#include <vespa/storage/distributor/bucketdb/mapbucketdatabase.h>
+#include <tests/distributor/maintenancemocks.h>
+
+#include <string>
+#include <sstream>
+#include <memory>
+#include <algorithm>
+#include <iterator>
+
+namespace storage {
+
+namespace distributor {
+
+using document::BucketId;
+typedef MaintenancePriority Priority;
+
+class SimpleMaintenanceScannerTest : public CppUnit::TestFixture {
+ CPPUNIT_TEST_SUITE(SimpleMaintenanceScannerTest);
+ CPPUNIT_TEST(testPrioritizeSingleBucket);
+ CPPUNIT_TEST(testPrioritizeMultipleBuckets);
+ CPPUNIT_TEST(testPendingMaintenanceOperationStatistics);
+ CPPUNIT_TEST(perNodeMaintenanceStatsAreTracked);
+ CPPUNIT_TEST(testReset);
+ CPPUNIT_TEST_SUITE_END();
+
+ using PendingStats = SimpleMaintenanceScanner::PendingMaintenanceStats;
+
+ std::string dumpPriorityDbToString(const BucketPriorityDatabase&) const;
+
+ std::unique_ptr<MockMaintenancePriorityGenerator> _priorityGenerator;
+ std::unique_ptr<MapBucketDatabase> _bucketDb;
+ std::unique_ptr<SimpleBucketPriorityDatabase> _priorityDb;
+ std::unique_ptr<SimpleMaintenanceScanner> _scanner;
+
+ void addBucketToDb(int bucketNum);
+
+ bool scanEntireDatabase(int expected);
+
+ std::string stringifyGlobalPendingStats(const PendingStats&) const;
+
+public:
+ void testPrioritizeSingleBucket();
+ void testPrioritizeMultipleBuckets();
+ void testPendingMaintenanceOperationStatistics();
+ void perNodeMaintenanceStatsAreTracked();
+ void testReset();
+
+ void setUp();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(SimpleMaintenanceScannerTest);
+
+void
+SimpleMaintenanceScannerTest::setUp()
+{
+ _priorityGenerator.reset(new MockMaintenancePriorityGenerator());
+ _bucketDb.reset(new MapBucketDatabase());
+ _priorityDb.reset(new SimpleBucketPriorityDatabase());
+ _scanner.reset(new SimpleMaintenanceScanner(*_priorityDb, *_priorityGenerator, *_bucketDb));
+}
+
+void
+SimpleMaintenanceScannerTest::addBucketToDb(int bucketNum)
+{
+ BucketDatabase::Entry entry(BucketId(16, bucketNum), BucketInfo());
+ _bucketDb->update(entry);
+}
+
+std::string
+SimpleMaintenanceScannerTest::stringifyGlobalPendingStats(
+ const PendingStats& stats) const
+{
+ std::ostringstream ss;
+ ss << stats.global;
+ return ss.str();
+}
+
+void
+SimpleMaintenanceScannerTest::testPrioritizeSingleBucket()
+{
+ addBucketToDb(1);
+ std::string expected("PrioritizedBucket(BucketId(0x4000000000000001), pri VERY_HIGH)\n");
+
+ CPPUNIT_ASSERT(!_scanner->scanNext().isDone());
+ CPPUNIT_ASSERT_EQUAL(expected, _priorityDb->toString());
+
+ CPPUNIT_ASSERT(_scanner->scanNext().isDone());
+ CPPUNIT_ASSERT_EQUAL(expected, _priorityDb->toString());
+}
+
+namespace {
+ std::string sortLines(const std::string& source) {
+ vespalib::StringTokenizer st(source,"\n","");
+ std::vector<std::string> lines;
+ std::copy(st.begin(), st.end(), std::back_inserter(lines));
+ std::sort(lines.begin(), lines.end());
+ std::ostringstream ost;
+ for (auto& line : lines) {
+ ost << line << "\n";
+ }
+ return ost.str();
+ }
+}
+
+void
+SimpleMaintenanceScannerTest::testPrioritizeMultipleBuckets()
+{
+ addBucketToDb(1);
+ addBucketToDb(2);
+ addBucketToDb(3);
+ std::string expected("PrioritizedBucket(BucketId(0x4000000000000001), pri VERY_HIGH)\n"
+ "PrioritizedBucket(BucketId(0x4000000000000002), pri VERY_HIGH)\n"
+ "PrioritizedBucket(BucketId(0x4000000000000003), pri VERY_HIGH)\n");
+
+ CPPUNIT_ASSERT(scanEntireDatabase(3));
+ CPPUNIT_ASSERT_EQUAL(sortLines(expected),
+ sortLines(_priorityDb->toString()));
+}
+
+bool
+SimpleMaintenanceScannerTest::scanEntireDatabase(int expected)
+{
+ for (int i = 0; i < expected; ++i) {
+ if (_scanner->scanNext().isDone()) {
+ return false;
+ }
+ }
+ return _scanner->scanNext().isDone();
+}
+
+void
+SimpleMaintenanceScannerTest::testReset()
+{
+ addBucketToDb(1);
+ addBucketToDb(3);
+
+ CPPUNIT_ASSERT(scanEntireDatabase(2));
+ std::string expected("PrioritizedBucket(BucketId(0x4000000000000001), pri VERY_HIGH)\n"
+ "PrioritizedBucket(BucketId(0x4000000000000003), pri VERY_HIGH)\n");
+ CPPUNIT_ASSERT_EQUAL(expected, _priorityDb->toString());
+
+ addBucketToDb(2);
+ CPPUNIT_ASSERT(scanEntireDatabase(0));
+ CPPUNIT_ASSERT_EQUAL(expected, _priorityDb->toString());
+
+ _scanner->reset();
+ CPPUNIT_ASSERT(scanEntireDatabase(3));
+
+ expected = "PrioritizedBucket(BucketId(0x4000000000000001), pri VERY_HIGH)\n"
+ "PrioritizedBucket(BucketId(0x4000000000000002), pri VERY_HIGH)\n"
+ "PrioritizedBucket(BucketId(0x4000000000000003), pri VERY_HIGH)\n";
+ CPPUNIT_ASSERT_EQUAL(sortLines(expected), sortLines(_priorityDb->toString()));
+}
+
+void
+SimpleMaintenanceScannerTest::testPendingMaintenanceOperationStatistics()
+{
+ addBucketToDb(1);
+ addBucketToDb(3);
+
+ std::string expectedEmpty("delete bucket: 0, merge bucket: 0, "
+ "split bucket: 0, join bucket: 0, "
+ "set bucket state: 0, garbage collection: 0");
+ {
+ auto stats(_scanner->getPendingMaintenanceStats());
+ CPPUNIT_ASSERT_EQUAL(expectedEmpty, stringifyGlobalPendingStats(stats));
+ }
+
+ CPPUNIT_ASSERT(scanEntireDatabase(2));
+
+ // All mock operations generated have the merge type.
+ {
+ auto stats(_scanner->getPendingMaintenanceStats());
+ std::string expected("delete bucket: 0, merge bucket: 2, "
+ "split bucket: 0, join bucket: 0, "
+ "set bucket state: 0, garbage collection: 0");
+ CPPUNIT_ASSERT_EQUAL(expected, stringifyGlobalPendingStats(stats));
+ }
+
+ _scanner->reset();
+ {
+ auto stats(_scanner->getPendingMaintenanceStats());
+ CPPUNIT_ASSERT_EQUAL(expectedEmpty, stringifyGlobalPendingStats(stats));
+ }
+}
+
+void
+SimpleMaintenanceScannerTest::perNodeMaintenanceStatsAreTracked()
+{
+ addBucketToDb(1);
+ addBucketToDb(3);
+ {
+ auto stats(_scanner->getPendingMaintenanceStats());
+ NodeMaintenanceStats emptyStats;
+ CPPUNIT_ASSERT_EQUAL(emptyStats, stats.perNodeStats.forNode(0));
+ }
+ CPPUNIT_ASSERT(scanEntireDatabase(2));
+ // Mock is currently hardwired to increment movingOut for node 1 and
+ // copyingIn for node 2 per bucket iterated (we've got 2).
+ auto stats(_scanner->getPendingMaintenanceStats());
+ {
+ NodeMaintenanceStats wantedNode1Stats;
+ wantedNode1Stats.movingOut = 2;
+ CPPUNIT_ASSERT_EQUAL(wantedNode1Stats, stats.perNodeStats.forNode(1));
+ }
+ {
+ NodeMaintenanceStats wantedNode2Stats;
+ wantedNode2Stats.copyingIn = 2;
+ CPPUNIT_ASSERT_EQUAL(wantedNode2Stats, stats.perNodeStats.forNode(2));
+ }
+}
+
+}
+}
diff --git a/storage/src/tests/distributor/splitbuckettest.cpp b/storage/src/tests/distributor/splitbuckettest.cpp
new file mode 100644
index 00000000000..d0fa69d600e
--- /dev/null
+++ b/storage/src/tests/distributor/splitbuckettest.cpp
@@ -0,0 +1,353 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storage/distributor/operations/idealstate/splitoperation.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/document/base/documentid.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <tests/distributor/distributortestutil.h>
+
+using std::shared_ptr;
+using namespace document;
+
+namespace storage {
+
+namespace distributor {
+
+class SplitOperationTest : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+ CPPUNIT_TEST_SUITE(SplitOperationTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST(testMultiNodeFailure);
+ CPPUNIT_TEST(testCopyTrustedStatusNotCarriedOverAfterSplit);
+ CPPUNIT_TEST(testOperationBlockedByPendingJoin);
+ CPPUNIT_TEST_SUITE_END();
+
+ uint32_t splitByteSize;
+ uint32_t tooLargeBucketSize;
+ uint32_t splitCount;
+ uint32_t maxSplitBits;
+
+protected:
+ void testSimple();
+ void testMultiNodeFailure();
+ void testCopyTrustedStatusNotCarriedOverAfterSplit();
+ void testOperationBlockedByPendingJoin();
+
+public:
+ SplitOperationTest();
+
+ void setUp() {
+ createLinks();
+ getConfig().setSplitCount(splitCount);
+ getConfig().setSplitSize(splitByteSize);
+
+ }
+
+ void tearDown() {
+ close();
+ }
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(SplitOperationTest);
+
+SplitOperationTest::SplitOperationTest()
+ : splitByteSize(10*1024*1024),
+ tooLargeBucketSize(splitByteSize * 1.1),
+ splitCount(UINT32_MAX),
+ maxSplitBits(58)
+{
+}
+
+void
+SplitOperationTest::testSimple()
+{
+ _distributor->enableClusterState(
+ lib::ClusterState("distributor:1 storage:1"));
+
+ insertBucketInfo(document::BucketId(16, 1), 0, 0xabc, 1000,
+ tooLargeBucketSize, 250);
+
+ SplitOperation op("storage",
+ BucketAndNodes(document::BucketId(16, 1),
+ toVector<uint16_t>(0)),
+ maxSplitBits,
+ splitCount,
+ splitByteSize);
+
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ {
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _sender.commands.size());
+
+ std::shared_ptr<api::StorageCommand> msg = _sender.commands[0];
+ CPPUNIT_ASSERT(msg->getType() == api::MessageType::SPLITBUCKET);
+ CPPUNIT_ASSERT_EQUAL(
+ api::StorageMessageAddress("storage", lib::NodeType::STORAGE, 0)
+ .toString(),
+ msg->getAddress()->toString());
+
+ std::shared_ptr<api::StorageReply> reply(msg->makeReply().release());
+ api::SplitBucketReply* sreply(
+ static_cast<api::SplitBucketReply*>(reply.get()));
+
+ sreply->getSplitInfo().push_back(api::SplitBucketReply::Entry(
+ document::BucketId(17, 1),
+ api::BucketInfo(100, 600, 5000000)));
+
+ sreply->getSplitInfo().push_back(api::SplitBucketReply::Entry(
+ document::BucketId(17, 0x10001),
+ api::BucketInfo(110, 400, 6000000)));
+
+ op.receive(_sender, reply);
+ }
+
+ CPPUNIT_ASSERT(!getBucket(document::BucketId(16, 1)).valid());
+
+ {
+ BucketDatabase::Entry entry = getBucket(document::BucketId(17, 1));
+
+ CPPUNIT_ASSERT(entry.valid());
+ CPPUNIT_ASSERT_EQUAL((uint16_t)0, entry->getNodeRef(0).getNode());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)100, entry->getNodeRef(0).getChecksum());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)5000000,
+ entry->getNodeRef(0).getTotalDocumentSize());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)600,
+ entry->getNodeRef(0).getDocumentCount());
+ }
+
+ {
+ BucketDatabase::Entry entry(getBucket(document::BucketId(17, 0x10001)));
+
+ CPPUNIT_ASSERT(entry.valid());
+ CPPUNIT_ASSERT_EQUAL((uint16_t)0, entry->getNodeRef(0).getNode());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)110, entry->getNodeRef(0).getChecksum());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)6000000,
+ entry->getNodeRef(0).getTotalDocumentSize());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)400,
+ entry->getNodeRef(0).getDocumentCount());
+ }
+}
+
+void
+SplitOperationTest::testMultiNodeFailure()
+{
+ {
+ BucketDatabase::Entry entry(document::BucketId(16, 1));
+
+ BucketCopy copy(0, 0, api::BucketInfo(250, 1000, tooLargeBucketSize));
+ entry->addNode(copy, toVector<uint16_t>(0));
+
+ entry->addNode(BucketCopy(0, 1, copy.getBucketInfo()),
+ toVector<uint16_t>(0));
+ getBucketDatabase().update(entry);
+ }
+
+ _distributor->enableClusterState(
+ lib::ClusterState("distributor:1 storage:2"));
+
+
+ SplitOperation op("storage",
+ BucketAndNodes(document::BucketId(16, 1),
+ toVector<uint16_t>(0,1)),
+ maxSplitBits,
+ splitCount,
+ splitByteSize);
+
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ {
+ CPPUNIT_ASSERT_EQUAL((size_t)2, _sender.commands.size());
+
+ {
+ std::shared_ptr<api::StorageCommand> msg = _sender.commands[0];
+ CPPUNIT_ASSERT(msg->getType() == api::MessageType::SPLITBUCKET);
+ CPPUNIT_ASSERT_EQUAL(
+ api::StorageMessageAddress("storage",
+ lib::NodeType::STORAGE, 0).toString(),
+ msg->getAddress()->toString());
+
+ api::SplitBucketReply* sreply(
+ static_cast<api::SplitBucketReply*>(
+ msg->makeReply().release()));
+ sreply->setResult(api::ReturnCode::OK);
+
+ sreply->getSplitInfo().push_back(api::SplitBucketReply::Entry(
+ document::BucketId(17, 1),
+ api::BucketInfo(100, 600, 5000000)));
+
+ sreply->getSplitInfo().push_back(api::SplitBucketReply::Entry(
+ document::BucketId(17, 0x10001),
+ api::BucketInfo(110, 400, 6000000)));
+
+ op.receive(_sender, std::shared_ptr<api::StorageReply>(sreply));
+ }
+
+ sendReply(op, 1, api::ReturnCode::NOT_CONNECTED);
+ }
+
+ {
+ BucketDatabase::Entry entry = getBucket(document::BucketId(16, 1));
+
+ CPPUNIT_ASSERT(entry.valid());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)1, entry->getNodeCount());
+
+ CPPUNIT_ASSERT_EQUAL((uint16_t)1, entry->getNodeRef(0).getNode());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)250, entry->getNodeRef(0).getChecksum());
+ CPPUNIT_ASSERT_EQUAL(tooLargeBucketSize,
+ entry->getNodeRef(0).getTotalDocumentSize());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)1000,
+ entry->getNodeRef(0).getDocumentCount());
+ }
+
+ {
+ BucketDatabase::Entry entry = getBucket(document::BucketId(17, 1));
+
+ CPPUNIT_ASSERT(entry.valid());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)1, entry->getNodeCount());
+
+ CPPUNIT_ASSERT_EQUAL((uint16_t)0, entry->getNodeRef(0).getNode());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)100, entry->getNodeRef(0).getChecksum());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)5000000,
+ entry->getNodeRef(0).getTotalDocumentSize());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)600,
+ entry->getNodeRef(0).getDocumentCount());
+ }
+
+ {
+ BucketDatabase::Entry entry(getBucket(document::BucketId(17, 0x10001)));
+
+ CPPUNIT_ASSERT(entry.valid());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)1, entry->getNodeCount());
+
+ CPPUNIT_ASSERT_EQUAL((uint16_t)0, entry->getNodeRef(0).getNode());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)110, entry->getNodeRef(0).getChecksum());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)6000000,
+ entry->getNodeRef(0).getTotalDocumentSize());
+ CPPUNIT_ASSERT_EQUAL((uint32_t)400,
+ entry->getNodeRef(0).getDocumentCount());
+ }
+}
+
+void
+SplitOperationTest::testCopyTrustedStatusNotCarriedOverAfterSplit()
+{
+ _distributor->enableClusterState(
+ lib::ClusterState("distributor:1 storage:2"));
+
+ document::BucketId sourceBucket(16, 1);
+ /*
+ * Need 3 nodes to reproduce bug 6418516. Otherwise, the source bucket is
+ * left with only 1 copy which implicitly becomes trusted. When this copy
+ * is then split, the distributor db will automatically un-trust all buckets
+ * since it sees that multiple copies are trusted that are not consistent
+ * with each other. This prevents the bug from being visible.
+ */
+ addNodesToBucketDB(sourceBucket, "0=150/20/30000000/t,1=450/50/60000/u,"
+ "2=550/60/70000");
+
+ SplitOperation op("storage",
+ BucketAndNodes(sourceBucket, toVector<uint16_t>(0, 1)),
+ maxSplitBits,
+ splitCount,
+ splitByteSize);
+
+ op.setIdealStateManager(&getIdealStateManager());
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(size_t(3), _sender.commands.size());
+
+ std::vector<document::BucketId> childBuckets;
+ childBuckets.push_back(document::BucketId(17, 1));
+ childBuckets.push_back(document::BucketId(17, 0x10001));
+
+ // Note: only 2 out of 3 requests replied to!
+ for (int i = 0; i < 2; ++i) {
+ std::shared_ptr<api::StorageCommand> msg = _sender.commands[i];
+ CPPUNIT_ASSERT(msg->getType() == api::MessageType::SPLITBUCKET);
+ std::shared_ptr<api::StorageReply> reply(msg->makeReply().release());
+ api::SplitBucketReply* sreply(
+ static_cast<api::SplitBucketReply*>(reply.get()));
+
+ // Make sure copies differ so they cannot become implicitly trusted.
+ sreply->getSplitInfo().push_back(api::SplitBucketReply::Entry(
+ childBuckets[0],
+ api::BucketInfo(100 + i, 600, 5000000)));
+ sreply->getSplitInfo().push_back(api::SplitBucketReply::Entry(
+ childBuckets[1],
+ api::BucketInfo(110 + i, 400, 6000000)));
+
+ op.receive(_sender, reply);
+ }
+
+ CPPUNIT_ASSERT(getBucket(sourceBucket).valid()); // Still alive
+
+ for (uint32_t i = 0; i < 2; ++i) {
+ BucketDatabase::Entry entry(getBucket(childBuckets[i]));
+
+ CPPUNIT_ASSERT(entry.valid());
+ CPPUNIT_ASSERT_EQUAL(size_t(2), entry->getNodes().size());
+
+ for (uint16_t j = 0; j < 2; ++j) {
+ CPPUNIT_ASSERT(!entry->getNodeRef(i).trusted());
+ }
+ }
+}
+
+void
+SplitOperationTest::testOperationBlockedByPendingJoin()
+{
+ StorageComponentRegisterImpl compReg;
+ framework::defaultimplementation::FakeClock clock;
+ compReg.setClock(clock);
+ clock.setAbsoluteTimeInSeconds(1);
+ PendingMessageTracker tracker(compReg);
+
+ _distributor->enableClusterState(
+ lib::ClusterState("distributor:1 storage:2"));
+
+ document::BucketId joinTarget(2, 1);
+ std::vector<document::BucketId> joinSources = {
+ document::BucketId(3, 1), document::BucketId(3, 5)
+ };
+ auto joinCmd = std::make_shared<api::JoinBucketsCommand>(joinTarget);
+ joinCmd->getSourceBuckets() = joinSources;
+ joinCmd->setAddress(
+ api::StorageMessageAddress("storage", lib::NodeType::STORAGE, 0));
+
+ tracker.insert(joinCmd);
+
+ insertBucketInfo(joinTarget, 0, 0xabc, 1000, 1234, 250);
+
+ SplitOperation op("storage",
+ BucketAndNodes(joinTarget, toVector<uint16_t>(0)),
+ maxSplitBits,
+ splitCount,
+ splitByteSize);
+
+ CPPUNIT_ASSERT(op.isBlocked(tracker));
+
+ // Now, pretend there's a join for another node in the same bucket. This
+ // will happen when a join is partially completed.
+ tracker.clearMessagesForNode(0);
+ CPPUNIT_ASSERT(!op.isBlocked(tracker));
+
+ joinCmd->setAddress(
+ api::StorageMessageAddress("storage", lib::NodeType::STORAGE, 1));
+ tracker.insert(joinCmd);
+
+ CPPUNIT_ASSERT(op.isBlocked(tracker));
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/statecheckerstest.cpp b/storage/src/tests/distributor/statecheckerstest.cpp
new file mode 100644
index 00000000000..da444b9d22a
--- /dev/null
+++ b/storage/src/tests/distributor/statecheckerstest.cpp
@@ -0,0 +1,1838 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storage/distributor/bucketdbupdater.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storage/config/config-stor-distributormanager.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storage/distributor/operations/idealstate/mergeoperation.h>
+#include <vespa/storage/distributor/operations/idealstate/removebucketoperation.h>
+#include <vespa/storage/distributor/operations/idealstate/setbucketstateoperation.h>
+#include <vespa/storage/distributor/operations/idealstate/splitoperation.h>
+#include <vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h>
+#include <vespa/storageapi/message/stat.h>
+#include <vespa/storage/storageutil/utils.h>
+#include <tests/distributor/distributortestutil.h>
+#include <vespa/storage/distributor/statecheckers.h>
+#include <vespa/storageapi/message/state.h>
+
+using namespace std::literals::string_literals;
+
+namespace storage {
+namespace distributor {
+
+struct StateCheckersTest : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+ StateCheckersTest() {}
+
+ void setUp() {
+ createLinks();
+ }
+
+ void tearDown() {
+ close();
+ }
+
+ struct PendingMessage
+ {
+ uint32_t _msgType;
+ uint8_t _pri;
+
+ PendingMessage() : _msgType(UINT32_MAX), _pri(0) {}
+
+ PendingMessage(uint32_t msgType, uint8_t pri)
+ : _msgType(msgType), _pri(pri) {}
+
+ bool shouldCheck() const { return _msgType != UINT32_MAX; }
+ };
+
+ void testSplit();
+ void testInconsistentSplit();
+ void splitCanBeScheduledWhenReplicasOnRetiredNodes();
+ void testSynchronizeAndMove();
+ void testDoNotMergeInconsistentlySplitBuckets();
+ void doNotMoveReplicasWithinRetiredNodes();
+ void testDeleteExtraCopies();
+ void testDoNotDeleteActiveExtraCopies();
+ void testConsistentCopiesOnRetiredNodesMayBeDeleted();
+ void redundantCopyDeletedEvenWhenAllNodesRetired();
+ void testJoin();
+ void testDoNotJoinBelowClusterStateBitCount();
+ void testAllowInconsistentJoinInDifferingSiblingIdealState();
+ void testDoNotAllowInconsistentJoinWhenNotInIdealState();
+ void testDoNotAllowInconsistentJoinWhenConfigDisabled();
+ void testNoJoinWhenInvalidCopyExists();
+ void testNoJoinOnDifferentNodes();
+ void testNoJoinWhenCopyCountAboveRedundancyLevelsForLeftSibling();
+ void testNoJoinWhenCopyCountAboveRedundancyLevelsForRightSibling();
+ void testNoJoinWhenCopyCountAboveRedundancyLevelsForBothSiblings();
+ void joinCanBeScheduledWhenReplicasOnRetiredNodes();
+ void testBucketState();
+ void testDoNotActivateNonReadyCopiesWhenIdealNodeInMaintenance();
+ void testDoNotChangeActiveStateForInconsistentlySplitBuckets();
+ void testNoActiveChangeForNonIdealCopiesWhenOtherwiseIdentical();
+ void testBucketStatePerGroup();
+ void allowActivationOfRetiredNodes();
+ void inhibitBucketActivationIfDisabledInConfig();
+ void inhibitBucketDeactivationIfDisabledInConfig();
+ void retiredNodesOutOfSyncAreMerged();
+ void testGarbageCollection();
+ void gcInhibitedWhenIdealNodeInMaintenance();
+ void testNoRemoveWhenIdealNodeInMaintenance();
+ void testStepwiseJoinForSmallBucketsWithoutSiblings();
+ void testNoStepwiseJoinWhenDisabledThroughConfig();
+ void testNoStepwiseJoinWhenSingleSiblingTooLarge();
+ void testStepwiseJoinMaySkipMultipleBitsWhenConsistent();
+ void testStepwiseJoinDoesNotSkipBeyondLevelWithSibling();
+ void contextPopulatesIdealStateContainers();
+ void statsUpdatedWhenMergingDueToMove();
+ void statsUpdatedWhenMergingDueToMissingCopy();
+ void statsUpdatedWhenMergingDueToOutOfSyncCopies();
+
+ void enableClusterState(const lib::ClusterState& systemState) {
+ _distributor->enableClusterState(systemState);
+ }
+
+ void insertJoinableBuckets();
+
+ void assertCurrentIdealState(const document::BucketId& bucket,
+ const std::vector<uint16_t> expected)
+ {
+ std::vector<uint16_t> idealNodes(
+ getIdealStateManager().getDistributorComponent()
+ .getDistribution().getIdealStorageNodes(
+ getIdealStateManager().getDistributorComponent()
+ .getClusterState(),
+ bucket,
+ "ui"));
+ CPPUNIT_ASSERT_EQUAL(expected, idealNodes);
+ }
+
+ void enableInconsistentJoinInConfig(bool enabled);
+
+ std::string testStateChecker(
+ StateChecker& checker,
+ StateChecker::Context& c,
+ bool includeBucketId = false,
+ const PendingMessage& blocker = PendingMessage(),
+ bool includeMessagePriority = false,
+ bool includeSchedulingPriority = false)
+ {
+ std::ostringstream ost;
+
+ c.siblingBucket = getIdealStateManager().getDistributorComponent()
+ .getSibling(c.bucketId);
+
+ std::vector<BucketDatabase::Entry> entries;
+ getBucketDatabase().getAll(c.bucketId, entries);
+ c.siblingEntry = getBucketDatabase().get(c.siblingBucket);
+
+ c.entries = entries;
+ for (uint32_t j = 0; j < entries.size(); ++j) {
+ // Run checking only on this bucketid, but include all buckets
+ // owned by it or owners of it, so we can detect inconsistent split.
+ if (entries[j].getBucketId() == c.bucketId) {
+ c.entry = entries[j];
+
+ StateChecker::Result result(checker.check(c));
+ IdealStateOperation::UP op(result.createOperation());
+ if (op.get()) {
+ if (blocker.shouldCheck()
+ && op->shouldBlockThisOperation(blocker._msgType,
+ blocker._pri))
+ {
+ return "BLOCKED";
+ }
+
+ if (!ost.str().empty()) {
+ ost << ",";
+ }
+ if (includeBucketId) {
+ ost << op->getBucketId() << ": ";
+ }
+
+ ost << op->getDetailedReason();
+ if (includeMessagePriority) {
+ ost << " (pri "
+ << static_cast<int>(op->getPriority())
+ << ')';
+ }
+ if (includeSchedulingPriority) {
+ ost << " (scheduling pri "
+ << MaintenancePriority::toString(
+ result.getPriority().getPriority())
+ << ")";
+ }
+ }
+ }
+ }
+
+ if (ost.str().empty()) {
+ ost << "NO OPERATIONS GENERATED";
+ }
+
+ getBucketDatabase().clear();
+
+ return ost.str();
+ }
+
+ std::string testGarbageCollection(uint32_t prevTimestamp,
+ uint32_t nowTimestamp,
+ uint32_t checkInterval,
+ uint32_t lastChangeTime = 0,
+ bool includePriority = false);
+
+ std::string testSplit(uint32_t splitCount,
+ uint32_t splitSize,
+ uint32_t minSplitBits,
+ const std::string& bucketInfo,
+ const PendingMessage& blocker = PendingMessage(),
+ bool includePriority = false);
+
+ std::string testInconsistentSplit(const document::BucketId& bid,
+ bool includePriority = false);
+
+ std::string testJoin(uint32_t joinCount,
+ uint32_t joinSize,
+ uint32_t minSplitBits,
+ const document::BucketId& bid,
+ const PendingMessage& blocker = PendingMessage(),
+ bool includePriority = false);
+
+ struct CheckerParams {
+ std::string _bucketInfo;
+ std::string _clusterState {"distributor:1 storage:2"};
+ std::string _expect;
+ static const PendingMessage NO_OP_BLOCKER;
+ const PendingMessage* _blockerMessage {&NO_OP_BLOCKER};
+ uint32_t _redundancy {2};
+ uint32_t _splitCount {0};
+ uint32_t _splitSize {0};
+ uint32_t _minSplitBits {0};
+ bool _includeMessagePriority {false};
+ bool _includeSchedulingPriority {false};
+
+ CheckerParams& expect(const std::string& e) {
+ _expect = e;
+ return *this;
+ }
+ CheckerParams& bucketInfo(const std::string& info) {
+ _bucketInfo = info;
+ return *this;
+ }
+ CheckerParams& clusterState(const std::string& state) {
+ _clusterState = state;
+ return *this;
+ }
+ CheckerParams& blockerMessage(const PendingMessage& blocker) {
+ _blockerMessage = &blocker;
+ return *this;
+ }
+ CheckerParams& redundancy(uint32_t r) {
+ _redundancy = r;
+ return *this;
+ }
+ CheckerParams& includeMessagePriority(bool includePri) {
+ _includeMessagePriority = includePri;
+ return *this;
+ }
+ CheckerParams& includeSchedulingPriority(bool includePri) {
+ _includeSchedulingPriority = includePri;
+ return *this;
+ }
+ };
+
+ template <typename CheckerImpl>
+ void runAndVerify(const CheckerParams& params) {
+ CheckerImpl checker;
+
+ document::BucketId bid(17, 0);
+ addNodesToBucketDB(bid, params._bucketInfo);
+ setRedundancy(params._redundancy);
+ _distributor->enableClusterState(
+ lib::ClusterState(params._clusterState));
+ NodeMaintenanceStatsTracker statsTracker;
+ StateChecker::Context c(
+ getExternalOperationHandler(), statsTracker, bid);
+ std::string result = testStateChecker(
+ checker, c, false, *params._blockerMessage,
+ params._includeMessagePriority,
+ params._includeSchedulingPriority);
+ CPPUNIT_ASSERT_EQUAL(params._expect, result);
+ }
+
+ std::string testSynchronizeAndMove(
+ const std::string& bucketInfo,
+ const std::string& clusterState = "distributor:1 storage:2",
+ uint32_t redundancy = 2,
+ const PendingMessage& blocker = PendingMessage(),
+ bool includePriority = false);
+
+ std::string testDeleteExtraCopies(
+ const std::string& bucketInfo,
+ uint32_t redundancy = 2,
+ const PendingMessage& blocker = PendingMessage(),
+ const std::string& clusterState = "",
+ bool includePriority = false);
+
+ std::string testBucketState(const std::string& bucketInfo,
+ uint32_t redundancy = 2,
+ bool includePriority = false);
+ std::string testBucketStatePerGroup(const std::string& bucketInfo,
+ bool includePriority = false);
+
+ CPPUNIT_TEST_SUITE(StateCheckersTest);
+ CPPUNIT_TEST(testSplit);
+ CPPUNIT_TEST(testInconsistentSplit);
+ CPPUNIT_TEST(splitCanBeScheduledWhenReplicasOnRetiredNodes);
+ CPPUNIT_TEST(testSynchronizeAndMove);
+ CPPUNIT_TEST(testDoNotMergeInconsistentlySplitBuckets);
+ CPPUNIT_TEST(doNotMoveReplicasWithinRetiredNodes);
+ CPPUNIT_TEST(retiredNodesOutOfSyncAreMerged);
+ CPPUNIT_TEST(testDoNotChangeActiveStateForInconsistentlySplitBuckets);
+ CPPUNIT_TEST(testDeleteExtraCopies);
+ CPPUNIT_TEST(testDoNotDeleteActiveExtraCopies);
+ CPPUNIT_TEST(testConsistentCopiesOnRetiredNodesMayBeDeleted);
+ CPPUNIT_TEST(redundantCopyDeletedEvenWhenAllNodesRetired);
+ CPPUNIT_TEST(testJoin);
+ CPPUNIT_TEST(testDoNotJoinBelowClusterStateBitCount);
+ CPPUNIT_TEST(testAllowInconsistentJoinInDifferingSiblingIdealState);
+ CPPUNIT_TEST(testDoNotAllowInconsistentJoinWhenNotInIdealState);
+ CPPUNIT_TEST(testDoNotAllowInconsistentJoinWhenConfigDisabled);
+ CPPUNIT_TEST(testNoJoinWhenInvalidCopyExists);
+ CPPUNIT_TEST(testNoJoinOnDifferentNodes);
+ CPPUNIT_TEST(testNoJoinWhenCopyCountAboveRedundancyLevelsForLeftSibling);
+ CPPUNIT_TEST(testNoJoinWhenCopyCountAboveRedundancyLevelsForRightSibling);
+ CPPUNIT_TEST(testNoJoinWhenCopyCountAboveRedundancyLevelsForBothSiblings);
+ CPPUNIT_TEST(joinCanBeScheduledWhenReplicasOnRetiredNodes);
+ CPPUNIT_TEST(testBucketState);
+ CPPUNIT_TEST(testDoNotActivateNonReadyCopiesWhenIdealNodeInMaintenance);
+ CPPUNIT_TEST(testNoActiveChangeForNonIdealCopiesWhenOtherwiseIdentical);
+ CPPUNIT_TEST(testBucketStatePerGroup);
+ CPPUNIT_TEST(allowActivationOfRetiredNodes);
+ CPPUNIT_TEST(inhibitBucketActivationIfDisabledInConfig);
+ CPPUNIT_TEST(inhibitBucketDeactivationIfDisabledInConfig);
+ CPPUNIT_TEST(testGarbageCollection);
+ CPPUNIT_TEST(gcInhibitedWhenIdealNodeInMaintenance);
+ CPPUNIT_TEST(testNoRemoveWhenIdealNodeInMaintenance);
+ CPPUNIT_TEST(testStepwiseJoinForSmallBucketsWithoutSiblings);
+ CPPUNIT_TEST(testNoStepwiseJoinWhenDisabledThroughConfig);
+ CPPUNIT_TEST(testNoStepwiseJoinWhenSingleSiblingTooLarge);
+ CPPUNIT_TEST(testStepwiseJoinMaySkipMultipleBitsWhenConsistent);
+ CPPUNIT_TEST(testStepwiseJoinDoesNotSkipBeyondLevelWithSibling);
+ CPPUNIT_TEST(contextPopulatesIdealStateContainers);
+ CPPUNIT_TEST(statsUpdatedWhenMergingDueToMove);
+ CPPUNIT_TEST(statsUpdatedWhenMergingDueToMissingCopy);
+ CPPUNIT_TEST(statsUpdatedWhenMergingDueToOutOfSyncCopies);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(StateCheckersTest);
+
+const StateCheckersTest::PendingMessage
+StateCheckersTest::CheckerParams::NO_OP_BLOCKER;
+
+std::string StateCheckersTest::testSplit(uint32_t splitCount,
+ uint32_t splitSize,
+ uint32_t minSplitBits,
+ const std::string& bucketInfo,
+ const PendingMessage& blocker,
+ bool includePriority)
+{
+ document::BucketId bid(17, 0);
+
+ addNodesToBucketDB(bid, bucketInfo);
+
+ SplitBucketStateChecker checker;
+ NodeMaintenanceStatsTracker statsTracker;
+ StateChecker::Context c(getExternalOperationHandler(), statsTracker, bid);
+ getConfig().setSplitSize(splitSize);
+ getConfig().setSplitCount(splitCount);
+ getConfig().setMinimalBucketSplit(minSplitBits);
+ return testStateChecker(checker, c, false, blocker, includePriority);
+}
+
+
+
+void
+StateCheckersTest::testSplit()
+{
+ setupDistributor(3, 10, "distributor:1 storage:2");
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Splitting bucket because its maximum size (2000 b, 10 docs, 10 meta, 2000 b total) "
+ "is higher than the configured limit of (1000, 4294967295)]"),
+ testSplit((uint32_t)-1, 1000, 16, "0=100/10/2000"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Splitting bucket because its maximum size (1000 b, "
+ "200 docs, 200 meta, 1000 b total) "
+ "is higher than the configured limit of (10000, 100)] "
+ "(pri 175)"),
+ testSplit(100, 10000, 16, "0=100/200/1000", PendingMessage(), true));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testSplit(1000, 1000, 16, "0=100/200/200"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testSplit(1000, 1000, 16, "0=100/200/200/2000/2000"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Splitting bucket because the current system size requires "
+ "a higher minimum split bit]"),
+ testSplit((uint32_t)-1, (uint32_t)-1, 24, "0=100/200/1000"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Splitting bucket because its maximum size (1000 b, 1000 docs, 1000 meta, 1000 b total) "
+ "is higher than the configured limit of (10000, 100)]"),
+ testSplit(100, 10000, 16, "0=100/10/10,1=100/1000/1000"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Splitting bucket because its maximum size (1000 b, 1000 docs, 1000 meta, 1000 b total) "
+ "is higher than the configured limit of (10000, 100)]"),
+ testSplit(100, 10000, 16, "0=1/0/0,1=100/1000/1000"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Splitting bucket because its maximum size (1000 b, 1000 docs, 1000 meta, 1000 b total) "
+ "is higher than the configured limit of (10000, 100)]"),
+ testSplit(100, 10000, 16, "0=0/0/1,1=100/1000/1000"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testSplit(1000, 1000, 16, "0=100/1/200000"));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BLOCKED"),
+ testSplit(100, 10000, 16, "0=0/0/1,1=100/1000/1000",
+ PendingMessage(api::MessageType::SPLITBUCKET_ID, 0)));
+
+ // Split on too high meta
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Splitting bucket because its maximum size (1000 b, 100 docs, 2100 meta, 15000000 b total) "
+ "is higher than the configured limit of (10000000, 1000)]"),
+ testSplit(1000, 10000000, 16, "0=14/100/1000/2100/15000000"));
+ // Split on too high file size
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Splitting bucket because its maximum size (1000 b, 100 docs, 1500 meta, 21000000 b total) "
+ "is higher than the configured limit of (10000000, 1000)]"),
+ testSplit(1000, 10000000, 16, "0=14/100/1000/1500/21000000"));
+
+ // Don't block higher priority splits than what's already pending.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Splitting bucket because its maximum size (1000 b, 1000 docs, 1000 meta, 1000 b total) "
+ "is higher than the configured limit of (10000, 100)]"),
+ testSplit(100, 10000, 16, "0=100/10/10,1=100/1000/1000",
+ PendingMessage(api::MessageType::SPLITBUCKET_ID, 255)));
+
+ // But must block equal priority splits that are already pending, or
+ // we'll end up spamming the nodes with splits!
+ // NOTE: assuming split priority of 175.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BLOCKED"),
+ testSplit(100, 10000, 16, "0=0/0/1,1=100/1000/1000",
+ PendingMessage(api::MessageType::SPLITBUCKET_ID, 175)));
+
+ // Don't split if we're already joining, since there's a window of time
+ // where the bucket will appear to be inconsistently split when the join
+ // is not finished on all the nodes.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BLOCKED"),
+ testSplit(100, 10000, 16, "0=0/0/1,1=100/1000/1000",
+ PendingMessage(api::MessageType::JOINBUCKETS_ID, 175)));
+}
+
+std::string
+StateCheckersTest::testInconsistentSplit(const document::BucketId& bid,
+ bool includePriority)
+{
+ SplitInconsistentStateChecker checker;
+ NodeMaintenanceStatsTracker statsTracker;
+ StateChecker::Context c(getExternalOperationHandler(), statsTracker, bid);
+ return testStateChecker(checker, c, true,
+ PendingMessage(), includePriority);
+}
+
+void
+StateCheckersTest::testInconsistentSplit()
+{
+ setupDistributor(3, 10, "distributor:1 storage:2");
+
+ insertBucketInfo(document::BucketId(16, 1), 1, 0x1, 1, 1);
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testInconsistentSplit(document::BucketId(16, 1)));
+
+ insertBucketInfo(document::BucketId(17, 1), 1, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(16, 1), 1, 0x1, 1, 1);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001): [Bucket is inconsistently "
+ "split (list includes 0x4000000000000001, 0x4400000000000001) "
+ "Splitting it to improve the problem (max used bits 17)]"),
+ testInconsistentSplit(document::BucketId(16, 1)));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testInconsistentSplit(document::BucketId(17, 1)));
+
+ insertBucketInfo(document::BucketId(17, 1), 0, 0x0, 0, 0);
+ insertBucketInfo(document::BucketId(16, 1), 1, 0x1, 1, 1);
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000001): [Bucket is inconsistently "
+ "split (list includes 0x4000000000000001, 0x4400000000000001) "
+ "Splitting it to improve the problem (max used bits "
+ "17)] (pri 110)"),
+ testInconsistentSplit(document::BucketId(16, 1), true));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testInconsistentSplit(document::BucketId(17, 1)));
+}
+
+void
+StateCheckersTest::splitCanBeScheduledWhenReplicasOnRetiredNodes()
+{
+ setupDistributor(Redundancy(2), NodeCount(2),
+ "distributor:1 storage:2, .0.s:r .1.s:r");
+ CPPUNIT_ASSERT_EQUAL(
+ "[Splitting bucket because its maximum size (2000 b, 10 docs, "
+ "10 meta, 2000 b total) is higher than the configured limit of "
+ "(1000, 4294967295)]"s,
+ testSplit(UINT32_MAX, 1000, 16, "0=100/10/2000"));
+}
+
+std::string
+StateCheckersTest::testJoin(uint32_t joinCount,
+ uint32_t joinSize,
+ uint32_t minSplitBits,
+ const document::BucketId& bid,
+ const PendingMessage& blocker,
+ bool includePriority)
+{
+ JoinBucketsStateChecker checker;
+ getConfig().setJoinSize(joinSize);
+ getConfig().setJoinCount(joinCount);
+ getConfig().setMinimalBucketSplit(minSplitBits);
+
+ NodeMaintenanceStatsTracker statsTracker;
+ StateChecker::Context c(getExternalOperationHandler(), statsTracker, bid);
+ return testStateChecker(checker, c, true, blocker, includePriority);
+}
+
+void
+StateCheckersTest::insertJoinableBuckets()
+{
+ insertBucketInfo(document::BucketId(33, 1), 1, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(33, 0x100000001), 1, 0x1, 1, 1);
+}
+
+void
+StateCheckersTest::testJoin()
+{
+ setupDistributor(3, 10, "distributor:1 storage:2");
+
+ insertJoinableBuckets();
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "BucketId(0x8000000000000001): "
+ "[Joining buckets BucketId(0x8400000000000001) and "
+ "BucketId(0x8400000100000001) because their size "
+ "(2 bytes, 2 docs) is less than the configured limit "
+ "of (100, 10)"),
+ testJoin(10, 100, 16, document::BucketId(33, 1)));
+
+ insertJoinableBuckets();
+ // Join size is 0, so only look at document count
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "BucketId(0x8000000000000001): "
+ "[Joining buckets BucketId(0x8400000000000001) and "
+ "BucketId(0x8400000100000001) because their size "
+ "(2 bytes, 2 docs) is less than the configured limit "
+ "of (0, 3) (pri 155)"),
+ testJoin(3, 0, 16, document::BucketId(33, 1), PendingMessage(), true));
+
+ insertJoinableBuckets();
+ // Should not generate joins for both pairs, just the primary
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testJoin(10, 100, 16, document::BucketId(33, 0x100000001)));
+
+ insertJoinableBuckets();
+ // Should not generate join if min split bits is higher
+ CPPUNIT_ASSERT_EQUAL(std::string("NO OPERATIONS GENERATED"),
+ testJoin(10, 100, 33, document::BucketId(33, 1)));
+
+ insertJoinableBuckets();
+ // Meta data too big, no join
+ insertBucketInfo(document::BucketId(33, 1), 1,
+ api::BucketInfo(0x1, 1, 1, 1000, 1000));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("NO OPERATIONS GENERATED"),
+ testJoin(10, 100, 16, document::BucketId(33, 1)));
+
+ insertJoinableBuckets();
+ // Bucket recently created
+ insertBucketInfo(document::BucketId(33, 1), 1,
+ api::BucketInfo(0x1, 0, 0, 0, 0));
+ CPPUNIT_ASSERT_EQUAL(std::string("NO OPERATIONS GENERATED"),
+ testJoin(10, 100, 16, document::BucketId(33, 1)));
+
+}
+
+/**
+ * If distributor config says minsplitcount is 8, but cluster state says that
+ * distribution bit count is 16, we should not allow the join to take place.
+ * We don't properly handle the "reduce distribution bits" case in general, so
+ * the safest is to never violate this and to effectively make distribution
+ * bit increases a one-way street.
+ */
+void
+StateCheckersTest::testDoNotJoinBelowClusterStateBitCount()
+{
+ setupDistributor(2, 2, "bits:16 distributor:1 storage:2");
+ // Insert sibling buckets at 16 bits that are small enough to be joined
+ // unless there is special logic for dealing with distribution bits.
+ insertBucketInfo(document::BucketId(16, 1), 1, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(16, (1 << 15) | 1), 1, 0x1, 1, 1);
+ using ConfiguredMinSplitBits = uint32_t;
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testJoin(100, 100, ConfiguredMinSplitBits(8),
+ document::BucketId(16, 1)));
+}
+
+void
+StateCheckersTest::enableInconsistentJoinInConfig(bool enabled)
+{
+ vespa::config::content::core::StorDistributormanagerConfigBuilder config;
+ config.enableInconsistentJoin = enabled;
+ getConfig().configure(config);
+}
+
+void
+StateCheckersTest::testAllowInconsistentJoinInDifferingSiblingIdealState()
+{
+ // Normally, bucket siblings have an ideal state on the same node in order
+ // to enable joining these back together. However, the ideal disks assigned
+ // may differ and it's sufficient for a sibling bucket's ideal disk to be
+ // down on the node of its other sibling for it to be assigned a different
+ // node. In this case, there's no other way to get buckets joined back
+ // together than if we allow bucket replicas to get temporarily out of sync
+ // by _forcing_ a join across all replicas no matter their placement.
+ // This will trigger a merge to reconcile and move the new bucket copies to
+ // their ideal location.
+ setupDistributor(2, 3, "distributor:1 storage:3 .0.d:20 .0.d.14.s:d .2.d:20");
+ document::BucketId sibling1(33, 0x000000001); // ideal disk 14 on node 0
+ document::BucketId sibling2(33, 0x100000001); // ideal disk 1 on node 0
+
+ // Full node sequence sorted by score for sibling(1|2) is [0, 2, 1].
+ // Node 0 cannot be used, so use 1 instead.
+ assertCurrentIdealState(sibling1, {2, 1});
+ assertCurrentIdealState(sibling2, {0, 2});
+
+ insertBucketInfo(sibling1, 2, 0x1, 2, 3);
+ insertBucketInfo(sibling1, 1, 0x1, 2, 3);
+ insertBucketInfo(sibling2, 0, 0x1, 2, 3);
+ insertBucketInfo(sibling2, 2, 0x1, 2, 3);
+
+ enableInconsistentJoinInConfig(true);
+
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "BucketId(0x8000000000000001): "
+ "[Joining buckets BucketId(0x8400000000000001) and "
+ "BucketId(0x8400000100000001) because their size "
+ "(6 bytes, 4 docs) is less than the configured limit "
+ "of (100, 10)"),
+ testJoin(10, 100, 16, sibling1));
+}
+
+void
+StateCheckersTest::testDoNotAllowInconsistentJoinWhenNotInIdealState()
+{
+ setupDistributor(2, 4, "distributor:1 storage:4 .0.d:20 .0.d.14.s:d .2.d:20 .3.d:20");
+ document::BucketId sibling1(33, 0x000000001);
+ document::BucketId sibling2(33, 0x100000001);
+
+ assertCurrentIdealState(sibling1, {3, 2});
+ assertCurrentIdealState(sibling2, {3, 0});
+
+ insertBucketInfo(sibling1, 3, 0x1, 2, 3);
+ insertBucketInfo(sibling1, 2, 0x1, 2, 3);
+ insertBucketInfo(sibling2, 3, 0x1, 2, 3);
+ insertBucketInfo(sibling2, 1, 0x1, 2, 3); // not in ideal state
+
+ enableInconsistentJoinInConfig(true);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("NO OPERATIONS GENERATED"),
+ testJoin(10, 100, 16, sibling1));
+}
+
+void
+StateCheckersTest::testDoNotAllowInconsistentJoinWhenConfigDisabled()
+{
+ setupDistributor(2, 3, "distributor:1 storage:3 .0.d:20 .0.d.14.s:d .2.d:20");
+ document::BucketId sibling1(33, 0x000000001); // ideal disk 14 on node 0
+ document::BucketId sibling2(33, 0x100000001); // ideal disk 1 on node 0
+
+ // Full node sequence sorted by score for sibling(1|2) is [0, 2, 1].
+ // Node 0 cannot be used, so use 1 instead.
+ assertCurrentIdealState(sibling1, {2, 1});
+ assertCurrentIdealState(sibling2, {0, 2});
+
+ insertBucketInfo(sibling1, 2, 0x1, 2, 3);
+ insertBucketInfo(sibling1, 1, 0x1, 2, 3);
+ insertBucketInfo(sibling2, 0, 0x1, 2, 3);
+ insertBucketInfo(sibling2, 2, 0x1, 2, 3);
+
+ enableInconsistentJoinInConfig(false);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("NO OPERATIONS GENERATED"),
+ testJoin(10, 100, 16, sibling1));
+}
+
+void
+StateCheckersTest::testNoJoinWhenInvalidCopyExists()
+{
+ setupDistributor(3, 10, "distributor:1 storage:3");
+
+ insertBucketInfo(document::BucketId(33, 0x100000001), 1, 0x1, 1, 1);
+ // No join when there exists an invalid copy
+ insertBucketInfo(document::BucketId(33, 1), 1, api::BucketInfo());
+
+ CPPUNIT_ASSERT_EQUAL(std::string("NO OPERATIONS GENERATED"),
+ testJoin(10, 100, 16, document::BucketId(33, 1)));
+}
+
+void
+StateCheckersTest::testNoJoinOnDifferentNodes()
+{
+ setupDistributor(3, 10, "distributor:1 storage:2");
+
+ insertBucketInfo(document::BucketId(33, 0x000000001), 0, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(33, 0x100000001), 1, 0x1, 1, 1);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testJoin(10, 100, 16, document::BucketId(33, 0x1)));
+}
+
+void
+StateCheckersTest::testNoJoinWhenCopyCountAboveRedundancyLevelsForLeftSibling()
+{
+ setupDistributor(3, 10, "distributor:1 storage:2");
+ setRedundancy(1);
+ insertBucketInfo(document::BucketId(33, 0x000000001), 0, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(33, 0x000000001), 1, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(33, 0x100000001), 0, 0x1, 1, 1);
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testJoin(10, 100, 16, document::BucketId(33, 0x1)));
+}
+
+void
+StateCheckersTest::testNoJoinWhenCopyCountAboveRedundancyLevelsForRightSibling()
+{
+ setupDistributor(3, 10, "distributor:1 storage:2");
+ setRedundancy(1);
+ insertBucketInfo(document::BucketId(33, 0x000000001), 1, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(33, 0x100000001), 0, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(33, 0x100000001), 1, 0x1, 1, 1);
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testJoin(10, 100, 16, document::BucketId(33, 0x1)));
+}
+
+void
+StateCheckersTest::testNoJoinWhenCopyCountAboveRedundancyLevelsForBothSiblings()
+{
+ setupDistributor(3, 10, "distributor:1 storage:2");
+ setRedundancy(1);
+ insertBucketInfo(document::BucketId(33, 0x000000001), 0, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(33, 0x000000001), 1, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(33, 0x100000001), 0, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(33, 0x100000001), 1, 0x1, 1, 1);
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testJoin(10, 100, 16, document::BucketId(33, 0x1)));
+}
+
+std::string
+StateCheckersTest::testSynchronizeAndMove(const std::string& bucketInfo,
+ const std::string& clusterState,
+ uint32_t redundancy,
+ const PendingMessage& blocker,
+ bool includePriority)
+{
+ document::BucketId bid(17, 0);
+
+ addNodesToBucketDB(bid, bucketInfo);
+
+ SynchronizeAndMoveStateChecker checker;
+ setRedundancy(redundancy);
+
+ _distributor->enableClusterState(lib::ClusterState(clusterState));
+ NodeMaintenanceStatsTracker statsTracker;
+ StateChecker::Context c(getExternalOperationHandler(), statsTracker, bid);
+ return testStateChecker(checker, c, false, blocker, includePriority);
+}
+
+void
+StateCheckersTest::testSynchronizeAndMove()
+{
+ // Plus if it was more obvious which nodes were in ideal state for various
+ // cluster states. (One possibility to override ideal state function for
+ // test)
+ runAndVerify<SynchronizeAndMoveStateChecker>(
+ CheckerParams().expect(
+ "[Synchronizing buckets with different checksums "
+ "node(idx=0,crc=0x1,docs=1/1,bytes=1/1,trusted=false,"
+ "active=false), "
+ "node(idx=1,crc=0x2,docs=2/2,bytes=2/2,trusted=false,"
+ "active=false)] "
+ "(scheduling pri MEDIUM)")
+ .bucketInfo("0=1,1=2")
+ .includeSchedulingPriority(true));
+
+ // If 1+ nodes in ideal state is in maintenance, do nothing
+ runAndVerify<SynchronizeAndMoveStateChecker>(
+ CheckerParams()
+ .expect("NO OPERATIONS GENERATED")
+ .bucketInfo("0=1,2=2")
+ .clusterState("distributor:1 storage:3 .1.s:m"));
+
+ runAndVerify<SynchronizeAndMoveStateChecker>(
+ CheckerParams()
+ .expect("[Moving bucket to ideal node 3] "
+ "(scheduling pri VERY_LOW)")
+ .bucketInfo("0=1,1=1,2=1")
+ .clusterState("distributor:1 storage:4")
+ .includeSchedulingPriority(true));
+
+ // Not doing anything in ideal state
+ runAndVerify<SynchronizeAndMoveStateChecker>(
+ CheckerParams()
+ .expect("NO OPERATIONS GENERATED")
+ .bucketInfo("0=1,1=1,3=1")
+ .clusterState("distributor:1 storage:4"));
+
+ // Both copies out of ideal state
+ runAndVerify<SynchronizeAndMoveStateChecker>(
+ CheckerParams()
+ .expect("[Moving bucket to ideal node 1]"
+ "[Moving bucket to ideal node 3] (pri 165) "
+ "(scheduling pri VERY_LOW)")
+ .clusterState("distributor:1 storage:5")
+ .bucketInfo("0=1,4=1,5=1")
+ .includeMessagePriority(true)
+ .includeSchedulingPriority(true));
+
+ // Too little redundancy and out of ideal state. Note that in this case,
+ // the non-ideal node is reported as a missing node and not with a "Moving
+ // bucket to ideal node" reason.
+ runAndVerify<SynchronizeAndMoveStateChecker>(
+ CheckerParams()
+ .expect("[Adding missing node 1]"
+ "[Adding missing node 3] (pri 120) "
+ "(scheduling pri MEDIUM)")
+ .bucketInfo("0=1")
+ .clusterState("distributor:1 storage:5")
+ .includeMessagePriority(true)
+ .includeSchedulingPriority(true));
+
+ // Synchronizing even when ideal state is in sync
+ runAndVerify<SynchronizeAndMoveStateChecker>(
+ CheckerParams()
+ .expect("[Synchronizing buckets with different checksums "
+ "node(idx=0,crc=0x3,docs=3/3,bytes=3/3,trusted=false,"
+ "active=false), "
+ "node(idx=1,crc=0x3,docs=3/3,bytes=3/3,trusted=false,"
+ "active=false), "
+ "node(idx=2,crc=0x0,docs=0/0,bytes=0/0,trusted=false,"
+ "active=false)]")
+ .bucketInfo("0=3,1=3,2=0")
+ .clusterState("distributor:1 storage:3"));
+
+ // Synchronize even when we have >= redundancy trusted copies and ideal
+ // nodes are in sync.
+ runAndVerify<SynchronizeAndMoveStateChecker>(
+ CheckerParams()
+ .expect("[Synchronizing buckets with different checksums "
+ "node(idx=0,crc=0x2,docs=3/3,bytes=4/4,trusted=false,"
+ "active=false), "
+ "node(idx=1,crc=0x1,docs=2/2,bytes=3/3,trusted=true,"
+ "active=false), "
+ "node(idx=2,crc=0x1,docs=2/2,bytes=3/3,trusted=true,"
+ "active=false), "
+ "node(idx=3,crc=0x1,docs=2/2,bytes=3/3,trusted=true,"
+ "active=false)] "
+ "(pri 120) (scheduling pri MEDIUM)")
+ .bucketInfo("0=2/3/4,1=1/2/3/t,2=1/2/3/t,3=1/2/3/t")
+ .clusterState("distributor:1 storage:5")
+ .includeMessagePriority(true)
+ .includeSchedulingPriority(true));
+
+ // Not doing anything if one of the buckets in ideal state is invalid
+ // but we have redundancy coverage otherwise
+ runAndVerify<SynchronizeAndMoveStateChecker>(
+ CheckerParams()
+ .expect("NO OPERATIONS GENERATED")
+ .bucketInfo("1=0/0/1,3=1")
+ .clusterState("distributor:1 storage:4"));
+
+ // Not doing anything if all copies we have are invalid
+ runAndVerify<SynchronizeAndMoveStateChecker>(
+ CheckerParams()
+ .expect("NO OPERATIONS GENERATED")
+ .bucketInfo("1=0/0/1,3=0/0/1")
+ .clusterState("distributor:1 storage:4"));
+
+ // Not doing anything if we have < redundancy copies but all existing
+ // copies are invalid.
+ runAndVerify<SynchronizeAndMoveStateChecker>(
+ CheckerParams()
+ .expect("NO OPERATIONS GENERATED")
+ .bucketInfo("1=0/0/1")
+ .clusterState("distributor:1 storage:4"));
+}
+
+void
+StateCheckersTest::testDoNotMergeInconsistentlySplitBuckets()
+{
+ // No merge generated if buckets are inconsistently split.
+ // This matches the case where a bucket has been split into 2 on one
+ // node and is not yet split on another; we should never try to merge
+ // either two of the split leaf buckets back onto the first node!
+ // Running state checker on a leaf:
+ addNodesToBucketDB(document::BucketId(16, 0), "0=2");
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testSynchronizeAndMove("1=1", // 17 bits
+ "distributor:1 storage:4"));
+ // Running state checker on an inner node bucket:
+ addNodesToBucketDB(document::BucketId(18, 0), "0=2");
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testSynchronizeAndMove("0=1", // 17 bits
+ "distributor:1 storage:4"));
+}
+
+void
+StateCheckersTest::doNotMoveReplicasWithinRetiredNodes()
+{
+ // Nodes 1 and 3 would be in ideal state if the nodes were not retired.
+ // Here, all nodes are retired and we should thus not do any sort of
+ // moving.
+ runAndVerify<SynchronizeAndMoveStateChecker>(
+ CheckerParams()
+ .expect("NO OPERATIONS GENERATED")
+ .bucketInfo("0=2,1=2")
+ .clusterState("distributor:1 storage:4 "
+ ".0.s:r .1.s:r .2.s:r .3.s:r"));
+}
+
+void
+StateCheckersTest::retiredNodesOutOfSyncAreMerged()
+{
+ // Normally, we'd do a merge that'd move the bucket to new nodes, leaving
+ // the out of sync retired nodes as source-only replicas. But here we
+ // don't have that choice and thus try to do the most useful thing we can
+ // with what we have available to us (which is to try to get things in
+ // sync).
+ runAndVerify<SynchronizeAndMoveStateChecker>(
+ CheckerParams()
+ .expect("[Synchronizing buckets with different checksums "
+ "node(idx=0,crc=0x1,docs=1/1,bytes=1/1,trusted=false,"
+ "active=false), "
+ "node(idx=1,crc=0x2,docs=2/2,bytes=2/2,trusted=false,"
+ "active=false)]")
+ .bucketInfo("0=1,1=2")
+ .clusterState("distributor:1 storage:4 "
+ ".0.s:r .1.s:r .2.s:r .3.s:r"));
+}
+
+std::string
+StateCheckersTest::testDeleteExtraCopies(
+ const std::string& bucketInfo, uint32_t redundancy,
+ const PendingMessage& blocker,
+ const std::string& clusterState,
+ bool includePriority)
+{
+ document::BucketId bid(17, 0);
+
+ addNodesToBucketDB(bid, bucketInfo);
+ setRedundancy(redundancy);
+
+ if (!clusterState.empty()) {
+ _distributor->enableClusterState(lib::ClusterState(clusterState));
+ }
+ DeleteExtraCopiesStateChecker checker;
+ NodeMaintenanceStatsTracker statsTracker;
+ StateChecker::Context c(getExternalOperationHandler(), statsTracker, bid);
+ return testStateChecker(checker, c, false, blocker, includePriority);
+}
+
+
+void
+StateCheckersTest::testDeleteExtraCopies()
+{
+ setupDistributor(2, 100, "distributor:1 storage:4");
+
+ {
+ std::vector<uint16_t> idealNodes(
+ getIdealStateManager().getDistributorComponent()
+ .getDistribution().getIdealStorageNodes(
+ getIdealStateManager().getDistributorComponent().getClusterState(),
+ document::BucketId(17, 0),
+ "ui"));
+ std::vector<uint16_t> wanted;
+ wanted.push_back(1);
+ wanted.push_back(3);
+ CPPUNIT_ASSERT_EQUAL(wanted, idealNodes);
+ }
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Remove empty buckets",
+ std::string("[Removing all copies since bucket is empty:node(idx=0,crc=0x0,"
+ "docs=0/0,bytes=0/0,trusted=false,active=false)]"
+ " (pri 100)"),
+ testDeleteExtraCopies("0=0", 2, PendingMessage(), "", true));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Remove extra trusted copy",
+ std::string("[Removing redundant in-sync copy from node 2]"),
+ testDeleteExtraCopies("3=3/3/3/t,1=3/3/3/t,2=3/3/3/t"));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Redundant copies in sync can be removed without trusted being a "
+ "factor of consideration. Ideal state copy not removed.",
+ std::string("[Removing redundant in-sync copy from node 2]"),
+ testDeleteExtraCopies("3=3/3/3,1=3/3/3/t,2=3/3/3/t"));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Need redundancy number of copies",
+ std::string("NO OPERATIONS GENERATED"),
+ testDeleteExtraCopies("0=3,1=3"));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Do not remove extra copies without enough trusted copies",
+ std::string("NO OPERATIONS GENERATED"),
+ testDeleteExtraCopies("0=0/0/1,1=3,2=3"));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Do not remove buckets that have meta entries",
+ std::string("NO OPERATIONS GENERATED"),
+ testDeleteExtraCopies("0=0/0/1,1=0/0/1"));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Do not remove any recently created copies",
+ std::string("NO OPERATIONS GENERATED"),
+ testDeleteExtraCopies("0=1/0/0/t,1=1/0/0/t,2=1/0/0/t"));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Do not remove untrusted copy that is out of sync",
+ std::string("NO OPERATIONS GENERATED"),
+ testDeleteExtraCopies("0=2/3/4,1=1/2/3/t,2=1/2/3/t"));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Do not remove out of sync copies, even if we have more than #"
+ "redundancy trusted copies",
+ std::string("NO OPERATIONS GENERATED"),
+ testDeleteExtraCopies("0=2/3/4,1=1/2/3/t,2=1/2/3/t,3=1/2/3/t"));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Don't remove unless we have enough trusted "
+ "copies to satisfy redundancy",
+ std::string("NO OPERATIONS GENERATED"),
+ testDeleteExtraCopies("0=2/3/4,1=1/2/3,2=2/3/4,3=1/2/3"));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Only remove empty copies unless all other copies are in sync",
+ std::string("[Removing empty copy from node 4]"),
+ testDeleteExtraCopies("0=2/3/4,1=1/2/3,2=2/3/4,3=1/2/3,4=0/0/0"));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Remove redundant empty copy",
+ std::string("[Removing empty copy from node 0]"),
+ testDeleteExtraCopies("1=2/3,3=1/2/3,0=0/0/0"));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Remove empty bucket with multiple copies",
+ std::string(
+ "[Removing all copies since bucket is empty:"
+ "node(idx=0,crc=0x0,docs=0/0,bytes=0/0,trusted=false,active=false), "
+ "node(idx=1,crc=0x0,docs=0/0,bytes=0/0,trusted=false,active=false), "
+ "node(idx=2,crc=0x0,docs=0/0,bytes=0/0,trusted=false,active=false)]"),
+ testDeleteExtraCopies("0=0/0/0,1=0/0/0,2=0/0/0"));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Pending persistence operation blocks delete",
+ std::string("BLOCKED"),
+ testDeleteExtraCopies("0=0/0/0,1=1/2/3/t,2=1/2/3/t",
+ 2,
+ PendingMessage(api::MessageType::PUT_ID, 255)));
+}
+
+void
+StateCheckersTest::testDoNotDeleteActiveExtraCopies()
+{
+ setupDistributor(2, 100, "distributor:1 storage:4");
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Do not delete redundant copy if it is marked active",
+ std::string("NO OPERATIONS GENERATED"),
+ testDeleteExtraCopies("3=3/3/3/t,1=3/3/3/t,2=3/3/3/t/a"));
+}
+
+void
+StateCheckersTest::testConsistentCopiesOnRetiredNodesMayBeDeleted()
+{
+ setupDistributor(2, 100, "distributor:1 storage:4 .1.s:r");
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Remove in-sync copy on node that is retired",
+ std::string("[Removing redundant in-sync copy from node 1]"),
+ testDeleteExtraCopies("3=3/3/3/t,1=3/3/3/t,2=3/3/3/t"));
+}
+
+void
+StateCheckersTest::redundantCopyDeletedEvenWhenAllNodesRetired()
+{
+ setupDistributor(2, 100, "distributor:1 storage:4 "
+ ".0.s:r .1.s:r .2.s:r .3.s:r");
+
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Remove in-sync copy on node that is retired",
+ "[Removing redundant in-sync copy from node 2]"s,
+ testDeleteExtraCopies("3=3/3/3/t,1=3/3/3/t,2=3/3/3/t"));
+}
+
+std::string StateCheckersTest::testBucketState(
+ const std::string& bucketInfo, uint32_t redundancy,
+ bool includePriority)
+{
+ document::BucketId bid(17, 0);
+ setRedundancy(redundancy);
+ addNodesToBucketDB(bid, bucketInfo);
+
+ BucketStateStateChecker checker;
+ NodeMaintenanceStatsTracker statsTracker;
+ StateChecker::Context c(getExternalOperationHandler(), statsTracker, bid);
+ return testStateChecker(checker, c, false, PendingMessage(),
+ includePriority);
+}
+
+void
+StateCheckersTest::testBucketState()
+{
+ setupDistributor(2, 100, "distributor:1 storage:4");
+
+ {
+ // Set config explicitly so we can compare priorities for differing
+ // cases.
+ DistributorConfiguration::MaintenancePriorities mp;
+ mp.activateNoExistingActive = 90;
+ mp.activateWithExistingActive = 120;
+ getConfig().setMaintenancePriorities(mp);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testBucketState(""));
+
+ // Node 1 is in ideal state
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 1 as active:"
+ " copy is ideal state priority 0] (pri 90)"),
+ testBucketState("1=2/3/4", 2, true));
+
+ // Node 3 is in ideal state
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 3 as active:"
+ " copy is ideal state priority 1]"),
+ testBucketState("3=2/3/4"));
+
+ // No trusted nodes, but node 1 is first in ideal state.
+ // Also check bad case where more than 1 node is set as active just
+ // to ensure we can get out of that situation if it should ever happen.
+ // Nothing done with node 3 since is't not active and shouldn't be.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 1 as active:"
+ " copy is ideal state priority 0]"
+ "[Setting node 0 as inactive]"
+ "[Setting node 2 as inactive] (pri 120)"),
+ testBucketState("0=3/4/5/u/a,1=3,2=4/5/6/u/a,3=3", 2, true));
+
+ // Test setting active when only node available is not contained
+ // within the resolved ideal state.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 0 as active: first available copy]"),
+ testBucketState("0=2/3/4"));
+
+ // A trusted ideal state copy should be set active rather than a non-trusted
+ // ideal state copy
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 3 as active:"
+ " copy is trusted and ideal state priority 1]"
+ "[Setting node 1 as inactive]"),
+ testBucketState("1=2/3/4/u/a,3=5/6/7/t"));
+
+ // None of the ideal state copies are trusted but a non-ideal copy is.
+ // The trusted copy should be active.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 2 as active: copy is trusted]"),
+ testBucketState("1=2/3/4,3=5/6/7/,2=8/9/10/t"));
+
+ // Make sure bucket db ordering does not matter
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 2 as active: copy is trusted]"),
+ testBucketState("2=8/9/10/t,1=2/3/4,3=5/6/7"));
+
+ // If copy is already active, we shouldn't generate operations
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testBucketState("1=2/3/4/t/a"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testBucketState("1=2/3/4,3=5/6/7/t/a"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testBucketState("2=8/9/10/t/a,1=2/3/4,3=5/6/7"));
+
+ // If multiple buckets are active, deactive all but one
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 2 as inactive]"
+ "[Setting node 3 as inactive]"),
+ testBucketState("1=1/2/3/t/a,2=1/2/3/t/a,3=1/2/3/t/a"));
+
+ // Invalid buckets should not be included
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testBucketState("1=0/0/1,3=0/0/1"));
+
+ // Ready preferred over trusted & ideal state
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testBucketState("2=8/9/10/t/i/u,1=2/3/4/u/a/r,3=5/6/7"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 2 as active: copy is ready]"
+ "[Setting node 1 as inactive]"),
+ testBucketState("2=8/9/10/u/i/r,1=2/3/4/u/a/u,3=5/6/7/u/i/u"));
+
+ // Prefer in ideal state if multiple copies ready
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 3 as active: copy is ready]"
+ "[Setting node 1 as inactive]"),
+ testBucketState("2=8/9/10/u/i/r,1=2/3/4/u/a/u,3=5/6/7/u/i/r"));
+
+ // Prefer ideal state if all ready but no trusted
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 1 as active: copy is ready]"),
+ testBucketState("2=8/9/10/u/i/r,1=2/3/4/u/i/r,3=5/6/7/u/i/r"));
+
+ // Prefer trusted over ideal state
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 2 as active: copy is ready and trusted]"
+ "[Setting node 1 as inactive]"),
+ testBucketState("2=8/9/10/t/i/r,1=2/3/4/u/a/r,3=5/6/7"));
+}
+
+/**
+ * Users assume that setting nodes into maintenance will not cause extra load
+ * on the cluster, but activating non-ready copies because the active copy went
+ * into maintenance violates that assumption. See bug 6833209 for context and
+ * details.
+ */
+void
+StateCheckersTest::testDoNotActivateNonReadyCopiesWhenIdealNodeInMaintenance()
+{
+ setupDistributor(2, 100, "distributor:1 storage:4 .1.s:m");
+ // Ideal node 1 is in maintenance and no ready copy available.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testBucketState("2=8/9/10/t/i/u,3=5/6/7"));
+ // But we should activate another copy iff there's another ready copy.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 2 as active: copy is ready]"),
+ testBucketState("2=8/9/10/u/i/r,3=5/6/7/u/i/u"));
+}
+
+/**
+ * We really do not want to activate buckets when they are inconsistent.
+ * See bug 6395693 for a set of reasons why.
+ */
+void
+StateCheckersTest::testDoNotChangeActiveStateForInconsistentlySplitBuckets()
+{
+ setupDistributor(2, 100, "distributor:1 storage:4");
+ // Running state checker on a leaf:
+ addNodesToBucketDB(document::BucketId(16, 0), "0=2");
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testBucketState("1=1")); // 17 bits
+ // Running state checker on an inner node bucket:
+ addNodesToBucketDB(document::BucketId(18, 0), "0=2");
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testSynchronizeAndMove("0=1")); // 17 bits
+}
+
+/**
+ * If all existing copies are outside the ideal state, e.g. if the set of nodes
+ * in the cluster has changed significantly, we do not want to change the active
+ * state of copies needlessly iff the copies are otherwise equally scored in
+ * terms of activation eligibility. If we do not prioritize existing active
+ * copies higher in this case, it's possible that their ideal order has been
+ * permutated, causing another copy to rank higher in the ideal state node
+ * sequence. This would in turn activate the newly higher ranked copy and
+ * deactivate the previously active copy, causing transient search duplicates
+ * and uneeded work in the cluster; new copies will be created and indexed
+ * soon anyway.
+ *
+ * See bug 7278932.
+ */
+void
+StateCheckersTest::testNoActiveChangeForNonIdealCopiesWhenOtherwiseIdentical()
+{
+ setupDistributor(2, 100, "distributor:1 storage:50");
+ // 1 is more ideal than 3 in this state, but since they're both not part
+ // of the #redundancy ideal set, activation should not change hands.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testBucketState("1=2/3/4/t/i/r,3=2/3/4/t/a/r"));
+ // Same applies if the copies aren't ready, since if a copy has been marked
+ // as active it will already have started background indexing. No need in
+ // undoing that if we don't have any better candidates going anyway.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testBucketState("1=2/3/4/t,3=2/3/4/t/a"));
+}
+
+std::string StateCheckersTest::testBucketStatePerGroup(
+ const std::string& bucketInfo, bool includePriority)
+{
+ document::BucketId bid(17, 0);
+ addNodesToBucketDB(bid, bucketInfo);
+
+ BucketStateStateChecker checker;
+ NodeMaintenanceStatsTracker statsTracker;
+ StateChecker::Context c(getExternalOperationHandler(), statsTracker, bid);
+ return testStateChecker(checker, c, false, PendingMessage(),
+ includePriority);
+}
+
+void
+StateCheckersTest::testBucketStatePerGroup()
+{
+ setupDistributor(6, 20, "distributor:1 storage:12 .2.s:d .4.s:d .7.s:d");
+ vespa::config::content::StorDistributionConfigBuilder config;
+ config.activePerLeafGroup = true;
+ config.redundancy = 6;
+ config.group.resize(4);
+ config.group[0].index = "invalid";
+ config.group[0].name = "invalid";
+ config.group[0].partitions = "2|2|*";
+ config.group[1].index = "0";
+ config.group[1].name = "left";
+ config.group[1].nodes.resize(3);
+ config.group[1].nodes[0].index = 0;
+ config.group[1].nodes[1].index = 1;
+ config.group[1].nodes[2].index = 3;
+ config.group[2].index = "1";
+ config.group[2].name = "right";
+ config.group[2].nodes.resize(3);
+ config.group[2].nodes[0].index = 5;
+ config.group[2].nodes[1].index = 6;
+ config.group[2].nodes[2].index = 8;
+ config.group[3].index = "2";
+ config.group[3].name = "middle";
+ config.group[3].nodes.resize(3);
+ config.group[3].nodes[0].index = 9;
+ config.group[3].nodes[1].index = 10;
+ config.group[3].nodes[2].index = 11;
+ lib::Distribution::SP distr(new lib::Distribution(config));
+ _node->getComponentRegister().setDistribution(distr);
+
+ {
+ DistributorConfiguration::MaintenancePriorities mp;
+ mp.activateNoExistingActive = 90;
+ mp.activateWithExistingActive = 120;
+ getConfig().setMaintenancePriorities(mp);
+ }
+
+ // Node 1 and 8 is is ideal state
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 1 as active: "
+ "copy is trusted and ideal state priority 4]"
+ "[Setting node 6 as active: "
+ "copy is trusted and ideal state priority 0] (pri 90)"),
+ testBucketStatePerGroup("0=2/3/4/t, 1=2/3/4/t, 3=2/3/4/t, "
+ "5=2/3/4/t, 6=2/3/4/t, 8=2/3/4/t", true));
+
+ // Data differ between groups
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 1 as active: "
+ "copy is trusted and ideal state priority 4]"
+ "[Setting node 6 as active: "
+ "copy is ideal state priority 0] (pri 90)"),
+ testBucketStatePerGroup("0=2/3/4/t, 1=2/3/4/t, 3=2/3/4/t, "
+ "5=5/6/7, 6=5/6/7, 8=5/6/7", true));
+
+ // Disable too
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 0 as inactive]"
+ "[Setting node 3 as inactive]"
+ "[Setting node 5 as inactive]"
+ "[Setting node 8 as inactive] (pri 90)"),
+ testBucketStatePerGroup("0=2/3/4/t/a, 1=2/3/4/t/a, 3=2/3/4/t/a, "
+ "5=2/3/4/t/a, 6=2/3/4/t/a, 8=2/3/4/t/a",
+ true));
+
+ // Node 1 and 8 is is ideal state
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Setting node 1 as active: "
+ "copy is trusted and ideal state priority 4]"
+ "[Setting node 6 as active: "
+ "copy is trusted and ideal state priority 0]"
+ "[Setting node 9 as active: "
+ "copy is trusted and ideal state priority 2] (pri 90)"),
+ testBucketStatePerGroup("0=2/3/4/t, 1=2/3/4/t, 3=2/3/4/t, "
+ "5=2/3/4/t, 6=2/3/4/t, 8=2/3/4/t, "
+ "9=2/3/4/t, 10=2/3/4/t, 11=2/3/4/t",
+ true));
+}
+
+void
+StateCheckersTest::allowActivationOfRetiredNodes()
+{
+ // All nodes in retired state implies that the ideal state is empty. But
+ // we still want to be able to shuffle bucket activations around in order
+ // to preserve coverage.
+ setupDistributor(2, 2, "distributor:1 storage:2 .0.s:r .1.s:r");
+ CPPUNIT_ASSERT_EQUAL(
+ "[Setting node 1 as active: copy is trusted]"
+ "[Setting node 0 as inactive]"s,
+ testBucketState("0=2/3/4/u/a,1=5/6/7/t"));
+}
+
+void
+StateCheckersTest::inhibitBucketActivationIfDisabledInConfig()
+{
+ setupDistributor(2, 4, "distributor:1 storage:4");
+ disableBucketActivationInConfig(true);
+
+ // Node 1 is in ideal state and only replica and should be activated in
+ // an indexed cluster context (but not here).
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testBucketState("1=2/3/4", 2, true));
+}
+
+void
+StateCheckersTest::inhibitBucketDeactivationIfDisabledInConfig()
+{
+ setupDistributor(2, 4, "distributor:1 storage:4");
+ disableBucketActivationInConfig(true);
+
+ // Multiple replicas which would have been deactivated. This test is mostly
+ // for the sake of completion; a scenario where buckets are active while
+ // having no indexed documents configured should not happen.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testBucketState("1=1/2/3/t/a,2=1/2/3/t/a,3=1/2/3/t/a"));
+}
+
+std::string StateCheckersTest::testGarbageCollection(
+ uint32_t prevTimestamp, uint32_t nowTimestamp,
+ uint32_t checkInterval, uint32_t lastChangeTime,
+ bool includePriority)
+{
+ BucketDatabase::Entry e(document::BucketId(17, 0));
+ e.getBucketInfo().addNode(BucketCopy(prevTimestamp, 0,
+ api::BucketInfo(3,3,3)),
+ toVector((uint16_t)0));
+ e.getBucketInfo().setLastGarbageCollectionTime(prevTimestamp);
+ getBucketDatabase().update(e);
+
+ GarbageCollectionStateChecker checker;
+ getConfig().setGarbageCollection("music", checkInterval);
+ getConfig().setLastGarbageCollectionChangeTime(lastChangeTime);
+ NodeMaintenanceStatsTracker statsTracker;
+ StateChecker::Context c(getExternalOperationHandler(), statsTracker,
+ e.getBucketId());
+ getClock().setAbsoluteTimeInSeconds(nowTimestamp);
+ return testStateChecker(checker, c, false, PendingMessage(),
+ includePriority);
+}
+
+void
+StateCheckersTest::testGarbageCollection()
+{
+ // BucketId(17, 0) has id (and thus 'hash') 0x4400000000000000. With a
+ // check interval modulo of 3600, this implies a start point of 848.
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testGarbageCollection(900, 3600 + 847, 3600));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Needs garbage collection: Last check at 900, current time 4448, "
+ "configured interval 3600]"),
+ testGarbageCollection(900, 3600 + 848, 3600));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Needs garbage collection: Last check at 3, current time 4000, "
+ "configured interval 3600]"),
+ testGarbageCollection(3, 4000, 3600));
+
+ // GC start point 3648.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testGarbageCollection(3, 3647, 8000));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Needs garbage collection: Last check at 3, current time 4000, "
+ "configured interval 3600]"),
+ testGarbageCollection(3, 4000, 3600));
+
+ // GC explicitly disabled.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testGarbageCollection(3, 4000, 0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testGarbageCollection(3, 3, 1));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[Needs garbage collection: Last check at 3, current time 4000, "
+ "configured interval 300] (pri 200)"),
+ testGarbageCollection(3, 4000, 300, 1, true));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NO OPERATIONS GENERATED"),
+ testGarbageCollection(3850, 4000, 300, 1));
+}
+
+/**
+ * When a node is in maintenance, we want to do our best to avoid any unneeded
+ * changes to the bucket replicas' states, as this will require re-syncing of
+ * the replicas when the node out of maintenance. Consequently we should not
+ * trigger GC for buckets when this is the case.
+ */
+void
+StateCheckersTest::gcInhibitedWhenIdealNodeInMaintenance()
+{
+ // Redundancy is 3, so with only 3 nodes, node 1 is guaranteed to be part of
+ // the ideal state of any bucket in the system.
+ setupDistributor(3, 3, "distributor:1 storage:3 .1.s:m");
+ document::BucketId bucket(17, 0);
+ addNodesToBucketDB(bucket, "0=10/100/1/true,"
+ "1=10/100/1/true,"
+ "2=10/100/1/true");
+ BucketDatabase::Entry e(getBucketDatabase().get(bucket));
+ e.getBucketInfo().setLastGarbageCollectionTime(3);
+ getBucketDatabase().update(e);
+
+ GarbageCollectionStateChecker checker;
+ getConfig().setGarbageCollection("music", 3600);
+ getConfig().setLastGarbageCollectionChangeTime(0);
+ NodeMaintenanceStatsTracker statsTracker;
+ StateChecker::Context c(getExternalOperationHandler(), statsTracker,
+ bucket);
+ getClock().setAbsoluteTimeInSeconds(4000);
+ // Would normally (in a non-maintenance case) trigger GC due to having
+ // overshot the GC check cycle.
+ auto result = testStateChecker(checker, c, false, PendingMessage(), false);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("NO OPERATIONS GENERATED"), result);
+}
+
+/*
+ * Bug 6656726, comment #25. Merge state checker does not execute if an ideal
+ * node is in maintenance, so for symmetry we need to do the same for deletes
+ * (it's bad mojo to potentially delete something that would've been merged
+ * had it not been for a node being in maintenance).
+ */
+void
+StateCheckersTest::testNoRemoveWhenIdealNodeInMaintenance()
+{
+ CPPUNIT_ASSERT_EQUAL_MSG(
+ "Do not remove when ideal node is in maintenance mode",
+ std::string("NO OPERATIONS GENERATED"),
+ testDeleteExtraCopies("0=10/100/1/true,"
+ "1=10/100/1/true,"
+ "2=10/100/1/true",
+ 2, PendingMessage(),
+ "distributor:1 storage:3 .1.s:m"));
+}
+
+/*
+ * Just joining buckets where both children are present is not enough to
+ * ensure any system can compact its bucket tree. We must therefore
+ * gradually hoist buckets higher into the tree when possible in order
+ * to converge in a state where as many buckets as possible have siblings
+ * on the same level.
+ *
+ * See bug 6768991 for context.
+ */
+void
+StateCheckersTest::testStepwiseJoinForSmallBucketsWithoutSiblings()
+{
+ setupDistributor(3, 10, "distributor:1 storage:2 bits:1");
+ vespa::config::content::core::StorDistributormanagerConfigBuilder config;
+ config.enableJoinForSiblingLessBuckets = true;
+ getConfig().configure(config);
+ // Buckets without siblings but that should be step-wise joined back
+ // into bucket (2, 1).
+ insertBucketInfo(document::BucketId(3, 1), 1, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(3, 0x3), 1, 0x1, 1, 1);
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "BucketId(0x0800000000000001): "
+ "[Joining buckets BucketId(0x0c00000000000001) and "
+ "BucketId(0x0c00000000000001) because their size "
+ "(1 bytes, 1 docs) is less than the configured limit "
+ "of (100, 10)"),
+ testJoin(10, 100, 2, document::BucketId(3, 1)));
+
+ // Other bucket should be joined as well. Together the two join targets
+ // will transform into a mighty sibling pair that can rule the galaxy
+ // (and also be joined together afterwards)!
+ insertBucketInfo(document::BucketId(3, 1), 1, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(3, 0x3), 1, 0x1, 1, 1);
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "BucketId(0x0800000000000003): "
+ "[Joining buckets BucketId(0x0c00000000000003) and "
+ "BucketId(0x0c00000000000003) because their size "
+ "(1 bytes, 1 docs) is less than the configured limit "
+ "of (100, 10)"),
+ testJoin(10, 100, 2, document::BucketId(3, 0x3)));
+}
+
+void
+StateCheckersTest::testNoStepwiseJoinWhenDisabledThroughConfig()
+{
+ setupDistributor(3, 10, "distributor:1 storage:2 bits:1");
+ vespa::config::content::core::StorDistributormanagerConfigBuilder config;
+ config.enableJoinForSiblingLessBuckets = false;
+ getConfig().configure(config);
+
+ // Buckets without siblings but that would have been step-wise joined back
+ // into bucket 1 if it had been config-enabled.
+ insertBucketInfo(document::BucketId(3, 1), 1, 0x1, 1, 1);
+ insertBucketInfo(document::BucketId(3, 0x3), 1, 0x1, 1, 1);
+ CPPUNIT_ASSERT_EQUAL(std::string("NO OPERATIONS GENERATED"),
+ testJoin(10, 100, 1, document::BucketId(3, 1)));
+}
+
+void
+StateCheckersTest::testNoStepwiseJoinWhenSingleSiblingTooLarge()
+{
+ setupDistributor(3, 10, "distributor:1 storage:2 bits:1");
+ vespa::config::content::core::StorDistributormanagerConfigBuilder config;
+ config.enableJoinForSiblingLessBuckets = true;
+ getConfig().configure(config);
+
+ // Bucket is exactly at the boundary where it's too big.
+ insertBucketInfo(document::BucketId(3, 1), 1, 0x1, 10, 100);
+ insertBucketInfo(document::BucketId(3, 0x3), 1, 0x1, 1, 1);
+ CPPUNIT_ASSERT_EQUAL(std::string("NO OPERATIONS GENERATED"),
+ testJoin(10, 100, 1, document::BucketId(3, 1)));
+}
+
+void
+StateCheckersTest::testStepwiseJoinMaySkipMultipleBitsWhenConsistent()
+{
+ setupDistributor(2, 10, "distributor:1 storage:2 bits:8");
+ vespa::config::content::core::StorDistributormanagerConfigBuilder config;
+ config.enableJoinForSiblingLessBuckets = true;
+ getConfig().configure(config);
+
+ insertBucketInfo(document::BucketId(16, 1), 1, 0x1, 1, 1);
+ // No buckets further up in the tree, can join up to the distribution bit
+ // limit at 8.
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "BucketId(0x2000000000000001): "
+ "[Joining buckets BucketId(0x4000000000000001) and "
+ "BucketId(0x4000000000000001) because their size "
+ "(1 bytes, 1 docs) is less than the configured limit "
+ "of (100, 10)"),
+ testJoin(10, 100, 8, document::BucketId(16, 1)));
+}
+
+void
+StateCheckersTest::testStepwiseJoinDoesNotSkipBeyondLevelWithSibling()
+{
+ setupDistributor(2, 10, "distributor:1 storage:2 bits:8");
+ vespa::config::content::core::StorDistributormanagerConfigBuilder config;
+ config.enableJoinForSiblingLessBuckets = true;
+ getConfig().configure(config);
+
+ // All 0-branch children
+ insertBucketInfo(document::BucketId(16, 0), 1, 0x1, 1, 1);
+ // 0-branches down to level 10, then 1-branch down to level 11. This means
+ // the (16, 0) bucket cannot be moved further up than level 11 as it has a
+ // sibling there (0x2c00000000000400 sibling of 0x2c00000000000000).
+ insertBucketInfo(document::BucketId(11, 1 << 10), 1, 0x1, 1, 1);
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "BucketId(0x2c00000000000000): "
+ "[Joining buckets BucketId(0x4000000000000000) and "
+ "BucketId(0x4000000000000000) because their size "
+ "(1 bytes, 1 docs) is less than the configured limit "
+ "of (100, 10)"),
+ testJoin(10, 100, 8, document::BucketId(16, 0)));
+}
+
+void
+StateCheckersTest::joinCanBeScheduledWhenReplicasOnRetiredNodes()
+{
+ setupDistributor(1, 1, "distributor:1 storage:1 .0.s.:r");
+ insertJoinableBuckets();
+ CPPUNIT_ASSERT_EQUAL(
+ "BucketId(0x8000000000000001): "
+ "[Joining buckets BucketId(0x8400000000000001) and "
+ "BucketId(0x8400000100000001) because their size "
+ "(2 bytes, 2 docs) is less than the configured limit "
+ "of (100, 10)"s,
+ testJoin(10, 100, 16, document::BucketId(33, 1)));
+}
+
+void
+StateCheckersTest::contextPopulatesIdealStateContainers()
+{
+ // 1 and 3 are ideal nodes for bucket {17, 0}
+ setupDistributor(2, 100, "distributor:1 storage:4");
+
+ NodeMaintenanceStatsTracker statsTracker;
+ StateChecker::Context c(getExternalOperationHandler(), statsTracker, {17, 0});
+
+ CPPUNIT_ASSERT_EQUAL((std::vector<uint16_t>{1, 3}), c.idealState);
+ CPPUNIT_ASSERT_EQUAL(size_t(2), c.unorderedIdealState.size());
+ CPPUNIT_ASSERT(c.unorderedIdealState.find(1)
+ != c.unorderedIdealState.end());
+ CPPUNIT_ASSERT(c.unorderedIdealState.find(3)
+ != c.unorderedIdealState.end());
+}
+
+namespace {
+
+template <typename Checker>
+class StateCheckerRunner
+{
+ StateCheckersTest& _fixture;
+ NodeMaintenanceStatsTracker _statsTracker;
+ std::string _result;
+public:
+ StateCheckerRunner(StateCheckersTest& fixture)
+ : _fixture(fixture)
+ {
+ }
+
+ StateCheckerRunner& addToDb(const document::BucketId& bid,
+ const std::string& bucketInfo)
+ {
+ _fixture.addNodesToBucketDB(bid, bucketInfo);
+ return *this;
+ }
+
+ StateCheckerRunner& redundancy(uint32_t red) {
+ _fixture.setRedundancy(red);
+ return *this;
+ }
+
+ StateCheckerRunner& clusterState(const std::string& state) {
+ _fixture.enableClusterState(lib::ClusterState(state));
+ return *this;
+ }
+
+ // Run the templated state checker with the provided parameters, updating
+ // _result with the ideal state operations triggered.
+ // NOTE: resets the bucket database!
+ void runFor(const document::BucketId& bid) {
+ Checker checker;
+ StateChecker::Context c(_fixture.getExternalOperationHandler(), _statsTracker, bid);
+ _result = _fixture.testStateChecker(
+ checker, c, false, StateCheckersTest::PendingMessage(), false);
+ }
+
+ const std::string& result() const { return _result; }
+ const NodeMaintenanceStatsTracker& stats() const {
+ return _statsTracker;
+ }
+};
+
+} // anon ns
+
+void
+StateCheckersTest::statsUpdatedWhenMergingDueToMove()
+{
+ StateCheckerRunner<SynchronizeAndMoveStateChecker> runner(*this);
+ // Ideal state for bucket {17,0} in given cluster state is [1, 3]
+ runner.addToDb({17, 0}, "0=1,1=1,2=1")
+ .clusterState("distributor:1 storage:4")
+ .runFor({17, 0});
+ // Node 1 treated as copy source, but not as move source.
+ {
+ NodeMaintenanceStats wanted;
+ wanted.copyingOut = 1;
+ CPPUNIT_ASSERT_EQUAL(wanted, runner.stats().forNode(1));
+ }
+ // Moving 1 bucket from nodes {0, 2} into 3.
+ // Note that we do not at this point in time distinguish _which_ of these
+ // will do the actual data movement to node 3.
+ {
+ NodeMaintenanceStats wanted;
+ wanted.copyingIn = 1;
+ CPPUNIT_ASSERT_EQUAL(wanted, runner.stats().forNode(3));
+ }
+ {
+ NodeMaintenanceStats wanted;
+ wanted.movingOut = 1;
+ CPPUNIT_ASSERT_EQUAL(wanted, runner.stats().forNode(0));
+ CPPUNIT_ASSERT_EQUAL(wanted, runner.stats().forNode(2));
+ }
+}
+
+void
+StateCheckersTest::statsUpdatedWhenMergingDueToMissingCopy()
+{
+ StateCheckerRunner<SynchronizeAndMoveStateChecker> runner(*this);
+ // Ideal state for bucket {17,0} in given cluster state is [1, 3]
+ runner.addToDb({17, 0}, "1=1")
+ .clusterState("distributor:1 storage:4")
+ .runFor({17, 0});
+
+ {
+ NodeMaintenanceStats wanted;
+ wanted.copyingIn = 1;
+ CPPUNIT_ASSERT_EQUAL(wanted, runner.stats().forNode(3));
+ }
+ {
+ NodeMaintenanceStats wanted;
+ wanted.copyingOut = 1;
+ CPPUNIT_ASSERT_EQUAL(wanted, runner.stats().forNode(1));
+ }
+}
+
+void
+StateCheckersTest::statsUpdatedWhenMergingDueToOutOfSyncCopies()
+{
+ StateCheckerRunner<SynchronizeAndMoveStateChecker> runner(*this);
+ runner.addToDb({17, 0}, "1=1,3=2")
+ .clusterState("distributor:1 storage:4")
+ .runFor({17, 0});
+ {
+ NodeMaintenanceStats wanted;
+ wanted.syncing = 1;
+ CPPUNIT_ASSERT_EQUAL(wanted, runner.stats().forNode(1));
+ CPPUNIT_ASSERT_EQUAL(wanted, runner.stats().forNode(3));
+ }
+}
+
+} // distributor
+} // storage}
diff --git a/storage/src/tests/distributor/statoperationtest.cpp b/storage/src/tests/distributor/statoperationtest.cpp
new file mode 100644
index 00000000000..22fee6e44d7
--- /dev/null
+++ b/storage/src/tests/distributor/statoperationtest.cpp
@@ -0,0 +1,115 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storageapi/message/stat.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <tests/distributor/distributortestutil.h>
+#include <vespa/storage/distributor/operations/external/statbucketoperation.h>
+#include <vespa/storage/distributor/operations/external/statbucketlistoperation.h>
+
+namespace storage {
+namespace distributor {
+
+struct StatOperationTest : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+ void setUp() {
+ createLinks();
+ };
+
+ void tearDown() {
+ close();
+ }
+
+ void testBucketInfo();
+ void testBucketList();
+
+ CPPUNIT_TEST_SUITE(StatOperationTest);
+ CPPUNIT_TEST(testBucketInfo);
+ CPPUNIT_TEST(testBucketList);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(StatOperationTest);
+
+void
+StatOperationTest::testBucketInfo()
+{
+ _distributor->enableClusterState(lib::ClusterState("distributor:1 storage:2"));
+
+ addNodesToBucketDB(document::BucketId(16, 5),
+ "0=4/2/100,1=4/2/100");
+
+ StatBucketOperation op(
+ getExternalOperationHandler(),
+ std::shared_ptr<api::StatBucketCommand>(
+ new api::StatBucketCommand(document::BucketId(16, 5), "")));
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Statbucket => 0,Statbucket => 1"),
+ _sender.getCommands(true));
+
+ {
+ api::StatBucketCommand* tmp(
+ static_cast<api::StatBucketCommand*>(_sender.commands[0].get()));
+ api::StatBucketReply* reply = new api::StatBucketReply(*tmp, "foo");
+ op.receive(_sender, std::shared_ptr<api::StorageReply>(reply));
+ }
+
+ {
+ api::StatBucketCommand* tmp(
+ static_cast<api::StatBucketCommand*>(_sender.commands[1].get()));
+ api::StatBucketReply* reply = new api::StatBucketReply(*tmp, "bar");
+ op.receive(_sender, std::shared_ptr<api::StorageReply>(reply));
+ }
+
+ api::StatBucketReply* replyback(
+ static_cast<api::StatBucketReply*>(_sender.replies.back().get()));
+ CPPUNIT_ASSERT_CONTAIN("foo", replyback->getResults());
+ CPPUNIT_ASSERT_CONTAIN("bar", replyback->getResults());
+}
+
+void
+StatOperationTest::testBucketList() {
+ setupDistributor(2, 2, "distributor:1 storage:2");
+
+ getConfig().setSplitCount(10);
+ getConfig().setSplitSize(100);
+
+ for (uint32_t i = 0; i < 2; ++i) {
+ insertBucketInfo(document::BucketId(16, 5), i,
+ 0xff, 100, 200, true, (i == 1));
+ }
+
+ std::shared_ptr<api::GetBucketListCommand> msg(
+ new api::GetBucketListCommand(document::BucketId(16, 5)));
+
+ StatBucketListOperation op(
+ getExternalOperationHandler().getBucketDatabase(),
+ getIdealStateManager(),
+ getExternalOperationHandler().getIndex(),
+ msg);
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)_sender.replies.size());
+
+ api::GetBucketListReply* repl(
+ dynamic_cast<api::GetBucketListReply*>(_sender.replies[0].get()));
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)repl->getBuckets().size());
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 5),
+ repl->getBuckets()[0]._bucket);
+ CPPUNIT_ASSERT_EQUAL(
+ vespalib::string(
+ "[distributor:0] split: "
+ "[Splitting bucket because its maximum size (200 b, 100 docs, 100 meta, 200 b total) "
+ "is higher than the configured limit of (100, 10)] "
+ "[node(idx=0,crc=0xff,docs=100/100,bytes=200/200,trusted=true,active=false), "
+ "node(idx=1,crc=0xff,docs=100/100,bytes=200/200,trusted=true,active=true)]"),
+ repl->getBuckets()[0]._bucketInformation);
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/statusreporterdelegatetest.cpp b/storage/src/tests/distributor/statusreporterdelegatetest.cpp
new file mode 100644
index 00000000000..f05eebed0ce
--- /dev/null
+++ b/storage/src/tests/distributor/statusreporterdelegatetest.cpp
@@ -0,0 +1,87 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <tests/common/testhelper.h>
+#include <tests/distributor/distributortestutil.h>
+
+#include <vespa/storage/distributor/statusreporterdelegate.h>
+
+namespace storage {
+namespace distributor {
+
+class StatusReporterDelegateTest : public CppUnit::TestFixture
+{
+ CPPUNIT_TEST_SUITE(StatusReporterDelegateTest);
+ CPPUNIT_TEST(testDelegateInvokesDelegatorOnStatusRequest);
+ CPPUNIT_TEST_SUITE_END();
+
+ void testDelegateInvokesDelegatorOnStatusRequest();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(StatusReporterDelegateTest);
+
+namespace {
+
+// We really ought to get GoogleMock as part of our testing suite...
+class MockDelegator : public StatusDelegator
+{
+ mutable std::ostringstream _calls;
+ bool handleStatusRequest(const DelegatedStatusRequest& request) const {
+ _calls << "Request(" << request.path << ")";
+ return request.reporter.reportStatus(request.outputStream, request.path);
+ }
+public:
+ std::string getCalls() const {
+ return _calls.str();
+ }
+};
+
+class MockStatusReporter : public framework::StatusReporter
+{
+public:
+ MockStatusReporter()
+ : framework::StatusReporter("foo", "Bar")
+ {}
+ vespalib::string getReportContentType(
+ const framework::HttpUrlPath&) const
+ {
+ return "foo/bar";
+ }
+
+ bool reportStatus(std::ostream& os,
+ const framework::HttpUrlPath& path) const
+ {
+ os << "reportStatus with " << path;
+ return true;
+ }
+};
+
+}
+
+void
+StatusReporterDelegateTest::testDelegateInvokesDelegatorOnStatusRequest()
+{
+ vdstestlib::DirConfig config(getStandardConfig(false));
+ TestDistributorApp app(config.getConfigId());
+
+ MockDelegator mockDelegator;
+ MockStatusReporter reporter;
+
+ StatusReporterDelegate delegate(app.getComponentRegister(),
+ mockDelegator,
+ reporter);
+ framework::HttpUrlPath path("dummy");
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("foo/bar"),
+ delegate.getReportContentType(path));
+
+ std::ostringstream ss;
+ CPPUNIT_ASSERT(delegate.reportStatus(ss, path));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Request(dummy)"),
+ mockDelegator.getCalls());
+ CPPUNIT_ASSERT_EQUAL(std::string("reportStatus with dummy"),
+ ss.str());
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/throttlingoperationstartertest.cpp b/storage/src/tests/distributor/throttlingoperationstartertest.cpp
new file mode 100644
index 00000000000..5c4ba99563c
--- /dev/null
+++ b/storage/src/tests/distributor/throttlingoperationstartertest.cpp
@@ -0,0 +1,142 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <string>
+#include <sstream>
+#include <memory>
+#include <vespa/storage/distributor/throttlingoperationstarter.h>
+#include <tests/distributor/maintenancemocks.h>
+
+namespace storage {
+
+namespace distributor {
+
+using document::BucketId;
+
+class ThrottlingOperationStarterTest : public CppUnit::TestFixture {
+ CPPUNIT_TEST_SUITE(ThrottlingOperationStarterTest);
+ CPPUNIT_TEST(testOperationNotThrottledWhenSlotAvailable);
+ CPPUNIT_TEST(testOperationStartingIsForwardedToImplementation);
+ CPPUNIT_TEST(testOperationThrottledWhenNoAvailableSlots);
+ CPPUNIT_TEST(testThrottlingWithMaxPendingRange);
+ CPPUNIT_TEST(testStartingOperationsFillsUpPendingWindow);
+ CPPUNIT_TEST(testFinishingOperationsAllowsMoreToStart);
+ CPPUNIT_TEST_SUITE_END();
+
+ std::shared_ptr<Operation> createMockOperation() {
+ return std::shared_ptr<Operation>(new MockOperation(BucketId(16, 1)));
+ }
+
+ std::unique_ptr<MockOperationStarter> _starterImpl;
+ std::unique_ptr<ThrottlingOperationStarter> _operationStarter;
+
+public:
+ void testOperationNotThrottledWhenSlotAvailable();
+ void testOperationStartingIsForwardedToImplementation();
+ void testOperationThrottledWhenNoAvailableSlots();
+ void testThrottlingWithMaxPendingRange();
+ void testStartingOperationsFillsUpPendingWindow();
+ void testFinishingOperationsAllowsMoreToStart();
+
+ void setUp();
+ void tearDown();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(ThrottlingOperationStarterTest);
+
+void
+ThrottlingOperationStarterTest::setUp()
+{
+ _starterImpl.reset(new MockOperationStarter());
+ _operationStarter.reset(new ThrottlingOperationStarter(*_starterImpl));
+}
+
+void
+ThrottlingOperationStarterTest::tearDown()
+{
+ // Must clear before _operationStarter goes out of scope, or operation
+ // destructors will try to call method on destroyed object.
+ _starterImpl->getOperations().clear();
+}
+
+void
+ThrottlingOperationStarterTest::testOperationNotThrottledWhenSlotAvailable()
+{
+ CPPUNIT_ASSERT(_operationStarter->start(createMockOperation(),
+ OperationStarter::Priority(0)));
+}
+
+void
+ThrottlingOperationStarterTest::testOperationStartingIsForwardedToImplementation()
+{
+ CPPUNIT_ASSERT(_operationStarter->start(createMockOperation(),
+ OperationStarter::Priority(0)));
+ CPPUNIT_ASSERT_EQUAL(std::string("BucketId(0x4000000000000001), pri 0\n"),
+ _starterImpl->toString());
+}
+
+void
+ThrottlingOperationStarterTest::testOperationThrottledWhenNoAvailableSlots()
+{
+ _operationStarter->setMaxPendingRange(0, 0);
+ CPPUNIT_ASSERT(!_operationStarter->start(createMockOperation(),
+ OperationStarter::Priority(0)));
+}
+
+void
+ThrottlingOperationStarterTest::testThrottlingWithMaxPendingRange()
+{
+ _operationStarter->setMaxPendingRange(0, 1);
+ CPPUNIT_ASSERT(!_operationStarter->canStart(0, OperationStarter::Priority(255)));
+ CPPUNIT_ASSERT(_operationStarter->canStart(0, OperationStarter::Priority(0)));
+
+ _operationStarter->setMaxPendingRange(1, 1);
+ CPPUNIT_ASSERT(_operationStarter->canStart(0, OperationStarter::Priority(255)));
+ CPPUNIT_ASSERT(_operationStarter->canStart(0, OperationStarter::Priority(0)));
+
+ _operationStarter->setMaxPendingRange(1, 3);
+ CPPUNIT_ASSERT(!_operationStarter->canStart(1, OperationStarter::Priority(255)));
+ CPPUNIT_ASSERT(_operationStarter->canStart(1, OperationStarter::Priority(100)));
+ CPPUNIT_ASSERT(_operationStarter->canStart(1, OperationStarter::Priority(0)));
+ CPPUNIT_ASSERT(_operationStarter->canStart(2, OperationStarter::Priority(0)));
+ CPPUNIT_ASSERT(!_operationStarter->canStart(3, OperationStarter::Priority(0)));
+ CPPUNIT_ASSERT(!_operationStarter->canStart(4, OperationStarter::Priority(0)));
+}
+
+void
+ThrottlingOperationStarterTest::testStartingOperationsFillsUpPendingWindow()
+{
+ _operationStarter->setMaxPendingRange(1, 3);
+ CPPUNIT_ASSERT(_operationStarter->start(createMockOperation(),
+ OperationStarter::Priority(255)));
+ CPPUNIT_ASSERT(!_operationStarter->start(createMockOperation(),
+ OperationStarter::Priority(255)));
+ CPPUNIT_ASSERT(_operationStarter->start(createMockOperation(),
+ OperationStarter::Priority(100)));
+ CPPUNIT_ASSERT(!_operationStarter->start(createMockOperation(),
+ OperationStarter::Priority(100)));
+ CPPUNIT_ASSERT(_operationStarter->start(createMockOperation(),
+ OperationStarter::Priority(0)));
+ CPPUNIT_ASSERT(!_operationStarter->start(createMockOperation(),
+ OperationStarter::Priority(0)));
+}
+
+void
+ThrottlingOperationStarterTest::testFinishingOperationsAllowsMoreToStart()
+{
+ _operationStarter->setMaxPendingRange(1, 1);
+ CPPUNIT_ASSERT(_operationStarter->start(createMockOperation(),
+ OperationStarter::Priority(255)));
+ CPPUNIT_ASSERT(!_operationStarter->start(createMockOperation(),
+ OperationStarter::Priority(255)));
+ CPPUNIT_ASSERT(!_starterImpl->getOperations().empty());
+
+ _starterImpl->getOperations().pop_back();
+
+ CPPUNIT_ASSERT(_operationStarter->start(createMockOperation(),
+ OperationStarter::Priority(255)));
+ CPPUNIT_ASSERT(!_starterImpl->getOperations().empty());
+}
+
+}
+}
diff --git a/storage/src/tests/distributor/twophaseupdateoperationtest.cpp b/storage/src/tests/distributor/twophaseupdateoperationtest.cpp
new file mode 100644
index 00000000000..f6346c9755f
--- /dev/null
+++ b/storage/src/tests/distributor/twophaseupdateoperationtest.cpp
@@ -0,0 +1,1194 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/config/helper/configgetter.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <vespa/document/config/config-documenttypes.h>
+#include <vespa/document/repo/documenttyperepo.h>
+#include <vespa/document/base/testdocrepo.h>
+#include <vespa/document/update/arithmeticvalueupdate.h>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storage/distributor/externaloperationhandler.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storage/distributor/operations/external/twophaseupdateoperation.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storageapi/message/batch.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <tests/distributor/distributortestutil.h>
+#include <tests/distributor/messagesenderstub.h>
+
+namespace storage {
+namespace distributor {
+
+using std::shared_ptr;
+using config::ConfigGetter;
+using document::DocumenttypesConfig;
+using config::FileSpec;
+using namespace document;
+using namespace storage;
+using namespace storage::distributor;
+using namespace storage::api;
+using namespace storage::lib;
+
+using namespace std::literals::string_literals;
+
+class TwoPhaseUpdateOperationTest : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+ CPPUNIT_TEST_SUITE(TwoPhaseUpdateOperationTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST(testNonExisting);
+ CPPUNIT_TEST(testUpdateFailed);
+ CPPUNIT_TEST(testFastPathInconsistentTimestamps);
+ CPPUNIT_TEST(testFastPathInconsistentTimestampsNotFound);
+ CPPUNIT_TEST(testFastPathInconsistentTimestampsUpdateError);
+ CPPUNIT_TEST(testFastPathInconsistentTimestampsGetError);
+ CPPUNIT_TEST(testFastPathInconsistentTimestampsPutError);
+ CPPUNIT_TEST(testFastPathInconsistentTimestampsPutNotStarted);
+ CPPUNIT_TEST(testFastPathInconsistentTimestampsInconsistentSplit);
+ CPPUNIT_TEST(testFastPathPropagatesMessageSettingsToUpdate);
+ CPPUNIT_TEST(testNofM);
+ CPPUNIT_TEST(testSafePathUpdatesNewestReceivedDocument);
+ CPPUNIT_TEST(testCreateIfNonExistentCreatesDocumentIfAllEmptyGets);
+ CPPUNIT_TEST(testUpdateFailsIfSafePathHasFailedPut);
+ CPPUNIT_TEST(testUpdateFailsIfSafePathGetsFail);
+ CPPUNIT_TEST(testUpdateFailsIfApplyThrowsException);
+ CPPUNIT_TEST(testNonExistingWithAutoCreate);
+ CPPUNIT_TEST(testSafePathFailsUpdateWhenMismatchingTimestampConstraint);
+ CPPUNIT_TEST(testSafePathUpdatePropagatesMessageSettingsToGetsAndPuts);
+ CPPUNIT_TEST(testSafePathPropagatesMbusTracesFromReplies);
+ CPPUNIT_TEST(testUpdateFailsIfOwnershipChangesBetweenGetAndPut);
+ CPPUNIT_TEST(testSafePathConditionMismatchFailsWithTasError);
+ CPPUNIT_TEST(testSafePathConditionMatchSendsPutsWithUpdatedDoc);
+ CPPUNIT_TEST(testSafePathConditionParseFailureFailsWithIllegalParamsError);
+ CPPUNIT_TEST(testSafePathConditonUnknownDocTypeFailsWithIllegalParamsError);
+ CPPUNIT_TEST(testSafePathConditionWithMissingDocFailsWithTasError);
+ CPPUNIT_TEST(testFastPathCloseEdgeSendsCorrectReply);
+ CPPUNIT_TEST(testSafePathCloseEdgeSendsCorrectReply);
+ CPPUNIT_TEST_SUITE_END();
+
+ document::TestDocRepo _testRepo;
+ DocumentTypeRepo::SP _repo;
+ const DocumentType* _doc_type;
+
+protected:
+ void testSimple();
+ void testNonExisting();
+ void testUpdateFailed();
+ void testFastPathInconsistentTimestamps();
+ void testFastPathInconsistentTimestampsNotFound();
+ void testFastPathInconsistentTimestampsUpdateError();
+ void testFastPathInconsistentTimestampsGetError();
+ void testFastPathInconsistentTimestampsPutError();
+ void testFastPathInconsistentTimestampsPutNotStarted();
+ void testFastPathInconsistentTimestampsInconsistentSplit();
+ void testFastPathPropagatesMessageSettingsToUpdate();
+ void testNofM();
+ void testSafePathUpdatesNewestReceivedDocument();
+ void testCreateIfNonExistentCreatesDocumentIfAllEmptyGets();
+ void testUpdateFailsIfSafePathHasFailedPut();
+ void testUpdateFailsIfSafePathGetsFail();
+ void testUpdateFailsIfApplyThrowsException();
+ void testNonExistingWithAutoCreate();
+ void testSafePathFailsUpdateWhenMismatchingTimestampConstraint();
+ void testSafePathUpdatePropagatesMessageSettingsToGetsAndPuts();
+ void testSafePathPropagatesMbusTracesFromReplies();
+ void testUpdateFailsIfOwnershipChangesBetweenGetAndPut();
+ void testSafePathConditionMismatchFailsWithTasError();
+ void testSafePathConditionMatchSendsPutsWithUpdatedDoc();
+ void testSafePathConditionParseFailureFailsWithIllegalParamsError();
+ void testSafePathConditonUnknownDocTypeFailsWithIllegalParamsError();
+ void testSafePathConditionWithMissingDocFailsWithTasError();
+ void testFastPathCloseEdgeSendsCorrectReply();
+ void testSafePathCloseEdgeSendsCorrectReply();
+
+ void checkMessageSettingsPropagatedTo(
+ const api::StorageCommand::SP& msg) const;
+
+ std::string getUpdatedValueFromLastPut(MessageSenderStub&);
+public:
+ void setUp() {
+ _repo = _testRepo.getTypeRepoSp();
+ _doc_type = _repo->getDocumentType("testdoctype1");
+ createLinks();
+ setTypeRepo(_repo);
+ getClock().setAbsoluteTimeInSeconds(200);
+ }
+
+ void tearDown() {
+ close();
+ }
+
+ void replyToMessage(Operation& callback,
+ MessageSenderStub& sender,
+ uint32_t index,
+ uint64_t oldTimestamp,
+ api::ReturnCode::Result result = api::ReturnCode::OK);
+
+ void replyToPut(
+ Operation& callback,
+ MessageSenderStub& sender,
+ uint32_t index,
+ api::ReturnCode::Result result = api::ReturnCode::OK,
+ const std::string& traceMsg = "");
+
+ void replyToCreateBucket(
+ Operation& callback,
+ MessageSenderStub& sender,
+ uint32_t index,
+ api::ReturnCode::Result result = api::ReturnCode::OK);
+
+ void replyToGet(
+ Operation& callback,
+ MessageSenderStub& sender,
+ uint32_t index,
+ uint64_t oldTimestamp,
+ bool haveDocument = true,
+ api::ReturnCode::Result result = api::ReturnCode::OK,
+ const std::string& traceMsg = "");
+
+ struct UpdateOptions {
+ bool _makeInconsistentSplit;
+ bool _createIfNonExistent;
+ bool _withError;
+ api::Timestamp _timestampToUpdate;
+ documentapi::TestAndSetCondition _condition;
+
+ UpdateOptions()
+ : _makeInconsistentSplit(false),
+ _createIfNonExistent(false),
+ _withError(false),
+ _timestampToUpdate(0),
+ _condition()
+ {
+ }
+
+ UpdateOptions& makeInconsistentSplit(bool mis) {
+ _makeInconsistentSplit = mis;
+ return *this;
+ }
+ UpdateOptions& createIfNonExistent(bool cine) {
+ _createIfNonExistent = cine;
+ return *this;
+ }
+ UpdateOptions& withError(bool error = true) {
+ _withError = error;
+ return *this;
+ }
+ UpdateOptions& timestampToUpdate(api::Timestamp ts) {
+ _timestampToUpdate = ts;
+ return *this;
+ }
+ UpdateOptions& condition(vespalib::stringref cond) {
+ _condition = documentapi::TestAndSetCondition(cond);
+ return *this;
+ }
+ };
+
+ std::shared_ptr<TwoPhaseUpdateOperation>
+ sendUpdate(const std::string& bucketState,
+ const UpdateOptions& options = UpdateOptions());
+
+ void assertAbortedUpdateReplyWithContextPresent(
+ const MessageSenderStub& closeSender) const;
+
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(TwoPhaseUpdateOperationTest);
+
+void
+TwoPhaseUpdateOperationTest::replyToMessage(
+ Operation& callback,
+ MessageSenderStub& sender,
+ uint32_t index,
+ uint64_t oldTimestamp,
+ api::ReturnCode::Result result)
+{
+ std::shared_ptr<api::StorageMessage> msg2 = sender.commands.at(index);
+ UpdateCommand& updatec = dynamic_cast<UpdateCommand&>(*msg2);
+ std::unique_ptr<api::StorageReply> reply(updatec.makeReply());
+ static_cast<api::UpdateReply*>(reply.get())->setOldTimestamp(oldTimestamp);
+ reply->setResult(api::ReturnCode(result, ""));
+
+ callback.receive(sender,
+ std::shared_ptr<StorageReply>(reply.release()));
+}
+
+void
+TwoPhaseUpdateOperationTest::replyToPut(
+ Operation& callback,
+ MessageSenderStub& sender,
+ uint32_t index,
+ api::ReturnCode::Result result,
+ const std::string& traceMsg)
+{
+ std::shared_ptr<api::StorageMessage> msg2 = sender.commands.at(index);
+ PutCommand& putc = dynamic_cast<PutCommand&>(*msg2);
+ std::unique_ptr<api::StorageReply> reply(putc.makeReply());
+ reply->setResult(api::ReturnCode(result, ""));
+ if (!traceMsg.empty()) {
+ MBUS_TRACE(reply->getTrace(), 1, traceMsg);
+ }
+ callback.receive(sender,
+ std::shared_ptr<StorageReply>(reply.release()));
+}
+
+void
+TwoPhaseUpdateOperationTest::replyToCreateBucket(
+ Operation& callback,
+ MessageSenderStub& sender,
+ uint32_t index,
+ api::ReturnCode::Result result)
+{
+ std::shared_ptr<api::StorageMessage> msg2 = sender.commands.at(index);
+ CreateBucketCommand& putc = dynamic_cast<CreateBucketCommand&>(*msg2);
+ std::unique_ptr<api::StorageReply> reply(putc.makeReply());
+ reply->setResult(api::ReturnCode(result, ""));
+ callback.receive(sender,
+ std::shared_ptr<StorageReply>(reply.release()));
+}
+
+void
+TwoPhaseUpdateOperationTest::replyToGet(
+ Operation& callback,
+ MessageSenderStub& sender,
+ uint32_t index,
+ uint64_t oldTimestamp,
+ bool haveDocument,
+ api::ReturnCode::Result result,
+ const std::string& traceMsg)
+{
+ const api::GetCommand& get(
+ static_cast<const api::GetCommand&>(*sender.commands.at(index)));
+ std::shared_ptr<api::StorageReply> reply;
+
+ if (haveDocument) {
+ auto doc(std::make_shared<Document>(
+ *_doc_type, DocumentId(DocIdString("test", "test"))));
+ doc->setValue("headerval", IntFieldValue(oldTimestamp));
+
+ reply = std::make_shared<api::GetReply>(get, doc, oldTimestamp);
+ } else {
+ reply = std::make_shared<api::GetReply>(get, Document::SP(), 0);
+ }
+ reply->setResult(api::ReturnCode(result, ""));
+ if (!traceMsg.empty()) {
+ MBUS_TRACE(reply->getTrace(), 1, traceMsg);
+ }
+
+ callback.receive(sender, reply);
+}
+
+namespace {
+
+struct DummyTransportContext : api::TransportContext {
+ // No methods to implement.
+};
+
+}
+
+std::shared_ptr<TwoPhaseUpdateOperation>
+TwoPhaseUpdateOperationTest::sendUpdate(const std::string& bucketState,
+ const UpdateOptions& options)
+{
+ document::DocumentUpdate::SP update;
+ if (!options._withError) {
+ update = std::make_shared<document::DocumentUpdate>(
+ *_doc_type,
+ document::DocumentId(document::DocIdString("test", "test")));
+ document::FieldUpdate fup(_doc_type->getField("headerval"));
+ fup.addUpdate(ArithmeticValueUpdate(ArithmeticValueUpdate::Add, 10));
+ update->addUpdate(fup);
+ } else {
+ // Create an update to a different doctype than the one returned as
+ // part of the Get. Just a sneaky way to force an eval error.
+ auto* badDocType = _repo->getDocumentType("testdoctype2");
+ update = std::make_shared<document::DocumentUpdate>(
+ *badDocType,
+ document::DocumentId(document::DocIdString("test", "test")));
+ document::FieldUpdate fup(badDocType->getField("onlyinchild"));
+ fup.addUpdate(ArithmeticValueUpdate(ArithmeticValueUpdate::Add, 10));
+ update->addUpdate(fup);
+ }
+ update->setCreateIfNonExistent(options._createIfNonExistent);
+
+ document::BucketId id = getExternalOperationHandler().getBucketId(update->getId());
+ document::BucketId id2 = document::BucketId(id.getUsedBits() + 1, id.getRawId());
+
+ if (bucketState.length()) {
+ addNodesToBucketDB(id, bucketState);
+ }
+
+ if (options._makeInconsistentSplit) {
+ addNodesToBucketDB(id2, bucketState);
+ }
+
+ auto msg(std::make_shared<api::UpdateCommand>(
+ document::BucketId(0), update, api::Timestamp(0)));
+ // Misc settings for checking that propagation works.
+ msg->getTrace().setLevel(6);
+ msg->setTimeout(6789);
+ msg->setPriority(99);
+ if (options._timestampToUpdate) {
+ msg->setOldTimestamp(options._timestampToUpdate);
+ }
+ msg->setCondition(options._condition);
+ msg->setTransportContext(std::make_unique<DummyTransportContext>());
+
+ ExternalOperationHandler& handler = getExternalOperationHandler();
+ return std::make_shared<TwoPhaseUpdateOperation>(
+ handler, msg, getDistributor().getMetrics());
+}
+
+
+void
+TwoPhaseUpdateOperationTest::testSimple()
+{
+ setupDistributor(1, 1, "storage:1 distributor:1");
+
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(sendUpdate("0=1/2/3"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0"),
+ sender.getCommands(true));
+
+ replyToMessage(*cb, sender, 0, 90);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 90) ReturnCode(NONE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testNonExisting()
+{
+ setupDistributor(1, 1, "storage:1 distributor:1");
+
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(sendUpdate(""));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 0) ReturnCode(NONE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testUpdateFailed()
+{
+ setupDistributor(1, 1, "storage:1 distributor:1");
+
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(sendUpdate("0=1/2/3"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0"),
+ sender.getCommands(true));
+
+ replyToMessage(*cb, sender, 0, 90, api::ReturnCode::INTERNAL_FAILURE);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 0) "
+ "ReturnCode(INTERNAL_FAILURE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testFastPathInconsistentTimestamps()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(sendUpdate("0=1/2/3,1=1/2/3"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0,Update => 1"),
+ sender.getCommands(true));
+
+ replyToMessage(*cb, sender, 0, 90);
+ replyToMessage(*cb, sender, 1, 110);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get(BucketId(0x4000000000008b13), doc:test:test) => 1"),
+ sender.getLastCommand(true));
+
+ replyToGet(*cb, sender, 2, 110);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0,Update => 1,Get => 1,Put => 1,Put => 0"),
+ sender.getCommands(true));
+
+ CPPUNIT_ASSERT(sender.replies.empty());
+
+ replyToPut(*cb, sender, 3);
+ replyToPut(*cb, sender, 4);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 110 Was inconsistent "
+ "(best node 1)) ReturnCode(NONE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testFastPathInconsistentTimestampsNotFound()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(sendUpdate("0=1/2/3,1=1/2/3"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0,Update => 1"),
+ sender.getCommands(true));
+
+ replyToMessage(*cb, sender, 0, 90);
+ replyToMessage(*cb, sender, 1, 110);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get(BucketId(0x4000000000008b13), doc:test:test) => 1"),
+ sender.getLastCommand(true));
+ CPPUNIT_ASSERT(sender.replies.empty());
+
+ replyToGet(*cb, sender, 2, 110, false);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 110 Was inconsistent "
+ "(best node 1)) ReturnCode(INTERNAL_FAILURE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testFastPathInconsistentTimestampsUpdateError()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(sendUpdate("0=1/2/3,1=1/2/3"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0,Update => 1"),
+ sender.getCommands(true));
+
+ replyToMessage(*cb, sender, 0, 90);
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToMessage(*cb, sender, 1, 110, api::ReturnCode::IO_FAILURE);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 90) "
+ "ReturnCode(IO_FAILURE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testFastPathInconsistentTimestampsGetError()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(sendUpdate("0=1/2/3,1=1/2/3"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0,Update => 1"),
+ sender.getCommands(true));
+
+ replyToMessage(*cb, sender, 0, 90);
+ replyToMessage(*cb, sender, 1, 110);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get(BucketId(0x4000000000008b13), doc:test:test) => 1"),
+ sender.getLastCommand(true));
+
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToGet(*cb, sender, 2, 110, false, api::ReturnCode::IO_FAILURE);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 110 Was inconsistent "
+ "(best node 1)) ReturnCode(IO_FAILURE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testFastPathInconsistentTimestampsPutError()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(sendUpdate("0=1/2/3,1=1/2/3"));
+
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0,Update => 1"),
+ sender.getCommands(true));
+
+ replyToMessage(*cb, sender, 0, 90);
+ replyToMessage(*cb, sender, 1, 110);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get(BucketId(0x4000000000008b13), doc:test:test) => 1"),
+ sender.getLastCommand(true));
+
+ replyToGet(*cb, sender, 2, 110);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0,Update => 1,Get => 1,Put => 1,Put => 0"),
+ sender.getCommands(true));
+
+ replyToPut(*cb, sender, 3, api::ReturnCode::IO_FAILURE);
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToPut(*cb, sender, 4);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 110 Was inconsistent "
+ "(best node 1)) ReturnCode(IO_FAILURE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testFastPathInconsistentTimestampsPutNotStarted()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(sendUpdate("0=1/2/3,1=1/2/3"));
+
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0,Update => 1"),
+ sender.getCommands(true));
+
+ replyToMessage(*cb, sender, 0, 90);
+ replyToMessage(*cb, sender, 1, 110);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get(BucketId(0x4000000000008b13), doc:test:test) => 1"),
+ sender.getLastCommand(true));
+ checkMessageSettingsPropagatedTo(sender.commands.back());
+
+ _distributor->enableClusterState(lib::ClusterState("storage:0 distributor:1"));
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToGet(*cb, sender, 2, 110);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 110 Was inconsistent "
+ "(best node 1)) ReturnCode(NOT_CONNECTED, "
+ "Can't store document: No storage nodes available)"),
+ sender.getLastReply(true));
+}
+
+
+void
+TwoPhaseUpdateOperationTest::testFastPathInconsistentTimestampsInconsistentSplit()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=1/2/3",
+ UpdateOptions().makeInconsistentSplit(true)));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ std::string wanted("Get(BucketId(0x4000000000008b13), doc:test:test) => 0,"
+ "Get(BucketId(0x4400000000008b13), doc:test:test) => 0");
+
+ std::string text = sender.getCommands(true, true);
+ CPPUNIT_ASSERT_EQUAL(wanted, text);
+
+ replyToGet(*cb, sender, 0, 90);
+ replyToGet(*cb, sender, 1, 120);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "Put(BucketId(0x4400000000008b13), doc:test:test, "
+ "timestamp 200000000, size 52) => 1,"
+ "Put(BucketId(0x4400000000008b13), doc:test:test, "
+ "timestamp 200000000, size 52) => 0"),
+ sender.getCommands(true, true, 2));
+
+ replyToPut(*cb, sender, 2);
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToPut(*cb, sender, 3);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 120) "
+ "ReturnCode(NONE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::checkMessageSettingsPropagatedTo(
+ const api::StorageCommand::SP& msg) const
+{
+ // Settings set in sendUpdate().
+ CPPUNIT_ASSERT_EQUAL(uint32_t(6), msg->getTrace().getLevel());
+ CPPUNIT_ASSERT_EQUAL(uint32_t(6789), msg->getTimeout());
+ CPPUNIT_ASSERT_EQUAL(uint8_t(99), msg->getPriority());
+}
+
+void
+TwoPhaseUpdateOperationTest::testFastPathPropagatesMessageSettingsToUpdate()
+{
+ setupDistributor(1, 1, "storage:1 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(sendUpdate("0=1/2/3"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Update => 0"), sender.getCommands(true));
+
+ StorageCommand::SP msg(sender.commands.back());
+ checkMessageSettingsPropagatedTo(msg);
+}
+
+void
+TwoPhaseUpdateOperationTest::testNofM()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1", 1);
+
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(sendUpdate("0=1/2/3,1=1/2/3"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0,Update => 1"),
+ sender.getCommands(true));
+
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToMessage(*cb, sender, 0, 90);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 90) ReturnCode(NONE)"),
+ sender.getLastReply(true));
+
+ replyToMessage(*cb, sender, 1, 123);
+}
+
+std::string
+TwoPhaseUpdateOperationTest::getUpdatedValueFromLastPut(
+ MessageSenderStub& sender)
+{
+ Document::SP doc(dynamic_cast<api::PutCommand&>(*sender.commands.back())
+ .getDocument());
+ FieldValue::UP value(doc->getValue("headerval"));
+ return value->toString();
+}
+
+void
+TwoPhaseUpdateOperationTest::testSafePathUpdatesNewestReceivedDocument()
+{
+ setupDistributor(3, 3, "storage:3 distributor:1");
+ // 0,1 in sync. 2 out of sync.
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=1/2/3,2=2/3/4"));
+
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Get(BucketId(0x4000000000008b13), doc:test:test) => 0,"
+ "Get(BucketId(0x4000000000008b13), doc:test:test) => 2"),
+ sender.getCommands(true, true));
+ replyToGet(*cb, sender, 0, 50);
+ replyToGet(*cb, sender, 1, 70);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "Put(BucketId(0x4000000000008b13), doc:test:test, "
+ "timestamp 200000000, size 52) => 1,"
+ "Put(BucketId(0x4000000000008b13), doc:test:test, "
+ "timestamp 200000000, size 52) => 0,"
+ "Put(BucketId(0x4000000000008b13), doc:test:test, "
+ "timestamp 200000000, size 52) => 2"),
+ sender.getCommands(true, true, 2));
+ // Make sure Put contains an updated document (+10 arith. update on field
+ // whose value equals gotten timestamp). In this case we want 70 -> 80.
+ CPPUNIT_ASSERT_EQUAL(std::string("80"), getUpdatedValueFromLastPut(sender));
+
+ replyToPut(*cb, sender, 2);
+ replyToPut(*cb, sender, 3);
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToPut(*cb, sender, 4);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 70) "
+ "ReturnCode(NONE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testCreateIfNonExistentCreatesDocumentIfAllEmptyGets()
+{
+ setupDistributor(3, 3, "storage:3 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=1/2/3,2=2/3/4",
+ UpdateOptions().createIfNonExistent(true)));
+
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Get => 0,Get => 2"),
+ sender.getCommands(true));
+ replyToGet(*cb, sender, 0, 0, false);
+ replyToGet(*cb, sender, 1, 0, false);
+ // Since create-if-non-existent is set, distributor should create doc from
+ // scratch.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "Put(BucketId(0x4000000000008b13), doc:test:test, "
+ "timestamp 200000000, size 52) => 1,"
+ "Put(BucketId(0x4000000000008b13), doc:test:test, "
+ "timestamp 200000000, size 52) => 0,"
+ "Put(BucketId(0x4000000000008b13), doc:test:test, "
+ "timestamp 200000000, size 52) => 2"),
+ sender.getCommands(true, true, 2));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("10"), getUpdatedValueFromLastPut(sender));
+
+ replyToPut(*cb, sender, 2);
+ replyToPut(*cb, sender, 3);
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToPut(*cb, sender, 4);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 200000000) "
+ "ReturnCode(NONE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testUpdateFailsIfSafePathHasFailedPut()
+{
+ setupDistributor(3, 3, "storage:3 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=1/2/3,2=2/3/4",
+ UpdateOptions().createIfNonExistent(true)));
+
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Get => 0,Get => 2"),
+ sender.getCommands(true));
+ replyToGet(*cb, sender, 0, 0, false);
+ replyToGet(*cb, sender, 1, 0, false);
+ // Since create-if-non-existent is set, distributor should create doc from
+ // scratch.
+ CPPUNIT_ASSERT_EQUAL(std::string("Put => 1,Put => 0,Put => 2"),
+ sender.getCommands(true, false, 2));
+
+ replyToPut(*cb, sender, 2);
+ replyToPut(*cb, sender, 3);
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToPut(*cb, sender, 4, api::ReturnCode::IO_FAILURE);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 200000000) "
+ "ReturnCode(IO_FAILURE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testUpdateFailsIfSafePathGetsFail()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=2/3/4",
+ UpdateOptions().createIfNonExistent(true)));
+
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Get => 0,Get => 1"),
+ sender.getCommands(true));
+ replyToGet(*cb, sender, 0, 0, false, api::ReturnCode::IO_FAILURE);
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToGet(*cb, sender, 1, 0, false, api::ReturnCode::IO_FAILURE);
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 0) "
+ "ReturnCode(IO_FAILURE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testUpdateFailsIfApplyThrowsException()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ // Create update for wrong doctype which will fail the update.
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=2/3/4", UpdateOptions().withError()));
+
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Get => 0,Get => 1"),
+ sender.getCommands(true));
+ replyToGet(*cb, sender, 0, 50);
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToGet(*cb, sender, 1, 70);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 70) "
+ "ReturnCode(INTERNAL_FAILURE, Can not apply a "
+ "\"testdoctype2\" document update to a "
+ "\"testdoctype1\" document.)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testNonExistingWithAutoCreate()
+{
+ setupDistributor(1, 1, "storage:1 distributor:1");
+
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("", UpdateOptions().createIfNonExistent(true)));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "CreateBucketCommand(BucketId(0x4000000000008b13), active) "
+ "Reasons to start: => 0,"
+ "Put(BucketId(0x4000000000008b13), doc:test:test, "
+ "timestamp 200000000, size 52) => 0"),
+ sender.getCommands(true, true));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("10"), getUpdatedValueFromLastPut(sender));
+
+ replyToCreateBucket(*cb, sender, 0);
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToPut(*cb, sender, 1);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 200000000) "
+ "ReturnCode(NONE)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testSafePathFailsUpdateWhenMismatchingTimestampConstraint()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=2/3/4",
+ UpdateOptions().timestampToUpdate(1234)));
+
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Get => 0,Get => 1"),
+ sender.getCommands(true));
+ replyToGet(*cb, sender, 0, 100);
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToGet(*cb, sender, 1, 110);
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 0) "
+ "ReturnCode(NONE, No document with requested "
+ "timestamp found)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testSafePathUpdatePropagatesMessageSettingsToGetsAndPuts()
+{
+ setupDistributor(3, 3, "storage:3 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=1/2/3,2=2/3/4"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Get => 0,Get => 2"),
+ sender.getCommands(true));
+ checkMessageSettingsPropagatedTo(sender.commands.at(0));
+ checkMessageSettingsPropagatedTo(sender.commands.at(1));
+ replyToGet(*cb, sender, 0, 50);
+ replyToGet(*cb, sender, 1, 70);
+ CPPUNIT_ASSERT_EQUAL(std::string("Put => 1,Put => 0,Put => 2"),
+ sender.getCommands(true, false, 2));
+ checkMessageSettingsPropagatedTo(sender.commands.at(2));
+ checkMessageSettingsPropagatedTo(sender.commands.at(3));
+ checkMessageSettingsPropagatedTo(sender.commands.at(4));
+ replyToPut(*cb, sender, 2);
+ replyToPut(*cb, sender, 3);
+ replyToPut(*cb, sender, 4);
+}
+
+void
+TwoPhaseUpdateOperationTest::testSafePathPropagatesMbusTracesFromReplies()
+{
+ setupDistributor(3, 3, "storage:3 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=1/2/3,2=2/3/4"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Get => 0,Get => 2"),
+ sender.getCommands(true));
+ replyToGet(*cb, sender, 0, 50, true,
+ api::ReturnCode::OK, "hello earthlings");
+ replyToGet(*cb, sender, 1, 70);
+ CPPUNIT_ASSERT_EQUAL(std::string("Put => 1,Put => 0,Put => 2"),
+ sender.getCommands(true, false, 2));
+ replyToPut(*cb, sender, 2, api::ReturnCode::OK, "fooo");
+ replyToPut(*cb, sender, 3, api::ReturnCode::OK, "baaa");
+ CPPUNIT_ASSERT(sender.replies.empty());
+ replyToPut(*cb, sender, 4);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Update Reply"),
+ sender.getLastReply(false));
+
+ std::string trace(sender.replies.back()->getTrace().toString());
+ //std::cout << "\n\n" << trace << "\n\n";
+ CPPUNIT_ASSERT(trace.find("hello earthlings") != std::string::npos);
+ CPPUNIT_ASSERT(trace.find("fooo") != std::string::npos);
+ CPPUNIT_ASSERT(trace.find("baaa") != std::string::npos);
+}
+
+void
+TwoPhaseUpdateOperationTest::testUpdateFailsIfOwnershipChangesBetweenGetAndPut()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+
+ // Update towards inconsistent bucket invokes safe path.
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=2/3/4"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Get => 0,Get => 1"),
+ sender.getCommands(true));
+
+ // Alter cluster state so that distributor is now down (technically the
+ // entire cluster is down in this state, but this should not matter). In
+ // this new state, the distributor no longer owns the bucket in question
+ // and the operation should thus be failed. We must not try to send Puts
+ // to a bucket we no longer own.
+ _distributor->enableClusterState(
+ lib::ClusterState("storage:2 distributor:1 .0.s:d"));
+ getBucketDatabase().clear();
+ replyToGet(*cb, sender, 0, 70);
+ replyToGet(*cb, sender, 1, 70);
+
+ // BUCKET_NOT_FOUND is a transient error code which should cause the client
+ // to re-send the operation, presumably to the correct distributor the next
+ // time.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 70) "
+ "ReturnCode(BUCKET_NOT_FOUND, Distributor lost "
+ "ownership of bucket between executing the read "
+ "and write phases of a two-phase update operation)"),
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testSafePathConditionMismatchFailsWithTasError()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=2/3/4", UpdateOptions().condition(
+ "testdoctype1.headerval==120")));
+
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+ // Newest doc has headerval==110, not 120.
+ replyToGet(*cb, sender, 0, 100);
+ replyToGet(*cb, sender, 1, 110);
+ CPPUNIT_ASSERT_EQUAL(
+ "UpdateReply(doc:test:test, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 0) "
+ "ReturnCode(TEST_AND_SET_CONDITION_FAILED, "
+ "Condition did not match document)"s,
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testSafePathConditionMatchSendsPutsWithUpdatedDoc()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=2/3/4", UpdateOptions().condition(
+ "testdoctype1.headerval==110")));
+
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+ replyToGet(*cb, sender, 0, 100);
+ replyToGet(*cb, sender, 1, 110);
+ CPPUNIT_ASSERT_EQUAL("Put => 1,Put => 0"s,
+ sender.getCommands(true, false, 2));
+}
+
+void
+TwoPhaseUpdateOperationTest::testSafePathConditionParseFailureFailsWithIllegalParamsError()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=2/3/4", UpdateOptions().condition(
+ "testdoctype1.san==fran...cisco")));
+
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+ replyToGet(*cb, sender, 0, 100);
+ replyToGet(*cb, sender, 1, 110);
+ // NOTE: condition is currently not attempted parsed until Gets have been
+ // replied to. This may change in the future.
+ // XXX reliance on parser/exception error message is very fragile.
+ CPPUNIT_ASSERT_EQUAL(
+ "UpdateReply(doc:test:test, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 0) "
+ "ReturnCode(ILLEGAL_PARAMETERS, "
+ "Failed to parse test and set condition: "
+ "Unexpected token at position 16 "
+ "('==fran...c') in query 'testdoctype1."
+ "san==fran...cisco',)"s,
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testSafePathConditonUnknownDocTypeFailsWithIllegalParamsError()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=2/3/4", UpdateOptions().condition(
+ "langbein.headerval=1234")));
+
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+ replyToGet(*cb, sender, 0, 100);
+ replyToGet(*cb, sender, 1, 110);
+ // NOTE: condition is currently not attempted parsed until Gets have been
+ // replied to. This may change in the future.
+ CPPUNIT_ASSERT_EQUAL(
+ "UpdateReply(doc:test:test, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 0) "
+ "ReturnCode(ILLEGAL_PARAMETERS, "
+ "Failed to parse test and set condition: "
+ "Document type langbein not found)"s,
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::testSafePathConditionWithMissingDocFailsWithTasError()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=2/3/4", UpdateOptions().condition(
+ "testdoctype1.headerval==120")));
+
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+ // Both Gets return nothing at all, nothing at all.
+ replyToGet(*cb, sender, 0, 100, false);
+ replyToGet(*cb, sender, 1, 110, false);
+ CPPUNIT_ASSERT_EQUAL(
+ "UpdateReply(doc:test:test, "
+ "BucketId(0x0000000000000000), "
+ "timestamp 0, timestamp of updated doc: 0) "
+ "ReturnCode(TEST_AND_SET_CONDITION_FAILED, "
+ "Document did not exist)"s,
+ sender.getLastReply(true));
+}
+
+void
+TwoPhaseUpdateOperationTest::assertAbortedUpdateReplyWithContextPresent(
+ const MessageSenderStub& closeSender) const
+{
+ CPPUNIT_ASSERT_EQUAL(size_t(1), closeSender.replies.size());
+ StorageReply::SP reply(closeSender.replies.back());
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::UPDATE_REPLY, reply->getType());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::ABORTED,
+ reply->getResult().getResult());
+ auto context = reply->getTransportContext(); // Transfers ownership
+ CPPUNIT_ASSERT(context.get());
+}
+
+void
+TwoPhaseUpdateOperationTest::testFastPathCloseEdgeSendsCorrectReply()
+{
+ setupDistributor(1, 1, "storage:1 distributor:1");
+ // Only 1 replica; consistent with itself by definition.
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(sendUpdate("0=1/2/3"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL("Update => 0"s, sender.getCommands(true));
+ // Close the operation. This should generate a single reply that is
+ // bound to the original command. We can identify rogue replies by these
+ // not having a transport context, as these are unique_ptrs that are
+ // moved to the reply upon the first reply construction. Any subsequent or
+ // erroneous replies will not have this context attached to themselves.
+ MessageSenderStub closeSender;
+ cb->onClose(closeSender);
+
+ assertAbortedUpdateReplyWithContextPresent(closeSender);
+}
+
+void
+TwoPhaseUpdateOperationTest::testSafePathCloseEdgeSendsCorrectReply()
+{
+ setupDistributor(2, 2, "storage:2 distributor:1");
+
+ std::shared_ptr<TwoPhaseUpdateOperation> cb(
+ sendUpdate("0=1/2/3,1=2/3/4")); // Inconsistent replicas.
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Get => 0,Get => 1"),
+ sender.getCommands(true));
+ // Closing the operation should now only return an ABORTED reply for
+ // the UpdateCommand, _not_ from the nested, pending Get operation (which
+ // will implicitly generate an ABORTED reply for the synthesized Get
+ // command passed to it).
+ MessageSenderStub closeSender;
+ cb->onClose(closeSender);
+
+ assertAbortedUpdateReplyWithContextPresent(closeSender);
+}
+
+// XXX currently differs in behavior from content nodes in that updates for
+// document IDs without explicit doctypes will _not_ be auto-failed on the
+// distributor.
+
+// XXX shouldn't be necessary to have any special handling of create-if... and
+// test-and-set right? They appear fully mutually exclusive.
+
+// XXX: test case where update reply has been sent but callback still
+// has pending messages (e.g. n-of-m case).
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/distributor/updateoperationtest.cpp b/storage/src/tests/distributor/updateoperationtest.cpp
new file mode 100644
index 00000000000..912d0235e42
--- /dev/null
+++ b/storage/src/tests/distributor/updateoperationtest.cpp
@@ -0,0 +1,210 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <tests/distributor/distributortestutil.h>
+#include <vespa/config/helper/configgetter.h>
+#include <vespa/document/config/config-documenttypes.h>
+#include <tests/distributor/messagesenderstub.h>
+#include <vespa/storage/distributor/operations/external/updateoperation.h>
+
+using std::shared_ptr;
+using namespace document;
+using namespace storage;
+using namespace storage::distributor;
+using namespace storage::api;
+using namespace std;
+using namespace storage::lib;
+using config::ConfigGetter;
+using config::FileSpec;
+using vespalib::string;
+
+class UpdateOperation_Test : public CppUnit::TestFixture,
+ public DistributorTestUtil
+{
+ CPPUNIT_TEST_SUITE(UpdateOperation_Test);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST(testNotFound);
+ CPPUNIT_TEST(testMultiNode);
+ CPPUNIT_TEST(testMultiNodeInconsistentTimestamp);
+ CPPUNIT_TEST_SUITE_END();
+
+ DocumentTypeRepo::SP _repo;
+ const DocumentType *_html_type;
+
+protected:
+ void testSimple();
+ void testNotFound();
+ void testMultiNode();
+ void testMultiNodeInconsistentTimestamp();
+
+public:
+ void setUp() {
+ _repo.reset(
+ new DocumentTypeRepo(*ConfigGetter<DocumenttypesConfig>::
+ getConfig("config-doctypes", FileSpec("config-doctypes.cfg"))));
+ _html_type = _repo->getDocumentType("text/html");
+ createLinks();
+ }
+
+ void tearDown() {
+ close();
+ }
+
+ void replyToMessage(
+ UpdateOperation& callback,
+ MessageSenderStub& sender,
+ uint32_t index,
+ uint64_t oldTimestamp,
+ api::BucketInfo info = api::BucketInfo(2,4,6));
+
+ std::shared_ptr<UpdateOperation>
+ sendUpdate(const std::string& bucketState);
+
+ document::BucketId _bId;
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(UpdateOperation_Test);
+
+std::shared_ptr<UpdateOperation>
+UpdateOperation_Test::sendUpdate(const std::string& bucketState)
+{
+ document::DocumentUpdate::SP update(
+ new document::DocumentUpdate(
+ *_html_type,
+ document::DocumentId(document::DocIdString("test", "test"))));
+
+ _bId = getExternalOperationHandler().getBucketId(update->getId());
+
+ addNodesToBucketDB(_bId, bucketState);
+
+ std::shared_ptr<api::UpdateCommand> msg(
+ new api::UpdateCommand(document::BucketId(0),
+ update,
+ 100));
+
+ ExternalOperationHandler& handler = getExternalOperationHandler();
+ return std::shared_ptr<UpdateOperation>(
+ new UpdateOperation(handler,
+ msg,
+ getDistributor().getMetrics().updates[msg->getLoadType()]));
+}
+
+
+void
+UpdateOperation_Test::replyToMessage(
+ UpdateOperation& callback,
+ MessageSenderStub& sender,
+ uint32_t index,
+ uint64_t oldTimestamp,
+ api::BucketInfo info)
+{
+ std::shared_ptr<api::StorageMessage> msg2 = sender.commands[index];
+ UpdateCommand* updatec = dynamic_cast<UpdateCommand*>(msg2.get());
+ std::unique_ptr<api::StorageReply> reply(updatec->makeReply());
+ UpdateReply* updateR = static_cast<api::UpdateReply*>(reply.get());
+ updateR->setOldTimestamp(oldTimestamp);
+ updateR->setBucketInfo(info);
+
+ callback.onReceive(sender,
+ std::shared_ptr<StorageReply>(reply.release()));
+}
+
+void
+UpdateOperation_Test::testSimple()
+{
+ setupDistributor(1, 1, "storage:1 distributor:1");
+
+ std::shared_ptr<UpdateOperation> cb(sendUpdate("0=1/2/3"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0"),
+ sender.getCommands(true));
+
+ replyToMessage(*cb, sender, 0, 90);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 100, timestamp of updated doc: 90) ReturnCode(NONE)"),
+ sender.getLastReply(true));
+}
+
+void
+UpdateOperation_Test::testNotFound()
+{
+ setupDistributor(1, 1, "storage:1 distributor:1");
+
+ std::shared_ptr<UpdateOperation> cb(sendUpdate("0=1/2/3"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0"),
+ sender.getCommands(true));
+
+ replyToMessage(*cb, sender, 0, 0);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 100, timestamp of updated doc: 0) ReturnCode(NONE)"),
+ sender.getLastReply(true));
+}
+
+void
+UpdateOperation_Test::testMultiNode()
+{
+ setupDistributor(2, 2, "distributor:1 storage:2");
+ std::shared_ptr<UpdateOperation> cb(sendUpdate("0=1/2/3,1=1/2/3"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0,Update => 1"),
+ sender.getCommands(true));
+
+ replyToMessage(*cb, sender, 0, 120);
+ replyToMessage(*cb, sender, 1, 120);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 100, timestamp of updated doc: 120) ReturnCode(NONE)"),
+ sender.getLastReply(true));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ _bId.toString() + " : "
+ "node(idx=1,crc=0x2,docs=4/4,bytes=6/6,trusted=true,active=false), "
+ "node(idx=0,crc=0x2,docs=4/4,bytes=6/6,trusted=true,active=false)"),
+ dumpBucket(_bId));
+}
+
+void
+UpdateOperation_Test::testMultiNodeInconsistentTimestamp()
+{
+ setupDistributor(2, 2, "distributor:1 storage:2");
+ std::shared_ptr<UpdateOperation> cb(sendUpdate("0=1/2/3,1=1/2/3"));
+ MessageSenderStub sender;
+ cb->start(sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("Update => 0,Update => 1"),
+ sender.getCommands(true));
+
+ replyToMessage(*cb, sender, 0, 119);
+ replyToMessage(*cb, sender, 1, 120);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("UpdateReply(doc:test:test, BucketId(0x0000000000000000), "
+ "timestamp 100, timestamp of updated doc: 120 Was inconsistent "
+ "(best node 1)) ReturnCode(NONE)"),
+ sender.getLastReply(true));
+}
+
diff --git a/storage/src/tests/distributor/visitoroperationtest.cpp b/storage/src/tests/distributor/visitoroperationtest.cpp
new file mode 100644
index 00000000000..a8f28a73fb6
--- /dev/null
+++ b/storage/src/tests/distributor/visitoroperationtest.cpp
@@ -0,0 +1,1646 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <math.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storageapi/message/datagram.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storage/distributor/operations/external/visitoroperation.h>
+#include <vespa/storage/distributor/operations/external/visitororder.h>
+#include <tests/distributor/distributortestutil.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+using namespace document;
+using namespace storage::api;
+using namespace storage::lib;
+using namespace std::string_literals;
+
+namespace storage {
+namespace distributor {
+
+class VisitorOperationTest : public CppUnit::TestFixture,
+ public DistributorTestUtil {
+ CPPUNIT_TEST_SUITE(VisitorOperationTest);
+ CPPUNIT_TEST(testParameterForwarding);
+ CPPUNIT_TEST(testShutdown);
+ CPPUNIT_TEST(testNoBucket);
+ CPPUNIT_TEST(testOnlySuperBucketAndProgressAllowed);
+ CPPUNIT_TEST(testRetiredStorageNode);
+ CPPUNIT_TEST(testNoResendAfterTimeoutPassed);
+ CPPUNIT_TEST(testDistributorNotReady);
+ CPPUNIT_TEST(testInvalidOrderDocSelection);
+ CPPUNIT_TEST(testNonExistingBucket);
+ CPPUNIT_TEST(testUserSingleBucket);
+ CPPUNIT_TEST(testUserInconsistentlySplitBucket);
+ CPPUNIT_TEST(testBucketRemovedWhileVisitorPending);
+ CPPUNIT_TEST(testEmptyBucketsVisitedWhenVisitingRemoves);
+ CPPUNIT_TEST(testResendToOtherStorageNodeOnFailure);
+ CPPUNIT_TEST(testTimeoutOnlyAfterReplyFromAllStorageNodes);
+ CPPUNIT_TEST(testTimeoutDoesNotOverrideCriticalError);
+ CPPUNIT_TEST(testWrongDistribution);
+ CPPUNIT_TEST(testWrongDistributionInPendingState);
+ CPPUNIT_TEST(testVisitorAbortedIfNodeIsMarkedAsDown);
+ CPPUNIT_TEST(testBucketHighBitCount);
+ CPPUNIT_TEST(testBucketLowBitCount);
+ CPPUNIT_TEST(testParallelVisitorsToOneStorageNode);
+ CPPUNIT_TEST(testParallelVisitorsResendOnlyFailing);
+ CPPUNIT_TEST(testParallelVisitorsToOneStorageNodeOneSuperBucket);
+ CPPUNIT_TEST(testVisitWhenOneBucketCopyIsInvalid);
+ CPPUNIT_TEST(testVisitingWhenAllBucketsAreInvalid);
+ CPPUNIT_TEST(testInconsistencyHandling);
+ CPPUNIT_TEST(testVisitIdealNode);
+ CPPUNIT_TEST(testNoResendingOnCriticalFailure);
+ CPPUNIT_TEST(testFailureOnAllNodes);
+ CPPUNIT_TEST(testVisitOrder);
+ CPPUNIT_TEST(testVisitInChunks);
+ CPPUNIT_TEST(testVisitOrderSplitPastOrderBits);
+ CPPUNIT_TEST(testVisitOrderInconsistentlySplit);
+ CPPUNIT_TEST(testUserVisitorOrder);
+ CPPUNIT_TEST(testUserVisitorOrderSplitPastOrderBits);
+ CPPUNIT_TEST(testNoClientReplyBeforeAllStorageRepliesReceived);
+ CPPUNIT_TEST(testSkipFailedSubBucketsWhenVisitingInconsistent);
+ CPPUNIT_TEST(testQueueTimeoutIsFactorOfTotalTimeout);
+ CPPUNIT_TEST_SUITE_END();
+
+protected:
+ void testParameterForwarding();
+ void testShutdown();
+ void testNoBucket();
+ void testOnlySuperBucketAndProgressAllowed();
+ void testRetiredStorageNode();
+ void testNoResendAfterTimeoutPassed();
+ void testDistributorNotReady();
+ void testInvalidOrderDocSelection();
+ void testNonExistingBucket();
+ void testUserSingleBucket();
+ void testUserInconsistentlySplitBucket();
+ void testBucketRemovedWhileVisitorPending();
+ void testEmptyBucketsVisitedWhenVisitingRemoves();
+ void testResendToOtherStorageNodeOnFailure();
+ void testTimeoutOnlyAfterReplyFromAllStorageNodes();
+ void testTimeoutDoesNotOverrideCriticalError();
+ void testAbortNonExisting();
+ void testAbort();
+ void testWrongDistribution();
+ void testWrongDistributionInPendingState();
+ void testVisitorAbortedIfNodeIsMarkedAsDown();
+ void testBucketHighBitCount();
+ void testBucketLowBitCount();
+ void testParallelVisitorsToOneStorageNode();
+ void testParallelVisitorsResendOnlyFailing();
+ void testParallelVisitorsToOneStorageNodeOneSuperBucket();
+ void testVisitWhenOneBucketCopyIsInvalid();
+ void testVisitingWhenAllBucketsAreInvalid();
+ void testInconsistencyHandling();
+ void testVisitIdealNode();
+ void testNoResendingOnCriticalFailure();
+ void testFailureOnAllNodes();
+ void testVisitOrder();
+ void testVisitInChunks();
+ void testVisitOrderSplitPastOrderBits();
+ void testVisitOrderInconsistentlySplit();
+ void testUserVisitorOrder();
+ void testUserVisitorOrderSplitPastOrderBits();
+ void testUserVisitorOrderInconsistentlySplit();
+ void testNoClientReplyBeforeAllStorageRepliesReceived();
+ void testSkipFailedSubBucketsWhenVisitingInconsistent();
+ void testQueueTimeoutIsFactorOfTotalTimeout();
+public:
+ VisitorOperationTest()
+ : defaultConfig(framework::MilliSecTime(0),
+ 100,
+ 100)
+ {}
+
+ void setUp() {
+ createLinks();
+ nullId = document::BucketId(0, 0);
+ doneId = document::BucketId(INT_MAX);
+ };
+
+ void tearDown() {
+ close();
+ }
+
+ enum {MAX_PENDING = 2};
+private:
+ document::BucketId nullId;
+ document::BucketId doneId;
+ VisitorOperation::Config defaultConfig;
+
+ api::CreateVisitorCommand::SP
+ createVisitorCommand(std::string instanceId,
+ document::BucketId superBucket,
+ document::BucketId lastBucket,
+ uint32_t maxBuckets = 8,
+ uint32_t timeoutMS = 500,
+ bool visitInconsistentBuckets = false,
+ bool visitRemoves = false,
+ std::string libraryName = "dumpvisitor",
+ document::OrderingSpecification::Order visitorOrdering =
+ document::OrderingSpecification::ASCENDING,
+ const std::string& docSelection = "")
+ {
+ api::CreateVisitorCommand::SP cmd(
+ new api::CreateVisitorCommand(libraryName, instanceId, docSelection));
+ cmd->setControlDestination("controldestination");
+ cmd->setDataDestination("datadestination");
+ cmd->setFieldSet("[header]");
+ if (visitRemoves) {
+ cmd->setVisitRemoves();
+ }
+ cmd->setFromTime(10);
+ cmd->setToTime(100);
+
+ cmd->addBucketToBeVisited(superBucket);
+ cmd->addBucketToBeVisited(lastBucket);
+
+ cmd->setMaximumPendingReplyCount(VisitorOperationTest::MAX_PENDING);
+ cmd->setMaxBucketsPerVisitor(maxBuckets);
+ cmd->setTimeout(timeoutMS);
+ if (visitInconsistentBuckets) {
+ cmd->setVisitInconsistentBuckets();
+ }
+ cmd->setVisitorOrdering(visitorOrdering);
+ return cmd;
+ }
+
+ std::string
+ serializeVisitorCommand(int idx = -1) {
+ if (idx == -1) {
+ idx = _sender.commands.size() - 1;
+ }
+
+ std::ostringstream ost;
+
+ CreateVisitorCommand* cvc = dynamic_cast<CreateVisitorCommand*>(
+ _sender.commands[idx].get());
+
+ ost << *cvc << " Buckets: [ ";
+ for (uint32_t i = 0; i < cvc->getBuckets().size(); ++i) {
+ ost << cvc->getBuckets()[i] << " ";
+ }
+ ost << "]";
+ return ost.str();
+ }
+
+ /**
+ Starts a visitor where we expect no createVisitorCommands to be sent
+ to storage, either due to error or due to no data actually stored.
+ */
+ std::string runEmptyVisitor(api::CreateVisitorCommand::SP msg) {
+ VisitorOperation op(getExternalOperationHandler(),
+ msg,
+ defaultConfig);
+ op.start(_sender, framework::MilliSecTime(0));
+ return _sender.getLastReply();
+ }
+
+ const std::vector<BucketId>& getBucketsFromLastCommand() {
+ const CreateVisitorCommand& cvc(
+ dynamic_cast<const CreateVisitorCommand&>(
+ *_sender.commands[_sender.commands.size() - 1]));
+ return cvc.getBuckets();
+ }
+
+ std::pair<std::string, std::string>
+ runVisitor(document::BucketId id,
+ document::BucketId lastId,
+ uint32_t maxBuckets);
+
+ std::string doOrderedVisitor(document::BucketId startBucket);
+
+ void doStandardVisitTest(const std::string& clusterState);
+
+ std::unique_ptr<VisitorOperation> startOperationWith2StorageNodeVisitors(
+ bool inconsistent);
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(VisitorOperationTest);
+
+void
+VisitorOperationTest::testParameterForwarding()
+{
+ doStandardVisitTest("distributor:1 storage:1");
+}
+
+void
+VisitorOperationTest::doStandardVisitTest(const std::string& clusterState)
+{
+ _distributor->enableClusterState(ClusterState(clusterState));
+
+ // Create bucket in bucketdb
+ document::BucketId id(uint64_t(0x400000000000007b));
+ addNodesToBucketDB(id, "0=1/1/1/t");
+
+ // Send create visitor
+ vespalib::string instanceId("testParameterForwarding");
+ vespalib::string libraryName("dumpvisitor");
+ vespalib::string docSelection("");
+ api::CreateVisitorCommand::SP msg(
+ new api::CreateVisitorCommand(libraryName,
+ instanceId,
+ docSelection));
+ vespalib::string controlDestination("controldestination");
+ msg->setControlDestination(controlDestination);
+ vespalib::string dataDestination("datadestination");
+ msg->setDataDestination(dataDestination);
+ msg->setMaximumPendingReplyCount(VisitorOperationTest::MAX_PENDING);
+ msg->setMaxBucketsPerVisitor(8);
+ msg->setFromTime(10);
+ msg->setToTime(0);
+ msg->addBucketToBeVisited(id);
+ msg->addBucketToBeVisited(nullId);
+ msg->setFieldSet("[header]");
+ msg->setVisitRemoves();
+ msg->setTimeout(1234);
+ msg->getTrace().setLevel(7);
+
+ VisitorOperation op(getExternalOperationHandler(),
+ msg,
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ // Receive create visitor command for storage and simulate reply
+ api::StorageMessage::SP rep0 = _sender.commands[0];
+ CreateVisitorCommand* cvc = dynamic_cast<CreateVisitorCommand*>(rep0.get());
+ CPPUNIT_ASSERT(cvc);
+ CPPUNIT_ASSERT_EQUAL(libraryName, cvc->getLibraryName());
+ CPPUNIT_ASSERT_EQUAL(instanceId, cvc->getInstanceId().substr(0, instanceId.length()));
+ CPPUNIT_ASSERT_EQUAL(docSelection, cvc->getDocumentSelection());
+ CPPUNIT_ASSERT_EQUAL(controlDestination, cvc->getControlDestination());
+ CPPUNIT_ASSERT_EQUAL(dataDestination, cvc->getDataDestination());
+ CPPUNIT_ASSERT_EQUAL((unsigned int) VisitorOperationTest::MAX_PENDING, cvc->getMaximumPendingReplyCount());
+ CPPUNIT_ASSERT_EQUAL((unsigned int) 8, cvc->getMaxBucketsPerVisitor());
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, cvc->getBuckets().size());
+ CPPUNIT_ASSERT_EQUAL((api::Timestamp) 10, cvc->getFromTime());
+ CPPUNIT_ASSERT(cvc->getToTime() > 0);
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("[header]"), cvc->getFieldSet());
+ CPPUNIT_ASSERT_EQUAL((bool) 1, cvc->visitRemoves());
+ CPPUNIT_ASSERT_EQUAL(uint32_t(1234), cvc->getTimeout());
+ CPPUNIT_ASSERT_EQUAL(uint32_t(7), cvc->getTrace().getLevel());
+
+ sendReply(op);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("CreateVisitorReply("
+ "last=BucketId(0x000000007fffffff)) "
+ "ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+VisitorOperationTest::testShutdown()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+
+ // Create bucket in bucketdb
+ document::BucketId id(uint64_t(0x400000000000007b));
+ addNodesToBucketDB(id, "0=1/1/1/t");
+
+ // Send create visitor
+ vespalib::string instanceId("testShutdown");
+ vespalib::string libraryName("dumpvisitor");
+ vespalib::string docSelection("");
+ api::CreateVisitorCommand::SP msg(
+ new api::CreateVisitorCommand(libraryName,
+ instanceId,
+ docSelection));
+ msg->addBucketToBeVisited(id);
+ msg->addBucketToBeVisited(nullId);
+
+ VisitorOperation op(getExternalOperationHandler(), msg, defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ op.onClose(_sender); // This will fail the visitor
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(ABORTED, Process is shutting down)"),
+ _sender.getLastReply());
+}
+
+void
+VisitorOperationTest::testNoBucket()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+
+ // Send create visitor
+ api::CreateVisitorCommand::SP msg(new api::CreateVisitorCommand(
+ "dumpvisitor", "instance", ""));
+
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(ILLEGAL_PARAMETERS, No buckets in "
+ "CreateVisitorCommand for visitor 'instance')"),
+ runEmptyVisitor(msg));
+}
+
+void
+VisitorOperationTest::testOnlySuperBucketAndProgressAllowed()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+
+ // Send create visitor
+ api::CreateVisitorCommand::SP msg(new api::CreateVisitorCommand(
+ "dumpvisitor", "instance", ""));
+ msg->addBucketToBeVisited(nullId);
+ msg->addBucketToBeVisited(nullId);
+ msg->addBucketToBeVisited(nullId);
+
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(ILLEGAL_PARAMETERS, CreateVisitorCommand "
+ "does not contain 2 buckets for visitor "
+ "'instance')"),
+ runEmptyVisitor(msg));
+}
+
+void
+VisitorOperationTest::testRetiredStorageNode()
+{
+ doStandardVisitTest("distributor:1 storage:1 .0.s:r");
+}
+
+void
+VisitorOperationTest::testNoResendAfterTimeoutPassed()
+{
+ document::BucketId id(uint64_t(0x400000000000007b));
+
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:2"));
+ addNodesToBucketDB(id, "0=1/1/1/t,1=1/1/1/t");
+
+ VisitorOperation op(
+ getExternalOperationHandler(),
+ createVisitorCommand("lowtimeoutbusy", id, nullId, 8, 20),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ getClock().addMilliSecondsToTime(22);
+
+ sendReply(op, -1, api::ReturnCode::BUSY);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(ABORTED, Timeout of 20 ms is running out)"),
+ _sender.getLastReply());
+}
+
+void
+VisitorOperationTest::testDistributorNotReady()
+{
+ _distributor->enableClusterState(ClusterState("distributor:0 storage:0"));
+ document::BucketId id(uint64_t(0x400000000000007b));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(NODE_NOT_READY, No distributors available when "
+ "processing visitor 'notready')"),
+ runEmptyVisitor(createVisitorCommand("notready", id, nullId)));
+}
+
+// Distributor only parses selection if in the order doc case (which is detected
+// by first checking if string contains "order" which it must to refer to
+// "id.order" !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
+void
+VisitorOperationTest::testInvalidOrderDocSelection()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+ document::BucketId id(0x400000000000007b);
+ addNodesToBucketDB(id, "0=1/1/1/t");
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(ILLEGAL_PARAMETERS, Failed to parse document select "
+ "string 'id.order(10,3)=1 and dummy': Document type dummy not found)"),
+ runEmptyVisitor(
+ createVisitorCommand("invalidOrderDoc",
+ id,
+ nullId,
+ 8,
+ 500,
+ false,
+ false,
+ "dumpvisitor",
+ document::OrderingSpecification::ASCENDING,
+ "id.order(10,3)=1 and dummy")));
+}
+
+void
+VisitorOperationTest::testNonExistingBucket()
+{
+ document::BucketId id(uint64_t(0x400000000000007b));
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x000000007fffffff)) "
+ "ReturnCode(NONE)"),
+ runEmptyVisitor(
+ createVisitorCommand("nonExistingBucket",
+ id,
+ nullId)));
+}
+
+void
+VisitorOperationTest::testUserSingleBucket()
+{
+ document::BucketId id(uint64_t(0x400000000000007b));
+ document::BucketId userid(uint64_t(0x800000000000007b));
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+
+ addNodesToBucketDB(id, "0=1/1/1/t");
+
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("userSingleBucket",
+ userid,
+ nullId,
+ 8,
+ 500,
+ false,
+ false,
+ "dumpvisitor",
+ document::OrderingSpecification::ASCENDING,
+ "true"),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL_MSG(_sender.getLastReply(),
+ std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+ sendReply(op);
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x000000007fffffff)) "
+ "ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+std::pair<std::string, std::string>
+VisitorOperationTest::runVisitor(document::BucketId id,
+ document::BucketId lastId,
+ uint32_t maxBuckets)
+{
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("inconsistentSplit",
+ id,
+ lastId,
+ maxBuckets,
+ 500,
+ false,
+ false,
+ "dumpvisitor",
+ document::OrderingSpecification::ASCENDING,
+ "true"),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ sendReply(op);
+
+ std::pair<std::string, std::string> retVal =
+ std::make_pair(serializeVisitorCommand(), _sender.getLastReply());
+
+ _sender.clear();
+
+ return retVal;
+}
+
+void
+VisitorOperationTest::testUserInconsistentlySplitBucket()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+
+ // Not containing (19, 0x40001)
+ addNodesToBucketDB(document::BucketId(17, 0x0), "0=1/1/1/t");
+ addNodesToBucketDB(document::BucketId(18, 0x20001), "0=1/1/1/t");
+ addNodesToBucketDB(document::BucketId(19, 0x1), "0=1/1/1/t");
+
+ // Containing (19, 0x40001)
+ addNodesToBucketDB(document::BucketId(17, 0x1), "0=1/1/1/t");
+ addNodesToBucketDB(document::BucketId(18, 0x1), "0=1/1/1/t");
+
+ // Equal to (19, 0x40001)
+ addNodesToBucketDB(document::BucketId(19, 0x40001), "0=1/1/1/t");
+
+ // Contained in (19, 0x40001)
+ addNodesToBucketDB(document::BucketId(20, 0x40001), "0=1/1/1/t");
+ addNodesToBucketDB(document::BucketId(20, 0xc0001), "0=1/1/1/t");
+ addNodesToBucketDB(document::BucketId(21, 0x40001), "0=1/1/1/t");
+ addNodesToBucketDB(document::BucketId(21, 0x140001), "0=1/1/1/t");
+
+ document::BucketId id(19, 0x40001);
+
+ {
+ std::pair<std::string, std::string> val(
+ runVisitor(id, nullId, 100));
+
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "CreateVisitorCommand(dumpvisitor, true, 7 buckets) "
+ "Buckets: [ BucketId(0x4400000000000001) "
+ "BucketId(0x4800000000000001) "
+ "BucketId(0x4c00000000040001) "
+ "BucketId(0x5000000000040001) "
+ "BucketId(0x5400000000040001) "
+ "BucketId(0x5400000000140001) "
+ "BucketId(0x50000000000c0001) ]"),
+ val.first);
+
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "CreateVisitorReply(last=BucketId(0x000000007fffffff)) "
+ "ReturnCode(NONE)"),
+ val.second);
+ }
+}
+
+void
+VisitorOperationTest::testBucketRemovedWhileVisitorPending()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+
+ // Create bucket in bucketdb
+ document::BucketId id(uint64_t(0x400000000000007b));
+
+ addNodesToBucketDB(id, "0=1/1/1/t");
+
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("removefrombucketdb",
+ id,
+ nullId),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ removeFromBucketDB(id);
+
+ sendReply(op, -1, api::ReturnCode::NOT_CONNECTED);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(BUCKET_NOT_FOUND)"),
+ _sender.getLastReply());
+}
+
+void
+VisitorOperationTest::testEmptyBucketsVisitedWhenVisitingRemoves()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+ document::BucketId id(uint64_t(0x400000000000007b));
+ addNodesToBucketDB(id, "0=0/0/0/1/2/t");
+
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("emptybucket",
+ id,
+ nullId,
+ 8,
+ 500,
+ false,
+ true),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ // Since visitRemoves is true, the empty bucket will be visited
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+}
+
+void
+VisitorOperationTest::testResendToOtherStorageNodeOnFailure()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:2"));
+ document::BucketId id(uint64_t(0x400000000000007b));
+
+ addNodesToBucketDB(id, "0=1/1/1/t,1=1/1/1/t");
+
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("emptyinconsistent",
+ id,
+ nullId),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ sendReply(op, -1, api::ReturnCode::NOT_CONNECTED);
+ CPPUNIT_ASSERT_EQUAL(""s, _sender.getReplies());
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0,Visitor Create => 1"),
+ _sender.getCommands(true));
+}
+
+// Since MessageBus handles timeouts for us implicitly, we make the assumption
+// that we can safely wait for all replies to be received before sending a
+// client reply and that this won't cause things to hang for indeterminate
+// amounts of time.
+void
+VisitorOperationTest::testTimeoutOnlyAfterReplyFromAllStorageNodes()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:2"));
+
+ // Contained in (16, 0x1)
+ addNodesToBucketDB(document::BucketId(17, 0x00001), "0=1/1/1/t");
+ addNodesToBucketDB(document::BucketId(17, 0x10001), "1=1/1/1/t");
+
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("timeout2bucketson2nodes",
+ document::BucketId(16, 1),
+ nullId,
+ 8),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL("Visitor Create => 0,Visitor Create => 1"s,
+ _sender.getCommands(true));
+
+ getClock().addMilliSecondsToTime(501);
+
+ sendReply(op, 0);
+ CPPUNIT_ASSERT_EQUAL(""s, _sender.getReplies()); // No reply yet.
+
+ sendReply(op, 1, api::ReturnCode::BUSY);
+
+ CPPUNIT_ASSERT_EQUAL(
+ "CreateVisitorReply(last=BucketId(0x4400000000000001)) "
+ "ReturnCode(ABORTED, Timeout of 500 ms is running out)"s,
+ _sender.getLastReply());
+
+ // XXX This is sub-optimal in the case that we time out but all storage
+ // visitors return OK, as we'll then be failing an operation that
+ // technically went fine. However, this is assumed to happen sufficiently
+ // rarely (requires timing to be so that mbus timouts don't happen for
+ // neither client -> distributor nor distributor -> storage for the
+ // operation to possibly could have been considered successful) that we
+ // don't bother to add complexity for handling it as a special case.
+}
+
+void
+VisitorOperationTest::testTimeoutDoesNotOverrideCriticalError()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:2"));
+ addNodesToBucketDB(document::BucketId(17, 0x00001), "0=1/1/1/t");
+ addNodesToBucketDB(document::BucketId(17, 0x10001), "1=1/1/1/t");
+
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("timeout2bucketson2nodes",
+ document::BucketId(16, 1),
+ nullId,
+ 8,
+ 500), // ms timeout
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+ CPPUNIT_ASSERT_EQUAL("Visitor Create => 0,Visitor Create => 1"s,
+ _sender.getCommands(true));
+
+ getClock().addMilliSecondsToTime(501);
+ // Technically has timed out at this point, but should still report the
+ // critical failure.
+ sendReply(op, 0, api::ReturnCode::INTERNAL_FAILURE);
+ CPPUNIT_ASSERT_EQUAL(""s, _sender.getReplies());
+ sendReply(op, 1, api::ReturnCode::BUSY);
+
+ CPPUNIT_ASSERT_EQUAL(
+ "CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(INTERNAL_FAILURE, [from content node 0] )"s,
+ _sender.getLastReply());
+}
+
+void
+VisitorOperationTest::testWrongDistribution()
+{
+ setupDistributor(1, 100, "distributor:100 storage:2");
+
+ document::BucketId id(uint64_t(0x400000000000127b));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(WRONG_DISTRIBUTION, distributor:100 storage:2)"),
+ runEmptyVisitor(createVisitorCommand("wrongdist", id, nullId)));
+}
+
+void
+VisitorOperationTest::testWrongDistributionInPendingState()
+{
+ // Force bucket to belong to this distributor in currently enabled state.
+ setupDistributor(1, 100, "distributor:1 storage:2");
+ // Trigger pending cluster state. Note: increase in storage node count
+ // to force resending of bucket info requests.
+ auto stateCmd = std::make_shared<api::SetSystemStateCommand>(
+ lib::ClusterState("distributor:100 storage:3"));
+ getBucketDBUpdater().onSetSystemState(stateCmd);
+
+ document::BucketId id(uint64_t(0x400000000000127b));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(WRONG_DISTRIBUTION, distributor:100 storage:3)"),
+ runEmptyVisitor(createVisitorCommand("wrongdistpending", id, nullId)));
+}
+
+// If the current node state changes, this alters the node's cluster state
+// internally without this change being part of a new version. As a result,
+// we cannot answer with WRONG_DISTRIBUTION as the client expects to see a
+// higher version number.
+// See ticket 6353382 for details.
+void
+VisitorOperationTest::testVisitorAbortedIfNodeIsMarkedAsDown()
+{
+ setupDistributor(1, 10, "distributor:10 .0.s:s storage:10");
+
+ document::BucketId id(uint64_t(0x400000000000127b));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(ABORTED, Distributor is shutting down)"),
+ runEmptyVisitor(createVisitorCommand("wrongdist", id, nullId)));
+}
+
+void
+VisitorOperationTest::testBucketHighBitCount()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1 bits:16"));
+
+ document::BucketId id(18, 0x0);
+ addNodesToBucketDB(id, "0=1/1/1/t");
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(WRONG_DISTRIBUTION, distributor:1 storage:1)"),
+ runEmptyVisitor(createVisitorCommand("buckethigbit", id, nullId)));
+
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("buckethighbitcount",
+ id,
+ nullId,
+ 8,
+ 500,
+ false,
+ false,
+ "dumpvisitor",
+ document::OrderingSpecification::ASCENDING,
+ "true"),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+}
+
+void
+VisitorOperationTest::testBucketLowBitCount()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1 bits:16"));
+
+ document::BucketId id(1, 0x0);
+ addNodesToBucketDB(id, "0=1/1/1/t");
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(WRONG_DISTRIBUTION, distributor:1 storage:1)"),
+ runEmptyVisitor(createVisitorCommand("bucketlowbit", id, nullId)));
+
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("buckethighbitcount",
+ id,
+ nullId,
+ 8,
+ 500,
+ false,
+ false,
+ "dumpvisitor",
+ document::OrderingSpecification::ASCENDING,
+ "true"),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(WRONG_DISTRIBUTION, distributor:1 storage:1)"),
+ _sender.getLastReply());
+}
+
+void
+VisitorOperationTest::testParallelVisitorsToOneStorageNode()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+
+ // Create buckets in bucketdb
+ for (int i=0; i<32; i++) {
+ document::BucketId id(21, i*0x10000 + 0x0001);
+ addNodesToBucketDB(id, "0=1/1/1/t");
+ }
+
+ document::BucketId id(16, 1);
+
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("multiplebuckets",
+ id,
+ nullId,
+ 31),
+ VisitorOperation::Config(
+ framework::MilliSecTime(0),
+ 1,
+ 4));
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0,Visitor Create => 0,"
+ "Visitor Create => 0,Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorCommand(dumpvisitor, , 8 buckets) Buckets: [ "
+ "BucketId(0x5400000000000001) BucketId(0x5400000000040001) "
+ "BucketId(0x5400000000020001) BucketId(0x5400000000060001) "
+ "BucketId(0x5400000000010001) BucketId(0x5400000000050001) "
+ "BucketId(0x5400000000030001) BucketId(0x5400000000070001) ]"),
+ serializeVisitorCommand(0));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorCommand(dumpvisitor, , 8 buckets) Buckets: [ "
+ "BucketId(0x5400000000100001) BucketId(0x5400000000140001) "
+ "BucketId(0x5400000000120001) BucketId(0x5400000000160001) "
+ "BucketId(0x5400000000110001) BucketId(0x5400000000150001) "
+ "BucketId(0x5400000000130001) BucketId(0x5400000000170001) ]"),
+ serializeVisitorCommand(1));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorCommand(dumpvisitor, , 8 buckets) Buckets: [ "
+ "BucketId(0x5400000000080001) BucketId(0x54000000000c0001) "
+ "BucketId(0x54000000000a0001) BucketId(0x54000000000e0001) "
+ "BucketId(0x5400000000090001) BucketId(0x54000000000d0001) "
+ "BucketId(0x54000000000b0001) BucketId(0x54000000000f0001) ]"),
+ serializeVisitorCommand(2));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorCommand(dumpvisitor, , 7 buckets) Buckets: [ "
+ "BucketId(0x5400000000180001) BucketId(0x54000000001c0001) "
+ "BucketId(0x54000000001a0001) BucketId(0x54000000001e0001) "
+ "BucketId(0x5400000000190001) BucketId(0x54000000001d0001) "
+ "BucketId(0x54000000001b0001) ]"),
+ serializeVisitorCommand(3));
+
+ for (uint32_t i = 0; i < 4; ++i) {
+ sendReply(op, i);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x54000000000f0001)) "
+ "ReturnCode(NONE)"),
+ _sender.getLastReply());
+
+ _sender.clear();
+
+ uint32_t minBucketsPerVisitor = 1;
+ uint32_t maxVisitorsPerNode = 4;
+ VisitorOperation op2(getExternalOperationHandler(),
+ createVisitorCommand("multiplebuckets",
+ id,
+ document::BucketId(0x54000000000f0001),
+ 31),
+ VisitorOperation::Config(
+ framework::MilliSecTime(0),
+ minBucketsPerVisitor,
+ maxVisitorsPerNode));
+
+ op2.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ sendReply(op2);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x000000007fffffff)) "
+ "ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+VisitorOperationTest::testParallelVisitorsResendOnlyFailing()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:2"));
+
+ // Create buckets in bucketdb
+ for (int i=0; i<32; i++) {
+ document::BucketId id(21, i*0x10000 + 0x0001);
+ addNodesToBucketDB(id, "0=1/1/1/t,1=1/1/1/t");
+ }
+
+ document::BucketId id(16, 1);
+
+ uint32_t minBucketsPerVisitor = 5;
+ uint32_t maxVisitorsPerNode = 4;
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("multiplebuckets",
+ id,
+ nullId,
+ 31),
+ VisitorOperation::Config(
+ framework::MilliSecTime(0),
+ minBucketsPerVisitor,
+ maxVisitorsPerNode));
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0,Visitor Create => 0,"
+ "Visitor Create => 0,Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ for (uint32_t i = 0; i < 2; ++i) {
+ sendReply(op, i, api::ReturnCode::NOT_CONNECTED);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0,Visitor Create => 0,"
+ "Visitor Create => 0,Visitor Create => 0,"
+ "Visitor Create => 1,Visitor Create => 1"),
+ _sender.getCommands(true));
+
+ for (uint32_t i = 2; i < 6; ++i) {
+ sendReply(op, i);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x54000000000f0001)) "
+ "ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+VisitorOperationTest::testParallelVisitorsToOneStorageNodeOneSuperBucket()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+
+ // Create buckets in bucketdb
+ for (int i=0; i<8; i++) {
+ document::BucketId id(0x8c000000e3362b6aULL+i*0x100000000ull);
+ addNodesToBucketDB(id, "0=1/1/1/t");
+ }
+
+ document::BucketId id(16, 0x2b6a);
+
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("multiplebucketsonesuper",
+ id,
+ nullId),
+ VisitorOperation::Config(
+ framework::MilliSecTime(0),
+ 5,
+ 4));
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorCommand(dumpvisitor, , 8 buckets) Buckets: [ "
+ "BucketId(0x8c000000e3362b6a) BucketId(0x8c000004e3362b6a) "
+ "BucketId(0x8c000002e3362b6a) BucketId(0x8c000006e3362b6a) "
+ "BucketId(0x8c000001e3362b6a) BucketId(0x8c000005e3362b6a) "
+ "BucketId(0x8c000003e3362b6a) BucketId(0x8c000007e3362b6a) ]"),
+ serializeVisitorCommand(0));
+
+ sendReply(op);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x000000007fffffff)) "
+ "ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+VisitorOperationTest::testVisitWhenOneBucketCopyIsInvalid()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:2"));
+
+ document::BucketId id(16, 0);
+
+ addNodesToBucketDB(id, "0=100,1=0/0/1");
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(BUCKET_NOT_FOUND)"),
+ runEmptyVisitor(createVisitorCommand("incompletehandling",
+ id,
+ nullId)));
+}
+
+void
+VisitorOperationTest::testVisitingWhenAllBucketsAreInvalid()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:2"));
+
+ document::BucketId id(16, 0);
+
+ addNodesToBucketDB(id, "0=0/0/1,1=0/0/1");
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(BUCKET_NOT_FOUND)"),
+ runEmptyVisitor(createVisitorCommand("allincompletehandling",
+ id,
+ nullId)));
+}
+
+void
+VisitorOperationTest::testInconsistencyHandling()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:2"));
+
+ document::BucketId id(16, 0);
+
+ addNodesToBucketDB(id, "0=1/1/1,1=2/2/2");
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(BUCKET_NOT_FOUND)"),
+ runEmptyVisitor(createVisitorCommand("testinconsistencyhandling",
+ id,
+ nullId)));
+ _sender.clear();
+
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("multiplebucketsonesuper",
+ id,
+ nullId,
+ 8,
+ 500,
+ true),
+ VisitorOperation::Config(
+ framework::MilliSecTime(0),
+ 5,
+ 4));
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 1"),
+ _sender.getCommands(true));
+
+ sendReply(op);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x000000007fffffff)) "
+ "ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+VisitorOperationTest::testVisitIdealNode()
+{
+ ClusterState state("distributor:1 storage:3");
+ _distributor->enableClusterState(state);
+
+ // Create buckets in bucketdb
+ for (int i=0; i<32; i++ ) {
+ document::BucketId id(21, i*0x10000 + 0x0001);
+ addIdealNodes(state, id);
+ }
+
+ document::BucketId id(16, 1);
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("multinode",
+ id,
+ nullId,
+ 8),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorCommand(dumpvisitor, , 8 buckets) Buckets: [ "
+ "BucketId(0x5400000000000001) BucketId(0x5400000000100001) "
+ "BucketId(0x5400000000080001) BucketId(0x5400000000180001) "
+ "BucketId(0x5400000000040001) BucketId(0x5400000000140001) "
+ "BucketId(0x54000000000c0001) BucketId(0x54000000001c0001) ]"),
+ serializeVisitorCommand(0));
+
+ sendReply(op);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x54000000001c0001)) "
+ "ReturnCode(NONE)"),
+ _sender.getLastReply());
+}
+
+void
+VisitorOperationTest::testNoResendingOnCriticalFailure()
+{
+ ClusterState state("distributor:1 storage:3");
+ _distributor->enableClusterState(state);
+
+ // Create buckets in bucketdb
+ for (int i=0; i<32; i++ ) {
+ document::BucketId id(21, i*0x10000 + 0x0001);
+ addNodesToBucketDB(id, "0=1/1/1/t,1=1/1/1/t");
+ }
+
+ document::BucketId id(16, 1);
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("multinodefailurecritical",
+ id,
+ nullId,
+ 8),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ sendReply(op, -1, api::ReturnCode::ILLEGAL_PARAMETERS);
+
+ CPPUNIT_ASSERT_EQUAL(
+ "CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(ILLEGAL_PARAMETERS, [from content node 0] )"s,
+ _sender.getLastReply());
+}
+
+void
+VisitorOperationTest::testFailureOnAllNodes()
+{
+ ClusterState state("distributor:1 storage:3");
+ _distributor->enableClusterState(state);
+
+ // Create buckets in bucketdb
+ for (int i=0; i<32; i++ ) {
+ document::BucketId id(21, i*0x10000 + 0x0001);
+ addNodesToBucketDB(id, "0=1/1/1/t,1=1/1/1/t");
+ }
+
+ document::BucketId id(16, 1);
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand("multinodefailurecritical",
+ id,
+ nullId,
+ 8),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ sendReply(op, -1, api::ReturnCode::NOT_CONNECTED);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0,Visitor Create => 1"),
+ _sender.getCommands(true));
+
+ sendReply(op, -1, api::ReturnCode::NOT_CONNECTED);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(BUCKET_NOT_FOUND)"),
+ _sender.getLastReply());
+}
+
+
+void
+VisitorOperationTest::testVisitOrder()
+{
+ std::vector<document::BucketId> buckets;
+
+ document::BucketId id000(35, 0x0000004d2);
+ buckets.push_back(id000);
+ document::BucketId id001(35, 0x4000004d2);
+ buckets.push_back(id001);
+ document::BucketId id01(34, 0x2000004d2);
+ buckets.push_back(id01);
+ document::BucketId id1(33, 0x1000004d2);
+ buckets.push_back(id1);
+
+ std::sort(buckets.begin(),
+ buckets.end(),
+ VisitorOrder(document::OrderingSpecification(
+ document::OrderingSpecification::ASCENDING, 0x0, 6, 2)));
+
+ CPPUNIT_ASSERT_EQUAL(buckets[0], id000);
+ CPPUNIT_ASSERT_EQUAL(buckets[1], id001);
+ CPPUNIT_ASSERT_EQUAL(buckets[2], id01);
+ CPPUNIT_ASSERT_EQUAL(buckets[3], id1);
+
+ std::sort(buckets.begin(),
+ buckets.end(),
+ VisitorOrder(document::OrderingSpecification(
+ document::OrderingSpecification::DESCENDING, 0xFF, 6, 2)));
+ CPPUNIT_ASSERT_EQUAL(buckets[0], id1);
+ CPPUNIT_ASSERT_EQUAL(buckets[1], id01);
+ CPPUNIT_ASSERT_EQUAL(buckets[2], id001);
+ CPPUNIT_ASSERT_EQUAL(buckets[3], id000);
+
+ std::sort(buckets.begin(),
+ buckets.end(),
+ VisitorOrder(document::OrderingSpecification(
+ document::OrderingSpecification::ASCENDING, 0x14, 6, 2)));
+ CPPUNIT_ASSERT_EQUAL(buckets[0], id01);
+ CPPUNIT_ASSERT_EQUAL(buckets[1], id1);
+ CPPUNIT_ASSERT_EQUAL(buckets[2], id000);
+ CPPUNIT_ASSERT_EQUAL(buckets[3], id001);
+
+ std::sort(buckets.begin(),
+ buckets.end(),
+ VisitorOrder(document::OrderingSpecification(
+ document::OrderingSpecification::DESCENDING, 0x14, 6, 2)));
+ CPPUNIT_ASSERT_EQUAL(buckets[0], id01);
+ CPPUNIT_ASSERT_EQUAL(buckets[1], id001);
+ CPPUNIT_ASSERT_EQUAL(buckets[2], id000);
+ CPPUNIT_ASSERT_EQUAL(buckets[3], id1);
+}
+
+void
+VisitorOperationTest::testVisitInChunks()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+
+ for (int i = 0; i < 9; ++i) {
+ addNodesToBucketDB(document::BucketId(30, i << 16), "0=1/1/1/t");
+ }
+
+ document::BucketId id(16, 0);
+
+ std::pair<std::string, std::string> val(runVisitor(id, nullId, 3));
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "CreateVisitorCommand(dumpvisitor, true, 3 buckets) "
+ "Buckets: [ BucketId(0x7800000000000000) "
+ "BucketId(0x7800000000080000) "
+ "BucketId(0x7800000000040000) ]"),
+ val.first);
+
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "CreateVisitorReply(last=BucketId(0x7800000000040000)) "
+ "ReturnCode(NONE)"),
+ val.second);
+
+ val = runVisitor(id, document::BucketId(0x7800000000040000), 3);
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "CreateVisitorCommand(dumpvisitor, true, 3 buckets) "
+ "Buckets: [ BucketId(0x7800000000020000) "
+ "BucketId(0x7800000000060000) "
+ "BucketId(0x7800000000010000) ]"),
+ val.first);
+
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "CreateVisitorReply(last=BucketId(0x7800000000010000)) "
+ "ReturnCode(NONE)"),
+ val.second);
+
+ val = runVisitor(id, document::BucketId(0x7800000000010000), 3);
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "CreateVisitorCommand(dumpvisitor, true, 3 buckets) "
+ "Buckets: [ BucketId(0x7800000000050000) "
+ "BucketId(0x7800000000030000) "
+ "BucketId(0x7800000000070000) ]"),
+ val.first);
+
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "CreateVisitorReply(last=BucketId(0x000000007fffffff)) "
+ "ReturnCode(NONE)"),
+ val.second);
+}
+
+void
+VisitorOperationTest::testVisitOrderSplitPastOrderBits()
+{
+ std::vector<document::BucketId> buckets;
+
+ document::BucketId max(INT_MAX);
+ buckets.push_back(max);
+ document::BucketId id1(33, 0x1000004d2);
+ buckets.push_back(id1);
+ document::BucketId id01(34, 0x2000004d2);
+ buckets.push_back(id01);
+ document::BucketId id00001(37, 0x10000004d2);
+ buckets.push_back(id00001);
+ document::BucketId id00000(37, 0x00000004d2);
+ buckets.push_back(id00000);
+ document::BucketId id0000(36, 0x0000004d2);
+ buckets.push_back(id0000);
+ document::BucketId null(0, 0);
+ buckets.push_back(null);
+
+ std::sort(buckets.begin(), buckets.end(), VisitorOrder(document::OrderingSpecification(document::OrderingSpecification::ASCENDING, 0x0, 6, 2)));
+ CPPUNIT_ASSERT_EQUAL(buckets[0], null);
+ CPPUNIT_ASSERT_EQUAL(buckets[1], id0000);
+ CPPUNIT_ASSERT_EQUAL(buckets[2], id00000);
+ CPPUNIT_ASSERT_EQUAL(buckets[3], id00001);
+ CPPUNIT_ASSERT_EQUAL(buckets[4], id01);
+ CPPUNIT_ASSERT_EQUAL(buckets[5], id1);
+ CPPUNIT_ASSERT_EQUAL(buckets[6], max);
+
+ std::sort(buckets.begin(), buckets.end(), VisitorOrder(document::OrderingSpecification(document::OrderingSpecification::DESCENDING, 0xFF, 6, 2)));
+ CPPUNIT_ASSERT_EQUAL(buckets[0], null);
+ CPPUNIT_ASSERT_EQUAL(buckets[1], id1);
+ CPPUNIT_ASSERT_EQUAL(buckets[2], id01);
+ CPPUNIT_ASSERT_EQUAL(buckets[3], id0000);
+ CPPUNIT_ASSERT_EQUAL(buckets[4], id00000);
+ CPPUNIT_ASSERT_EQUAL(buckets[5], id00001);
+ CPPUNIT_ASSERT_EQUAL(buckets[6], max);
+
+ std::sort(buckets.begin(), buckets.end(), VisitorOrder(document::OrderingSpecification(document::OrderingSpecification::ASCENDING, 0x14, 6, 2)));
+ CPPUNIT_ASSERT_EQUAL(buckets[0], null);
+ CPPUNIT_ASSERT_EQUAL(buckets[1], id01);
+ CPPUNIT_ASSERT_EQUAL(buckets[2], id1);
+ CPPUNIT_ASSERT_EQUAL(buckets[3], id0000);
+ CPPUNIT_ASSERT_EQUAL(buckets[4], id00000);
+ CPPUNIT_ASSERT_EQUAL(buckets[5], id00001);
+ CPPUNIT_ASSERT_EQUAL(buckets[6], max);
+
+ std::sort(buckets.begin(), buckets.end(), VisitorOrder(document::OrderingSpecification(document::OrderingSpecification::DESCENDING, 0x14, 6, 2)));
+ CPPUNIT_ASSERT_EQUAL(buckets[0], null);
+ CPPUNIT_ASSERT_EQUAL(buckets[1], id01);
+ CPPUNIT_ASSERT_EQUAL(buckets[2], id0000);
+ CPPUNIT_ASSERT_EQUAL(buckets[3], id00000);
+ CPPUNIT_ASSERT_EQUAL(buckets[4], id00001);
+ CPPUNIT_ASSERT_EQUAL(buckets[5], id1);
+ CPPUNIT_ASSERT_EQUAL(buckets[6], max);
+}
+
+void
+VisitorOperationTest::testVisitOrderInconsistentlySplit()
+{
+ std::vector<document::BucketId> buckets;
+
+ document::BucketId max(INT_MAX);
+ buckets.push_back(max);
+ document::BucketId id000(35, 0x0000004d2);
+ buckets.push_back(id000);
+ document::BucketId id001(35, 0x4000004d2);
+ buckets.push_back(id001);
+ document::BucketId id01(34, 0x2000004d2);
+ buckets.push_back(id01);
+ document::BucketId id1(33, 0x1000004d2);
+ buckets.push_back(id1);
+ document::BucketId idsuper(16, 0x04d2);
+ buckets.push_back(idsuper);
+ document::BucketId null(0, 0);
+ buckets.push_back(null);
+
+ std::sort(buckets.begin(), buckets.end(), VisitorOrder(document::OrderingSpecification(document::OrderingSpecification::ASCENDING, 0x0, 6, 2)));
+ CPPUNIT_ASSERT_EQUAL(buckets[0], null);
+ CPPUNIT_ASSERT_EQUAL(buckets[1], idsuper);
+ CPPUNIT_ASSERT_EQUAL(buckets[2], id000);
+ CPPUNIT_ASSERT_EQUAL(buckets[3], id001);
+ CPPUNIT_ASSERT_EQUAL(buckets[4], id01);
+ CPPUNIT_ASSERT_EQUAL(buckets[5], id1);
+ CPPUNIT_ASSERT_EQUAL(buckets[6], max);
+
+ std::sort(buckets.begin(), buckets.end(), VisitorOrder(document::OrderingSpecification(document::OrderingSpecification::DESCENDING, 0xFF, 6, 2)));
+ CPPUNIT_ASSERT_EQUAL(buckets[0], null);
+ CPPUNIT_ASSERT_EQUAL(buckets[1], idsuper);
+ CPPUNIT_ASSERT_EQUAL(buckets[2], id1);
+ CPPUNIT_ASSERT_EQUAL(buckets[3], id01);
+ CPPUNIT_ASSERT_EQUAL(buckets[4], id001);
+ CPPUNIT_ASSERT_EQUAL(buckets[5], id000);
+ CPPUNIT_ASSERT_EQUAL(buckets[6], max);
+
+ std::sort(buckets.begin(), buckets.end(), VisitorOrder(document::OrderingSpecification(document::OrderingSpecification::ASCENDING, 0x14, 6, 2)));
+ CPPUNIT_ASSERT_EQUAL(buckets[0], null);
+ CPPUNIT_ASSERT_EQUAL(buckets[1], idsuper);
+ CPPUNIT_ASSERT_EQUAL(buckets[2], id01);
+ CPPUNIT_ASSERT_EQUAL(buckets[3], id1);
+ CPPUNIT_ASSERT_EQUAL(buckets[4], id000);
+ CPPUNIT_ASSERT_EQUAL(buckets[5], id001);
+ CPPUNIT_ASSERT_EQUAL(buckets[6], max);
+
+ std::sort(buckets.begin(), buckets.end(), VisitorOrder(document::OrderingSpecification(document::OrderingSpecification::DESCENDING, 0x14, 6, 2)));
+ CPPUNIT_ASSERT_EQUAL(buckets[0], null);
+ CPPUNIT_ASSERT_EQUAL(buckets[1], idsuper);
+ CPPUNIT_ASSERT_EQUAL(buckets[2], id01);
+ CPPUNIT_ASSERT_EQUAL(buckets[3], id001);
+ CPPUNIT_ASSERT_EQUAL(buckets[4], id000);
+ CPPUNIT_ASSERT_EQUAL(buckets[5], id1);
+ CPPUNIT_ASSERT_EQUAL(buckets[6], max);
+}
+
+std::string
+VisitorOperationTest::doOrderedVisitor(document::BucketId startBucket)
+{
+ std::vector<document::BucketId> buckets;
+
+ while (true) {
+ _sender.clear();
+
+ VisitorOperation op(getExternalOperationHandler(),
+ createVisitorCommand(
+ "uservisitororder",
+ startBucket,
+ buckets.size() ? buckets[buckets.size() - 1] :
+ nullId,
+ 1,
+ 500,
+ false,
+ false,
+ "dumpvisitor",
+ document::OrderingSpecification::DESCENDING,
+ "id.order(6,2)<= 20"),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ for (uint32_t i = 0; i < _sender.commands.size(); ++i) {
+ const api::CreateVisitorCommand cmd(
+ static_cast<const api::CreateVisitorCommand&>(
+ *_sender.commands[i]));
+
+ for (uint32_t j = 0; j < cmd.getBuckets().size(); ++j) {
+ buckets.push_back(cmd.getBuckets()[j]);
+ }
+ }
+
+ sendReply(op);
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)_sender.replies.size());
+
+ const api::CreateVisitorReply& reply(
+ static_cast<const api::CreateVisitorReply&>(*_sender.replies[0]));
+
+ if (reply.getLastBucket() == document::BucketId(0x000000007fffffff)) {
+ break;
+ }
+ }
+
+ std::ostringstream ost;
+ for (uint32_t i = 0; i < buckets.size(); ++i) {
+ ost << buckets[i] << "\n";
+ }
+
+ return ost.str();
+}
+
+void
+VisitorOperationTest::testUserVisitorOrder()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+
+ // Create buckets in bucketdb
+ std::vector<document::BucketId> buckets;
+ document::BucketId id000(35, 0x0000004d2);
+ buckets.push_back(id000);
+ document::BucketId id001(35, 0x4000004d2);
+ buckets.push_back(id001);
+ document::BucketId id01(34, 0x2000004d2);
+ buckets.push_back(id01);
+ document::BucketId id1(33, 0x1000004d2);
+ buckets.push_back(id1);
+
+ for (uint32_t i=0; i<buckets.size(); i++) {
+ addNodesToBucketDB(buckets[i], "0=1/1/1/t");
+ }
+
+ document::BucketId id(16, 0x04d2);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("BucketId(0x88000002000004d2)\n"
+ "BucketId(0x8c000004000004d2)\n"
+ "BucketId(0x8c000000000004d2)\n"
+ "BucketId(0x84000001000004d2)\n"),
+ doOrderedVisitor(id));
+}
+
+void
+VisitorOperationTest::testUserVisitorOrderSplitPastOrderBits()
+{
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:1"));
+
+ // Create buckets in bucketdb
+ std::vector<document::BucketId> buckets;
+ document::BucketId id1(33, 0x1000004d2);
+ buckets.push_back(id1);
+ document::BucketId id01(34, 0x2000004d2);
+ buckets.push_back(id01);
+ document::BucketId id00001(37, 0x10000004d2);
+ buckets.push_back(id00001);
+ document::BucketId id00000(37, 0x00000004d2);
+ buckets.push_back(id00000);
+ document::BucketId id0000(36, 0x0000004d2);
+ buckets.push_back(id0000);
+ for (uint32_t i=0; i<buckets.size(); i++) {
+ addNodesToBucketDB(buckets[i], "0=1/1/1/t");
+ }
+
+ document::BucketId id(16, 0x04d2);
+
+ CPPUNIT_ASSERT_EQUAL(std::string("BucketId(0x88000002000004d2)\n"
+ "BucketId(0x90000000000004d2)\n"
+ "BucketId(0x94000000000004d2)\n"
+ "BucketId(0x94000010000004d2)\n"
+ "BucketId(0x84000001000004d2)\n"),
+ doOrderedVisitor(id));
+}
+
+std::unique_ptr<VisitorOperation>
+VisitorOperationTest::startOperationWith2StorageNodeVisitors(bool inconsistent)
+{
+ ClusterState state("distributor:1 storage:3");
+ _distributor->enableClusterState(state);
+
+ addNodesToBucketDB(document::BucketId(17, 1), "0=1/1/1/t");
+ addNodesToBucketDB(document::BucketId(17, 1 << 16 | 1),
+ "1=1/1/1/t");
+
+ document::BucketId id(16, 1);
+ auto op = std::make_unique<VisitorOperation>(
+ getExternalOperationHandler(),
+ createVisitorCommand(
+ "multinodefailurecritical",
+ id,
+ nullId,
+ 8,
+ 500,
+ inconsistent),
+ defaultConfig);
+
+ op->start(_sender, framework::MilliSecTime(0));
+
+ CPPUNIT_ASSERT_EQUAL("Visitor Create => 0,Visitor Create => 1"s,
+ _sender.getCommands(true));
+ return op;
+}
+
+void
+VisitorOperationTest::testNoClientReplyBeforeAllStorageRepliesReceived()
+{
+ auto op = startOperationWith2StorageNodeVisitors(false);
+
+ sendReply(*op, 0, api::ReturnCode::BUSY);
+ // We don't want to see a reply here until the other node has replied.
+ CPPUNIT_ASSERT_EQUAL(""s, _sender.getReplies(true));
+ // OK reply from 1, but have to retry from client anyhow since one of
+ // the sub buckets failed to be processed and we don't have inconsistent
+ // visiting set in the client visitor command.
+ sendReply(*op, 1);
+ CPPUNIT_ASSERT_EQUAL(
+ "CreateVisitorReply(last=BucketId(0x0000000000000000)) "
+ "ReturnCode(BUCKET_NOT_FOUND)"s,
+ _sender.getLastReply());
+ // XXX we should consider wether we want BUSY to be returned instead.
+ // Non-critical error codes are currently converted to a generic "not found"
+ // code to let the client silently retry until the bucket has hopefully
+ // become consistent/available.
+}
+
+void
+VisitorOperationTest::testSkipFailedSubBucketsWhenVisitingInconsistent()
+{
+ auto op = startOperationWith2StorageNodeVisitors(true);
+
+ sendReply(*op, 0, api::ReturnCode::BUSY);
+ CPPUNIT_ASSERT_EQUAL(""s, _sender.getReplies(true));
+ // Subset of buckets could not be visited, but visit inconsistent flag is
+ // set in the client visitor so we treat it as a success anyway. In this
+ // case we've expanded the entire superbucket sub-tree so return with magic
+ // number to signify this.
+ sendReply(*op, 1);
+ CPPUNIT_ASSERT_EQUAL(
+ "CreateVisitorReply(last=BucketId(0x000000007fffffff)) "
+ "ReturnCode(NONE)"s,
+ _sender.getLastReply());
+}
+
+// By default, queue timeout should be half of remaining visitor time. This
+// is a highly un-scientific heuristic, but seems rather more reasonable than
+// having it hard-coded to 2000 ms as was the case earlier.
+void
+VisitorOperationTest::testQueueTimeoutIsFactorOfTotalTimeout()
+{
+ document::BucketId id(uint64_t(0x400000000000007b));
+ _distributor->enableClusterState(ClusterState("distributor:1 storage:2"));
+ addNodesToBucketDB(id, "0=1/1/1/t,1=1/1/1/t");
+
+ VisitorOperation op(
+ getExternalOperationHandler(),
+ createVisitorCommand("foo", id, nullId, 8, 10000),
+ defaultConfig);
+
+ op.start(_sender, framework::MilliSecTime(0));
+ CPPUNIT_ASSERT_EQUAL(std::string("Visitor Create => 0"),
+ _sender.getCommands(true));
+
+ auto& cmd(dynamic_cast<CreateVisitorCommand&>(*_sender.commands[0]));
+ CPPUNIT_ASSERT_EQUAL(uint32_t(5000), cmd.getQueueTimeout());
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/tests/fastos.project.newcore b/storage/src/tests/fastos.project.newcore
new file mode 100644
index 00000000000..7b5cad846b1
--- /dev/null
+++ b/storage/src/tests/fastos.project.newcore
@@ -0,0 +1,80 @@
+APPLICATION testrunner
+OBJS storageserver/dummystoragelink
+OBJS testrunner testhelper
+LIBS tests/persistence/memfile/testmemfiletop
+LIBS tests/serverapp/testserverapp
+LIBS tests/storageserver/teststorageserver
+LIBS tests/bucketmover/testbucketmover
+LIBS tests/storageutil/teststorageutil
+LIBS tests/visiting/testvisiting
+LIBS tests/bucketdb/testbucketdb
+LIBS tests/common/testcommon
+LIBS tests/common/hostreporter/testhostreporter
+LIBS tests/distributor/testdistributor
+LIBS tests/persistence/testpersistence
+LIBS tests/persistence/device/testdevice
+LIBS tests/persistence/filestorage/testfilestorage
+LIBS tests/persistence/filestorage/slotfile/testslotfile
+LIBS tests/splitting/testsplitting
+LIBS tests/memorymanager/testmemorymanager
+
+LIBS storage/storageserver/storageserver
+LIBS storage/bucketmover/bucketmover
+LIBS storage/visiting/visitor
+LIBS storage/memorymanager/memorymanager
+LIBS storage/persistence/persistence
+LIBS storage/persistence/filestorage/filestorpersistence
+LIBS storage/persistence/memfile/memfiletop
+LIBS storage/persistence/memfile/common/memfilecommon
+LIBS storage/persistence/memfile/mapper/memfilemapper
+LIBS storage/persistence/memfile/handler/memfilehandler
+LIBS storage/persistence/memfile/memfile/memfile
+LIBS storage/persistence/memfile/common/memfilecommon
+LIBS storage/persistence/memfile/memfiletop
+LIBS storage/storageutil/storageutil
+LIBS storage/persistence/device/device
+LIBS storage/persistence/filestorage/slotfile/slotfile
+LIBS storage/bucketdb/bucketdb
+LIBS storage/distributor/distributor
+LIBS storage/common/common
+LIBS storage/config/storageconfig
+EXTERNALLIBS cppunit vdslib storageapi
+EXTERNALLIBS document metrics boost_regex-mt-d
+EXTERNALLIBS fast iconv
+EXTERNALLIBS vespa
+EXTERNALLIBS config vespalog Judy vdslib documentapi vespalib
+EXTERNALLIBS messagebus-test slobrokserver
+
+CUSTOMMAKE
+
+LIBDIR_TESTS=persistence/memfile:bucketdb:common:distributor:persistence:persistence/device:persistence/filestorage:persistence/filestorage/slotfile:serverapp:storageserver:storageutil:visiting:splitting:memorymanager:bucketmover
+
+test: all
+ rm -f test.vlog
+ VESPA_LOG_TARGET=file:test.vlog LD_LIBRARY_PATH=$(LIBDIR_BOOST_REGEX-MT-D):$(LIBDIR_DSTORE):$(LIBDIR_ICONV):$(LIBDIR_CPPUNIT):$(LIBDIR_TESTS):$(LD_LIBRARY_PATH) $(VALGRIND) ./testrunner --verbose $(TESTRUNARGS)
+
+vtest: all
+ rm -f test.vlog
+ VESPA_LOG_TARGET=file:test.vlog LD_LIBRARY_PATH=$(LIBDIR_BOOST_REGEX-MT-D):$(LIBDIR_DSTORE):$(LIBDIR_ICONV):$(LIBDIR_CPPUNIT):$(LIBDIR_TESTS):$(LD_LIBRARY_PATH) valgrind --leak-check=no ./testrunner --verbose
+
+testdebug: all
+ rm -f test.vlog
+ VESPA_LOG_TARGET=file:test.vlog LD_LIBRARY_PATH=$(LIBDIR_BOOST_REGEX-MT-D):$(LIBDIR_DSTORE):$(LIBDIR_ICONV):$(LIBDIR_CPPUNIT):$(LIBDIR_TESTS):$(LD_LIBRARY_PATH) gdb53 ./testrunner --verbose
+
+testwithlog: all
+ LD_LIBRARY_PATH=$(LIBDIR_BOOST_REGEX-MT-D):$(LIBDIR_DSTORE):$(LIBDIR_ICONV):$(LIBDIR_CPPUNIT):$(LIBDIR_TESTS):$(LD_LIBRARY_PATH) ./testrunner --verbose
+
+vtestwithlog: all
+ LD_LIBRARY_PATH=$(LIBDIR_BOOST_REGEX-MT-D):$(LIBDIR_DSTORE):$(LIBDIR_ICONV):$(LIBDIR_CPPUNIT):$(LIBDIR_TESTS):$(LD_LIBRARY_PATH) valgrind ./testrunner --verbose
+
+stresstest: all
+ rm -f test.vlog
+ VESPA_LOG_TARGET=file:test.vlog LD_LIBRARY_PATH=$(LIBDIR_BOOST_REGEX-MT-D):$(LIBDIR_DSTORE):$(LIBDIR_ICONV):$(LIBDIR_CPPUNIT):$(LIBDIR_TESTS):$(LD_LIBRARY_PATH) ./testrunner --verbose --includestress stress Stress
+
+testverbose: all
+ rm -f test.vlog
+ VESPA_LOG_TARGET=file:test.vlog LD_LIBRARY_PATH=$(LIBDIR_BOOST_REGEX-MT-D):$(LIBDIR_DSTORE):$(LIBDIR_ICONV):$(LIBDIR_CPPUNIT):$(LIBDIR_TESTS):$(LD_LIBRARY_PATH) ./testrunner --verbose
+
+testall: all
+ rm -f test.vlog
+ VESPA_LOG_TARGET=file:test.vlog LD_LIBRARY_PATH=$(LIBDIR_BOOST_REGEX-MT-D):$(LIBDIR_DSTORE):$(LIBDIR_ICONV):$(LIBDIR_CPPUNIT):$(LIBDIR_TESTS):$(LD_LIBRARY_PATH) $(VALGRIND) ./testrunner --verbose --includestress
diff --git a/storage/src/tests/frameworkimpl/memory/CMakeLists.txt b/storage/src/tests/frameworkimpl/memory/CMakeLists.txt
new file mode 100644
index 00000000000..da78716459f
--- /dev/null
+++ b/storage/src/tests/frameworkimpl/memory/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_testmemory
+ SOURCES
+ memorystatusviewertest.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/tests/frameworkimpl/memory/memorystatusviewertest.cpp b/storage/src/tests/frameworkimpl/memory/memorystatusviewertest.cpp
new file mode 100644
index 00000000000..cc7e98d8718
--- /dev/null
+++ b/storage/src/tests/frameworkimpl/memory/memorystatusviewertest.cpp
@@ -0,0 +1,168 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/metrics/metrics.h>
+#include <vespa/storage/frameworkimpl/memory/memorystatusviewer.h>
+#include <vespa/storageframework/defaultimplementation/memory/prioritymemorylogic.h>
+#include <tests/common/teststorageapp.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+namespace storage {
+
+struct MemoryStatusViewerTest : public CppUnit::TestFixture
+{
+ static const int maxMemory = 1000;
+ std::unique_ptr<TestServiceLayerApp> _node;
+ std::unique_ptr<framework::defaultimplementation::MemoryManager> _memMan;
+
+ void setUp();
+
+ void testEmptyState();
+ void testSnapshots();
+
+ CPPUNIT_TEST_SUITE(MemoryStatusViewerTest);
+ CPPUNIT_TEST(testEmptyState);
+ CPPUNIT_TEST(testSnapshots);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(MemoryStatusViewerTest);
+
+void
+MemoryStatusViewerTest::setUp()
+{
+ _node.reset(new TestServiceLayerApp(DiskCount(2)));
+ framework::defaultimplementation::PriorityMemoryLogic* logic(
+ new framework::defaultimplementation::PriorityMemoryLogic(
+ _node->getClock(), maxMemory));
+ logic->setMinJumpToUpdateMax(1);
+ _memMan.reset(new framework::defaultimplementation::MemoryManager(
+ framework::defaultimplementation::AllocationLogic::UP(logic)));
+}
+
+void
+MemoryStatusViewerTest::testEmptyState()
+{
+ // Add a memory manager, and add a bit of load to it, so it's not
+ // totally empty.
+ StorageComponent component(_node->getComponentRegister(), "test");
+
+ metrics::MetricManager mm;
+ MemoryStatusViewer viewer(
+ *_memMan, mm, _node->getComponentRegister());
+ std::ostringstream actual;
+ viewer.reportStatus(actual, framework::HttpUrlPath("/"));
+ CPPUNIT_ASSERT_MATCH_REGEX(".*Plotr.LineChart.*", actual.str());
+ CPPUNIT_ASSERT_MATCH_REGEX(
+ ".*Current: 1970-01-01 00:00:00 Max memory 1000 SnapShot\\(Used 0, w/o cache 0\\).*",
+ actual.str());
+ CPPUNIT_ASSERT_MATCH_REGEX(
+ ".*Last hour: na.*", actual.str());
+}
+
+namespace {
+ void waitForProcessedTime(
+ const MemoryStatusViewer& viewer, framework::SecondTime time,
+ framework::SecondTime timeout = framework::SecondTime(30))
+ {
+ framework::defaultimplementation::RealClock clock;
+ framework::MilliSecTime endTime(
+ clock.getTimeInMillis() + timeout.getMillis());
+ framework::SecondTime processedTime(0);
+ while (clock.getTimeInMillis() < endTime) {
+ processedTime = viewer.getProcessedTime();
+ if (processedTime >= time) return;
+ FastOS_Thread::Sleep(1);
+ }
+ std::ostringstream ost;
+ ost << "Timed out waiting " << timeout << " ms for time " << time
+ << " to be processed. Currently time is only processed up to "
+ << processedTime;
+ throw new vespalib::IllegalStateException(ost.str(), VESPA_STRLOC);
+ }
+}
+
+#define ASSERT_MEMORY(output, period, maxmem, used, usedwocache) \
+{ \
+ std::string::size_type _pos1_(output.find(period)); \
+ std::string::size_type _pos2_(output.find("Max memory", _pos1_)); \
+ std::string::size_type _pos3_(output.find("SnapShot", _pos2_)); \
+ std::string _maxMemory_(output.substr(_pos2_ + 11, _pos3_ - _pos2_ - 12)); \
+ std::string::size_type _pos4_(output.find(",", _pos3_)); \
+ std::string _used_(output.substr(_pos3_ + 14, _pos4_ - _pos3_ - 14)); \
+ std::string::size_type _pos5_(output.find(")", _pos4_)); \
+ std::string _usedwo_(output.substr(_pos4_ + 12, _pos5_ - _pos4_ - 12)); \
+ std::ostringstream _failure_; \
+ _failure_ << "Wrong match in period " << period << " in output:\n" \
+ << output << "\nFor value: "; \
+ \
+ CPPUNIT_ASSERT_EQUAL_MSG(_failure_.str() + "Max memory", \
+ uint64_t(maxmem), boost::lexical_cast<uint64_t>(_maxMemory_)); \
+ CPPUNIT_ASSERT_EQUAL_MSG(_failure_.str() + "Used memory", \
+ uint64_t(used), boost::lexical_cast<uint64_t>(_used_)); \
+ CPPUNIT_ASSERT_EQUAL_MSG(_failure_.str() + "Used memory w/o cache", \
+ uint64_t(usedwocache), boost::lexical_cast<uint64_t>(_usedwo_)); \
+}
+
+void
+MemoryStatusViewerTest::testSnapshots()
+{
+ // Add a memory manager, and add a bit of load to it, so it's not
+ // totally empty.
+ StorageComponent component(_node->getComponentRegister(), "test");
+ const framework::MemoryAllocationType putAlloc(
+ component.getMemoryManager().registerAllocationType(
+ framework::MemoryAllocationType("PUT")));
+ const framework::MemoryAllocationType getAlloc(
+ component.getMemoryManager().registerAllocationType(
+ framework::MemoryAllocationType("GET")));
+
+ framework::MemoryToken::UP put = _memMan->allocate(putAlloc, 0, 100, 80);
+ framework::MemoryToken::UP get = _memMan->allocate(getAlloc, 30, 200, 50);
+ framework::MemoryToken::UP get2 = _memMan->allocate(getAlloc, 70, 150, 60);
+
+ metrics::MetricManager mm;
+ MemoryStatusViewer viewer(*_memMan, mm, _node->getComponentRegister());
+
+ _node->getClock().addSecondsToTime(1000);
+ viewer.notifyThread();
+ waitForProcessedTime(viewer, framework::SecondTime(1000));
+
+ std::ostringstream actual;
+ viewer.printDebugOutput(actual);
+ //std::cerr << actual.str() << "\n";
+ ASSERT_MEMORY(actual.str(), "Current", 1000, 450, 450);
+ ASSERT_MEMORY(actual.str(), "Last hour", 1000, 450, 450);
+ ASSERT_MEMORY(actual.str(), "Last ever", 1000, 450, 450);
+
+ put = _memMan->allocate(putAlloc, 0, 50, 80);
+ get = _memMan->allocate(getAlloc, 100, 140, 50);
+ get2 = _memMan->allocate(getAlloc, 20, 100, 70);
+
+ _node->getClock().addSecondsToTime(3600);
+ viewer.notifyThread();
+ waitForProcessedTime(viewer, framework::SecondTime(4600));
+
+ actual.str("");
+ viewer.printDebugOutput(actual);
+ //std::cerr << actual.str() << "\n";
+ ASSERT_MEMORY(actual.str(), "Current", 1000, 290, 290);
+ ASSERT_MEMORY(actual.str(), "Last hour", 1000, 540, 540);
+ ASSERT_MEMORY(actual.str(), "Last ever", 1000, 540, 540);
+
+ get.reset();
+
+ _node->getClock().addSecondsToTime(3600);
+ viewer.notifyThread();
+ waitForProcessedTime(viewer, framework::SecondTime(4600 + 3600));
+
+ actual.str("");
+ viewer.printDebugOutput(actual);
+ //std::cerr << actual.str() << "\n";
+ ASSERT_MEMORY(actual.str(), "Current", 1000, 150, 150);
+ ASSERT_MEMORY(actual.str(), "Last hour", 1000, 290, 290);
+ ASSERT_MEMORY(actual.str(), "Last ever", 1000, 540, 540);
+
+}
+
+} // storage
diff --git a/storage/src/tests/frameworkimpl/status/CMakeLists.txt b/storage/src/tests/frameworkimpl/status/CMakeLists.txt
new file mode 100644
index 00000000000..734be8e9998
--- /dev/null
+++ b/storage/src/tests/frameworkimpl/status/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_teststatus
+ SOURCES
+ statustest.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/tests/frameworkimpl/status/statustest.cpp b/storage/src/tests/frameworkimpl/status/statustest.cpp
new file mode 100644
index 00000000000..0fc10e411cb
--- /dev/null
+++ b/storage/src/tests/frameworkimpl/status/statustest.cpp
@@ -0,0 +1,222 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/document/util/stringutil.h>
+#include <vespa/log/log.h>
+#include <sstream>
+#include <vespa/storageframework/defaultimplementation/component/componentregisterimpl.h>
+#include <vespa/storage/frameworkimpl/status/statuswebserver.h>
+#include <vespa/storageframework/defaultimplementation/thread/threadpoolimpl.h>
+#include <tests/common/teststorageapp.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+LOG_SETUP(".test.status");
+
+namespace storage {
+
+struct StatusTest : public CppUnit::TestFixture {
+ std::unique_ptr<TestServiceLayerApp> _node;
+
+ void setUp();
+
+ void testIndexStatusPage();
+ void testHtmlStatus();
+ void testXmlStatus();
+ void test404();
+ void requireThatServerSpecIsConstructedCorrectly();
+
+ CPPUNIT_TEST_SUITE(StatusTest);
+ CPPUNIT_TEST(testIndexStatusPage);
+ CPPUNIT_TEST(testHtmlStatus);
+ CPPUNIT_TEST(testXmlStatus);
+ CPPUNIT_TEST(test404);
+ CPPUNIT_TEST(requireThatServerSpecIsConstructedCorrectly);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(StatusTest);
+
+namespace {
+ struct HtmlStatusReporter : public framework::HtmlStatusReporter {
+ std::string _headerAddition;
+ std::string _content;
+
+ HtmlStatusReporter(const std::string& id, const std::string& name,
+ const std::string& content,
+ const std::string& headerAddition = "")
+ : framework::HtmlStatusReporter(id, name),
+ _headerAddition(headerAddition),
+ _content(content)
+ {
+ }
+
+ virtual void reportHtmlHeaderAdditions(
+ std::ostream& out, const framework::HttpUrlPath&) const
+ {
+ out << _headerAddition;
+ }
+
+ virtual void reportHtmlStatus(
+ std::ostream& out, const framework::HttpUrlPath&) const
+ {
+ out << _content;
+ }
+ };
+
+ struct XmlStatusReporter : public framework::XmlStatusReporter {
+ XmlStatusReporter(const std::string& id, const std::string& name)
+ : framework::XmlStatusReporter(id, name) {}
+ virtual vespalib::string reportXmlStatus(
+ vespalib::xml::XmlOutputStream& xos,
+ const framework::HttpUrlPath&) const
+ {
+ xos << vespalib::xml::XmlTag("mytag")
+ << vespalib::xml::XmlAttribute("foo", "bar")
+ << vespalib::xml::XmlContent("content")
+ << vespalib::xml::XmlEndTag();
+ return "";
+ }
+ };
+
+ struct StatusComponent : public framework::Component {
+ framework::StatusReporter* _reporter;
+
+ StatusComponent(framework::ComponentRegister& reg, const char* name,
+ framework::StatusReporter* reporter)
+ : framework::Component(reg, name),
+ _reporter(reporter)
+ {
+ registerStatusPage(*_reporter);
+ }
+ ~StatusComponent() { delete _reporter; }
+ };
+
+}
+
+void
+StatusTest::setUp()
+{
+ _node.reset(new TestServiceLayerApp);
+}
+
+void
+StatusTest::testIndexStatusPage()
+{
+ StatusComponent rep1(_node->getComponentRegister(), "foo",
+ new HtmlStatusReporter(
+ "fooid", "Foo impl", "<p>info</p>"));
+ StatusComponent rep2(_node->getComponentRegister(), "bar",
+ new HtmlStatusReporter(
+ "barid", "Bar impl", "<p>info</p>"));
+ StatusWebServer webServer(_node->getComponentRegister(),
+ _node->getComponentRegister(),
+ "raw:httpport -1");
+ std::ostringstream ss;
+ framework::HttpUrlPath path("");
+ webServer.handlePage(path, ss);
+ std::string expected(
+ "HTTP\\/1.1 200 OK\r\n"
+ "Connection: Close\r\n"
+ "Content-type: text\\/html\r\n"
+ "\r\n"
+ "<html>\n"
+ "<head>\n"
+ " <title>Index page</title>\n"
+ "<\\/head>\n"
+ "<body>\n"
+ " <h1>Index page</h1>\n"
+ "<p><b>Binary version of Vespa:<\\/b> [0-9.]+<\\/p>\n"
+ "<a href=\"fooid\">Foo impl<\\/a><br>\n"
+ "<a href=\"barid\">Bar impl<\\/a><br>\n"
+ "<\\/body>\n"
+ "<\\/html>\n"
+ );
+ CPPUNIT_ASSERT_MATCH_REGEX(expected, ss.str());
+}
+
+void
+StatusTest::testHtmlStatus()
+{
+ StatusComponent rep1(_node->getComponentRegister(), "foo",
+ new HtmlStatusReporter(
+ "fooid", "Foo impl", "<p>info</p>", "<!-- script -->"));
+ StatusWebServer webServer(_node->getComponentRegister(),
+ _node->getComponentRegister(),
+ "raw:httpport -1");
+ std::ostringstream ost;
+ framework::HttpUrlPath path("/fooid?unusedParam");
+ webServer.handlePage(path, ost);
+ std::string expected(
+ "HTTP/1.1 200 OK\r\n"
+ "Connection: Close\r\n"
+ "Content-type: text/html\r\n"
+ "\r\n"
+ "<html>\n"
+ "<head>\n"
+ " <title>Foo impl</title>\n"
+ "<!-- script --></head>\n"
+ "<body>\n"
+ " <h1>Foo impl</h1>\n"
+ "<p>info</p></body>\n"
+ "</html>\n"
+ );
+ CPPUNIT_ASSERT_EQUAL(expected, ost.str());
+}
+
+void
+StatusTest::testXmlStatus()
+{
+ StatusComponent rep1(_node->getComponentRegister(), "foo",
+ new XmlStatusReporter(
+ "fooid", "Foo impl"));
+ StatusWebServer webServer(_node->getComponentRegister(),
+ _node->getComponentRegister(),
+ "raw:httpport -1");
+ std::ostringstream ost;
+ framework::HttpUrlPath path("/fooid?unusedParam");
+ webServer.handlePage(path, ost);
+ std::string expected(
+ "HTTP/1.1 200 OK\r\n"
+ "Connection: Close\r\n"
+ "Content-type: application/xml\r\n"
+ "\r\n"
+ "<?xml version=\"1.0\"?>\n"
+ "<status id=\"fooid\" name=\"Foo impl\">\n"
+ "<mytag foo=\"bar\">content</mytag>\n"
+ "</status>"
+ );
+ CPPUNIT_ASSERT_EQUAL(expected, ost.str());
+}
+
+void
+StatusTest::test404()
+{
+ StatusWebServer webServer(_node->getComponentRegister(),
+ _node->getComponentRegister(),
+ "raw:httpport -1");
+ std::ostringstream ost;
+ framework::HttpUrlPath path("/fooid?unusedParam");
+ webServer.handlePage(path, ost);
+ std::string expected(
+ "HTTP/1.1 404 Not found\r\n"
+ "Connection: Close\r\n"
+ "Content-type: text/html\r\n"
+ "\r\n"
+ "<html><head><title>404 Not found</title></head>\r\n"
+ "<body><h1>404 Not found</h1>\r\n"
+ "<p></p></body>\r\n"
+ "</html>\r\n"
+ );
+ CPPUNIT_ASSERT_EQUAL_ESCAPED(expected, ost.str());
+}
+
+void
+StatusTest::requireThatServerSpecIsConstructedCorrectly()
+{
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("requesthost:10"),
+ StatusWebServer::getServerSpec("requesthost:10", "serverhost:20"));
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("serverhost:20"),
+ StatusWebServer::getServerSpec("", "serverhost:20"));
+}
+
+} // storage
diff --git a/storage/src/tests/persistence/.gitignore b/storage/src/tests/persistence/.gitignore
new file mode 100644
index 00000000000..184e5d1c936
--- /dev/null
+++ b/storage/src/tests/persistence/.gitignore
@@ -0,0 +1,12 @@
+*.So
+*.lo
+*.o
+.*.swp
+.config.log
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
+testrunner
+testrunner.core
diff --git a/storage/src/tests/persistence/CMakeLists.txt b/storage/src/tests/persistence/CMakeLists.txt
new file mode 100644
index 00000000000..c065c3eef5b
--- /dev/null
+++ b/storage/src/tests/persistence/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_testpersistence
+ SOURCES
+ processalltest.cpp
+ persistencetestutils.cpp
+ splitbitdetectortest.cpp
+ legacyoperationhandlertest.cpp
+ persistenceproviderwrapper.cpp
+ diskmoveoperationhandlertest.cpp
+ providershutdownwrappertest.cpp
+ mergehandlertest.cpp
+ persistencethread_splittest.cpp
+ bucketownershipnotifiertest.cpp
+ persistencequeuetest.cpp
+ testandsettest.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/tests/persistence/bucketownershipnotifiertest.cpp b/storage/src/tests/persistence/bucketownershipnotifiertest.cpp
new file mode 100644
index 00000000000..ae54e629473
--- /dev/null
+++ b/storage/src/tests/persistence/bucketownershipnotifiertest.cpp
@@ -0,0 +1,162 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <tests/distributor/messagesenderstub.h>
+#include <tests/common/teststorageapp.h>
+#include <vespa/storage/persistence/bucketownershipnotifier.h>
+
+namespace storage {
+
+class BucketOwnershipNotifierTest : public CppUnit::TestFixture
+{
+ std::unique_ptr<TestServiceLayerApp> _app;
+ lib::ClusterState _clusterState;
+public:
+
+ BucketOwnershipNotifierTest()
+ : _app(),
+ _clusterState("distributor:2 storage:1")
+ {}
+
+ void setUp();
+
+ CPPUNIT_TEST_SUITE(BucketOwnershipNotifierTest);
+ CPPUNIT_TEST(testSendNotifyBucketChangeIfOwningDistributorChanged);
+ CPPUNIT_TEST(testDoNotSendNotifyBucketChangeIfBucketOwnedByInitialSender);
+ CPPUNIT_TEST(testIgnoreIdealStateCalculationExceptions);
+ CPPUNIT_TEST(testGuardNotifyAlways);
+ CPPUNIT_TEST_SUITE_END();
+
+ bool ownsBucket(uint16_t distributorIndex,
+ const document::BucketId& bucket) const
+ {
+ uint16_t distributor = _app->getDistribution()->getIdealDistributorNode(
+ _clusterState, bucket);
+ return distributor == distributorIndex;
+ }
+
+ document::BucketId getFirstNonOwnedBucket() {
+ for (int i = 0; i < 1000; ++i) {
+ if (!ownsBucket(0, document::BucketId(16, i))) {
+ return document::BucketId(16, i);
+ }
+ }
+ return document::BucketId(0);
+ }
+
+ document::BucketId getFirstOwnedBucket() {
+ for (int i = 0; i < 1000; ++i) {
+ if (ownsBucket(0, document::BucketId(16, i))) {
+ return document::BucketId(16, i);
+ }
+ }
+ return document::BucketId(0);
+ }
+
+
+ void testSendNotifyBucketChangeIfOwningDistributorChanged();
+ void testDoNotSendNotifyBucketChangeIfBucketOwnedByInitialSender();
+ void testIgnoreIdealStateCalculationExceptions();
+ void testGuardNotifyAlways();
+
+ void doTestNotification(const document::BucketId& bucket,
+ const api::BucketInfo& info,
+ const std::string& wantedSend);
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(BucketOwnershipNotifierTest);
+
+void
+BucketOwnershipNotifierTest::setUp()
+{
+ _app.reset(new TestServiceLayerApp);
+ _app->setDistribution(Redundancy(1), NodeCount(2));
+ _app->setClusterState(_clusterState);
+}
+
+void
+BucketOwnershipNotifierTest::doTestNotification(const document::BucketId& bucket,
+ const api::BucketInfo& info,
+ const std::string& wantedSend)
+{
+ ServiceLayerComponent component(_app->getComponentRegister(), "dummy");
+ MessageSenderStub sender;
+
+ BucketOwnershipNotifier notifier(component, sender);
+
+ notifier.notifyIfOwnershipChanged(bucket, 0, info);
+
+ CPPUNIT_ASSERT_EQUAL(wantedSend, sender.getCommands(true, true));
+}
+
+void
+BucketOwnershipNotifierTest::testSendNotifyBucketChangeIfOwningDistributorChanged()
+{
+ api::BucketInfo info(0x1, 2, 3);
+ document::BucketId bucket(getFirstNonOwnedBucket());
+ CPPUNIT_ASSERT(bucket.getRawId() != 0);
+
+ std::ostringstream wanted;
+ wanted << "NotifyBucketChangeCommand("
+ << bucket
+ << ", " << info
+ << ") => 1";
+
+ doTestNotification(bucket, info, wanted.str());
+}
+
+void
+BucketOwnershipNotifierTest::testDoNotSendNotifyBucketChangeIfBucketOwnedByInitialSender()
+{
+ api::BucketInfo info(0x1, 2, 3);
+ document::BucketId bucket(getFirstOwnedBucket());
+ CPPUNIT_ASSERT(bucket.getRawId() != 0);
+
+ doTestNotification(bucket, info, "");
+}
+
+void
+BucketOwnershipNotifierTest::testIgnoreIdealStateCalculationExceptions()
+{
+ api::BucketInfo info(0x1, 2, 3);
+ document::BucketId bucket(getFirstNonOwnedBucket());
+ CPPUNIT_ASSERT(bucket.getRawId() != 0);
+
+ _app->setClusterState(lib::ClusterState("distributor:0 storage:1"));
+
+ doTestNotification(bucket, info, "");
+}
+
+void
+BucketOwnershipNotifierTest::testGuardNotifyAlways()
+{
+ ServiceLayerComponent component(_app->getComponentRegister(), "dummy");
+ MessageSenderStub sender;
+ BucketOwnershipNotifier notifier(component, sender);
+ std::ostringstream wanted;
+ {
+ NotificationGuard guard(notifier);
+
+ api::BucketInfo info(0x1, 2, 3);
+ document::BucketId bucket1(getFirstOwnedBucket());
+ guard.notifyAlways(bucket1, info);
+
+ document::BucketId bucket2(getFirstNonOwnedBucket());
+ guard.notifyAlways(bucket2, info);
+
+ wanted << "NotifyBucketChangeCommand("
+ << bucket1
+ << ", " << info
+ << ") => 0,"
+ << "NotifyBucketChangeCommand("
+ << bucket2
+ << ", " << info
+ << ") => 1";
+ }
+
+ CPPUNIT_ASSERT_EQUAL(wanted.str(), sender.getCommands(true, true));
+}
+
+} // storage
+
diff --git a/storage/src/tests/persistence/diskmoveoperationhandlertest.cpp b/storage/src/tests/persistence/diskmoveoperationhandlertest.cpp
new file mode 100644
index 00000000000..f47cc334e30
--- /dev/null
+++ b/storage/src/tests/persistence/diskmoveoperationhandlertest.cpp
@@ -0,0 +1,57 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/persistence/diskmoveoperationhandler.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/persistence/messages.h>
+#include <tests/persistence/persistencetestutils.h>
+
+namespace storage {
+
+class DiskMoveOperationHandlerTest : public PersistenceTestUtils
+{
+ CPPUNIT_TEST_SUITE(DiskMoveOperationHandlerTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST_SUITE_END();
+
+public:
+ void testSimple();
+ void testTargetExists();
+ void testTargetWithOverlap();
+
+ void insertDocumentInBucket(uint64_t location, uint64_t timestamp, document::BucketId bucket);
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(DiskMoveOperationHandlerTest);
+
+void
+DiskMoveOperationHandlerTest::testSimple()
+{
+ setupDisks(10);
+
+ // Create bucket 16, 4 on disk 3.
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ createBucket(document::BucketId(16, 4)));
+ entry->disk = 3;
+ entry.write();
+ }
+
+ for (uint32_t i = 0; i < 10; i++) {
+ doPutOnDisk(3, 4, spi::Timestamp(1000 + i));
+ }
+
+ DiskMoveOperationHandler diskMoveHandler(
+ getEnv(3),
+ getPersistenceProvider());
+ BucketDiskMoveCommand move(document::BucketId(16, 4), 3, 4);
+
+ spi::Context context(documentapi::LoadType::DEFAULT, 0, 0);
+ diskMoveHandler.handleBucketDiskMove(move, context);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("BucketId(0x4000000000000004): 10,4"),
+ getBucketStatus(document::BucketId(16,4)));
+}
+
+}
diff --git a/storage/src/tests/persistence/filestorage/.gitignore b/storage/src/tests/persistence/filestorage/.gitignore
new file mode 100644
index 00000000000..cfeb99e9e3f
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/.gitignore
@@ -0,0 +1,13 @@
+*.So
+*.lo
+*.o
+.*.swp
+.config.log
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
+persistence
+testrunner
+testrunner.core
diff --git a/storage/src/tests/persistence/filestorage/CMakeLists.txt b/storage/src/tests/persistence/filestorage/CMakeLists.txt
new file mode 100644
index 00000000000..b1314ca0537
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_testfilestorage
+ SOURCES
+ filestormanagertest.cpp
+ operationabortingtest.cpp
+ filestortestfixture.cpp
+ mergeblockingtest.cpp
+ sanitycheckeddeletetest.cpp
+ deactivatebucketstest.cpp
+ modifiedbucketcheckertest.cpp
+ filestormodifiedbucketstest.cpp
+ deletebuckettest.cpp
+ singlebucketjointest.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/tests/persistence/filestorage/deactivatebucketstest.cpp b/storage/src/tests/persistence/filestorage/deactivatebucketstest.cpp
new file mode 100644
index 00000000000..6de67a3fec0
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/deactivatebucketstest.cpp
@@ -0,0 +1,66 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/state.h>
+#include <tests/persistence/persistenceproviderwrapper.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <tests/persistence/filestorage/filestortestfixture.h>
+
+namespace storage {
+
+class DeactivateBucketsTest : public FileStorTestFixture
+{
+ bool isActive(const document::BucketId&) const;
+public:
+ void bucketsInDatabaseDeactivatedWhenNodeDownInClusterState();
+
+ CPPUNIT_TEST_SUITE(DeactivateBucketsTest);
+ CPPUNIT_TEST(bucketsInDatabaseDeactivatedWhenNodeDownInClusterState);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(DeactivateBucketsTest);
+
+bool
+DeactivateBucketsTest::isActive(const document::BucketId& bucket) const
+{
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(bucket, "foo"));
+ CPPUNIT_ASSERT(entry.exist());
+ return entry->info.isActive();
+}
+
+void
+DeactivateBucketsTest::bucketsInDatabaseDeactivatedWhenNodeDownInClusterState()
+{
+ TestFileStorComponents c(*this, "bucketsInDatabaseDeactivatedWhenNodeDownInClusterState");
+ // Must set state to up first, or down-edge case won't trigger.
+ std::string upState("storage:2 distributor:2");
+ _node->getStateUpdater().setClusterState(
+ lib::ClusterState::CSP(new lib::ClusterState(upState)));
+
+ document::BucketId bucket(8, 123);
+ spi::Bucket spiBucket(bucket, spi::PartitionId(0));
+
+ createBucket(bucket);
+ api::BucketInfo serviceLayerInfo(1, 2, 3, 4, 5, true, true);
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(bucket, "foo",
+ StorBucketDatabase::CREATE_IF_NONEXISTING));
+ entry->disk = 0;
+ entry->info = serviceLayerInfo;
+ entry.write();
+ }
+ CPPUNIT_ASSERT(isActive(bucket));
+ std::string downState("storage:2 .1.s:d distributor:2");
+ _node->getStateUpdater().setClusterState(
+ lib::ClusterState::CSP(new lib::ClusterState(downState)));
+
+ // Buckets should have been deactivated in content layer
+ CPPUNIT_ASSERT(!isActive(bucket));
+}
+
+} // namespace storage
diff --git a/storage/src/tests/persistence/filestorage/deletebuckettest.cpp b/storage/src/tests/persistence/filestorage/deletebuckettest.cpp
new file mode 100644
index 00000000000..08ca9bc68fa
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/deletebuckettest.cpp
@@ -0,0 +1,63 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <tests/persistence/persistenceproviderwrapper.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <tests/persistence/filestorage/filestortestfixture.h>
+
+LOG_SETUP(".deletebuckettest");
+
+namespace storage {
+
+class DeleteBucketTest : public FileStorTestFixture
+{
+public:
+ void testDeleteAbortsOperationsForBucket();
+
+ CPPUNIT_TEST_SUITE(DeleteBucketTest);
+ CPPUNIT_TEST(testDeleteAbortsOperationsForBucket);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(DeleteBucketTest);
+
+void
+DeleteBucketTest::testDeleteAbortsOperationsForBucket()
+{
+ TestFileStorComponents c(*this, "testDeleteAbortsOperationsForBucket");
+ document::BucketId bucket(16, 1);
+
+ createBucket(bucket);
+ LOG(info, "TEST STAGE: taking resume guard");
+ ResumeGuard rg(c.manager->getFileStorHandler().pause());
+ // First put may or may not be queued, since pausing might race with
+ // an existing getNextMessage iteration (ugh...).
+ c.sendPut(bucket, DocumentIndex(0), PutTimestamp(1000));
+ // Put will be queued since thread now must know it's paused.
+ c.sendPut(bucket, DocumentIndex(1), PutTimestamp(1000));
+
+ auto deleteMsg = std::make_shared<api::DeleteBucketCommand>(bucket);
+ c.top.sendDown(deleteMsg);
+ // We should now have two put replies. The first one will either be OK
+ // or BUCKET_DELETED depending on whether it raced. The second (which is
+ // the one we care about since it's deterministic) must be BUCKET_DELETED.
+ // Problem is, their returned ordering is not deterministic so we're left
+ // with having to check that _at least_ 1 reply had BUCKET_DELETED. Joy!
+ c.top.waitForMessages(2, 60*2);
+ std::vector<api::StorageMessage::SP> msgs(c.top.getRepliesOnce());
+ CPPUNIT_ASSERT_EQUAL(size_t(2), msgs.size());
+ int numDeleted = 0;
+ for (uint32_t i = 0; i < 2; ++i) {
+ api::StorageReply& reply(dynamic_cast<api::StorageReply&>(*msgs[i]));
+ if (reply.getResult().getResult() == api::ReturnCode::BUCKET_DELETED) {
+ ++numDeleted;
+ }
+ }
+ CPPUNIT_ASSERT(numDeleted >= 1);
+ LOG(info, "TEST STAGE: done, releasing resume guard");
+}
+
+} // namespace storage
diff --git a/storage/src/tests/persistence/filestorage/filestormanagertest.cpp b/storage/src/tests/persistence/filestorage/filestormanagertest.cpp
new file mode 100644
index 00000000000..0ffbe9fa440
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/filestormanagertest.cpp
@@ -0,0 +1,3150 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/document/update/assignvalueupdate.h>
+#include <vespa/document/datatype/datatype.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/document/datatype/documenttype.h>
+#include <vespa/document/update/documentupdate.h>
+#include <vespa/document/fieldvalue/rawfieldvalue.h>
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/document/select/parser.h>
+#include <fstream>
+#include <memory>
+#include <atomic>
+#include <vespa/vdslib/state/random.h>
+#include <vespa/vdslib/container/mutabledocumentlist.h>
+#include <vespa/vdslib/container/operationlist.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/vespalib/io/fileutil.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/removelocation.h>
+#include <vespa/storage/bucketdb/bucketmanager.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storage/persistence/persistencethread.h>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+#include <vespa/storage/persistence/filestorage/modifiedbucketchecker.h>
+#include <tests/common/testhelper.h>
+#include <tests/common/storagelinktest.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/dummystoragelink.h>
+#include <tests/persistence/filestorage/forwardingmessagesender.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <vespa/log/log.h>
+#include <vespa/storageapi/message/batch.h>
+#include <vespa/storage/storageserver/statemanager.h>
+
+LOG_SETUP(".filestormanagertest");
+
+using std::unique_ptr;
+using document::Document;
+using namespace storage::api;
+
+#define ASSERT_SINGLE_REPLY(replytype, reply, link, time) \
+reply = 0; \
+try{ \
+ link.waitForMessages(1, time); \
+ CPPUNIT_ASSERT_EQUAL((size_t)1, link.getNumReplies()); \
+ reply = dynamic_cast<replytype*>(link.getReply(0).get()); \
+ if (reply == 0) { \
+ CPPUNIT_FAIL("Got reply of unexpected type: " \
+ + link.getReply(0)->getType().toString()); \
+ } \
+} catch (vespalib::Exception& e) { \
+ reply = 0; \
+ CPPUNIT_FAIL("Failed to find single reply in time"); \
+}
+
+namespace storage {
+
+namespace {
+ spi::LoadType defaultLoadType(0, "default");
+}
+
+struct FileStorManagerTest : public CppUnit::TestFixture {
+ enum {LONG_WAITTIME=60};
+ unique_ptr<TestServiceLayerApp> _node;
+ std::unique_ptr<vdstestlib::DirConfig> config;
+ std::unique_ptr<vdstestlib::DirConfig> config2;
+ std::unique_ptr<vdstestlib::DirConfig> smallConfig;
+ const uint32_t _waitTime;
+ const document::DocumentType* _testdoctype1;
+
+ FileStorManagerTest() : _node(), _waitTime(LONG_WAITTIME) {}
+
+ void setUp();
+ void tearDown();
+
+ void testPut();
+ void testHeaderOnlyPut();
+ void testFlush();
+ void testRemapSplit();
+ void testHandlerPriority();
+ void testHandlerPriorityBlocking();
+ void testHandlerPriorityPreempt();
+ void testHandlerMulti();
+ void testHandlerTimeout();
+ void testHandlerPause();
+ void testHandlerPausedMultiThread();
+ void testPriority();
+ void testSplit1();
+ void testSplitSingleGroup();
+ void testSplitEmptyTargetWithRemappedOps();
+ void testNotifyOnSplitSourceOwnershipChanged();
+ void testJoin();
+ void testVisiting();
+ void testRemoveLocation();
+ void testDeleteBucket();
+ void testDeleteBucketRejectOutdatedBucketInfo();
+ void testDeleteBucketWithInvalidBucketInfo();
+ void testNoTimestamps();
+ void testEqualTimestamps();
+ void testMultiOp();
+ void testGetIter();
+ void testSetBucketActiveState();
+ void testNotifyOwnerDistributorOnOutdatedSetBucketState();
+ void testGetBucketDiffImplicitCreateBucket();
+ void testMergeBucketImplicitCreateBucket();
+ void testNewlyCreatedBucketIsReady();
+ void testCreateBucketSetsActiveFlagInDatabaseAndReply();
+ void testFileStorThreadLockingStressTest();
+ void testStateChange();
+ void testRepairNotifiesDistributorOnChange();
+ void testDiskMove();
+
+ CPPUNIT_TEST_SUITE(FileStorManagerTest);
+ CPPUNIT_TEST(testPut);
+ CPPUNIT_TEST(testHeaderOnlyPut);
+ CPPUNIT_TEST(testFlush);
+ CPPUNIT_TEST(testRemapSplit);
+ CPPUNIT_TEST(testHandlerPriority);
+ CPPUNIT_TEST(testHandlerPriorityBlocking);
+ CPPUNIT_TEST(testHandlerPriorityPreempt);
+ CPPUNIT_TEST(testHandlerMulti);
+ CPPUNIT_TEST(testHandlerTimeout);
+ CPPUNIT_TEST(testHandlerPause);
+ CPPUNIT_TEST(testHandlerPausedMultiThread);
+ CPPUNIT_TEST(testPriority);
+ CPPUNIT_TEST(testSplit1);
+ CPPUNIT_TEST(testSplitSingleGroup);
+ CPPUNIT_TEST(testSplitEmptyTargetWithRemappedOps);
+ CPPUNIT_TEST(testNotifyOnSplitSourceOwnershipChanged);
+ CPPUNIT_TEST(testJoin);
+ CPPUNIT_TEST(testVisiting);
+ CPPUNIT_TEST(testRemoveLocation);
+ CPPUNIT_TEST(testDeleteBucket);
+ CPPUNIT_TEST(testDeleteBucketRejectOutdatedBucketInfo);
+ CPPUNIT_TEST(testDeleteBucketWithInvalidBucketInfo);
+ CPPUNIT_TEST(testNoTimestamps);
+ CPPUNIT_TEST(testEqualTimestamps);
+ CPPUNIT_TEST(testMultiOp);
+ CPPUNIT_TEST(testGetIter);
+ CPPUNIT_TEST(testSetBucketActiveState);
+ CPPUNIT_TEST(testNotifyOwnerDistributorOnOutdatedSetBucketState);
+ CPPUNIT_TEST(testGetBucketDiffImplicitCreateBucket);
+ CPPUNIT_TEST(testMergeBucketImplicitCreateBucket);
+ CPPUNIT_TEST(testNewlyCreatedBucketIsReady);
+ CPPUNIT_TEST(testCreateBucketSetsActiveFlagInDatabaseAndReply);
+ CPPUNIT_TEST(testStateChange);
+ CPPUNIT_TEST(testRepairNotifiesDistributorOnChange);
+ CPPUNIT_TEST(testDiskMove);
+ CPPUNIT_TEST_SUITE_END();
+
+ void createBucket(document::BucketId bid, uint16_t disk)
+ {
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ _node->getPersistenceProvider().createBucket(
+ spi::Bucket(bid, spi::PartitionId(disk)), context);
+
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(bid, "foo",
+ StorBucketDatabase::CREATE_IF_NONEXISTING));
+ entry->disk = disk;
+ entry->info = api::BucketInfo(0, 0, 0, 0, 0, true, false);
+ entry.write();
+ }
+
+ document::Document::UP createDocument(
+ const std::string& content, const std::string& id)
+ {
+ return _node->getTestDocMan().createDocument(content, id);
+ }
+
+ bool ownsBucket(uint16_t distributorIndex,
+ const document::BucketId& bucket) const
+ {
+ uint16_t distributor(
+ _node->getDistribution()->getIdealDistributorNode(
+ *_node->getStateUpdater().getSystemState(), bucket));
+ return distributor == distributorIndex;
+ }
+
+ document::BucketId getFirstBucketNotOwnedByDistributor(uint16_t distributor) {
+ for (int i = 0; i < 1000; ++i) {
+ if (!ownsBucket(distributor, document::BucketId(16, i))) {
+ return document::BucketId(16, i);
+ }
+ }
+ return document::BucketId(0);
+ }
+
+ spi::dummy::DummyPersistence& getDummyPersistence() {
+ return static_cast<spi::dummy::DummyPersistence&>
+ (_node->getPersistenceProvider());
+ }
+
+ void setClusterState(const std::string& state) {
+ _node->getStateUpdater().setClusterState(
+ lib::ClusterState::CSP(
+ new lib::ClusterState(state)));
+ }
+
+ void setupDisks(uint32_t diskCount) {
+ config.reset(new vdstestlib::DirConfig(getStandardConfig(true)));
+
+ config2.reset(new vdstestlib::DirConfig(*config));
+ config2->getConfig("stor-server").set("root_folder", "vdsroot.2");
+ config2->getConfig("stor-devices").set("root_folder", "vdsroot.2");
+ config2->getConfig("stor-server").set("node_index", "1");
+
+ smallConfig.reset(new vdstestlib::DirConfig(*config));
+ vdstestlib::DirConfig::Config& c(
+ smallConfig->getConfig("stor-filestor", true));
+ c.set("initial_index_read", "128");
+ c.set("use_direct_io", "false");
+ c.set("maximum_gap_to_read_through", "64");
+
+ assert(system("rm -rf vdsroot") == 0);
+ assert(system("rm -rf vdsroot.2") == 0);
+ assert(system("mkdir -p vdsroot/disks/d0") == 0);
+ assert(system("mkdir -p vdsroot.2/disks/d0") == 0);
+ try {
+ _node.reset(new TestServiceLayerApp(DiskCount(diskCount), NodeIndex(0),
+ config->getConfigId()));
+ _node->setupDummyPersistence();
+ } catch (config::InvalidConfigException& e) {
+ fprintf(stderr, "%s\n", e.what());
+ }
+ _testdoctype1 = _node->getTypeRepo()->getDocumentType("testdoctype1");
+ _node->getMemoryManager().registerAllocationType(
+ framework::MemoryAllocationType("VISITOR_BUFFER"));
+ }
+
+ void putDoc(DummyStorageLink& top,
+ FileStorHandler& filestorHandler,
+ const document::BucketId& bucket,
+ uint32_t docNum);
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(FileStorManagerTest);
+
+std::string findFile(const std::string& path, const std::string& file) {
+ FastOS_DirectoryScan dirScan(path.c_str());
+ while (dirScan.ReadNext()) {
+ if (dirScan.GetName()[0] == '.') {
+ // Ignore current and parent dir.. Ignores hidden files too, but
+ // that doesn't matter as we're not trying to find them.
+ continue;
+ }
+ std::string filename(dirScan.GetName());
+ if (dirScan.IsDirectory()) {
+ std::string result = findFile(path + "/" + filename, file);
+ if (result != "") {
+ return result;
+ }
+ }
+ if (filename == file) {
+ return path + "/" + filename;
+ }
+ }
+ return "";
+}
+
+bool fileExistsWithin(const std::string& path, const std::string& file) {
+ return !(findFile(path, file) == "");
+}
+
+std::unique_ptr<DiskThread> createThread(vdstestlib::DirConfig& config,
+ TestServiceLayerApp& node,
+ spi::PersistenceProvider& provider,
+ FileStorHandler& filestorHandler,
+ FileStorThreadMetrics& metrics,
+ uint16_t deviceIndex,
+ uint8_t lowestPriority)
+{
+ (void) config;
+ std::unique_ptr<DiskThread> disk;
+ disk.reset(new PersistenceThread(
+ node.getComponentRegister(), config.getConfigId(), provider,
+ filestorHandler, metrics,
+ deviceIndex, lowestPriority));
+ return disk;
+}
+
+namespace {
+
+struct TestFileStorComponents
+{
+private:
+ TestName _testName;
+public:
+ DummyStorageLink top;
+ FileStorManager* manager;
+
+ TestFileStorComponents(FileStorManagerTest& test, const char* testName)
+ : _testName(testName),
+ manager(new FileStorManager(test.config->getConfigId(),
+ test._node->getPartitions(),
+ test._node->getPersistenceProvider(),
+ test._node->getComponentRegister()))
+ {
+ top.push_back(unique_ptr<StorageLink>(manager));
+ top.open();
+ }
+};
+
+}
+
+void
+FileStorManagerTest::setUp()
+{
+ setupDisks(1);
+}
+
+void
+FileStorManagerTest::tearDown()
+{
+ _node.reset(0);
+}
+
+void
+FileStorManagerTest::testHeaderOnlyPut()
+{
+ TestName testName("testHeaderOnlyPut");
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager =
+ new FileStorManager(config->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ top.open();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 3);
+ // Creating a document to test with
+ Document::SP doc(createDocument(
+ "some content", "userdoc:crawler:4000:foo").release());
+
+ document::BucketId bid(16, 4000);
+
+ createBucket(bid, 0);
+
+ // Putting it
+ {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, 105));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ CPPUNIT_ASSERT_EQUAL(1, (int)reply->getBucketInfo().getDocumentCount());
+ }
+ doc->setValue(doc->getField("headerval"), document::IntFieldValue(42));
+ // Putting it again, this time with header only
+ {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, 124));
+ cmd->setUpdateTimestamp(105);
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode::OK, reply->getResult().getResult());
+ }
+ // Getting it
+ {
+ std::shared_ptr<api::GetCommand> cmd(new api::GetCommand(
+ bid, doc->getId(), "[all]"));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::GetReply> reply2(
+ std::dynamic_pointer_cast<api::GetReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply2.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply2->getResult());
+ CPPUNIT_ASSERT_EQUAL(doc->getId().toString(),
+ reply2->getDocumentId().toString());
+ // Ensure partial update was done, but other things are equal
+ document::FieldValue::UP value(
+ reply2->getDocument()->getValue(doc->getField("headerval")));
+ CPPUNIT_ASSERT(value.get());
+ CPPUNIT_ASSERT_EQUAL(42, dynamic_cast<document::IntFieldValue&>(
+ *value).getAsInt());
+ reply2->getDocument()->remove("headerval");
+ doc->remove("headerval");
+ CPPUNIT_ASSERT_EQUAL(*doc, *reply2->getDocument());
+ }
+}
+
+void
+FileStorManagerTest::testPut()
+{
+ TestName testName("testPut");
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager =
+ new FileStorManager(config->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ top.open();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 3);
+ // Creating a document to test with
+ Document::SP doc(createDocument(
+ "some content", "userdoc:crawler:4000:foo").release());
+
+ document::BucketId bid(16, 4000);
+
+ createBucket(bid, 0);
+
+ // Putting it
+ {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, 105));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ CPPUNIT_ASSERT_EQUAL(1, (int)reply->getBucketInfo().getDocumentCount());
+ }
+}
+
+void
+FileStorManagerTest::testDiskMove()
+{
+ setupDisks(2);
+
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager =
+ new FileStorManager(config->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ top.open();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 3);
+ // Creating a document to test with
+ Document::SP doc(createDocument(
+ "some content", "userdoc:crawler:4000:foo").release());
+
+ document::BucketId bid(16, 4000);
+
+ createBucket(bid, 0);
+
+ // Putting it
+ {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, 105));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ CPPUNIT_ASSERT_EQUAL(1, (int)reply->getBucketInfo().getDocumentCount());
+ }
+
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(bid, "foo"));
+
+ CPPUNIT_ASSERT_EQUAL(0, (int)entry->disk);
+ CPPUNIT_ASSERT_EQUAL(
+ vespalib::string(
+ "BucketInfo(crc 0x28cc441f, docCount 1, totDocSize 122, "
+ "ready true, active false)"),
+ entry->getBucketInfo().toString());
+ }
+
+ {
+ std::shared_ptr<BucketDiskMoveCommand> cmd(
+ new BucketDiskMoveCommand(bid, 0, 1));
+
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<BucketDiskMoveReply> reply(
+ std::dynamic_pointer_cast<BucketDiskMoveReply>(top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ CPPUNIT_ASSERT_EQUAL(1, (int)reply->getBucketInfo().getDocumentCount());
+ }
+
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(bid, "foo"));
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)entry->disk);
+ CPPUNIT_ASSERT_EQUAL(
+ vespalib::string(
+ "BucketInfo(crc 0x28cc441f, docCount 1, totDocSize 122, "
+ "ready true, active false)"),
+ entry->getBucketInfo().toString());
+ }
+}
+
+
+void
+FileStorManagerTest::testStateChange()
+{
+ TestName testName("testStateChange");
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager =
+ new FileStorManager(config->getConfigId(), _node->getPartitions(),
+ _node->getPersistenceProvider(),
+ _node->getComponentRegister())));
+ top.open();
+
+ setClusterState("storage:3 distributor:3");
+
+ CPPUNIT_ASSERT_EQUAL(true, getDummyPersistence().getClusterState().nodeUp());
+
+ setClusterState("storage:3 .0.s:d distributor:3");
+
+ CPPUNIT_ASSERT_EQUAL(false, getDummyPersistence().getClusterState().nodeUp());
+}
+
+void
+FileStorManagerTest::testRepairNotifiesDistributorOnChange()
+{
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager =
+ new FileStorManager(config->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ setClusterState("storage:1 distributor:1");
+ top.open();
+
+ createBucket(document::BucketId(16, 1), 0);
+
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 3);
+
+ // Creating a document to test with
+
+ for (uint32_t i = 0; i < 3; ++i) {
+ document::DocumentId docId(vespalib::make_string("userdoc:ns:1:%d", i));
+ Document::SP doc(new Document(*_testdoctype1, docId));
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(document::BucketId(16, 1), doc, i + 1));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ }
+
+ top.waitForMessages(3, _waitTime);
+ top.reset();
+
+ getDummyPersistence().simulateMaintenanceFailure();
+
+ std::shared_ptr<RepairBucketCommand> cmd(
+ new RepairBucketCommand(document::BucketId(16, 1), 0));
+ top.sendDown(cmd);
+
+ top.waitForMessages(2, _waitTime);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("NotifyBucketChangeCommand(BucketId(0x4000000000000001), "
+ "BucketInfo(crc 0x2625a314, docCount 2, totDocSize 170, "
+ "ready true, active false))"), top.getReply(0)->toString());
+
+ top.close();
+}
+
+
+void
+FileStorManagerTest::testFlush()
+{
+ TestName testName("testFlush");
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager = new FileStorManager(
+ config->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ top.open();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 3);
+ // Creating a document to test with
+
+ document::DocumentId docId("doc:crawler:http://www.ntnu.no/");
+ Document::SP doc(new Document(*_testdoctype1, docId));
+ document::BucketId bid(4000);
+
+ static const uint32_t msgCount = 10;
+
+ // Generating many put commands
+ std::vector<std::shared_ptr<api::StorageCommand> > _commands;
+ for (uint32_t i=0; i<msgCount; ++i) {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, i+1));
+ cmd->setAddress(address);
+ _commands.push_back(cmd);
+ }
+ for (uint32_t i=0; i<msgCount; ++i) {
+ top.sendDown(_commands[i]);
+ }
+ top.close();
+ top.flush();
+ CPPUNIT_ASSERT_EQUAL((size_t) msgCount, top.getNumReplies());
+}
+
+void
+FileStorManagerTest::testHandlerPriority()
+{
+ TestName testName("testHandlerPriority");
+ // Setup a filestorthread to test
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+ // Since we fake time with small numbers, we need to make sure we dont
+ // compact them away, as they will seem to be from 1970
+
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 1);
+
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 255, 0);
+ filestorHandler.setGetNextMessageTimeout(50);
+
+ std::string content("Here is some content which is in all documents");
+ std::ostringstream uri;
+
+ Document::SP doc(createDocument(
+ content, "userdoc:footype:1234:bar").release());
+
+ document::BucketIdFactory factory;
+ document::BucketId bucket(16, factory.getBucketId(
+ doc->getId()).getRawId());
+
+ // Populate bucket with the given data
+ for (uint32_t i = 1; i < 6; i++) {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket, doc, 100));
+ std::unique_ptr<api::StorageMessageAddress> address(
+ new api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 3));
+ cmd->setAddress(*address);
+ cmd->setPriority(i * 15);
+ filestorHandler.schedule(cmd, 0);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(15, (int)filestorHandler.getNextMessage(0, 20).second->getPriority());
+ CPPUNIT_ASSERT(filestorHandler.getNextMessage(0, 20).second.get() == NULL);
+ CPPUNIT_ASSERT_EQUAL(30, (int)filestorHandler.getNextMessage(0, 50).second->getPriority());
+ CPPUNIT_ASSERT_EQUAL(45, (int)filestorHandler.getNextMessage(0, 50).second->getPriority());
+ CPPUNIT_ASSERT(filestorHandler.getNextMessage(0, 50).second.get() == NULL);
+ CPPUNIT_ASSERT_EQUAL(60, (int)filestorHandler.getNextMessage(0, 255).second->getPriority());
+ CPPUNIT_ASSERT_EQUAL(75, (int)filestorHandler.getNextMessage(0, 255).second->getPriority());
+}
+
+class MessagePusherThread : public document::Runnable
+{
+public:
+ FileStorHandler& _handler;
+ Document::SP _doc;
+ bool _done;
+ bool _threadDone;
+
+ MessagePusherThread(FileStorHandler& handler, Document::SP doc)
+ : _handler(handler), _doc(doc), _done(false), _threadDone(false) {}
+
+ void run() {
+ while (!_done) {
+ document::BucketIdFactory factory;
+ document::BucketId bucket(16, factory.getBucketId(
+ _doc->getId()).getRawId());
+
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket, _doc, 100));
+ _handler.schedule(cmd, 0);
+ FastOS_Thread::Sleep(1);
+ }
+
+ _threadDone = true;
+ }
+};
+
+class MessageFetchingThread : public document::Runnable {
+public:
+ FileStorHandler& _handler;
+ std::atomic<uint32_t> _config;
+ uint32_t _fetchedCount;
+ bool _done;
+ bool _failed;
+ bool _threadDone;
+
+ MessageFetchingThread(FileStorHandler& handler)
+ : _handler(handler), _config(0), _fetchedCount(0), _done(false),
+ _failed(false), _threadDone(false) {}
+
+ void run() {
+ while (!_done) {
+ FileStorHandler::LockedMessage msg = _handler.getNextMessage(0, 255);
+ if (msg.second.get()) {
+ uint32_t originalConfig = _config.load();
+ _fetchedCount++;
+ FastOS_Thread::Sleep(5);
+
+ if (_config.load() != originalConfig) {
+ _failed = true;
+ }
+ } else {
+ FastOS_Thread::Sleep(1);
+ }
+ }
+
+ _threadDone = true;
+ };
+};
+
+void
+FileStorManagerTest::testHandlerPausedMultiThread()
+{
+ TestName testName("testHandlerPausedMultiThread");
+ // Setup a filestorthread to test
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+ // Since we fake time with small numbers, we need to make sure we dont
+ // compact them away, as they will seem to be from 1970
+
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 1);
+
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 255, 0);
+ filestorHandler.setGetNextMessageTimeout(50);
+
+ std::string content("Here is some content which is in all documents");
+ std::ostringstream uri;
+
+ Document::SP doc(createDocument(content, "userdoc:footype:1234:bar").release());
+
+ FastOS_ThreadPool pool(512 * 1024);
+ MessagePusherThread pushthread(filestorHandler, doc);
+ pushthread.start(pool);
+
+ MessageFetchingThread thread(filestorHandler);
+ thread.start(pool);
+
+ for (uint32_t i = 0; i < 50; ++i) {
+ FastOS_Thread::Sleep(2);
+ ResumeGuard guard = filestorHandler.pause();
+ thread._config.fetch_add(1);
+ uint32_t count = thread._fetchedCount;
+ CPPUNIT_ASSERT_EQUAL(count, thread._fetchedCount);
+ }
+
+ pushthread._done = true;
+ thread._done = true;
+ CPPUNIT_ASSERT(!thread._failed);
+
+ while (!pushthread._threadDone || !thread._threadDone) {
+ FastOS_Thread::Sleep(1);
+ }
+}
+
+
+void
+FileStorManagerTest::testHandlerPause()
+{
+ TestName testName("testHandlerPriority");
+ // Setup a filestorthread to test
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+ // Since we fake time with small numbers, we need to make sure we dont
+ // compact them away, as they will seem to be from 1970
+
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 1);
+
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 255, 0);
+ filestorHandler.setGetNextMessageTimeout(50);
+
+ std::string content("Here is some content which is in all documents");
+ std::ostringstream uri;
+
+ Document::SP doc(createDocument(content, "userdoc:footype:1234:bar").release());
+
+ document::BucketIdFactory factory;
+ document::BucketId bucket(16, factory.getBucketId(
+ doc->getId()).getRawId());
+
+ // Populate bucket with the given data
+ for (uint32_t i = 1; i < 6; i++) {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket, doc, 100));
+ std::unique_ptr<api::StorageMessageAddress> address(
+ new api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 3));
+ cmd->setAddress(*address);
+ cmd->setPriority(i * 15);
+ filestorHandler.schedule(cmd, 0);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(15, (int)filestorHandler.getNextMessage(0, 255).second->getPriority());
+
+ {
+ ResumeGuard guard = filestorHandler.pause();
+ (void)guard;
+ CPPUNIT_ASSERT(filestorHandler.getNextMessage(0, 255).second.get() == NULL);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(30, (int)filestorHandler.getNextMessage(0, 255).second->getPriority());
+}
+
+namespace {
+
+uint64_t getPutTime(api::StorageMessage::SP& msg)
+{
+ if (!msg.get()) {
+ return (uint64_t)-1;
+ }
+
+ return static_cast<api::PutCommand*>(msg.get())->getTimestamp();
+};
+
+}
+
+void
+FileStorManagerTest::testRemapSplit()
+{
+ TestName testName("testRemapSplit");
+ // Setup a filestorthread to test
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+ // Since we fake time with small numbers, we need to make sure we dont
+ // compact them away, as they will seem to be from 1970
+
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 1);
+
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 255, 0);
+ filestorHandler.setGetNextMessageTimeout(50);
+
+ std::string content("Here is some content which is in all documents");
+
+ Document::SP doc1(createDocument(content, "userdoc:footype:1234:bar").release());
+
+ Document::SP doc2(createDocument(content, "userdoc:footype:4567:bar").release());
+
+ document::BucketIdFactory factory;
+ document::BucketId bucket1(16, 1234);
+ document::BucketId bucket2(16, 4567);
+
+ // Populate bucket with the given data
+ for (uint32_t i = 1; i < 4; i++) {
+ filestorHandler.schedule(
+ api::StorageMessage::SP(new api::PutCommand(bucket1, doc1, i)), 0);
+ filestorHandler.schedule(
+ api::StorageMessage::SP(new api::PutCommand(bucket2, doc2, i + 10)), 0);
+ }
+
+ CPPUNIT_ASSERT_EQUAL(std::string("BucketId(0x40000000000004d2): Put(BucketId(0x40000000000004d2), userdoc:footype:1234:bar, timestamp 1, size 108) (priority: 127)\n"
+ "BucketId(0x40000000000011d7): Put(BucketId(0x40000000000011d7), userdoc:footype:4567:bar, timestamp 11, size 108) (priority: 127)\n"
+ "BucketId(0x40000000000004d2): Put(BucketId(0x40000000000004d2), userdoc:footype:1234:bar, timestamp 2, size 108) (priority: 127)\n"
+ "BucketId(0x40000000000011d7): Put(BucketId(0x40000000000011d7), userdoc:footype:4567:bar, timestamp 12, size 108) (priority: 127)\n"
+ "BucketId(0x40000000000004d2): Put(BucketId(0x40000000000004d2), userdoc:footype:1234:bar, timestamp 3, size 108) (priority: 127)\n"
+ "BucketId(0x40000000000011d7): Put(BucketId(0x40000000000011d7), userdoc:footype:4567:bar, timestamp 13, size 108) (priority: 127)\n"),
+ filestorHandler.dumpQueue(0));
+
+ FileStorHandler::RemapInfo a(document::BucketId(17, 1234), 0);
+ FileStorHandler::RemapInfo b(document::BucketId(17, 1234 | 1 << 16), 0);
+ filestorHandler.remapQueueAfterSplit(FileStorHandler::RemapInfo(bucket1, 0), a, b);
+
+ CPPUNIT_ASSERT(a.foundInQueue);
+ CPPUNIT_ASSERT(!b.foundInQueue);
+
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "BucketId(0x40000000000011d7): Put(BucketId(0x40000000000011d7), userdoc:footype:4567:bar, timestamp 11, size 108) (priority: 127)\n"
+ "BucketId(0x40000000000011d7): Put(BucketId(0x40000000000011d7), userdoc:footype:4567:bar, timestamp 12, size 108) (priority: 127)\n"
+ "BucketId(0x40000000000011d7): Put(BucketId(0x40000000000011d7), userdoc:footype:4567:bar, timestamp 13, size 108) (priority: 127)\n"
+ "BucketId(0x44000000000004d2): Put(BucketId(0x44000000000004d2), userdoc:footype:1234:bar, timestamp 1, size 108) (priority: 127)\n"
+ "BucketId(0x44000000000004d2): Put(BucketId(0x44000000000004d2), userdoc:footype:1234:bar, timestamp 2, size 108) (priority: 127)\n"
+ "BucketId(0x44000000000004d2): Put(BucketId(0x44000000000004d2), userdoc:footype:1234:bar, timestamp 3, size 108) (priority: 127)\n"),
+ filestorHandler.dumpQueue(0));
+
+}
+
+void
+FileStorManagerTest::testHandlerMulti()
+{
+ TestName testName("testHandlerMulti");
+ // Setup a filestorthread to test
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+ // Since we fake time with small numbers, we need to make sure we dont
+ // compact them away, as they will seem to be from 1970
+
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 1);
+
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 255, 0);
+ filestorHandler.setGetNextMessageTimeout(50);
+
+ std::string content("Here is some content which is in all documents");
+
+ Document::SP doc1(createDocument(content, "userdoc:footype:1234:bar").release());
+
+ Document::SP doc2(createDocument(content, "userdoc:footype:4567:bar").release());
+
+ document::BucketIdFactory factory;
+ document::BucketId bucket1(16, factory.getBucketId(
+ doc1->getId()).getRawId());
+ document::BucketId bucket2(16, factory.getBucketId(
+ doc2->getId()).getRawId());
+
+ // Populate bucket with the given data
+ for (uint32_t i = 1; i < 10; i++) {
+ filestorHandler.schedule(
+ api::StorageMessage::SP(new api::PutCommand(bucket1, doc1, i)), 0);
+ filestorHandler.schedule(
+ api::StorageMessage::SP(new api::PutCommand(bucket2, doc2, i + 10)), 0);
+ }
+
+ {
+ FileStorHandler::LockedMessage lock = filestorHandler.getNextMessage(0, 255);
+ CPPUNIT_ASSERT_EQUAL((uint64_t)1, getPutTime(lock.second));
+
+ lock = filestorHandler.getNextMessage(0, lock, 255);
+ CPPUNIT_ASSERT_EQUAL((uint64_t)2, getPutTime(lock.second));
+
+ lock = filestorHandler.getNextMessage(0, lock, 255);
+ CPPUNIT_ASSERT_EQUAL((uint64_t)3, getPutTime(lock.second));
+ }
+
+ {
+ FileStorHandler::LockedMessage lock = filestorHandler.getNextMessage(0, 255);
+ CPPUNIT_ASSERT_EQUAL((uint64_t)11, getPutTime(lock.second));
+
+ lock = filestorHandler.getNextMessage(0, lock, 255);
+ CPPUNIT_ASSERT_EQUAL((uint64_t)12, getPutTime(lock.second));
+ }
+}
+
+
+void
+FileStorManagerTest::testHandlerTimeout()
+{
+ TestName testName("testHandlerTimeout");
+ // Setup a filestorthread to test
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+
+ // Since we fake time with small numbers, we need to make sure we dont
+ // compact them away, as they will seem to be from 1970
+
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 1);
+
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 255, 0);
+ filestorHandler.setGetNextMessageTimeout(50);
+
+ std::string content("Here is some content which is in all documents");
+ std::ostringstream uri;
+
+ Document::SP doc(createDocument(content, "userdoc:footype:1234:bar").release());
+
+ document::BucketIdFactory factory;
+ document::BucketId bucket(16, factory.getBucketId(
+ doc->getId()).getRawId());
+
+ // Populate bucket with the given data
+ {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket, doc, 100));
+ std::unique_ptr<api::StorageMessageAddress> address(
+ new api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 3));
+ cmd->setAddress(*address);
+ cmd->setPriority(0);
+ cmd->setTimeout(50);
+ filestorHandler.schedule(cmd, 0);
+ }
+
+ {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket, doc, 100));
+ std::unique_ptr<api::StorageMessageAddress> address(
+ new api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 3));
+ cmd->setAddress(*address);
+ cmd->setPriority(200);
+ cmd->setTimeout(10000);
+ filestorHandler.schedule(cmd, 0);
+ }
+
+ FastOS_Thread::Sleep(51);
+ for (;;) {
+ auto lock = filestorHandler.getNextMessage(0, 255);
+ if (lock.first.get()) {
+ CPPUNIT_ASSERT_EQUAL(uint8_t(200), lock.second->getPriority());
+ break;
+ }
+ }
+
+ CPPUNIT_ASSERT_EQUAL(size_t(1), top.getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::TIMEOUT,
+ static_cast<api::StorageReply&>(*top.getReply(0))
+ .getResult().getResult());
+}
+
+void
+FileStorManagerTest::testHandlerPriorityBlocking()
+{
+ TestName testName("testHandlerPriorityBlocking");
+ // Setup a filestorthread to test
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+ // Since we fake time with small numbers, we need to make sure we dont
+ // compact them away, as they will seem to be from 1970
+
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 1);
+
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 21, 21);
+ filestorHandler.setGetNextMessageTimeout(50);
+
+ std::string content("Here is some content which is in all documents");
+ std::ostringstream uri;
+
+ document::BucketIdFactory factory;
+
+ // Populate bucket with the given data
+ for (uint32_t i = 1; i < 6; i++) {
+ Document::SP doc(createDocument(content, vespalib::make_string("doc:foo:%d",i)).release());
+ document::BucketId bucket(16, factory.getBucketId(
+ doc->getId()).getRawId());
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket, doc, 100));
+ std::unique_ptr<api::StorageMessageAddress> address(
+ new api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 3));
+ cmd->setAddress(*address);
+ cmd->setPriority(i * 15);
+ filestorHandler.schedule(cmd, 0);
+ }
+
+ {
+ FileStorHandler::LockedMessage lock1 = filestorHandler.getNextMessage(0, 20);
+ CPPUNIT_ASSERT_EQUAL(15, (int)lock1.second->getPriority());
+
+ LOG(debug, "Waiting for request that should time out");
+ FileStorHandler::LockedMessage lock2 = filestorHandler.getNextMessage(0, 30);
+ LOG(debug, "Got request that should time out");
+ CPPUNIT_ASSERT(lock2.second.get() == NULL);
+ }
+
+ {
+ FileStorHandler::LockedMessage lock1 = filestorHandler.getNextMessage(0, 40);
+ CPPUNIT_ASSERT_EQUAL(30, (int)lock1.second->getPriority());
+
+ // New high-pri message comes in
+ Document::SP doc(createDocument(content, vespalib::make_string("doc:foo:%d", 100)).release());
+ document::BucketId bucket(16, factory.getBucketId(
+ doc->getId()).getRawId());
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket, doc, 100));
+ std::unique_ptr<api::StorageMessageAddress> address(
+ new api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 3));
+ cmd->setAddress(*address);
+ cmd->setPriority(15);
+ filestorHandler.schedule(cmd, 0);
+
+ FileStorHandler::LockedMessage lock2 = filestorHandler.getNextMessage(0, 20);
+ CPPUNIT_ASSERT_EQUAL(15, (int)lock2.second->getPriority());
+
+ LOG(debug, "Waiting for request that should time out");
+ FileStorHandler::LockedMessage lock3 = filestorHandler.getNextMessage(0, 255);
+ LOG(debug, "Got request that should time out");
+ CPPUNIT_ASSERT(lock3.second.get() == NULL);
+ }
+
+ {
+ FileStorHandler::LockedMessage lock1 = filestorHandler.getNextMessage(0, 255);
+ CPPUNIT_ASSERT_EQUAL(45, (int)lock1.second->getPriority());
+
+ FileStorHandler::LockedMessage lock = filestorHandler.getNextMessage(0, 255);
+ CPPUNIT_ASSERT_EQUAL(60, (int)lock.second->getPriority());
+ }
+ LOG(debug, "Test done");
+}
+
+class PausedThread : public document::Runnable {
+private:
+ FileStorHandler& _handler;
+
+public:
+ bool pause;
+ bool done;
+ bool gotoperation;
+
+ PausedThread(FileStorHandler& handler)
+ : _handler(handler), pause(false), done(false), gotoperation(false) {}
+
+ void run() {
+ FileStorHandler::LockedMessage msg = _handler.getNextMessage(0, 255);
+ gotoperation = true;
+
+ while (!done) {
+ if (pause) {
+ _handler.pause(0, msg.second->getPriority());
+ pause = false;
+ }
+ FastOS_Thread::Sleep(10);
+ }
+
+ done = false;
+ };
+};
+
+void
+FileStorManagerTest::testHandlerPriorityPreempt()
+{
+ TestName testName("testHandlerPriorityPreempt");
+ // Setup a filestorthread to test
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+ // Since we fake time with small numbers, we need to make sure we dont
+ // compact them away, as they will seem to be from 1970
+
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 1);
+
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 21, 21);
+ filestorHandler.setGetNextMessageTimeout(50);
+
+ std::string content("Here is some content which is in all documents");
+ std::ostringstream uri;
+
+ document::BucketIdFactory factory;
+
+ {
+ Document::SP doc(createDocument(content, "doc:foo:1").release());
+ document::BucketId bucket(16, factory.getBucketId(
+ doc->getId()).getRawId());
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket, doc, 100));
+ std::unique_ptr<api::StorageMessageAddress> address(
+ new api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 3));
+ cmd->setAddress(*address);
+ cmd->setPriority(60);
+ filestorHandler.schedule(cmd, 0);
+ }
+
+ PausedThread thread(filestorHandler);
+ FastOS_ThreadPool pool(512 * 1024);
+ thread.start(pool);
+
+ while (!thread.gotoperation) {
+ FastOS_Thread::Sleep(10);
+ }
+
+ {
+ Document::SP doc(createDocument(content, "doc:foo:2").release());
+ document::BucketId bucket(16, factory.getBucketId(
+ doc->getId()).getRawId());
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket, doc, 100));
+ std::unique_ptr<api::StorageMessageAddress> address(
+ new api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 3));
+ cmd->setAddress(*address);
+ cmd->setPriority(20);
+ filestorHandler.schedule(cmd, 0);
+ }
+
+ {
+ FileStorHandler::LockedMessage lock1 = filestorHandler.getNextMessage(0, 20);
+ CPPUNIT_ASSERT_EQUAL(20, (int)lock1.second->getPriority());
+
+ thread.pause = true;
+
+ for (uint32_t i = 0; i < 10; i++) {
+ CPPUNIT_ASSERT(thread.pause);
+ FastOS_Thread::Sleep(100);
+ }
+ }
+
+ while (thread.pause) {
+ FastOS_Thread::Sleep(10);
+ }
+
+ thread.done = true;
+
+ while (thread.done) {
+ FastOS_Thread::Sleep(10);
+ }
+}
+
+void
+FileStorManagerTest::testPriority()
+{
+ TestName testName("testPriority");
+ // Setup a filestorthread to test
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+ // Since we fake time with small numbers, we need to make sure we dont
+ // compact them away, as they will seem to be from 1970
+
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 2);
+
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 255, 0);
+ std::unique_ptr<DiskThread> thread(createThread(
+ *config, *_node, _node->getPersistenceProvider(),
+ filestorHandler, *metrics.disks[0]->threads[0], 0, 25));
+ std::unique_ptr<DiskThread> thread2(createThread(
+ *config, *_node, _node->getPersistenceProvider(),
+ filestorHandler, *metrics.disks[0]->threads[1], 0, 255));
+
+ // Creating documents to test with. Different gids, 2 locations.
+ std::vector<document::Document::SP > documents;
+ for (uint32_t i=0; i<50; ++i) {
+ std::string content("Here is some content which is in all documents");
+ std::ostringstream uri;
+
+ uri << "userdoc:footype:" << (i % 3 == 0 ? 0x10001 : 0x0100001)
+ << ":mydoc-" << i;
+ Document::SP doc(createDocument(content, uri.str()).release());
+ documents.push_back(doc);
+ }
+
+ document::BucketIdFactory factory;
+
+ // Create buckets in separate, initial pass to avoid races with puts
+ for (uint32_t i=0; i<documents.size(); ++i) {
+ document::BucketId bucket(16, factory.getBucketId(
+ documents[i]->getId()).getRawId());
+
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+
+ _node->getPersistenceProvider().createBucket(
+ spi::Bucket(bucket, spi::PartitionId(0)), context);
+ }
+
+ // Populate bucket with the given data
+ for (uint32_t i=0; i<documents.size(); ++i) {
+ document::BucketId bucket(16, factory.getBucketId(
+ documents[i]->getId()).getRawId());
+
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket, documents[i], 100 + i));
+ std::unique_ptr<api::StorageMessageAddress> address(
+ new api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 3));
+ cmd->setAddress(*address);
+ cmd->setPriority(i * 2);
+ filestorHandler.schedule(cmd, 0);
+ }
+
+ filestorHandler.flush(true);
+
+ // Wait until everything is done.
+ int count = 0;
+ while (documents.size() != top.getNumReplies() && count < 1000) {
+ FastOS_Thread::Sleep(100);
+ count++;
+ }
+ CPPUNIT_ASSERT(count < 1000);
+
+ for (uint32_t i = 0; i < documents.size(); i++) {
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(i)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ }
+
+ // Verify that thread 1 gets documents over 50 pri
+ CPPUNIT_ASSERT_EQUAL(uint64_t(documents.size()),
+ metrics.disks[0]->threads[0]->operations.getValue()
+ + metrics.disks[0]->threads[1]->operations.getValue());
+ CPPUNIT_ASSERT(metrics.disks[0]->threads[0]->operations.getValue() <= 13);
+ // Closing file stor handler before threads are deleted, such that
+ // file stor threads getNextMessage calls returns.
+ filestorHandler.close();
+}
+
+void
+FileStorManagerTest::testSplit1()
+{
+ TestName testName("testSplit1");
+ // Setup a filestorthread to test
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ setClusterState("storage:2 distributor:1");
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 1);
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 255, 0);
+ std::unique_ptr<DiskThread> thread(createThread(
+ *config, *_node, _node->getPersistenceProvider(),
+ filestorHandler, *metrics.disks[0]->threads[0], 0, 255));
+ // Creating documents to test with. Different gids, 2 locations.
+ std::vector<document::Document::SP > documents;
+ for (uint32_t i=0; i<20; ++i) {
+ std::string content("Here is some content which is in all documents");
+ std::ostringstream uri;
+
+ uri << "userdoc:footype:" << (i % 3 == 0 ? 0x10001 : 0x0100001)
+ << ":mydoc-" << i;
+ Document::SP doc(createDocument(
+ content, uri.str()).release());
+ documents.push_back(doc);
+ }
+ document::BucketIdFactory factory;
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ {
+ // Populate bucket with the given data
+ for (uint32_t i=0; i<documents.size(); ++i) {
+ document::BucketId bucket(16, factory.getBucketId(
+ documents[i]->getId()).getRawId());
+
+ _node->getPersistenceProvider().createBucket(
+ spi::Bucket(bucket, spi::PartitionId(0)), context);
+
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket, documents[i], 100 + i));
+ std::unique_ptr<api::StorageMessageAddress> address(
+ new api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 3));
+ cmd->setAddress(*address);
+ cmd->setSourceIndex(0);
+
+ filestorHandler.schedule(cmd, 0);
+ filestorHandler.flush(true);
+ LOG(debug, "Got %" PRIu64 " replies", top.getNumReplies());
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ top.reset();
+
+ // Delete every 5th document to have delete entries in file too
+ if (i % 5 == 0) {
+ std::shared_ptr<api::RemoveCommand> rcmd(
+ new api::RemoveCommand(
+ bucket, documents[i]->getId(), 1000000 + 100 + i));
+ rcmd->setAddress(*address);
+ filestorHandler.schedule(rcmd, 0);
+ filestorHandler.flush(true);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::RemoveReply> rreply(
+ std::dynamic_pointer_cast<api::RemoveReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT_MSG(top.getReply(0)->getType().toString(),
+ rreply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ rreply->getResult());
+ top.reset();
+ }
+ }
+
+ // Perform a split, check that locations are split
+ {
+ std::shared_ptr<api::SplitBucketCommand> cmd(
+ new api::SplitBucketCommand(document::BucketId(16, 1)));
+ cmd->setSourceIndex(0);
+ filestorHandler.schedule(cmd, 0);
+ filestorHandler.flush(true);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::SplitBucketReply> reply(
+ std::dynamic_pointer_cast<api::SplitBucketReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ top.reset();
+ }
+
+ // Test that the documents have gotten into correct parts.
+ for (uint32_t i=0; i<documents.size(); ++i) {
+ document::BucketId bucket(
+ 17, i % 3 == 0 ? 0x10001 : 0x0100001);
+ std::shared_ptr<api::GetCommand> cmd(
+ new api::GetCommand(bucket, documents[i]->getId(), "[all]"));
+ api::StorageMessageAddress address(
+ "storage", lib::NodeType::STORAGE, 3);
+ cmd->setAddress(address);
+ filestorHandler.schedule(cmd, 0);
+ filestorHandler.flush(true);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::GetReply> reply(
+ std::dynamic_pointer_cast<api::GetReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(i % 5 != 0 ? true : false, reply->wasFound());
+ top.reset();
+ }
+
+ // Keep splitting location 1 until we gidsplit
+ for (int i=17; i<=32; ++i) {
+ std::shared_ptr<api::SplitBucketCommand> cmd(
+ new api::SplitBucketCommand(
+ document::BucketId(i, 0x0100001)));
+ cmd->setSourceIndex(0);
+ filestorHandler.schedule(cmd, 0);
+ filestorHandler.flush(true);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::SplitBucketReply> reply(
+ std::dynamic_pointer_cast<api::SplitBucketReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ top.reset();
+ }
+
+ // Test that the documents have gotten into correct parts.
+ for (uint32_t i=0; i<documents.size(); ++i) {
+ document::BucketId bucket;
+ if (i % 3 == 0) {
+ bucket = document::BucketId(17, 0x10001);
+ } else {
+ bucket = document::BucketId(33, factory.getBucketId(
+ documents[i]->getId()).getRawId());
+ }
+ std::shared_ptr<api::GetCommand> cmd(
+ new api::GetCommand(bucket, documents[i]->getId(), "[all]"));
+ api::StorageMessageAddress address(
+ "storage", lib::NodeType::STORAGE, 3);
+ cmd->setAddress(address);
+ filestorHandler.schedule(cmd, 0);
+ filestorHandler.flush(true);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::GetReply> reply(
+ std::dynamic_pointer_cast<api::GetReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(i % 5 != 0 ? true : false, reply->wasFound());
+ top.reset();
+ }
+ }
+ // Closing file stor handler before threads are deleted, such that
+ // file stor threads getNextMessage calls returns.
+ filestorHandler.close();
+}
+
+void
+FileStorManagerTest::testSplitSingleGroup()
+{
+ TestName testName("testSplitSingleGroup");
+ // Setup a filestorthread to test
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ setClusterState("storage:2 distributor:1");
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 1);
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 255, 0);
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ for (uint32_t j=0; j<1; ++j) {
+ // Test this twice, once where all the data ends up in file with
+ // splitbit set, and once where all the data ends up in file with
+ // splitbit unset
+ bool state = (j == 0);
+
+ std::unique_ptr<DiskThread> thread(createThread(
+ *config, *_node, _node->getPersistenceProvider(),
+ filestorHandler, *metrics.disks[0]->threads[0], 0, 255));
+ // Creating documents to test with. Different gids, 2 locations.
+ std::vector<document::Document::SP > documents;
+ for (uint32_t i=0; i<20; ++i) {
+ std::string content("Here is some content for all documents");
+ std::ostringstream uri;
+
+ uri << "userdoc:footype:" << (state ? 0x10001 : 0x0100001)
+ << ":mydoc-" << i;
+ Document::SP doc(createDocument(
+ content, uri.str()).release());
+ documents.push_back(doc);
+ }
+ document::BucketIdFactory factory;
+
+ // Populate bucket with the given data
+ for (uint32_t i=0; i<documents.size(); ++i) {
+ document::BucketId bucket(16, factory.getBucketId(
+ documents[i]->getId()).getRawId());
+
+ _node->getPersistenceProvider().createBucket(
+ spi::Bucket(bucket, spi::PartitionId(0)), context);
+
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket, documents[i], 100 + i));
+ api::StorageMessageAddress address(
+ "storage", lib::NodeType::STORAGE, 3);
+ cmd->setAddress(address);
+ filestorHandler.schedule(cmd, 0);
+ filestorHandler.flush(true);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ top.reset();
+ }
+ // Perform a split, check that locations are split
+ {
+ std::shared_ptr<api::SplitBucketCommand> cmd(
+ new api::SplitBucketCommand(document::BucketId(16, 1)));
+ cmd->setSourceIndex(0);
+ filestorHandler.schedule(cmd, 0);
+ filestorHandler.flush(true);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::SplitBucketReply> reply(
+ std::dynamic_pointer_cast<api::SplitBucketReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ top.reset();
+ }
+
+
+ // Test that the documents are all still there
+ for (uint32_t i=0; i<documents.size(); ++i) {
+ document::BucketId bucket(17, state ? 0x10001 : 0x00001);
+ std::shared_ptr<api::GetCommand> cmd(
+ new api::GetCommand(bucket, documents[i]->getId(), "[all]"));
+ api::StorageMessageAddress address(
+ "storage", lib::NodeType::STORAGE, 3);
+ cmd->setAddress(address);
+ filestorHandler.schedule(cmd, 0);
+ filestorHandler.flush(true);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::GetReply> reply(
+ std::dynamic_pointer_cast<api::GetReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ top.reset();
+ }
+ // Closing file stor handler before threads are deleted, such that
+ // file stor threads getNextMessage calls returns.
+ filestorHandler.close();
+ }
+}
+
+void
+FileStorManagerTest::putDoc(DummyStorageLink& top,
+ FileStorHandler& filestorHandler,
+ const document::BucketId& target,
+ uint32_t docNum)
+{
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 3);
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ document::BucketIdFactory factory;
+ document::DocumentId docId(vespalib::make_string("userdoc:ns:%zu:%d", target.getId(), docNum));
+ document::BucketId bucket(16, factory.getBucketId(docId).getRawId());
+ //std::cerr << "doc bucket is " << bucket << " vs source " << source << "\n";
+ _node->getPersistenceProvider().createBucket(
+ spi::Bucket(target, spi::PartitionId(0)), context);
+ Document::SP doc(new Document(*_testdoctype1, docId));
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(target, doc, docNum+1));
+ cmd->setAddress(address);
+ cmd->setPriority(120);
+ filestorHandler.schedule(cmd, 0);
+ filestorHandler.flush(true);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ top.reset();
+}
+
+void
+FileStorManagerTest::testSplitEmptyTargetWithRemappedOps()
+{
+ TestName testName("testSplitEmptyTargetWithRemappedOps");
+
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ setClusterState("storage:2 distributor:1");
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 1);
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 255, 0);
+ std::unique_ptr<DiskThread> thread(createThread(
+ *config, *_node, _node->getPersistenceProvider(),
+ filestorHandler, *metrics.disks[0]->threads[0], 0, 255));
+
+ document::BucketId source(16, 0x10001);
+
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 3);
+
+ for (uint32_t i=0; i<10; ++i) {
+ putDoc(top, filestorHandler, source, i);
+ }
+
+ // Send split followed by a put that is bound for a target bucket that
+ // will end up empty in the split itself. The split should notice this
+ // and create the bucket explicitly afterwards in order to compensate for
+ // the persistence provider deleting it internally.
+ // Make sure we block the operation queue until we've scheduled all
+ // the operations.
+ std::unique_ptr<ResumeGuard> resumeGuard(
+ new ResumeGuard(filestorHandler.pause()));
+
+ std::shared_ptr<api::SplitBucketCommand> splitCmd(
+ new api::SplitBucketCommand(source));
+ splitCmd->setPriority(120);
+ splitCmd->setSourceIndex(0);
+
+ document::DocumentId docId(
+ vespalib::make_string("userdoc:ns:%d:1234", 0x100001));
+ Document::SP doc(new Document(*_testdoctype1, docId));
+ std::shared_ptr<api::PutCommand> putCmd(
+ new api::PutCommand(source, doc, 1001));
+ putCmd->setAddress(address);
+ putCmd->setPriority(120);
+
+ filestorHandler.schedule(splitCmd, 0);
+ filestorHandler.schedule(putCmd, 0);
+ resumeGuard.reset(0); // Unpause
+ filestorHandler.flush(true);
+
+ top.waitForMessages(2, _waitTime);
+
+ CPPUNIT_ASSERT_EQUAL((size_t) 2, top.getNumReplies());
+ {
+ std::shared_ptr<api::SplitBucketReply> reply(
+ std::dynamic_pointer_cast<api::SplitBucketReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ }
+ {
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(1)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ }
+
+ top.reset();
+}
+
+void
+FileStorManagerTest::testNotifyOnSplitSourceOwnershipChanged()
+{
+ TestName testName("testSplit1");
+ // Setup a filestorthread to test
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(dummyManager = new DummyStorageLink));
+ setClusterState("storage:2 distributor:2");
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 1);
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 255, 0);
+ std::unique_ptr<DiskThread> thread(createThread(
+ *config, *_node, _node->getPersistenceProvider(),
+ filestorHandler, *metrics.disks[0]->threads[0], 0, 255));
+
+ document::BucketId source(getFirstBucketNotOwnedByDistributor(0));
+ createBucket(source, 0);
+ for (uint32_t i=0; i<10; ++i) {
+ putDoc(top, filestorHandler, source, i);
+ }
+
+ std::shared_ptr<api::SplitBucketCommand> splitCmd(
+ new api::SplitBucketCommand(source));
+ splitCmd->setPriority(120);
+ splitCmd->setSourceIndex(0); // Source not owned by this distributor.
+
+ filestorHandler.schedule(splitCmd, 0);
+ filestorHandler.flush(true);
+ top.waitForMessages(4, _waitTime); // 3 notify cmds + split reply
+
+ CPPUNIT_ASSERT_EQUAL(size_t(4), top.getNumReplies());
+ for (int i = 0; i < 3; ++i) {
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::NOTIFYBUCKETCHANGE,
+ top.getReply(i)->getType());
+ }
+
+ std::shared_ptr<api::SplitBucketReply> reply(
+ std::dynamic_pointer_cast<api::SplitBucketReply>(
+ top.getReply(3)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+}
+
+void
+FileStorManagerTest::testJoin()
+{
+ TestName testName("testJoin");
+ // Setup a filestorthread to test
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(), loadTypes.getMetricLoadTypes(), 1);
+ FileStorHandler filestorHandler(messageSender, metrics, _node->getPartitions(),
+ _node->getComponentRegister(), 255, 0);
+ std::unique_ptr<DiskThread> thread(createThread(
+ *config, *_node, _node->getPersistenceProvider(),
+ filestorHandler, *metrics.disks[0]->threads[0], 0, 255));
+ // Creating documents to test with. Different gids, 2 locations.
+ std::vector<document::Document::SP > documents;
+ for (uint32_t i=0; i<20; ++i) {
+ std::string content("Here is some content which is in all documents");
+ std::ostringstream uri;
+
+ uri << "userdoc:footype:" << (i % 3 == 0 ? 0x10001 : 0x0100001)
+ << ":mydoc-" << i;
+ Document::SP doc(createDocument(
+ content, uri.str()).release());
+ documents.push_back(doc);
+ }
+ document::BucketIdFactory factory;
+
+ createBucket(document::BucketId(17, 0x00001), 0);
+ createBucket(document::BucketId(17, 0x10001), 0);
+
+ {
+ // Populate bucket with the given data
+ for (uint32_t i=0; i<documents.size(); ++i) {
+ document::BucketId bucket(17, factory.getBucketId(
+ documents[i]->getId()).getRawId());
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bucket, documents[i], 100 + i));
+ std::unique_ptr<api::StorageMessageAddress> address(
+ new api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 3));
+ cmd->setAddress(*address);
+ filestorHandler.schedule(cmd, 0);
+ filestorHandler.flush(true);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ top.reset();
+ // Delete every 5th document to have delete entries in file too
+ if (i % 5 == 0) {
+ std::shared_ptr<api::RemoveCommand> rcmd(
+ new api::RemoveCommand(
+ bucket, documents[i]->getId(), 1000000 + 100 + i));
+ rcmd->setAddress(*address);
+ filestorHandler.schedule(rcmd, 0);
+ filestorHandler.flush(true);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::RemoveReply> rreply(
+ std::dynamic_pointer_cast<api::RemoveReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT_MSG(top.getReply(0)->getType().toString(),
+ rreply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ rreply->getResult());
+ top.reset();
+ }
+ }
+ LOG(debug, "Starting the actual join after populating data");
+ // Perform a join, check that other files are gone
+ {
+ std::shared_ptr<api::JoinBucketsCommand> cmd(
+ new api::JoinBucketsCommand(document::BucketId(16, 1)));
+ cmd->getSourceBuckets().push_back(document::BucketId(17, 0x00001));
+ cmd->getSourceBuckets().push_back(document::BucketId(17, 0x10001));
+ filestorHandler.schedule(cmd, 0);
+ filestorHandler.flush(true);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::JoinBucketsReply> reply(
+ std::dynamic_pointer_cast<api::JoinBucketsReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ top.reset();
+ }
+ // Test that the documents have gotten into the file.
+ for (uint32_t i=0; i<documents.size(); ++i) {
+ document::BucketId bucket(16, 1);
+ std::shared_ptr<api::GetCommand> cmd(
+ new api::GetCommand(bucket, documents[i]->getId(), "[all]"));
+ api::StorageMessageAddress address(
+ "storage", lib::NodeType::STORAGE, 3);
+ cmd->setAddress(address);
+ filestorHandler.schedule(cmd, 0);
+ filestorHandler.flush(true);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::GetReply> reply(
+ std::dynamic_pointer_cast<api::GetReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(i % 5 != 0 ? true : false, reply->wasFound());
+ top.reset();
+ }
+ }
+ // Closing file stor handler before threads are deleted, such that
+ // file stor threads getNextMessage calls returns.
+ filestorHandler.close();
+}
+
+namespace {
+
+spi::IteratorId
+createIterator(DummyStorageLink& link,
+ const document::BucketId& bucketId,
+ const std::string& docSel,
+ framework::MicroSecTime fromTime = framework::MicroSecTime(0),
+ framework::MicroSecTime toTime = framework::MicroSecTime::max(),
+ bool headerOnly = false)
+{
+ spi::Bucket bucket(bucketId, spi::PartitionId(0));
+
+ spi::Selection selection =
+ spi::Selection(spi::DocumentSelection(docSel));
+ selection.setFromTimestamp(spi::Timestamp(fromTime.getTime()));
+ selection.setToTimestamp(spi::Timestamp(toTime.getTime()));
+ CreateIteratorCommand::SP createIterCmd(
+ new CreateIteratorCommand(bucket,
+ selection,
+ headerOnly ? "[header]" : "[all]",
+ spi::NEWEST_DOCUMENT_ONLY));
+ link.sendDown(createIterCmd);
+ link.waitForMessages(1, FileStorManagerTest::LONG_WAITTIME);
+ CPPUNIT_ASSERT_EQUAL(size_t(1), link.getNumReplies());
+ std::shared_ptr<CreateIteratorReply> reply(
+ std::dynamic_pointer_cast<CreateIteratorReply>(
+ link.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ link.reset();
+ CPPUNIT_ASSERT(reply->getResult().success());
+ return reply->getIteratorId();
+}
+
+}
+
+void
+FileStorManagerTest::testVisiting()
+{
+ TestName testName("testVisiting");
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager = new FileStorManager(
+ smallConfig->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ top.open();
+ // Adding documents to two buckets which we are going to visit
+ // We want one bucket in one slotfile, and one bucket with a file split
+ uint32_t docCount = 50;
+ std::vector<document::BucketId> ids(2);
+ ids[0] = document::BucketId(16, 1);
+ ids[1] = document::BucketId(16, 2);
+
+ createBucket(ids[0], 0);
+ createBucket(ids[1], 0);
+
+ lib::RandomGen randomizer(523);
+ for (uint32_t i=0; i<docCount; ++i) {
+ std::string content("Here is some content which is in all documents");
+ std::ostringstream uri;
+
+ uri << "userdoc:crawler:" << (i < 3 ? 1 : 2) << ":"
+ << randomizer.nextUint32() << ".html";
+ Document::SP doc(createDocument(
+ content, uri.str()).release());
+ const document::DocumentType& type(doc->getType());
+ if (i < 30) {
+ doc->setValue(type.getField("hstringval"),
+ document::StringFieldValue("John Doe"));
+ } else {
+ doc->setValue(type.getField("hstringval"),
+ document::StringFieldValue("Jane Doe"));
+ }
+ std::shared_ptr<api::PutCommand> cmd(new api::PutCommand(
+ ids[i < 3 ? 0 : 1], doc, i+1));
+ top.sendDown(cmd);
+ }
+ top.waitForMessages(docCount, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) docCount, top.getNumReplies());
+ // Check nodestate with splitting
+ {
+ api::BucketInfo info;
+ for (uint32_t i=3; i<docCount; ++i) {
+ std::shared_ptr<api::BucketInfoReply> reply(
+ std::dynamic_pointer_cast<api::BucketInfoReply>(
+ top.getReply(i)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_MESSAGE(reply->getResult().toString(),
+ reply->getResult().success());
+
+ info = reply->getBucketInfo();
+ }
+ CPPUNIT_ASSERT_EQUAL(docCount-3, info.getDocumentCount());
+ }
+ top.reset();
+ // Visit bucket with no split, using no selection
+ {
+ framework::MemoryToken::UP token(
+ _node->getMemoryManager().allocate(
+ _node->getMemoryManager().getAllocationType(
+ "VISITOR_BUFFER"),
+ 16*1024,
+ 16*1024,
+ 127));
+ spi::IteratorId iterId(createIterator(top, ids[0], "true"));
+ std::shared_ptr<GetIterCommand> cmd(
+ new GetIterCommand(std::move(token), ids[0], iterId, 16*1024));
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<GetIterReply> reply(
+ std::dynamic_pointer_cast<GetIterReply>(top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ CPPUNIT_ASSERT_EQUAL(ids[0], reply->getBucketId());
+ CPPUNIT_ASSERT_EQUAL(size_t(3), reply->getEntries().size());
+ top.reset();
+ }
+ // Visit bucket with split, using selection
+ {
+ uint32_t totalDocs = 0;
+ spi::IteratorId iterId(
+ createIterator(top,
+ ids[1],
+ "testdoctype1.hstringval = \"John Doe\""));
+ while (true) {
+ framework::MemoryToken::UP token(
+ _node->getMemoryManager().allocate(
+ _node->getMemoryManager().getAllocationType(
+ "VISITOR_BUFFER"),
+ 16*1024,
+ 16*1024,
+ 127));
+ std::shared_ptr<GetIterCommand> cmd(
+ new GetIterCommand(std::move(token), ids[1], iterId, 16*1024));
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<GetIterReply> reply(
+ std::dynamic_pointer_cast<GetIterReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ CPPUNIT_ASSERT_EQUAL(ids[1], reply->getBucketId());
+ totalDocs += reply->getEntries().size();
+ top.reset();
+ if (reply->isCompleted()) {
+ break;
+ }
+ }
+ CPPUNIT_ASSERT_EQUAL(27u, totalDocs);
+ }
+ // Visit bucket with min and max timestamps set, headers only
+ {
+ document::BucketId bucket(16, 2);
+ spi::IteratorId iterId(
+ createIterator(top,
+ ids[1],
+ "",
+ framework::MicroSecTime(30),
+ framework::MicroSecTime(40),
+ true));
+ uint32_t totalDocs = 0;
+ while (true) {
+ framework::MemoryToken::UP token(
+ _node->getMemoryManager().allocate(
+ _node->getMemoryManager().getAllocationType(
+ "VISITOR_BUFFER"),
+ 16*1024,
+ 16*1024,
+ 127));
+ std::shared_ptr<GetIterCommand> cmd(
+ new GetIterCommand(std::move(token), ids[1], iterId, 16*1024));
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<GetIterReply> reply(
+ std::dynamic_pointer_cast<GetIterReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ CPPUNIT_ASSERT_EQUAL(bucket, reply->getBucketId());
+/* Header only is a VDS-specific thing.
+
+ for (size_t i = 0; i < reply->getEntries().size(); ++i) {
+ CPPUNIT_ASSERT(reply->getEntries()[i]->getDocument()
+ ->getBody().empty());
+ }
+*/
+ totalDocs += reply->getEntries().size();
+ top.reset();
+ if (reply->isCompleted()) {
+ break;
+ }
+ }
+ CPPUNIT_ASSERT_EQUAL(11u, totalDocs);
+ }
+
+}
+
+void
+FileStorManagerTest::testRemoveLocation()
+{
+ TestName testName("testRemoveLocation");
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager =
+ new FileStorManager(config->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ top.open();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 3);
+ document::BucketId bid(8, 0);
+
+ createBucket(bid, 0);
+
+ // Adding some documents to be removed later
+ for (uint32_t i=0; i<=10; ++i) {
+ std::ostringstream docid;
+ docid << "userdoc:ns:" << (i << 8) << ":foo";
+ Document::SP doc(createDocument(
+ "some content", docid.str()).release());
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, 1000 + i));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ CPPUNIT_ASSERT_EQUAL(i + 1u, reply->getBucketInfo().getDocumentCount());
+ }
+ // Issuing remove location command
+ {
+ std::shared_ptr<api::RemoveLocationCommand> cmd(
+ new api::RemoveLocationCommand("id.user % 512 == 0", bid));
+ //new api::RemoveLocationCommand("id.user == 1", bid));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::RemoveLocationReply> reply(
+ std::dynamic_pointer_cast<api::RemoveLocationReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ CPPUNIT_ASSERT_EQUAL(5u, reply->getBucketInfo().getDocumentCount());
+ }
+}
+
+void FileStorManagerTest::testDeleteBucket()
+{
+ TestName testName("testDeleteBucket");
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager = new FileStorManager(
+ config->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ top.open();
+ api::StorageMessageAddress address(
+ "storage", lib::NodeType::STORAGE, 2);
+ // Creating a document to test with
+ document::DocumentId docId("userdoc:crawler:4000:http://www.ntnu.no/");
+ Document::SP doc(new Document(*_testdoctype1, docId));
+ document::BucketId bid(16, 4000);
+
+ createBucket(bid, 0);
+
+ api::BucketInfo bucketInfo;
+ // Putting it
+ {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, 105));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)reply->getBucketInfo().getDocumentCount());
+ bucketInfo = reply->getBucketInfo();
+ top.reset();
+ }
+
+ // Delete bucket
+ {
+ std::shared_ptr<api::DeleteBucketCommand> cmd(
+ new api::DeleteBucketCommand(bid));
+ cmd->setAddress(address);
+ cmd->setBucketInfo(bucketInfo);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::DeleteBucketReply> reply(
+ std::dynamic_pointer_cast<api::DeleteBucketReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ }
+}
+
+void
+FileStorManagerTest::testDeleteBucketRejectOutdatedBucketInfo()
+{
+ TestName testName("testDeleteBucketRejectOutdatedBucketInfo");
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager = new FileStorManager(
+ config->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ top.open();
+ api::StorageMessageAddress address(
+ "storage", lib::NodeType::STORAGE, 2);
+ // Creating a document to test with
+ document::DocumentId docId("userdoc:crawler:4000:http://www.ntnu.no/");
+ Document::SP doc(new Document(*_testdoctype1, docId));
+ document::BucketId bid(16, 4000);
+
+ createBucket(bid, 0);
+
+ api::BucketInfo bucketInfo;
+
+ // Putting it
+ {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, 105));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)reply->getBucketInfo().getDocumentCount());
+ bucketInfo = reply->getBucketInfo();
+ top.reset();
+ }
+
+ // Attempt to delete bucket, but with non-matching bucketinfo
+ {
+ std::shared_ptr<api::DeleteBucketCommand> cmd(
+ new api::DeleteBucketCommand(bid));
+ cmd->setBucketInfo(BucketInfo(0xf000baaa, 1, 123, 1, 456));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::DeleteBucketReply> reply(
+ std::dynamic_pointer_cast<api::DeleteBucketReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(
+ ReturnCode::REJECTED,
+ reply->getResult().getResult());
+ CPPUNIT_ASSERT_EQUAL(bucketInfo, reply->getBucketInfo());
+ }
+}
+
+/**
+ * Test that receiving a DeleteBucketCommand with invalid
+ * BucketInfo deletes the bucket and does not fail the operation.
+ */
+void
+FileStorManagerTest::testDeleteBucketWithInvalidBucketInfo()
+{
+ TestName testName("testDeleteBucketWithInvalidBucketInfo");
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager = new FileStorManager(
+ config->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ top.open();
+ api::StorageMessageAddress address(
+ "storage", lib::NodeType::STORAGE, 2);
+ // Creating a document to test with
+ document::DocumentId docId("userdoc:crawler:4000:http://www.ntnu.no/");
+ Document::SP doc(new Document(*_testdoctype1, docId));
+ document::BucketId bid(16, 4000);
+
+ createBucket(bid, 0);
+
+ // Putting it
+ {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, 105));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ CPPUNIT_ASSERT_EQUAL(1, (int)reply->getBucketInfo().getDocumentCount());
+ top.reset();
+ }
+
+ // Attempt to delete bucket with invalid bucketinfo
+ {
+ std::shared_ptr<api::DeleteBucketCommand> cmd(
+ new api::DeleteBucketCommand(bid));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::DeleteBucketReply> reply(
+ std::dynamic_pointer_cast<api::DeleteBucketReply>(
+ top.getReply(0)));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(
+ ReturnCode::OK,
+ reply->getResult().getResult());
+ CPPUNIT_ASSERT_EQUAL(api::BucketInfo(), reply->getBucketInfo());
+ }
+}
+
+namespace {
+
+ /**
+ * Utility storage link, sending data to the given links instead of through
+ * a regular chain.
+ */
+ struct MidLink : public StorageLink {
+ StorageLink& _up;
+
+ public:
+ MidLink(std::unique_ptr<StorageLink> down, StorageLink& up)
+ : StorageLink("MidLink"), _up(up)
+ {
+ push_back(std::move(down));
+ }
+ ~MidLink() {
+ closeNextLink();
+ }
+
+ virtual void print(std::ostream& out, bool, const std::string&) const
+ { out << "MidLink"; }
+
+ virtual bool onUp(const std::shared_ptr<api::StorageMessage> & msg) {
+ if (!StorageLinkTest::callOnUp(_up, msg)) _up.sendUp(msg);
+ return true;
+ }
+
+ };
+
+ /**
+ * Utility class, connecting two storage links below it, sending
+ * messages coming up from one down the other (providing address is set
+ * correctly.)
+ */
+ class BinaryStorageLink : public DummyStorageLink {
+ vespalib::Lock _lock;
+ std::set<api::StorageMessage::Id> _seen;
+ MidLink _left;
+ MidLink _right;
+ uint16_t _leftAddr;
+ uint16_t _rightAddr;
+
+ public:
+ BinaryStorageLink(uint16_t leftAddr, std::unique_ptr<StorageLink> left,
+ uint16_t rightAddr, std::unique_ptr<StorageLink> right)
+ : _left(std::move(left), *this),
+ _right(std::move(right), *this),
+ _leftAddr(leftAddr),
+ _rightAddr(rightAddr) {}
+
+ virtual void print(std::ostream& out, bool, const std::string&) const
+ { out << "BinaryStorageLink"; }
+
+ virtual bool onDown(const std::shared_ptr<api::StorageMessage> & msg) {
+// LOG(debug, "onDown Received msg: ->%s, %s %llu\n", msg->getAddress() ? msg->getAddress()->toString().c_str() : "(null)", msg->toString().c_str(), msg->getMsgId());
+
+ vespalib::LockGuard lock(_lock);
+ _seen.insert(msg->getMsgId());
+ return sendOn(msg);
+ }
+
+ bool sendOn(const std::shared_ptr<api::StorageMessage> & msg) {
+ if (msg->getAddress()) {
+ uint16_t address = msg->getAddress()->getIndex();
+ if ((address == _leftAddr && !msg->getType().isReply()) ||
+ (address == _rightAddr && msg->getType().isReply()))
+ {
+ if (!StorageLinkTest::callOnDown(_left, msg)) {
+ _left.sendDown(msg);
+ }
+ } else if ((address == _rightAddr && !msg->getType().isReply()) ||
+ (address == _leftAddr && msg->getType().isReply()))
+ {
+ if (!StorageLinkTest::callOnDown(_right, msg)) {
+ _right.sendDown(msg);
+ }
+ } else {
+ std::ostringstream ost;
+ ost << "Address " << address << " is neither " << _leftAddr
+ << " or " << _rightAddr << " in message " << *msg
+ << ".\n";
+ CPPUNIT_FAIL(ost.str());
+ }
+ }
+ return true;
+ }
+
+ virtual bool onUp(const std::shared_ptr<api::StorageMessage> & msg) { // LOG(debug, "onUp Received msg: ->%s, %s %llu\n", msg->getAddress() ? msg->getAddress()->toString().c_str() : "(null)", msg->toString().c_str(), msg->getMsgId());
+
+ vespalib::LockGuard lock(_lock);
+ std::set<api::StorageMessage::Id>::iterator it
+ = _seen.find(msg->getMsgId());
+ // If message originated from the outside
+ if (it != _seen.end()) {
+ LOG(debug, "Have seen this message before, storing");
+
+ _seen.erase(it);
+ return DummyStorageLink::onUp(msg);
+ // If it originated from below, send it down again.
+ } else if (msg->getType() == api::MessageType::NOTIFYBUCKETCHANGE) {
+ // Just throw away notify bucket change
+ return true;
+ } else {
+ LOG(debug, "Never seen %s, sending on!",
+ msg->toString().c_str());
+
+ return sendOn(msg);
+ }
+ }
+
+ void onFlush(bool downwards) {
+ if (downwards) {
+ _left.flush();
+ _right.flush();
+ }
+ }
+ void onOpen() {
+ _left.open();
+ _right.open();
+ }
+ void onClose() {
+ _left.close();
+ _right.close();
+ }
+ };
+}
+
+void
+FileStorManagerTest::testNoTimestamps()
+{
+ TestName testName("testNoTimestamps");
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager =
+ new FileStorManager(config->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ top.open();
+ api::StorageMessageAddress address(
+ "storage", lib::NodeType::STORAGE, 3);
+ // Creating a document to test with
+ Document::SP doc(createDocument(
+ "some content", "doc:crawler:http://www.ntnu.no/").release());
+ document::BucketId bid(16, 4000);
+
+ createBucket(bid, 0);
+
+ // Putting it
+ {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, 0));
+ cmd->setAddress(address);
+ CPPUNIT_ASSERT_EQUAL((api::Timestamp)0, cmd->getTimestamp());
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode::REJECTED,
+ reply->getResult().getResult());
+ }
+ // Removing it
+ {
+ std::shared_ptr<api::RemoveCommand> cmd(
+ new api::RemoveCommand(bid, doc->getId(), 0));
+ cmd->setAddress(address);
+ CPPUNIT_ASSERT_EQUAL((api::Timestamp)0, cmd->getTimestamp());
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::RemoveReply> reply(
+ std::dynamic_pointer_cast<api::RemoveReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode::REJECTED,
+ reply->getResult().getResult());
+ }
+}
+
+void
+FileStorManagerTest::testEqualTimestamps()
+{
+ TestName testName("testEqualTimestamps");
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager =
+ new FileStorManager(config->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ top.open();
+ api::StorageMessageAddress address(
+ "storage", lib::NodeType::STORAGE, 3);
+ // Creating a document to test with
+ document::BucketId bid(16, 4000);
+
+ createBucket(bid, 0);
+
+ // Putting it
+ {
+ Document::SP doc(createDocument(
+ "some content", "userdoc:crawler:4000:http://www.ntnu.no/")
+ .release());
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, 100));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode::OK, reply->getResult().getResult());
+ }
+
+ // Putting it on same timestamp again
+ // (ok as doc is the same. Since merge can move doc to other copy we
+ // have to accept this)
+ {
+ Document::SP doc(createDocument(
+ "some content", "userdoc:crawler:4000:http://www.ntnu.no/")
+ .release());
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, 100));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode::OK, reply->getResult().getResult());
+ }
+
+ // Putting the doc with other id. Now we should fail
+ {
+ Document::SP doc(createDocument(
+ "some content", "userdoc:crawler:4000:http://www.ntnu.nu/")
+ .release());
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, 100));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode::TIMESTAMP_EXIST,
+ reply->getResult().getResult());
+ }
+}
+
+void
+FileStorManagerTest::testMultiOp()
+{
+ TestName testName("testMultiOp");
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager =
+ new FileStorManager(config->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ top.open();
+ api::StorageMessageAddress address(
+ "storage", lib::NodeType::STORAGE, 3);
+
+ createBucket(document::BucketId(16, 0), 0);
+
+ // Add some documents to remove/update later
+ for (uint32_t i=0; i<10; ++i) {
+ std::ostringstream did;
+ did << "userdoc:crawler:0:http://www.ntnu.no/" << i;
+ Document::SP doc(createDocument(
+ "some content", did.str()).release());
+ doc->set("headerval", (int) i);
+ doc->set("content", "some content");
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(document::BucketId(16, 0), doc, 100 + i));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ }
+ document::DocumentTypeRepo::SP repo = _node->getTypeRepo();
+
+ // Create operation list
+ std::vector<char> buffer(1024 * 1024);
+ vdslib::WritableDocumentList mdl(repo, &buffer[0], buffer.size());
+ for (uint32_t i=10; i<15; ++i) {
+ std::ostringstream did;
+ did << "userdoc:crawler:0:http://www.ntnu.no/" << i;
+ mdl.addPut(*createDocument("foo bar", did.str()),
+ 1000 + i);
+ }
+ for (uint32_t i=4; i<8; ++i) {
+ std::ostringstream did;
+ did << "userdoc:crawler:0:http://www.ntnu.no/" << i;
+ mdl.addRemove(document::DocumentId(did.str()), 2000 + i);
+ }
+ for (uint32_t i=1; i<3; ++i) {
+ std::ostringstream did;
+ did << "userdoc:crawler:0:http://www.ntnu.no/" << i;
+ document::DocumentUpdate update(*_testdoctype1,
+ document::DocumentId(did.str()));
+ if (i % 2 == 0) {
+ document::FieldUpdate fupd(_testdoctype1->getField("content"));
+ fupd.addUpdate(document::AssignValueUpdate(
+ document::StringFieldValue("baah")));
+ update.addUpdate(fupd);
+ } else {
+ document::FieldUpdate fupd(_testdoctype1->getField("headerval"));
+ fupd.addUpdate(document::AssignValueUpdate(
+ document::IntFieldValue(i + 100)));
+ update.addUpdate(fupd);
+ }
+ mdl.addUpdate(update, 3000 + i);
+ }
+ // Add a non-existing update
+ {
+ std::ostringstream did;
+ did << "userdoc:crawler:0:http://www.ntnu.no/nonexisting1";
+ document::DocumentUpdate update(*_testdoctype1,
+ document::DocumentId(did.str()));
+ document::FieldUpdate fupd(_testdoctype1->getField("content"));
+ fupd.addUpdate(document::AssignValueUpdate(
+ document::StringFieldValue("baah")));
+ update.addUpdate(fupd);
+ mdl.addUpdate(update, 4000);
+ }
+
+ // Issue operation.
+ {
+ std::shared_ptr<api::MultiOperationCommand> cmd(
+ new api::MultiOperationCommand(
+ repo, document::BucketId(16, 0), buffer));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::MultiOperationReply> reply(
+ std::dynamic_pointer_cast<api::MultiOperationReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply->getResult());
+ }
+ // Verify that new documents exist and that removed are gone.
+ // Removing it
+ for (uint32_t i=0; i<16; ++i) {
+ std::ostringstream did;
+ did << "userdoc:crawler:0:http://www.ntnu.no/" << i;
+ std::shared_ptr<api::GetCommand> cmd(new api::GetCommand(
+ document::BucketId(16, 0), document::DocumentId(did.str()),
+ "[all]"));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::GetReply> reply3(
+ std::dynamic_pointer_cast<api::GetReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply3.get());
+ if (i < 4 || (i >= 8 && i < 15)) {
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ reply3->getResult());
+ CPPUNIT_ASSERT_EQUAL(vespalib::string(did.str()),
+ reply3->getDocumentId().toString());
+ if (i >= 10) {
+ CPPUNIT_ASSERT(!reply3->getDocument()->hasValue("headerval"));
+ CPPUNIT_ASSERT(reply3->getDocument()->hasValue("content"));
+ } else if (i >= 1 && i <3) {
+ CPPUNIT_ASSERT(reply3->getDocument()->hasValue("headerval"));
+ CPPUNIT_ASSERT(reply3->getDocument()->hasValue("content"));
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<const document::FieldValue&>(
+ document::IntFieldValue(i % 2 == 0 ? i : i + 100)),
+ *reply3->getDocument()->getValue("headerval"));
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<const document::FieldValue&>(
+ document::StringFieldValue(i % 2 == 0 ? "baah" : "some content")),
+ *reply3->getDocument()->getValue("content"));
+ } else {
+ CPPUNIT_ASSERT(reply3->getDocument()->hasValue("headerval"));
+ CPPUNIT_ASSERT(reply3->getDocument()->hasValue("content"));
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<const document::FieldValue&>(
+ document::IntFieldValue(i)),
+ *reply3->getDocument()->getValue("headerval"));
+ }
+ } else {
+ CPPUNIT_ASSERT_EQUAL(false, reply3->wasFound());
+ CPPUNIT_ASSERT_EQUAL(vespalib::string(did.str()),
+ reply3->getDocumentId().toString());
+ }
+ }
+}
+
+void
+FileStorManagerTest::testGetIter()
+{
+ TestName testName("testGetIter");
+ // Setting up manager
+ DummyStorageLink top;
+ FileStorManager *manager;
+ top.push_back(unique_ptr<StorageLink>(manager =
+ new FileStorManager(config->getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ top.open();
+ api::StorageMessageAddress address(
+ "storage", lib::NodeType::STORAGE, 3);
+ document::BucketId bid(16, 4000);
+
+ createBucket(bid, 0);
+
+ std::vector<Document::SP > docs;
+ // Creating some documents to test with
+ for (uint32_t i=0; i<10; ++i) {
+ std::ostringstream id;
+ id << "userdoc:crawler:4000:http://www.ntnu.no/" << i;
+ docs.push_back(
+ Document::SP(
+ _node->getTestDocMan().createRandomDocumentAtLocation(
+ 4000, i, 400, 400)));
+ }
+ BucketInfo bucketInfo;
+ // Putting all docs to have something to visit
+ for (uint32_t i=0; i<docs.size(); ++i) {
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, docs[i], 100 + i));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ bucketInfo = reply->getBucketInfo();
+ }
+ // Sending a getiter request that will only visit some of the docs
+ spi::IteratorId iterId(createIterator(top, bid, ""));
+ {
+ framework::MemoryToken::UP token(
+ _node->getMemoryManager().allocate(
+ _node->getMemoryManager().getAllocationType(
+ "VISITOR_BUFFER"),
+ 2048,
+ 2048,
+ 127));
+ std::shared_ptr<GetIterCommand> cmd(
+ new GetIterCommand(std::move(token), bid, iterId, 2048));
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<GetIterReply> reply(
+ std::dynamic_pointer_cast<GetIterReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ CPPUNIT_ASSERT(reply->getEntries().size() > 0);
+ CPPUNIT_ASSERT(reply->getEntries().size() < docs.size());
+ }
+ // Normal case of get iter is testing through visitor tests.
+ // Testing specific situation where file is deleted while visiting here
+ {
+ std::shared_ptr<api::DeleteBucketCommand> cmd(
+ new api::DeleteBucketCommand(bid));
+ cmd->setBucketInfo(bucketInfo);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::DeleteBucketReply> reply(
+ std::dynamic_pointer_cast<api::DeleteBucketReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ }
+ {
+ framework::MemoryToken::UP token(
+ _node->getMemoryManager().allocate(
+ _node->getMemoryManager().getAllocationType(
+ "VISITOR_BUFFER"),
+ 2048,
+ 2048,
+ 127));
+ std::shared_ptr<GetIterCommand> cmd(
+ new GetIterCommand(std::move(token), bid, iterId, 2048));
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<GetIterReply> reply(
+ std::dynamic_pointer_cast<GetIterReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode::BUCKET_NOT_FOUND,
+ reply->getResult().getResult());
+ CPPUNIT_ASSERT(reply->getEntries().empty());
+ }
+}
+
+void
+FileStorManagerTest::testSetBucketActiveState()
+{
+ TestName testName("testSetBucketActiveState");
+ DummyStorageLink top;
+ FileStorManager* manager(
+ new FileStorManager(config->getConfigId(),
+ _node->getPartitions(),
+ _node->getPersistenceProvider(),
+ _node->getComponentRegister()));
+ top.push_back(unique_ptr<StorageLink>(manager));
+ setClusterState("storage:4 distributor:1");
+ top.open();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 3);
+
+ document::BucketId bid(16, 4000);
+
+ const uint16_t disk = 0;
+ createBucket(bid, disk);
+ spi::dummy::DummyPersistence& provider(
+ dynamic_cast<spi::dummy::DummyPersistence&>(_node->getPersistenceProvider()));
+ CPPUNIT_ASSERT(!provider.isActive(spi::Bucket(bid, spi::PartitionId(disk))));
+
+ {
+ std::shared_ptr<api::SetBucketStateCommand> cmd(
+ new api::SetBucketStateCommand(
+ bid, api::SetBucketStateCommand::ACTIVE));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::SetBucketStateReply> reply(
+ std::dynamic_pointer_cast<api::SetBucketStateReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ }
+
+ CPPUNIT_ASSERT(provider.isActive(spi::Bucket(bid, spi::PartitionId(disk))));
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(
+ bid, "foo"));
+ CPPUNIT_ASSERT(entry->info.isActive());
+ }
+ // Trigger bucket info to be read back into the database
+ {
+ std::shared_ptr<ReadBucketInfo> cmd(
+ new ReadBucketInfo(bid));
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<ReadBucketInfoReply> reply(
+ std::dynamic_pointer_cast<ReadBucketInfoReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ }
+ // Should not have lost active flag
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(
+ bid, "foo"));
+ CPPUNIT_ASSERT(entry->info.isActive());
+ }
+
+ {
+ std::shared_ptr<api::SetBucketStateCommand> cmd(
+ new api::SetBucketStateCommand(
+ bid, api::SetBucketStateCommand::INACTIVE));
+ cmd->setAddress(address);
+ top.sendDown(cmd);
+ top.waitForMessages(1, _waitTime);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, top.getNumReplies());
+ std::shared_ptr<api::SetBucketStateReply> reply(
+ std::dynamic_pointer_cast<api::SetBucketStateReply>(
+ top.getReply(0)));
+ top.reset();
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK), reply->getResult());
+ }
+
+ CPPUNIT_ASSERT(!provider.isActive(spi::Bucket(bid, spi::PartitionId(disk))));
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(
+ bid, "foo"));
+ CPPUNIT_ASSERT(!entry->info.isActive());
+ }
+}
+
+void
+FileStorManagerTest::testNotifyOwnerDistributorOnOutdatedSetBucketState()
+{
+ TestName testName("testNotifyOwnerDistributorOnOutdatedSetBucketState");
+ DummyStorageLink top;
+ FileStorManager* manager(
+ new FileStorManager(config->getConfigId(),
+ _node->getPartitions(),
+ _node->getPersistenceProvider(),
+ _node->getComponentRegister()));
+ top.push_back(unique_ptr<StorageLink>(manager));
+
+ setClusterState("storage:2 distributor:2");
+ top.open();
+
+ document::BucketId bid(getFirstBucketNotOwnedByDistributor(0));
+ CPPUNIT_ASSERT(bid.getRawId() != 0);
+ createBucket(bid, 0);
+
+ std::shared_ptr<api::SetBucketStateCommand> cmd(
+ new api::SetBucketStateCommand(
+ bid, api::SetBucketStateCommand::ACTIVE));
+ cmd->setAddress(api::StorageMessageAddress(
+ "cluster", lib::NodeType::STORAGE, 1));
+ cmd->setSourceIndex(0);
+
+ top.sendDown(cmd);
+ top.waitForMessages(2, _waitTime);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(2), top.getNumReplies());
+ // Not necessarily deterministic order.
+ int idxOffset = 0;
+ if (top.getReply(0)->getType() != api::MessageType::NOTIFYBUCKETCHANGE) {
+ ++idxOffset;
+ }
+ std::shared_ptr<api::NotifyBucketChangeCommand> notifyCmd(
+ std::dynamic_pointer_cast<api::NotifyBucketChangeCommand>(
+ top.getReply(idxOffset)));
+ std::shared_ptr<api::SetBucketStateReply> stateReply(
+ std::dynamic_pointer_cast<api::SetBucketStateReply>(
+ top.getReply(1 - idxOffset)));
+
+ CPPUNIT_ASSERT(stateReply.get());
+ CPPUNIT_ASSERT_EQUAL(ReturnCode(ReturnCode::OK),
+ stateReply->getResult());
+
+ CPPUNIT_ASSERT(notifyCmd.get());
+ CPPUNIT_ASSERT_EQUAL(uint16_t(1), notifyCmd->getAddress()->getIndex());
+ // Not necessary for this to be set since distributor does not insert this
+ // info into its db, but useful for debugging purposes.
+ CPPUNIT_ASSERT(notifyCmd->getBucketInfo().isActive());
+}
+
+void
+FileStorManagerTest::testGetBucketDiffImplicitCreateBucket()
+{
+ TestName testName("testGetBucketDiffImplicitCreateBucket");
+ DummyStorageLink top;
+ FileStorManager* manager(
+ new FileStorManager(config->getConfigId(),
+ _node->getPartitions(),
+ _node->getPersistenceProvider(),
+ _node->getComponentRegister()));
+ top.push_back(unique_ptr<StorageLink>(manager));
+ setClusterState("storage:2 distributor:1");
+ top.open();
+
+ document::BucketId bid(16, 4000);
+
+ std::vector<api::MergeBucketCommand::Node> nodes;
+ nodes.push_back(1);
+ nodes.push_back(0);
+
+ std::shared_ptr<api::GetBucketDiffCommand> cmd(
+ new api::GetBucketDiffCommand(bid, nodes, Timestamp(1000)));
+ cmd->setAddress(api::StorageMessageAddress(
+ "cluster", lib::NodeType::STORAGE, 1));
+ cmd->setSourceIndex(0);
+ top.sendDown(cmd);
+
+ api::GetBucketDiffReply* reply;
+ ASSERT_SINGLE_REPLY(api::GetBucketDiffReply, reply, top, _waitTime);
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode(api::ReturnCode::OK),
+ reply->getResult());
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(
+ bid, "foo"));
+ CPPUNIT_ASSERT(entry.exist());
+ CPPUNIT_ASSERT(entry->info.isReady());
+ }
+}
+
+void
+FileStorManagerTest::testMergeBucketImplicitCreateBucket()
+{
+ TestName testName("testMergeBucketImplicitCreateBucket");
+ DummyStorageLink top;
+ FileStorManager* manager(
+ new FileStorManager(config->getConfigId(),
+ _node->getPartitions(),
+ _node->getPersistenceProvider(),
+ _node->getComponentRegister()));
+ top.push_back(unique_ptr<StorageLink>(manager));
+ setClusterState("storage:3 distributor:1");
+ top.open();
+
+ document::BucketId bid(16, 4000);
+
+ std::vector<api::MergeBucketCommand::Node> nodes;
+ nodes.push_back(1);
+ nodes.push_back(2);
+
+ std::shared_ptr<api::MergeBucketCommand> cmd(
+ new api::MergeBucketCommand(bid, nodes, Timestamp(1000)));
+ cmd->setAddress(api::StorageMessageAddress(
+ "cluster", lib::NodeType::STORAGE, 1));
+ cmd->setSourceIndex(0);
+ top.sendDown(cmd);
+
+ api::GetBucketDiffCommand* diffCmd;
+ ASSERT_SINGLE_REPLY(api::GetBucketDiffCommand, diffCmd, top, _waitTime);
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(
+ bid, "foo"));
+ CPPUNIT_ASSERT(entry.exist());
+ CPPUNIT_ASSERT(entry->info.isReady());
+ }
+}
+
+void
+FileStorManagerTest::testNewlyCreatedBucketIsReady()
+{
+ TestName testName("testNewlyCreatedBucketIsReady");
+ DummyStorageLink top;
+ FileStorManager* manager(
+ new FileStorManager(config->getConfigId(),
+ _node->getPartitions(),
+ _node->getPersistenceProvider(),
+ _node->getComponentRegister()));
+ top.push_back(unique_ptr<StorageLink>(manager));
+ setClusterState("storage:2 distributor:1");
+ top.open();
+
+ document::BucketId bid(16, 4000);
+
+ std::shared_ptr<api::CreateBucketCommand> cmd(
+ new api::CreateBucketCommand(bid));
+ cmd->setAddress(api::StorageMessageAddress(
+ "cluster", lib::NodeType::STORAGE, 1));
+ cmd->setSourceIndex(0);
+ top.sendDown(cmd);
+
+ api::CreateBucketReply* reply;
+ ASSERT_SINGLE_REPLY(api::CreateBucketReply, reply, top, _waitTime);
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode(api::ReturnCode::OK),
+ reply->getResult());
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(
+ bid, "foo"));
+ CPPUNIT_ASSERT(entry.exist());
+ CPPUNIT_ASSERT(entry->info.isReady());
+ CPPUNIT_ASSERT(!entry->info.isActive());
+ }
+}
+
+void
+FileStorManagerTest::testCreateBucketSetsActiveFlagInDatabaseAndReply()
+{
+ TestFileStorComponents c(*this, "testNotifyOnSplitSourceOwnershipChanged");
+ setClusterState("storage:2 distributor:1");
+
+ document::BucketId bid(16, 4000);
+ std::shared_ptr<api::CreateBucketCommand> cmd(
+ new api::CreateBucketCommand(bid));
+ cmd->setAddress(api::StorageMessageAddress(
+ "cluster", lib::NodeType::STORAGE, 1));
+ cmd->setSourceIndex(0);
+ cmd->setActive(true);
+ c.top.sendDown(cmd);
+
+ api::CreateBucketReply* reply;
+ ASSERT_SINGLE_REPLY(api::CreateBucketReply, reply, c.top, _waitTime);
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode(api::ReturnCode::OK),
+ reply->getResult());
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(
+ bid, "foo"));
+ CPPUNIT_ASSERT(entry.exist());
+ CPPUNIT_ASSERT(entry->info.isReady());
+ CPPUNIT_ASSERT(entry->info.isActive());
+ }
+}
+
+} // storage
diff --git a/storage/src/tests/persistence/filestorage/filestormodifiedbucketstest.cpp b/storage/src/tests/persistence/filestorage/filestormodifiedbucketstest.cpp
new file mode 100644
index 00000000000..19b84ef475b
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/filestormodifiedbucketstest.cpp
@@ -0,0 +1,142 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <memory>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storage/persistence/filestorage/modifiedbucketchecker.h>
+#include <tests/persistence/persistenceproviderwrapper.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <tests/persistence/filestorage/filestortestfixture.h>
+
+namespace storage {
+
+/**
+ * Effectively an integration test between the ModifiedBucketChecker storage
+ * link and the behavior of the filestor component.
+ */
+class FileStorModifiedBucketsTest : public FileStorTestFixture
+{
+public:
+ void modifiedBucketsSendNotifyBucketChange();
+ void fileStorRepliesToRecheckBucketCommands();
+
+ void modifyBuckets(uint32_t first, uint32_t count);
+
+ spi::dummy::DummyPersistence& getDummyPersistence() {
+ return dynamic_cast<spi::dummy::DummyPersistence&>(_node->getPersistenceProvider());
+ }
+
+ CPPUNIT_TEST_SUITE(FileStorModifiedBucketsTest);
+ CPPUNIT_TEST(modifiedBucketsSendNotifyBucketChange);
+ CPPUNIT_TEST(fileStorRepliesToRecheckBucketCommands);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(FileStorModifiedBucketsTest);
+
+namespace {
+
+struct BucketCheckerInjector : FileStorTestFixture::StorageLinkInjector
+{
+ TestServiceLayerApp& _node;
+ FileStorTestFixture& _fixture;
+ BucketCheckerInjector(TestServiceLayerApp& node,
+ FileStorTestFixture& fixture)
+ : _node(node),
+ _fixture(fixture)
+ {}
+ void inject(DummyStorageLink& link) const {
+ link.push_back(std::unique_ptr<ModifiedBucketChecker>(
+ new ModifiedBucketChecker(_node.getComponentRegister(),
+ _node.getPersistenceProvider(),
+ _fixture._config->getConfigId())));
+ }
+};
+
+void
+assertIsNotifyCommandWithActiveBucket(api::StorageMessage& msg)
+{
+ api::NotifyBucketChangeCommand& cmd(
+ dynamic_cast<api::NotifyBucketChangeCommand&>(msg));
+ CPPUNIT_ASSERT(cmd.getBucketInfo().isActive());
+ CPPUNIT_ASSERT_EQUAL(
+ vespalib::string("StorageMessageAddress(Storage protocol, "
+ "cluster storage, nodetype distributor, index 0)"),
+ cmd.getAddress()->toString());
+}
+
+}
+
+void
+FileStorModifiedBucketsTest::modifyBuckets(uint32_t first, uint32_t count)
+{
+ spi::BucketIdListResult::List buckets;
+ for (uint32_t i = 0; i < count; ++i) {
+ buckets.push_back(document::BucketId(16, first + i));
+ _node->getPersistenceProvider().setActiveState(
+ spi::Bucket(buckets[i], spi::PartitionId(0)),
+ spi::BucketInfo::ACTIVE);
+ }
+
+ getDummyPersistence().setModifiedBuckets(buckets);
+}
+
+void
+FileStorModifiedBucketsTest::modifiedBucketsSendNotifyBucketChange()
+{
+ BucketCheckerInjector bcj(*_node, *this);
+ TestFileStorComponents c(*this, "modifiedBucketsSendNotifyBucketChange", bcj);
+ setClusterState("storage:1 distributor:1");
+
+ uint32_t numBuckets = 10;
+
+ for (uint32_t i = 0; i < numBuckets; ++i) {
+ document::BucketId bucket(16, i);
+ createBucket(spi::Bucket(bucket, spi::PartitionId(0)));
+ c.sendPut(bucket, DocumentIndex(0), PutTimestamp(1000));
+ }
+ c.top.waitForMessages(numBuckets, MSG_WAIT_TIME);
+ c.top.reset();
+
+ modifyBuckets(0, numBuckets);
+ c.top.waitForMessages(numBuckets, MSG_WAIT_TIME);
+
+ for (uint32_t i = 0; i < 10; ++i) {
+ assertIsNotifyCommandWithActiveBucket(*c.top.getReply(i));
+
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(
+ document::BucketId(16, i), "foo"));
+
+ CPPUNIT_ASSERT(entry->info.isActive());
+ }
+}
+
+void
+FileStorModifiedBucketsTest::fileStorRepliesToRecheckBucketCommands()
+{
+ BucketCheckerInjector bcj(*_node, *this);
+ TestFileStorComponents c(*this, "fileStorRepliesToRecheckBucketCommands", bcj);
+ setClusterState("storage:1 distributor:1");
+
+ document::BucketId bucket(16, 0);
+ createBucket(spi::Bucket(bucket, spi::PartitionId(0)));
+ c.sendPut(bucket, DocumentIndex(0), PutTimestamp(1000));
+ c.top.waitForMessages(1, MSG_WAIT_TIME);
+ c.top.reset();
+
+ modifyBuckets(0, 1);
+ c.top.waitForMessages(1, MSG_WAIT_TIME);
+ assertIsNotifyCommandWithActiveBucket(*c.top.getReply(0));
+
+ // If we don't reply to the recheck bucket commands, we won't trigger
+ // a new round of getModifiedBuckets and recheck commands.
+ c.top.reset();
+ createBucket(spi::Bucket(document::BucketId(16, 1), spi::PartitionId(0)));
+ modifyBuckets(1, 1);
+ c.top.waitForMessages(1, MSG_WAIT_TIME);
+ assertIsNotifyCommandWithActiveBucket(*c.top.getReply(0));
+}
+
+} // storage
+
diff --git a/storage/src/tests/persistence/filestorage/filestortestfixture.cpp b/storage/src/tests/persistence/filestorage/filestortestfixture.cpp
new file mode 100644
index 00000000000..69b109b5cfc
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/filestortestfixture.cpp
@@ -0,0 +1,143 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <sstream>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <tests/persistence/filestorage/filestortestfixture.h>
+
+namespace storage {
+
+spi::LoadType FileStorTestFixture::defaultLoadType = spi::LoadType(0, "default");
+const uint32_t FileStorTestFixture::MSG_WAIT_TIME;
+
+void
+FileStorTestFixture::setupDisks(uint32_t diskCount)
+{
+ _config.reset(new vdstestlib::DirConfig(getStandardConfig(true)));
+
+ _config2.reset(new vdstestlib::DirConfig(*_config));
+ _config2->getConfig("stor-server").set("root_folder", "vdsroot.2");
+ _config2->getConfig("stor-devices").set("root_folder", "vdsroot.2");
+ _config2->getConfig("stor-server").set("node_index", "1");
+
+ _smallConfig.reset(new vdstestlib::DirConfig(*_config));
+ _node.reset(new TestServiceLayerApp(DiskCount(diskCount), NodeIndex(1),
+ _config->getConfigId()));
+ _testdoctype1 = _node->getTypeRepo()->getDocumentType("testdoctype1");
+}
+
+// Default provider setup which should work out of the box for most tests.
+void
+FileStorTestFixture::setUp()
+{
+ setupDisks(1);
+ _node->setPersistenceProvider(
+ spi::PersistenceProvider::UP(
+ new spi::dummy::DummyPersistence(_node->getTypeRepo(), 1)));
+}
+
+void
+FileStorTestFixture::tearDown()
+{
+ _node.reset(0);
+}
+
+void
+FileStorTestFixture::createBucket(const document::BucketId& bid)
+{
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ _node->getPersistenceProvider().createBucket(
+ spi::Bucket(bid, spi::PartitionId(0)), context);
+
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(bid, "foo",
+ StorBucketDatabase::CREATE_IF_NONEXISTING));
+ entry->disk = 0;
+ entry->info = api::BucketInfo(0, 0, 0, 0, 0, true, false);
+ entry.write();
+}
+
+bool
+FileStorTestFixture::bucketExistsInDb(const document::BucketId& bucket) const
+{
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(bucket, "bucketExistsInDb"));
+ return entry.exist();
+}
+
+FileStorTestFixture::TestFileStorComponents::TestFileStorComponents(
+ FileStorTestFixture& fixture,
+ const char* testName,
+ const StorageLinkInjector& injector)
+ : _testName(testName),
+ _fixture(fixture),
+ manager(new FileStorManager(fixture._config->getConfigId(),
+ fixture._node->getPartitions(),
+ fixture._node->getPersistenceProvider(),
+ fixture._node->getComponentRegister()))
+{
+ injector.inject(top);
+ top.push_back(StorageLink::UP(manager));
+ top.open();
+}
+
+api::StorageMessageAddress
+FileStorTestFixture::TestFileStorComponents::makeSelfAddress() const {
+ return api::StorageMessageAddress("storage", lib::NodeType::STORAGE, 0);
+}
+
+void
+FileStorTestFixture::TestFileStorComponents::sendDummyGet(
+ const document::BucketId& bid)
+{
+ std::ostringstream id;
+ id << "id:foo:testdoctype1:n=" << bid.getId() << ":0";
+ std::shared_ptr<api::GetCommand> cmd(
+ new api::GetCommand(bid, document::DocumentId(id.str()), "[all]"));
+ cmd->setAddress(makeSelfAddress());
+ cmd->setPriority(255);
+ top.sendDown(cmd);
+}
+
+void
+FileStorTestFixture::TestFileStorComponents::sendDummyGetDiff(
+ const document::BucketId& bid)
+{
+ std::vector<api::GetBucketDiffCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ std::shared_ptr<api::GetBucketDiffCommand> cmd(
+ new api::GetBucketDiffCommand(bid, nodes, 12345));
+ cmd->setAddress(makeSelfAddress());
+ cmd->setPriority(255);
+ top.sendDown(cmd);
+}
+
+void
+FileStorTestFixture::TestFileStorComponents::sendPut(
+ const document::BucketId& bid,
+ uint32_t docIdx,
+ uint64_t timestamp)
+{
+ std::ostringstream id;
+ id << "id:foo:testdoctype1:n=" << bid.getId() << ":" << docIdx;
+ document::Document::SP doc(
+ _fixture._node->getTestDocMan().createDocument("foobar", id.str()));
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, doc, timestamp));
+ cmd->setAddress(makeSelfAddress());
+ top.sendDown(cmd);
+}
+
+void
+FileStorTestFixture::setClusterState(const std::string& state)
+{
+ _node->getStateUpdater().setClusterState(
+ lib::ClusterState::CSP(new lib::ClusterState(state)));
+}
+
+
+} // ns storage
diff --git a/storage/src/tests/persistence/filestorage/filestortestfixture.h b/storage/src/tests/persistence/filestorage/filestortestfixture.h
new file mode 100644
index 00000000000..4f1de549f47
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/filestortestfixture.h
@@ -0,0 +1,112 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <tests/common/testhelper.h>
+#include <vespa/persistence/spi/persistenceprovider.h>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <tests/common/dummystoragelink.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/testhelper.h>
+
+namespace storage {
+
+class FileStorTestFixture : public CppUnit::TestFixture
+{
+public:
+ static spi::LoadType defaultLoadType;
+
+ std::unique_ptr<TestServiceLayerApp> _node;
+ std::unique_ptr<vdstestlib::DirConfig> _config;
+ std::unique_ptr<vdstestlib::DirConfig> _config2;
+ std::unique_ptr<vdstestlib::DirConfig> _smallConfig;
+ const document::DocumentType* _testdoctype1;
+
+ static const uint32_t MSG_WAIT_TIME = 60 * 1000;
+
+ typedef uint32_t DocumentIndex;
+ typedef uint64_t PutTimestamp;
+
+ void setUp() override;
+ void tearDown() override;
+ void setupDisks(uint32_t diskCount);
+ void createBucket(const document::BucketId& bid);
+ bool bucketExistsInDb(const document::BucketId& bucket) const;
+
+ api::ReturnCode::Result resultOf(const api::StorageReply& reply) const {
+ return reply.getResult().getResult();
+ }
+ void setClusterState(const std::string&);
+
+ struct StorageLinkInjector
+ {
+ virtual ~StorageLinkInjector() {}
+
+ virtual void inject(DummyStorageLink&) const = 0;
+ };
+
+ struct NoOpStorageLinkInjector : StorageLinkInjector
+ {
+ void inject(DummyStorageLink&) const {}
+ };
+
+ void
+ expectNoReplies(DummyStorageLink& link) {
+ CPPUNIT_ASSERT_EQUAL(size_t(0), link.getNumReplies());
+ }
+
+ template <typename ReplyType>
+ void
+ expectReply(DummyStorageLink& link,
+ api::ReturnCode::Result result)
+ {
+ link.waitForMessages(1, 60*1000);
+ api::StorageReply* reply(
+ dynamic_cast<ReplyType*>(link.getReply(0).get()));
+ if (reply == 0) {
+ std::ostringstream ss;
+ ss << "got unexpected reply "
+ << link.getReply(0)->toString(true);
+ CPPUNIT_FAIL(ss.str());
+ }
+ CPPUNIT_ASSERT_EQUAL(result, reply->getResult().getResult());
+ }
+
+ template <typename ReplyType>
+ void
+ expectAbortedReply(DummyStorageLink& link) {
+ expectReply<ReplyType>(link, api::ReturnCode::ABORTED);
+ }
+
+ template <typename ReplyType>
+ void
+ expectOkReply(DummyStorageLink& link) {
+ expectReply<ReplyType>(link, api::ReturnCode::OK);
+ }
+
+
+ struct TestFileStorComponents
+ {
+ private:
+ TestName _testName;
+ FileStorTestFixture& _fixture;
+ public:
+ DummyStorageLink top;
+ FileStorManager* manager;
+
+ TestFileStorComponents(FileStorTestFixture& fixture,
+ const char* testName,
+ const StorageLinkInjector& i = NoOpStorageLinkInjector());
+
+ api::StorageMessageAddress makeSelfAddress() const;
+
+ void sendDummyGet(const document::BucketId& bid);
+ void sendPut(const document::BucketId& bid,
+ uint32_t docIdx,
+ uint64_t timestamp);
+ void sendDummyGetDiff(const document::BucketId& bid);
+ };
+};
+
+} // ns storage
diff --git a/storage/src/tests/persistence/filestorage/forwardingmessagesender.h b/storage/src/tests/persistence/filestorage/forwardingmessagesender.h
new file mode 100644
index 00000000000..691e291e534
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/forwardingmessagesender.h
@@ -0,0 +1,26 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/common/messagesender.h>
+#include <vespa/storage/common/storagelink.h>
+
+namespace storage {
+
+/**
+ * Simple implementation of MessageSender which simply forwards all messages
+ * to a provided storage link.
+ */
+struct ForwardingMessageSender : public MessageSender {
+ StorageLink& link;
+
+ ForwardingMessageSender(StorageLink& l) : link(l) {}
+
+ void sendCommand(const std::shared_ptr<api::StorageCommand> & cmd)
+ { link.sendUp(cmd); }
+
+ void sendReply(const std::shared_ptr<api::StorageReply> & reply)
+ { link.sendUp(reply); }
+};
+
+} // storage
+
diff --git a/storage/src/tests/persistence/filestorage/mergeblockingtest.cpp b/storage/src/tests/persistence/filestorage/mergeblockingtest.cpp
new file mode 100644
index 00000000000..ff9ec063555
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/mergeblockingtest.cpp
@@ -0,0 +1,239 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vector>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/persistence/messages.h>
+#include <tests/persistence/persistenceproviderwrapper.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <tests/persistence/filestorage/filestortestfixture.h>
+
+namespace storage {
+
+class MergeBlockingTest : public FileStorTestFixture
+{
+public:
+ void setupDisks() {
+ FileStorTestFixture::setupDisks(1);
+ _node->setPersistenceProvider(
+ spi::PersistenceProvider::UP(
+ new spi::dummy::DummyPersistence(_node->getTypeRepo(), 1)));
+ }
+
+public:
+ void testRejectMergeForInconsistentInnerBucket();
+ void testRejectMergeForInconsistentLeafBucket();
+ void testRejectGetBucketDiffWithInconsistentBucket();
+ void testRejectApplyDiffWhenBucketHasBecomeInconsistent();
+ void testRejectApplyReplyWhenBucketHasBecomeInconsistent();
+ void testRejectGetDiffReplyWhenBucketHasBecomeInconsistent();
+ void testRejectMergeWhenLowUsedBitCount();
+
+ void setUp() override;
+
+ CPPUNIT_TEST_SUITE(MergeBlockingTest);
+ CPPUNIT_TEST(testRejectMergeForInconsistentInnerBucket);
+ CPPUNIT_TEST(testRejectMergeForInconsistentLeafBucket);
+ CPPUNIT_TEST(testRejectGetBucketDiffWithInconsistentBucket);
+ CPPUNIT_TEST(testRejectApplyDiffWhenBucketHasBecomeInconsistent);
+ CPPUNIT_TEST(testRejectApplyReplyWhenBucketHasBecomeInconsistent);
+ CPPUNIT_TEST(testRejectGetDiffReplyWhenBucketHasBecomeInconsistent);
+ CPPUNIT_TEST(testRejectMergeWhenLowUsedBitCount);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(MergeBlockingTest);
+
+void
+MergeBlockingTest::setUp()
+{
+ setupDisks();
+}
+
+namespace {
+
+api::StorageMessageAddress
+makeAddress() {
+ return api::StorageMessageAddress("storage", lib::NodeType::STORAGE, 0);
+}
+
+void
+assignCommandMeta(api::StorageCommand& msg) {
+ msg.setAddress(makeAddress());
+ msg.setSourceIndex(0);
+}
+
+std::vector<api::MergeBucketCommand::Node>
+getNodes() {
+ std::vector<api::MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ return nodes;
+}
+
+std::vector<api::MergeBucketCommand::Node>
+getNodesWithForwarding() {
+ std::vector<api::MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ return nodes;
+}
+
+std::shared_ptr<api::MergeBucketCommand>
+createMerge(const document::BucketId& bucket) {
+ std::shared_ptr<api::MergeBucketCommand> cmd(
+ new api::MergeBucketCommand(bucket, getNodes(), api::Timestamp(1000)));
+ assignCommandMeta(*cmd);
+ return cmd;
+}
+
+std::shared_ptr<api::GetBucketDiffCommand>
+createGetDiff(const document::BucketId& bucket,
+ const std::vector<api::MergeBucketCommand::Node>& nodes)
+{
+ std::shared_ptr<api::GetBucketDiffCommand> cmd(
+ new api::GetBucketDiffCommand(bucket, nodes, api::Timestamp(1000)));
+ assignCommandMeta(*cmd);
+ return cmd;
+}
+
+std::shared_ptr<api::ApplyBucketDiffCommand>
+createApplyDiff(const document::BucketId& bucket,
+ const std::vector<api::MergeBucketCommand::Node>& nodes) {
+ std::shared_ptr<api::ApplyBucketDiffCommand> cmd(
+ new api::ApplyBucketDiffCommand(bucket, nodes, 1024*1024));
+ assignCommandMeta(*cmd);
+ return cmd;
+}
+
+const document::BucketId leafBucket(17, 1);
+const document::BucketId innerBucket(16, 1);
+const document::BucketId innerBucket2(15, 1);
+
+}
+
+void
+MergeBlockingTest::testRejectMergeForInconsistentInnerBucket()
+{
+ TestFileStorComponents c(*this, "testRejectMergeForInconsistentInnerBucket");
+ createBucket(leafBucket);
+
+ std::shared_ptr<api::MergeBucketCommand> cmd(createMerge(innerBucket));
+ c.top.sendDown(cmd);
+
+ expectAbortedReply<api::MergeBucketReply>(c.top);
+ CPPUNIT_ASSERT(!bucketExistsInDb(innerBucket));
+}
+
+void
+MergeBlockingTest::testRejectMergeForInconsistentLeafBucket()
+{
+ TestFileStorComponents c(*this, "testRejectMergeForInconsistentInnerBucket");
+ createBucket(innerBucket);
+
+ std::shared_ptr<api::MergeBucketCommand> cmd(createMerge(leafBucket));
+ c.top.sendDown(cmd);
+
+ expectAbortedReply<api::MergeBucketReply>(c.top);
+ CPPUNIT_ASSERT(!bucketExistsInDb(leafBucket));
+}
+
+void
+MergeBlockingTest::testRejectGetBucketDiffWithInconsistentBucket()
+{
+ TestFileStorComponents c(*this, "testRejectGetBucketDiffWithInconsistentBucket");
+ CPPUNIT_ASSERT(innerBucket.contains(leafBucket));
+ createBucket(innerBucket);
+
+ std::shared_ptr<api::GetBucketDiffCommand> cmd(createGetDiff(leafBucket, getNodes()));
+ c.top.sendDown(cmd);
+
+ expectAbortedReply<api::GetBucketDiffReply>(c.top);
+ CPPUNIT_ASSERT(!bucketExistsInDb(leafBucket));
+}
+
+void
+MergeBlockingTest::testRejectApplyDiffWhenBucketHasBecomeInconsistent()
+{
+ TestFileStorComponents c(*this, "testRejectApplyDiffWhenBucketHasBecomeInconsistent");
+ createBucket(leafBucket);
+ createBucket(innerBucket);
+
+ std::shared_ptr<api::ApplyBucketDiffCommand> applyDiff(
+ createApplyDiff(innerBucket, getNodes()));
+ c.top.sendDown(applyDiff);
+
+ expectAbortedReply<api::ApplyBucketDiffReply>(c.top);
+}
+
+void
+MergeBlockingTest::testRejectApplyReplyWhenBucketHasBecomeInconsistent()
+{
+ TestFileStorComponents c(*this, "testRejectApplyReplyWhenBucketHasBecomeInconsistent");
+ createBucket(innerBucket);
+
+ std::shared_ptr<api::ApplyBucketDiffCommand> applyDiff(
+ createApplyDiff(innerBucket, getNodesWithForwarding()));
+ c.top.sendDown(applyDiff);
+ c.top.waitForMessages(1, MSG_WAIT_TIME);
+
+ api::StorageMessage::SP fwdDiff(
+ c.top.getAndRemoveMessage(api::MessageType::APPLYBUCKETDIFF));
+ api::ApplyBucketDiffCommand& diffCmd(
+ dynamic_cast<api::ApplyBucketDiffCommand&>(*fwdDiff));
+
+ api::ApplyBucketDiffReply::SP diffReply(
+ new api::ApplyBucketDiffReply(diffCmd));
+ createBucket(leafBucket);
+ c.top.sendDown(diffReply);
+
+ expectAbortedReply<api::ApplyBucketDiffReply>(c.top);
+}
+
+void
+MergeBlockingTest::testRejectGetDiffReplyWhenBucketHasBecomeInconsistent()
+{
+ TestFileStorComponents c(*this, "testRejectGetDiffReplyWhenBucketHasBecomeInconsistent");
+ createBucket(innerBucket);
+
+ std::shared_ptr<api::GetBucketDiffCommand> getDiff(
+ createGetDiff(innerBucket, getNodesWithForwarding()));
+ c.top.sendDown(getDiff);
+ c.top.waitForMessages(1, MSG_WAIT_TIME);
+
+ api::StorageMessage::SP fwdDiff(
+ c.top.getAndRemoveMessage(api::MessageType::GETBUCKETDIFF));
+ api::GetBucketDiffCommand& diffCmd(
+ dynamic_cast<api::GetBucketDiffCommand&>(*fwdDiff));
+
+ api::GetBucketDiffReply::SP diffReply(
+ new api::GetBucketDiffReply(diffCmd));
+ createBucket(innerBucket2);
+ c.top.sendDown(diffReply);
+
+ expectAbortedReply<api::GetBucketDiffReply>(c.top);
+}
+
+/**
+ * Test case for buckets in ticket 6389558, comment #4.
+ */
+void
+MergeBlockingTest::testRejectMergeWhenLowUsedBitCount()
+{
+ document::BucketId superBucket(1, 0x1);
+ document::BucketId subBucket(2, 0x1);
+
+ CPPUNIT_ASSERT(superBucket.contains(subBucket));
+
+ TestFileStorComponents c(*this, "testRejectMergeWithInconsistentBucket");
+ createBucket(superBucket);
+
+ std::shared_ptr<api::MergeBucketCommand> cmd(createMerge(subBucket));
+ c.top.sendDown(cmd);
+
+ expectAbortedReply<api::MergeBucketReply>(c.top);
+ CPPUNIT_ASSERT(!bucketExistsInDb(subBucket));
+}
+
+} // ns storage
diff --git a/storage/src/tests/persistence/filestorage/modifiedbucketcheckertest.cpp b/storage/src/tests/persistence/filestorage/modifiedbucketcheckertest.cpp
new file mode 100644
index 00000000000..848799fde95
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/modifiedbucketcheckertest.cpp
@@ -0,0 +1,214 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <tests/common/testhelper.h>
+#include <tests/common/storagelinktest.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <vespa/storage/persistence/filestorage/modifiedbucketchecker.h>
+#include <vespa/storage/persistence/messages.h>
+
+namespace storage {
+
+class ModifiedBucketCheckerTest : public CppUnit::TestFixture
+{
+public:
+ enum {
+ MESSAGE_WAIT_TIME = 60*2
+ };
+
+ void setUp();
+ void tearDown();
+
+ void testModifiedBucketThreadSendsRecheckBucketCommands();
+ void testDoNotCheckModifiedBucketsIfAlreadyPending();
+ void testBucketCheckerOnlySwallowsRecheckBucketReplies();
+ void testRecheckRequestsAreChunked();
+ void testInvalidChunkSizeConfigIsRejected();
+
+ CPPUNIT_TEST_SUITE(ModifiedBucketCheckerTest);
+ CPPUNIT_TEST(testModifiedBucketThreadSendsRecheckBucketCommands);
+ CPPUNIT_TEST(testDoNotCheckModifiedBucketsIfAlreadyPending);
+ CPPUNIT_TEST(testBucketCheckerOnlySwallowsRecheckBucketReplies);
+ CPPUNIT_TEST(testRecheckRequestsAreChunked);
+ CPPUNIT_TEST(testInvalidChunkSizeConfigIsRejected);
+ CPPUNIT_TEST_SUITE_END();
+private:
+ spi::dummy::DummyPersistence& getDummyPersistence() {
+ return static_cast<spi::dummy::DummyPersistence&>(
+ _node->getPersistenceProvider());
+ }
+ void expectCommandsAndSendReplies(uint32_t count, uint32_t firstBucket);
+ void modifyBuckets(uint32_t count, uint32_t firstBucket);
+ void replyToAll(const std::vector<api::StorageMessage::SP>& messages,
+ uint32_t firstBucket);
+
+ std::unique_ptr<DummyStorageLink> _top;
+ ModifiedBucketChecker* _handler;
+ DummyStorageLink* _bottom;
+
+ std::unique_ptr<TestServiceLayerApp> _node;
+ std::unique_ptr<vdstestlib::DirConfig> _config;
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(ModifiedBucketCheckerTest);
+
+void
+ModifiedBucketCheckerTest::setUp()
+{
+ _config.reset(new vdstestlib::DirConfig(getStandardConfig(true)));
+ _node.reset(new TestServiceLayerApp(DiskCount(1), NodeIndex(0),
+ _config->getConfigId()));
+ _node->setupDummyPersistence();
+
+ _top.reset(new DummyStorageLink);
+ _handler = new ModifiedBucketChecker(_node->getComponentRegister(),
+ _node->getPersistenceProvider(),
+ _config->getConfigId());
+ _top->push_back(std::unique_ptr<StorageLink>(_handler));
+ _bottom = new DummyStorageLink;
+ _handler->push_back(std::unique_ptr<StorageLink>(_bottom));
+}
+
+void
+ModifiedBucketCheckerTest::tearDown()
+{
+ _top->close();
+ _top.reset(0);
+ _node.reset(0);
+ _config.reset(0);
+}
+
+void
+ModifiedBucketCheckerTest::modifyBuckets(uint32_t count, uint32_t firstBucket)
+{
+ spi::BucketIdListResult::List buckets;
+ for (uint32_t i = firstBucket; i < firstBucket + count; ++i) {
+ buckets.push_back(document::BucketId(16, i));
+ }
+ getDummyPersistence().setModifiedBuckets(buckets);
+}
+
+void
+ModifiedBucketCheckerTest::replyToAll(
+ const std::vector<api::StorageMessage::SP>& messages,
+ uint32_t firstBucket)
+{
+ for (uint32_t i = 0; i < messages.size(); ++i) {
+ RecheckBucketInfoCommand& cmd(
+ dynamic_cast<RecheckBucketInfoCommand&>(*messages[i]));
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, i+firstBucket),
+ cmd.getBucketId());
+ _bottom->sendUp(cmd.makeReply());
+ }
+}
+
+void
+ModifiedBucketCheckerTest::expectCommandsAndSendReplies(
+ uint32_t count, uint32_t firstBucket)
+{
+ std::vector<api::StorageMessage::SP> messages(_bottom->getCommandsOnce());
+ CPPUNIT_ASSERT_EQUAL(size_t(count), messages.size());
+ replyToAll(messages, firstBucket);
+}
+
+void
+ModifiedBucketCheckerTest::testModifiedBucketThreadSendsRecheckBucketCommands()
+{
+ _top->open(); // Multi-threaded test
+ modifyBuckets(3, 0);
+ // Should now get 3 RecheckBucketInfo commands down the dummy link.
+ _bottom->waitForMessages(3, MESSAGE_WAIT_TIME);
+ expectCommandsAndSendReplies(3, 0);
+ // No replies should reach top link
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _top->getNumReplies());
+}
+
+void
+ModifiedBucketCheckerTest::testDoNotCheckModifiedBucketsIfAlreadyPending()
+{
+ _handler->setUnitTestingSingleThreadedMode();
+ _top->open();
+ modifyBuckets(3, 0);
+ _handler->tick();
+
+ std::vector<api::StorageMessage::SP> messages(_bottom->getCommandsOnce());
+ CPPUNIT_ASSERT_EQUAL(size_t(3), messages.size());
+
+ modifyBuckets(3, 3);
+ _handler->tick();
+ expectCommandsAndSendReplies(0, 0);
+ // After replies received, tick should send new requests again.
+ replyToAll(messages, 0);
+ _handler->tick();
+ expectCommandsAndSendReplies(3, 3);
+}
+
+void
+ModifiedBucketCheckerTest::testBucketCheckerOnlySwallowsRecheckBucketReplies()
+{
+ _top->open();
+ DestroyIteratorCommand cmd(spi::IteratorId(123));
+ _bottom->sendUp(api::StorageMessage::SP(cmd.makeReply()));
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _top->getNumReplies());
+}
+
+void
+ModifiedBucketCheckerTest::testRecheckRequestsAreChunked()
+{
+ namespace cfgns = vespa::config::content::core;
+ _handler->setUnitTestingSingleThreadedMode();
+ _top->open();
+ cfgns::StorServerConfigBuilder cfgBuilder;
+ cfgBuilder.bucketRecheckingChunkSize = 2;
+ _handler->configure(std::unique_ptr<cfgns::StorServerConfig>(
+ new cfgns::StorServerConfig(cfgBuilder)));
+
+ modifyBuckets(5, 0);
+ _handler->tick();
+
+ modifyBuckets(1, 10); // should not be checked yet;
+ // Rechecks should now be done in 3 chunks of 2, 2 and 1 each, respectively.
+ expectCommandsAndSendReplies(2, 0);
+
+ _handler->tick();
+ expectCommandsAndSendReplies(2, 2);
+
+ _handler->tick();
+ expectCommandsAndSendReplies(1, 4);
+
+ // New round of fetching
+ _handler->tick();
+ expectCommandsAndSendReplies(1, 10);
+
+ // And done!
+ _handler->tick();
+ expectCommandsAndSendReplies(0, 0);
+}
+
+void
+ModifiedBucketCheckerTest::testInvalidChunkSizeConfigIsRejected()
+{
+ namespace cfgns = vespa::config::content::core;
+ _handler->setUnitTestingSingleThreadedMode();
+ _top->open();
+ cfgns::StorServerConfigBuilder cfgBuilder;
+ cfgBuilder.bucketRecheckingChunkSize = 0;
+ try {
+ _handler->configure(std::unique_ptr<cfgns::StorServerConfig>(
+ new cfgns::StorServerConfig(cfgBuilder)));
+ CPPUNIT_FAIL("Expected bad config to be rejected");
+ } catch (const config::InvalidConfigException&) {
+ // Happy days
+ } catch (...) {
+ CPPUNIT_FAIL("Got unexpected exception");
+ }
+}
+
+// RecheckBucketInfoCommand handling is done in persistence threads,
+// so that functionality is tested in the filestor tests.
+
+} // ns storage
+
diff --git a/storage/src/tests/persistence/filestorage/operationabortingtest.cpp b/storage/src/tests/persistence/filestorage/operationabortingtest.cpp
new file mode 100644
index 00000000000..0d6583cacdb
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/operationabortingtest.cpp
@@ -0,0 +1,470 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vector>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/persistence/messages.h>
+#include <tests/persistence/persistenceproviderwrapper.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <tests/persistence/filestorage/filestortestfixture.h>
+#include <vespa/vespalib/util/barrier.h>
+#include <vespa/vespalib/util/thread.h>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".operationabortingtest");
+
+namespace storage {
+
+namespace {
+
+// Exploit the fact that PersistenceProviderWrapper already provides a forwarding
+// implementation of all SPI calls, so we can selectively override.
+class BlockingMockProvider : public PersistenceProviderWrapper
+{
+ vespalib::Barrier& _queueBarrier;
+ vespalib::Barrier& _completionBarrier;
+public:
+ typedef std::unique_ptr<BlockingMockProvider> UP;
+
+ mutable uint32_t _bucketInfoInvocations;
+ uint32_t _createBucketInvocations;
+ uint32_t _deleteBucketInvocations;
+
+ BlockingMockProvider(spi::PersistenceProvider& wrappedProvider,
+ vespalib::Barrier& queueBarrier,
+ vespalib::Barrier& completionBarrier)
+ : PersistenceProviderWrapper(wrappedProvider),
+ _queueBarrier(queueBarrier),
+ _completionBarrier(completionBarrier),
+ _bucketInfoInvocations(0),
+ _createBucketInvocations(0),
+ _deleteBucketInvocations(0)
+ {}
+
+ spi::Result put(const spi::Bucket& bucket,
+ spi::Timestamp timestamp,
+ const document::Document::SP& doc,
+ spi::Context& context) override
+ {
+ (void) bucket;
+ (void) timestamp;
+ (void) doc;
+ (void) context;
+ _queueBarrier.await();
+ // message abort stage with active opertion in disk queue
+ FastOS_Thread::Sleep(75);
+ _completionBarrier.await();
+ // test finished
+ return spi::Result();
+ }
+
+ spi::BucketInfoResult getBucketInfo(const spi::Bucket& bucket) const override {
+ ++_bucketInfoInvocations;
+ return PersistenceProviderWrapper::getBucketInfo(bucket);
+ }
+
+ spi::Result createBucket(const spi::Bucket& bucket, spi::Context& ctx) override {
+ ++_createBucketInvocations;
+ return PersistenceProviderWrapper::createBucket(bucket, ctx);
+ }
+
+ spi::Result deleteBucket(const spi::Bucket& bucket, spi::Context& ctx) override {
+ ++_deleteBucketInvocations;
+ return PersistenceProviderWrapper::deleteBucket(bucket, ctx);
+ }
+};
+
+spi::LoadType defaultLoadType(0, "default");
+
+}
+
+class OperationAbortingTest : public FileStorTestFixture
+{
+public:
+ spi::PersistenceProvider::UP _dummyProvider;
+ BlockingMockProvider* _blockingProvider;
+ std::unique_ptr<vespalib::Barrier> _queueBarrier;
+ std::unique_ptr<vespalib::Barrier> _completionBarrier;
+
+ void setupDisks(uint32_t diskCount, uint32_t queueBarrierThreads) {
+ FileStorTestFixture::setupDisks(diskCount);
+ _dummyProvider.reset(new spi::dummy::DummyPersistence(
+ _node->getTypeRepo(), diskCount));
+ _queueBarrier.reset(new vespalib::Barrier(queueBarrierThreads));
+ _completionBarrier.reset(new vespalib::Barrier(2));
+ _blockingProvider = new BlockingMockProvider(*_dummyProvider,
+ *_queueBarrier, *_completionBarrier);
+
+ _node->setPersistenceProvider(
+ spi::PersistenceProvider::UP(_blockingProvider));
+ }
+
+ void validateReplies(DummyStorageLink& link,
+ size_t repliesTotal,
+ const std::vector<document::BucketId>& okReplies,
+ const std::vector<document::BucketId>& abortedGetDiffs);
+
+ void doTestSpecificOperationsNotAborted(
+ const char* testName,
+ const std::vector<api::StorageMessage::SP>& msgs,
+ bool shouldCreateBucketInitially);
+
+ api::BucketInfo getBucketInfoFromDB(const document::BucketId&) const;
+
+public:
+ void testAbortMessageClearsRelevantQueuedOperations();
+ void testWaitForCurrentOperationCompletionForAbortedBucket();
+ void testDoNotAbortCreateBucketCommands();
+ void testDoNotAbortRecheckBucketCommands();
+ void testDoNotAbortDeleteBucketCommands();
+
+ void setUp() override;
+
+ CPPUNIT_TEST_SUITE(OperationAbortingTest);
+ CPPUNIT_TEST(testAbortMessageClearsRelevantQueuedOperations);
+ CPPUNIT_TEST(testWaitForCurrentOperationCompletionForAbortedBucket);
+ CPPUNIT_TEST(testDoNotAbortCreateBucketCommands);
+ CPPUNIT_TEST(testDoNotAbortRecheckBucketCommands);
+ CPPUNIT_TEST(testDoNotAbortDeleteBucketCommands);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(OperationAbortingTest);
+
+namespace {
+
+template <typename T, typename Collection>
+bool
+existsIn(const T& elem, const Collection& collection) {
+ return (std::find(collection.begin(), collection.end(), elem)
+ != collection.end());
+}
+
+}
+
+void
+OperationAbortingTest::setUp()
+{
+}
+
+void
+OperationAbortingTest::validateReplies(
+ DummyStorageLink& link,
+ size_t repliesTotal,
+ const std::vector<document::BucketId>& okReplies,
+ const std::vector<document::BucketId>& abortedGetDiffs)
+{
+ link.waitForMessages(repliesTotal, MSG_WAIT_TIME);
+ CPPUNIT_ASSERT_EQUAL(repliesTotal, link.getNumReplies());
+
+ for (uint32_t i = 0; i < repliesTotal; ++i) {
+ api::StorageReply& reply(
+ dynamic_cast<api::StorageReply&>(*link.getReply(i)));
+ LOG(info, "Checking reply %s", reply.toString(true).c_str());
+ switch (static_cast<uint32_t>(reply.getType().getId())) {
+ case api::MessageType::PUT_REPLY_ID:
+ case api::MessageType::CREATEBUCKET_REPLY_ID:
+ case api::MessageType::DELETEBUCKET_REPLY_ID:
+ case api::MessageType::GET_REPLY_ID:
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::OK, resultOf(reply));
+ break;
+ case api::MessageType::GETBUCKETDIFF_REPLY_ID:
+ {
+ api::GetBucketDiffReply& gr(
+ static_cast<api::GetBucketDiffReply&>(reply));
+ if (existsIn(gr.getBucketId(), abortedGetDiffs)) {
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::ABORTED, resultOf(reply));
+ } else {
+ CPPUNIT_ASSERT(existsIn(gr.getBucketId(), okReplies));
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::OK, resultOf(reply));
+ }
+ break;
+ }
+ case api::MessageType::INTERNAL_REPLY_ID:
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::OK, resultOf(reply));
+ break;
+ default:
+ CPPUNIT_FAIL("got unknown reply type");
+ }
+ }
+}
+
+namespace {
+
+template <typename Container>
+AbortBucketOperationsCommand::SP
+makeAbortCmd(const Container& buckets)
+{
+ std::unique_ptr<AbortBucketOperationsCommand::AbortPredicate> pred(
+ new AbortBucketOperationsCommand::ExplicitBucketSetPredicate(
+ buckets.begin(), buckets.end()));
+ AbortBucketOperationsCommand::SP cmd(
+ new AbortBucketOperationsCommand(std::move(pred)));
+ return cmd;
+}
+
+}
+
+void
+OperationAbortingTest::testAbortMessageClearsRelevantQueuedOperations()
+{
+ uint32_t queueBarrierThreads = 2;
+ setupDisks(1, queueBarrierThreads);
+ TestFileStorComponents c(*this, "testAbortMessageClearsRelevantQueuedOperations");
+ document::BucketId bucket(16, 1);
+ createBucket(bucket);
+ LOG(info, "Sending put to trigger thread barrier");
+ c.sendPut(bucket, DocumentIndex(0), PutTimestamp(1000));
+ LOG(info, "waiting for test and persistence thread to reach barriers");
+ _queueBarrier->await();
+ LOG(info, "barrier passed");
+ /*
+ * All load we send down to filestor from now on wil be enqueued, as the
+ * persistence thread is blocked.
+ *
+ * Cannot abort the bucket we're blocking the thread on since we'll
+ * deadlock the test if we do.
+ */
+ std::vector<document::BucketId> bucketsToAbort;
+ bucketsToAbort.push_back(document::BucketId(16, 3));
+ bucketsToAbort.push_back(document::BucketId(16, 5));
+ std::vector<document::BucketId> bucketsToKeep;
+ bucketsToKeep.push_back(document::BucketId(16, 2));
+ bucketsToKeep.push_back(document::BucketId(16, 4));
+
+ for (uint32_t i = 0; i < bucketsToAbort.size(); ++i) {
+ createBucket(bucketsToAbort[i]);
+ c.sendDummyGetDiff(bucketsToAbort[i]);
+ }
+ for (uint32_t i = 0; i < bucketsToKeep.size(); ++i) {
+ createBucket(bucketsToKeep[i]);
+ c.sendDummyGetDiff(bucketsToKeep[i]);
+ }
+
+ AbortBucketOperationsCommand::SP abortCmd(makeAbortCmd(bucketsToAbort));
+ c.top.sendDown(abortCmd);
+
+ LOG(info, "waiting on completion barrier");
+ _completionBarrier->await();
+
+ // put+abort+get replies
+ size_t expectedMsgs(2 + bucketsToAbort.size() + bucketsToKeep.size());
+ LOG(info, "barrier passed, waiting for %zu replies", expectedMsgs);
+
+ validateReplies(c.top, expectedMsgs, bucketsToKeep, bucketsToAbort);
+}
+
+namespace {
+
+/**
+ * Sending an abort while we're processing a message for a bucket in its set
+ * will block until the operation has completed. Therefore we logically cannot
+ * do any operations to trigger the operation to complete after the send in
+ * the same thread as we're sending in...
+ */
+class SendTask : public vespalib::Runnable
+{
+ AbortBucketOperationsCommand::SP _abortCmd;
+ vespalib::Barrier& _queueBarrier;
+ StorageLink& _downLink;
+public:
+ SendTask(const AbortBucketOperationsCommand::SP& abortCmd,
+ vespalib::Barrier& queueBarrier,
+ StorageLink& downLink)
+ : _abortCmd(abortCmd),
+ _queueBarrier(queueBarrier),
+ _downLink(downLink)
+ {}
+
+ void run() {
+ // Best-effort synchronized starting
+ _queueBarrier.await();
+ _downLink.sendDown(_abortCmd);
+ }
+};
+
+}
+
+/**
+ * This test basically is not fully deterministic in that it tests cross-thread
+ * behavior on mutexes that are not visible to the thread itself and where there
+ * are no available side-effects to consistently sync around. However, it should
+ * impose sufficient ordering guarantees that it never provides false positives
+ * as long as the tested functionality is in fact correct.
+ */
+void
+OperationAbortingTest::testWaitForCurrentOperationCompletionForAbortedBucket()
+{
+ uint32_t queueBarrierThreads = 3;
+ setupDisks(1, queueBarrierThreads);
+ TestFileStorComponents c(*this, "testWaitForCurrentOperationCompletionForAbortedBucket");
+
+ document::BucketId bucket(16, 1);
+ createBucket(bucket);
+ LOG(info, "Sending put to trigger thread barrier");
+ c.sendPut(bucket, DocumentIndex(0), PutTimestamp(1000));
+
+ std::vector<document::BucketId> abortSet { bucket };
+ AbortBucketOperationsCommand::SP abortCmd(makeAbortCmd(abortSet));
+
+ SendTask sendTask(abortCmd, *_queueBarrier, c.top);
+ vespalib::Thread thread(sendTask);
+ thread.start();
+
+ LOG(info, "waiting for threads to reach barriers");
+ _queueBarrier->await();
+ LOG(info, "barrier passed");
+
+ LOG(info, "waiting on completion barrier");
+ _completionBarrier->await();
+
+ thread.stop();
+ thread.join();
+
+ // If waiting works, put reply shall always be ordered before the internal
+ // reply, as it must finish processing fully before the abort returns.
+ c.top.waitForMessages(2, MSG_WAIT_TIME);
+ CPPUNIT_ASSERT_EQUAL(size_t(2), c.top.getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::PUT_REPLY,
+ c.top.getReply(0)->getType());
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::INTERNAL_REPLY,
+ c.top.getReply(1)->getType());
+}
+
+void
+OperationAbortingTest::testDoNotAbortCreateBucketCommands()
+{
+ document::BucketId bucket(16, 1);
+ std::vector<api::StorageMessage::SP> msgs;
+ msgs.push_back(api::StorageMessage::SP(new api::CreateBucketCommand(bucket)));
+
+ bool shouldCreateBucketInitially(false);
+ doTestSpecificOperationsNotAborted(
+ "testDoNotAbortCreateBucketCommands",
+ msgs,
+ shouldCreateBucketInitially);
+}
+
+void
+OperationAbortingTest::testDoNotAbortRecheckBucketCommands()
+{
+ document::BucketId bucket(16, 1);
+ std::vector<api::StorageMessage::SP> msgs;
+ msgs.push_back(api::StorageMessage::SP(new RecheckBucketInfoCommand(bucket)));
+
+ bool shouldCreateBucketInitially(true);
+ doTestSpecificOperationsNotAborted(
+ "testDoNotAbortRecheckBucketCommands",
+ msgs,
+ shouldCreateBucketInitially);
+}
+
+api::BucketInfo
+OperationAbortingTest::getBucketInfoFromDB(const document::BucketId& id) const
+{
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(id, "foo",
+ StorBucketDatabase::CREATE_IF_NONEXISTING));
+ CPPUNIT_ASSERT(entry.exist());
+ return entry->info;
+}
+
+void
+OperationAbortingTest::testDoNotAbortDeleteBucketCommands()
+{
+ document::BucketId bucket(16, 1);
+ std::vector<api::StorageMessage::SP> msgs;
+ api::DeleteBucketCommand::SP cmd(new api::DeleteBucketCommand(bucket));
+ msgs.push_back(cmd);
+
+ bool shouldCreateBucketInitially(true);
+ doTestSpecificOperationsNotAborted(
+ "testDoNotAbortRecheckBucketCommands",
+ msgs,
+ shouldCreateBucketInitially);
+}
+
+void
+OperationAbortingTest::doTestSpecificOperationsNotAborted(
+ const char* testName,
+ const std::vector<api::StorageMessage::SP>& msgs,
+ bool shouldCreateBucketInitially)
+{
+ uint32_t queueBarrierThreads = 2;
+ setupDisks(1, queueBarrierThreads);
+ TestFileStorComponents c(*this, testName);
+ document::BucketId bucket(16, 1);
+ document::BucketId blockerBucket(16, 2);
+
+ if (shouldCreateBucketInitially) {
+ createBucket(bucket);
+ }
+ createBucket(blockerBucket);
+ LOG(info, "Sending put to trigger thread barrier");
+ c.sendPut(blockerBucket, DocumentIndex(0), PutTimestamp(1000));
+ LOG(info, "waiting for test and persistence thread to reach barriers");
+ _queueBarrier->await();
+ LOG(info, "barrier passed");
+
+ uint32_t expectedCreateBuckets = 0;
+ uint32_t expectedDeleteBuckets = 0;
+ uint32_t expectedBucketInfoInvocations = 1; // from blocker put
+ uint32_t expectedRecheckReplies = 0;
+
+ for (uint32_t i = 0; i < msgs.size(); ++i) {
+ switch (msgs[i]->getType().getId()) {
+ case api::MessageType::CREATEBUCKET_ID:
+ ++expectedCreateBuckets;
+ break;
+ case api::MessageType::DELETEBUCKET_ID:
+ {
+ api::DeleteBucketCommand& delCmd(
+ dynamic_cast<api::DeleteBucketCommand&>(*msgs[i]));
+ delCmd.setBucketInfo(getBucketInfoFromDB(delCmd.getBucketId()));
+ }
+ ++expectedDeleteBuckets;
+ ++expectedBucketInfoInvocations;
+ break;
+ case api::MessageType::INTERNAL_ID:
+ ++expectedRecheckReplies;
+ ++expectedBucketInfoInvocations;
+ break;
+ default:
+ CPPUNIT_FAIL("unsupported message type");
+ }
+ c.top.sendDown(msgs[i]);
+ }
+
+ std::vector<document::BucketId> abortSet { bucket };
+ AbortBucketOperationsCommand::SP abortCmd(makeAbortCmd(abortSet));
+ c.top.sendDown(abortCmd);
+
+ LOG(info, "waiting on completion barrier");
+ _completionBarrier->await();
+
+ // At this point, the recheck command is still either enqueued, is processing
+ // or has finished. Since it does not generate any replies, send a low priority
+ // get which will wait until it has finished processing.
+ c.sendDummyGet(blockerBucket);
+
+ // put+abort+get + any other creates/deletes/rechecks
+ size_t expectedMsgs(3 + expectedCreateBuckets + expectedDeleteBuckets
+ + expectedRecheckReplies);
+ LOG(info, "barrier passed, waiting for %zu replies", expectedMsgs);
+
+ std::vector<document::BucketId> okReplies;
+ okReplies.push_back(bucket);
+ okReplies.push_back(blockerBucket);
+ std::vector<document::BucketId> abortedGetDiffs;
+ validateReplies(c.top, expectedMsgs, okReplies, abortedGetDiffs);
+
+ CPPUNIT_ASSERT_EQUAL(expectedBucketInfoInvocations,
+ _blockingProvider->_bucketInfoInvocations);
+ CPPUNIT_ASSERT_EQUAL(expectedCreateBuckets + (shouldCreateBucketInitially ? 2 : 1),
+ _blockingProvider->_createBucketInvocations);
+ CPPUNIT_ASSERT_EQUAL(expectedDeleteBuckets,
+ _blockingProvider->_deleteBucketInvocations);
+}
+
+
+} // storage
diff --git a/storage/src/tests/persistence/filestorage/sanitycheckeddeletetest.cpp b/storage/src/tests/persistence/filestorage/sanitycheckeddeletetest.cpp
new file mode 100644
index 00000000000..9b492a3aaa6
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/sanitycheckeddeletetest.cpp
@@ -0,0 +1,78 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <tests/persistence/persistenceproviderwrapper.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <tests/persistence/filestorage/filestortestfixture.h>
+
+namespace storage {
+
+class SanityCheckedDeleteTest : public FileStorTestFixture
+{
+public:
+ void testDeleteBucketFailsWhenProviderOutOfSync();
+
+ CPPUNIT_TEST_SUITE(SanityCheckedDeleteTest);
+ CPPUNIT_TEST(testDeleteBucketFailsWhenProviderOutOfSync);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(SanityCheckedDeleteTest);
+
+void
+SanityCheckedDeleteTest::testDeleteBucketFailsWhenProviderOutOfSync()
+{
+ TestFileStorComponents c(*this, "testDeleteBucketFailsWhenProviderOutOfSync");
+ document::BucketId bucket(8, 123);
+ document::BucketId syncBucket(8, 234);
+ spi::Bucket spiBucket(bucket, spi::PartitionId(0));
+
+ createBucket(bucket);
+ // Send a put to ensure bucket isn't empty.
+ c.sendPut(bucket, DocumentIndex(0), PutTimestamp(1000));
+ c.top.waitForMessages(1, MSG_WAIT_TIME);
+ c.top.getRepliesOnce();
+ spi::BucketInfo infoBefore(
+ _node->getPersistenceProvider()
+ .getBucketInfo(spiBucket).getBucketInfo());
+
+ createBucket(syncBucket);
+
+ api::BucketInfo serviceLayerInfo(1, 2, 3, 4, 5, true, false);
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(bucket, "foo",
+ StorBucketDatabase::CREATE_IF_NONEXISTING));
+ entry->disk = 0;
+ entry->info = serviceLayerInfo;
+ entry.write();
+ }
+
+ std::shared_ptr<api::DeleteBucketCommand> cmd(
+ new api::DeleteBucketCommand(bucket));
+ cmd->setBucketInfo(serviceLayerInfo);
+
+ c.top.sendDown(cmd);
+ c.top.waitForMessages(1, MSG_WAIT_TIME);
+ api::StorageMessage::SP reply(c.top.getReply(0));
+ api::DeleteBucketReply& deleteReply(
+ dynamic_cast<api::DeleteBucketReply&>(*reply));
+ // Reply happens in a filestor manager context and before the sanity
+ // check kicks in, meaning it will always be OK.
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::OK, resultOf(deleteReply));
+ // At this point we do not know if the scheduled delete has been
+ // executed; it may still be in the persistence queue.
+ // Send a put to another bucket to serialize the operation (guaranteed
+ // since we only have 1 thread and the delete always has max priority).
+ c.sendPut(syncBucket, DocumentIndex(0), PutTimestamp(1001));
+ c.top.waitForMessages(1, MSG_WAIT_TIME);
+ // Should still be able to get identical bucket info for bucket.
+ spi::BucketInfoResult infoResult(
+ _node->getPersistenceProvider().getBucketInfo(spiBucket));
+ CPPUNIT_ASSERT_MSG(infoResult.getErrorMessage(), !infoResult.hasError());
+ CPPUNIT_ASSERT(infoBefore == infoResult.getBucketInfo());
+}
+
+} // namespace storage
diff --git a/storage/src/tests/persistence/filestorage/singlebucketjointest.cpp b/storage/src/tests/persistence/filestorage/singlebucketjointest.cpp
new file mode 100644
index 00000000000..480652207d3
--- /dev/null
+++ b/storage/src/tests/persistence/filestorage/singlebucketjointest.cpp
@@ -0,0 +1,51 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <tests/persistence/persistenceproviderwrapper.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <tests/persistence/filestorage/filestortestfixture.h>
+
+LOG_SETUP(".singlebucketjointest");
+
+namespace storage {
+
+class SingleBucketJoinTest : public FileStorTestFixture
+{
+public:
+ void testPersistenceCanHandleSingleBucketJoin();
+
+ CPPUNIT_TEST_SUITE(SingleBucketJoinTest);
+ CPPUNIT_TEST(testPersistenceCanHandleSingleBucketJoin);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(SingleBucketJoinTest);
+
+void
+SingleBucketJoinTest::testPersistenceCanHandleSingleBucketJoin()
+{
+ TestFileStorComponents c(*this, "testPersistenceCanHandleSingleBucketJoin");
+ document::BucketId targetBucket(16, 1);
+ document::BucketId sourceBucket(17, 1);
+
+ createBucket(sourceBucket);
+ // Make sure it's not empty
+ c.sendPut(sourceBucket, DocumentIndex(0), PutTimestamp(1000));
+ expectOkReply<api::PutReply>(c.top);
+ c.top.getRepliesOnce();
+
+ auto cmd = std::make_shared<api::JoinBucketsCommand>(targetBucket);
+ cmd->getSourceBuckets().push_back(sourceBucket);
+ cmd->getSourceBuckets().push_back(sourceBucket);
+
+ c.top.sendDown(cmd);
+ // If single bucket join locking is not working properly, this
+ // will hang forever.
+ expectOkReply<api::JoinBucketsReply>(c.top);
+}
+
+} // namespace storage
diff --git a/storage/src/tests/persistence/legacyoperationhandlertest.cpp b/storage/src/tests/persistence/legacyoperationhandlertest.cpp
new file mode 100644
index 00000000000..ca496f4a260
--- /dev/null
+++ b/storage/src/tests/persistence/legacyoperationhandlertest.cpp
@@ -0,0 +1,190 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/document/base/testdocrepo.h>
+#include <vespa/document/repo/documenttyperepo.h>
+#include <vespa/documentapi/loadtypes/loadtype.h>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <tests/persistence/persistencetestutils.h>
+#include <vespa/storage/persistence/types.h>
+
+using document::DocumentTypeRepo;
+using document::TestDocRepo;
+
+namespace storage {
+
+class LegacyOperationHandlerTest : public SingleDiskPersistenceTestUtils
+{
+ CPPUNIT_TEST_SUITE(LegacyOperationHandlerTest);
+ CPPUNIT_TEST(testMultioperationSingleBodyPut);
+ CPPUNIT_TEST(testMultioperationSingleRemove);
+ CPPUNIT_TEST(testMultioperationSingleUpdate);
+ CPPUNIT_TEST(testMultioperationUpdateNotFound);
+ CPPUNIT_TEST(testMultioperationMixedOperations);
+ CPPUNIT_TEST_SUITE_END();
+
+public:
+ void setUp() {
+ SingleDiskPersistenceTestUtils::setUp();
+ createBucket(document::BucketId(16, 4));
+ spi::Context context(spi::LoadType(0, "default"), spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ getPersistenceProvider().createBucket(
+ spi::Bucket(document::BucketId(16, 4), spi::PartitionId(0)),
+ context);
+ }
+
+ std::string stat() {
+ return dumpBucket(document::BucketId(16, 4), 0);
+ }
+
+ void testMultioperationSingleBodyPut();
+ void testMultioperationSingleRemove();
+ void testMultioperationSingleUpdate();
+ void testMultioperationUpdateNotFound();
+ void testMultioperationMixedOperations();
+ void testMultioperationMixedOperationsWrongBucket();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(LegacyOperationHandlerTest);
+
+void
+LegacyOperationHandlerTest::testMultioperationSingleBodyPut()
+{
+ std::unique_ptr<PersistenceThread> thread(createPersistenceThread(0));
+ document::BucketId bucketId(16, 4);
+
+ document::Document::SP doc(createRandomDocumentAtLocation(4, 1234, 0, 128));
+
+ std::vector<char> buffer(1024);
+ vdslib::WritableDocumentList block(getTypeRepo(), &buffer[0], buffer.size());
+ block.addPut(*doc, api::Timestamp(1234));
+
+ api::MultiOperationCommand cmd(getTypeRepo(), bucketId, 0);
+ cmd.setOperations(block);
+
+ thread->handleMultiOperation(cmd);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("DocEntry(1234, 0, Doc(id:mail:testdoctype1:n=4:3619.html))\n"), stat());
+}
+
+void
+LegacyOperationHandlerTest::testMultioperationSingleRemove()
+{
+ std::unique_ptr<PersistenceThread> thread(createPersistenceThread(0));
+ document::BucketId bucketId(16, 4);
+
+ document::Document::SP doc = doPut(4, spi::Timestamp(1234));
+
+ std::vector<char> buffer(1024);
+ vdslib::WritableDocumentList block(getTypeRepo(), &buffer[0], buffer.size());
+ block.addRemove(doc->getId(), spi::Timestamp(1235));
+
+ api::MultiOperationCommand cmd(getTypeRepo(), bucketId, 0);
+ cmd.setOperations(block);
+
+ thread->handleMultiOperation(cmd);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("DocEntry(1234, 0, Doc(id:mail:testdoctype1:n=4:3619.html))\n"
+ "DocEntry(1235, 1, id:mail:testdoctype1:n=4:3619.html)\n"), stat());
+}
+
+void
+LegacyOperationHandlerTest::testMultioperationSingleUpdate()
+{
+ std::unique_ptr<PersistenceThread> thread(createPersistenceThread(0));
+ document::BucketId bucketId(16, 4);
+ document::StringFieldValue updateValue("foo");
+
+ document::Document::SP doc = doPut(4, spi::Timestamp(1234));
+ document::Document originalDoc(*doc);
+
+ document::DocumentUpdate::SP update = createBodyUpdate(
+ doc->getId(), updateValue);
+
+ std::vector<char> buffer(1024);
+ vdslib::WritableDocumentList block(getTypeRepo(), &buffer[0], buffer.size());
+ block.addUpdate(*update, api::Timestamp(1235));
+
+ api::MultiOperationCommand cmd(getTypeRepo(), bucketId, 0);
+ cmd.setOperations(block);
+
+ thread->handleMultiOperation(cmd);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("DocEntry(1234, 0, Doc(id:mail:testdoctype1:n=4:3619.html))\n"
+ "DocEntry(1235, 0, Doc(id:mail:testdoctype1:n=4:3619.html))\n"), stat());
+}
+
+void
+LegacyOperationHandlerTest::testMultioperationUpdateNotFound()
+{
+ std::unique_ptr<PersistenceThread> thread(createPersistenceThread(0));
+ document::BucketId bucketId(16, 4);
+ document::DocumentId docId("userdoc:test:4:0");
+ document::StringFieldValue updateValue("foo");
+
+ document::DocumentUpdate::SP update = createBodyUpdate(
+ docId, updateValue);
+
+ std::vector<char> buffer(1024);
+ vdslib::WritableDocumentList block(getTypeRepo(), &buffer[0], buffer.size());
+ block.addUpdate(*update, api::Timestamp(1235));
+
+ api::MultiOperationCommand cmd(getTypeRepo(), bucketId, 0);
+ cmd.setOperations(block);
+
+ thread->handleMultiOperation(cmd);
+
+ CPPUNIT_ASSERT_EQUAL(std::string(""), stat());
+}
+
+void
+LegacyOperationHandlerTest::testMultioperationMixedOperations()
+{
+ std::unique_ptr<PersistenceThread> thread(createPersistenceThread(0));
+ document::BucketId bucketId(16, 4);
+ document::StringFieldValue updateValue("bar");
+
+ document::Document::SP originalUpdateDoc = doPut(4, spi::Timestamp(1234));
+ document::Document::SP originalRemoveDoc = doPut(4, spi::Timestamp(2345));
+
+ document::DocumentUpdate::SP update = createBodyUpdate(
+ originalUpdateDoc->getId(), updateValue);
+
+ document::DocumentUpdate::SP nonExistingUpdate = createBodyUpdate(
+ document::DocumentId("id:test:testdoctype1:n=4:nonexisting1"), updateValue);
+
+ document::Document::SP putDoc(createRandomDocumentAtLocation(4, 5678, 0, 128));
+
+ std::vector<char> buffer(1024);
+ vdslib::WritableDocumentList block(getTypeRepo(), &buffer[0], buffer.size());
+
+ block.addUpdate(*update, api::Timestamp(3456));
+ block.addUpdate(*nonExistingUpdate, api::Timestamp(3457));
+ block.addRemove(originalRemoveDoc->getId(), api::Timestamp(4567));
+ block.addRemove(document::DocumentId("id:test:testdoctype1:n=4:nonexisting2"),
+ api::Timestamp(4568));
+ block.addPut(*putDoc, api::Timestamp(5678));
+
+ api::MultiOperationCommand cmd(getTypeRepo(), bucketId, 0);
+ cmd.setOperations(block);
+
+ thread->handleMultiOperation(cmd);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("DocEntry(1234, 0, Doc(id:mail:testdoctype1:n=4:3619.html))\n"
+ "DocEntry(2345, 0, Doc(id:mail:testdoctype1:n=4:4008.html))\n"
+ "DocEntry(3456, 0, Doc(id:mail:testdoctype1:n=4:3619.html))\n"
+ "DocEntry(4567, 1, id:mail:testdoctype1:n=4:4008.html)\n"
+ "DocEntry(4568, 1, id:test:testdoctype1:n=4:nonexisting2)\n"
+ "DocEntry(5678, 0, Doc(id:mail:testdoctype1:n=4:5177.html))\n"),
+ stat());
+}
+
+}
diff --git a/storage/src/tests/persistence/mergehandlertest.cpp b/storage/src/tests/persistence/mergehandlertest.cpp
new file mode 100644
index 00000000000..3d3ce25a7d7
--- /dev/null
+++ b/storage/src/tests/persistence/mergehandlertest.cpp
@@ -0,0 +1,1494 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/document/base/testdocman.h>
+#include <vespa/storage/persistence/mergehandler.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/log/log.h>
+#include <tests/persistence/persistencetestutils.h>
+#include <tests/persistence/persistenceproviderwrapper.h>
+#include <tests/distributor/messagesenderstub.h>
+#include <vespa/storageframework/defaultimplementation/clock/fakeclock.h>
+
+LOG_SETUP(".test.persistence.handler.merge");
+
+namespace storage {
+
+struct MergeHandlerTest : public SingleDiskPersistenceTestUtils
+{
+ uint32_t _location; // Location used for all merge tests
+ document::BucketId _bucket; // Bucket used for all merge tests
+ uint64_t _maxTimestamp;
+ std::vector<api::MergeBucketCommand::Node> _nodes;
+ std::unique_ptr<spi::Context> _context;
+
+ // Fetch a single command or reply; doesn't care which.
+ template <typename T>
+ std::shared_ptr<T> fetchSingleMessage();
+
+ void setUp();
+
+ enum ChainPos { FRONT, MIDDLE, BACK };
+ void setUpChain(ChainPos);
+
+ // Test a regular merge bucket command fetching data, including
+ // puts, removes, unrevertable removes & duplicates.
+ void testMergeBucketCommand();
+ // Test that a simplistic merge with nothing to actually merge,
+ // sends get bucket diff through the entire chain of 3 nodes.
+ void testGetBucketDiffChain(bool midChain);
+ void testGetBucketDiffMidChain() { testGetBucketDiffChain(true); }
+ void testGetBucketDiffEndOfChain() { testGetBucketDiffChain(false); }
+ // Test that a simplistic merge with nothing to actually merge,
+ // sends apply bucket diff through the entire chain of 3 nodes.
+ void testApplyBucketDiffChain(bool midChain);
+ void testApplyBucketDiffMidChain() { testApplyBucketDiffChain(true); }
+ void testApplyBucketDiffEndOfChain() { testApplyBucketDiffChain(false); }
+ // Test that a simplistic merge with one thing to actually merge,
+ // sends correct commands and finish.
+ void testMasterMessageFlow();
+ // Test that a simplistic merge with 1 doc to actually merge,
+ // sends apply bucket diff through the entire chain of 3 nodes.
+ void testApplyBucketDiffChain();
+ void testMergeUnrevertableRemove();
+ void testChunkedApplyBucketDiff();
+ void testChunkLimitPartiallyFilledDiff();
+ void testMaxTimestamp();
+ void testSPIFlushGuard();
+ void testBucketNotFoundInDb();
+ void testMergeProgressSafeGuard();
+ void testSafeGuardNotInvokedWhenHasMaskChanges();
+ void testEntryRemovedAfterGetBucketDiff();
+
+ void testMergeBucketSPIFailures();
+ void testGetBucketDiffSPIFailures();
+ void testApplyBucketDiffSPIFailures();
+ void testGetBucketDiffReplySPIFailures();
+ void testApplyBucketDiffReplySPIFailures();
+
+ void testRemoveFromDiff();
+
+ void testRemovePutOnExistingTimestamp();
+
+ CPPUNIT_TEST_SUITE(MergeHandlerTest);
+ CPPUNIT_TEST(testMergeBucketCommand);
+ CPPUNIT_TEST(testGetBucketDiffMidChain);
+ CPPUNIT_TEST(testGetBucketDiffEndOfChain);
+ CPPUNIT_TEST(testApplyBucketDiffMidChain);
+ CPPUNIT_TEST(testApplyBucketDiffEndOfChain);
+ CPPUNIT_TEST(testMasterMessageFlow);
+ CPPUNIT_TEST(testMergeUnrevertableRemove);
+ CPPUNIT_TEST(testChunkedApplyBucketDiff);
+ CPPUNIT_TEST(testChunkLimitPartiallyFilledDiff);
+ CPPUNIT_TEST(testMaxTimestamp);
+ CPPUNIT_TEST(testSPIFlushGuard);
+ CPPUNIT_TEST(testBucketNotFoundInDb);
+ CPPUNIT_TEST(testMergeProgressSafeGuard);
+ CPPUNIT_TEST(testSafeGuardNotInvokedWhenHasMaskChanges);
+ CPPUNIT_TEST(testEntryRemovedAfterGetBucketDiff);
+ CPPUNIT_TEST(testMergeBucketSPIFailures);
+ CPPUNIT_TEST(testGetBucketDiffSPIFailures);
+ CPPUNIT_TEST(testApplyBucketDiffSPIFailures);
+ CPPUNIT_TEST(testGetBucketDiffReplySPIFailures);
+ CPPUNIT_TEST(testApplyBucketDiffReplySPIFailures);
+ CPPUNIT_TEST(testRemoveFromDiff);
+ CPPUNIT_TEST(testRemovePutOnExistingTimestamp);
+ CPPUNIT_TEST_SUITE_END();
+
+ // @TODO Add test to test that buildBucketInfo and mergeLists create minimal list (wrong sorting screws this up)
+private:
+ void fillDummyApplyDiff(std::vector<api::ApplyBucketDiffCommand::Entry>& diff);
+ std::shared_ptr<api::ApplyBucketDiffCommand> createDummyApplyDiff(
+ int timestampOffset,
+ uint16_t hasMask = 0x1,
+ bool filled = true);
+
+ std::shared_ptr<api::GetBucketDiffCommand>
+ createDummyGetBucketDiff(int timestampOffset,
+ uint16_t hasMask);
+
+ struct ExpectedExceptionSpec // Try saying this out loud 3 times in a row.
+ {
+ uint32_t mask;
+ const char* expected;
+ };
+
+ class HandlerInvoker
+ {
+ public:
+ virtual ~HandlerInvoker() {}
+ virtual void beforeInvoke(MergeHandlerTest&, MergeHandler&, spi::Context&) {}
+ virtual void invoke(MergeHandlerTest&, MergeHandler&, spi::Context&) = 0;
+ virtual std::string afterInvoke(MergeHandlerTest&, MergeHandler&) = 0;
+ };
+ friend class HandlerInvoker;
+
+ class NoReplyHandlerInvoker
+ : public HandlerInvoker
+ {
+ public:
+ std::string afterInvoke(MergeHandlerTest&, MergeHandler&);
+ };
+
+ template <typename ExpectedMessage>
+ std::string checkMessage(api::ReturnCode::Result expectedResult);
+
+ class HandleMergeBucketInvoker
+ : public NoReplyHandlerInvoker
+ {
+ public:
+ void invoke(MergeHandlerTest&, MergeHandler&, spi::Context&);
+ };
+
+ class HandleMergeBucketReplyInvoker
+ : public NoReplyHandlerInvoker
+ {
+ public:
+ void invoke(MergeHandlerTest&, MergeHandler&, spi::Context&);
+ };
+
+ class HandleGetBucketDiffInvoker
+ : public NoReplyHandlerInvoker
+ {
+ public:
+ void invoke(MergeHandlerTest&, MergeHandler&, spi::Context&);
+ };
+
+ class MultiPositionHandlerInvoker
+ : public HandlerInvoker
+ {
+ public:
+ MultiPositionHandlerInvoker()
+ : _pos(FRONT)
+ {
+ }
+ void setChainPos(ChainPos pos) { _pos = pos; }
+ ChainPos getChainPos() const { return _pos; }
+ private:
+ ChainPos _pos;
+ };
+
+ class HandleGetBucketDiffReplyInvoker
+ : public HandlerInvoker
+ {
+ public:
+ void beforeInvoke(MergeHandlerTest&, MergeHandler&, spi::Context&);
+ void invoke(MergeHandlerTest&, MergeHandler&, spi::Context&);
+ std::string afterInvoke(MergeHandlerTest&, MergeHandler&);
+ private:
+ MessageSenderStub _stub;
+ std::shared_ptr<api::GetBucketDiffCommand> _diffCmd;
+ };
+
+ class HandleApplyBucketDiffInvoker
+ : public NoReplyHandlerInvoker
+ {
+ public:
+ HandleApplyBucketDiffInvoker() : _counter(0) {}
+ void invoke(MergeHandlerTest&, MergeHandler&, spi::Context&);
+ private:
+ int _counter;
+ };
+
+ class HandleApplyBucketDiffReplyInvoker
+ : public MultiPositionHandlerInvoker
+ {
+ public:
+ HandleApplyBucketDiffReplyInvoker()
+ : _counter(0),
+ _stub(),
+ _applyCmd()
+ {}
+ void beforeInvoke(MergeHandlerTest&, MergeHandler&, spi::Context&);
+ void invoke(MergeHandlerTest&, MergeHandler&, spi::Context&);
+ std::string afterInvoke(MergeHandlerTest&, MergeHandler&);
+ private:
+ int _counter;
+ MessageSenderStub _stub;
+ std::shared_ptr<api::ApplyBucketDiffCommand> _applyCmd;
+ };
+
+ std::string
+ doTestSPIException(MergeHandler& handler,
+ PersistenceProviderWrapper& providerWrapper,
+ HandlerInvoker& invoker,
+ const ExpectedExceptionSpec& spec);
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(MergeHandlerTest);
+
+void
+MergeHandlerTest::setUp() {
+ _context.reset(new spi::Context(documentapi::LoadType::DEFAULT, 0, 0));
+ SingleDiskPersistenceTestUtils::setUp();
+
+ _location = 1234;
+ _bucket = document::BucketId(16, _location);
+ _maxTimestamp = 11501;
+
+ LOG(info, "Creating %s in bucket database", _bucket.toString().c_str());
+ bucketdb::StorageBucketInfo bucketDBEntry;
+ bucketDBEntry.disk = 0;
+ getEnv().getBucketDatabase().insert(_bucket, bucketDBEntry, "mergetestsetup");
+
+ LOG(info, "Creating bucket to merge");
+ createTestBucket(_bucket);
+
+ setUpChain(FRONT);
+}
+
+void
+MergeHandlerTest::setUpChain(ChainPos pos) {
+ _nodes.clear();
+ if (pos != FRONT) {
+ _nodes.push_back(api::MergeBucketCommand::Node(2, false));
+ }
+ _nodes.push_back(api::MergeBucketCommand::Node(0, false));
+ if (pos != BACK) {
+ _nodes.push_back(api::MergeBucketCommand::Node(1, false));
+ }
+}
+
+void
+MergeHandlerTest::testMergeBucketCommand()
+{
+ MergeHandler handler(getPersistenceProvider(), getEnv());
+
+ LOG(info, "Handle a merge bucket command");
+ api::MergeBucketCommand cmd(_bucket, _nodes, _maxTimestamp);
+ cmd.setSourceIndex(1234);
+ MessageTracker::UP tracker = handler.handleMergeBucket(cmd, *_context);
+
+ LOG(info, "Check state");
+ CPPUNIT_ASSERT_EQUAL(uint64_t(1), messageKeeper()._msgs.size());
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::GETBUCKETDIFF,
+ messageKeeper()._msgs[0]->getType());
+ api::GetBucketDiffCommand& cmd2(dynamic_cast<api::GetBucketDiffCommand&>(
+ *messageKeeper()._msgs[0]));
+ CPPUNIT_ASSERT_EQUAL(_nodes, cmd2.getNodes());
+ std::vector<api::GetBucketDiffCommand::Entry> diff(cmd2.getDiff());
+ CPPUNIT_ASSERT_EQUAL(uint64_t(17), diff.size());
+ CPPUNIT_ASSERT_EQUAL(uint16_t(1), cmd2.getAddress()->getIndex());
+ CPPUNIT_ASSERT_EQUAL(uint16_t(1234), cmd2.getSourceIndex());
+
+ tracker->generateReply(cmd);
+ CPPUNIT_ASSERT(!tracker->getReply().get());
+}
+
+void
+MergeHandlerTest::testGetBucketDiffChain(bool midChain)
+{
+ setUpChain(midChain ? MIDDLE : BACK);
+ MergeHandler handler(getPersistenceProvider(), getEnv());
+
+ LOG(info, "Verifying that get bucket diff is sent on");
+ api::GetBucketDiffCommand cmd(_bucket, _nodes, _maxTimestamp);
+ MessageTracker::UP tracker1 = handler.handleGetBucketDiff(cmd, *_context);
+ api::StorageMessage::SP replySent = tracker1->getReply();
+
+ if (midChain) {
+ LOG(info, "Check state");
+ CPPUNIT_ASSERT_EQUAL(uint64_t(1), messageKeeper()._msgs.size());
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::GETBUCKETDIFF,
+ messageKeeper()._msgs[0]->getType());
+ api::GetBucketDiffCommand& cmd2(
+ dynamic_cast<api::GetBucketDiffCommand&>(
+ *messageKeeper()._msgs[0]));
+ CPPUNIT_ASSERT_EQUAL(_nodes, cmd2.getNodes());
+ std::vector<api::GetBucketDiffCommand::Entry> diff(cmd2.getDiff());
+ CPPUNIT_ASSERT_EQUAL(uint64_t(17), diff.size());
+ CPPUNIT_ASSERT_EQUAL(uint16_t(1), cmd2.getAddress()->getIndex());
+
+ LOG(info, "Verifying that replying the diff sends on back");
+ api::GetBucketDiffReply::UP reply(new api::GetBucketDiffReply(cmd2));
+
+ CPPUNIT_ASSERT(!replySent.get());
+
+ MessageSenderStub stub;
+ handler.handleGetBucketDiffReply(*reply, stub);
+ CPPUNIT_ASSERT_EQUAL(1, (int)stub.replies.size());
+ replySent = stub.replies[0];
+ }
+ api::GetBucketDiffReply::SP reply2(
+ std::dynamic_pointer_cast<api::GetBucketDiffReply>(
+ replySent));
+ CPPUNIT_ASSERT(reply2.get());
+
+ CPPUNIT_ASSERT_EQUAL(_nodes, reply2->getNodes());
+ std::vector<api::GetBucketDiffCommand::Entry> diff(reply2->getDiff());
+ CPPUNIT_ASSERT_EQUAL(uint64_t(17), diff.size());
+}
+
+void
+MergeHandlerTest::testApplyBucketDiffChain(bool midChain)
+{
+ setUpChain(midChain ? MIDDLE : BACK);
+ MergeHandler handler(getPersistenceProvider(), getEnv());
+
+ LOG(info, "Verifying that apply bucket diff is sent on");
+ api::ApplyBucketDiffCommand cmd(_bucket, _nodes, _maxTimestamp);
+ MessageTracker::UP tracker1 = handler.handleApplyBucketDiff(cmd, *_context);
+ api::StorageMessage::SP replySent = tracker1->getReply();
+
+ if (midChain) {
+ LOG(info, "Check state");
+ CPPUNIT_ASSERT_EQUAL(uint64_t(1), messageKeeper()._msgs.size());
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::APPLYBUCKETDIFF,
+ messageKeeper()._msgs[0]->getType());
+ api::ApplyBucketDiffCommand& cmd2(
+ dynamic_cast<api::ApplyBucketDiffCommand&>(
+ *messageKeeper()._msgs[0]));
+ CPPUNIT_ASSERT_EQUAL(_nodes, cmd2.getNodes());
+ std::vector<api::ApplyBucketDiffCommand::Entry> diff(cmd2.getDiff());
+ CPPUNIT_ASSERT_EQUAL(uint64_t(0), diff.size());
+ CPPUNIT_ASSERT_EQUAL(uint16_t(1), cmd2.getAddress()->getIndex());
+
+ CPPUNIT_ASSERT(!replySent.get());
+
+ LOG(info, "Verifying that replying the diff sends on back");
+ api::ApplyBucketDiffReply::UP reply(
+ new api::ApplyBucketDiffReply(cmd2));
+
+ MessageSenderStub stub;
+ handler.handleApplyBucketDiffReply(*reply, stub);
+ CPPUNIT_ASSERT_EQUAL(1, (int)stub.replies.size());
+ replySent = stub.replies[0];
+ }
+
+ api::ApplyBucketDiffReply::SP reply2(
+ std::dynamic_pointer_cast<api::ApplyBucketDiffReply>(replySent));
+ CPPUNIT_ASSERT(reply2.get());
+
+ CPPUNIT_ASSERT_EQUAL(_nodes, reply2->getNodes());
+ std::vector<api::ApplyBucketDiffCommand::Entry> diff(reply2->getDiff());
+ CPPUNIT_ASSERT_EQUAL(uint64_t(0), diff.size());
+}
+
+void
+MergeHandlerTest::testMasterMessageFlow()
+{
+ MergeHandler handler(getPersistenceProvider(), getEnv());
+
+ LOG(info, "Handle a merge bucket command");
+ api::MergeBucketCommand cmd(_bucket, _nodes, _maxTimestamp);
+
+ handler.handleMergeBucket(cmd, *_context);
+ LOG(info, "Check state");
+ CPPUNIT_ASSERT_EQUAL(uint64_t(1), messageKeeper()._msgs.size());
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::GETBUCKETDIFF,
+ messageKeeper()._msgs[0]->getType());
+ api::GetBucketDiffCommand& cmd2(dynamic_cast<api::GetBucketDiffCommand&>(
+ *messageKeeper()._msgs[0]));
+
+ api::GetBucketDiffReply::UP reply(new api::GetBucketDiffReply(cmd2));
+ // End of chain can remove entries all have. This should end up with
+ // one entry master node has other node don't have
+ reply->getDiff().resize(1);
+
+ handler.handleGetBucketDiffReply(*reply, messageKeeper());
+
+ LOG(info, "Check state");
+ CPPUNIT_ASSERT_EQUAL(uint64_t(2), messageKeeper()._msgs.size());
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::APPLYBUCKETDIFF,
+ messageKeeper()._msgs[1]->getType());
+ api::ApplyBucketDiffCommand& cmd3(
+ dynamic_cast<api::ApplyBucketDiffCommand&>(
+ *messageKeeper()._msgs[1]));
+ api::ApplyBucketDiffReply::UP reply2(new api::ApplyBucketDiffReply(cmd3));
+ CPPUNIT_ASSERT_EQUAL(size_t(1), reply2->getDiff().size());
+ reply2->getDiff()[0]._entry._hasMask |= 2;
+
+ MessageSenderStub stub;
+ handler.handleApplyBucketDiffReply(*reply2, stub);
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)stub.replies.size());
+
+ api::MergeBucketReply::SP reply3(
+ std::dynamic_pointer_cast<api::MergeBucketReply>(stub.replies[0]));
+ CPPUNIT_ASSERT(reply3.get());
+
+ CPPUNIT_ASSERT_EQUAL(_nodes, reply3->getNodes());
+ CPPUNIT_ASSERT(reply3->getResult().success());
+ CPPUNIT_ASSERT(!fsHandler().isMerging(_bucket));
+}
+
+void
+MergeHandlerTest::testMergeUnrevertableRemove()
+{
+/*
+ MergeHandler handler(getPersistenceProvider(), getEnv());
+
+ LOG(info, "Handle a merge bucket command");
+ api::MergeBucketCommand cmd(_bucket, _nodes, _maxTimestamp);
+ {
+ MessageTracker tracker;
+ handler.handleMergeBucket(cmd, tracker);
+ }
+
+ LOG(info, "Check state");
+ CPPUNIT_ASSERT_EQUAL(uint64_t(1), messageKeeper()._msgs.size());
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::GETBUCKETDIFF,
+ messageKeeper()._msgs[0]->getType());
+ api::GetBucketDiffCommand& cmd2(
+ dynamic_cast<api::GetBucketDiffCommand&>(
+ *messageKeeper()._msgs[0]));
+
+ api::GetBucketDiffReply::UP reply(new api::GetBucketDiffReply(cmd2));
+
+ std::vector<Timestamp> docTimestamps;
+ for (int i = 0; i < 4; ++i) {
+ docTimestamps.push_back(Timestamp(reply->getDiff()[i]._timestamp));
+ }
+ CPPUNIT_ASSERT(reply->getDiff().size() >= 4);
+ reply->getDiff().resize(4);
+ // Add one non-unrevertable entry for existing timestamp which
+ // should not be added
+ reply->getDiff()[0]._flags |= Types::DELETED;
+ reply->getDiff()[0]._bodySize = 0;
+ reply->getDiff()[0]._hasMask = 2;
+ // Add a unrevertable entry which should be modified
+ reply->getDiff()[1]._flags |= Types::DELETED | Types::DELETED_IN_PLACE;
+ reply->getDiff()[1]._bodySize = 0;
+ reply->getDiff()[1]._hasMask = 2;
+ // Add one non-unrevertable entry that is a duplicate put
+ // which should not be added or fail the merge.
+ LOG(info, "duplicate put has timestamp %zu and flags %u",
+ reply->getDiff()[2]._timestamp,
+ reply->getDiff()[2]._flags);
+ reply->getDiff()[2]._hasMask = 2;
+ // Add one unrevertable entry for a timestamp that does not exist
+ reply->getDiff()[3]._flags |= Types::DELETED | Types::DELETED_IN_PLACE;
+ reply->getDiff()[3]._timestamp = 12345678;
+ reply->getDiff()[3]._bodySize = 0;
+ reply->getDiff()[3]._hasMask = 2;
+ {
+ MessageTracker tracker;
+ handler.handleGetBucketDiffReply(*reply, tracker);
+ }
+
+ LOG(info, "%s", reply->toString(true).c_str());
+
+ LOG(info, "Create bucket diff reply");
+ CPPUNIT_ASSERT_EQUAL(uint64_t(2), messageKeeper()._msgs.size());
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::APPLYBUCKETDIFF,
+ messageKeeper()._msgs[1]->getType());
+ api::ApplyBucketDiffCommand& cmd3(
+ dynamic_cast<api::ApplyBucketDiffCommand&>(
+ *messageKeeper()._msgs[1]));
+ api::ApplyBucketDiffReply::UP reply2(
+ new api::ApplyBucketDiffReply(cmd3));
+ CPPUNIT_ASSERT_EQUAL(size_t(4), reply2->getDiff().size());
+
+ memfile::DataLocation headerLocs[4];
+ std::vector<DocumentId> documentIds;
+ // So deserialization won't fail, we need some kind of header blob
+ // for each entry
+
+ for (int i = 0; i < 4; ++i) {
+ api::ApplyBucketDiffReply::Entry& entry = reply2->getDiff()[i];
+ CPPUNIT_ASSERT_EQUAL(uint16_t(2), entry._entry._hasMask);
+
+ memfile::MemFilePtr file(getMemFile(_bucket));
+ const memfile::MemSlot* slot = file->getSlotAtTime(docTimestamps[i]);
+ CPPUNIT_ASSERT(slot != NULL);
+ LOG(info, "Processing slot %s", slot->toString().c_str());
+ CPPUNIT_ASSERT(slot->hasBodyContent());
+ documentIds.push_back(file->getDocumentId(*slot));
+ entry._docName = documentIds.back().toString();
+ headerLocs[i] = slot->getLocation(HEADER);
+
+ document::Document::UP doc(file->getDocument(*slot, ALL));
+ {
+ vespalib::nbostream stream;
+ doc->serializeHeader(stream);
+ std::vector<char> buf(
+ stream.peek(), stream.peek() + stream.size());
+ entry._headerBlob.swap(buf);
+ }
+ // Put duplicate needs body blob as well
+ if (i == 2) {
+ vespalib::nbostream stream;
+ doc->serializeBody(stream);
+ std::vector<char> buf(
+ stream.peek(), stream.peek() + stream.size());
+ entry._bodyBlob.swap(buf);
+ }
+ }
+
+ LOG(info, "%s", reply2->toString(true).c_str());
+
+ MessageTracker tracker;
+ handler.handleApplyBucketDiffReply(*reply2, tracker);
+
+ CPPUNIT_ASSERT(tracker._sendReply);
+ api::MergeBucketReply::SP reply3(
+ std::dynamic_pointer_cast<api::MergeBucketReply>(
+ tracker._reply));
+ CPPUNIT_ASSERT(reply3.get());
+
+ CPPUNIT_ASSERT_EQUAL(_nodes, reply3->getNodes());
+ CPPUNIT_ASSERT(reply3->getResult().success());
+
+ memfile::MemFilePtr file(getMemFile(_bucket));
+ // Existing timestamp should not be modified by
+ // non-unrevertable entry
+ {
+ const memfile::MemSlot* slot = file->getSlotAtTime(
+ Timestamp(reply->getDiff()[0]._timestamp));
+ CPPUNIT_ASSERT(slot != NULL);
+ CPPUNIT_ASSERT(!slot->deleted());
+ }
+ // Ensure unrevertable remove for existing put was merged in OK
+ {
+ const memfile::MemSlot* slot = file->getSlotAtTime(
+ Timestamp(reply->getDiff()[1]._timestamp));
+ CPPUNIT_ASSERT(slot != NULL);
+ CPPUNIT_ASSERT(slot->deleted());
+ CPPUNIT_ASSERT(slot->deletedInPlace());
+ CPPUNIT_ASSERT(!slot->hasBodyContent());
+ // Header location should not have changed
+ CPPUNIT_ASSERT_EQUAL(headerLocs[1], slot->getLocation(HEADER));
+ }
+
+ // Non-existing timestamp unrevertable remove should be added as
+ // entry with doc id-only header
+ {
+ const memfile::MemSlot* slot = file->getSlotAtTime(
+ Timestamp(reply->getDiff()[3]._timestamp));
+ CPPUNIT_ASSERT(slot != NULL);
+ CPPUNIT_ASSERT(slot->deleted());
+ CPPUNIT_ASSERT(slot->deletedInPlace());
+ CPPUNIT_ASSERT(!slot->hasBodyContent());
+ CPPUNIT_ASSERT_EQUAL(documentIds[3], file->getDocumentId(*slot));
+ }
+
+*/
+}
+
+template <typename T>
+std::shared_ptr<T>
+MergeHandlerTest::fetchSingleMessage()
+{
+ std::vector<api::StorageMessage::SP>& msgs(messageKeeper()._msgs);
+ if (msgs.empty()) {
+ std::ostringstream oss;
+ oss << "No messages available to fetch (expected type "
+ << typeid(T).name()
+ << ")";
+ throw std::runtime_error(oss.str());
+ }
+ std::shared_ptr<T> ret(std::dynamic_pointer_cast<T>(
+ messageKeeper()._msgs.back()));
+ if (!ret) {
+ std::ostringstream oss;
+ oss << "Expected message of type "
+ << typeid(T).name()
+ << ", but got "
+ << messageKeeper()._msgs[0]->toString();
+ throw std::runtime_error(oss.str());
+ }
+ messageKeeper()._msgs.pop_back();
+
+ return ret;
+}
+
+namespace {
+
+size_t
+getFilledCount(const std::vector<api::ApplyBucketDiffCommand::Entry>& diff)
+{
+ size_t filledCount = 0;
+ for (size_t i=0; i<diff.size(); ++i) {
+ if (diff[i].filled()) {
+ ++filledCount;
+ }
+ }
+ return filledCount;
+}
+
+size_t
+getFilledDataSize(const std::vector<api::ApplyBucketDiffCommand::Entry>& diff)
+{
+ size_t filledSize = 0;
+ for (size_t i=0; i<diff.size(); ++i) {
+ filledSize += diff[i]._headerBlob.size();
+ filledSize += diff[i]._bodyBlob.size();
+ }
+ return filledSize;
+}
+
+}
+
+void
+MergeHandlerTest::testChunkedApplyBucketDiff()
+{
+ uint32_t docSize = 1024;
+ uint32_t docCount = 10;
+ uint32_t maxChunkSize = docSize * 3;
+ for (uint32_t i = 0; i < docCount; ++i) {
+ doPut(1234, spi::Timestamp(4000 + i), docSize, docSize);
+ }
+
+ MergeHandler handler(getPersistenceProvider(), getEnv(), maxChunkSize);
+
+ LOG(info, "Handle a merge bucket command");
+ api::MergeBucketCommand cmd(_bucket, _nodes, _maxTimestamp);
+ handler.handleMergeBucket(cmd, *_context);
+
+ std::shared_ptr<api::GetBucketDiffCommand> getBucketDiffCmd(
+ fetchSingleMessage<api::GetBucketDiffCommand>());
+ api::GetBucketDiffReply::UP getBucketDiffReply(
+ new api::GetBucketDiffReply(*getBucketDiffCmd));
+
+ handler.handleGetBucketDiffReply(*getBucketDiffReply, messageKeeper());
+
+ uint32_t totalDiffs = getBucketDiffCmd->getDiff().size();
+ std::set<spi::Timestamp> seen;
+
+ api::MergeBucketReply::SP reply;
+ while (seen.size() != totalDiffs) {
+ std::shared_ptr<api::ApplyBucketDiffCommand> applyBucketDiffCmd(
+ fetchSingleMessage<api::ApplyBucketDiffCommand>());
+
+ LOG(info, "Test that we get chunked diffs in ApplyBucketDiff");
+ std::vector<api::ApplyBucketDiffCommand::Entry>& diff(
+ applyBucketDiffCmd->getDiff());
+ CPPUNIT_ASSERT(getFilledCount(diff) < totalDiffs);
+ CPPUNIT_ASSERT(getFilledDataSize(diff) <= maxChunkSize);
+
+ // Include node 1 in hasmask for all diffs to indicate it's done
+ // Also remember the diffs we've seen thus far to ensure chunking
+ // does not send duplicates.
+ for (size_t i = 0; i < diff.size(); ++i) {
+ if (!diff[i].filled()) {
+ continue;
+ }
+ diff[i]._entry._hasMask |= 2;
+ std::pair<std::set<spi::Timestamp>::iterator, bool> inserted(
+ seen.insert(spi::Timestamp(diff[i]._entry._timestamp)));
+ if (!inserted.second) {
+ std::ostringstream ss;
+ ss << "Diff for " << diff[i]
+ << " has already been seen in another ApplyBucketDiff";
+ CPPUNIT_FAIL(ss.str());
+ }
+ }
+
+ api::ApplyBucketDiffReply::UP applyBucketDiffReply(
+ new api::ApplyBucketDiffReply(*applyBucketDiffCmd));
+ {
+ handler.handleApplyBucketDiffReply(*applyBucketDiffReply, messageKeeper());
+
+ if (messageKeeper()._msgs.size()) {
+ CPPUNIT_ASSERT(!reply.get());
+ reply = std::dynamic_pointer_cast<api::MergeBucketReply>(
+ messageKeeper()._msgs[messageKeeper()._msgs.size() - 1]);
+ }
+ }
+ }
+ LOG(info, "Done with applying diff");
+
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(_nodes, reply->getNodes());
+ CPPUNIT_ASSERT(reply->getResult().success());
+}
+
+void
+MergeHandlerTest::testChunkLimitPartiallyFilledDiff()
+{
+ setUpChain(FRONT);
+
+ uint32_t docSize = 1024;
+ uint32_t docCount = 3;
+ uint32_t maxChunkSize = 1024 + 1024 + 512;
+
+ for (uint32_t i = 0; i < docCount; ++i) {
+ doPut(1234, spi::Timestamp(4000 + i), docSize, docSize);
+ }
+
+ std::vector<api::ApplyBucketDiffCommand::Entry> applyDiff;
+ for (uint32_t i = 0; i < docCount; ++i) {
+ api::ApplyBucketDiffCommand::Entry e;
+ e._entry._timestamp = 4000 + i;
+ if (i == 0) {
+ e._headerBlob.resize(docSize);
+ }
+ e._entry._hasMask = 0x3;
+ e._entry._flags = MergeHandler::IN_USE;
+ applyDiff.push_back(e);
+ }
+
+ setUpChain(MIDDLE);
+ std::shared_ptr<api::ApplyBucketDiffCommand> applyBucketDiffCmd(
+ new api::ApplyBucketDiffCommand(_bucket, _nodes, maxChunkSize));
+ applyBucketDiffCmd->getDiff() = applyDiff;
+
+ MergeHandler handler(
+ getPersistenceProvider(), getEnv(), maxChunkSize);
+ handler.handleApplyBucketDiff(*applyBucketDiffCmd, *_context);
+
+ std::shared_ptr<api::ApplyBucketDiffCommand> fwdDiffCmd(
+ fetchSingleMessage<api::ApplyBucketDiffCommand>());
+ // Should not fill up more than chunk size allows for
+ CPPUNIT_ASSERT_EQUAL(size_t(2), getFilledCount(fwdDiffCmd->getDiff()));
+ CPPUNIT_ASSERT(getFilledDataSize(fwdDiffCmd->getDiff()) <= maxChunkSize);
+}
+
+void
+MergeHandlerTest::testMaxTimestamp()
+{
+ doPut(1234, spi::Timestamp(_maxTimestamp + 10), 1024, 1024);
+
+ MergeHandler handler(getPersistenceProvider(), getEnv());
+
+ api::MergeBucketCommand cmd(_bucket, _nodes, _maxTimestamp);
+ handler.handleMergeBucket(cmd, *_context);
+
+ std::shared_ptr<api::GetBucketDiffCommand> getCmd(
+ fetchSingleMessage<api::GetBucketDiffCommand>());
+
+ CPPUNIT_ASSERT(!getCmd->getDiff().empty());
+ CPPUNIT_ASSERT(getCmd->getDiff().back()._timestamp <= _maxTimestamp);
+}
+
+void
+MergeHandlerTest::fillDummyApplyDiff(
+ std::vector<api::ApplyBucketDiffCommand::Entry>& diff)
+{
+ document::TestDocMan docMan;
+ document::Document::SP doc(
+ docMan.createRandomDocumentAtLocation(_location));
+ std::vector<char> headerBlob;
+ {
+ vespalib::nbostream stream;
+ doc->serializeHeader(stream);
+ headerBlob.resize(stream.size());
+ memcpy(&headerBlob[0], stream.peek(), stream.size());
+ }
+
+ assert(diff.size() == 3);
+ diff[0]._headerBlob = headerBlob;
+ diff[1]._docName = doc->getId().toString();
+ diff[2]._docName = doc->getId().toString();
+}
+
+std::shared_ptr<api::ApplyBucketDiffCommand>
+MergeHandlerTest::createDummyApplyDiff(int timestampOffset,
+ uint16_t hasMask,
+ bool filled)
+{
+
+ std::vector<api::ApplyBucketDiffCommand::Entry> applyDiff;
+ {
+ api::ApplyBucketDiffCommand::Entry e;
+ e._entry._timestamp = timestampOffset;
+ e._entry._hasMask = hasMask;
+ e._entry._flags = MergeHandler::IN_USE;
+ applyDiff.push_back(e);
+ }
+ {
+ api::ApplyBucketDiffCommand::Entry e;
+ e._entry._timestamp = timestampOffset + 1;
+ e._entry._hasMask = hasMask;
+ e._entry._flags = MergeHandler::IN_USE | MergeHandler::DELETED;
+ applyDiff.push_back(e);
+ }
+ {
+ api::ApplyBucketDiffCommand::Entry e;
+ e._entry._timestamp = timestampOffset + 2;
+ e._entry._hasMask = hasMask;
+ e._entry._flags = MergeHandler::IN_USE |
+ MergeHandler::DELETED |
+ MergeHandler::DELETED_IN_PLACE;
+ applyDiff.push_back(e);
+ }
+
+ if (filled) {
+ fillDummyApplyDiff(applyDiff);
+ }
+
+ std::shared_ptr<api::ApplyBucketDiffCommand> applyBucketDiffCmd(
+ new api::ApplyBucketDiffCommand(_bucket, _nodes, 1024*1024));
+ applyBucketDiffCmd->getDiff() = applyDiff;
+ return applyBucketDiffCmd;
+}
+
+// Must match up with diff used in createDummyApplyDiff
+std::shared_ptr<api::GetBucketDiffCommand>
+MergeHandlerTest::createDummyGetBucketDiff(int timestampOffset,
+ uint16_t hasMask)
+{
+ std::vector<api::GetBucketDiffCommand::Entry> diff;
+ {
+ api::GetBucketDiffCommand::Entry e;
+ e._timestamp = timestampOffset;
+ e._hasMask = hasMask;
+ e._flags = MergeHandler::IN_USE;
+ diff.push_back(e);
+ }
+ {
+ api::GetBucketDiffCommand::Entry e;
+ e._timestamp = timestampOffset + 1;
+ e._hasMask = hasMask;
+ e._flags = MergeHandler::IN_USE | MergeHandler::DELETED;
+ diff.push_back(e);
+ }
+ {
+ api::GetBucketDiffCommand::Entry e;
+ e._timestamp = timestampOffset + 2;
+ e._hasMask = hasMask;
+ e._flags = MergeHandler::IN_USE |
+ MergeHandler::DELETED |
+ MergeHandler::DELETED_IN_PLACE;
+ diff.push_back(e);
+ }
+
+ std::shared_ptr<api::GetBucketDiffCommand> getBucketDiffCmd(
+ new api::GetBucketDiffCommand(_bucket, _nodes, 1024*1024));
+ getBucketDiffCmd->getDiff() = diff;
+ return getBucketDiffCmd;
+}
+
+void
+MergeHandlerTest::testSPIFlushGuard()
+{
+ PersistenceProviderWrapper providerWrapper(
+ getPersistenceProvider());
+ MergeHandler handler(providerWrapper, getEnv());
+
+ providerWrapper.setResult(
+ spi::Result(spi::Result::PERMANENT_ERROR,
+ "who you gonna call?"));
+
+ setUpChain(MIDDLE);
+ // Fail applying unrevertable remove
+ providerWrapper.setFailureMask(
+ PersistenceProviderWrapper::FAIL_REMOVE);
+ providerWrapper.clearOperationLog();
+ try {
+ handler.handleApplyBucketDiff(*createDummyApplyDiff(6000), *_context);
+ CPPUNIT_FAIL("No exception thrown on failing in-place remove");
+ } catch (const std::runtime_error& e) {
+ CPPUNIT_ASSERT(std::string(e.what()).find("Failed remove")
+ != std::string::npos);
+ }
+ // Test that we always flush after applying diff locally, even when
+ // errors are encountered.
+ const std::vector<std::string>& opLog(providerWrapper.getOperationLog());
+ CPPUNIT_ASSERT(!opLog.empty());
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("flush(Bucket(0x40000000000004d2, partition 0))"),
+ opLog.back());
+}
+
+void
+MergeHandlerTest::testBucketNotFoundInDb()
+{
+ MergeHandler handler(getPersistenceProvider(), getEnv());
+ // Send merge for unknown bucket
+ api::MergeBucketCommand cmd(document::BucketId(16, 6789), _nodes, _maxTimestamp);
+ MessageTracker::UP tracker = handler.handleMergeBucket(cmd, *_context);
+ CPPUNIT_ASSERT(tracker->getResult().isBucketDisappearance());
+}
+
+void
+MergeHandlerTest::testMergeProgressSafeGuard()
+{
+ MergeHandler handler(getPersistenceProvider(), getEnv());
+ api::MergeBucketCommand cmd(_bucket, _nodes, _maxTimestamp);
+ handler.handleMergeBucket(cmd, *_context);
+
+ std::shared_ptr<api::GetBucketDiffCommand> getBucketDiffCmd(
+ fetchSingleMessage<api::GetBucketDiffCommand>());
+ api::GetBucketDiffReply::UP getBucketDiffReply(
+ new api::GetBucketDiffReply(*getBucketDiffCmd));
+
+ handler.handleGetBucketDiffReply(*getBucketDiffReply, messageKeeper());
+
+ std::shared_ptr<api::ApplyBucketDiffCommand> applyBucketDiffCmd(
+ fetchSingleMessage<api::ApplyBucketDiffCommand>());
+ api::ApplyBucketDiffReply::UP applyBucketDiffReply(
+ new api::ApplyBucketDiffReply(*applyBucketDiffCmd));
+
+ MessageSenderStub stub;
+ handler.handleApplyBucketDiffReply(*applyBucketDiffReply, stub);
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)stub.replies.size());
+
+ api::MergeBucketReply::SP mergeReply(
+ std::dynamic_pointer_cast<api::MergeBucketReply>(
+ stub.replies[0]));
+ CPPUNIT_ASSERT(mergeReply.get());
+ CPPUNIT_ASSERT(mergeReply->getResult().getResult()
+ == api::ReturnCode::INTERNAL_FAILURE);
+}
+
+void
+MergeHandlerTest::testSafeGuardNotInvokedWhenHasMaskChanges()
+{
+ MergeHandler handler(getPersistenceProvider(), getEnv());
+ _nodes.clear();
+ _nodes.push_back(api::MergeBucketCommand::Node(0, false));
+ _nodes.push_back(api::MergeBucketCommand::Node(1, false));
+ _nodes.push_back(api::MergeBucketCommand::Node(2, false));
+ api::MergeBucketCommand cmd(_bucket, _nodes, _maxTimestamp);
+ handler.handleMergeBucket(cmd, *_context);
+
+ std::shared_ptr<api::GetBucketDiffCommand> getBucketDiffCmd(
+ fetchSingleMessage<api::GetBucketDiffCommand>());
+ api::GetBucketDiffReply::UP getBucketDiffReply(
+ new api::GetBucketDiffReply(*getBucketDiffCmd));
+
+ handler.handleGetBucketDiffReply(*getBucketDiffReply, messageKeeper());
+
+ std::shared_ptr<api::ApplyBucketDiffCommand> applyBucketDiffCmd(
+ fetchSingleMessage<api::ApplyBucketDiffCommand>());
+ api::ApplyBucketDiffReply::UP applyBucketDiffReply(
+ new api::ApplyBucketDiffReply(*applyBucketDiffCmd));
+ CPPUNIT_ASSERT(!applyBucketDiffReply->getDiff().empty());
+ // Change a hasMask to indicate something changed during merging.
+ applyBucketDiffReply->getDiff()[0]._entry._hasMask = 0x5;
+
+ MessageSenderStub stub;
+ LOG(debug, "sending apply bucket diff reply");
+ handler.handleApplyBucketDiffReply(*applyBucketDiffReply, stub);
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)stub.commands.size());
+
+ api::ApplyBucketDiffCommand::SP applyBucketDiffCmd2(
+ std::dynamic_pointer_cast<api::ApplyBucketDiffCommand>(
+ stub.commands[0]));
+ CPPUNIT_ASSERT(applyBucketDiffCmd2.get());
+ CPPUNIT_ASSERT_EQUAL(applyBucketDiffCmd->getDiff().size(),
+ applyBucketDiffCmd2->getDiff().size());
+ CPPUNIT_ASSERT_EQUAL(uint16_t(0x5),
+ applyBucketDiffCmd2->getDiff()[0]._entry._hasMask);
+}
+
+void
+MergeHandlerTest::testEntryRemovedAfterGetBucketDiff()
+{
+ MergeHandler handler(getPersistenceProvider(), getEnv());
+ std::vector<api::ApplyBucketDiffCommand::Entry> applyDiff;
+ {
+ api::ApplyBucketDiffCommand::Entry e;
+ e._entry._timestamp = 13001; // Removed in persistence
+ e._entry._hasMask = 0x2;
+ e._entry._flags = MergeHandler::IN_USE;
+ applyDiff.push_back(e);
+ }
+ setUpChain(BACK);
+ std::shared_ptr<api::ApplyBucketDiffCommand> applyBucketDiffCmd(
+ new api::ApplyBucketDiffCommand(_bucket, _nodes, 1024*1024));
+ applyBucketDiffCmd->getDiff() = applyDiff;
+
+ MessageTracker::UP tracker = handler.handleApplyBucketDiff(*applyBucketDiffCmd, *_context);
+
+ api::ApplyBucketDiffReply::SP applyBucketDiffReply(
+ std::dynamic_pointer_cast<api::ApplyBucketDiffReply>(
+ tracker->getReply()));
+ CPPUNIT_ASSERT(applyBucketDiffReply.get());
+
+ std::vector<api::ApplyBucketDiffCommand::Entry>& diff(
+ applyBucketDiffReply->getDiff());
+ CPPUNIT_ASSERT_EQUAL(size_t(1), diff.size());
+ CPPUNIT_ASSERT(!diff[0].filled());
+ CPPUNIT_ASSERT_EQUAL(uint16_t(0x0), diff[0]._entry._hasMask);
+}
+
+std::string
+MergeHandlerTest::doTestSPIException(MergeHandler& handler,
+ PersistenceProviderWrapper& providerWrapper,
+ HandlerInvoker& invoker,
+ const ExpectedExceptionSpec& spec)
+{
+ providerWrapper.setFailureMask(0);
+ invoker.beforeInvoke(*this, handler, *_context); // Do any setup stuff first
+
+ uint32_t failureMask = spec.mask;
+ const char* expectedSubstring = spec.expected;
+ providerWrapper.setFailureMask(failureMask);
+ try {
+ invoker.invoke(*this, handler, *_context);
+ if (failureMask != 0) {
+ return (std::string("No exception was thrown during handler "
+ "invocation. Expected exception containing '")
+ + expectedSubstring + "'");
+ }
+ } catch (const std::runtime_error& e) {
+ if (std::string(e.what()).find(expectedSubstring)
+ == std::string::npos)
+ {
+ return (std::string("Expected exception to contain substring '")
+ + expectedSubstring + "', but message was: " + e.what());
+ }
+ }
+ if (fsHandler().isMerging(_bucket)) {
+ return (std::string("After operation with expected exception '")
+ + expectedSubstring + "', merge state was not cleared");
+ }
+ // Postcondition check.
+ std::string check = invoker.afterInvoke(*this, handler);
+ if (!check.empty()) {
+ return (std::string("Postcondition validation failed for operation "
+ "with expected exception '")
+ + expectedSubstring + "': " + check);
+ }
+ return "";
+}
+
+std::string
+MergeHandlerTest::NoReplyHandlerInvoker::afterInvoke(
+ MergeHandlerTest& test,
+ MergeHandler& handler)
+{
+ (void) handler;
+ if (!test.messageKeeper()._msgs.empty()) {
+ std::ostringstream ss;
+ ss << "Expected 0 explicit replies, got "
+ << test.messageKeeper()._msgs.size();
+ return ss.str();
+ }
+ return "";
+}
+
+template <typename ExpectedMessage>
+std::string
+MergeHandlerTest::checkMessage(api::ReturnCode::Result expectedResult)
+{
+ try {
+ std::shared_ptr<ExpectedMessage> msg(
+ fetchSingleMessage<ExpectedMessage>());
+ if (msg->getResult().getResult() != expectedResult) {
+ return "Got unexpected result: " + msg->getResult().toString();
+ }
+ } catch (std::exception& e) {
+ return e.what();
+ }
+ return "";
+}
+
+void
+MergeHandlerTest::HandleMergeBucketInvoker::invoke(
+ MergeHandlerTest& test,
+ MergeHandler& handler,
+ spi::Context& context)
+{
+ api::MergeBucketCommand cmd(test._bucket, test._nodes, test._maxTimestamp);
+ handler.handleMergeBucket(cmd, context);
+}
+
+void
+MergeHandlerTest::testMergeBucketSPIFailures()
+{
+ PersistenceProviderWrapper providerWrapper(
+ getPersistenceProvider());
+ MergeHandler handler(providerWrapper, getEnv());
+ providerWrapper.setResult(
+ spi::Result(spi::Result::PERMANENT_ERROR,
+ "who you gonna call?"));
+ setUpChain(MIDDLE);
+
+ ExpectedExceptionSpec exceptions[] = {
+ { PersistenceProviderWrapper::FAIL_CREATE_BUCKET, "create bucket" },
+ { PersistenceProviderWrapper::FAIL_BUCKET_INFO, "get bucket info" },
+ { PersistenceProviderWrapper::FAIL_CREATE_ITERATOR, "create iterator" },
+ { PersistenceProviderWrapper::FAIL_ITERATE, "iterate" },
+ };
+ typedef ExpectedExceptionSpec* ExceptionIterator;
+ ExceptionIterator last = exceptions + sizeof(exceptions)/sizeof(exceptions[0]);
+
+ for (ExceptionIterator it = exceptions; it != last; ++it) {
+ HandleMergeBucketInvoker invoker;
+ CPPUNIT_ASSERT_EQUAL(std::string(),
+ doTestSPIException(handler,
+ providerWrapper,
+ invoker,
+ *it));
+ }
+}
+
+void
+MergeHandlerTest::HandleGetBucketDiffInvoker::invoke(
+ MergeHandlerTest& test,
+ MergeHandler& handler,
+ spi::Context& context)
+{
+ api::GetBucketDiffCommand cmd(test._bucket, test._nodes, test._maxTimestamp);
+ handler.handleGetBucketDiff(cmd, context);
+}
+
+void
+MergeHandlerTest::testGetBucketDiffSPIFailures()
+{
+ PersistenceProviderWrapper providerWrapper(
+ getPersistenceProvider());
+ MergeHandler handler(providerWrapper, getEnv());
+ providerWrapper.setResult(
+ spi::Result(spi::Result::PERMANENT_ERROR,
+ "who you gonna call?"));
+ setUpChain(MIDDLE);
+
+ ExpectedExceptionSpec exceptions[] = {
+ { PersistenceProviderWrapper::FAIL_CREATE_BUCKET, "create bucket" },
+ { PersistenceProviderWrapper::FAIL_BUCKET_INFO, "get bucket info" },
+ { PersistenceProviderWrapper::FAIL_CREATE_ITERATOR, "create iterator" },
+ { PersistenceProviderWrapper::FAIL_ITERATE, "iterate" },
+ };
+
+ typedef ExpectedExceptionSpec* ExceptionIterator;
+ ExceptionIterator last = exceptions + sizeof(exceptions)/sizeof(exceptions[0]);
+
+ for (ExceptionIterator it = exceptions; it != last; ++it) {
+ HandleGetBucketDiffInvoker invoker;
+ CPPUNIT_ASSERT_EQUAL(std::string(),
+ doTestSPIException(handler,
+ providerWrapper,
+ invoker,
+ *it));
+ }
+}
+
+void
+MergeHandlerTest::HandleApplyBucketDiffInvoker::invoke(
+ MergeHandlerTest& test,
+ MergeHandler& handler,
+ spi::Context& context)
+{
+ ++_counter;
+ std::shared_ptr<api::ApplyBucketDiffCommand> cmd(
+ test.createDummyApplyDiff(100000 * _counter));
+ handler.handleApplyBucketDiff(*cmd, context);
+}
+
+void
+MergeHandlerTest::testApplyBucketDiffSPIFailures()
+{
+ PersistenceProviderWrapper providerWrapper(
+ getPersistenceProvider());
+ MergeHandler handler(providerWrapper, getEnv());
+ providerWrapper.setResult(
+ spi::Result(spi::Result::PERMANENT_ERROR,
+ "who you gonna call?"));
+ setUpChain(MIDDLE);
+
+ ExpectedExceptionSpec exceptions[] = {
+ { PersistenceProviderWrapper::FAIL_CREATE_ITERATOR, "create iterator" },
+ { PersistenceProviderWrapper::FAIL_ITERATE, "iterate" },
+ { PersistenceProviderWrapper::FAIL_PUT, "Failed put" },
+ { PersistenceProviderWrapper::FAIL_REMOVE, "Failed remove" },
+ { PersistenceProviderWrapper::FAIL_FLUSH, "Failed flush" },
+ };
+
+ typedef ExpectedExceptionSpec* ExceptionIterator;
+ ExceptionIterator last = exceptions + sizeof(exceptions)/sizeof(exceptions[0]);
+
+ for (ExceptionIterator it = exceptions; it != last; ++it) {
+ HandleApplyBucketDiffInvoker invoker;
+ CPPUNIT_ASSERT_EQUAL(std::string(),
+ doTestSPIException(handler,
+ providerWrapper,
+ invoker,
+ *it));
+ // Casual, in-place testing of bug 6752085.
+ // This will fail if we give NaN to the metric in question.
+ CPPUNIT_ASSERT(std::isfinite(getEnv()._metrics
+ .mergeAverageDataReceivedNeeded.getLast()));
+ }
+}
+
+void
+MergeHandlerTest::HandleGetBucketDiffReplyInvoker::beforeInvoke(
+ MergeHandlerTest& test,
+ MergeHandler& handler,
+ spi::Context& context)
+{
+ api::MergeBucketCommand cmd(test._bucket, test._nodes, test._maxTimestamp);
+ handler.handleMergeBucket(cmd, context);
+ _diffCmd = test.fetchSingleMessage<api::GetBucketDiffCommand>();
+}
+
+void
+MergeHandlerTest::HandleGetBucketDiffReplyInvoker::invoke(
+ MergeHandlerTest& test,
+ MergeHandler& handler,
+ spi::Context&)
+{
+ (void) test;
+ api::GetBucketDiffReply reply(*_diffCmd);
+ handler.handleGetBucketDiffReply(reply, _stub);
+}
+
+std::string
+MergeHandlerTest::HandleGetBucketDiffReplyInvoker::afterInvoke(
+ MergeHandlerTest& test,
+ MergeHandler& handler)
+{
+ (void) handler;
+ if (!_stub.commands.empty()) {
+ return "Unexpected commands in reply stub";
+ }
+ if (!_stub.replies.empty()) {
+ return "Unexpected replies in reply stub";
+ }
+ // Initial merge bucket should have been replied to by clearMergeStatus.
+ return test.checkMessage<api::MergeBucketReply>(
+ api::ReturnCode::INTERNAL_FAILURE);
+}
+
+void
+MergeHandlerTest::testGetBucketDiffReplySPIFailures()
+{
+ PersistenceProviderWrapper providerWrapper(
+ getPersistenceProvider());
+ MergeHandler handler(providerWrapper, getEnv());
+ providerWrapper.setResult(
+ spi::Result(spi::Result::PERMANENT_ERROR,
+ "who you gonna call?"));
+ HandleGetBucketDiffReplyInvoker invoker;
+
+ setUpChain(FRONT);
+
+ ExpectedExceptionSpec exceptions[] = {
+ { PersistenceProviderWrapper::FAIL_CREATE_ITERATOR, "create iterator" },
+ { PersistenceProviderWrapper::FAIL_ITERATE, "iterate" },
+ };
+
+ typedef ExpectedExceptionSpec* ExceptionIterator;
+ ExceptionIterator last = exceptions + sizeof(exceptions)/sizeof(exceptions[0]);
+
+ for (ExceptionIterator it = exceptions; it != last; ++it) {
+ CPPUNIT_ASSERT_EQUAL(std::string(),
+ doTestSPIException(handler,
+ providerWrapper,
+ invoker,
+ *it));
+ }
+}
+
+void
+MergeHandlerTest::HandleApplyBucketDiffReplyInvoker::beforeInvoke(
+ MergeHandlerTest& test,
+ MergeHandler& handler,
+ spi::Context& context)
+{
+ ++_counter;
+ _stub.clear();
+ if (getChainPos() == FRONT) {
+ api::MergeBucketCommand cmd(test._bucket, test._nodes, test._maxTimestamp);
+ handler.handleMergeBucket(cmd, context);
+ std::shared_ptr<api::GetBucketDiffCommand> diffCmd(
+ test.fetchSingleMessage<api::GetBucketDiffCommand>());
+ std::shared_ptr<api::GetBucketDiffCommand> dummyDiff(
+ test.createDummyGetBucketDiff(100000 * _counter, 0x4));
+ diffCmd->getDiff() = dummyDiff->getDiff();
+
+ api::GetBucketDiffReply diffReply(*diffCmd);
+ handler.handleGetBucketDiffReply(diffReply, _stub);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _stub.commands.size());
+ _applyCmd = std::dynamic_pointer_cast<api::ApplyBucketDiffCommand>(
+ _stub.commands[0]);
+ } else {
+ // Pretend last node in chain has data and that it will be fetched when
+ // chain is unwinded.
+ std::shared_ptr<api::ApplyBucketDiffCommand> cmd(
+ test.createDummyApplyDiff(100000 * _counter, 0x4, false));
+ handler.handleApplyBucketDiff(*cmd, context);
+ _applyCmd = test.fetchSingleMessage<api::ApplyBucketDiffCommand>();
+ }
+}
+
+void
+MergeHandlerTest::HandleApplyBucketDiffReplyInvoker::invoke(
+ MergeHandlerTest& test,
+ MergeHandler& handler,
+ spi::Context&)
+{
+ (void) test;
+ api::ApplyBucketDiffReply reply(*_applyCmd);
+ test.fillDummyApplyDiff(reply.getDiff());
+ _stub.clear();
+ handler.handleApplyBucketDiffReply(reply, _stub);
+}
+
+std::string
+MergeHandlerTest::HandleApplyBucketDiffReplyInvoker::afterInvoke(
+ MergeHandlerTest& test,
+ MergeHandler& handler)
+{
+ (void) handler;
+ if (!_stub.commands.empty()) {
+ return "Unexpected commands in reply stub";
+ }
+ if (!_stub.replies.empty()) {
+ return "Unexpected replies in reply stub";
+ }
+ if (getChainPos() == FRONT) {
+ return test.checkMessage<api::MergeBucketReply>(
+ api::ReturnCode::INTERNAL_FAILURE);
+ } else {
+ return test.checkMessage<api::ApplyBucketDiffReply>(
+ api::ReturnCode::INTERNAL_FAILURE);
+ }
+}
+
+void
+MergeHandlerTest::testApplyBucketDiffReplySPIFailures()
+{
+ PersistenceProviderWrapper providerWrapper(
+ getPersistenceProvider());
+ HandleApplyBucketDiffReplyInvoker invoker;
+ for (int i = 0; i < 2; ++i) {
+ ChainPos pos(i == 0 ? FRONT : MIDDLE);
+ setUpChain(pos);
+ invoker.setChainPos(pos);
+ MergeHandler handler(providerWrapper, getEnv());
+ providerWrapper.setResult(
+ spi::Result(spi::Result::PERMANENT_ERROR,
+ "who you gonna call?"));
+
+ ExpectedExceptionSpec exceptions[] = {
+ { PersistenceProviderWrapper::FAIL_CREATE_ITERATOR, "create iterator" },
+ { PersistenceProviderWrapper::FAIL_ITERATE, "iterate" },
+ { PersistenceProviderWrapper::FAIL_PUT, "Failed put" },
+ { PersistenceProviderWrapper::FAIL_REMOVE, "Failed remove" },
+ { PersistenceProviderWrapper::FAIL_FLUSH, "Failed flush" },
+ };
+
+ typedef ExpectedExceptionSpec* ExceptionIterator;
+ ExceptionIterator last = exceptions + sizeof(exceptions)/sizeof(exceptions[0]);
+
+ for (ExceptionIterator it = exceptions; it != last; ++it) {
+ CPPUNIT_ASSERT_EQUAL(std::string(),
+ doTestSPIException(handler,
+ providerWrapper,
+ invoker,
+ *it));
+ }
+ }
+}
+
+void
+MergeHandlerTest::testRemoveFromDiff()
+{
+ framework::defaultimplementation::FakeClock clock;
+ MergeStatus status(clock, documentapi::LoadType::DEFAULT, 0, 0);
+
+ std::vector<api::GetBucketDiffCommand::Entry> diff(2);
+ diff[0]._timestamp = 1234;
+ diff[0]._flags = 0x1;
+ diff[0]._hasMask = 0x2;
+
+ diff[1]._timestamp = 5678;
+ diff[1]._flags = 0x3;
+ diff[1]._hasMask = 0x6;
+
+ status.diff.insert(status.diff.end(), diff.begin(), diff.end());
+
+ {
+ std::vector<api::ApplyBucketDiffCommand::Entry> applyDiff(2);
+ applyDiff[0]._entry._timestamp = 1234;
+ applyDiff[0]._entry._flags = 0x1;
+ applyDiff[0]._entry._hasMask = 0x0; // Removed during merging
+
+ applyDiff[1]._entry._timestamp = 5678;
+ applyDiff[1]._entry._flags = 0x3;
+ applyDiff[1]._entry._hasMask = 0x7;
+
+ CPPUNIT_ASSERT(status.removeFromDiff(applyDiff, 0x7));
+ CPPUNIT_ASSERT(status.diff.empty());
+ }
+
+ status.diff.insert(status.diff.end(), diff.begin(), diff.end());
+
+ {
+ std::vector<api::ApplyBucketDiffCommand::Entry> applyDiff(2);
+ applyDiff[0]._entry._timestamp = 1234;
+ applyDiff[0]._entry._flags = 0x1;
+ applyDiff[0]._entry._hasMask = 0x2;
+
+ applyDiff[1]._entry._timestamp = 5678;
+ applyDiff[1]._entry._flags = 0x3;
+ applyDiff[1]._entry._hasMask = 0x6;
+
+ CPPUNIT_ASSERT(!status.removeFromDiff(applyDiff, 0x7));
+ CPPUNIT_ASSERT_EQUAL(size_t(2), status.diff.size());
+ }
+
+ status.diff.clear();
+ status.diff.insert(status.diff.end(), diff.begin(), diff.end());
+
+ {
+ // Hasmasks have changed but diff still remains the same size.
+ std::vector<api::ApplyBucketDiffCommand::Entry> applyDiff(2);
+ applyDiff[0]._entry._timestamp = 1234;
+ applyDiff[0]._entry._flags = 0x1;
+ applyDiff[0]._entry._hasMask = 0x1;
+
+ applyDiff[1]._entry._timestamp = 5678;
+ applyDiff[1]._entry._flags = 0x3;
+ applyDiff[1]._entry._hasMask = 0x5;
+
+ CPPUNIT_ASSERT(status.removeFromDiff(applyDiff, 0x7));
+ CPPUNIT_ASSERT_EQUAL(size_t(2), status.diff.size());
+ }
+}
+
+void
+MergeHandlerTest::testRemovePutOnExistingTimestamp()
+{
+ setUpChain(BACK);
+
+ document::TestDocMan docMan;
+ document::Document::SP doc(
+ docMan.createRandomDocumentAtLocation(_location));
+ spi::Timestamp ts(10111);
+ doPut(doc, ts);
+
+ MergeHandler handler(getPersistenceProvider(), getEnv());
+ std::vector<api::ApplyBucketDiffCommand::Entry> applyDiff;
+ {
+ api::ApplyBucketDiffCommand::Entry e;
+ e._entry._timestamp = ts;
+ e._entry._hasMask = 0x1;
+ e._docName = doc->getId().toString();
+ e._entry._flags = MergeHandler::IN_USE | MergeHandler::DELETED;
+ applyDiff.push_back(e);
+ }
+
+ std::shared_ptr<api::ApplyBucketDiffCommand> applyBucketDiffCmd(
+ new api::ApplyBucketDiffCommand(_bucket, _nodes, 1024*1024));
+ applyBucketDiffCmd->getDiff() = applyDiff;
+
+ MessageTracker::UP tracker = handler.handleApplyBucketDiff(*applyBucketDiffCmd, *_context);
+
+ api::ApplyBucketDiffReply::SP applyBucketDiffReply(
+ std::dynamic_pointer_cast<api::ApplyBucketDiffReply>(
+ tracker->getReply()));
+ CPPUNIT_ASSERT(applyBucketDiffReply.get());
+
+ api::MergeBucketCommand cmd(_bucket, _nodes, _maxTimestamp);
+ handler.handleMergeBucket(cmd, *_context);
+
+ std::shared_ptr<api::GetBucketDiffCommand> getBucketDiffCmd(
+ fetchSingleMessage<api::GetBucketDiffCommand>());
+
+ // Timestamp should now be a regular remove
+ bool foundTimestamp = false;
+ for (size_t i = 0; i < getBucketDiffCmd->getDiff().size(); ++i) {
+ const api::GetBucketDiffCommand::Entry& e(
+ getBucketDiffCmd->getDiff()[i]);
+ if (e._timestamp == ts) {
+ CPPUNIT_ASSERT_EQUAL(
+ uint16_t(MergeHandler::IN_USE | MergeHandler::DELETED),
+ e._flags);
+ foundTimestamp = true;
+ break;
+ }
+ }
+ CPPUNIT_ASSERT(foundTimestamp);
+}
+
+} // storage
diff --git a/storage/src/tests/persistence/persistenceproviderwrapper.cpp b/storage/src/tests/persistence/persistenceproviderwrapper.cpp
new file mode 100644
index 00000000000..4a09235ddce
--- /dev/null
+++ b/storage/src/tests/persistence/persistenceproviderwrapper.cpp
@@ -0,0 +1,222 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <iostream>
+#include <sstream>
+#include <tests/persistence/persistenceproviderwrapper.h>
+
+#define LOG_SPI(ops) \
+ { \
+ std::ostringstream logStream; \
+ logStream << ops; \
+ _log.push_back(logStream.str()); \
+ }
+
+#define CHECK_ERROR(className, failType) \
+ { \
+ if (_result.getErrorCode() != spi::Result::NONE && (_failureMask & (failType))) { \
+ return className(_result.getErrorCode(), _result.getErrorMessage()); \
+ } \
+ }
+
+namespace storage {
+
+namespace {
+
+const char*
+includedVersionsToString(spi::IncludedVersions versions)
+{
+ switch (versions) {
+ case spi::NEWEST_DOCUMENT_ONLY:
+ return "NEWEST_DOCUMENT_ONLY";
+ case spi::NEWEST_DOCUMENT_OR_REMOVE:
+ return "NEWEST_DOCUMENT_OR_REMOVE";
+ case spi::ALL_VERSIONS:
+ return "ALL_VERSIONS";
+ }
+ return "!!UNKNOWN!!";
+}
+
+} // anon namespace
+
+std::string
+PersistenceProviderWrapper::toString() const
+{
+ std::ostringstream ss;
+ for (size_t i = 0; i < _log.size(); ++i) {
+ ss << _log[i] << "\n";
+ }
+ return ss.str();
+}
+
+spi::PartitionStateListResult
+PersistenceProviderWrapper::getPartitionStates() const
+{
+ LOG_SPI("getPartitionStates()");
+ return _spi.getPartitionStates();
+}
+
+spi::BucketIdListResult
+PersistenceProviderWrapper::listBuckets(spi::PartitionId partitionId) const
+{
+ LOG_SPI("listBuckets(" << uint16_t(partitionId) << ")");
+ CHECK_ERROR(spi::BucketIdListResult, FAIL_LIST_BUCKETS);
+ return _spi.listBuckets(partitionId);
+}
+
+spi::Result
+PersistenceProviderWrapper::createBucket(const spi::Bucket& bucket,
+ spi::Context& context)
+{
+ LOG_SPI("createBucket(" << bucket << ")");
+ CHECK_ERROR(spi::Result, FAIL_CREATE_BUCKET);
+ return _spi.createBucket(bucket, context);
+}
+
+spi::BucketInfoResult
+PersistenceProviderWrapper::getBucketInfo(const spi::Bucket& bucket) const
+{
+ LOG_SPI("getBucketInfo(" << bucket << ")");
+ CHECK_ERROR(spi::BucketInfoResult, FAIL_BUCKET_INFO);
+ return _spi.getBucketInfo(bucket);
+}
+
+spi::Result
+PersistenceProviderWrapper::put(const spi::Bucket& bucket,
+ spi::Timestamp timestamp,
+ const document::Document::SP& doc,
+ spi::Context& context)
+{
+ LOG_SPI("put(" << bucket << ", " << timestamp << ", " << doc->getId() << ")");
+ CHECK_ERROR(spi::Result, FAIL_PUT);
+ return _spi.put(bucket, timestamp, doc, context);
+}
+
+spi::RemoveResult
+PersistenceProviderWrapper::remove(const spi::Bucket& bucket,
+ spi::Timestamp timestamp,
+ const spi::DocumentId& id,
+ spi::Context& context)
+{
+ LOG_SPI("remove(" << bucket << ", " << timestamp << ", " << id << ")");
+ CHECK_ERROR(spi::RemoveResult, FAIL_REMOVE);
+ return _spi.remove(bucket, timestamp, id, context);
+}
+
+spi::RemoveResult
+PersistenceProviderWrapper::removeIfFound(const spi::Bucket& bucket,
+ spi::Timestamp timestamp,
+ const spi::DocumentId& id,
+ spi::Context& context)
+{
+ LOG_SPI("removeIfFound(" << bucket << ", " << timestamp << ", " << id << ")");
+ CHECK_ERROR(spi::RemoveResult, FAIL_REMOVE_IF_FOUND);
+ return _spi.removeIfFound(bucket, timestamp, id, context);
+}
+
+spi::UpdateResult
+PersistenceProviderWrapper::update(const spi::Bucket& bucket,
+ spi::Timestamp timestamp,
+ const document::DocumentUpdate::SP& upd,
+ spi::Context& context)
+{
+ LOG_SPI("update(" << bucket << ", " << timestamp << ", " << upd->getId() << ")");
+ CHECK_ERROR(spi::UpdateResult, FAIL_UPDATE);
+ return _spi.update(bucket, timestamp, upd, context);
+}
+
+spi::GetResult
+PersistenceProviderWrapper::get(const spi::Bucket& bucket,
+ const document::FieldSet& fieldSet,
+ const spi::DocumentId& id,
+ spi::Context& context) const
+{
+ LOG_SPI("get(" << bucket << ", " << id << ")");
+ CHECK_ERROR(spi::GetResult, FAIL_GET);
+ return _spi.get(bucket, fieldSet, id, context);
+}
+
+spi::Result
+PersistenceProviderWrapper::flush(const spi::Bucket& bucket,
+ spi::Context& context)
+{
+ LOG_SPI("flush(" << bucket << ")");
+ CHECK_ERROR(spi::Result, FAIL_FLUSH);
+ return _spi.flush(bucket, context);
+}
+
+spi::CreateIteratorResult
+PersistenceProviderWrapper::createIterator(const spi::Bucket& bucket,
+ const document::FieldSet& fields,
+ const spi::Selection& sel,
+ spi::IncludedVersions versions,
+ spi::Context& context)
+{
+ // TODO: proper printing of FieldSet and Selection
+
+ LOG_SPI("createIterator(" << bucket << ", "
+ << includedVersionsToString(versions) << ")");
+ CHECK_ERROR(spi::CreateIteratorResult, FAIL_CREATE_ITERATOR);
+ return _spi.createIterator(bucket, fields, sel, versions, context);
+}
+
+spi::IterateResult
+PersistenceProviderWrapper::iterate(spi::IteratorId iterId,
+ uint64_t maxByteSize,
+ spi::Context& context) const
+{
+ LOG_SPI("iterate(" << uint64_t(iterId) << ", " << maxByteSize << ")");
+ CHECK_ERROR(spi::IterateResult, FAIL_ITERATE);
+ return _spi.iterate(iterId, maxByteSize, context);
+}
+
+spi::Result
+PersistenceProviderWrapper::destroyIterator(spi::IteratorId iterId,
+ spi::Context& context)
+{
+ LOG_SPI("destroyIterator(" << uint64_t(iterId) << ")");
+ CHECK_ERROR(spi::Result, FAIL_DESTROY_ITERATOR);
+ return _spi.destroyIterator(iterId, context);
+}
+
+spi::Result
+PersistenceProviderWrapper::deleteBucket(const spi::Bucket& bucket,
+ spi::Context& context)
+{
+ LOG_SPI("deleteBucket(" << bucket << ")");
+ CHECK_ERROR(spi::Result, FAIL_DELETE_BUCKET);
+ return _spi.deleteBucket(bucket, context);
+}
+
+spi::Result
+PersistenceProviderWrapper::split(const spi::Bucket& source,
+ const spi::Bucket& target1,
+ const spi::Bucket& target2,
+ spi::Context& context)
+{
+ LOG_SPI("split(" << source << ", " << target1 << ", " << target2 << ")");
+ CHECK_ERROR(spi::Result, FAIL_SPLIT);
+ return _spi.split(source, target1, target2, context);
+}
+
+spi::Result
+PersistenceProviderWrapper::join(const spi::Bucket& source1,
+ const spi::Bucket& source2,
+ const spi::Bucket& target,
+ spi::Context& context)
+{
+ LOG_SPI("join(" << source1 << ", " << source2 << ", " << target << ")");
+ CHECK_ERROR(spi::Result, FAIL_JOIN);
+ return _spi.join(source1, source2, target, context);
+}
+
+spi::Result
+PersistenceProviderWrapper::removeEntry(const spi::Bucket& bucket,
+ spi::Timestamp timestamp,
+ spi::Context& context)
+{
+ LOG_SPI("revert(" << bucket << ", " << timestamp << ")");
+ CHECK_ERROR(spi::Result, FAIL_REVERT);
+ return _spi.removeEntry(bucket, timestamp, context);
+}
+
+}
diff --git a/storage/src/tests/persistence/persistenceproviderwrapper.h b/storage/src/tests/persistence/persistenceproviderwrapper.h
new file mode 100644
index 00000000000..b115eb7ef3d
--- /dev/null
+++ b/storage/src/tests/persistence/persistenceproviderwrapper.h
@@ -0,0 +1,153 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::PersistenceProviderWrapper
+ *
+ * \brief Test utility class for intercepting all operations upon a
+ * persistence layer, injecting errors and performing logging.
+ *
+ * The PersistenceProviderWrapper class implements the basic SPI by
+ * logging all operations and then delegating handling the operation
+ * to the SPI instance given during construction. If an error result
+ * is specified and the operation invoked is tagged that it should be
+ * failed via setFailureMask(), the operation on the wrapped SPI will
+ * not be executed, but the given error result will be immediately
+ * returned instead (wrapped in the proper return type).
+ */
+#pragma once
+
+#include <vector>
+#include <string>
+#include <vespa/persistence/spi/abstractpersistenceprovider.h>
+
+namespace storage {
+
+class PersistenceProviderWrapper : public spi::AbstractPersistenceProvider
+{
+public:
+ enum OPERATION_FAILURE_FLAGS
+ {
+ FAIL_LIST_BUCKETS = 1 << 0,
+ FAIL_BUCKET_INFO = 1 << 1,
+ FAIL_GET = 1 << 2,
+ FAIL_PUT = 1 << 3,
+ FAIL_REMOVE = 1 << 4,
+ FAIL_REMOVE_IF_FOUND = 1 << 5,
+ FAIL_REPLACE_WITH_REMOVE = 1 << 6,
+ FAIL_UPDATE = 1 << 7,
+ FAIL_REVERT = 1 << 8,
+ FAIL_FLUSH = 1 << 9,
+ FAIL_CREATE_ITERATOR = 1 << 10,
+ FAIL_ITERATE = 1 << 11,
+ FAIL_DESTROY_ITERATOR = 1 << 12,
+ FAIL_DELETE_BUCKET = 1 << 13,
+ FAIL_SPLIT = 1 << 14,
+ FAIL_JOIN = 1 << 15,
+ FAIL_CREATE_BUCKET = 1 << 16,
+ FAIL_BUCKET_PERSISTENCE = FAIL_PUT|FAIL_REMOVE|FAIL_UPDATE|FAIL_REVERT|FAIL_FLUSH,
+ FAIL_ALL_OPERATIONS = 0xffff,
+ // TODO: add more as needed
+ };
+private:
+ spi::PersistenceProvider& _spi;
+ spi::Result _result;
+ mutable std::vector<std::string> _log;
+ uint32_t _failureMask;
+public:
+ PersistenceProviderWrapper(spi::PersistenceProvider& spi)
+ : _spi(spi),
+ _result(spi::Result(spi::Result::NONE, "")),
+ _log(),
+ _failureMask(0)
+ {
+ }
+
+ /**
+ * Explicitly set result to anything != NONE to have all operations
+ * return the given error without the wrapped SPI ever being invoked.
+ */
+ void setResult(const spi::Result& result) {
+ _result = result;
+ }
+ void clearResult() {
+ _result = spi::Result(spi::Result::NONE, "");
+ }
+ const spi::Result& getResult() const { return _result; }
+ /**
+ * Set a mask for operations to fail with _result
+ */
+ void setFailureMask(uint32_t mask) { _failureMask = mask; }
+ uint32_t getFailureMask() const { return _failureMask; }
+
+ /**
+ * Get a string representation of all the operations performed on the
+ * SPI with a newline separating each operation.
+ */
+ std::string toString() const;
+ /**
+ * Clear log of all operations performed.
+ */
+ void clearOperationLog() { _log.clear(); }
+ const std::vector<std::string>& getOperationLog() const { return _log; }
+
+ spi::Result createBucket(const spi::Bucket&, spi::Context&);
+
+ spi::PartitionStateListResult getPartitionStates() const;
+
+ spi::BucketIdListResult listBuckets(spi::PartitionId) const;
+
+ spi::BucketInfoResult getBucketInfo(const spi::Bucket&) const;
+
+ spi::Result put(const spi::Bucket&, spi::Timestamp, const document::Document::SP&, spi::Context&);
+
+ spi::RemoveResult remove(const spi::Bucket&,
+ spi::Timestamp,
+ const spi::DocumentId&,
+ spi::Context&);
+
+ spi::RemoveResult removeIfFound(const spi::Bucket&,
+ spi::Timestamp,
+ const spi::DocumentId&,
+ spi::Context&);
+
+ spi::UpdateResult update(const spi::Bucket&,
+ spi::Timestamp,
+ const document::DocumentUpdate::SP&,
+ spi::Context&);
+
+ spi::GetResult get(const spi::Bucket&,
+ const document::FieldSet&,
+ const spi::DocumentId&,
+ spi::Context&) const;
+
+ spi::Result flush(const spi::Bucket&, spi::Context&);
+
+ spi::CreateIteratorResult createIterator(const spi::Bucket&,
+ const document::FieldSet&,
+ const spi::Selection&,
+ spi::IncludedVersions versions,
+ spi::Context&);
+
+ spi::IterateResult iterate(spi::IteratorId,
+ uint64_t maxByteSize, spi::Context&) const;
+
+ spi::Result destroyIterator(spi::IteratorId, spi::Context&);
+
+ spi::Result deleteBucket(const spi::Bucket&, spi::Context&);
+
+ spi::Result split(const spi::Bucket& source,
+ const spi::Bucket& target1,
+ const spi::Bucket& target2,
+ spi::Context&);
+
+ spi::Result join(const spi::Bucket& source1,
+ const spi::Bucket& source2,
+ const spi::Bucket& target,
+ spi::Context&);
+
+ spi::Result removeEntry(const spi::Bucket&,
+ spi::Timestamp,
+ spi::Context&);
+};
+
+} // storage
+
diff --git a/storage/src/tests/persistence/persistencequeuetest.cpp b/storage/src/tests/persistence/persistencequeuetest.cpp
new file mode 100644
index 00000000000..06daf2a975c
--- /dev/null
+++ b/storage/src/tests/persistence/persistencequeuetest.cpp
@@ -0,0 +1,103 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <tests/persistence/persistenceproviderwrapper.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <tests/persistence/filestorage/filestortestfixture.h>
+#include <tests/persistence/filestorage/forwardingmessagesender.h>
+
+LOG_SETUP(".persistencequeuetest");
+
+namespace storage {
+
+class PersistenceQueueTest : public FileStorTestFixture
+{
+public:
+ void testFetchNextUnlockedMessageIfBucketLocked();
+
+ std::shared_ptr<api::StorageMessage>
+ createPut(uint64_t bucket, uint64_t docIdx);
+
+ void setUp() override;
+ void tearDown() override;
+
+ CPPUNIT_TEST_SUITE(PersistenceQueueTest);
+ CPPUNIT_TEST(testFetchNextUnlockedMessageIfBucketLocked);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(PersistenceQueueTest);
+
+void
+PersistenceQueueTest::setUp()
+{
+ setupDisks(1);
+ _node->setPersistenceProvider(
+ spi::PersistenceProvider::UP(
+ new spi::dummy::DummyPersistence(_node->getTypeRepo(), 1)));
+}
+
+void
+PersistenceQueueTest::tearDown()
+{
+ _node.reset(0);
+}
+
+std::shared_ptr<api::StorageMessage>
+PersistenceQueueTest::createPut(uint64_t bucket, uint64_t docIdx)
+{
+ std::ostringstream id;
+ id << "id:foo:testdoctype1:n=" << bucket << ":" << docIdx;
+ document::Document::SP doc(
+ _node->getTestDocMan().createDocument("foobar", id.str()));
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(document::BucketId(16, bucket), doc, 1234));
+ cmd->setAddress(api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 0));
+ return cmd;
+}
+
+void
+PersistenceQueueTest::testFetchNextUnlockedMessageIfBucketLocked()
+{
+ DummyStorageLink top;
+ DummyStorageLink *dummyManager;
+ top.push_back(std::unique_ptr<StorageLink>(
+ dummyManager = new DummyStorageLink));
+ top.open();
+ ForwardingMessageSender messageSender(*dummyManager);
+
+ documentapi::LoadTypeSet loadTypes("raw:");
+ FileStorMetrics metrics(loadTypes.getMetricLoadTypes());
+ metrics.initDiskMetrics(_node->getPartitions().size(),
+ loadTypes.getMetricLoadTypes(), 1);
+
+ FileStorHandler filestorHandler(messageSender, metrics,
+ _node->getPartitions(),
+ _node->getComponentRegister(), 255, 0);
+
+ // Send 2 puts, 2 to the first bucket, 1 to the second. Calling
+ // getNextMessage 2 times should then return a lock on the first bucket,
+ // then subsequently on the second, skipping the already locked bucket.
+ // Puts all have same pri, so order is well defined.
+ filestorHandler.schedule(createPut(1234, 0), 0);
+ filestorHandler.schedule(createPut(1234, 1), 0);
+ filestorHandler.schedule(createPut(5432, 0), 0);
+
+ auto lock0 = filestorHandler.getNextMessage(0, 255);
+ CPPUNIT_ASSERT(lock0.first.get());
+ CPPUNIT_ASSERT_EQUAL(
+ document::BucketId(16, 1234),
+ dynamic_cast<api::PutCommand&>(*lock0.second).getBucketId());
+
+ auto lock1 = filestorHandler.getNextMessage(0, 255);
+ CPPUNIT_ASSERT(lock1.first.get());
+ CPPUNIT_ASSERT_EQUAL(
+ document::BucketId(16, 5432),
+ dynamic_cast<api::PutCommand&>(*lock1.second).getBucketId());
+}
+
+} // namespace storage
diff --git a/storage/src/tests/persistence/persistencetestutils.cpp b/storage/src/tests/persistence/persistencetestutils.cpp
new file mode 100644
index 00000000000..47ec23147f1
--- /dev/null
+++ b/storage/src/tests/persistence/persistencetestutils.cpp
@@ -0,0 +1,412 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+
+#include <vespa/document/datatype/documenttype.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <tests/persistence/persistencetestutils.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+
+using document::DocumentType;
+using storage::framework::defaultimplementation::AllocationLogic;
+
+namespace storage {
+
+namespace {
+
+ spi::LoadType defaultLoadType(0, "default");
+
+ vdstestlib::DirConfig initialize(uint32_t numDisks) {
+ system(vespalib::make_string("rm -rf vdsroot").c_str());
+ for (uint32_t i = 0; i < numDisks; i++) {
+ system(vespalib::make_string("mkdir -p vdsroot/disks/d%d", i).c_str());
+ }
+ vdstestlib::DirConfig config(getStandardConfig(true));
+ return config;
+ }
+
+ template<typename T>
+ struct ConfigReader : public T::Subscriber
+ {
+ T config;
+
+ ConfigReader(const std::string& configId) {
+ T::subscribe(configId, *this);
+ }
+ void configure(const T& c) { config = c; }
+ };
+}
+
+PersistenceTestEnvironment::PersistenceTestEnvironment(DiskCount numDisks)
+ : _config(initialize(numDisks)),
+ _messageKeeper(),
+ _node(numDisks, NodeIndex(0), _config.getConfigId()),
+ _component(_node.getComponentRegister(), "persistence test env"),
+ _metrics(_component.getLoadTypes()->getMetricLoadTypes())
+{
+ _node.setupDummyPersistence();
+ _metrics.initDiskMetrics(
+ numDisks, _node.getLoadTypes()->getMetricLoadTypes(), 1);
+ _handler.reset(new FileStorHandler(
+ _messageKeeper, _metrics,
+ _node.getPersistenceProvider().getPartitionStates().getList(),
+ _node.getComponentRegister(), 255, 0));
+ for (uint32_t i = 0; i < numDisks; i++) {
+ _diskEnvs.push_back(
+ vespalib::LinkedPtr<PersistenceUtil>(
+ new PersistenceUtil(
+ _config.getConfigId(),
+ _node.getComponentRegister(),
+ *_handler,
+ *_metrics.disks[i]->threads[0],
+ i,
+ 255,
+ _node.getPersistenceProvider())));
+ }
+}
+
+PersistenceTestUtils::PersistenceTestUtils()
+{
+}
+
+PersistenceTestUtils::~PersistenceTestUtils()
+{
+}
+
+std::string
+PersistenceTestUtils::dumpBucket(const document::BucketId& bid,
+ uint16_t disk) {
+ return dynamic_cast<spi::dummy::DummyPersistence&>(_env->_node.getPersistenceProvider()).dumpBucket(spi::Bucket(bid, spi::PartitionId(disk)));
+}
+
+void
+PersistenceTestUtils::setupDisks(uint32_t numDisks) {
+ _env.reset(new PersistenceTestEnvironment(DiskCount(numDisks)));
+}
+
+std::unique_ptr<PersistenceThread>
+PersistenceTestUtils::createPersistenceThread(uint32_t disk)
+{
+ return std::unique_ptr<PersistenceThread>(
+ new PersistenceThread(_env->_node.getComponentRegister(),
+ _env->_config.getConfigId(),
+ getPersistenceProvider(),
+ getEnv()._fileStorHandler,
+ getEnv()._metrics,
+ disk,
+ 255,
+ false));
+}
+
+document::Document::SP
+PersistenceTestUtils::schedulePut(
+ uint32_t location,
+ spi::Timestamp timestamp,
+ uint16_t disk,
+ uint32_t minSize,
+ uint32_t maxSize)
+{
+ document::Document::SP doc(createRandomDocumentAtLocation(
+ location, timestamp, minSize, maxSize));
+ std::shared_ptr<api::StorageMessage> msg(
+ new api::PutCommand(
+ document::BucketId(16, location), doc, timestamp));
+ fsHandler().schedule(msg, disk);
+ return doc;
+}
+
+StorBucketDatabase::WrappedEntry
+PersistenceTestUtils::getBucket(const document::BucketId& id)
+{
+ return _env->_node.getStorageBucketDatabase().get(id, "foo");
+}
+
+StorBucketDatabase::WrappedEntry
+PersistenceTestUtils::createBucket(const document::BucketId& id)
+{
+ return _env->_node.getStorageBucketDatabase().get(
+ id,
+ "foo",
+ StorBucketDatabase::CREATE_IF_NONEXISTING);
+}
+
+spi::PersistenceProvider&
+PersistenceTestUtils::getPersistenceProvider()
+{
+ return _env->_node.getPersistenceProvider();
+}
+
+std::string
+PersistenceTestUtils::getBucketStatus(const document::BucketId& id)
+{
+ std::ostringstream ost;
+ StorBucketDatabase::WrappedEntry entry(
+ _env->_node.getStorageBucketDatabase().get(
+ id, "foo"));
+
+ ost << id << ": ";
+ if (!entry.exist()) {
+ ost << "null";
+ } else {
+ ost << entry->getBucketInfo().getDocumentCount() << "," << entry->disk;
+ }
+
+ return ost.str();
+}
+
+document::Document::SP
+PersistenceTestUtils::doPutOnDisk(
+ uint16_t disk,
+ uint32_t location,
+ spi::Timestamp timestamp,
+ uint32_t minSize,
+ uint32_t maxSize)
+{
+ document::Document::SP doc(createRandomDocumentAtLocation(
+ location, timestamp, minSize, maxSize));
+ spi::Bucket b(document::BucketId(16, location), spi::PartitionId(disk));
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+
+ getPersistenceProvider().createBucket(b, context);
+
+ getPersistenceProvider().put(spi::Bucket(b), timestamp, doc, context);
+
+ getPersistenceProvider().flush(b, context);
+ return doc;
+}
+
+bool
+PersistenceTestUtils::doRemoveOnDisk(
+ uint16_t disk,
+ const document::BucketId& bucketId,
+ const document::DocumentId& docId,
+ spi::Timestamp timestamp,
+ bool persistRemove)
+{
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ if (persistRemove) {
+ spi::RemoveResult result = getPersistenceProvider().removeIfFound(
+ spi::Bucket(bucketId, spi::PartitionId(disk)),
+ timestamp, docId, context);
+ return result.wasFound();
+ }
+ spi::RemoveResult result = getPersistenceProvider().remove(
+ spi::Bucket(bucketId, spi::PartitionId(disk)),
+ timestamp, docId, context);
+
+ return result.wasFound();
+}
+
+bool
+PersistenceTestUtils::doUnrevertableRemoveOnDisk(
+ uint16_t disk,
+ const document::BucketId& bucketId,
+ const document::DocumentId& docId,
+ spi::Timestamp timestamp)
+{
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ spi::RemoveResult result = getPersistenceProvider().remove(
+ spi::Bucket(bucketId, spi::PartitionId(disk)),
+ timestamp, docId, context);
+ return result.wasFound();
+}
+
+spi::GetResult
+PersistenceTestUtils::doGetOnDisk(
+ uint16_t disk,
+ const document::BucketId& bucketId,
+ const document::DocumentId& docId,
+ bool headerOnly)
+{
+ document::FieldSet::UP fieldSet(new document::AllFields());
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ if (headerOnly) {
+ fieldSet.reset(new document::HeaderFields());
+ }
+ return getPersistenceProvider().get(spi::Bucket(
+ bucketId, spi::PartitionId(disk)), *fieldSet, docId, context);
+}
+
+document::DocumentUpdate::SP
+PersistenceTestUtils::createBodyUpdate(
+ const document::DocumentId& docId,
+ const document::FieldValue& updateValue)
+{
+ const DocumentType* docType(_env->_component.getTypeRepo()
+ ->getDocumentType("testdoctype1"));
+ document::DocumentUpdate::SP update(
+ new document::DocumentUpdate(*docType, docId));
+ std::shared_ptr<document::AssignValueUpdate> assignUpdate(
+ new document::AssignValueUpdate(updateValue));
+ document::FieldUpdate fieldUpdate(docType->getField("content"));
+ fieldUpdate.addUpdate(*assignUpdate);
+ update->addUpdate(fieldUpdate);
+ return update;
+}
+
+document::DocumentUpdate::SP
+PersistenceTestUtils::createHeaderUpdate(
+ const document::DocumentId& docId,
+ const document::FieldValue& updateValue)
+{
+ const DocumentType* docType(_env->_component.getTypeRepo()
+ ->getDocumentType("testdoctype1"));
+ document::DocumentUpdate::SP update(
+ new document::DocumentUpdate(*docType, docId));
+ std::shared_ptr<document::AssignValueUpdate> assignUpdate(
+ new document::AssignValueUpdate(updateValue));
+ document::FieldUpdate fieldUpdate(docType->getField("headerval"));
+ fieldUpdate.addUpdate(*assignUpdate);
+ update->addUpdate(fieldUpdate);
+ return update;
+}
+
+uint16_t
+PersistenceTestUtils::getDiskFromBucketDatabaseIfUnset(const document::BucketId& bucket,
+ uint16_t disk)
+{
+ if (disk == 0xffff) {
+ StorBucketDatabase::WrappedEntry entry(
+ getEnv().getBucketDatabase().get(bucket, "createTestBucket"));
+ if (entry.exist()) {
+ return entry->disk;
+ } else {
+ std::ostringstream error;
+ error << bucket << " not in db and disk unset";
+ throw vespalib::IllegalStateException(error.str(), VESPA_STRLOC);
+ }
+ }
+ return disk;
+}
+
+void
+PersistenceTestUtils::doPut(const document::Document::SP& doc,
+ spi::Timestamp time,
+ uint16_t disk,
+ uint16_t usedBits)
+{
+ document::BucketId bucket(
+ _env->_component.getBucketIdFactory().getBucketId(doc->getId()));
+ bucket.setUsedBits(usedBits);
+ disk = getDiskFromBucketDatabaseIfUnset(bucket, disk);
+
+ doPut(doc, bucket, time, disk);
+}
+
+void
+PersistenceTestUtils::doPut(const document::Document::SP& doc,
+ document::BucketId bid,
+ spi::Timestamp time,
+ uint16_t disk)
+{
+ spi::Bucket b(bid, spi::PartitionId(disk));
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ getPersistenceProvider().createBucket(b, context);
+ getPersistenceProvider().put(b, time, doc, context);
+}
+
+spi::UpdateResult
+PersistenceTestUtils::doUpdate(document::BucketId bid,
+ const document::DocumentUpdate::SP& update,
+ spi::Timestamp time,
+ uint16_t disk)
+{
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ return getPersistenceProvider().update(
+ spi::Bucket(bid, spi::PartitionId(disk)), time, update, context);
+}
+
+void
+PersistenceTestUtils::doRemove(const document::DocumentId& id, spi::Timestamp time,
+ uint16_t disk, bool unrevertableRemove,
+ uint16_t usedBits)
+{
+ document::BucketId bucket(
+ _env->_component.getBucketIdFactory().getBucketId(id));
+ bucket.setUsedBits(usedBits);
+ disk = getDiskFromBucketDatabaseIfUnset(bucket, disk);
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ if (unrevertableRemove) {
+ getPersistenceProvider().remove(
+ spi::Bucket(bucket, spi::PartitionId(disk)), time, id, context);
+ } else {
+ spi::RemoveResult result = getPersistenceProvider().removeIfFound(
+ spi::Bucket(bucket, spi::PartitionId(disk)), time, id, context);
+ if (!result.wasFound()) {
+ throw vespalib::IllegalStateException(
+ "Attempted to remove non-existing doc " + id.toString(),
+ VESPA_STRLOC);
+ }
+ }
+}
+
+void
+PersistenceTestUtils::clearBody(document::Document& doc)
+{
+ // FIXME(vekterli): temporary solution while we don't have
+ // fieldset pruning functionality in Document.
+ //doc->getBody().clear();
+ vespalib::nbostream stream;
+ doc.serializeHeader(stream);
+ doc.deserialize(*_env->_component.getTypeRepo(), stream);
+}
+
+document::Document::UP
+PersistenceTestUtils::createRandomDocumentAtLocation(
+ uint64_t location, uint32_t seed,
+ uint32_t minDocSize, uint32_t maxDocSize)
+{
+ return _env->_testDocMan.createRandomDocumentAtLocation(
+ location, seed, minDocSize, maxDocSize);
+}
+
+void
+PersistenceTestUtils::createTestBucket(const document::BucketId& bucket,
+ uint16_t disk)
+{
+
+ uint32_t opsPerType = 2;
+ uint32_t numberOfLocations = 2;
+ uint32_t minDocSize = 0;
+ uint32_t maxDocSize = 128;
+ for (uint32_t useHeaderOnly = 0; useHeaderOnly < 2; ++useHeaderOnly) {
+ bool headerOnly = (useHeaderOnly == 1);
+ for (uint32_t optype=0; optype < 4; ++optype) {
+ for (uint32_t i=0; i<opsPerType; ++i) {
+ uint32_t seed = useHeaderOnly * 10000 + optype * 1000 + i + 1;
+ uint64_t location = (seed % numberOfLocations);
+ location <<= 32;
+ location += (bucket.getRawId() & 0xffffffff);
+ document::Document::SP doc(
+ createRandomDocumentAtLocation(
+ location, seed, minDocSize, maxDocSize));
+ if (headerOnly) {
+ clearBody(*doc);
+ }
+ doPut(doc, spi::Timestamp(seed), disk, bucket.getUsedBits());
+ if (optype == 0) { // Regular put
+ } else if (optype == 1) { // Overwritten later in time
+ document::Document::SP doc2(new document::Document(*doc));
+ doc2->setValue(doc2->getField("content"),
+ document::StringFieldValue("overwritten"));
+ doPut(doc2, spi::Timestamp(seed + 500),
+ disk, bucket.getUsedBits());
+ } else if (optype == 2) { // Removed
+ doRemove(doc->getId(), spi::Timestamp(seed + 500), disk, false,
+ bucket.getUsedBits());
+ } else if (optype == 3) { // Unrevertable removed
+ doRemove(doc->getId(), spi::Timestamp(seed), disk, true,
+ bucket.getUsedBits());
+ }
+ }
+ }
+ }
+}
+
+} // storage
diff --git a/storage/src/tests/persistence/persistencetestutils.h b/storage/src/tests/persistence/persistencetestutils.h
new file mode 100644
index 00000000000..d584b4dce45
--- /dev/null
+++ b/storage/src/tests/persistence/persistencetestutils.h
@@ -0,0 +1,214 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/document/base/testdocman.h>
+#include <vespa/storage/common/messagesender.h>
+#include <vespa/storage/common/storagecomponent.h>
+#include <vespa/storage/persistence/filestorage/filestorhandler.h>
+#include <vespa/storage/persistence/persistenceutil.h>
+#include <vespa/storageframework/defaultimplementation/memory/memorymanager.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/testhelper.h>
+#include <vespa/persistence/spi/persistenceprovider.h>
+#include <vespa/storage/persistence/persistencethread.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+
+namespace storage {
+
+struct MessageKeeper : public MessageSender {
+ std::vector<api::StorageMessage::SP> _msgs;
+
+ void sendCommand(const api::StorageCommand::SP& m) { _msgs.push_back(m); }
+ void sendReply(const api::StorageReply::SP& m) { _msgs.push_back(m); }
+};
+
+struct PersistenceTestEnvironment {
+ PersistenceTestEnvironment(
+ DiskCount numDisks);
+
+ document::TestDocMan _testDocMan;
+ vdstestlib::DirConfig _config;
+ MessageKeeper _messageKeeper;
+ TestServiceLayerApp _node;
+ StorageComponent _component;
+ FileStorMetrics _metrics;
+ std::unique_ptr<FileStorHandler> _handler;
+ std::vector<vespalib::LinkedPtr<PersistenceUtil> > _diskEnvs;
+};
+
+class PersistenceTestUtils : public CppUnit::TestFixture {
+private:
+ std::unique_ptr<PersistenceTestEnvironment> _env;
+
+public:
+ PersistenceTestUtils();
+ virtual ~PersistenceTestUtils();
+
+ document::Document::SP schedulePut(
+ uint32_t location,
+ spi::Timestamp timestamp,
+ uint16_t disk,
+ uint32_t minSize = 0,
+ uint32_t maxSize = 128);
+
+ void setupDisks(uint32_t disks);
+
+ void tearDown() {
+ _env.reset();
+ }
+
+ std::string dumpBucket(const document::BucketId& bid, uint16_t disk = 0);
+
+ PersistenceUtil& getEnv(uint32_t disk = 0)
+ { return *_env->_diskEnvs[disk]; }
+ FileStorHandler& fsHandler() { return *_env->_handler; }
+ FileStorMetrics& metrics() { return _env->_metrics; }
+ MessageKeeper& messageKeeper() { return _env->_messageKeeper; }
+ document::DocumentTypeRepo::SP getTypeRepo() { return _env->_component.getTypeRepo(); }
+ StorageComponent& getComponent() { return _env->_component; }
+ TestServiceLayerApp& getNode() { return _env->_node; }
+
+ StorBucketDatabase::WrappedEntry getBucket(const document::BucketId& id);
+ StorBucketDatabase::WrappedEntry createBucket(const document::BucketId& id);
+
+ std::string getBucketStatus(const document::BucketId& id);
+
+ spi::PersistenceProvider& getPersistenceProvider();
+
+ /**
+ Performs a put to the given disk.
+ Returns the document that was inserted.
+ */
+ document::Document::SP doPutOnDisk(
+ uint16_t disk,
+ uint32_t location,
+ spi::Timestamp timestamp,
+ uint32_t minSize = 0,
+ uint32_t maxSize = 128);
+
+ document::Document::SP doPut(
+ uint32_t location,
+ spi::Timestamp timestamp,
+ uint32_t minSize = 0,
+ uint32_t maxSize = 128)
+ { return doPutOnDisk(0, location, timestamp, minSize, maxSize); }
+
+ /**
+ Performs a remove to the given disk.
+ Returns the new doccount if document was removed, or -1 if not found.
+ */
+ bool doRemoveOnDisk(
+ uint16_t disk,
+ const document::BucketId& bid,
+ const document::DocumentId& id,
+ spi::Timestamp timestamp,
+ bool persistRemove);
+
+ bool doRemove(
+ const document::BucketId& bid,
+ const document::DocumentId& id,
+ spi::Timestamp timestamp,
+ bool persistRemove) {
+ return doRemoveOnDisk(0, bid, id, timestamp, persistRemove);
+ }
+
+ bool doUnrevertableRemoveOnDisk(uint16_t disk,
+ const document::BucketId& bid,
+ const document::DocumentId& id,
+ spi::Timestamp timestamp);
+
+ bool doUnrevertableRemove(const document::BucketId& bid,
+ const document::DocumentId& id,
+ spi::Timestamp timestamp)
+ {
+ return doUnrevertableRemoveOnDisk(0, bid, id, timestamp);
+ }
+
+ /**
+ * Do a remove toward storage set up in test environment.
+ *
+ * @id Document to remove.
+ * @disk If set, use this disk, otherwise lookup in bucket db.
+ * @unrevertableRemove If set, instead of adding put, turn put to remove.
+ * @usedBits Generate bucket to use from docid using this amount of bits.
+ */
+ void doRemove(const document::DocumentId& id, spi::Timestamp, uint16_t disk = 0xffff,
+ bool unrevertableRemove = false, uint16_t usedBits = 16);
+
+ spi::GetResult doGetOnDisk(
+ uint16_t disk,
+ const document::BucketId& bucketId,
+ const document::DocumentId& docId,
+ bool headerOnly);
+
+ spi::GetResult doGet(
+ const document::BucketId& bucketId,
+ const document::DocumentId& docId,
+ bool headerOnly)
+ { return doGetOnDisk(0, bucketId, docId, headerOnly); }
+
+ document::DocumentUpdate::SP createBodyUpdate(
+ const document::DocumentId& id,
+ const document::FieldValue& updateValue);
+
+ document::DocumentUpdate::SP createHeaderUpdate(
+ const document::DocumentId& id,
+ const document::FieldValue& updateValue);
+
+ uint16_t getDiskFromBucketDatabaseIfUnset(const document::BucketId&,
+ uint16_t disk = 0xffff);
+
+ /**
+ * Do a put toward storage set up in test environment.
+ *
+ * @doc Document to put. Use TestDocMan to generate easily.
+ * @disk If set, use this disk, otherwise lookup in bucket db.
+ * @usedBits Generate bucket to use from docid using this amount of bits.
+ */
+ void doPut(const document::Document::SP& doc, spi::Timestamp,
+ uint16_t disk = 0xffff, uint16_t usedBits = 16);
+
+ void doPut(const document::Document::SP& doc,
+ document::BucketId bid,
+ spi::Timestamp time,
+ uint16_t disk = 0);
+
+ spi::UpdateResult doUpdate(document::BucketId bid,
+ const document::DocumentUpdate::SP& update,
+ spi::Timestamp time,
+ uint16_t disk = 0);
+
+ document::Document::UP createRandomDocumentAtLocation(
+ uint64_t location, uint32_t seed,
+ uint32_t minDocSize, uint32_t maxDocSize);
+
+ /**
+ * Create a test bucket with various content representing most states a
+ * bucket can represent. (Such that tests have a nice test bucket to use
+ * that require operations to handle all the various bucket contents.
+ *
+ * @disk If set, use this disk, otherwise lookup in bucket db.
+ */
+ void createTestBucket(const document::BucketId&, uint16_t disk = 0xffff);
+
+ /**
+ * Create a new persistence thread.
+ */
+ std::unique_ptr<PersistenceThread> createPersistenceThread(uint32_t disk);
+
+ /**
+ * In-place modify doc so that it has no more body fields.
+ */
+ void clearBody(document::Document& doc);
+};
+
+class SingleDiskPersistenceTestUtils : public PersistenceTestUtils
+{
+public:
+ void setUp() {
+ setupDisks(1);
+ }
+};
+
+} // storage
+
diff --git a/storage/src/tests/persistence/persistencethread_splittest.cpp b/storage/src/tests/persistence/persistencethread_splittest.cpp
new file mode 100644
index 00000000000..f50e62e0aeb
--- /dev/null
+++ b/storage/src/tests/persistence/persistencethread_splittest.cpp
@@ -0,0 +1,234 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/persistence/persistencethread.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <tests/persistence/persistencetestutils.h>
+
+namespace storage {
+namespace {
+ spi::LoadType defaultLoadType(0, "default");
+}
+
+struct PersistenceThread_SplitTest : public SingleDiskPersistenceTestUtils
+{
+ enum SplitCase {
+ TOO_MANY_DOCS_SPLIT_ONCE, // Only one split needed to divide
+ TOO_MANY_DOCS_SPLIT_MULTIPLE_BITS, // Multiple bits needed to divide
+ TOO_MANY_DOCS_ACTUALLY_NOT, // Other copy is too big but not this one
+ // Multi bits needed, but dont do it.
+ TOO_LARGE_DOCS_SPLIT_ONCE,
+ TOO_LARGE_DOCS_SPLIT_MULTIPLE_BITS,
+ TOO_LARGE_DOCS_SINGLE_DOC, // Cannot split single doc even if too large
+ TOO_LARGE_DOCS_ACTUALLY_NOT, // Other copy is too large, not this one
+ // Need to split to X bits to get in line with other copy or distr.
+ SPLIT_TOO_LITTLE_SINGLE_SPLIT, // Split all to one target
+ SPLIT_TOO_LITTLE_JUST_RIGHT, // Just manage to split in two at that lvl
+ SPLIT_TOO_LITTLE_SPLIT_TOWARDS_ENOUGH, // Has to split shorter
+ SPLIT_INCONSISTENT_1_DOC,
+ SPLIT_INCONSISTENT_ALL_DOCS_SAME_GID,
+ };
+
+ void doTest(SplitCase);
+
+ void testTooManyDocsSplitOnce()
+ { doTest(TOO_MANY_DOCS_SPLIT_ONCE); }
+ void testTooManyDocsSplitMulti()
+ { doTest(TOO_MANY_DOCS_SPLIT_MULTIPLE_BITS); }
+ void testTooManyDocsActuallyNot()
+ { doTest(TOO_MANY_DOCS_ACTUALLY_NOT); }
+ void testTooLargeDocsSplitOnce()
+ { doTest(TOO_LARGE_DOCS_SPLIT_ONCE); }
+ void testTooLargeDocsSplitMulti()
+ { doTest(TOO_LARGE_DOCS_SPLIT_MULTIPLE_BITS); }
+ void testTooLargeDocsSingleDoc()
+ { doTest(TOO_LARGE_DOCS_SINGLE_DOC); }
+ void testTooLargeDocsActuallyNot()
+ { doTest(TOO_LARGE_DOCS_ACTUALLY_NOT); }
+ void testSplitTooLittleSingleSplit()
+ { doTest(SPLIT_TOO_LITTLE_SINGLE_SPLIT); }
+ void testSplitTooLittleJustRight()
+ { doTest(SPLIT_TOO_LITTLE_JUST_RIGHT); }
+ void testSplitTooLittleSplitTowardsEnough()
+ { doTest(SPLIT_TOO_LITTLE_SPLIT_TOWARDS_ENOUGH); }
+ void testInconsistentSplitHasOneBitFallbackWhen1Doc() {
+ doTest(SPLIT_INCONSISTENT_1_DOC);
+ }
+ void testInconsistentSplitHasOneBitFallbackWhenAllDocsHaveSameGid() {
+ doTest(SPLIT_INCONSISTENT_ALL_DOCS_SAME_GID);
+ }
+
+ CPPUNIT_TEST_SUITE(PersistenceThread_SplitTest);
+ CPPUNIT_TEST(testTooManyDocsSplitOnce);
+ CPPUNIT_TEST(testTooManyDocsSplitMulti);
+ CPPUNIT_TEST(testTooManyDocsActuallyNot);
+ CPPUNIT_TEST(testTooLargeDocsSplitOnce);
+ CPPUNIT_TEST(testTooLargeDocsSplitMulti);
+ CPPUNIT_TEST(testTooLargeDocsSingleDoc);
+ CPPUNIT_TEST(testTooLargeDocsActuallyNot);
+ CPPUNIT_TEST(testSplitTooLittleSingleSplit);
+ CPPUNIT_TEST(testSplitTooLittleJustRight);
+ CPPUNIT_TEST(testSplitTooLittleSplitTowardsEnough);
+ CPPUNIT_TEST(testInconsistentSplitHasOneBitFallbackWhen1Doc);
+ CPPUNIT_TEST(testInconsistentSplitHasOneBitFallbackWhenAllDocsHaveSameGid);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(PersistenceThread_SplitTest);
+
+void
+PersistenceThread_SplitTest::doTest(SplitCase splitCase)
+{
+ uint32_t maxCount = 4;
+ uint32_t maxSize = 1000 * 1000;
+ uint32_t maxBits = 58;
+ uint32_t minBits = 1;
+ uint32_t docCount = 8;
+ uint32_t docSize = 100 * 1000;
+ uint32_t currentSplitLevel = 1;
+ uint32_t splitLevelToDivide = 2;
+ uint32_t resultSplitLevel = 2;
+ size_t resultBuckets = 2;
+ bool simulateGidCollision = false;
+ api::ReturnCode error(api::ReturnCode::OK);
+ switch (splitCase) {
+ case TOO_MANY_DOCS_SPLIT_ONCE:
+ break; // Default. Do nothing
+ case TOO_MANY_DOCS_SPLIT_MULTIPLE_BITS:
+ splitLevelToDivide = 3;
+ resultSplitLevel = 3;
+ break;
+ case TOO_MANY_DOCS_ACTUALLY_NOT:
+ splitLevelToDivide = 3;
+ docCount = 2;
+ resultBuckets = 1;
+ break;
+ case TOO_LARGE_DOCS_SPLIT_ONCE:
+ maxCount = 100;
+ docSize = 400 * 1000;
+ break;
+ case TOO_LARGE_DOCS_SPLIT_MULTIPLE_BITS:
+ maxCount = 100;
+ docSize = 400 * 1000;
+ splitLevelToDivide = 3;
+ resultSplitLevel = 3;
+ break;
+ case TOO_LARGE_DOCS_SINGLE_DOC:
+ // It is possible for bucket to be inconsistent being big enough
+ // to split in other copy but this copy has only 1 too big doc.
+ docCount = 1;
+ docSize = 3000 * 1000;
+ splitLevelToDivide = 3;
+ resultBuckets = 1;
+ break;
+ case TOO_LARGE_DOCS_ACTUALLY_NOT:
+ maxCount = 100;
+ splitLevelToDivide = 3;
+ resultSplitLevel = 2;
+ resultBuckets = 1;
+ break;
+ case SPLIT_TOO_LITTLE_SINGLE_SPLIT:
+ maxBits = 5;
+ maxSize = 0;
+ maxCount = 0;
+ splitLevelToDivide = 16;
+ resultSplitLevel = 5;
+ resultBuckets = 1;
+ break;
+ case SPLIT_TOO_LITTLE_JUST_RIGHT:
+ maxBits = 5;
+ maxSize = 0;
+ maxCount = 0;
+ splitLevelToDivide = 5;
+ resultSplitLevel = 5;
+ break;
+ case SPLIT_TOO_LITTLE_SPLIT_TOWARDS_ENOUGH:
+ maxBits = 8;
+ maxSize = 0;
+ maxCount = 0;
+ splitLevelToDivide = 5;
+ resultSplitLevel = 5;
+ break;
+ case SPLIT_INCONSISTENT_1_DOC:
+ docCount = 1;
+ maxSize = 0;
+ maxCount = 0;
+ currentSplitLevel = 16;
+ resultSplitLevel = 17;
+ resultBuckets = 1;
+ break;
+ case SPLIT_INCONSISTENT_ALL_DOCS_SAME_GID:
+ docCount = 2;
+ maxSize = 0;
+ maxCount = 0;
+ currentSplitLevel = 16;
+ resultSplitLevel = 17;
+ resultBuckets = 1;
+ simulateGidCollision = true;
+ break;
+ default:
+ assert(false);
+ }
+
+ uint64_t location = 0;
+ uint64_t splitMask = 1 << (splitLevelToDivide - 1);
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ spi::Bucket bucket(document::BucketId(currentSplitLevel, 1),
+ spi::PartitionId(0));
+ spi::PersistenceProvider& spi(getPersistenceProvider());
+ spi.deleteBucket(bucket, context);
+ spi.createBucket(bucket, context);
+ document::TestDocMan testDocMan;
+ for (uint32_t i=0; i<docCount; ++i) {
+ uint64_t docloc;
+ uint32_t seed;
+ if (!simulateGidCollision) {
+ docloc = location | (i % 2 == 0 ? 0 : splitMask);
+ seed = i;
+ } else {
+ docloc = location;
+ seed = 0;
+ }
+ document::Document::SP doc(testDocMan.createRandomDocumentAtLocation(
+ docloc, seed, docSize, docSize));
+ spi.put(bucket, spi::Timestamp(1000 + i), doc, context);
+ }
+
+ std::unique_ptr<PersistenceThread> thread(createPersistenceThread(0));
+ getNode().getStateUpdater().setClusterState(
+ lib::ClusterState::CSP(
+ new lib::ClusterState("distributor:1 storage:1")));
+ api::SplitBucketCommand cmd(document::BucketId(currentSplitLevel, 1));
+ cmd.setMaxSplitBits(maxBits);
+ cmd.setMinSplitBits(minBits);
+ cmd.setMinByteSize(maxSize);
+ cmd.setMinDocCount(maxCount);
+ cmd.setSourceIndex(0);
+ MessageTracker::UP result(thread->handleSplitBucket(cmd));
+ api::ReturnCode code(result->getResult());
+ CPPUNIT_ASSERT_EQUAL(error, code);
+ if (!code.success()) return;
+ api::SplitBucketReply& reply(
+ dynamic_cast<api::SplitBucketReply&>(*result->getReply()));
+ std::set<std::string> expected;
+ for (uint32_t i=0; i<resultBuckets; ++i) {
+ document::BucketId b(resultSplitLevel,
+ location | (i == 0 ? 0 : splitMask));
+ std::ostringstream ost;
+ ost << b << " - " << b.getUsedBits();
+ expected.insert(ost.str());
+ }
+ std::set<std::string> actual;
+ for (uint32_t i=0; i<reply.getSplitInfo().size(); ++i) {
+ std::ostringstream ost;
+ document::BucketId b(reply.getSplitInfo()[i].first);
+ ost << b << " - " << b.getUsedBits();
+ actual.insert(ost.str());
+ }
+ CPPUNIT_ASSERT_EQUAL(expected, actual);
+}
+
+} // storage
+
diff --git a/storage/src/tests/persistence/processalltest.cpp b/storage/src/tests/persistence/processalltest.cpp
new file mode 100644
index 00000000000..db75725db6f
--- /dev/null
+++ b/storage/src/tests/persistence/processalltest.cpp
@@ -0,0 +1,262 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/document/base/testdocman.h>
+#include <vespa/storage/persistence/processallhandler.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/documentapi/loadtypes/loadtype.h>
+#include <tests/persistence/persistencetestutils.h>
+
+namespace storage {
+
+class ProcessAllHandlerTest : public SingleDiskPersistenceTestUtils
+{
+ CPPUNIT_TEST_SUITE(ProcessAllHandlerTest);
+ CPPUNIT_TEST(testRemoveLocation);
+ CPPUNIT_TEST(testRemoveLocationDocumentSubset);
+ CPPUNIT_TEST(testRemoveLocationUnknownDocType);
+ CPPUNIT_TEST(testRemoveLocationBogusSelection);
+ CPPUNIT_TEST(testStat);
+ CPPUNIT_TEST(testStatWithRemove);
+ CPPUNIT_TEST(testStatWholeBucket);
+ CPPUNIT_TEST_SUITE_END();
+
+public:
+ void testRemoveLocation();
+ void testRemoveLocationDocumentSubset();
+ void testRemoveLocationUnknownDocType();
+ void testRemoveLocationEmptySelection();
+ void testRemoveLocationBogusSelection();
+ void testStat();
+ void testStatWithRemove();
+ void testStatWholeBucket();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(ProcessAllHandlerTest);
+
+void
+ProcessAllHandlerTest::testRemoveLocation()
+{
+ document::BucketId bucketId(16, 4);
+ doPut(4, spi::Timestamp(1234));
+ doPut(4, spi::Timestamp(2345));
+
+ api::RemoveLocationCommand removeLocation("id.user == 4", bucketId);
+ ProcessAllHandler handler(getEnv(), getPersistenceProvider());
+ spi::Context context(documentapi::LoadType::DEFAULT, 0, 0);
+ handler.handleRemoveLocation(removeLocation, context);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string(
+ "DocEntry(1234, 1, id:mail:testdoctype1:n=4:3619.html)\n"
+ "DocEntry(2345, 1, id:mail:testdoctype1:n=4:4008.html)\n"),
+ dumpBucket(bucketId));
+}
+
+void
+ProcessAllHandlerTest::testRemoveLocationDocumentSubset()
+{
+ document::BucketId bucketId(16, 4);
+ ProcessAllHandler handler(getEnv(), getPersistenceProvider());
+
+ document::TestDocMan docMan;
+ for (int i = 0; i < 10; ++i) {
+ document::Document::SP doc(docMan.createRandomDocumentAtLocation(4, 1234 + i));
+ doc->setValue(doc->getField("headerval"), document::IntFieldValue(i));
+ doPut(doc, bucketId, spi::Timestamp(100 + i), 0);
+ }
+
+ api::RemoveLocationCommand
+ removeLocation("testdoctype1.headerval % 2 == 0", bucketId);
+ spi::Context context(documentapi::LoadType::DEFAULT, 0, 0);
+ handler.handleRemoveLocation(removeLocation, context);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("DocEntry(100, 1, id:mail:testdoctype1:n=4:3619.html)\n"
+ "DocEntry(101, 0, Doc(id:mail:testdoctype1:n=4:33113.html))\n"
+ "DocEntry(102, 1, id:mail:testdoctype1:n=4:62608.html)\n"
+ "DocEntry(103, 0, Doc(id:mail:testdoctype1:n=4:26566.html))\n"
+ "DocEntry(104, 1, id:mail:testdoctype1:n=4:56061.html)\n"
+ "DocEntry(105, 0, Doc(id:mail:testdoctype1:n=4:20019.html))\n"
+ "DocEntry(106, 1, id:mail:testdoctype1:n=4:49514.html)\n"
+ "DocEntry(107, 0, Doc(id:mail:testdoctype1:n=4:13472.html))\n"
+ "DocEntry(108, 1, id:mail:testdoctype1:n=4:42967.html)\n"
+ "DocEntry(109, 0, Doc(id:mail:testdoctype1:n=4:6925.html))\n"),
+ dumpBucket(bucketId));
+}
+
+void
+ProcessAllHandlerTest::testRemoveLocationUnknownDocType()
+{
+ document::BucketId bucketId(16, 4);
+ doPut(4, spi::Timestamp(1234));
+
+ api::RemoveLocationCommand
+ removeLocation("unknowndoctype.headerval % 2 == 0", bucketId);
+
+ bool gotException = false;
+ try {
+ ProcessAllHandler handler(getEnv(), getPersistenceProvider());
+ spi::Context context(documentapi::LoadType::DEFAULT, 0, 0);
+ handler.handleRemoveLocation(removeLocation, context);
+ } catch (...) {
+ gotException = true;
+ }
+ CPPUNIT_ASSERT(gotException);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("DocEntry(1234, 0, Doc(id:mail:testdoctype1:n=4:3619.html))\n"),
+ dumpBucket(bucketId));
+}
+
+void
+ProcessAllHandlerTest::testRemoveLocationBogusSelection()
+{
+ document::BucketId bucketId(16, 4);
+ doPut(4, spi::Timestamp(1234));
+
+ api::RemoveLocationCommand removeLocation("id.bogus != badgers", bucketId);
+
+ bool gotException = false;
+ try {
+ ProcessAllHandler handler(getEnv(), getPersistenceProvider());
+ spi::Context context(documentapi::LoadType::DEFAULT, 0, 0);
+ handler.handleRemoveLocation(removeLocation, context);
+ } catch (...) {
+ gotException = true;
+ }
+ CPPUNIT_ASSERT(gotException);
+
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("DocEntry(1234, 0, Doc(id:mail:testdoctype1:n=4:3619.html))\n"),
+ dumpBucket(bucketId));
+}
+
+void
+ProcessAllHandlerTest::testStat()
+{
+ document::BucketId bucketId(16, 4);
+ ProcessAllHandler handler(getEnv(), getPersistenceProvider());
+
+ document::TestDocMan docMan;
+ for (int i = 0; i < 10; ++i) {
+ document::Document::SP doc(docMan.createRandomDocumentAtLocation(4, 1234 + i));
+ doc->setValue(doc->getField("headerval"), document::IntFieldValue(i));
+ doPut(doc, bucketId, spi::Timestamp(100 + i), 0);
+ }
+
+ api::StatBucketCommand statBucket(bucketId,
+ "testdoctype1.headerval % 2 == 0");
+ spi::Context context(documentapi::LoadType::DEFAULT, 0, 0);
+ MessageTracker::UP tracker = handler.handleStatBucket(statBucket, context);
+
+ CPPUNIT_ASSERT(tracker->getReply().get());
+ api::StatBucketReply& reply =
+ dynamic_cast<api::StatBucketReply&>(*tracker->getReply().get());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::OK, reply.getResult().getResult());
+
+ vespalib::string expected =
+ "Persistence bucket BucketId(0x4000000000000004), partition 0\n"
+ " Timestamp: 100, Doc(id:mail:testdoctype1:n=4:3619.html), gid(0x0400000092bb8d298934253a), size: 169\n"
+ " Timestamp: 102, Doc(id:mail:testdoctype1:n=4:62608.html), gid(0x04000000ce878d2488413bc4), size: 147\n"
+ " Timestamp: 104, Doc(id:mail:testdoctype1:n=4:56061.html), gid(0x040000002b8f80f0160f6c5c), size: 124\n"
+ " Timestamp: 106, Doc(id:mail:testdoctype1:n=4:49514.html), gid(0x04000000d45ca9abb47567f0), size: 101\n"
+ " Timestamp: 108, Doc(id:mail:testdoctype1:n=4:42967.html), gid(0x04000000f19ece1668e6de48), size: 206\n";
+
+
+ CPPUNIT_ASSERT_EQUAL(expected, reply.getResults());
+}
+
+void
+ProcessAllHandlerTest::testStatWithRemove()
+{
+ document::BucketId bucketId(16, 4);
+ ProcessAllHandler handler(getEnv(), getPersistenceProvider());
+
+ document::TestDocMan docMan;
+ for (int i = 0; i < 10; ++i) {
+ document::Document::SP doc(docMan.createRandomDocumentAtLocation(4, 1234 + i));
+ doc->setValue(doc->getField("headerval"), document::IntFieldValue(i));
+ doPut(doc, bucketId, spi::Timestamp(100 + i), 0);
+ doRemove(bucketId,
+ doc->getId(),
+ spi::Timestamp(200 + i),
+ true);
+ }
+
+ api::StatBucketCommand statBucket(bucketId, "true");
+ spi::Context context(documentapi::LoadType::DEFAULT, 0, 0);
+ MessageTracker::UP tracker = handler.handleStatBucket(statBucket, context);
+
+ CPPUNIT_ASSERT(tracker->getReply().get());
+ api::StatBucketReply& reply =
+ dynamic_cast<api::StatBucketReply&>(*tracker->getReply().get());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::OK, reply.getResult().getResult());
+
+ vespalib::string expected =
+ "Persistence bucket BucketId(0x4000000000000004), partition 0\n"
+ " Timestamp: 100, Doc(id:mail:testdoctype1:n=4:3619.html), gid(0x0400000092bb8d298934253a), size: 169\n"
+ " Timestamp: 101, Doc(id:mail:testdoctype1:n=4:33113.html), gid(0x04000000b121a632741db368), size: 95\n"
+ " Timestamp: 102, Doc(id:mail:testdoctype1:n=4:62608.html), gid(0x04000000ce878d2488413bc4), size: 147\n"
+ " Timestamp: 103, Doc(id:mail:testdoctype1:n=4:26566.html), gid(0x04000000177f8240bdd2bef0), size: 200\n"
+ " Timestamp: 104, Doc(id:mail:testdoctype1:n=4:56061.html), gid(0x040000002b8f80f0160f6c5c), size: 124\n"
+ " Timestamp: 105, Doc(id:mail:testdoctype1:n=4:20019.html), gid(0x040000001550c67f28ea7b03), size: 177\n"
+ " Timestamp: 106, Doc(id:mail:testdoctype1:n=4:49514.html), gid(0x04000000d45ca9abb47567f0), size: 101\n"
+ " Timestamp: 107, Doc(id:mail:testdoctype1:n=4:13472.html), gid(0x040000005d01f3fd960f8098), size: 154\n"
+ " Timestamp: 108, Doc(id:mail:testdoctype1:n=4:42967.html), gid(0x04000000f19ece1668e6de48), size: 206\n"
+ " Timestamp: 109, Doc(id:mail:testdoctype1:n=4:6925.html), gid(0x04000000667c0b3cada830be), size: 130\n"
+ " Timestamp: 200, id:mail:testdoctype1:n=4:3619.html, gid(0x0400000092bb8d298934253a) (remove)\n"
+ " Timestamp: 201, id:mail:testdoctype1:n=4:33113.html, gid(0x04000000b121a632741db368) (remove)\n"
+ " Timestamp: 202, id:mail:testdoctype1:n=4:62608.html, gid(0x04000000ce878d2488413bc4) (remove)\n"
+ " Timestamp: 203, id:mail:testdoctype1:n=4:26566.html, gid(0x04000000177f8240bdd2bef0) (remove)\n"
+ " Timestamp: 204, id:mail:testdoctype1:n=4:56061.html, gid(0x040000002b8f80f0160f6c5c) (remove)\n"
+ " Timestamp: 205, id:mail:testdoctype1:n=4:20019.html, gid(0x040000001550c67f28ea7b03) (remove)\n"
+ " Timestamp: 206, id:mail:testdoctype1:n=4:49514.html, gid(0x04000000d45ca9abb47567f0) (remove)\n"
+ " Timestamp: 207, id:mail:testdoctype1:n=4:13472.html, gid(0x040000005d01f3fd960f8098) (remove)\n"
+ " Timestamp: 208, id:mail:testdoctype1:n=4:42967.html, gid(0x04000000f19ece1668e6de48) (remove)\n"
+ " Timestamp: 209, id:mail:testdoctype1:n=4:6925.html, gid(0x04000000667c0b3cada830be) (remove)\n";
+
+ CPPUNIT_ASSERT_EQUAL(expected, reply.getResults());
+}
+
+
+void
+ProcessAllHandlerTest::testStatWholeBucket()
+{
+ document::BucketId bucketId(16, 4);
+ ProcessAllHandler handler(getEnv(), getPersistenceProvider());
+
+ document::TestDocMan docMan;
+ for (int i = 0; i < 10; ++i) {
+ document::Document::SP doc(docMan.createRandomDocumentAtLocation(4, 1234 + i));
+ doc->setValue(doc->getField("headerval"), document::IntFieldValue(i));
+ doPut(doc, bucketId, spi::Timestamp(100 + i), 0);
+ }
+
+ api::StatBucketCommand statBucket(bucketId, "true");
+ spi::Context context(documentapi::LoadType::DEFAULT, 0, 0);
+ MessageTracker::UP tracker = handler.handleStatBucket(statBucket, context);
+
+ CPPUNIT_ASSERT(tracker->getReply().get());
+ api::StatBucketReply& reply =
+ dynamic_cast<api::StatBucketReply&>(*tracker->getReply().get());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::OK, reply.getResult().getResult());
+
+ vespalib::string expected =
+ "Persistence bucket BucketId(0x4000000000000004), partition 0\n"
+ " Timestamp: 100, Doc(id:mail:testdoctype1:n=4:3619.html), gid(0x0400000092bb8d298934253a), size: 169\n"
+ " Timestamp: 101, Doc(id:mail:testdoctype1:n=4:33113.html), gid(0x04000000b121a632741db368), size: 95\n"
+ " Timestamp: 102, Doc(id:mail:testdoctype1:n=4:62608.html), gid(0x04000000ce878d2488413bc4), size: 147\n"
+ " Timestamp: 103, Doc(id:mail:testdoctype1:n=4:26566.html), gid(0x04000000177f8240bdd2bef0), size: 200\n"
+ " Timestamp: 104, Doc(id:mail:testdoctype1:n=4:56061.html), gid(0x040000002b8f80f0160f6c5c), size: 124\n"
+ " Timestamp: 105, Doc(id:mail:testdoctype1:n=4:20019.html), gid(0x040000001550c67f28ea7b03), size: 177\n"
+ " Timestamp: 106, Doc(id:mail:testdoctype1:n=4:49514.html), gid(0x04000000d45ca9abb47567f0), size: 101\n"
+ " Timestamp: 107, Doc(id:mail:testdoctype1:n=4:13472.html), gid(0x040000005d01f3fd960f8098), size: 154\n"
+ " Timestamp: 108, Doc(id:mail:testdoctype1:n=4:42967.html), gid(0x04000000f19ece1668e6de48), size: 206\n"
+ " Timestamp: 109, Doc(id:mail:testdoctype1:n=4:6925.html), gid(0x04000000667c0b3cada830be), size: 130\n";
+
+ CPPUNIT_ASSERT_EQUAL(expected, reply.getResults());
+}
+
+}
diff --git a/storage/src/tests/persistence/providershutdownwrappertest.cpp b/storage/src/tests/persistence/providershutdownwrappertest.cpp
new file mode 100644
index 00000000000..0731dcb155a
--- /dev/null
+++ b/storage/src/tests/persistence/providershutdownwrappertest.cpp
@@ -0,0 +1,87 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <tests/persistence/persistencetestutils.h>
+#include <tests/persistence/persistenceproviderwrapper.h>
+#include <vespa/storage/persistence/providershutdownwrapper.h>
+
+namespace storage {
+
+class ProviderShutdownWrapperTest : public SingleDiskPersistenceTestUtils
+{
+public:
+ CPPUNIT_TEST_SUITE(ProviderShutdownWrapperTest);
+ CPPUNIT_TEST(testShutdownOnFatalError);
+ CPPUNIT_TEST_SUITE_END();
+
+ void testShutdownOnFatalError();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(ProviderShutdownWrapperTest);
+
+namespace {
+
+class TestShutdownListener
+ : public framework::defaultimplementation::ShutdownListener
+{
+public:
+ TestShutdownListener() : _reason() {}
+
+ void requestShutdown(vespalib::stringref reason) {
+ _reason = reason;
+ }
+
+ bool shutdownRequested() const { return !_reason.empty(); }
+ const vespalib::string& getReason() const { return _reason; }
+private:
+ vespalib::string _reason;
+};
+
+}
+
+void
+ProviderShutdownWrapperTest::testShutdownOnFatalError()
+{
+ // We wrap the wrapper. It's turtles all the way down!
+ PersistenceProviderWrapper providerWrapper(
+ getPersistenceProvider());
+ TestServiceLayerApp app;
+ ServiceLayerComponent component(app.getComponentRegister(), "dummy");
+
+ ProviderShutdownWrapper shutdownWrapper(providerWrapper, component);
+
+ TestShutdownListener shutdownListener;
+
+ app.getComponentRegister().registerShutdownListener(shutdownListener);
+
+ providerWrapper.setResult(
+ spi::Result(spi::Result::FATAL_ERROR, "eject! eject!"));
+ providerWrapper.setFailureMask(
+ PersistenceProviderWrapper::FAIL_ALL_OPERATIONS);
+
+ CPPUNIT_ASSERT(!shutdownListener.shutdownRequested());
+ // This should cause the node to implicitly be shut down
+ shutdownWrapper.getBucketInfo(
+ spi::Bucket(document::BucketId(16, 1234),
+ spi::PartitionId(0)));
+
+ CPPUNIT_ASSERT(shutdownListener.shutdownRequested());
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("eject! eject!"),
+ shutdownListener.getReason());
+
+ // Triggering a new error should not cause shutdown to be requested twice.
+ providerWrapper.setResult(
+ spi::Result(spi::Result::FATAL_ERROR, "boom!"));
+
+ shutdownWrapper.getBucketInfo(
+ spi::Bucket(document::BucketId(16, 1234),
+ spi::PartitionId(0)));
+
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("eject! eject!"),
+ shutdownListener.getReason());
+}
+
+} // ns storage
+
+
diff --git a/storage/src/tests/persistence/splitbitdetectortest.cpp b/storage/src/tests/persistence/splitbitdetectortest.cpp
new file mode 100644
index 00000000000..5cc9c5da721
--- /dev/null
+++ b/storage/src/tests/persistence/splitbitdetectortest.cpp
@@ -0,0 +1,363 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <fstream>
+#include <vespa/storage/persistence/splitbitdetector.h>
+#include <vespa/vespalib/io/fileutil.h>
+#include <vespa/persistence/dummyimpl/dummypersistence.h>
+#include <vespa/document/base/testdocman.h>
+
+
+namespace storage {
+
+namespace {
+ spi::LoadType defaultLoadType(0, "default");
+}
+
+struct SplitBitDetectorTest : public CppUnit::TestFixture {
+ void testSingleUser();
+ void testTwoUsers();
+ void testMaxBits();
+ void testMaxBitsOneBelowMax();
+ void testUnsplittable();
+ void testUnsplittableMinCount();
+ void testEmpty();
+ void testZeroDocLimitFallbacksToOneBitIncreaseWith1Doc();
+ void testZeroDocLimitFallbacksToOneBitIncreaseOnGidCollision();
+ void findBucketCollisionIds();
+
+ spi::DocEntry::LP
+ generateDocEntry(uint32_t userId,
+ uint32_t docNum,
+ spi::Timestamp timestamp)
+ {
+ std::ostringstream ost;
+ ost << "id:storage_test:testdoctype1:n=" << userId << ":" << docNum;
+ return spi::DocEntry::LP(new spi::DocEntry(
+ timestamp, 0, document::DocumentId(ost.str())));
+ };
+
+ CPPUNIT_TEST_SUITE(SplitBitDetectorTest);
+ CPPUNIT_TEST(testSingleUser);
+ CPPUNIT_TEST(testTwoUsers);
+ CPPUNIT_TEST(testMaxBits);
+ CPPUNIT_TEST(testMaxBitsOneBelowMax);
+ CPPUNIT_TEST(testUnsplittable);
+ CPPUNIT_TEST(testUnsplittableMinCount);
+ CPPUNIT_TEST(testEmpty);
+ CPPUNIT_TEST(testZeroDocLimitFallbacksToOneBitIncreaseWith1Doc);
+ CPPUNIT_TEST(testZeroDocLimitFallbacksToOneBitIncreaseOnGidCollision);
+ CPPUNIT_TEST_DISABLED(findBucketCollisionIds);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(SplitBitDetectorTest);
+
+void
+SplitBitDetectorTest::testTwoUsers()
+{
+ document::TestDocMan testDocMan;
+ spi::dummy::DummyPersistence provider(testDocMan.getTypeRepoSP(), 1);
+ provider.getPartitionStates();
+ spi::Bucket bucket(document::BucketId(1, 1),
+ spi::PartitionId(0));
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+
+ provider.createBucket(bucket, context);
+
+ std::vector<spi::DocEntry::LP> entries;
+ for (uint32_t i = 0; i < 5; ++i) {
+ document::Document::SP doc(
+ testDocMan.createRandomDocumentAtLocation(1, i, 1, 1));
+ provider.put(bucket, spi::Timestamp(1000 + i), doc, context);
+ }
+
+ for (uint32_t i = 5; i < 10; ++i) {
+ document::Document::SP doc(
+ testDocMan.createRandomDocumentAtLocation(3, i, 1, 1));
+ provider.put(bucket, spi::Timestamp(1000 + i), doc, context);
+ }
+
+ SplitBitDetector::Result result(
+ SplitBitDetector::detectSplit(provider, bucket, 58, context));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("SplitTargets(2: BucketId(0x0800000000000001), "
+ "BucketId(0x0800000000000003))"),
+ result.toString());
+}
+
+void
+SplitBitDetectorTest::testSingleUser()
+{
+ document::TestDocMan testDocMan;
+ spi::dummy::DummyPersistence provider(testDocMan.getTypeRepoSP(), 1);
+ provider.getPartitionStates();
+ spi::Bucket bucket(document::BucketId(1, 1),
+ spi::PartitionId(0));
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+
+ provider.createBucket(bucket, context);
+
+ std::vector<spi::DocEntry::LP> entries;
+ for (uint32_t i = 0; i < 10; ++i) {
+ document::Document::SP doc(
+ testDocMan.createRandomDocumentAtLocation(1, i, 1, 1));
+ provider.put(bucket, spi::Timestamp(1000 + i), doc, context);
+ }
+
+ SplitBitDetector::Result result(
+ SplitBitDetector::detectSplit(provider, bucket, 58, context));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("SplitTargets(33: BucketId(0x8400000000000001), "
+ "BucketId(0x8400000100000001))"),
+ result.toString());
+}
+
+void
+SplitBitDetectorTest::testMaxBits()
+{
+ document::TestDocMan testDocMan;
+ spi::dummy::DummyPersistence provider(testDocMan.getTypeRepoSP(), 1);
+ provider.getPartitionStates();
+ spi::Bucket bucket(document::BucketId(1, 1),
+ spi::PartitionId(0));
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ int minContentSize = 1, maxContentSize = 1;
+
+ provider.createBucket(bucket, context);
+
+ std::vector<spi::DocEntry::LP> entries;
+ for (uint32_t seed = 0; seed < 10; ++seed) {
+ int location = 1;
+ document::Document::SP doc(testDocMan.createRandomDocumentAtLocation(
+ location, seed, minContentSize, maxContentSize));
+ provider.put(bucket, spi::Timestamp(1000 + seed), doc, context);
+ }
+
+ SplitBitDetector::Result result(
+ SplitBitDetector::detectSplit(provider, bucket, 3, context));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("SplitTargets(3: BucketId(0x0c00000000000001), "
+ "[ BucketId(0x0c00000000000005) ])"),
+ result.toString());
+}
+
+void
+SplitBitDetectorTest::testMaxBitsOneBelowMax()
+{
+ document::TestDocMan testDocMan;
+ spi::dummy::DummyPersistence provider(testDocMan.getTypeRepoSP(), 1);
+ provider.getPartitionStates();
+ spi::Bucket bucket(document::BucketId(15, 1), spi::PartitionId(0));
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+ int minContentSize = 1, maxContentSize = 1;
+
+ provider.createBucket(bucket, context);
+
+ std::vector<spi::DocEntry::LP> entries;
+ for (uint32_t seed = 0; seed < 10; ++seed) {
+ int location = 1 | (seed % 2 == 0 ? 0x8000 : 0);
+ document::Document::SP doc(testDocMan.createRandomDocumentAtLocation(
+ location, seed, minContentSize, maxContentSize));
+ provider.put(bucket, spi::Timestamp(1000 + seed), doc, context);
+ }
+
+ //std::cerr << provider.dumpBucket(bucket) << "\n";
+
+ SplitBitDetector::Result result(
+ SplitBitDetector::detectSplit(provider, bucket, 15, context));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("SplitTargets(error: No use in trying to split "
+ "Bucket(0x3c00000000000001, partition 0) when max split"
+ " bit is set to 15.)"),
+ result.toString());
+
+ result = SplitBitDetector::detectSplit(provider, bucket, 16, context);
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("SplitTargets(16: BucketId(0x4000000000000001), "
+ "BucketId(0x4000000000008001))"),
+ result.toString());
+}
+
+void
+SplitBitDetectorTest::testUnsplittable()
+{
+ document::TestDocMan testDocMan;
+ spi::dummy::DummyPersistence provider(testDocMan.getTypeRepoSP(), 1);
+ provider.getPartitionStates();
+ spi::Bucket bucket(document::BucketId(1, 1),
+ spi::PartitionId(0));
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+
+ provider.createBucket(bucket, context);
+
+ std::vector<spi::DocEntry::LP> entries;
+
+ for (uint32_t i = 0; i < 10; ++i) {
+ document::Document::SP doc(
+ testDocMan.createRandomDocumentAtLocation(1, 1, 1, 1));
+ provider.put(bucket, spi::Timestamp(1000 + i), doc, context);
+ }
+
+ SplitBitDetector::Result result(
+ SplitBitDetector::detectSplit(provider, bucket, 58, context, 100));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("SplitTargets(58: BucketId(0xe94c074f00000001), "
+ "BucketId(0xeb4c074f00000001))"),
+ result.toString());
+}
+
+void
+SplitBitDetectorTest::testUnsplittableMinCount()
+{
+ document::TestDocMan testDocMan;
+ spi::dummy::DummyPersistence provider(testDocMan.getTypeRepoSP(), 1);
+ provider.getPartitionStates();
+ spi::Bucket bucket(document::BucketId(1, 1),
+ spi::PartitionId(0));
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+
+ provider.createBucket(bucket, context);
+
+ std::vector<spi::DocEntry::LP> entries;
+
+ for (uint32_t i = 0; i < 10; ++i) {
+ document::Document::SP doc(
+ testDocMan.createRandomDocumentAtLocation(1, 1, 1, 1));
+ provider.put(bucket, spi::Timestamp(1000 + i), doc, context);
+ }
+
+ SplitBitDetector::Result result(
+ SplitBitDetector::detectSplit(provider, bucket, 58, context, 5, 0));
+ // Still no other choice than split out to 58 bits regardless of minCount.
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("SplitTargets(58: BucketId(0xe94c074f00000001), "
+ "BucketId(0xeb4c074f00000001))"),
+ result.toString());
+}
+
+
+void
+SplitBitDetectorTest::testEmpty()
+{
+ document::TestDocMan testDocMan;
+ spi::dummy::DummyPersistence provider(testDocMan.getTypeRepoSP(), 1);
+ provider.getPartitionStates();
+ spi::Bucket bucket(document::BucketId(1, 1),
+ spi::PartitionId(0));
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+
+ provider.createBucket(bucket, context);
+
+ SplitBitDetector::Result result(
+ SplitBitDetector::detectSplit(provider, bucket, 58, context));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("SplitTargets(source empty)"),
+ result.toString());
+}
+
+void
+SplitBitDetectorTest::testZeroDocLimitFallbacksToOneBitIncreaseWith1Doc()
+{
+ document::TestDocMan testDocMan;
+ spi::dummy::DummyPersistence provider(testDocMan.getTypeRepoSP(), 1);
+ provider.getPartitionStates();
+ spi::Bucket bucket(document::BucketId(1, 1),
+ spi::PartitionId(0));
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+
+ provider.createBucket(bucket, context);
+ document::Document::SP doc(
+ testDocMan.createRandomDocumentAtLocation(1, 0, 1, 1));
+ provider.put(bucket, spi::Timestamp(1000), doc, context);
+
+ SplitBitDetector::Result result(
+ SplitBitDetector::detectSplit(provider, bucket, 58, context, 0, 0));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("SplitTargets(2: BucketId(0x0800000000000001), "
+ "BucketId(0x0800000000000003))"),
+ result.toString());
+}
+
+void
+SplitBitDetectorTest::testZeroDocLimitFallbacksToOneBitIncreaseOnGidCollision()
+{
+ document::TestDocMan testDocMan;
+ spi::dummy::DummyPersistence provider(testDocMan.getTypeRepoSP(), 1);
+ provider.getPartitionStates();
+ spi::Bucket bucket(document::BucketId(1, 1),
+ spi::PartitionId(0));
+ spi::Context context(defaultLoadType, spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+
+ provider.createBucket(bucket, context);
+ document::Document::SP doc(
+ testDocMan.createRandomDocumentAtLocation(1, 0, 1, 1));
+ provider.put(bucket, spi::Timestamp(1000), doc, context);
+ provider.put(bucket, spi::Timestamp(2000), doc, context);
+
+ SplitBitDetector::Result result(
+ SplitBitDetector::detectSplit(provider, bucket, 58, context, 0, 0));
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("SplitTargets(2: BucketId(0x0800000000000001), "
+ "BucketId(0x0800000000000003))"),
+ result.toString());
+}
+
+/**
+ * Not a regular unit test in itself, but more of an utility to find non-unique
+ * document IDs that map to the same 58-bit bucket ID. Disabled by default since
+ * it costs CPU to do this and is not necessary during normal testing.
+ */
+void
+SplitBitDetectorTest::findBucketCollisionIds()
+{
+ using document::DocumentId;
+ using document::BucketId;
+
+ document::BucketIdFactory factory;
+
+ DocumentId targetId("id:foo:music:n=123456:ABCDEFGHIJKLMN");
+ BucketId targetBucket(factory.getBucketId(targetId));
+ char candidateSuffix[] = "ABCDEFGHIJKLMN";
+
+ size_t iterations = 0;
+ constexpr size_t maxIterations = 100000000;
+ while (std::next_permutation(std::begin(candidateSuffix),
+ std::end(candidateSuffix) - 1))
+ {
+ ++iterations;
+
+ DocumentId candidateId(
+ vespalib::make_string("id:foo:music:n=123456:%s",
+ candidateSuffix));
+ BucketId candidateBucket(factory.getBucketId(candidateId));
+ if (targetBucket == candidateBucket) {
+ std::cerr << "\nFound a collision after " << iterations
+ << " iterations!\n"
+ << "target: " << targetId << " -> " << targetBucket
+ << "\ncollision: " << candidateId << " -> "
+ << candidateBucket << "\n";
+ return;
+ }
+
+ if (iterations == maxIterations) {
+ std::cerr << "\nNo collision found after " << iterations
+ << " iterations :[\n";
+ return;
+ }
+ }
+ std::cerr << "\nRan out of permutations after " << iterations
+ << " iterations!\n";
+}
+
+}
diff --git a/storage/src/tests/persistence/testandsettest.cpp b/storage/src/tests/persistence/testandsettest.cpp
new file mode 100644
index 00000000000..984e06dc6e3
--- /dev/null
+++ b/storage/src/tests/persistence/testandsettest.cpp
@@ -0,0 +1,331 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// @author Vegard Sjonfjell
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/persistence/persistencethread.h>
+#include <tests/persistence/persistencetestutils.h>
+#include <vespa/documentapi/messagebus/messages/testandsetcondition.h>
+#include <vespa/document/fieldvalue/fieldvalues.h>
+#include <functional>
+
+using std::unique_ptr;
+using std::shared_ptr;
+
+using namespace std::string_literals;
+
+namespace storage {
+
+class TestAndSetTest : public SingleDiskPersistenceTestUtils
+{
+ static constexpr int MIN_DOCUMENT_SIZE = 0;
+ static constexpr int MAX_DOCUMENT_SIZE = 128;
+ static constexpr int RANDOM_SEED = 1234;
+
+ const document::BucketId BUCKET_ID{16, 4};
+ const document::StringFieldValue MISMATCHING_HEADER{"Definitely nothing about loud canines"};
+ const document::StringFieldValue MATCHING_HEADER{"Some string with woofy dog as a substring"};
+ const document::StringFieldValue OLD_CONTENT{"Some old content"};
+ const document::StringFieldValue NEW_CONTENT{"Freshly pressed and squeezed content"};
+
+ unique_ptr<PersistenceThread> thread;
+ shared_ptr<document::Document> testDoc;
+ document::DocumentId testDocId;
+
+public:
+ void setUp() override {
+ SingleDiskPersistenceTestUtils::setUp();
+
+ spi::Context context(
+ spi::LoadType(0, "default"),
+ spi::Priority(0),
+ spi::Trace::TraceLevel(0));
+
+ createBucket(BUCKET_ID);
+ getPersistenceProvider().createBucket(
+ spi::Bucket(BUCKET_ID, spi::PartitionId(0)),
+ context);
+
+ thread = createPersistenceThread(0);
+ testDoc = createTestDocument();
+ testDocId = testDoc->getId();
+ }
+
+ void tearDown() override {
+ thread.reset(nullptr);
+ SingleDiskPersistenceTestUtils::tearDown();
+ }
+
+ void conditional_put_not_executed_on_condition_mismatch();
+ void conditional_put_executed_on_condition_match();
+ void conditional_remove_not_executed_on_condition_mismatch();
+ void conditional_remove_executed_on_condition_match();
+ void conditional_update_not_executed_on_condition_mismatch();
+ void conditional_update_executed_on_condition_match();
+ void invalid_document_selection_should_fail();
+ void non_existing_document_should_fail();
+ void document_with_no_type_should_fail();
+
+ CPPUNIT_TEST_SUITE(TestAndSetTest);
+ CPPUNIT_TEST(conditional_put_not_executed_on_condition_mismatch);
+ CPPUNIT_TEST(conditional_put_executed_on_condition_match);
+ CPPUNIT_TEST(conditional_remove_not_executed_on_condition_mismatch);
+ CPPUNIT_TEST(conditional_remove_executed_on_condition_match);
+ CPPUNIT_TEST(conditional_update_not_executed_on_condition_mismatch);
+ CPPUNIT_TEST(conditional_update_executed_on_condition_match);
+ CPPUNIT_TEST(invalid_document_selection_should_fail);
+ CPPUNIT_TEST(non_existing_document_should_fail);
+ CPPUNIT_TEST(document_with_no_type_should_fail);
+ CPPUNIT_TEST_SUITE_END();
+
+protected:
+ std::unique_ptr<api::UpdateCommand> conditional_update_test(
+ bool matchingHeader,
+ api::Timestamp timestampOne,
+ api::Timestamp timestampTwo);
+
+ document::Document::SP createTestDocument();
+ document::Document::SP retrieveTestDocument();
+ void setTestCondition(api::TestAndSetCommand & command);
+ void putTestDocument(bool matchingHeader, api::Timestamp timestamp);
+ void assertTestDocumentFoundAndMatchesContent(const document::FieldValue & value);
+
+ static std::string expectedDocEntryString(
+ api::Timestamp timestamp,
+ const document::DocumentId & testDocId,
+ spi::DocumentMetaFlags removeFlag = spi::NONE);
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(TestAndSetTest);
+
+void TestAndSetTest::conditional_put_not_executed_on_condition_mismatch()
+{
+ // Put document with mismatching header
+ api::Timestamp timestampOne = 0;
+ putTestDocument(false, timestampOne);
+
+ CPPUNIT_ASSERT_EQUAL(expectedDocEntryString(timestampOne, testDocId), dumpBucket(BUCKET_ID));
+
+ // Conditionally replace document, but fail due to lack of woofy dog
+ api::Timestamp timestampTwo = 1;
+ api::PutCommand putTwo(BUCKET_ID, testDoc, timestampTwo);
+ setTestCondition(putTwo);
+
+ CPPUNIT_ASSERT(thread->handlePut(putTwo)->getResult() == api::ReturnCode::Result::TEST_AND_SET_CONDITION_FAILED);
+ CPPUNIT_ASSERT_EQUAL(expectedDocEntryString(timestampOne, testDocId), dumpBucket(BUCKET_ID));
+}
+
+void TestAndSetTest::conditional_put_executed_on_condition_match()
+{
+ // Put document with matching header
+ api::Timestamp timestampOne = 0;
+ putTestDocument(true, timestampOne);
+
+ CPPUNIT_ASSERT_EQUAL(expectedDocEntryString(timestampOne, testDocId), dumpBucket(BUCKET_ID));
+
+ // Update content of document
+ testDoc->setValue(testDoc->getField("content"), NEW_CONTENT);
+
+ // Conditionally replace document with updated version, succeed in doing so
+ api::Timestamp timestampTwo = 1;
+ api::PutCommand putTwo(BUCKET_ID, testDoc, timestampTwo);
+ setTestCondition(putTwo);
+
+ CPPUNIT_ASSERT(thread->handlePut(putTwo)->getResult() == api::ReturnCode::Result::OK);
+ CPPUNIT_ASSERT_EQUAL(expectedDocEntryString(timestampOne, testDocId) +
+ expectedDocEntryString(timestampTwo, testDocId),
+ dumpBucket(BUCKET_ID));
+
+ assertTestDocumentFoundAndMatchesContent(NEW_CONTENT);
+}
+
+void TestAndSetTest::conditional_remove_not_executed_on_condition_mismatch()
+{
+ // Put document with mismatching header
+ api::Timestamp timestampOne = 0;
+ putTestDocument(false, timestampOne);
+
+ CPPUNIT_ASSERT_EQUAL(expectedDocEntryString(timestampOne, testDocId), dumpBucket(BUCKET_ID));
+
+ // Conditionally remove document, fail in doing so
+ api::Timestamp timestampTwo = 1;
+ api::RemoveCommand remove(BUCKET_ID, testDocId, timestampTwo);
+ setTestCondition(remove);
+
+ CPPUNIT_ASSERT(thread->handleRemove(remove)->getResult() == api::ReturnCode::Result::TEST_AND_SET_CONDITION_FAILED);
+ CPPUNIT_ASSERT_EQUAL(expectedDocEntryString(timestampOne, testDocId), dumpBucket(BUCKET_ID));
+
+ // Assert that the document is still there
+ retrieveTestDocument();
+}
+
+void TestAndSetTest::conditional_remove_executed_on_condition_match()
+{
+ // Put document with matching header
+ api::Timestamp timestampOne = 0;
+ putTestDocument(true, timestampOne);
+
+ CPPUNIT_ASSERT_EQUAL(expectedDocEntryString(timestampOne, testDocId), dumpBucket(BUCKET_ID));
+
+ // Conditionally remove document, succeed in doing so
+ api::Timestamp timestampTwo = 1;
+ api::RemoveCommand remove(BUCKET_ID, testDocId, timestampTwo);
+ setTestCondition(remove);
+
+ CPPUNIT_ASSERT(thread->handleRemove(remove)->getResult() == api::ReturnCode::Result::OK);
+ CPPUNIT_ASSERT_EQUAL(expectedDocEntryString(timestampOne, testDocId) +
+ expectedDocEntryString(timestampTwo, testDocId, spi::REMOVE_ENTRY),
+ dumpBucket(BUCKET_ID));
+}
+
+std::unique_ptr<api::UpdateCommand> TestAndSetTest::conditional_update_test(
+ bool matchingHeader,
+ api::Timestamp timestampOne,
+ api::Timestamp timestampTwo)
+{
+ putTestDocument(matchingHeader, timestampOne);
+
+ auto docUpdate = std::make_shared<document::DocumentUpdate>(testDoc->getType(), testDocId);
+ auto fieldUpdate = document::FieldUpdate(testDoc->getField("content"));
+ fieldUpdate.addUpdate(document::AssignValueUpdate(NEW_CONTENT));
+ docUpdate->addUpdate(fieldUpdate);
+
+ auto updateUp = std::make_unique<api::UpdateCommand>(BUCKET_ID, docUpdate, timestampTwo);
+ setTestCondition(*updateUp);
+ return updateUp;
+}
+
+void TestAndSetTest::conditional_update_not_executed_on_condition_mismatch()
+{
+ api::Timestamp timestampOne = 0;
+ api::Timestamp timestampTwo = 1;
+ auto updateUp = conditional_update_test(false, timestampOne, timestampTwo);
+
+ CPPUNIT_ASSERT(thread->handleUpdate(*updateUp)->getResult() == api::ReturnCode::Result::TEST_AND_SET_CONDITION_FAILED);
+ CPPUNIT_ASSERT_EQUAL(expectedDocEntryString(timestampOne, testDocId),
+ dumpBucket(BUCKET_ID));
+
+ assertTestDocumentFoundAndMatchesContent(OLD_CONTENT);
+}
+
+void TestAndSetTest::conditional_update_executed_on_condition_match()
+{
+ api::Timestamp timestampOne = 0;
+ api::Timestamp timestampTwo = 1;
+ auto updateUp = conditional_update_test(true, timestampOne, timestampTwo);
+
+ CPPUNIT_ASSERT(thread->handleUpdate(*updateUp)->getResult() == api::ReturnCode::Result::OK);
+ CPPUNIT_ASSERT_EQUAL(expectedDocEntryString(timestampOne, testDocId) +
+ expectedDocEntryString(timestampTwo, testDocId),
+ dumpBucket(BUCKET_ID));
+
+ assertTestDocumentFoundAndMatchesContent(NEW_CONTENT);
+}
+
+void TestAndSetTest::invalid_document_selection_should_fail()
+{
+ // Conditionally replace nonexisting document
+ // Fail early since document selection is invalid
+ api::Timestamp timestamp = 0;
+ api::PutCommand put(BUCKET_ID, testDoc, timestamp);
+ put.setCondition(documentapi::TestAndSetCondition("bjarne"));
+
+ CPPUNIT_ASSERT(thread->handlePut(put)->getResult() == api::ReturnCode::Result::ILLEGAL_PARAMETERS);
+ CPPUNIT_ASSERT_EQUAL(""s, dumpBucket(BUCKET_ID));
+}
+
+void TestAndSetTest::non_existing_document_should_fail()
+{
+ // Conditionally replace nonexisting document
+ // Fail since no document exists to match with test and set
+ api::Timestamp timestamp = 0;
+ api::PutCommand put(BUCKET_ID, testDoc, timestamp);
+ setTestCondition(put);
+ thread->handlePut(put);
+
+ CPPUNIT_ASSERT(thread->handlePut(put)->getResult() == api::ReturnCode::Result::TEST_AND_SET_CONDITION_FAILED);
+ CPPUNIT_ASSERT_EQUAL(""s, dumpBucket(BUCKET_ID));
+}
+
+void TestAndSetTest::document_with_no_type_should_fail()
+{
+ // Conditionally replace nonexisting document
+ // Fail since no document exists to match with test and set
+ api::Timestamp timestamp = 0;
+ document::DocumentId legacyDocId("doc:mail:3619.html");
+ api::RemoveCommand remove(BUCKET_ID, legacyDocId, timestamp);
+ setTestCondition(remove);
+
+ auto code = thread->handleRemove(remove)->getResult();
+ CPPUNIT_ASSERT(code == api::ReturnCode::Result::ILLEGAL_PARAMETERS);
+ CPPUNIT_ASSERT(code.getMessage() == "Document id has no doctype");
+ CPPUNIT_ASSERT_EQUAL(""s, dumpBucket(BUCKET_ID));
+}
+
+document::Document::SP
+TestAndSetTest::createTestDocument()
+{
+ auto doc = document::Document::SP(
+ createRandomDocumentAtLocation(
+ BUCKET_ID.getId(),
+ RANDOM_SEED,
+ MIN_DOCUMENT_SIZE,
+ MAX_DOCUMENT_SIZE));
+
+ doc->setValue(doc->getField("content"), OLD_CONTENT);
+ doc->setValue(doc->getField("hstringval"), MISMATCHING_HEADER);
+
+ return doc;
+}
+
+document::Document::SP TestAndSetTest::retrieveTestDocument()
+{
+ api::GetCommand get(BUCKET_ID, testDocId, "[all]");
+ auto tracker = thread->handleGet(get);
+ CPPUNIT_ASSERT(tracker->getResult() == api::ReturnCode::Result::OK);
+
+ auto & reply = static_cast<api::GetReply &>(*tracker->getReply());
+ CPPUNIT_ASSERT(reply.wasFound());
+
+ return reply.getDocument();
+}
+
+void TestAndSetTest::setTestCondition(api::TestAndSetCommand & command)
+{
+ command.setCondition(documentapi::TestAndSetCondition("testdoctype1.hstringval=\"*woofy dog*\""));
+}
+
+void TestAndSetTest::putTestDocument(bool matchingHeader, api::Timestamp timestamp) {
+ if (matchingHeader) {
+ testDoc->setValue(testDoc->getField("hstringval"), MATCHING_HEADER);
+ }
+
+ api::PutCommand put(BUCKET_ID, testDoc, timestamp);
+ thread->handlePut(put);
+}
+
+void TestAndSetTest::assertTestDocumentFoundAndMatchesContent(const document::FieldValue & value)
+{
+ auto doc = retrieveTestDocument();
+ auto & field = doc->getField("content");
+
+ CPPUNIT_ASSERT_EQUAL(*doc->getValue(field), value);
+}
+
+std::string TestAndSetTest::expectedDocEntryString(
+ api::Timestamp timestamp,
+ const document::DocumentId & docId,
+ spi::DocumentMetaFlags removeFlag)
+{
+ std::stringstream ss;
+
+ ss << "DocEntry(" << timestamp << ", " << removeFlag << ", ";
+ if (removeFlag == spi::REMOVE_ENTRY) {
+ ss << docId.toString() << ")\n";
+ } else {
+ ss << "Doc(" << docId.toString() << "))\n";
+ }
+
+ return ss.str();
+}
+
+} // storage
diff --git a/storage/src/tests/pstack_testrunner b/storage/src/tests/pstack_testrunner
new file mode 100755
index 00000000000..320d47f7e35
--- /dev/null
+++ b/storage/src/tests/pstack_testrunner
@@ -0,0 +1,14 @@
+#!/usr/bin/perl -w
+
+use strict;
+
+my @pids = `ps auxww | grep "./testrunner" | grep -v grep`;
+foreach (@pids) {
+ s/^\S+\s+(\d+)\s+.*$/$1/;
+ chomp;
+}
+
+foreach my $pid (@pids) {
+ my $cmd = "pstack $pid";
+ system($cmd) == 0 or die "Failed to run '$cmd'";
+}
diff --git a/storage/src/tests/serverapp/.gitignore b/storage/src/tests/serverapp/.gitignore
new file mode 100644
index 00000000000..333f254ba10
--- /dev/null
+++ b/storage/src/tests/serverapp/.gitignore
@@ -0,0 +1,8 @@
+*.So
+*.lo
+.*.swp
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
diff --git a/storage/src/tests/storageserver/.gitignore b/storage/src/tests/storageserver/.gitignore
new file mode 100644
index 00000000000..c4098089f09
--- /dev/null
+++ b/storage/src/tests/storageserver/.gitignore
@@ -0,0 +1,13 @@
+*.So
+*.lo
+*.o
+.*.swp
+.config.log
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
+filestorage
+testrunner
+testrunner.core
diff --git a/storage/src/tests/storageserver/CMakeLists.txt b/storage/src/tests/storageserver/CMakeLists.txt
new file mode 100644
index 00000000000..2e327089b4c
--- /dev/null
+++ b/storage/src/tests/storageserver/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_teststorageserver
+ SOURCES
+ communicationmanagertest.cpp
+ statemanagertest.cpp
+ documentapiconvertertest.cpp
+ mergethrottlertest.cpp
+ testvisitormessagesession.cpp
+ bouncertest.cpp
+ bucketintegritycheckertest.cpp
+ priorityconvertertest.cpp
+ statereportertest.cpp
+ changedbucketownershiphandlertest.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/tests/storageserver/bouncertest.cpp b/storage/src/tests/storageserver/bouncertest.cpp
new file mode 100644
index 00000000000..f00e4b19c31
--- /dev/null
+++ b/storage/src/tests/storageserver/bouncertest.cpp
@@ -0,0 +1,285 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <boost/pointer_cast.hpp>
+#include <cppunit/extensions/HelperMacros.h>
+#include <iostream>
+#include <string>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storageapi/message/stat.h>
+#include <vespa/vdslib/state/nodestate.h>
+#include <vespa/storage/storageserver/bouncer.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/testhelper.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storageapi/message/persistence.h>
+
+namespace storage {
+
+struct BouncerTest : public CppUnit::TestFixture {
+ std::unique_ptr<TestServiceLayerApp> _node;
+ std::unique_ptr<DummyStorageLink> _upper;
+ Bouncer* _manager;
+ DummyStorageLink* _lower;
+
+ BouncerTest();
+
+ void setUp();
+ void tearDown();
+
+ void testFutureTimestamp();
+ void testAllowNotifyBucketChangeEvenWhenDistributorDown();
+ void rejectLowerPrioritizedFeedMessagesWhenConfigured();
+ void doNotRejectHigherPrioritizedFeedMessagesThanConfigured();
+ void rejectionThresholdIsExclusive();
+ void onlyRejectFeedMessagesWhenConfigured();
+ void rejectionIsDisabledByDefaultInConfig();
+ void readOnlyOperationsAreNotRejected();
+ void internalOperationsAreNotRejected();
+ void outOfBoundsConfigValuesThrowException();
+
+ CPPUNIT_TEST_SUITE(BouncerTest);
+ CPPUNIT_TEST(testFutureTimestamp);
+ CPPUNIT_TEST(testAllowNotifyBucketChangeEvenWhenDistributorDown);
+ CPPUNIT_TEST(rejectLowerPrioritizedFeedMessagesWhenConfigured);
+ CPPUNIT_TEST(doNotRejectHigherPrioritizedFeedMessagesThanConfigured);
+ CPPUNIT_TEST(rejectionThresholdIsExclusive);
+ CPPUNIT_TEST(onlyRejectFeedMessagesWhenConfigured);
+ CPPUNIT_TEST(rejectionIsDisabledByDefaultInConfig);
+ CPPUNIT_TEST(readOnlyOperationsAreNotRejected);
+ CPPUNIT_TEST(internalOperationsAreNotRejected);
+ CPPUNIT_TEST(outOfBoundsConfigValuesThrowException);
+ CPPUNIT_TEST_SUITE_END();
+
+ using Priority = api::StorageMessage::Priority;
+
+ static constexpr int RejectionDisabledConfigValue = -1;
+
+ // Note: newThreshold is intentionally int (rather than Priority) in order
+ // to be able to test out of bounds values.
+ void configureRejectionThreshold(int newThreshold);
+
+ std::shared_ptr<api::StorageCommand> createDummyFeedMessage(
+ api::Timestamp timestamp,
+ Priority priority = 0);
+
+ void assertMessageBouncedWithRejection();
+ void assertMessageNotBounced();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(BouncerTest);
+
+BouncerTest::BouncerTest()
+ : _node(),
+ _upper(),
+ _manager(0),
+ _lower(0)
+{
+}
+
+void
+BouncerTest::setUp() {
+ try{
+ vdstestlib::DirConfig config(getStandardConfig(true));
+ _node.reset(new TestServiceLayerApp(
+ DiskCount(1), NodeIndex(2), config.getConfigId()));
+ _upper.reset(new DummyStorageLink());
+ _manager = new Bouncer(_node->getComponentRegister(),
+ config.getConfigId());
+ _lower = new DummyStorageLink();
+ _upper->push_back(std::unique_ptr<StorageLink>(_manager));
+ _upper->push_back(std::unique_ptr<StorageLink>(_lower));
+ _upper->open();
+ } catch (std::exception& e) {
+ std::cerr << "Failed to static initialize objects: " << e.what()
+ << "\n";
+ }
+ _node->getClock().setAbsoluteTimeInSeconds(10);
+}
+
+void
+BouncerTest::tearDown() {
+ _manager = 0;
+ _lower = 0;
+ _upper->close();
+ _upper->flush();
+ _upper.reset(0);
+ _node.reset(0);
+}
+
+std::shared_ptr<api::StorageCommand>
+BouncerTest::createDummyFeedMessage(api::Timestamp timestamp,
+ api::StorageMessage::Priority priority)
+{
+ auto cmd = std::make_shared<api::RemoveCommand>(
+ document::BucketId(0),
+ document::DocumentId("doc:foo:bar"),
+ timestamp);
+ cmd->setPriority(priority);
+ return cmd;
+}
+
+void
+BouncerTest::testFutureTimestamp()
+{
+
+ // Fail when future timestamps (more than 5 seconds) are received.
+ {
+ _upper->sendDown(createDummyFeedMessage(16 * 1000000));
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)_upper->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(0, (int)_upper->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::ABORTED,
+ static_cast<api::RemoveReply&>(*_upper->getReply(0)).
+ getResult().getResult());
+ _upper->reset();
+ }
+
+ // Verify that 1 second clock skew is OK
+ {
+ _upper->sendDown(createDummyFeedMessage(11 * 1000000));
+
+ CPPUNIT_ASSERT_EQUAL(0, (int)_upper->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(1, (int)_lower->getNumCommands());
+ _lower->reset();
+ }
+
+ // Verify that past is OK
+ {
+ _upper->sendDown(createDummyFeedMessage(5 * 1000000));
+
+ CPPUNIT_ASSERT_EQUAL(1, (int)_lower->getNumCommands());
+ }
+
+
+}
+
+void
+BouncerTest::testAllowNotifyBucketChangeEvenWhenDistributorDown()
+{
+ lib::NodeState state(lib::NodeType::DISTRIBUTOR, lib::State::DOWN);
+ _node->getNodeStateUpdater().setReportedNodeState(state);
+ // Trigger Bouncer state update
+ auto clusterState = std::make_shared<lib::ClusterState>(
+ "distributor:3 storage:3");
+ _node->getNodeStateUpdater().setClusterState(clusterState);
+
+
+ document::BucketId bucket(16, 1234);
+ api::BucketInfo info(0x1, 0x2, 0x3);
+ auto cmd = std::make_shared<api::NotifyBucketChangeCommand>(bucket, info);
+ _upper->sendDown(cmd);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _upper->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _lower->getNumCommands());
+}
+
+void
+BouncerTest::assertMessageBouncedWithRejection()
+{
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _upper->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _upper->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::REJECTED,
+ static_cast<api::RemoveReply&>(*_upper->getReply(0)).
+ getResult().getResult());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _lower->getNumCommands());
+}
+
+void
+BouncerTest::assertMessageNotBounced()
+{
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _upper->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _lower->getNumCommands());
+}
+
+void
+BouncerTest::configureRejectionThreshold(int newThreshold)
+{
+ using Builder = vespa::config::content::core::StorBouncerConfigBuilder;
+ auto config = std::make_unique<Builder>();
+ config->feedRejectionPriorityThreshold = newThreshold;
+ _manager->configure(std::move(config));
+}
+
+void
+BouncerTest::rejectLowerPrioritizedFeedMessagesWhenConfigured()
+{
+ configureRejectionThreshold(Priority(120));
+ _upper->sendDown(createDummyFeedMessage(11 * 1000000, Priority(121)));
+ assertMessageBouncedWithRejection();
+}
+
+void
+BouncerTest::doNotRejectHigherPrioritizedFeedMessagesThanConfigured()
+{
+ configureRejectionThreshold(Priority(120));
+ _upper->sendDown(createDummyFeedMessage(11 * 1000000, Priority(119)));
+ assertMessageNotBounced();
+}
+
+void
+BouncerTest::rejectionThresholdIsExclusive()
+{
+ configureRejectionThreshold(Priority(120));
+ _upper->sendDown(createDummyFeedMessage(11 * 1000000, Priority(120)));
+ assertMessageNotBounced();
+}
+
+void
+BouncerTest::onlyRejectFeedMessagesWhenConfigured()
+{
+ configureRejectionThreshold(RejectionDisabledConfigValue);
+ // A message with even the lowest priority should not be rejected.
+ _upper->sendDown(createDummyFeedMessage(11 * 1000000, Priority(255)));
+ assertMessageNotBounced();
+}
+
+void
+BouncerTest::rejectionIsDisabledByDefaultInConfig()
+{
+ _upper->sendDown(createDummyFeedMessage(11 * 1000000, Priority(255)));
+ assertMessageNotBounced();
+}
+
+void
+BouncerTest::readOnlyOperationsAreNotRejected()
+{
+ configureRejectionThreshold(Priority(1));
+ // StatBucket is an external operation, but it's not a mutating operation
+ // and should therefore not be blocked.
+ auto cmd = std::make_shared<api::StatBucketCommand>(
+ document::BucketId(16, 5), "");
+ cmd->setPriority(Priority(2));
+ _upper->sendDown(cmd);
+ assertMessageNotBounced();
+}
+
+void
+BouncerTest::internalOperationsAreNotRejected()
+{
+ configureRejectionThreshold(Priority(1));
+ document::BucketId bucket(16, 1234);
+ api::BucketInfo info(0x1, 0x2, 0x3);
+ auto cmd = std::make_shared<api::NotifyBucketChangeCommand>(bucket, info);
+ cmd->setPriority(Priority(2));
+ _upper->sendDown(cmd);
+ assertMessageNotBounced();
+}
+
+void
+BouncerTest::outOfBoundsConfigValuesThrowException()
+{
+ try {
+ configureRejectionThreshold(256);
+ CPPUNIT_FAIL("Upper bound violation not caught");
+ } catch (config::InvalidConfigException) {}
+
+ try {
+ configureRejectionThreshold(-2);
+ CPPUNIT_FAIL("Lower bound violation not caught");
+ } catch (config::InvalidConfigException) {}
+}
+
+} // storage
+
diff --git a/storage/src/tests/storageserver/bucketintegritycheckertest.cpp b/storage/src/tests/storageserver/bucketintegritycheckertest.cpp
new file mode 100644
index 00000000000..88a5546b174
--- /dev/null
+++ b/storage/src/tests/storageserver/bucketintegritycheckertest.cpp
@@ -0,0 +1,302 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <boost/lexical_cast.hpp>
+#include <cppunit/extensions/HelperMacros.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketdb/bucketmanager.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+#include <vespa/storage/storageserver/bucketintegritychecker.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <tests/common/testhelper.h>
+#include <tests/common/storagelinktest.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/vespalib/io/fileutil.h>
+#include <tests/common/teststorageapp.h>
+
+LOG_SETUP(".test.bucketintegritychecker");
+
+namespace storage {
+
+struct BucketIntegrityCheckerTest : public CppUnit::TestFixture {
+ std::unique_ptr<vdstestlib::DirConfig> _config;
+ std::unique_ptr<TestServiceLayerApp> _node;
+ int _timeout; // Timeout in seconds before aborting
+
+ void setUp() {
+ _timeout = 60*2;
+ _config.reset(new vdstestlib::DirConfig(getStandardConfig(true)));
+ _node.reset(new TestServiceLayerApp(DiskCount(256),
+ NodeIndex(0),
+ _config->getConfigId()));
+ }
+
+ void tearDown() {
+ LOG(info, "Finished test");
+ }
+
+ void testConfig();
+ void testBasicFunctionality();
+ void testTiming();
+
+ CPPUNIT_TEST_SUITE(BucketIntegrityCheckerTest);
+ CPPUNIT_TEST(testConfig);
+ CPPUNIT_TEST(testBasicFunctionality);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(BucketIntegrityCheckerTest);
+
+void BucketIntegrityCheckerTest::testConfig()
+{
+
+ // Verify that config is read correctly. Given config should not use
+ // any default values.
+ vdstestlib::DirConfig::Config& config(
+ _config->getConfig("stor-integritychecker"));
+ config.set("dailycyclestart", "60");
+ config.set("dailycyclestop", "360");
+ config.set("weeklycycle", "crRc-rc");
+ config.set("maxpending", "2");
+ config.set("mincycletime", "120");
+ config.set("requestdelay", "5");
+
+ BucketIntegrityChecker checker(_config->getConfigId(),
+ _node->getComponentRegister());
+ checker.setMaxThreadWaitTime(framework::MilliSecTime(10));
+ SchedulingOptions& opt(checker.getSchedulingOptions());
+ CPPUNIT_ASSERT_EQUAL(60u, opt._dailyCycleStart);
+ CPPUNIT_ASSERT_EQUAL(360u, opt._dailyCycleStop);
+ CPPUNIT_ASSERT_EQUAL(SchedulingOptions::CONTINUE, opt._dailyStates[0]);
+ CPPUNIT_ASSERT_EQUAL(SchedulingOptions::RUN_CHEAP, opt._dailyStates[1]);
+ CPPUNIT_ASSERT_EQUAL(SchedulingOptions::RUN_FULL, opt._dailyStates[2]);
+ CPPUNIT_ASSERT_EQUAL(SchedulingOptions::CONTINUE, opt._dailyStates[3]);
+ CPPUNIT_ASSERT_EQUAL(SchedulingOptions::DONT_RUN, opt._dailyStates[4]);
+ CPPUNIT_ASSERT_EQUAL(SchedulingOptions::RUN_CHEAP, opt._dailyStates[5]);
+ CPPUNIT_ASSERT_EQUAL(SchedulingOptions::CONTINUE, opt._dailyStates[6]);
+ CPPUNIT_ASSERT_EQUAL(2u, opt._maxPendingCount);
+ CPPUNIT_ASSERT_EQUAL(framework::SecondTime(7200), opt._minCycleTime);
+ CPPUNIT_ASSERT_EQUAL(framework::SecondTime(5), opt._requestDelay);
+}
+
+namespace {
+ /**
+ * Calculate a date based on the following format:
+ * week<#> <day> <hh>:<mm>:<ss>
+ * Examples: "week3 mon 00:30:00"
+ * "week3 tue 04:20:00"
+ * "week9 thi 14:00:24"
+ */
+ time_t getDate(const std::string& datestring) {
+ vespalib::string rest(datestring);
+ int spacePos = rest.find(' ');
+ uint32_t week = strtoul(rest.substr(4, spacePos-4).c_str(), NULL, 0);
+ rest = rest.substr(spacePos+1);
+ vespalib::string wday(rest.substr(0,3));
+ rest = rest.substr(4);
+ uint32_t hours = strtoul(rest.substr(0, 2).c_str(), NULL, 0);
+ uint32_t minutes = strtoul(rest.substr(3, 2).c_str(), NULL, 0);
+ uint32_t seconds = strtoul(rest.substr(6, 2).c_str(), NULL, 0);
+ uint32_t day(0);
+ if (wday == "mon") { day = 1; }
+ else if (wday == "tue") { day = 2; }
+ else if (wday == "wed") { day = 3; }
+ else if (wday == "thi") { day = 4; }
+ else if (wday == "fri") { day = 5; }
+ else if (wday == "sat") { day = 6; }
+ else if (wday == "sun") { day = 0; }
+ else { assert(false); }
+ // Create a start time that points to the start of some week.
+ // A random sunday 00:00:00, which we will use as start of time
+ struct tm mytime;
+ memset(&mytime, 0, sizeof(mytime));
+ mytime.tm_year = 2008 - 1900;
+ mytime.tm_mon = 0;
+ mytime.tm_mday = 1;
+ mytime.tm_hour = 0;
+ mytime.tm_min = 0;
+ mytime.tm_sec = 0;
+ time_t startTime = timegm(&mytime);
+ CPPUNIT_ASSERT(gmtime_r(&startTime, &mytime));
+ while (mytime.tm_wday != 0) {
+ ++mytime.tm_mday;
+ startTime = timegm(&mytime);
+ CPPUNIT_ASSERT(gmtime_r(&startTime, &mytime));
+ }
+ // Add the wanted values to the start time
+ time_t resultTime = startTime;
+ resultTime += week * 7 * 24 * 60 * 60
+ + day * 24 * 60 * 60
+ + hours * 60 * 60
+ + minutes * 60
+ + seconds;
+ // std::cerr << "Time requested " << datestring << ". Got time "
+ // << framework::SecondTime(resultTime).toString() << "\n";
+ return resultTime;
+ }
+
+ void addBucketToDatabase(TestServiceLayerApp& server,
+ const document::BucketId& id, uint8_t disk,
+ uint32_t numDocs, uint32_t crc, uint32_t totalSize)
+ {
+ bucketdb::StorageBucketInfo info;
+ info.setBucketInfo(api::BucketInfo(crc, numDocs, totalSize));
+ info.disk = disk;
+ server.getStorageBucketDatabase().insert(id, info, "foo");
+ }
+
+
+ /**
+ * In tests wanting to only have one pending, only add buckets for one disk
+ * as pending is per disk. If so set singleDisk true.
+ */
+ void addBucketsToDatabase(TestServiceLayerApp& server, bool singleDisk) {
+ addBucketToDatabase(server, document::BucketId(16, 0x123), 0,
+ 14, 0x123, 1024);
+ addBucketToDatabase(server, document::BucketId(16, 0x234), 0,
+ 18, 0x234, 1024);
+ addBucketToDatabase(server, document::BucketId(16, 0x345), 0,
+ 11, 0x345, 2048);
+ addBucketToDatabase(server, document::BucketId(16, 0x456), 0,
+ 13, 0x456, 1280);
+ if (!singleDisk) {
+ addBucketToDatabase(server, document::BucketId(16, 0x567), 1,
+ 20, 0x567, 4096);
+ addBucketToDatabase(server, document::BucketId(16, 0x987), 254,
+ 8, 0x987, 65536);
+ }
+ }
+}
+
+void BucketIntegrityCheckerTest::testBasicFunctionality()
+{
+ _node->getClock().setAbsoluteTimeInSeconds(getDate("week1 sun 00:00:00"));
+ addBucketsToDatabase(*_node, false);
+ DummyStorageLink* dummyLink = 0;
+ {
+ std::unique_ptr<BucketIntegrityChecker> midLink(
+ new BucketIntegrityChecker("", _node->getComponentRegister()));
+ BucketIntegrityChecker& checker(*midLink);
+ checker.setMaxThreadWaitTime(framework::MilliSecTime(10));
+ // Setup and start checker
+ DummyStorageLink topLink;
+ topLink.push_back(StorageLink::UP(midLink.release()));
+ checker.push_back(std::unique_ptr<StorageLink>(
+ dummyLink = new DummyStorageLink()));
+ checker.getSchedulingOptions()._maxPendingCount = 2;
+ checker.getSchedulingOptions()._minCycleTime = framework::SecondTime(60 * 60);
+ topLink.open();
+ // Waiting for system to be initialized
+ FastOS_Thread::Sleep(10); // Give next message chance to come
+ ASSERT_COMMAND_COUNT(0, *dummyLink);
+ topLink.doneInit();
+ checker.bump();
+ // Should have started new run with 2 pending per disk
+ dummyLink->waitForMessages(4, _timeout);
+ FastOS_Thread::Sleep(10); // Give 5th message chance to come
+ ASSERT_COMMAND_COUNT(4, *dummyLink);
+ RepairBucketCommand *cmd1 = dynamic_cast<RepairBucketCommand*>(
+ dummyLink->getCommand(0).get());
+ CPPUNIT_ASSERT_EQUAL(230, (int)cmd1->getPriority());
+ CPPUNIT_ASSERT(cmd1);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 0x234),
+ cmd1->getBucketId());
+ RepairBucketCommand *cmd2 = dynamic_cast<RepairBucketCommand*>(
+ dummyLink->getCommand(1).get());
+ CPPUNIT_ASSERT(cmd2);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 0x456),
+ cmd2->getBucketId());
+ RepairBucketCommand *cmd3 = dynamic_cast<RepairBucketCommand*>(
+ dummyLink->getCommand(2).get());
+ CPPUNIT_ASSERT(cmd3);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 0x567),
+ cmd3->getBucketId());
+ RepairBucketCommand *cmd4 = dynamic_cast<RepairBucketCommand*>(
+ dummyLink->getCommand(3).get());
+ CPPUNIT_ASSERT(cmd4);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 0x987),
+ cmd4->getBucketId());
+
+ // Answering a message on disk with no more buckets does not trigger new
+ std::shared_ptr<RepairBucketReply> reply1(
+ new RepairBucketReply(*cmd3));
+ CPPUNIT_ASSERT(StorageLinkTest::callOnUp(checker, reply1));
+ FastOS_Thread::Sleep(10); // Give next message chance to come
+ ASSERT_COMMAND_COUNT(4, *dummyLink);
+ // Answering a message on disk with more buckets trigger new repair
+ std::shared_ptr<RepairBucketReply> reply2(
+ new RepairBucketReply(*cmd2));
+ CPPUNIT_ASSERT(StorageLinkTest::callOnUp(checker, reply2));
+ dummyLink->waitForMessages(5, _timeout);
+ FastOS_Thread::Sleep(10); // Give 6th message chance to come
+ ASSERT_COMMAND_COUNT(5, *dummyLink);
+ RepairBucketCommand *cmd5 = dynamic_cast<RepairBucketCommand*>(
+ dummyLink->getCommand(4).get());
+ CPPUNIT_ASSERT(cmd5);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 0x345),
+ cmd5->getBucketId());
+ // Fail a repair, causing it to be resent later, but first continue
+ // with other bucket.
+ std::shared_ptr<RepairBucketReply> reply3(
+ new RepairBucketReply(*cmd1));
+ reply3->setResult(api::ReturnCode(api::ReturnCode::IGNORED));
+ CPPUNIT_ASSERT(StorageLinkTest::callOnUp(checker, reply3));
+ dummyLink->waitForMessages(6, _timeout);
+ FastOS_Thread::Sleep(10); // Give 7th message chance to come
+ ASSERT_COMMAND_COUNT(6, *dummyLink);
+ RepairBucketCommand *cmd6 = dynamic_cast<RepairBucketCommand*>(
+ dummyLink->getCommand(5).get());
+ CPPUNIT_ASSERT(cmd6);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 0x123),
+ cmd6->getBucketId());
+ // Fail a repair with not found. That is an acceptable return code.
+ // (No more requests as this was last for that disk)
+ std::shared_ptr<RepairBucketReply> reply4(
+ new RepairBucketReply(*cmd4));
+ reply3->setResult(api::ReturnCode(api::ReturnCode::BUCKET_NOT_FOUND));
+ CPPUNIT_ASSERT(StorageLinkTest::callOnUp(checker, reply4));
+ FastOS_Thread::Sleep(10); // Give 7th message chance to come
+ ASSERT_COMMAND_COUNT(6, *dummyLink);
+
+ // Send a repair reply that actually have corrected the bucket.
+ api::BucketInfo newInfo(0x3456, 4, 8192);
+ std::shared_ptr<RepairBucketReply> reply5(
+ new RepairBucketReply(*cmd5, newInfo));
+ reply5->setAltered(true);
+ CPPUNIT_ASSERT(StorageLinkTest::callOnUp(checker, reply5));
+
+ // Finish run. New iteration should not start yet as min
+ // cycle time has not passed
+ std::shared_ptr<RepairBucketReply> reply6(
+ new RepairBucketReply(*cmd6));
+ CPPUNIT_ASSERT(StorageLinkTest::callOnUp(checker, reply6));
+ dummyLink->waitForMessages(7, _timeout);
+ ASSERT_COMMAND_COUNT(7, *dummyLink);
+ RepairBucketCommand *cmd7 = dynamic_cast<RepairBucketCommand*>(
+ dummyLink->getCommand(6).get());
+ CPPUNIT_ASSERT(cmd7);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 0x234),
+ cmd7->getBucketId());
+ std::shared_ptr<RepairBucketReply> reply7(
+ new RepairBucketReply(*cmd7));
+ CPPUNIT_ASSERT(StorageLinkTest::callOnUp(checker, reply7));
+ FastOS_Thread::Sleep(10); // Give 8th message chance to come
+ ASSERT_COMMAND_COUNT(7, *dummyLink);
+
+ // Still not time for next iteration
+ dummyLink->reset();
+ _node->getClock().setAbsoluteTimeInSeconds(getDate("week1 sun 00:59:59"));
+ FastOS_Thread::Sleep(10); // Give new run chance to start
+ ASSERT_COMMAND_COUNT(0, *dummyLink);
+
+ // Pass time until next cycle should start
+ dummyLink->reset();
+ _node->getClock().setAbsoluteTimeInSeconds(getDate("week1 sun 01:00:00"));
+ dummyLink->waitForMessages(4, _timeout);
+ ASSERT_COMMAND_COUNT(4, *dummyLink);
+ }
+}
+
+} // storage
diff --git a/storage/src/tests/storageserver/changedbucketownershiphandlertest.cpp b/storage/src/tests/storageserver/changedbucketownershiphandlertest.cpp
new file mode 100644
index 00000000000..3b83d71d8f3
--- /dev/null
+++ b/storage/src/tests/storageserver/changedbucketownershiphandlertest.cpp
@@ -0,0 +1,648 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/document/base/testdocman.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storageapi/message/removelocation.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/testhelper.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storage/storageserver/changedbucketownershiphandler.h>
+#include <memory>
+
+namespace storage {
+
+class ChangedBucketOwnershipHandlerTest : public CppUnit::TestFixture
+{
+ std::unique_ptr<TestServiceLayerApp> _app;
+ std::unique_ptr<DummyStorageLink> _top;
+ ChangedBucketOwnershipHandler* _handler;
+ DummyStorageLink* _bottom;
+ document::TestDocMan _testDocRepo;
+
+ CPPUNIT_TEST_SUITE(ChangedBucketOwnershipHandlerTest);
+ CPPUNIT_TEST(testEnumerateBucketsBelongingOnChangedNodes);
+ CPPUNIT_TEST(testNoPreExistingClusterState);
+ CPPUNIT_TEST(testNoAvailableDistributorsInCurrentState);
+ CPPUNIT_TEST(testNoAvailableDistributorsInCurrentAndNewState);
+ CPPUNIT_TEST(testDownEdgeToNoAvailableDistributors);
+ CPPUNIT_TEST(testOwnershipChangedOnDistributorUpEdge);
+ CPPUNIT_TEST(testDistributionConfigChangeUpdatesOwnership);
+ CPPUNIT_TEST(testAbortOpsWhenNoClusterStateSet);
+ CPPUNIT_TEST(testAbortOutdatedSplit);
+ CPPUNIT_TEST(testAbortOutdatedJoin);
+ CPPUNIT_TEST(testAbortOutdatedSetBucketState);
+ CPPUNIT_TEST(testAbortOutdatedCreateBucket);
+ CPPUNIT_TEST(testAbortOutdatedDeleteBucket);
+ CPPUNIT_TEST(testAbortOutdatedMergeBucket);
+ CPPUNIT_TEST(testAbortOutdatedRemoveLocation);
+ CPPUNIT_TEST(testIdealStateAbortsAreConfigurable);
+ CPPUNIT_TEST(testAbortOutdatedPutOperation);
+ CPPUNIT_TEST(testAbortOutdatedMultiOperation);
+ CPPUNIT_TEST(testAbortOutdatedUpdateCommand);
+ CPPUNIT_TEST(testAbortOutdatedRemoveCommand);
+ CPPUNIT_TEST(testAbortOutdatedRevertCommand);
+ CPPUNIT_TEST(testIdealStateAbortUpdatesMetric);
+ CPPUNIT_TEST(testExternalLoadOpAbortUpdatesMetric);
+ CPPUNIT_TEST(testExternalLoadOpAbortsAreConfigurable);
+ CPPUNIT_TEST_SUITE_END();
+
+ // TODO test: down edge triggered on cluster state with cluster down?
+
+ std::vector<document::BucketId> insertBuckets(
+ uint32_t numBuckets,
+ uint16_t wantedOwner,
+ const lib::ClusterState& state);
+
+ std::shared_ptr<api::SetSystemStateCommand> createStateCmd(
+ const lib::ClusterState& state) const
+ {
+ return std::make_shared<api::SetSystemStateCommand>(state);
+ }
+
+ std::shared_ptr<api::SetSystemStateCommand> createStateCmd(
+ const std::string& stateStr) const
+ {
+ return createStateCmd(lib::ClusterState(stateStr));
+ }
+
+ void applyDistribution(Redundancy, NodeCount);
+ void applyClusterState(const lib::ClusterState&);
+
+ document::BucketId nextOwnedBucket(
+ uint16_t wantedOwner,
+ const lib::ClusterState& state,
+ const document::BucketId& lastId) const;
+
+ document::BucketId getBucketToAbort() const;
+ document::BucketId getBucketToAllow() const;
+
+ void sendAndExpectAbortedCreateBucket(uint16_t fromDistributorIndex);
+
+ template <typename MsgType, typename... MsgParams>
+ bool changeAbortsMessage(MsgParams&&... params);
+
+ lib::ClusterState getDefaultTestClusterState() const {
+ return lib::ClusterState("distributor:4 storage:1");
+ }
+
+public:
+ void testEnumerateBucketsBelongingOnChangedNodes();
+ void testNoPreExistingClusterState();
+ void testNoAvailableDistributorsInCurrentState();
+ void testNoAvailableDistributorsInCurrentAndNewState();
+ void testDownEdgeToNoAvailableDistributors();
+ void testOwnershipChangedOnDistributorUpEdge();
+ void testDistributionConfigChangeUpdatesOwnership();
+ void testAbortOpsWhenNoClusterStateSet();
+ void testAbortOutdatedSplit();
+ void testAbortOutdatedJoin();
+ void testAbortOutdatedSetBucketState();
+ void testAbortOutdatedCreateBucket();
+ void testAbortOutdatedDeleteBucket();
+ void testAbortOutdatedMergeBucket();
+ void testAbortOutdatedRemoveLocation();
+ void testIdealStateAbortsAreConfigurable();
+ void testAbortOutdatedPutOperation();
+ void testAbortOutdatedMultiOperation();
+ void testAbortOutdatedUpdateCommand();
+ void testAbortOutdatedRemoveCommand();
+ void testAbortOutdatedRevertCommand();
+ void testIdealStateAbortUpdatesMetric();
+ void testExternalLoadOpAbortUpdatesMetric();
+ void testExternalLoadOpAbortsAreConfigurable();
+
+ void setUp();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(ChangedBucketOwnershipHandlerTest);
+
+document::BucketId
+ChangedBucketOwnershipHandlerTest::nextOwnedBucket(
+ uint16_t wantedOwner,
+ const lib::ClusterState& state,
+ const document::BucketId& lastId) const
+{
+ uint32_t idx(lastId.getId() + 1);
+ while (true) {
+ document::BucketId candidate(16, idx);
+ uint16_t owner(_app->getDistribution()->getIdealDistributorNode(
+ state, candidate));
+ if (owner == wantedOwner) {
+ return candidate;
+ }
+ ++idx;
+ }
+ assert(!"should never get here");
+}
+
+std::vector<document::BucketId>
+ChangedBucketOwnershipHandlerTest::insertBuckets(uint32_t numBuckets,
+ uint16_t wantedOwner,
+ const lib::ClusterState& state)
+{
+ std::vector<document::BucketId> inserted;
+ document::BucketId bucket;
+ while (inserted.size() < numBuckets) {
+ bucket = nextOwnedBucket(wantedOwner, state, bucket);
+
+ bucketdb::StorageBucketInfo sbi;
+ sbi.setBucketInfo(api::BucketInfo(1, 2, 3));
+ sbi.disk = 0;
+ _app->getStorageBucketDatabase().insert(bucket, sbi, "test");
+ inserted.push_back(bucket);
+ }
+ return inserted;
+}
+
+void
+ChangedBucketOwnershipHandlerTest::setUp()
+{
+ vdstestlib::DirConfig config(getStandardConfig(true));
+
+ _app.reset(new TestServiceLayerApp);
+ _top.reset(new DummyStorageLink);
+ _handler = new ChangedBucketOwnershipHandler(config.getConfigId(),
+ _app->getComponentRegister());
+ _top->push_back(std::unique_ptr<StorageLink>(_handler));
+ _bottom = new DummyStorageLink;
+ _handler->push_back(std::unique_ptr<StorageLink>(_bottom));
+ _top->open();
+
+ // Ensure we're not dependent on config schema default values.
+ std::unique_ptr<vespa::config::content::PersistenceConfigBuilder> pconfig(
+ new vespa::config::content::PersistenceConfigBuilder);
+ pconfig->abortOutdatedMutatingIdealStateOps = true;
+ pconfig->abortOutdatedMutatingExternalLoadOps = true;
+ _handler->configure(std::move(pconfig));
+}
+
+namespace {
+
+template <typename Set, typename K>
+bool has(const Set& s, const K& key) {
+ return s.find(key) != s.end();
+}
+
+template <typename Vec>
+bool
+hasAbortedAllOf(const AbortBucketOperationsCommand::SP& cmd, const Vec& v)
+{
+ for (auto& b : v) {
+ if (!cmd->shouldAbort(b)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+template <typename Vec>
+bool
+hasAbortedNoneOf(const AbortBucketOperationsCommand::SP& cmd, const Vec& v)
+{
+ for (auto& b : v) {
+ if (cmd->shouldAbort(b)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool
+hasOnlySetSystemStateCmdQueued(DummyStorageLink& link) {
+ if (link.getNumCommands() != 1) {
+ std::cerr << "expected 1 command, found"
+ << link.getNumCommands() << "\n";
+ }
+ api::SetSystemStateCommand::SP cmd(
+ std::dynamic_pointer_cast<api::SetSystemStateCommand>(
+ link.getCommand(0)));
+ return (cmd.get() != 0);
+}
+
+}
+
+void
+ChangedBucketOwnershipHandlerTest::applyDistribution(
+ Redundancy redundancy, NodeCount nodeCount)
+{
+ _app->setDistribution(redundancy, nodeCount);
+ _handler->storageDistributionChanged();
+}
+
+void
+ChangedBucketOwnershipHandlerTest::applyClusterState(
+ const lib::ClusterState& state)
+{
+ _app->setClusterState(state);
+ _handler->reloadClusterState();
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testEnumerateBucketsBelongingOnChangedNodes()
+{
+ lib::ClusterState stateBefore("distributor:4 storage:1");
+ applyDistribution(Redundancy(1), NodeCount(4));
+ applyClusterState(stateBefore);
+ auto node1Buckets(insertBuckets(2, 1, stateBefore));
+ auto node3Buckets(insertBuckets(2, 3, stateBefore));
+ // Add some buckets that will not be part of the change set
+ auto node0Buckets(insertBuckets(3, 0, stateBefore));
+ auto node2Buckets(insertBuckets(2, 2, stateBefore));
+
+ _top->sendDown(createStateCmd("distributor:4 .1.s:d .3.s:d storage:1"));
+ // TODO: refactor into own function
+ CPPUNIT_ASSERT_EQUAL(size_t(2), _bottom->getNumCommands());
+ AbortBucketOperationsCommand::SP cmd(
+ std::dynamic_pointer_cast<AbortBucketOperationsCommand>(
+ _bottom->getCommand(0)));
+ CPPUNIT_ASSERT(cmd.get() != 0);
+
+ CPPUNIT_ASSERT(hasAbortedAllOf(cmd, node1Buckets));
+ CPPUNIT_ASSERT(hasAbortedAllOf(cmd, node3Buckets));
+ CPPUNIT_ASSERT(hasAbortedNoneOf(cmd, node0Buckets));
+ CPPUNIT_ASSERT(hasAbortedNoneOf(cmd, node2Buckets));
+
+ // Handler must swallow abort replies
+ _bottom->sendUp(api::StorageMessage::SP(cmd->makeReply().release()));
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _top->getNumReplies());
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testNoPreExistingClusterState()
+{
+ applyDistribution(Redundancy(1), NodeCount(4));
+ lib::ClusterState stateBefore("distributor:4 storage:1");
+ insertBuckets(2, 1, stateBefore);
+ insertBuckets(3, 0, stateBefore);
+ insertBuckets(2, 2, stateBefore);
+
+ _top->sendDown(createStateCmd("distributor:4 .1.s:d .3.s:d storage:1"));
+ CPPUNIT_ASSERT(hasOnlySetSystemStateCmdQueued(*_bottom));
+}
+
+/**
+ * When current state has no distributors and we receive a state with one or
+ * more distributors, we do not send any abort messages since this should
+ * already have been done on the down-edge.
+ */
+void
+ChangedBucketOwnershipHandlerTest::testNoAvailableDistributorsInCurrentState()
+{
+ applyDistribution(Redundancy(1), NodeCount(3));
+ lib::ClusterState insertedState("distributor:3 storage:1");
+ insertBuckets(2, 0, insertedState);
+ insertBuckets(2, 1, insertedState);
+ insertBuckets(2, 2, insertedState);
+ lib::ClusterState downState("distributor:3 .0.s:d .1.s:d .2.s:d storage:1");
+ _app->setClusterState(downState);
+
+ _top->sendDown(createStateCmd("distributor:3 .1.s:d storage:1"));
+ CPPUNIT_ASSERT(hasOnlySetSystemStateCmdQueued(*_bottom));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testNoAvailableDistributorsInCurrentAndNewState()
+{
+ applyDistribution(Redundancy(1), NodeCount(3));
+ lib::ClusterState insertedState("distributor:3 storage:1");
+ insertBuckets(2, 0, insertedState);
+ insertBuckets(2, 1, insertedState);
+ insertBuckets(2, 2, insertedState);
+ lib::ClusterState stateBefore("distributor:3 .0.s:s .1.s:s .2.s:d storage:1");
+ applyClusterState(stateBefore);
+ lib::ClusterState downState("distributor:3 .0.s:d .1.s:d .2.s:d storage:1");
+
+ _top->sendDown(createStateCmd(downState));
+ CPPUNIT_ASSERT(hasOnlySetSystemStateCmdQueued(*_bottom));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testDownEdgeToNoAvailableDistributors()
+{
+ lib::ClusterState insertedState("distributor:3 storage:1");
+ applyDistribution(Redundancy(1), NodeCount(3));
+ applyClusterState(insertedState);
+ auto node0Buckets(insertBuckets(2, 0, insertedState));
+ auto node1Buckets(insertBuckets(2, 1, insertedState));
+ auto node2Buckets(insertBuckets(2, 2, insertedState));
+ lib::ClusterState downState("distributor:3 .0.s:d .1.s:s .2.s:s storage:1");
+
+ _top->sendDown(createStateCmd(downState));
+ // TODO: refactor into own function
+ CPPUNIT_ASSERT_EQUAL(size_t(2), _bottom->getNumCommands());
+ AbortBucketOperationsCommand::SP cmd(
+ std::dynamic_pointer_cast<AbortBucketOperationsCommand>(
+ _bottom->getCommand(0)));
+ CPPUNIT_ASSERT(cmd.get() != 0);
+
+ CPPUNIT_ASSERT(hasAbortedAllOf(cmd, node0Buckets));
+ CPPUNIT_ASSERT(hasAbortedAllOf(cmd, node1Buckets));
+ CPPUNIT_ASSERT(hasAbortedAllOf(cmd, node2Buckets));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testOwnershipChangedOnDistributorUpEdge()
+{
+ lib::ClusterState stateBefore(
+ "version:10 distributor:4 .1.s:d storage:4 .1.s:d");
+ lib::ClusterState stateAfter(
+ "version:11 distributor:4 .1.t:1369990247 storage:4 .1.s:d");
+ applyDistribution(Redundancy(1), NodeCount(4));
+ applyClusterState(stateBefore);
+ // Add buckets that will belong to distributor 1 after it has come back up
+ auto node1Buckets(insertBuckets(2, 1, stateAfter));
+ // Add some buckets that will not be part of the change set
+ auto node0Buckets(insertBuckets(3, 0, stateAfter));
+ auto node2Buckets(insertBuckets(2, 2, stateAfter));
+
+ _top->sendDown(createStateCmd(stateAfter));
+ // TODO: refactor into own function
+ CPPUNIT_ASSERT_EQUAL(size_t(2), _bottom->getNumCommands());
+ AbortBucketOperationsCommand::SP cmd(
+ std::dynamic_pointer_cast<AbortBucketOperationsCommand>(
+ _bottom->getCommand(0)));
+ CPPUNIT_ASSERT(cmd.get() != 0);
+
+ CPPUNIT_ASSERT(hasAbortedAllOf(cmd, node1Buckets));
+ CPPUNIT_ASSERT(hasAbortedNoneOf(cmd, node0Buckets));
+ CPPUNIT_ASSERT(hasAbortedNoneOf(cmd, node2Buckets));
+
+ // Handler must swallow abort replies
+ _bottom->sendUp(api::StorageMessage::SP(cmd->makeReply().release()));
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _top->getNumReplies());
+}
+
+void
+ChangedBucketOwnershipHandlerTest::sendAndExpectAbortedCreateBucket(
+ uint16_t fromDistributorIndex)
+{
+ document::BucketId bucket(16, 6786);
+ auto msg = std::make_shared<api::CreateBucketCommand>(bucket);
+ msg->setSourceIndex(fromDistributorIndex);
+
+ _top->sendDown(msg);
+ std::vector<api::StorageMessage::SP> replies(_top->getRepliesOnce());
+ CPPUNIT_ASSERT_EQUAL(size_t(1), replies.size());
+ api::StorageReply& reply(dynamic_cast<api::StorageReply&>(*replies[0]));
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::ABORTED,
+ reply.getResult().getResult());
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testAbortOpsWhenNoClusterStateSet()
+{
+ sendAndExpectAbortedCreateBucket(1);
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testDistributionConfigChangeUpdatesOwnership()
+{
+ lib::ClusterState insertedState("distributor:3 storage:1");
+ applyClusterState(insertedState);
+ applyDistribution(Redundancy(1), NodeCount(3));
+
+ // Apply new distribution config containing only 1 distributor, meaning
+ // any messages sent from >1 must be aborted.
+ applyDistribution(Redundancy(1), NodeCount(1));
+ sendAndExpectAbortedCreateBucket(2);
+}
+
+/**
+ * Generate and dispatch a message of the given type with the provided
+ * aruments as if that message was sent from distributor 1. Messages will
+ * be checked as if the state contains 4 distributors in Up state. This
+ * means that it suffices to send in a message with a bucket that is not
+ * owned by distributor 1 in this state to trigger an abort.
+ */
+template <typename MsgType, typename... MsgParams>
+bool
+ChangedBucketOwnershipHandlerTest::changeAbortsMessage(MsgParams&&... params)
+{
+ auto msg = std::make_shared<MsgType>(std::forward<MsgParams>(params)...);
+ msg->setSourceIndex(1);
+
+ applyDistribution(Redundancy(1), NodeCount(4));
+ applyClusterState(getDefaultTestClusterState());
+
+ _top->sendDown(msg);
+ std::vector<api::StorageMessage::SP> replies(_top->getRepliesOnce());
+ // Test is single-threaded, no need to do any waiting.
+ if (replies.empty()) {
+ return false;
+ } else {
+ CPPUNIT_ASSERT_EQUAL(size_t(1), replies.size());
+ // Make sure the message was actually aborted and not bounced with
+ // some other arbitrary failure code.
+ api::StorageReply& reply(dynamic_cast<api::StorageReply&>(*replies[0]));
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::ABORTED,
+ reply.getResult().getResult());
+ return true;
+ }
+}
+
+/**
+ * Returns a bucket that is not owned by the sending distributor (1). More
+ * specifically, it returns a bucket that is owned by distributor 2.
+ */
+document::BucketId
+ChangedBucketOwnershipHandlerTest::getBucketToAbort() const
+{
+ lib::ClusterState state(getDefaultTestClusterState());
+ return nextOwnedBucket(2, state, document::BucketId());
+}
+
+/**
+ * Returns a bucket that _is_ owned by distributor 1 and should thus be
+ * allowed through.
+ */
+document::BucketId
+ChangedBucketOwnershipHandlerTest::getBucketToAllow() const
+{
+ lib::ClusterState state(getDefaultTestClusterState());
+ return nextOwnedBucket(1, state, document::BucketId());
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testAbortOutdatedSplit()
+{
+ CPPUNIT_ASSERT(changeAbortsMessage<api::SplitBucketCommand>(
+ getBucketToAbort()));
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::SplitBucketCommand>(
+ getBucketToAllow()));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testAbortOutdatedJoin()
+{
+ CPPUNIT_ASSERT(changeAbortsMessage<api::JoinBucketsCommand>(
+ getBucketToAbort()));
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::JoinBucketsCommand>(
+ getBucketToAllow()));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testAbortOutdatedSetBucketState()
+{
+ CPPUNIT_ASSERT(changeAbortsMessage<api::SetBucketStateCommand>(
+ getBucketToAbort(), api::SetBucketStateCommand::ACTIVE));
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::SetBucketStateCommand>(
+ getBucketToAllow(), api::SetBucketStateCommand::ACTIVE));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testAbortOutdatedCreateBucket()
+{
+ CPPUNIT_ASSERT(changeAbortsMessage<api::CreateBucketCommand>(
+ getBucketToAbort()));
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::CreateBucketCommand>(
+ getBucketToAllow()));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testAbortOutdatedDeleteBucket()
+{
+ CPPUNIT_ASSERT(changeAbortsMessage<api::DeleteBucketCommand>(
+ getBucketToAbort()));
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::DeleteBucketCommand>(
+ getBucketToAllow()));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testAbortOutdatedMergeBucket()
+{
+ std::vector<api::MergeBucketCommand::Node> nodes;
+ CPPUNIT_ASSERT(changeAbortsMessage<api::MergeBucketCommand>(
+ getBucketToAbort(), nodes, 0));
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::MergeBucketCommand>(
+ getBucketToAllow(), nodes, 0));
+}
+
+/**
+ * RemoveLocation is technically an external load class, but since it's also
+ * used as the backing operation for GC we have to treat it as if it were an
+ * ideal state operation class.
+ */
+void
+ChangedBucketOwnershipHandlerTest::testAbortOutdatedRemoveLocation()
+{
+ std::vector<api::MergeBucketCommand::Node> nodes;
+ CPPUNIT_ASSERT(changeAbortsMessage<api::RemoveLocationCommand>(
+ "foo", getBucketToAbort()));
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::RemoveLocationCommand>(
+ "foo", getBucketToAllow()));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testIdealStateAbortsAreConfigurable()
+{
+ std::unique_ptr<vespa::config::content::PersistenceConfigBuilder> config(
+ new vespa::config::content::PersistenceConfigBuilder);
+ config->abortOutdatedMutatingIdealStateOps = false;
+ _handler->configure(std::move(config));
+ // Should not abort operation, even when ownership has changed.
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::CreateBucketCommand>(
+ getBucketToAbort()));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testAbortOutdatedPutOperation()
+{
+ document::Document::SP doc(_testDocRepo.createRandomDocumentAtLocation(1));
+ CPPUNIT_ASSERT(changeAbortsMessage<api::PutCommand>(
+ getBucketToAbort(), doc, api::Timestamp(1234)));
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::PutCommand>(
+ getBucketToAllow(), doc, api::Timestamp(1234)));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testAbortOutdatedMultiOperation()
+{
+ CPPUNIT_ASSERT(changeAbortsMessage<api::MultiOperationCommand>(
+ _testDocRepo.getTypeRepoSP(), getBucketToAbort(), 1024));
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::MultiOperationCommand>(
+ _testDocRepo.getTypeRepoSP(), getBucketToAllow(), 1024));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testAbortOutdatedUpdateCommand()
+{
+ const document::DocumentType* docType(_testDocRepo.getTypeRepo()
+ .getDocumentType("testdoctype1"));
+ document::DocumentId docId("id:foo:testdoctype1::bar");
+ document::DocumentUpdate::SP update(
+ std::make_shared<document::DocumentUpdate>(*docType, docId));
+ CPPUNIT_ASSERT(changeAbortsMessage<api::UpdateCommand>(
+ getBucketToAbort(), update, api::Timestamp(1234)));
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::UpdateCommand>(
+ getBucketToAllow(), update, api::Timestamp(1234)));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testAbortOutdatedRemoveCommand()
+{
+ document::DocumentId docId("id:foo:testdoctype1::bar");
+ CPPUNIT_ASSERT(changeAbortsMessage<api::RemoveCommand>(
+ getBucketToAbort(), docId, api::Timestamp(1234)));
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::RemoveCommand>(
+ getBucketToAllow(), docId, api::Timestamp(1234)));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testAbortOutdatedRevertCommand()
+{
+ std::vector<api::Timestamp> timestamps;
+ CPPUNIT_ASSERT(changeAbortsMessage<api::RevertCommand>(
+ getBucketToAbort(), timestamps));
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::RevertCommand>(
+ getBucketToAllow(), timestamps));
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testIdealStateAbortUpdatesMetric()
+{
+ CPPUNIT_ASSERT(changeAbortsMessage<api::SplitBucketCommand>(
+ getBucketToAbort()));
+ CPPUNIT_ASSERT_EQUAL(
+ uint64_t(1),
+ _handler->getMetrics().idealStateOpsAborted.getValue());
+ CPPUNIT_ASSERT_EQUAL(
+ uint64_t(0),
+ _handler->getMetrics().externalLoadOpsAborted.getValue());
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testExternalLoadOpAbortUpdatesMetric()
+{
+ document::DocumentId docId("id:foo:testdoctype1::bar");
+ CPPUNIT_ASSERT(changeAbortsMessage<api::RemoveCommand>(
+ getBucketToAbort(), docId, api::Timestamp(1234)));
+ CPPUNIT_ASSERT_EQUAL(
+ uint64_t(0),
+ _handler->getMetrics().idealStateOpsAborted.getValue());
+ CPPUNIT_ASSERT_EQUAL(
+ uint64_t(1),
+ _handler->getMetrics().externalLoadOpsAborted.getValue());
+}
+
+void
+ChangedBucketOwnershipHandlerTest::testExternalLoadOpAbortsAreConfigurable()
+{
+ std::unique_ptr<vespa::config::content::PersistenceConfigBuilder> config(
+ new vespa::config::content::PersistenceConfigBuilder);
+ config->abortOutdatedMutatingExternalLoadOps = false;
+ _handler->configure(std::move(config));
+ // Should not abort operation, even when ownership has changed.
+ document::DocumentId docId("id:foo:testdoctype1::bar");
+ CPPUNIT_ASSERT(!changeAbortsMessage<api::RemoveCommand>(
+ getBucketToAbort(), docId, api::Timestamp(1234)));
+}
+
+} // storage
diff --git a/storage/src/tests/storageserver/communicationmanagertest.cpp b/storage/src/tests/storageserver/communicationmanagertest.cpp
new file mode 100644
index 00000000000..fe062a9ee30
--- /dev/null
+++ b/storage/src/tests/storageserver/communicationmanagertest.cpp
@@ -0,0 +1,235 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/communicationmanager.h>
+
+#include <vespa/messagebus/testlib/slobrok.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h>
+#include <vespa/storageframework/defaultimplementation/memory/nomemorymanager.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/dummystoragelink.h>
+#include <tests/common/testhelper.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+namespace storage {
+
+struct CommunicationManagerTest : public CppUnit::TestFixture {
+ void testSimple();
+ void testDistPendingLimitConfigsArePropagatedToMessageBus();
+ void testStorPendingLimitConfigsArePropagatedToMessageBus();
+ void testCommandsAreDequeuedInPriorityOrder();
+ void testRepliesAreDequeuedInFifoOrder();
+
+ static constexpr uint32_t MESSAGE_WAIT_TIME_SEC = 60;
+
+ void doTestConfigPropagation(bool isContentNode);
+
+ std::shared_ptr<api::StorageCommand> createDummyCommand(
+ api::StorageMessage::Priority priority)
+ {
+ auto cmd = std::make_shared<api::GetCommand>(
+ document::BucketId(0),
+ document::DocumentId("doc::mydoc"),
+ "[all]");
+ cmd->setAddress(api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 1));
+ cmd->setPriority(priority);
+ return cmd;
+ }
+
+ CPPUNIT_TEST_SUITE(CommunicationManagerTest);
+ CPPUNIT_TEST(testSimple);
+ CPPUNIT_TEST(testDistPendingLimitConfigsArePropagatedToMessageBus);
+ CPPUNIT_TEST(testStorPendingLimitConfigsArePropagatedToMessageBus);
+ CPPUNIT_TEST(testCommandsAreDequeuedInPriorityOrder);
+ CPPUNIT_TEST(testRepliesAreDequeuedInFifoOrder);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(CommunicationManagerTest);
+
+void CommunicationManagerTest::testSimple()
+{
+ mbus::Slobrok slobrok;
+ vdstestlib::DirConfig distConfig(getStandardConfig(false));
+ vdstestlib::DirConfig storConfig(getStandardConfig(true));
+ distConfig.getConfig("stor-server").set("node_index", "1");
+ storConfig.getConfig("stor-server").set("node_index", "1");
+ addSlobrokConfig(distConfig, slobrok);
+ addSlobrokConfig(storConfig, slobrok);
+
+ // Set up a "distributor" and a "storage" node with communication
+ // managers and a dummy storage link below we can use for testing.
+ TestServiceLayerApp storNode(storConfig.getConfigId());
+ TestDistributorApp distNode(distConfig.getConfigId());
+
+ CommunicationManager distributor(distNode.getComponentRegister(),
+ distConfig.getConfigId());
+ CommunicationManager storage(storNode.getComponentRegister(),
+ storConfig.getConfigId());
+ DummyStorageLink *distributorLink = new DummyStorageLink();
+ DummyStorageLink *storageLink = new DummyStorageLink();
+ distributor.push_back(std::unique_ptr<StorageLink>(distributorLink));
+ storage.push_back(std::unique_ptr<StorageLink>(storageLink));
+ distributor.open();
+ storage.open();
+
+ FastOS_Thread::Sleep(1000);
+
+ // Send a message through from distributor to storage
+ std::shared_ptr<api::StorageCommand> cmd(
+ new api::GetCommand(
+ document::BucketId(0), document::DocumentId("doc::mydoc"), "[all]"));
+ cmd->setAddress(api::StorageMessageAddress(
+ "storage", lib::NodeType::STORAGE, 1));
+ distributorLink->sendUp(cmd);
+ storageLink->waitForMessages(1, MESSAGE_WAIT_TIME_SEC);
+ CPPUNIT_ASSERT(storageLink->getNumCommands() > 0);
+ std::shared_ptr<api::StorageCommand> cmd2(
+ std::dynamic_pointer_cast<api::StorageCommand>(
+ storageLink->getCommand(0)));
+ CPPUNIT_ASSERT_EQUAL(
+ vespalib::string("doc::mydoc"),
+ static_cast<api::GetCommand&>(*cmd2).getDocumentId().toString());
+ // Reply to the message
+ std::shared_ptr<api::StorageReply> reply(cmd2->makeReply().release());
+ storageLink->sendUp(reply);
+ storageLink->sendUp(reply);
+ distributorLink->waitForMessages(1, MESSAGE_WAIT_TIME_SEC);
+ CPPUNIT_ASSERT(distributorLink->getNumCommands() > 0);
+ std::shared_ptr<api::GetReply> reply2(
+ std::dynamic_pointer_cast<api::GetReply>(
+ distributorLink->getCommand(0)));
+ CPPUNIT_ASSERT_EQUAL(false, reply2->wasFound());
+}
+
+void
+CommunicationManagerTest::doTestConfigPropagation(bool isContentNode)
+{
+ mbus::Slobrok slobrok;
+ vdstestlib::DirConfig config(getStandardConfig(isContentNode));
+ config.getConfig("stor-server").set("node_index", "1");
+ auto& cfg = config.getConfig("stor-communicationmanager");
+ cfg.set("mbus_content_node_max_pending_count", "12345");
+ cfg.set("mbus_content_node_max_pending_size", "555666");
+ cfg.set("mbus_distributor_node_max_pending_count", "6789");
+ cfg.set("mbus_distributor_node_max_pending_size", "777888");
+ addSlobrokConfig(config, slobrok);
+
+ std::unique_ptr<TestStorageApp> node;
+ if (isContentNode) {
+ node = std::make_unique<TestServiceLayerApp>(config.getConfigId());
+ } else {
+ node = std::make_unique<TestDistributorApp>(config.getConfigId());
+ }
+
+ CommunicationManager commMgr(node->getComponentRegister(),
+ config.getConfigId());
+ DummyStorageLink *storageLink = new DummyStorageLink();
+ commMgr.push_back(std::unique_ptr<StorageLink>(storageLink));
+ commMgr.open();
+
+ // Outer type is RPCMessageBus, which wraps regular MessageBus.
+ auto& mbus = commMgr.getMessageBus().getMessageBus();
+ if (isContentNode) {
+ CPPUNIT_ASSERT_EQUAL(uint32_t(12345), mbus.getMaxPendingCount());
+ CPPUNIT_ASSERT_EQUAL(uint32_t(555666), mbus.getMaxPendingSize());
+ } else {
+ CPPUNIT_ASSERT_EQUAL(uint32_t(6789), mbus.getMaxPendingCount());
+ CPPUNIT_ASSERT_EQUAL(uint32_t(777888), mbus.getMaxPendingSize());
+ }
+
+ // Test live reconfig of limits.
+ using ConfigBuilder
+ = vespa::config::content::core::StorCommunicationmanagerConfigBuilder;
+ auto liveCfg = std::make_unique<ConfigBuilder>();
+ liveCfg->mbusContentNodeMaxPendingCount = 777777;
+ liveCfg->mbusDistributorNodeMaxPendingCount = 999999;
+
+ commMgr.configure(std::move(liveCfg));
+ if (isContentNode) {
+ CPPUNIT_ASSERT_EQUAL(uint32_t(777777), mbus.getMaxPendingCount());
+ } else {
+ CPPUNIT_ASSERT_EQUAL(uint32_t(999999), mbus.getMaxPendingCount());
+ }
+}
+
+void
+CommunicationManagerTest::testDistPendingLimitConfigsArePropagatedToMessageBus()
+{
+ doTestConfigPropagation(false);
+}
+
+void
+CommunicationManagerTest::testStorPendingLimitConfigsArePropagatedToMessageBus()
+{
+ doTestConfigPropagation(true);
+}
+
+void
+CommunicationManagerTest::testCommandsAreDequeuedInPriorityOrder()
+{
+ mbus::Slobrok slobrok;
+ vdstestlib::DirConfig storConfig(getStandardConfig(true));
+ storConfig.getConfig("stor-server").set("node_index", "1");
+ addSlobrokConfig(storConfig, slobrok);
+ TestServiceLayerApp storNode(storConfig.getConfigId());
+
+ CommunicationManager storage(storNode.getComponentRegister(),
+ storConfig.getConfigId());
+ DummyStorageLink *storageLink = new DummyStorageLink();
+ storage.push_back(std::unique_ptr<StorageLink>(storageLink));
+
+ // Message dequeing does not start before we invoke `open` on the storage
+ // link chain, so we enqueue messages in randomized priority order before
+ // doing so. After starting the thread, we should then get messages down
+ // the chain in a deterministic, prioritized order.
+ // Lower number == higher priority.
+ std::vector<api::StorageMessage::Priority> pris{200, 0, 255, 128};
+ for (auto pri : pris) {
+ storage.enqueue(createDummyCommand(pri));
+ }
+ storage.open();
+ storageLink->waitForMessages(pris.size(), MESSAGE_WAIT_TIME_SEC);
+
+ std::sort(pris.begin(), pris.end());
+ for (size_t i = 0; i < pris.size(); ++i) {
+ // Casting is just to avoid getting mismatched values printed to the
+ // output verbatim as chars.
+ CPPUNIT_ASSERT_EQUAL(
+ uint32_t(pris[i]),
+ uint32_t(storageLink->getCommand(i)->getPriority()));
+ }
+}
+
+void
+CommunicationManagerTest::testRepliesAreDequeuedInFifoOrder()
+{
+ mbus::Slobrok slobrok;
+ vdstestlib::DirConfig storConfig(getStandardConfig(true));
+ storConfig.getConfig("stor-server").set("node_index", "1");
+ addSlobrokConfig(storConfig, slobrok);
+ TestServiceLayerApp storNode(storConfig.getConfigId());
+
+ CommunicationManager storage(storNode.getComponentRegister(),
+ storConfig.getConfigId());
+ DummyStorageLink *storageLink = new DummyStorageLink();
+ storage.push_back(std::unique_ptr<StorageLink>(storageLink));
+
+ std::vector<api::StorageMessage::Priority> pris{200, 0, 255, 128};
+ for (auto pri : pris) {
+ storage.enqueue(createDummyCommand(pri)->makeReply());
+ }
+ storage.open();
+ storageLink->waitForMessages(pris.size(), MESSAGE_WAIT_TIME_SEC);
+
+ // Want FIFO order for replies, not priority-sorted order.
+ for (size_t i = 0; i < pris.size(); ++i) {
+ CPPUNIT_ASSERT_EQUAL(
+ uint32_t(pris[i]),
+ uint32_t(storageLink->getCommand(i)->getPriority()));
+ }
+}
+
+} // storage
diff --git a/storage/src/tests/storageserver/documentapiconvertertest.cpp b/storage/src/tests/storageserver/documentapiconvertertest.cpp
new file mode 100644
index 00000000000..69083352c4a
--- /dev/null
+++ b/storage/src/tests/storageserver/documentapiconvertertest.cpp
@@ -0,0 +1,529 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <vespa/document/config/config-documenttypes.h>
+#include <vespa/document/datatype/datatype.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/document/repo/documenttyperepo.h>
+#include <vespa/documentapi/documentapi.h>
+#include <vespa/messagebus/emptyreply.h>
+#include <vespa/storage/storageserver/documentapiconverter.h>
+#include <vespa/storageapi/message/batch.h>
+#include <vespa/storageapi/message/datagram.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <vespa/vdslib/container/writabledocumentlist.h>
+
+using document::DataType;
+using document::DocIdString;
+using document::Document;
+using document::DocumentId;
+using document::DocumentTypeRepo;
+using document::readDocumenttypesConfig;
+
+namespace storage {
+
+struct DocumentApiConverterTest : public CppUnit::TestFixture
+{
+ std::unique_ptr<DocumentApiConverter> _converter;
+ const DocumentTypeRepo::SP _repo;
+ const DataType& _html_type;
+
+ DocumentApiConverterTest()
+ : _repo(new DocumentTypeRepo(readDocumenttypesConfig(
+ "config-doctypes.cfg"))),
+ _html_type(*_repo->getDocumentType("text/html"))
+ {
+ }
+
+ void setUp() {
+ _converter.reset(new DocumentApiConverter("raw:"));
+ };
+
+ void testPut();
+ void testForwardedPut();
+ void testRemove();
+ void testGet();
+ void testCreateVisitor();
+ void testCreateVisitorHighTimeout();
+ void testCreateVisitorReplyNotReady();
+ void testCreateVisitorReplyLastBucket();
+ void testDestroyVisitor();
+ void testVisitorInfo();
+ void testDocBlock();
+ void testDocBlockWithKeepTimeStamps();
+ void testMultiOperation();
+ void testBatchDocumentUpdate();
+
+ CPPUNIT_TEST_SUITE(DocumentApiConverterTest);
+ CPPUNIT_TEST(testPut);
+ CPPUNIT_TEST(testForwardedPut);
+ CPPUNIT_TEST(testRemove);
+ CPPUNIT_TEST(testGet);
+ CPPUNIT_TEST(testCreateVisitor);
+ CPPUNIT_TEST(testCreateVisitorHighTimeout);
+ CPPUNIT_TEST(testCreateVisitorReplyNotReady);
+ CPPUNIT_TEST(testCreateVisitorReplyLastBucket);
+ CPPUNIT_TEST(testDestroyVisitor);
+ CPPUNIT_TEST(testVisitorInfo);
+ CPPUNIT_TEST(testDocBlock);
+ CPPUNIT_TEST(testDocBlockWithKeepTimeStamps);
+ CPPUNIT_TEST(testMultiOperation);
+ CPPUNIT_TEST(testBatchDocumentUpdate);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(DocumentApiConverterTest);
+
+void DocumentApiConverterTest::testPut()
+{
+ Document::SP
+ doc(new Document(_html_type, DocumentId(DocIdString("test", "test"))));
+
+ documentapi::PutDocumentMessage putmsg(doc);
+ putmsg.setTimestamp(1234);
+
+ std::unique_ptr<storage::api::StorageCommand> cmd =
+ _converter->toStorageAPI(putmsg, _repo);
+
+ api::PutCommand* pc = dynamic_cast<api::PutCommand*>(cmd.get());
+
+ CPPUNIT_ASSERT(pc);
+ CPPUNIT_ASSERT(pc->getDocument().get() == doc.get());
+
+ std::unique_ptr<mbus::Reply> reply = putmsg.createReply();
+ CPPUNIT_ASSERT(reply.get());
+
+ std::unique_ptr<storage::api::StorageReply> rep = _converter->toStorageAPI(
+ static_cast<documentapi::DocumentReply&>(*reply), *cmd);
+ api::PutReply* pr = dynamic_cast<api::PutReply*>(rep.get());
+ CPPUNIT_ASSERT(pr);
+
+ std::unique_ptr<mbus::Message> mbusmsg =
+ _converter->toDocumentAPI(*pc, _repo);
+
+ documentapi::PutDocumentMessage* mbusput = dynamic_cast<documentapi::PutDocumentMessage*>(mbusmsg.get());
+ CPPUNIT_ASSERT(mbusput);
+ CPPUNIT_ASSERT(mbusput->getDocument().get() == doc.get());
+ CPPUNIT_ASSERT(mbusput->getTimestamp() == 1234);
+};
+
+void DocumentApiConverterTest::testForwardedPut()
+{
+ Document::SP
+ doc(new Document(_html_type, DocumentId(DocIdString("test", "test"))));
+
+ documentapi::PutDocumentMessage* putmsg = new documentapi::PutDocumentMessage(doc);
+ std::unique_ptr<mbus::Reply> reply(((documentapi::DocumentMessage*)putmsg)->createReply());
+ reply->setMessage(std::unique_ptr<mbus::Message>(putmsg));
+
+ std::unique_ptr<storage::api::StorageCommand> cmd =
+ _converter->toStorageAPI(*putmsg, _repo);
+ ((storage::api::PutCommand*)cmd.get())->setTimestamp(1234);
+
+ std::unique_ptr<storage::api::StorageReply> rep = cmd->makeReply();
+ api::PutReply* pr = dynamic_cast<api::PutReply*>(rep.get());
+ CPPUNIT_ASSERT(pr);
+
+ _converter->transferReplyState(*pr, *reply);
+}
+
+void DocumentApiConverterTest::testRemove()
+{
+ documentapi::RemoveDocumentMessage removemsg(document::DocumentId(document::DocIdString("test", "test")));
+ std::unique_ptr<storage::api::StorageCommand> cmd =
+ _converter->toStorageAPI(removemsg, _repo);
+
+ api::RemoveCommand* rc = dynamic_cast<api::RemoveCommand*>(cmd.get());
+
+ CPPUNIT_ASSERT(rc);
+ CPPUNIT_ASSERT_EQUAL(document::DocumentId(document::DocIdString("test", "test")), rc->getDocumentId());
+
+ std::unique_ptr<mbus::Reply> reply = removemsg.createReply();
+ CPPUNIT_ASSERT(reply.get());
+
+ std::unique_ptr<storage::api::StorageReply> rep = _converter->toStorageAPI(
+ static_cast<documentapi::DocumentReply&>(*reply), *cmd);
+ api::RemoveReply* pr = dynamic_cast<api::RemoveReply*>(rep.get());
+ CPPUNIT_ASSERT(pr);
+
+ std::unique_ptr<mbus::Message> mbusmsg =
+ _converter->toDocumentAPI(*rc, _repo);
+
+ documentapi::RemoveDocumentMessage* mbusremove = dynamic_cast<documentapi::RemoveDocumentMessage*>(mbusmsg.get());
+ CPPUNIT_ASSERT(mbusremove);
+ CPPUNIT_ASSERT_EQUAL(document::DocumentId(document::DocIdString("test", "test")), mbusremove->getDocumentId());
+};
+
+void DocumentApiConverterTest::testGet()
+{
+ documentapi::GetDocumentMessage getmsg(
+ document::DocumentId(document::DocIdString("test", "test")),
+ "foo bar");
+
+ std::unique_ptr<storage::api::StorageCommand> cmd =
+ _converter->toStorageAPI(getmsg, _repo);
+
+ api::GetCommand* rc = dynamic_cast<api::GetCommand*>(cmd.get());
+
+ CPPUNIT_ASSERT(rc);
+ CPPUNIT_ASSERT_EQUAL(document::DocumentId(document::DocIdString("test", "test")), rc->getDocumentId());
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("foo bar"), rc->getFieldSet());
+};
+
+void DocumentApiConverterTest::testCreateVisitor()
+{
+ documentapi::CreateVisitorMessage cv(
+ "mylib",
+ "myinstance",
+ "control-dest",
+ "data-dest");
+
+ cv.setTimeRemaining(123456);
+
+ std::unique_ptr<storage::api::StorageCommand> cmd =
+ _converter->toStorageAPI(cv, _repo);
+
+ api::CreateVisitorCommand* pc = dynamic_cast<api::CreateVisitorCommand*>(cmd.get());
+
+ CPPUNIT_ASSERT(pc);
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("mylib"), pc->getLibraryName());
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("myinstance"), pc->getInstanceId());
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("control-dest"), pc->getControlDestination());
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("data-dest"), pc->getDataDestination());
+ CPPUNIT_ASSERT_EQUAL(123456u, pc->getTimeout());
+}
+
+void DocumentApiConverterTest::testCreateVisitorHighTimeout()
+{
+ documentapi::CreateVisitorMessage cv(
+ "mylib",
+ "myinstance",
+ "control-dest",
+ "data-dest");
+
+ cv.setTimeRemaining((uint64_t)std::numeric_limits<uint32_t>::max() + 1); // Will be INT_MAX
+
+ std::unique_ptr<storage::api::StorageCommand> cmd =
+ _converter->toStorageAPI(cv, _repo);
+
+ api::CreateVisitorCommand* pc = dynamic_cast<api::CreateVisitorCommand*>(cmd.get());
+
+ CPPUNIT_ASSERT(pc);
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("mylib"), pc->getLibraryName());
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("myinstance"), pc->getInstanceId());
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("control-dest"), pc->getControlDestination());
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("data-dest"), pc->getDataDestination());
+ CPPUNIT_ASSERT_EQUAL((uint32_t) std::numeric_limits<int32_t>::max(),
+ pc->getTimeout());
+}
+
+void DocumentApiConverterTest::testCreateVisitorReplyNotReady()
+{
+ documentapi::CreateVisitorMessage cv(
+ "mylib",
+ "myinstance",
+ "control-dest",
+ "data-dest");
+
+ std::unique_ptr<storage::api::StorageCommand> cmd =
+ _converter->toStorageAPI(cv, _repo);
+ CPPUNIT_ASSERT(cmd.get());
+ api::CreateVisitorCommand& cvc = dynamic_cast<api::CreateVisitorCommand&>(*cmd);
+
+ api::CreateVisitorReply cvr(cvc);
+ cvr.setResult(api::ReturnCode(api::ReturnCode::NOT_READY, "not ready"));
+
+ std::unique_ptr<documentapi::CreateVisitorReply> reply(
+ dynamic_cast<documentapi::CreateVisitorReply*>(
+ cv.createReply().release()));
+ CPPUNIT_ASSERT(reply.get());
+
+ _converter->transferReplyState(cvr, *reply);
+
+ CPPUNIT_ASSERT_EQUAL((uint32_t)documentapi::DocumentProtocol::ERROR_NODE_NOT_READY, reply->getError(0).getCode());
+
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(INT_MAX), reply->getLastBucket());
+}
+
+
+void DocumentApiConverterTest::testCreateVisitorReplyLastBucket()
+{
+ documentapi::CreateVisitorMessage cv(
+ "mylib",
+ "myinstance",
+ "control-dest",
+ "data-dest");
+
+ std::unique_ptr<storage::api::StorageCommand> cmd =
+ _converter->toStorageAPI(cv, _repo);
+ CPPUNIT_ASSERT(cmd.get());
+ api::CreateVisitorCommand& cvc = dynamic_cast<api::CreateVisitorCommand&>(*cmd);
+
+
+ api::CreateVisitorReply cvr(cvc);
+ cvr.setLastBucket(document::BucketId(123));
+
+
+ std::unique_ptr<documentapi::CreateVisitorReply> reply(
+ dynamic_cast<documentapi::CreateVisitorReply*>(
+ cv.createReply().release()));
+
+ CPPUNIT_ASSERT(reply.get());
+
+ _converter->transferReplyState(cvr, *reply);
+
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(123), reply->getLastBucket());
+}
+
+
+void DocumentApiConverterTest::testDestroyVisitor()
+{
+ documentapi::DestroyVisitorMessage cv("myinstance");
+
+ std::unique_ptr<storage::api::StorageCommand> cmd =
+ _converter->toStorageAPI(cv, _repo);
+
+ api::DestroyVisitorCommand* pc = dynamic_cast<api::DestroyVisitorCommand*>(cmd.get());
+
+ CPPUNIT_ASSERT(pc);
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("myinstance"), pc->getInstanceId());
+}
+
+void
+DocumentApiConverterTest::testVisitorInfo()
+{
+ api::VisitorInfoCommand vicmd;
+ std::vector<api::VisitorInfoCommand::BucketTimestampPair> bucketsCompleted;
+ bucketsCompleted.push_back(api::VisitorInfoCommand::BucketTimestampPair(document::BucketId(16, 1), 0));
+ bucketsCompleted.push_back(api::VisitorInfoCommand::BucketTimestampPair(document::BucketId(16, 2), 0));
+ bucketsCompleted.push_back(api::VisitorInfoCommand::BucketTimestampPair(document::BucketId(16, 4), 0));
+
+ vicmd.setBucketsCompleted(bucketsCompleted);
+
+ std::unique_ptr<mbus::Message> mbusmsg =
+ _converter->toDocumentAPI(vicmd, _repo);
+
+ documentapi::VisitorInfoMessage* mbusvi = dynamic_cast<documentapi::VisitorInfoMessage*>(mbusmsg.get());
+ CPPUNIT_ASSERT(mbusvi);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 1), mbusvi->getFinishedBuckets()[0]);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 2), mbusvi->getFinishedBuckets()[1]);
+ CPPUNIT_ASSERT_EQUAL(document::BucketId(16, 4), mbusvi->getFinishedBuckets()[2]);
+
+ std::unique_ptr<mbus::Reply> reply = mbusvi->createReply();
+ CPPUNIT_ASSERT(reply.get());
+
+ std::unique_ptr<storage::api::StorageReply> rep = _converter->toStorageAPI(
+ static_cast<documentapi::DocumentReply&>(*reply), vicmd);
+ api::VisitorInfoReply* pr = dynamic_cast<api::VisitorInfoReply*>(rep.get());
+ CPPUNIT_ASSERT(pr);
+}
+
+void
+DocumentApiConverterTest::testDocBlock()
+{
+ Document::SP
+ doc(new Document(_html_type, DocumentId(DocIdString("test", "test"))));
+
+ char buffer[10000];
+ vdslib::WritableDocumentList docBlock(_repo, buffer, sizeof(buffer));
+ docBlock.addPut(*doc, 100);
+
+ document::BucketIdFactory fac;
+ document::BucketId bucketId = fac.getBucketId(doc->getId());
+ bucketId.setUsedBits(32);
+
+ api::DocBlockCommand dbcmd(bucketId, docBlock, std::shared_ptr<void>());
+
+ dbcmd.setTimeout(123456);
+
+ std::unique_ptr<mbus::Message> mbusmsg =
+ _converter->toDocumentAPI(dbcmd, _repo);
+
+ documentapi::MultiOperationMessage* mbusdb = dynamic_cast<documentapi::MultiOperationMessage*>(mbusmsg.get());
+ CPPUNIT_ASSERT(mbusdb);
+
+ CPPUNIT_ASSERT_EQUAL((uint64_t)123456, mbusdb->getTimeRemaining());
+
+ const vdslib::DocumentList& list = mbusdb->getOperations();
+ CPPUNIT_ASSERT_EQUAL((uint32_t)1, list.size());
+ CPPUNIT_ASSERT_EQUAL(*doc, *dynamic_cast<document::Document*>(list.begin()->getDocument().get()));
+
+ std::unique_ptr<mbus::Reply> reply = mbusdb->createReply();
+ CPPUNIT_ASSERT(reply.get());
+
+ std::unique_ptr<storage::api::StorageReply> rep =
+ _converter->toStorageAPI(static_cast<documentapi::DocumentReply&>(*reply), dbcmd);
+ api::DocBlockReply* pr = dynamic_cast<api::DocBlockReply*>(rep.get());
+ CPPUNIT_ASSERT(pr);
+}
+
+
+void
+DocumentApiConverterTest::testDocBlockWithKeepTimeStamps()
+{
+ char buffer[10000];
+ vdslib::WritableDocumentList docBlock(_repo, buffer, sizeof(buffer));
+ api::DocBlockCommand dbcmd(document::BucketId(0), docBlock, std::shared_ptr<void>());
+
+ {
+ CPPUNIT_ASSERT_EQUAL(dbcmd.keepTimeStamps(), false);
+
+ std::unique_ptr<mbus::Message> mbusmsg =
+ _converter->toDocumentAPI(dbcmd, _repo);
+
+ documentapi::MultiOperationMessage* mbusdb = dynamic_cast<documentapi::MultiOperationMessage*>(mbusmsg.get());
+ CPPUNIT_ASSERT(mbusdb);
+
+ CPPUNIT_ASSERT_EQUAL(mbusdb->keepTimeStamps(), false);
+ }
+
+ {
+ dbcmd.keepTimeStamps(true);
+ CPPUNIT_ASSERT_EQUAL(dbcmd.keepTimeStamps(), true);
+
+ std::unique_ptr<mbus::Message> mbusmsg =
+ _converter->toDocumentAPI(dbcmd, _repo);
+
+ documentapi::MultiOperationMessage* mbusdb = dynamic_cast<documentapi::MultiOperationMessage*>(mbusmsg.get());
+ CPPUNIT_ASSERT(mbusdb);
+
+ CPPUNIT_ASSERT_EQUAL(mbusdb->keepTimeStamps(), true);
+ }
+
+}
+
+
+void
+DocumentApiConverterTest::testMultiOperation()
+{
+ //create a document
+ Document::SP
+ doc(new Document(_html_type, DocumentId(DocIdString("test", "test"))));
+
+ document::BucketIdFactory fac;
+ document::BucketId bucketId = fac.getBucketId(doc->getId());
+ bucketId.setUsedBits(32);
+
+ {
+ documentapi::MultiOperationMessage momsg(_repo, bucketId, 10000);
+
+ vdslib::WritableDocumentList operations(_repo, &(momsg.getBuffer()[0]),
+ momsg.getBuffer().size());
+ operations.addPut(*doc, 100);
+
+ momsg.setOperations(operations);
+
+ CPPUNIT_ASSERT(momsg.getBuffer().size() > 0);
+
+ // Convert it to Storage API
+ std::unique_ptr<api::StorageCommand> stcmd =
+ _converter->toStorageAPI(momsg, _repo);
+
+ api::MultiOperationCommand* mocmd = dynamic_cast<api::MultiOperationCommand*>(stcmd.get());
+ CPPUNIT_ASSERT(mocmd);
+ CPPUNIT_ASSERT(mocmd->getBuffer().size() > 0);
+
+ // Get operations from Storage API message and check document
+ const vdslib::DocumentList& list = mocmd->getOperations();
+ CPPUNIT_ASSERT_EQUAL((uint32_t)1, list.size());
+ CPPUNIT_ASSERT_EQUAL(*doc, *dynamic_cast<document::Document*>(list.begin()->getDocument().get()));
+
+ // Create Storage API Reply
+ std::unique_ptr<api::MultiOperationReply> moreply = std::unique_ptr<api::MultiOperationReply>(new api::MultiOperationReply(*mocmd));
+ CPPUNIT_ASSERT(moreply.get());
+
+ // convert storage api reply to mbus reply.....
+ // ...
+ }
+
+ {
+ api::MultiOperationCommand mocmd(_repo, bucketId, 10000, false);
+ mocmd.getOperations().addPut(*doc, 100);
+
+ // Convert it to documentapi
+ std::unique_ptr<mbus::Message> mbmsg =
+ _converter->toDocumentAPI(mocmd, _repo);
+ documentapi::MultiOperationMessage* momsg = dynamic_cast<documentapi::MultiOperationMessage*>(mbmsg.get());
+ CPPUNIT_ASSERT(momsg);
+
+ // Get operations from Document API msg and check document
+ const vdslib::DocumentList& list = momsg->getOperations();
+ CPPUNIT_ASSERT_EQUAL((uint32_t)1, list.size());
+ CPPUNIT_ASSERT_EQUAL(*doc, *dynamic_cast<document::Document*>(list.begin()->getDocument().get()));
+
+ // Create Document API reply
+ mbus::Reply::UP moreply = momsg->createReply();
+ CPPUNIT_ASSERT(moreply.get());
+
+ //Convert DocumentAPI reply to storageapi reply
+ std::unique_ptr<api::StorageReply> streply =
+ _converter->toStorageAPI(static_cast<documentapi::DocumentReply&>(*moreply), mocmd);
+ api::MultiOperationReply* mostreply = dynamic_cast<api::MultiOperationReply*>(streply.get());
+ CPPUNIT_ASSERT(mostreply);
+
+ }
+}
+
+void
+DocumentApiConverterTest::testBatchDocumentUpdate()
+{
+ std::vector<document::DocumentUpdate::SP > updates;
+
+ {
+ document::DocumentId docId(document::UserDocIdString("userdoc:test:1234:test1"));
+ document::DocumentUpdate::SP update(
+ new document::DocumentUpdate(_html_type, docId));
+ updates.push_back(update);
+ }
+
+ {
+ document::DocumentId docId(document::UserDocIdString("userdoc:test:1234:test2"));
+ document::DocumentUpdate::SP update(
+ new document::DocumentUpdate(_html_type, docId));
+ updates.push_back(update);
+ }
+
+ {
+ document::DocumentId docId(document::UserDocIdString("userdoc:test:1234:test3"));
+ document::DocumentUpdate::SP update(
+ new document::DocumentUpdate(_html_type, docId));
+ updates.push_back(update);
+ }
+
+ std::shared_ptr<documentapi::BatchDocumentUpdateMessage> msg(
+ new documentapi::BatchDocumentUpdateMessage(1234));
+ for (std::size_t i = 0; i < updates.size(); ++i) {
+ msg->addUpdate(updates[i]);
+ }
+
+ std::unique_ptr<storage::api::StorageCommand> cmd =
+ _converter->toStorageAPI(*msg, _repo);
+ api::BatchDocumentUpdateCommand* batchCmd = dynamic_cast<api::BatchDocumentUpdateCommand*>(cmd.get());
+ CPPUNIT_ASSERT(batchCmd);
+ CPPUNIT_ASSERT_EQUAL(updates.size(), batchCmd->getUpdates().size());
+ for (std::size_t i = 0; i < updates.size(); ++i) {
+ CPPUNIT_ASSERT_EQUAL(*updates[i], *batchCmd->getUpdates()[i]);
+ }
+
+ api::BatchDocumentUpdateReply batchReply(*batchCmd);
+ batchReply.getDocumentsNotFound().resize(3);
+ batchReply.getDocumentsNotFound()[0] = true;
+ batchReply.getDocumentsNotFound()[2] = true;
+
+ std::unique_ptr<mbus::Reply> mbusReply = msg->createReply();
+ documentapi::BatchDocumentUpdateReply* mbusBatchReply(
+ dynamic_cast<documentapi::BatchDocumentUpdateReply*>(mbusReply.get()));
+ CPPUNIT_ASSERT(mbusBatchReply != 0);
+
+ _converter->transferReplyState(batchReply, *mbusReply);
+
+ CPPUNIT_ASSERT_EQUAL(std::size_t(3), mbusBatchReply->getDocumentsNotFound().size());
+ CPPUNIT_ASSERT(mbusBatchReply->getDocumentsNotFound()[0] == true);
+ CPPUNIT_ASSERT(mbusBatchReply->getDocumentsNotFound()[1] == false);
+ CPPUNIT_ASSERT(mbusBatchReply->getDocumentsNotFound()[2] == true);
+}
+
+}
diff --git a/storage/src/tests/storageserver/dummystoragelink.cpp b/storage/src/tests/storageserver/dummystoragelink.cpp
new file mode 100644
index 00000000000..7194f1fba3d
--- /dev/null
+++ b/storage/src/tests/storageserver/dummystoragelink.cpp
@@ -0,0 +1,182 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+#include <tests/common/dummystoragelink.h>
+#include <sys/time.h>
+
+namespace storage {
+
+DummyStorageLink* DummyStorageLink::_last(0);
+
+DummyStorageLink::DummyStorageLink()
+ : StorageLink("Dummy storage link"),
+ _commands(),
+ _replies(),
+ _injected(),
+ _autoReply(false),
+ _useDispatch(false),
+ _ignore(false),
+ _waitMonitor()
+{
+ _last = this;
+}
+
+DummyStorageLink::~DummyStorageLink()
+{
+ // Often a chain with dummy link on top is deleted in unit tests.
+ // If they haven't been closed already, close them for a cleaner
+ // shutdown
+ if (getState() == OPENED) {
+ close();
+ flush();
+ }
+ closeNextLink();
+ reset();
+}
+
+bool DummyStorageLink::onDown(const api::StorageMessage::SP& cmd)
+{
+ if (_ignore) {
+ return false;
+ }
+ if (_injected.size() > 0) {
+ vespalib::LockGuard guard(_lock);
+ sendUp(*_injected.begin());
+ _injected.pop_front();
+ } else if (_autoReply) {
+ if (!cmd->getType().isReply()) {
+ std::shared_ptr<api::StorageReply> reply(
+ std::dynamic_pointer_cast<api::StorageCommand>(cmd)
+ ->makeReply().release());
+ reply->setResult(api::ReturnCode(
+ api::ReturnCode::OK, "Automatically generated reply"));
+ sendUp(reply);
+ }
+ }
+ if (isBottom()) {
+ vespalib::MonitorGuard lock(_waitMonitor);
+ {
+ vespalib::LockGuard guard(_lock);
+ _commands.push_back(cmd);
+ }
+ lock.broadcast();
+ return true;
+ }
+ return StorageLink::onDown(cmd);
+}
+
+bool DummyStorageLink::onUp(const api::StorageMessage::SP& reply) {
+ if (isTop()) {
+ vespalib::MonitorGuard lock(_waitMonitor);
+ {
+ vespalib::LockGuard guard(_lock);
+ _replies.push_back(reply);
+ }
+ lock.broadcast();
+ return true;
+ }
+ return StorageLink::onUp(reply);
+
+}
+
+void DummyStorageLink::injectReply(api::StorageReply* reply)
+{
+ assert(reply);
+ vespalib::LockGuard guard(_lock);
+ _injected.push_back(std::shared_ptr<api::StorageReply>(reply));
+}
+
+void DummyStorageLink::reset() {
+ vespalib::MonitorGuard lock(_waitMonitor);
+ vespalib::LockGuard guard(_lock);
+ _commands.clear();
+ _replies.clear();
+ _injected.clear();
+}
+
+void DummyStorageLink::waitForMessages(unsigned int msgCount, int timeout)
+{
+ framework::defaultimplementation::RealClock clock;
+ framework::MilliSecTime endTime(
+ clock.getTimeInMillis() + framework::MilliSecTime(timeout * 1000));
+ vespalib::MonitorGuard lock(_waitMonitor);
+ while (_commands.size() + _replies.size() < msgCount) {
+ if (timeout != 0 && clock.getTimeInMillis() > endTime) {
+ std::ostringstream ost;
+ ost << "Timed out waiting for " << msgCount << " messages to "
+ << "arrive in dummy storage link. Only "
+ << (_commands.size() + _replies.size()) << " messages seen "
+ << "after timout of " << timeout << " seconds was reached.";
+ throw vespalib::IllegalStateException(ost.str(), VESPA_STRLOC);
+ }
+ if (timeout >= 0) {
+ lock.wait((endTime - clock.getTimeInMillis()).getTime());
+ } else {
+ lock.wait();
+ }
+ }
+}
+
+void DummyStorageLink::waitForMessage(const api::MessageType& type, int timeout)
+{
+ framework::defaultimplementation::RealClock clock;
+ framework::MilliSecTime endTime(
+ clock.getTimeInMillis() + framework::MilliSecTime(timeout * 1000));
+ vespalib::MonitorGuard lock(_waitMonitor);
+ while (true) {
+ for (uint32_t i=0; i<_commands.size(); ++i) {
+ if (_commands[i]->getType() == type) return;
+ }
+ for (uint32_t i=0; i<_replies.size(); ++i) {
+ if (_replies[i]->getType() == type) return;
+ }
+ if (timeout != 0 && clock.getTimeInMillis() > endTime) {
+ std::ostringstream ost;
+ ost << "Timed out waiting for " << type << " message to "
+ << "arrive in dummy storage link. Only "
+ << (_commands.size() + _replies.size()) << " messages seen "
+ << "after timout of " << timeout << " seconds was reached.";
+ if (_commands.size() == 1) {
+ ost << " Found command of type " << _commands[0]->getType();
+ }
+ if (_replies.size() == 1) {
+ ost << " Found command of type " << _replies[0]->getType();
+ }
+ throw vespalib::IllegalStateException(ost.str(), VESPA_STRLOC);
+ }
+ if (timeout >= 0) {
+ lock.wait((endTime - clock.getTimeInMillis()).getTime());
+ } else {
+ lock.wait();
+ }
+ }
+}
+
+api::StorageMessage::SP
+DummyStorageLink::getAndRemoveMessage(const api::MessageType& type)
+{
+ vespalib::MonitorGuard lock(_waitMonitor);
+ for (std::vector<api::StorageMessage::SP>::iterator it = _commands.begin();
+ it != _commands.end(); ++it)
+ {
+ if ((*it)->getType() == type) {
+ api::StorageMessage::SP result(*it);
+ _commands.erase(it);
+ return result;
+ }
+ }
+ for (std::vector<api::StorageMessage::SP>::iterator it = _replies.begin();
+ it != _replies.end(); ++it)
+ {
+ if ((*it)->getType() == type) {
+ api::StorageMessage::SP result(*it);
+ _replies.erase(it);
+ return result;
+ }
+ }
+ std::ostringstream ost;
+ ost << "No message of type " << type << " found.";
+ throw vespalib::IllegalStateException(ost.str(), VESPA_STRLOC);
+}
+
+} // storage
diff --git a/storage/src/tests/storageserver/dummystoragelink.h b/storage/src/tests/storageserver/dummystoragelink.h
new file mode 100644
index 00000000000..cb9df8c5642
--- /dev/null
+++ b/storage/src/tests/storageserver/dummystoragelink.h
@@ -0,0 +1,115 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/util/sync.h>
+#include <list>
+#include <sstream>
+#include <vespa/storageapi/messageapi/storagecommand.h>
+#include <string>
+#include <vector>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storageapi/message/internal.h>
+
+class FastOS_ThreadPool;
+
+namespace storage {
+
+class DummyStorageLink : public StorageLink {
+
+ mutable vespalib::Lock _lock; // to protect below containers:
+ std::vector<api::StorageMessage::SP> _commands;
+ std::vector<api::StorageMessage::SP> _replies;
+ std::list<api::StorageMessage::SP> _injected;
+
+ bool _autoReply;
+ bool _useDispatch;
+ bool _ignore;
+ static DummyStorageLink* _last;
+ vespalib::Monitor _waitMonitor;
+
+public:
+ DummyStorageLink();
+ ~DummyStorageLink();
+
+ bool onDown(const api::StorageMessage::SP&);
+ bool onUp(const api::StorageMessage::SP&);
+
+ void addOnTopOfChain(StorageLink& link) {
+ link.addTestLinkOnTop(this);
+ }
+
+ void print(std::ostream& ost, bool verbose, const std::string& indent) const
+ {
+ (void) verbose;
+ ost << indent << "DummyStorageLink("
+ << "autoreply = " << (_autoReply ? "on" : "off")
+ << ", dispatch = " << (_useDispatch ? "on" : "off")
+ << ", " << _commands.size() << " commands"
+ << ", " << _replies.size() << " replies";
+ if (_injected.size() > 0)
+ ost << ", " << _injected.size() << " injected";
+ ost << ")";
+ }
+
+ void injectReply(api::StorageReply* reply);
+ void reset();
+ void setAutoreply(bool autoReply) { _autoReply = autoReply; }
+ void setIgnore(bool ignore) { _ignore = ignore; }
+ // Timeout is given in seconds
+ void waitForMessages(unsigned int msgCount = 1, int timeout = -1);
+ // Wait for a single message of a given type
+ void waitForMessage(const api::MessageType&, int timeout = -1);
+
+ api::StorageMessage::SP getCommand(size_t i) const {
+ vespalib::LockGuard guard(_lock);
+ api::StorageMessage::SP ret = _commands[i];
+ return ret;
+ }
+ api::StorageMessage::SP getReply(size_t i) const {
+ vespalib::LockGuard guard(_lock);
+ api::StorageMessage::SP ret = _replies[i];
+ return ret;
+ }
+ size_t getNumCommands() const {
+ vespalib::LockGuard guard(_lock);
+ return _commands.size();
+ }
+ size_t getNumReplies() const {
+ vespalib::LockGuard guard(_lock);
+ return _replies.size();
+ }
+
+ const std::vector<api::StorageMessage::SP>& getCommands() const
+ { return _commands; }
+ const std::vector<api::StorageMessage::SP>& getReplies() const
+ { return _replies; }
+
+ std::vector<api::StorageMessage::SP> getCommandsOnce() {
+ vespalib::MonitorGuard lock(_waitMonitor);
+ std::vector<api::StorageMessage::SP> retval;
+ {
+ vespalib::LockGuard guard(_lock);
+ retval.swap(_commands);
+ }
+ return retval;
+ }
+
+ std::vector<api::StorageMessage::SP> getRepliesOnce() {
+ vespalib::MonitorGuard lock(_waitMonitor);
+ std::vector<api::StorageMessage::SP> retval;
+ {
+ vespalib::LockGuard guard(_lock);
+ retval.swap(_replies);
+ }
+ return retval;
+ }
+
+ api::StorageMessage::SP getAndRemoveMessage(const api::MessageType&);
+
+ static DummyStorageLink* getLast() { return _last; }
+};
+
+}
+
diff --git a/storage/src/tests/storageserver/mergethrottlertest.cpp b/storage/src/tests/storageserver/mergethrottlertest.cpp
new file mode 100644
index 00000000000..e705db80788
--- /dev/null
+++ b/storage/src/tests/storageserver/mergethrottlertest.cpp
@@ -0,0 +1,1566 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <memory>
+#include <iterator>
+#include <vector>
+#include <algorithm>
+#include <ctime>
+#include <vespa/vespalib/util/document_runnable.h>
+#include <vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h>
+#include <tests/common/testhelper.h>
+#include <tests/common/storagelinktest.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/storage/storageserver/mergethrottler.h>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/state.h>
+
+using namespace document;
+using namespace storage::api;
+
+namespace storage {
+
+namespace {
+
+struct MergeBuilder
+{
+ document::BucketId _bucket;
+ api::Timestamp _maxTimestamp;
+ std::vector<uint16_t> _nodes;
+ std::vector<uint16_t> _chain;
+ uint64_t _clusterStateVersion;
+
+ MergeBuilder(const document::BucketId& bucket)
+ : _bucket(bucket),
+ _maxTimestamp(1234),
+ _chain(),
+ _clusterStateVersion(1)
+ {
+ nodes(0, 1, 2);
+ }
+
+ MergeBuilder& nodes(uint16_t n0) {
+ _nodes.push_back(n0);
+ return *this;
+ }
+ MergeBuilder& nodes(uint16_t n0, uint16_t n1) {
+ _nodes.push_back(n0);
+ _nodes.push_back(n1);
+ return *this;
+ }
+ MergeBuilder& nodes(uint16_t n0, uint16_t n1, uint16_t n2) {
+ _nodes.push_back(n0);
+ _nodes.push_back(n1);
+ _nodes.push_back(n2);
+ return *this;
+ }
+ MergeBuilder& maxTimestamp(api::Timestamp maxTs) {
+ _maxTimestamp = maxTs;
+ return *this;
+ }
+ MergeBuilder& clusterStateVersion(uint64_t csv) {
+ _clusterStateVersion = csv;
+ return *this;
+ }
+ MergeBuilder& chain(uint16_t n0) {
+ _chain.clear();
+ _chain.push_back(n0);
+ return *this;
+ }
+ MergeBuilder& chain(uint16_t n0, uint16_t n1) {
+ _chain.clear();
+ _chain.push_back(n0);
+ _chain.push_back(n1);
+ return *this;
+ }
+ MergeBuilder& chain(uint16_t n0, uint16_t n1, uint16_t n2) {
+ _chain.clear();
+ _chain.push_back(n0);
+ _chain.push_back(n1);
+ _chain.push_back(n2);
+ return *this;
+ }
+
+ api::MergeBucketCommand::SP create() const {
+ std::vector<api::MergeBucketCommand::Node> n;
+ for (uint32_t i = 0; i < _nodes.size(); ++i) {
+ n.push_back(_nodes[i]);
+ }
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(_bucket, n, _maxTimestamp,
+ _clusterStateVersion, _chain));
+ StorageMessageAddress address("storage", lib::NodeType::STORAGE, _nodes[0]);
+ cmd->setAddress(address);
+ return cmd;
+ }
+};
+
+std::shared_ptr<api::SetSystemStateCommand>
+makeSystemStateCmd(const std::string& state)
+{
+ return std::make_shared<api::SetSystemStateCommand>(
+ lib::ClusterState(state));
+}
+
+} // anon ns
+
+class MergeThrottlerTest : public CppUnit::TestFixture
+{
+ CPPUNIT_TEST_SUITE(MergeThrottlerTest);
+ CPPUNIT_TEST(testMergesConfig);
+ CPPUNIT_TEST(testChain);
+ CPPUNIT_TEST(testWithSourceOnlyNode);
+ CPPUNIT_TEST(test42DistributorBehavior);
+ CPPUNIT_TEST(test42DistributorBehaviorDoesNotTakeOwnership);
+ CPPUNIT_TEST(testEndOfChainExecutionDoesNotTakeOwnership);
+ CPPUNIT_TEST(testResendHandling);
+ CPPUNIT_TEST(testPriorityQueuing);
+ CPPUNIT_TEST(testCommandInQueueDuplicateOfKnownMerge);
+ CPPUNIT_TEST(testInvalidReceiverNode);
+ CPPUNIT_TEST(testForwardQueuedMerge);
+ CPPUNIT_TEST(testExecuteQueuedMerge);
+ CPPUNIT_TEST(testFlush);
+ CPPUNIT_TEST(testUnseenMergeWithNodeInChain);
+ CPPUNIT_TEST(testMergeWithNewerClusterStateFlushesOutdatedQueued);
+ CPPUNIT_TEST(testUpdatedClusterStateFlushesOutdatedQueued);
+ CPPUNIT_TEST(test42MergesDoNotTriggerFlush);
+ CPPUNIT_TEST(testOutdatedClusterStateMergesAreRejectedOnArrival);
+ CPPUNIT_TEST(testUnknownMergeWithSelfInChain);
+ CPPUNIT_TEST(testBusyReturnedOnFullQueue);
+ CPPUNIT_TEST(testBrokenCycle);
+ CPPUNIT_TEST(testGetBucketDiffCommandNotInActiveSetIsRejected);
+ CPPUNIT_TEST(testApplyBucketDiffCommandNotInActiveSetIsRejected);
+ CPPUNIT_TEST(testNewClusterStateAbortsAllOutdatedActiveMerges);
+ CPPUNIT_TEST_SUITE_END();
+public:
+ void setUp();
+ void tearDown();
+
+ void testMergesConfig();
+ void testChain();
+ void testWithSourceOnlyNode();
+ void test42DistributorBehavior();
+ void test42DistributorBehaviorDoesNotTakeOwnership();
+ void testEndOfChainExecutionDoesNotTakeOwnership();
+ void testResendHandling();
+ void testPriorityQueuing();
+ void testCommandInQueueDuplicateOfKnownMerge();
+ void testInvalidReceiverNode();
+ void testForwardQueuedMerge();
+ void testExecuteQueuedMerge();
+ void testFlush();
+ void testUnseenMergeWithNodeInChain();
+ void testMergeWithNewerClusterStateFlushesOutdatedQueued();
+ void testUpdatedClusterStateFlushesOutdatedQueued();
+ void test42MergesDoNotTriggerFlush();
+ void testOutdatedClusterStateMergesAreRejectedOnArrival();
+ void testUnknownMergeWithSelfInChain();
+ void testBusyReturnedOnFullQueue();
+ void testBrokenCycle();
+ void testGetBucketDiffCommandNotInActiveSetIsRejected();
+ void testApplyBucketDiffCommandNotInActiveSetIsRejected();
+ void testNewClusterStateAbortsAllOutdatedActiveMerges();
+private:
+ static const int _storageNodeCount = 3;
+ static const int _messageWaitTime = 100;
+
+ // Using n storage node links and dummy servers
+ std::vector<std::shared_ptr<DummyStorageLink> > _topLinks;
+ std::vector<std::shared_ptr<TestServiceLayerApp> > _servers;
+ std::vector<MergeThrottler*> _throttlers;
+ std::vector<DummyStorageLink*> _bottomLinks;
+
+ api::MergeBucketCommand::SP sendMerge(const MergeBuilder&);
+
+ void sendAndExpectReply(
+ const std::shared_ptr<api::StorageMessage>& msg,
+ const api::MessageType& expectedReplyType,
+ api::ReturnCode::Result expectedResultCode);
+};
+
+const int MergeThrottlerTest::_storageNodeCount;
+const int MergeThrottlerTest::_messageWaitTime;
+
+CPPUNIT_TEST_SUITE_REGISTRATION(MergeThrottlerTest);
+
+void
+MergeThrottlerTest::setUp()
+{
+ vdstestlib::DirConfig config(getStandardConfig(true));
+
+ for (int i = 0; i < _storageNodeCount; ++i) {
+ std::unique_ptr<TestServiceLayerApp> server(
+ new TestServiceLayerApp(DiskCount(1), NodeIndex(i)));
+ server->setClusterState(lib::ClusterState(
+ "distributor:100 storage:100 version:1"));
+ std::unique_ptr<DummyStorageLink> top;
+
+ top.reset(new DummyStorageLink);
+ MergeThrottler* throttler = new MergeThrottler(config.getConfigId(), server->getComponentRegister());
+ // MergeThrottler will be sandwiched in between two dummy links
+ top->push_back(std::unique_ptr<StorageLink>(throttler));
+ DummyStorageLink* bottom = new DummyStorageLink;
+ throttler->push_back(std::unique_ptr<StorageLink>(bottom));
+
+ _servers.push_back(std::shared_ptr<TestServiceLayerApp>(server.release()));
+ _throttlers.push_back(throttler);
+ _bottomLinks.push_back(bottom);
+ top->open();
+ _topLinks.push_back(std::shared_ptr<DummyStorageLink>(top.release()));
+ }
+}
+
+void
+MergeThrottlerTest::tearDown()
+{
+ for (std::size_t i = 0; i < _topLinks.size(); ++i) {
+ if (_topLinks[i]->getState() == StorageLink::OPENED) {
+ _topLinks[i]->close();
+ _topLinks[i]->flush();
+ }
+ _topLinks[i] = std::shared_ptr<DummyStorageLink>();
+ }
+ _topLinks.clear();
+ _bottomLinks.clear();
+ _throttlers.clear();
+ _servers.clear();
+}
+
+namespace {
+
+template <typename Iterator>
+bool
+checkChain(const StorageMessage::SP& msg,
+ Iterator first, Iterator end)
+{
+ const MergeBucketCommand& cmd =
+ dynamic_cast<const MergeBucketCommand&>(*msg);
+
+ if (cmd.getChain().size() != static_cast<std::size_t>(std::distance(first, end))) {
+ return false;
+ }
+
+ return std::equal(cmd.getChain().begin(), cmd.getChain().end(), first);
+}
+
+void waitUntilMergeQueueIs(MergeThrottler& throttler, std::size_t sz, int timeout)
+{
+ std::time_t start = std::time(0);
+ while (true) {
+ std::size_t count;
+ {
+ vespalib::LockGuard lock(throttler.getStateLock());
+ count = throttler.getMergeQueue().size();
+ }
+ if (count == sz) {
+ break;
+ }
+ std::time_t now = std::time(0);
+ if (now - start > timeout) {
+ std::ostringstream os;
+ os << "Timeout while waiting for merge queue with " << sz << " items. Had "
+ << count << " at timeout.";
+ throw vespalib::IllegalStateException(os.str(), VESPA_STRLOC);
+ }
+ FastOS_Thread::Sleep(1);
+ }
+}
+
+}
+
+// Extremely simple test that just checks that (min|max)_merges_per_node
+// under the stor-server config gets propagated to all the nodes
+void
+MergeThrottlerTest::testMergesConfig()
+{
+ for (int i = 0; i < _storageNodeCount; ++i) {
+ CPPUNIT_ASSERT_EQUAL(uint32_t(25), _throttlers[i]->getThrottlePolicy().getMaxPendingCount());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(20), _throttlers[i]->getMaxQueueSize());
+ }
+}
+
+// Test that a distributor sending a merge to the lowest-index storage
+// node correctly invokes a merge forwarding chain and subsequent unwind.
+void
+MergeThrottlerTest::testChain()
+{
+ uint16_t indices[_storageNodeCount];
+ for (int i = 0; i < _storageNodeCount; ++i) {
+ indices[i] = i;
+ _servers[i]->setClusterState(lib::ClusterState("distributor:100 storage:100 version:123"));
+ }
+
+ BucketId bid(14, 0x1337);
+
+ // Use different node permutations to ensure it works no matter which node is
+ // set as the executor. More specifically, _all_ permutations.
+ do {
+ uint16_t lastNodeIdx = _storageNodeCount - 1;
+ uint16_t executorNode = indices[0];
+
+ //std::cout << "\n----\n";
+ std::vector<MergeBucketCommand::Node> nodes;
+ for (int i = 0; i < _storageNodeCount; ++i) {
+ nodes.push_back(MergeBucketCommand::Node(indices[i], (i + executorNode) % 2 == 0));
+ //std::cout << indices[i] << " ";
+ }
+ //std::cout << "\n";
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(bid, nodes, UINT_MAX, 123));
+ cmd->setPriority(7);
+ cmd->setTimeout(54321);
+ StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ cmd->setAddress(address);
+ const uint16_t distributorIndex = 123;
+ cmd->setSourceIndex(distributorIndex); // Dummy distributor index that must be forwarded
+
+ StorageMessage::SP fwd = cmd;
+ StorageMessage::SP fwdToExec;
+
+ // TODO: make generic wrt. _storageNodeCount
+
+ for (int i = 0; i < _storageNodeCount - 1; ++i) {
+ if (i == executorNode) {
+ fwdToExec = fwd;
+ }
+ CPPUNIT_ASSERT_EQUAL(uint16_t(i), _servers[i]->getIndex());
+ // No matter the node order, command is always sent to node 0 -> 1 -> 2 etc
+ _topLinks[i]->sendDown(fwd);
+ _topLinks[i]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+
+ //std::cout << "fwd " << i << " -> " << i+1 << "\n";
+
+ // Forwarded merge should not be sent down. Should not be necessary
+ // to lock throttler here, since it should be sleeping like a champion
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _bottomLinks[i]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _topLinks[i]->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _throttlers[i]->getActiveMerges().size());
+
+ fwd = _topLinks[i]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ CPPUNIT_ASSERT_EQUAL(uint16_t(i + 1), fwd->getAddress()->getIndex());
+ CPPUNIT_ASSERT_EQUAL(distributorIndex, dynamic_cast<const StorageCommand&>(*fwd).getSourceIndex());
+ {
+ //uint16_t chain[] = { 0 };
+ std::vector<uint16_t> chain;
+ for (int j = 0; j <= i; ++j) {
+ chain.push_back(j);
+ }
+ CPPUNIT_ASSERT(checkChain(fwd, chain.begin(), chain.end()));
+ }
+ // Ensure priority, cluster state version and timeout is correctly forwarded
+ CPPUNIT_ASSERT_EQUAL(7, static_cast<int>(fwd->getPriority()));
+ CPPUNIT_ASSERT_EQUAL(uint32_t(123), dynamic_cast<const MergeBucketCommand&>(*fwd).getClusterStateVersion());
+ CPPUNIT_ASSERT_EQUAL(uint32_t(54321), dynamic_cast<const StorageCommand&>(*fwd).getTimeout());
+ }
+
+ _topLinks[lastNodeIdx]->sendDown(fwd);
+
+ // If node 2 is the first in the node list, it should immediately execute
+ // the merge. Otherwise, a cycle with the first node should be formed.
+ if (executorNode != lastNodeIdx) {
+ //std::cout << "cycle " << lastNodeIdx << " -> " << executorNode << "\n";
+ _topLinks[lastNodeIdx]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+ // Forwarded merge should not be sent down
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _bottomLinks[lastNodeIdx]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _topLinks[lastNodeIdx]->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _throttlers[lastNodeIdx]->getActiveMerges().size());
+
+ fwd = _topLinks[lastNodeIdx]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ CPPUNIT_ASSERT_EQUAL(uint16_t(executorNode), fwd->getAddress()->getIndex());
+ CPPUNIT_ASSERT_EQUAL(distributorIndex, dynamic_cast<const StorageCommand&>(*fwd).getSourceIndex());
+ {
+ std::vector<uint16_t> chain;
+ for (int j = 0; j < _storageNodeCount; ++j) {
+ chain.push_back(j);
+ }
+ CPPUNIT_ASSERT(checkChain(fwd, chain.begin(), chain.end()));
+ }
+ CPPUNIT_ASSERT_EQUAL(7, static_cast<int>(fwd->getPriority()));
+ CPPUNIT_ASSERT_EQUAL(uint32_t(123), dynamic_cast<const MergeBucketCommand&>(*fwd).getClusterStateVersion());
+ CPPUNIT_ASSERT_EQUAL(uint32_t(54321), dynamic_cast<const StorageCommand&>(*fwd).getTimeout());
+
+ _topLinks[executorNode]->sendDown(fwd);
+ }
+
+ _bottomLinks[executorNode]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+
+ // Forwarded merge has now been sent down to persistence layer
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _bottomLinks[executorNode]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _topLinks[executorNode]->getNumReplies()); // No reply sent yet
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _throttlers[executorNode]->getActiveMerges().size()); // no re-registering merge
+
+ if (executorNode != lastNodeIdx) {
+ // The MergeBucketCommand that is kept in the executor node should
+ // be the one from the node it initially got it from, NOT the one
+ // from the last node, since the chain has looped
+ CPPUNIT_ASSERT(_throttlers[executorNode]->getActiveMerges().find(bid)
+ != _throttlers[executorNode]->getActiveMerges().end());
+ CPPUNIT_ASSERT_EQUAL(static_cast<StorageMessage*>(fwdToExec.get()),
+ _throttlers[executorNode]->getActiveMerges().find(bid)->second.getMergeCmd().get());
+ }
+
+ // Send reply up from persistence layer to simulate a completed
+ // merge operation. Chain should now unwind properly
+ fwd = _bottomLinks[executorNode]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ CPPUNIT_ASSERT_EQUAL(7, static_cast<int>(fwd->getPriority()));
+ CPPUNIT_ASSERT_EQUAL(uint32_t(123), dynamic_cast<const MergeBucketCommand&>(*fwd).getClusterStateVersion());
+ CPPUNIT_ASSERT_EQUAL(uint32_t(54321), dynamic_cast<const StorageCommand&>(*fwd).getTimeout());
+
+ std::shared_ptr<MergeBucketReply> reply(
+ new MergeBucketReply(dynamic_cast<const MergeBucketCommand&>(*fwd)));
+ reply->setResult(ReturnCode(ReturnCode::OK, "Great success! :D-|-<"));
+ _bottomLinks[executorNode]->sendUp(reply);
+
+ _topLinks[executorNode]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+
+ if (executorNode != lastNodeIdx) {
+ // Merge should not be removed yet from executor, since it's pending an unwind
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _throttlers[executorNode]->getActiveMerges().size());
+ CPPUNIT_ASSERT_EQUAL(static_cast<StorageMessage*>(fwdToExec.get()),
+ _throttlers[executorNode]->getActiveMerges().find(bid)->second.getMergeCmd().get());
+ }
+ // MergeBucketReply waiting to be sent back to node 2. NOTE: we don't have any
+ // transport context stuff set up here to perform the reply mapping, so we
+ // have to emulate it
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _topLinks[executorNode]->getNumReplies());
+
+ StorageMessage::SP unwind = _topLinks[executorNode]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(uint16_t(executorNode), unwind->getAddress()->getIndex());
+
+ // eg: 0 -> 2 -> 1 -> 0. Or: 2 -> 1 -> 0 if no cycle
+ for (int i = (executorNode != lastNodeIdx ? _storageNodeCount - 1 : _storageNodeCount - 2); i >= 0; --i) {
+ //std::cout << "unwind " << i << "\n";
+
+ _topLinks[i]->sendDown(unwind);
+ _topLinks[i]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _bottomLinks[i]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _topLinks[i]->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _throttlers[i]->getActiveMerges().size());
+
+ unwind = _topLinks[i]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(uint16_t(i), unwind->getAddress()->getIndex());
+ }
+
+ const MergeBucketReply& mbr = dynamic_cast<const MergeBucketReply&>(*unwind);
+
+ CPPUNIT_ASSERT_EQUAL(ReturnCode::OK, mbr.getResult().getResult());
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("Great success! :D-|-<"), mbr.getResult().getMessage());
+ CPPUNIT_ASSERT_EQUAL(bid, mbr.getBucketId());
+
+ } while (std::next_permutation(indices, indices + _storageNodeCount));
+
+ //std::cout << "\n" << *_topLinks[0] << "\n";
+}
+
+void
+MergeThrottlerTest::testWithSourceOnlyNode()
+{
+ BucketId bid(14, 0x1337);
+
+ StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(2);
+ nodes.push_back(MergeBucketCommand::Node(1, true));
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(bid, nodes, UINT_MAX, 123));
+
+ cmd->setAddress(address);
+ _topLinks[0]->sendDown(cmd);
+
+ _topLinks[0]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+ StorageMessage::SP fwd = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ CPPUNIT_ASSERT_EQUAL(uint16_t(1), fwd->getAddress()->getIndex());
+
+ _topLinks[1]->sendDown(fwd);
+
+ _topLinks[1]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+ fwd = _topLinks[1]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ CPPUNIT_ASSERT_EQUAL(uint16_t(2), fwd->getAddress()->getIndex());
+
+ _topLinks[2]->sendDown(fwd);
+
+ _topLinks[2]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+ fwd = _topLinks[2]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ CPPUNIT_ASSERT_EQUAL(uint16_t(0), fwd->getAddress()->getIndex());
+
+ _topLinks[0]->sendDown(fwd);
+ _bottomLinks[0]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+ _bottomLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ std::shared_ptr<MergeBucketReply> reply(
+ new MergeBucketReply(dynamic_cast<const MergeBucketCommand&>(*fwd)));
+ reply->setResult(ReturnCode(ReturnCode::OK, "Great success! :D-|-<"));
+ _bottomLinks[0]->sendUp(reply);
+
+ _topLinks[0]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+ fwd = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(uint16_t(0), fwd->getAddress()->getIndex());
+
+ // Assume everything's fine from here on out
+}
+
+// 4.2 distributors don't guarantee they'll send to lowest node
+// index, so we must detect such situations and execute the merge
+// immediately rather than attempt to chain it. Test that this
+// is done correctly.
+void
+MergeThrottlerTest::test42DistributorBehavior()
+{
+ BucketId bid(32, 0xfeef00);
+
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(bid, nodes, 1234));
+
+ // Send to node 1, which is not the lowest index
+ StorageMessageAddress address("storage", lib::NodeType::STORAGE, 1);
+
+ cmd->setAddress(address);
+ _topLinks[1]->sendDown(cmd);
+ _bottomLinks[1]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+
+ // Should now have been sent to persistence layer
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _bottomLinks[1]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _topLinks[1]->getNumReplies()); // No reply sent yet
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _throttlers[1]->getActiveMerges().size());
+
+ // Send reply up from persistence layer to simulate a completed
+ // merge operation. Merge should be removed from state.
+ _bottomLinks[1]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ std::shared_ptr<MergeBucketReply> reply(
+ new MergeBucketReply(dynamic_cast<const MergeBucketCommand&>(*cmd)));
+ reply->setResult(ReturnCode(ReturnCode::OK, "Tonight we dine on turtle soup!"));
+ _bottomLinks[1]->sendUp(reply);
+ _topLinks[1]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _bottomLinks[1]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _topLinks[1]->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _throttlers[1]->getActiveMerges().size());
+
+ CPPUNIT_ASSERT_EQUAL(uint64_t(1), _throttlers[1]->getMetrics().local.ok.getValue());
+}
+
+// Test that we don't take ownership of the merge command when we're
+// just passing it through to the persistence layer when receiving
+// a merge command that presumably comes form a 4.2 distributor
+void
+MergeThrottlerTest::test42DistributorBehaviorDoesNotTakeOwnership()
+{
+ BucketId bid(32, 0xfeef00);
+
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(bid, nodes, 1234));
+
+ // Send to node 1, which is not the lowest index
+ StorageMessageAddress address("storage", lib::NodeType::STORAGE, 1);
+
+ cmd->setAddress(address);
+ _topLinks[1]->sendDown(cmd);
+ _bottomLinks[1]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+
+ // Should now have been sent to persistence layer
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _bottomLinks[1]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _topLinks[1]->getNumReplies()); // No reply sent yet
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _throttlers[1]->getActiveMerges().size());
+
+ _bottomLinks[1]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+
+ // To ensure we don't try to deref any non-owned messages
+ framework::HttpUrlPath path("?xml");
+ std::ostringstream ss;
+ _throttlers[1]->reportStatus(ss, path);
+
+ // Flush throttler (synchronously). Should NOT generate a reply
+ // for the merge command, as it is not owned by the throttler
+ StorageLinkTest::callOnFlush(*_throttlers[1], true);
+
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _bottomLinks[1]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _topLinks[1]->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _throttlers[1]->getActiveMerges().size());
+
+ // Send a belated reply from persistence up just to ensure the
+ // throttler doesn't throw a fit if it receives an unknown merge
+ std::shared_ptr<MergeBucketReply> reply(
+ new MergeBucketReply(dynamic_cast<const MergeBucketCommand&>(*cmd)));
+ reply->setResult(ReturnCode(ReturnCode::OK, "Tonight we dine on turtle soup!"));
+ _bottomLinks[1]->sendUp(reply);
+ _topLinks[1]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _bottomLinks[1]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _topLinks[1]->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _throttlers[1]->getActiveMerges().size());
+}
+
+// Test that we don't take ownership of the merge command when we're
+// just passing it through to the persistence layer when we're at the
+// the end of the chain and also the designated executor
+void
+MergeThrottlerTest::testEndOfChainExecutionDoesNotTakeOwnership()
+{
+ BucketId bid(32, 0xfeef00);
+
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(2);
+ nodes.push_back(1);
+ nodes.push_back(0);
+ std::vector<uint16_t> chain;
+ chain.push_back(0);
+ chain.push_back(1);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(bid, nodes, 1234, 1, chain));
+
+ // Send to last node, which is not the lowest index
+ StorageMessageAddress address("storage", lib::NodeType::STORAGE, 3);
+
+ cmd->setAddress(address);
+ _topLinks[2]->sendDown(cmd);
+ _bottomLinks[2]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+
+ // Should now have been sent to persistence layer
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _bottomLinks[2]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _topLinks[2]->getNumReplies()); // No reply sent yet
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _throttlers[2]->getActiveMerges().size());
+
+ _bottomLinks[2]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+
+ // To ensure we don't try to deref any non-owned messages
+ framework::HttpUrlPath path("");
+ std::ostringstream ss;
+ _throttlers[2]->reportStatus(ss, path);
+
+ // Flush throttler (synchronously). Should NOT generate a reply
+ // for the merge command, as it is not owned by the throttler
+ StorageLinkTest::callOnFlush(*_throttlers[2], true);
+
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _bottomLinks[2]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _topLinks[2]->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _throttlers[2]->getActiveMerges().size());
+
+ // Send a belated reply from persistence up just to ensure the
+ // throttler doesn't throw a fit if it receives an unknown merge
+ std::shared_ptr<MergeBucketReply> reply(
+ new MergeBucketReply(dynamic_cast<const MergeBucketCommand&>(*cmd)));
+ reply->setResult(ReturnCode(ReturnCode::OK, "Tonight we dine on turtle soup!"));
+ _bottomLinks[2]->sendUp(reply);
+ _topLinks[2]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _bottomLinks[2]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _topLinks[2]->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _throttlers[2]->getActiveMerges().size());
+}
+
+// Test that nodes resending a merge command won't lead to duplicate
+// state registration/forwarding or erasing the already present state
+// information.
+void
+MergeThrottlerTest::testResendHandling()
+{
+ BucketId bid(32, 0xbadbed);
+
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(bid, nodes, 1234));
+
+ StorageMessageAddress address("storage", lib::NodeType::STORAGE, 1);
+
+ cmd->setAddress(address);
+ _topLinks[0]->sendDown(cmd);
+ _topLinks[0]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+
+ StorageMessage::SP fwd = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+
+ // Resend from "distributor". Just use same message, as that won't matter here
+ _topLinks[0]->sendDown(cmd);
+ _topLinks[0]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+
+ // Reply should be BUSY
+ StorageMessage::SP reply = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<MergeBucketReply&>(*reply).getResult().getResult(),
+ ReturnCode::BUSY);
+
+ _topLinks[1]->sendDown(fwd);
+ _topLinks[1]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+ fwd = _topLinks[1]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+
+ _topLinks[2]->sendDown(fwd);
+ _topLinks[2]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+ _topLinks[2]->sendDown(fwd);
+ _topLinks[2]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+
+ // Reply should be BUSY
+ reply = _topLinks[2]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<MergeBucketReply&>(*reply).getResult().getResult(),
+ ReturnCode::BUSY);
+
+ fwd = _topLinks[2]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+
+ _topLinks[0]->sendDown(fwd);
+ _bottomLinks[0]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+ _topLinks[0]->sendDown(fwd);
+ _topLinks[0]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+
+ reply = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<MergeBucketReply&>(*reply).getResult().getResult(),
+ ReturnCode::BUSY);
+}
+
+void
+MergeThrottlerTest::testPriorityQueuing()
+{
+ // Fill up all active merges
+ std::size_t maxPending = _throttlers[0]->getThrottlePolicy().getMaxPendingCount();
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ CPPUNIT_ASSERT(maxPending >= 4u);
+ for (std::size_t i = 0; i < maxPending; ++i) {
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xf00baa00 + i), nodes, 1234));
+ cmd->setPriority(100);
+ _topLinks[0]->sendDown(cmd);
+ }
+
+ // Wait till we have maxPending replies and 0 queued
+ _topLinks[0]->waitForMessages(maxPending, 5);
+ waitUntilMergeQueueIs(*_throttlers[0], 0, _messageWaitTime);
+
+ // Queue up some merges with different priorities
+ int priorities[4] = { 200, 150, 120, 240 };
+ int sortedPris[4] = { 120, 150, 200, 240 };
+ for (int i = 0; i < 4; ++i) {
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, i), nodes, 1234));
+ cmd->setPriority(priorities[i]);
+ _topLinks[0]->sendDown(cmd);
+ }
+
+ waitUntilMergeQueueIs(*_throttlers[0], 4, _messageWaitTime);
+
+ // Remove all but 4 forwarded merges
+ for (std::size_t i = 0; i < maxPending - 4; ++i) {
+ _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ }
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _topLinks[0]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(4), _topLinks[0]->getNumReplies());
+
+ // Now when we start replying to merges, queued merges should be
+ // processed in priority order
+ for (int i = 0; i < 4; ++i) {
+ StorageMessage::SP replyTo = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ std::shared_ptr<MergeBucketReply> reply(
+ new MergeBucketReply(dynamic_cast<const MergeBucketCommand&>(*replyTo)));
+ reply->setResult(ReturnCode(ReturnCode::OK, "whee"));
+ _topLinks[0]->sendDown(reply);
+ }
+
+ _topLinks[0]->waitForMessages(8, _messageWaitTime); // 4 merges, 4 replies
+ waitUntilMergeQueueIs(*_throttlers[0], 0, _messageWaitTime);
+
+ for (int i = 0; i < 4; ++i) {
+ StorageMessage::SP cmd = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ CPPUNIT_ASSERT_EQUAL(uint8_t(sortedPris[i]), cmd->getPriority());
+ }
+}
+
+// Test that we can detect and reject merges that due to resending
+// and potential priority queue sneaking etc may end up with duplicates
+// in the queue for a merge that is already known.
+void
+MergeThrottlerTest::testCommandInQueueDuplicateOfKnownMerge()
+{
+ // Fill up all active merges and 1 queued one
+ std::size_t maxPending = _throttlers[0]->getThrottlePolicy().getMaxPendingCount();
+ CPPUNIT_ASSERT(maxPending < 100);
+ for (std::size_t i = 0; i < maxPending + 1; ++i) {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(2 + i);
+ nodes.push_back(5 + i);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xf00baa00 + i), nodes, 1234));
+ cmd->setPriority(100 - i);
+ _topLinks[0]->sendDown(cmd);
+ }
+
+ // Wait till we have maxPending replies and 3 queued
+ _topLinks[0]->waitForMessages(maxPending, _messageWaitTime);
+ waitUntilMergeQueueIs(*_throttlers[0], 1, _messageWaitTime);
+
+ // Add a merge for the same bucket twice to the queue
+ {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(12);
+ nodes.push_back(123);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xf000feee), nodes, 1234));
+ _topLinks[0]->sendDown(cmd);
+ }
+ {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(124); // Different node set doesn't matter
+ nodes.push_back(14);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xf000feee), nodes, 1234));
+ _topLinks[0]->sendDown(cmd);
+ }
+
+ waitUntilMergeQueueIs(*_throttlers[0], 3, _messageWaitTime);
+
+ StorageMessage::SP fwd = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+
+ // Remove and success-reply for 2 merges. This will give enough room
+ // for the 2 first queued merges to be processed, the last one having a
+ // duplicate in the queue.
+ for (int i = 0; i < 2; ++i) {
+ StorageMessage::SP fwd2 = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ std::shared_ptr<MergeBucketReply> reply(
+ new MergeBucketReply(dynamic_cast<const MergeBucketCommand&>(*fwd2)));
+ reply->setResult(ReturnCode(ReturnCode::OK, ""));
+ _topLinks[0]->sendDown(reply);
+ }
+
+ _topLinks[0]->waitForMessages(maxPending + 1, _messageWaitTime);
+ waitUntilMergeQueueIs(*_throttlers[0], 1, _messageWaitTime);
+
+ // Remove all current merge commands/replies so we can work with a clean slate
+ _topLinks[0]->getRepliesOnce();
+ // Send a success-reply for fwd, allowing the duplicate from the queue
+ // to have its moment to shine only to then be struck down mercilessly
+ std::shared_ptr<MergeBucketReply> reply(
+ new MergeBucketReply(dynamic_cast<const MergeBucketCommand&>(*fwd)));
+ reply->setResult(ReturnCode(ReturnCode::OK, ""));
+ _topLinks[0]->sendDown(reply);
+
+ _topLinks[0]->waitForMessages(2, _messageWaitTime);
+ waitUntilMergeQueueIs(*_throttlers[0], 0, _messageWaitTime);
+
+ // First reply is the successful merge reply
+ StorageMessage::SP reply2 = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<MergeBucketReply&>(*reply2).getResult().getResult(),
+ ReturnCode::OK);
+
+ // Second reply should be the BUSY-rejected duplicate
+ StorageMessage::SP reply1 = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<MergeBucketReply&>(*reply1).getResult().getResult(),
+ ReturnCode::BUSY);
+ CPPUNIT_ASSERT(static_cast<MergeBucketReply&>(*reply1).getResult()
+ .getMessage().find("out of date;") != std::string::npos);
+}
+
+// Test that sending a merge command to a node not in the set of
+// to-be-merged nodes is handled gracefully.
+// This is not a scenario that should ever actually happen, but for
+// the sake of robustness, include it anyway.
+void
+MergeThrottlerTest::testInvalidReceiverNode()
+{
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(1);
+ nodes.push_back(5);
+ nodes.push_back(9);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xf00baaaa), nodes, 1234));
+
+ // Send to node with index 0
+ _topLinks[0]->sendDown(cmd);
+ _topLinks[0]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+
+ StorageMessage::SP reply = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<MergeBucketReply&>(*reply).getResult().getResult(),
+ ReturnCode::REJECTED);
+ CPPUNIT_ASSERT(static_cast<MergeBucketReply&>(*reply).getResult()
+ .getMessage().find("which is not in its forwarding chain") != std::string::npos);
+}
+
+// Test that the throttling policy kicks in after a certain number of
+// merges are forwarded and that the rest are queued in a prioritized
+// order.
+void
+MergeThrottlerTest::testForwardQueuedMerge()
+{
+ // Fill up all active merges and then 3 queued ones
+ std::size_t maxPending = _throttlers[0]->getThrottlePolicy().getMaxPendingCount();
+ CPPUNIT_ASSERT(maxPending < 100);
+ for (std::size_t i = 0; i < maxPending + 3; ++i) {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(2 + i);
+ nodes.push_back(5 + i);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xf00baa00 + i), nodes, 1234));
+ cmd->setPriority(100 - i);
+ _topLinks[0]->sendDown(cmd);
+ }
+
+ // Wait till we have maxPending replies and 3 queued
+ _topLinks[0]->waitForMessages(maxPending, _messageWaitTime);
+ waitUntilMergeQueueIs(*_throttlers[0], 3, _messageWaitTime);
+
+ // Merge queue state should not be touched by worker thread now
+ StorageMessage::SP nextMerge = _throttlers[0]->getMergeQueue().begin()->_msg;
+
+ StorageMessage::SP fwd = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+
+ // Remove all the rest of the active merges
+ while (!_topLinks[0]->getReplies().empty()) {
+ _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ }
+
+ std::shared_ptr<MergeBucketReply> reply(
+ new MergeBucketReply(dynamic_cast<const MergeBucketCommand&>(*fwd)));
+ reply->setResult(ReturnCode(ReturnCode::OK, "Celebrate good times come on"));
+ _topLinks[0]->sendDown(reply);
+ _topLinks[0]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime); // Success rewind reply
+
+ // Remove reply bound for distributor
+ StorageMessage::SP distReply = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<MergeBucketReply&>(*distReply).getResult().getResult(),
+ ReturnCode::OK);
+
+ waitUntilMergeQueueIs(*_throttlers[0], 2, _messageWaitTime);
+ _topLinks[0]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _topLinks[0]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), _topLinks[0]->getNumReplies());
+
+ // First queued merge should now have been registered and forwarded
+ fwd = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<const MergeBucketCommand&>(*fwd).getBucketId(),
+ static_cast<const MergeBucketCommand&>(*nextMerge).getBucketId());
+
+ CPPUNIT_ASSERT(
+ static_cast<const MergeBucketCommand&>(*fwd).getNodes()
+ == static_cast<const MergeBucketCommand&>(*nextMerge).getNodes());
+
+ // Ensure forwarded merge has a higher priority than the next queued one
+ CPPUNIT_ASSERT(fwd->getPriority() < _throttlers[0]->getMergeQueue().begin()->_msg->getPriority());
+
+ CPPUNIT_ASSERT_EQUAL(uint64_t(1), _throttlers[0]->getMetrics().chaining.ok.getValue());
+
+ /*framework::HttpUrlPath path("?xml");
+ _forwarders[0]->reportStatus(std::cerr, path);*/
+}
+
+void
+MergeThrottlerTest::testExecuteQueuedMerge()
+{
+ MergeThrottler& throttler(*_throttlers[1]);
+ DummyStorageLink& topLink(*_topLinks[1]);
+ DummyStorageLink& bottomLink(*_bottomLinks[1]);
+
+ // Fill up all active merges and then 3 queued ones
+ std::size_t maxPending = throttler.getThrottlePolicy().getMaxPendingCount();
+ CPPUNIT_ASSERT(maxPending < 100);
+ for (std::size_t i = 0; i < maxPending + 3; ++i) {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(1);
+ nodes.push_back(5 + i);
+ nodes.push_back(7 + i);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xf00baa00 + i), nodes, 1234, 1));
+ cmd->setPriority(250 - i + 5);
+ topLink.sendDown(cmd);
+ }
+
+ // Wait till we have maxPending replies and 3 queued
+ topLink.waitForMessages(maxPending, _messageWaitTime);
+ waitUntilMergeQueueIs(throttler, 3, _messageWaitTime);
+
+ // Sneak in a higher priority message that is bound to be executed
+ // on the given node
+ {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(1);
+ nodes.push_back(0);
+ std::vector<uint16_t> chain;
+ chain.push_back(0);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0x1337), nodes, 1234, 1, chain));
+ cmd->setPriority(0);
+ topLink.sendDown(cmd);
+ }
+
+ waitUntilMergeQueueIs(throttler, 4, _messageWaitTime);
+
+ // Merge queue state should not be touched by worker thread now
+ StorageMessage::SP nextMerge(throttler.getMergeQueue().begin()->_msg);
+ /*StorageMessage::SP nextMerge;
+ {
+ vespalib::LockGuard lock(_throttlers[0]->getStateLock());
+ // Dirty: have to check internal state
+ nextMerge = _throttlers[0]->getMergeQueue().begin()->_msg;
+ }*/
+
+ CPPUNIT_ASSERT_EQUAL(
+ BucketId(32, 0x1337),
+ dynamic_cast<const MergeBucketCommand&>(*nextMerge).getBucketId());
+
+ StorageMessage::SP fwd(topLink.getAndRemoveMessage(MessageType::MERGEBUCKET));
+
+ // Remove all the rest of the active merges
+ while (!topLink.getReplies().empty()) {
+ topLink.getAndRemoveMessage(MessageType::MERGEBUCKET);
+ }
+
+ // Free up a merge slot
+ std::shared_ptr<MergeBucketReply> reply(
+ new MergeBucketReply(dynamic_cast<const MergeBucketCommand&>(*fwd)));
+ reply->setResult(ReturnCode(ReturnCode::OK, "Celebrate good times come on"));
+ topLink.sendDown(reply);
+
+ topLink.waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+ // Remove chain reply
+ StorageMessage::SP distReply(topLink.getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY));
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<MergeBucketReply&>(*distReply).getResult().getResult(),
+ ReturnCode::OK);
+
+ waitUntilMergeQueueIs(throttler, 3, _messageWaitTime);
+ bottomLink.waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), topLink.getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), topLink.getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(1), bottomLink.getNumCommands());
+
+ // First queued merge should now have been registered and sent down
+ StorageMessage::SP cmd(bottomLink.getAndRemoveMessage(MessageType::MERGEBUCKET));
+
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<const MergeBucketCommand&>(*cmd).getBucketId(),
+ static_cast<const MergeBucketCommand&>(*nextMerge).getBucketId());
+
+ CPPUNIT_ASSERT(
+ static_cast<const MergeBucketCommand&>(*cmd).getNodes()
+ == static_cast<const MergeBucketCommand&>(*nextMerge).getNodes());
+}
+
+void
+MergeThrottlerTest::testFlush()
+{
+ // Fill up all active merges and then 3 queued ones
+ std::size_t maxPending = _throttlers[0]->getThrottlePolicy().getMaxPendingCount();
+ CPPUNIT_ASSERT(maxPending < 100);
+ for (std::size_t i = 0; i < maxPending + 3; ++i) {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xf00baa00 + i), nodes, 1234, 1));
+ _topLinks[0]->sendDown(cmd);
+ }
+
+ // Wait till we have maxPending replies and 3 queued
+ _topLinks[0]->waitForMessages(maxPending, _messageWaitTime);
+ waitUntilMergeQueueIs(*_throttlers[0], 3, _messageWaitTime);
+
+ // Remove all forwarded commands
+ uint32_t removed = _topLinks[0]->getRepliesOnce().size();
+ CPPUNIT_ASSERT(removed >= 5);
+
+ // Flush the storage link, triggering an abort of all commands
+ // no matter what their current state is.
+ _topLinks[0]->close();
+ _topLinks[0]->flush();
+ _topLinks[0]->waitForMessages(maxPending + 3 - removed, _messageWaitTime);
+
+ while (!_topLinks[0]->getReplies().empty()) {
+ StorageMessage::SP reply = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(
+ ReturnCode::ABORTED,
+ static_cast<const MergeBucketReply&>(*reply).getResult().getResult());
+ }
+ // NOTE: merges that have been immediately executed (i.e. not cycled)
+ // on the node should _not_ be replied to, since they're not owned
+ // by the throttler at that point in time
+}
+
+// If a node goes down and another node has a merge chained through it in
+// its queue, the original node can receive a final chain hop forwarding
+// it knows nothing about when it comes back up. If this is not handled
+// properly, it will attempt to forward this node again with a bogus
+// index. This should be implicitly handled by checking for a full node
+void
+MergeThrottlerTest::testUnseenMergeWithNodeInChain()
+{
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(5);
+ nodes.push_back(9);
+ std::vector<uint16_t> chain;
+ chain.push_back(0);
+ chain.push_back(5);
+ chain.push_back(9);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xdeadbeef), nodes, 1234, 1, chain));
+
+ StorageMessageAddress address("storage", lib::NodeType::STORAGE, 9);
+
+ cmd->setAddress(address);
+ _topLinks[0]->sendDown(cmd);
+
+ // First, test that we get rejected when processing merge immediately
+ // Should get a rejection in return
+ _topLinks[0]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+ StorageMessage::SP reply = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(
+ ReturnCode::REJECTED,
+ dynamic_cast<const MergeBucketReply&>(*reply).getResult().getResult());
+
+ // Second, test that we get rejected before queueing up. This is to
+ // avoid a hypothetical deadlock scenario.
+ // Fill up all active merges
+ {
+
+ std::size_t maxPending(
+ _throttlers[0]->getThrottlePolicy().getMaxPendingCount());
+ for (std::size_t i = 0; i < maxPending; ++i) {
+ std::shared_ptr<MergeBucketCommand> fillCmd(
+ new MergeBucketCommand(BucketId(32, 0xf00baa00 + i), nodes, 1234));
+ _topLinks[0]->sendDown(fillCmd);
+ }
+ }
+
+ _topLinks[0]->sendDown(cmd);
+
+ _topLinks[0]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+ reply = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(
+ ReturnCode::REJECTED,
+ dynamic_cast<const MergeBucketReply&>(*reply).getResult().getResult());
+}
+
+void
+MergeThrottlerTest::testMergeWithNewerClusterStateFlushesOutdatedQueued()
+{
+ // Fill up all active merges and then 3 queued ones with the same
+ // system state
+ std::size_t maxPending = _throttlers[0]->getThrottlePolicy().getMaxPendingCount();
+ CPPUNIT_ASSERT(maxPending < 100);
+ std::vector<api::StorageMessage::Id> ids;
+ for (std::size_t i = 0; i < maxPending + 3; ++i) {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xf00baa00 + i), nodes, 1234, 1));
+ ids.push_back(cmd->getMsgId());
+ _topLinks[0]->sendDown(cmd);
+ }
+
+ // Wait till we have maxPending replies and 3 queued
+ _topLinks[0]->waitForMessages(maxPending, _messageWaitTime);
+ waitUntilMergeQueueIs(*_throttlers[0], 3, _messageWaitTime);
+
+ // Send down merge with newer system state
+ {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0x12345678), nodes, 1234, 2));
+ ids.push_back(cmd->getMsgId());
+ _topLinks[0]->sendDown(cmd);
+ }
+
+ // Queue should now be flushed with all messages being returned with
+ // WRONG_DISTRIBUTION
+ _topLinks[0]->waitForMessages(maxPending + 3, _messageWaitTime);
+ waitUntilMergeQueueIs(*_throttlers[0], 1, _messageWaitTime);
+
+ for (int i = 0; i < 3; ++i) {
+ StorageMessage::SP reply = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<MergeBucketReply&>(*reply).getResult().getResult(),
+ ReturnCode::WRONG_DISTRIBUTION);
+ CPPUNIT_ASSERT_EQUAL(1u, static_cast<MergeBucketReply&>(*reply).getClusterStateVersion());
+ CPPUNIT_ASSERT_EQUAL(ids[maxPending + i], reply->getMsgId());
+ }
+
+ CPPUNIT_ASSERT_EQUAL(uint64_t(3), _throttlers[0]->getMetrics().chaining.failures.wrongdistribution.getValue());
+}
+
+void
+MergeThrottlerTest::testUpdatedClusterStateFlushesOutdatedQueued()
+{
+ // State is version 1. Send down several merges with state version 2.
+ std::size_t maxPending = _throttlers[0]->getThrottlePolicy().getMaxPendingCount();
+ CPPUNIT_ASSERT(maxPending < 100);
+ std::vector<api::StorageMessage::Id> ids;
+ for (std::size_t i = 0; i < maxPending + 3; ++i) {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xf00baa00 + i), nodes, 1234, 2));
+ ids.push_back(cmd->getMsgId());
+ _topLinks[0]->sendDown(cmd);
+ }
+
+ // Wait till we have maxPending replies and 4 queued
+ _topLinks[0]->waitForMessages(maxPending, _messageWaitTime);
+ waitUntilMergeQueueIs(*_throttlers[0], 3, _messageWaitTime);
+
+ // Send down new system state (also set it explicitly)
+ _servers[0]->setClusterState(lib::ClusterState("distributor:100 storage:100 version:3"));
+ std::shared_ptr<api::SetSystemStateCommand> stateCmd(
+ new api::SetSystemStateCommand(lib::ClusterState("distributor:100 storage:100 version:3")));
+ _topLinks[0]->sendDown(stateCmd);
+
+ // Queue should now be flushed with all being replied to with WRONG_DISTRIBUTION
+ waitUntilMergeQueueIs(*_throttlers[0], 0, _messageWaitTime);
+ _topLinks[0]->waitForMessages(maxPending + 3, 5);
+
+ for (int i = 0; i < 3; ++i) {
+ StorageMessage::SP reply = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<MergeBucketReply&>(*reply).getResult().getResult(),
+ ReturnCode::WRONG_DISTRIBUTION);
+ CPPUNIT_ASSERT_EQUAL(2u, static_cast<MergeBucketReply&>(*reply).getClusterStateVersion());
+ CPPUNIT_ASSERT_EQUAL(ids[maxPending + i], reply->getMsgId());
+ }
+
+ CPPUNIT_ASSERT_EQUAL(uint64_t(3), _throttlers[0]->getMetrics().chaining.failures.wrongdistribution.getValue());
+}
+
+void
+MergeThrottlerTest::test42MergesDoNotTriggerFlush()
+{
+ // Fill up all active merges and then 1 queued one
+ std::size_t maxPending = _throttlers[0]->getThrottlePolicy().getMaxPendingCount();
+ CPPUNIT_ASSERT(maxPending < 100);
+ for (std::size_t i = 0; i < maxPending + 1; ++i) {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xf00baa00 + i), nodes, 1234, 1));
+ _topLinks[0]->sendDown(cmd);
+ }
+
+ // Wait till we have maxPending replies and 1 queued
+ _topLinks[0]->waitForMessages(maxPending, _messageWaitTime);
+ waitUntilMergeQueueIs(*_throttlers[0], 1, _messageWaitTime);
+
+ StorageMessage::SP fwd = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+
+ // Remove all the rest of the active merges
+ while (!_topLinks[0]->getReplies().empty()) {
+ _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ }
+
+ // Send down a merge with a cluster state version of 0, which should
+ // be ignored and queued as usual
+ {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xbaaadbed), nodes, 1234, 0));
+ _topLinks[0]->sendDown(cmd);
+ }
+
+ waitUntilMergeQueueIs(*_throttlers[0], 2, _messageWaitTime);
+
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _topLinks[0]->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _topLinks[0]->getNumReplies());
+
+ CPPUNIT_ASSERT_EQUAL(uint64_t(0), _throttlers[0]->getMetrics().local.failures.wrongdistribution.getValue());
+}
+
+// Test that a merge that arrive with a state version that is less than
+// that of the node is rejected immediately
+void
+MergeThrottlerTest::testOutdatedClusterStateMergesAreRejectedOnArrival()
+{
+ _servers[0]->setClusterState(lib::ClusterState("distributor:100 storage:100 version:10"));
+
+ // Send down a merge with a cluster state version of 9, which should
+ // be rejected
+ {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xfeef00), nodes, 1234, 9));
+ _topLinks[0]->sendDown(cmd);
+ }
+
+ _topLinks[0]->waitForMessages(1, _messageWaitTime);
+
+ StorageMessage::SP reply = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(
+ static_cast<MergeBucketReply&>(*reply).getResult().getResult(),
+ ReturnCode::WRONG_DISTRIBUTION);
+
+ CPPUNIT_ASSERT_EQUAL(uint64_t(1), _throttlers[0]->getMetrics().chaining.failures.wrongdistribution.getValue());
+}
+
+// Test erroneous case where node receives merge where the merge does
+// not exist in the state, but it exists in the chain without the chain
+// being full. This is something that shouldn't happen, but must still
+// not crash the node
+void
+MergeThrottlerTest::testUnknownMergeWithSelfInChain()
+{
+ BucketId bid(32, 0xbadbed);
+
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ std::vector<uint16_t> chain;
+ chain.push_back(0);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(bid, nodes, 1234, 1, chain));
+
+ StorageMessageAddress address("storage", lib::NodeType::STORAGE, 1);
+
+ cmd->setAddress(address);
+ _topLinks[0]->sendDown(cmd);
+ _topLinks[0]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+
+ StorageMessage::SP reply = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+
+ CPPUNIT_ASSERT_EQUAL(
+ ReturnCode::REJECTED,
+ static_cast<MergeBucketReply&>(*reply).getResult().getResult());
+}
+
+void
+MergeThrottlerTest::testBusyReturnedOnFullQueue()
+{
+ std::size_t maxPending = _throttlers[0]->getThrottlePolicy().getMaxPendingCount();
+ std::size_t maxQueue = _throttlers[0]->getMaxQueueSize();
+ CPPUNIT_ASSERT(maxPending < 100);
+ for (std::size_t i = 0; i < maxPending + maxQueue; ++i) {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xf00000 + i), nodes, 1234, 1));
+ _topLinks[0]->sendDown(cmd);
+ }
+
+ // Wait till we have maxPending replies and maxQueue queued
+ _topLinks[0]->waitForMessages(maxPending, _messageWaitTime);
+ waitUntilMergeQueueIs(*_throttlers[0], maxQueue, _messageWaitTime);
+
+ // Clear all forwarded merges
+ _topLinks[0]->getRepliesOnce();
+ // Send down another merge which should be immediately busy-returned
+ {
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(0);
+ nodes.push_back(1);
+ nodes.push_back(2);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xf000baaa), nodes, 1234, 1));
+ _topLinks[0]->sendDown(cmd);
+ }
+ _topLinks[0]->waitForMessage(MessageType::MERGEBUCKET_REPLY, _messageWaitTime);
+ StorageMessage::SP reply = _topLinks[0]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+
+ CPPUNIT_ASSERT_EQUAL(
+ BucketId(32, 0xf000baaa),
+ static_cast<MergeBucketReply&>(*reply).getBucketId());
+
+ CPPUNIT_ASSERT_EQUAL(
+ ReturnCode::BUSY,
+ static_cast<MergeBucketReply&>(*reply).getResult().getResult());
+
+ CPPUNIT_ASSERT_EQUAL(uint64_t(0),
+ _throttlers[0]->getMetrics().chaining
+ .failures.busy.getValue());
+ CPPUNIT_ASSERT_EQUAL(uint64_t(1),
+ _throttlers[0]->getMetrics().local
+ .failures.busy.getValue());
+}
+
+void
+MergeThrottlerTest::testBrokenCycle()
+{
+ std::vector<MergeBucketCommand::Node> nodes;
+ nodes.push_back(1);
+ nodes.push_back(0);
+ nodes.push_back(2);
+ {
+ std::vector<uint16_t> chain;
+ chain.push_back(0);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xfeef00), nodes, 1234, 1, chain));
+ _topLinks[1]->sendDown(cmd);
+ }
+
+ _topLinks[1]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+ StorageMessage::SP fwd = _topLinks[1]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ CPPUNIT_ASSERT_EQUAL(uint16_t(2), fwd->getAddress()->getIndex());
+
+ // Send cycled merge which will be executed
+ {
+ std::vector<uint16_t> chain;
+ chain.push_back(0);
+ chain.push_back(1);
+ chain.push_back(2);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xfeef00), nodes, 1234, 1, chain));
+ _topLinks[1]->sendDown(cmd);
+ }
+
+ _bottomLinks[1]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+ StorageMessage::SP cycled = _bottomLinks[1]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+
+ // Now, node 2 goes down, auto sending back a failed merge
+ std::shared_ptr<MergeBucketReply> nodeDownReply(
+ new MergeBucketReply(dynamic_cast<const MergeBucketCommand&>(*fwd)));
+ nodeDownReply->setResult(ReturnCode(ReturnCode::NOT_CONNECTED, "Node went sightseeing"));
+
+ _topLinks[1]->sendDown(nodeDownReply);
+ // Merge reply also arrives from persistence
+ std::shared_ptr<MergeBucketReply> persistenceReply(
+ new MergeBucketReply(dynamic_cast<const MergeBucketCommand&>(*cycled)));
+ persistenceReply->setResult(ReturnCode(ReturnCode::ABORTED, "Oh dear"));
+ _bottomLinks[1]->sendUp(persistenceReply);
+
+ // Should now be two replies from node 1, one to node 2 and one to node 0
+ // since we must handle broken chains
+ _topLinks[1]->waitForMessages(2, _messageWaitTime);
+ // Unwind reply shares the result of the persistence reply
+ for (int i = 0; i < 2; ++i) {
+ StorageMessage::SP reply = _topLinks[1]->getAndRemoveMessage(MessageType::MERGEBUCKET_REPLY);
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode(ReturnCode::ABORTED, "Oh dear"),
+ static_cast<MergeBucketReply&>(*reply).getResult());
+ }
+
+ // Make sure it has been removed from the internal state so we can
+ // send new merges for the bucket
+ {
+ std::vector<uint16_t> chain;
+ chain.push_back(0);
+ std::shared_ptr<MergeBucketCommand> cmd(
+ new MergeBucketCommand(BucketId(32, 0xfeef00), nodes, 1234, 1, chain));
+ _topLinks[1]->sendDown(cmd);
+ }
+
+ _topLinks[1]->waitForMessage(MessageType::MERGEBUCKET, 5);
+ fwd = _topLinks[1]->getAndRemoveMessage(MessageType::MERGEBUCKET);
+ CPPUNIT_ASSERT_EQUAL(uint16_t(2), fwd->getAddress()->getIndex());
+}
+
+void
+MergeThrottlerTest::sendAndExpectReply(
+ const std::shared_ptr<api::StorageMessage>& msg,
+ const api::MessageType& expectedReplyType,
+ api::ReturnCode::Result expectedResultCode)
+{
+ _topLinks[0]->sendDown(msg);
+ _topLinks[0]->waitForMessage(expectedReplyType, _messageWaitTime);
+ StorageMessage::SP reply(_topLinks[0]->getAndRemoveMessage(
+ expectedReplyType));
+ api::StorageReply& storageReply(
+ dynamic_cast<api::StorageReply&>(*reply));
+ CPPUNIT_ASSERT_EQUAL(expectedResultCode,
+ storageReply.getResult().getResult());
+}
+
+void
+MergeThrottlerTest::testGetBucketDiffCommandNotInActiveSetIsRejected()
+{
+ document::BucketId bucket(16, 1234);
+ std::vector<api::GetBucketDiffCommand::Node> nodes;
+ std::shared_ptr<api::GetBucketDiffCommand> getDiffCmd(
+ new api::GetBucketDiffCommand(bucket, nodes, api::Timestamp(1234)));
+
+ sendAndExpectReply(getDiffCmd,
+ api::MessageType::GETBUCKETDIFF_REPLY,
+ api::ReturnCode::ABORTED);
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _bottomLinks[0]->getNumCommands());
+}
+
+void
+MergeThrottlerTest::testApplyBucketDiffCommandNotInActiveSetIsRejected()
+{
+ document::BucketId bucket(16, 1234);
+ std::vector<api::GetBucketDiffCommand::Node> nodes;
+ std::shared_ptr<api::ApplyBucketDiffCommand> applyDiffCmd(
+ new api::ApplyBucketDiffCommand(bucket, nodes, api::Timestamp(1234)));
+
+ sendAndExpectReply(applyDiffCmd,
+ api::MessageType::APPLYBUCKETDIFF_REPLY,
+ api::ReturnCode::ABORTED);
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _bottomLinks[0]->getNumCommands());
+}
+
+api::MergeBucketCommand::SP
+MergeThrottlerTest::sendMerge(const MergeBuilder& builder)
+{
+ api::MergeBucketCommand::SP cmd(builder.create());
+ _topLinks[builder._nodes[0]]->sendDown(cmd);
+ return cmd;
+}
+
+void
+MergeThrottlerTest::testNewClusterStateAbortsAllOutdatedActiveMerges()
+{
+ document::BucketId bucket(16, 6789);
+ _throttlers[0]->getThrottlePolicy().setMaxPendingCount(1);
+
+ // Merge will be forwarded (i.e. active).
+ sendMerge(MergeBuilder(bucket).clusterStateVersion(10));
+ _topLinks[0]->waitForMessage(MessageType::MERGEBUCKET, _messageWaitTime);
+ StorageMessage::SP fwd(_topLinks[0]->getAndRemoveMessage(
+ MessageType::MERGEBUCKET));
+
+ _topLinks[0]->sendDown(makeSystemStateCmd(
+ "version:11 distributor:100 storage:100"));
+ // Cannot send reply until we're unwinding
+ CPPUNIT_ASSERT_EQUAL(std::size_t(0), _topLinks[0]->getNumReplies());
+
+ // Trying to diff the bucket should now fail
+ {
+ std::shared_ptr<api::GetBucketDiffCommand> getDiffCmd(
+ new api::GetBucketDiffCommand(bucket, {}, api::Timestamp(123)));
+
+ sendAndExpectReply(getDiffCmd,
+ api::MessageType::GETBUCKETDIFF_REPLY,
+ api::ReturnCode::ABORTED);
+ }
+}
+
+// TODO test message queue aborting (use rendezvous functionality--make guard)
+
+} // namespace storage
diff --git a/storage/src/tests/storageserver/priorityconvertertest.cpp b/storage/src/tests/storageserver/priorityconvertertest.cpp
new file mode 100644
index 00000000000..ecbdcfb6b91
--- /dev/null
+++ b/storage/src/tests/storageserver/priorityconvertertest.cpp
@@ -0,0 +1,104 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/documentapi/documentapi.h>
+#include <vespa/storage/storageserver/priorityconverter.h>
+#include <tests/common/testhelper.h>
+
+namespace storage {
+
+struct PriorityConverterTest : public CppUnit::TestFixture
+{
+ std::unique_ptr<PriorityConverter> _converter;
+
+ void setUp() {
+ vdstestlib::DirConfig config(getStandardConfig(true));
+ _converter.reset(new PriorityConverter(config.getConfigId()));
+ };
+
+ void testNormalUsage();
+ void testLowestPriorityIsReturnedForUnknownCode();
+
+ CPPUNIT_TEST_SUITE(PriorityConverterTest);
+ CPPUNIT_TEST(testNormalUsage);
+ CPPUNIT_TEST(testLowestPriorityIsReturnedForUnknownCode);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(PriorityConverterTest);
+
+void PriorityConverterTest::testNormalUsage()
+{
+ for (int p=0; p<16; ++p) {
+ CPPUNIT_ASSERT_EQUAL(
+ (uint8_t)(50+p*10),
+ _converter->toStoragePriority(
+ static_cast<documentapi::Priority::Value>(p)));
+ }
+ for (int i=0; i<256; ++i) {
+ uint8_t p = i;
+ if (p <= 50) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_HIGHEST,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 60) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_VERY_HIGH,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 70) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_HIGH_1,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 80) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_HIGH_2,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 90) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_HIGH_3,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 100) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_NORMAL_1,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 110) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_NORMAL_2,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 120) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_NORMAL_3,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 130) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_NORMAL_4,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 140) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_NORMAL_5,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 150) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_NORMAL_6,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 160) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_LOW_1,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 170) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_LOW_2,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 180) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_LOW_3,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 190) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_VERY_LOW,
+ _converter->toDocumentPriority(p));
+ } else if (p <= 200) {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_LOWEST,
+ _converter->toDocumentPriority(p));
+ } else {
+ CPPUNIT_ASSERT_EQUAL(documentapi::Priority::PRI_LOWEST,
+ _converter->toDocumentPriority(p));
+ }
+ }
+}
+
+
+void
+PriorityConverterTest::testLowestPriorityIsReturnedForUnknownCode()
+{
+ CPPUNIT_ASSERT_EQUAL(255,
+ static_cast<int>(_converter->toStoragePriority(
+ static_cast<documentapi::Priority::Value>(123))));
+}
+
+}
diff --git a/storage/src/tests/storageserver/statemanagertest.cpp b/storage/src/tests/storageserver/statemanagertest.cpp
new file mode 100644
index 00000000000..68a35ac37d9
--- /dev/null
+++ b/storage/src/tests/storageserver/statemanagertest.cpp
@@ -0,0 +1,264 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <boost/pointer_cast.hpp>
+#include <cppunit/extensions/HelperMacros.h>
+#include <iostream>
+#include <vespa/metrics/metricmanager.h>
+#include <string>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/vdslib/state/nodestate.h>
+#include <vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h>
+#include <vespa/storage/storageserver/statemanager.h>
+#include <vespa/storage/common/hostreporter/hostinfo.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/testhelper.h>
+#include <tests/common/dummystoragelink.h>
+#include <vespa/vespalib/data/slime/type.h>
+
+using storage::lib::NodeState;
+using storage::lib::NodeType;
+using storage::lib::State;
+using storage::lib::ClusterState;
+
+namespace storage {
+
+struct StateManagerTest : public CppUnit::TestFixture {
+ std::unique_ptr<TestServiceLayerApp> _node;
+ std::unique_ptr<DummyStorageLink> _upper;
+ std::unique_ptr<metrics::MetricManager> _metricManager;
+ StateManager* _manager;
+ DummyStorageLink* _lower;
+
+ StateManagerTest();
+
+ void setUp();
+ void tearDown();
+
+ void testSystemState();
+ void testReportedNodeState();
+ void testClusterStateVersion();
+
+ CPPUNIT_TEST_SUITE(StateManagerTest);
+ CPPUNIT_TEST(testSystemState);
+ CPPUNIT_TEST(testReportedNodeState);
+ CPPUNIT_TEST(testClusterStateVersion);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(StateManagerTest);
+
+StateManagerTest::StateManagerTest()
+ : _node(),
+ _upper(),
+ _manager(0),
+ _lower(0)
+{
+}
+
+void
+StateManagerTest::setUp() {
+ try{
+ vdstestlib::DirConfig config(getStandardConfig(true));
+ _node.reset(new TestServiceLayerApp(DiskCount(1), NodeIndex(2)));
+ // Clock will increase 1 sec per call.
+ _node->getClock().setAbsoluteTimeInSeconds(1);
+ _metricManager.reset(new metrics::MetricManager);
+ _upper.reset(new DummyStorageLink());
+ _manager = new StateManager(_node->getComponentRegister(),
+ *_metricManager,
+ std::unique_ptr<HostInfo>(new HostInfo));
+ _lower = new DummyStorageLink();
+ _upper->push_back(StorageLink::UP(_manager));
+ _upper->push_back(StorageLink::UP(_lower));
+ _upper->open();
+ } catch (std::exception& e) {
+ std::cerr << "Failed to static initialize objects: " << e.what()
+ << "\n";
+ }
+}
+
+void
+StateManagerTest::tearDown() {
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _lower->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _lower->getNumCommands());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _upper->getNumReplies());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _upper->getNumCommands());
+ _manager = 0;
+ _lower = 0;
+ _upper->close();
+ _upper->flush();
+ _upper.reset(0);
+ _node.reset(0);
+ _metricManager.reset();
+}
+
+#define GET_ONLY_OK_REPLY(varname) \
+{ \
+ CPPUNIT_ASSERT_EQUAL(size_t(1), _upper->getNumReplies()); \
+ CPPUNIT_ASSERT(_upper->getReply(0)->getType().isReply()); \
+ varname = std::dynamic_pointer_cast<api::StorageReply>( \
+ _upper->getReply(0)); \
+ CPPUNIT_ASSERT(varname != 0); \
+ _upper->reset(); \
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode(api::ReturnCode::OK), \
+ varname->getResult()); \
+}
+
+void
+StateManagerTest::testSystemState()
+{
+ std::shared_ptr<api::StorageReply> reply;
+ // Verify initial state on startup
+ ClusterState::CSP currentState = _manager->getSystemState();
+ CPPUNIT_ASSERT_EQUAL(std::string("cluster:d"),
+ currentState->toString(false));
+
+ NodeState::CSP currentNodeState = _manager->getCurrentNodeState();
+ CPPUNIT_ASSERT_EQUAL(std::string("s:d"), currentNodeState->toString(false));
+
+ ClusterState sendState("storage:4 .2.s:m");
+ std::shared_ptr<api::SetSystemStateCommand> cmd(
+ new api::SetSystemStateCommand(sendState));
+ _upper->sendDown(cmd);
+ GET_ONLY_OK_REPLY(reply);
+
+ currentState = _manager->getSystemState();
+ CPPUNIT_ASSERT_EQUAL(sendState, *currentState);
+
+ currentNodeState = _manager->getCurrentNodeState();
+ CPPUNIT_ASSERT_EQUAL(std::string("s:m"), currentNodeState->toString(false));
+}
+
+namespace {
+ struct MyStateListener : public StateListener {
+ const NodeStateUpdater& updater;
+ lib::NodeState current;
+ std::ostringstream ost;
+
+ MyStateListener(const NodeStateUpdater& upd)
+ : updater(upd), current(*updater.getReportedNodeState()) {}
+
+ void handleNewState()
+ {
+ ost << current << " -> ";
+ current = *updater.getReportedNodeState();
+ ost << current << "\n";
+ }
+ };
+}
+
+void
+StateManagerTest::testReportedNodeState()
+{
+ std::shared_ptr<api::StorageReply> reply;
+ // Add a state listener to check that we get events.
+ MyStateListener stateListener(*_manager);
+ _manager->addStateListener(stateListener);
+ // Test that initial state is initializing
+ NodeState::CSP nodeState = _manager->getReportedNodeState();
+ CPPUNIT_ASSERT_EQUAL(std::string("s:i b:58 i:0 t:1"), nodeState->toString(false));
+ // Test that it works to update the state
+ {
+ NodeStateUpdater::Lock::SP lock(_manager->grabStateChangeLock());
+ NodeState ns(*_manager->getReportedNodeState());
+ ns.setState(State::UP);
+ _manager->setReportedNodeState(ns);
+ }
+ // And that we get the change both through state interface
+ nodeState = _manager->getReportedNodeState();
+ CPPUNIT_ASSERT_EQUAL(std::string("s:u b:58 t:1"),
+ nodeState->toString(false));
+ // And get node state command (no expected state)
+ std::shared_ptr<api::GetNodeStateCommand> cmd(
+ new api::GetNodeStateCommand(lib::NodeState::UP()));
+ _upper->sendDown(cmd);
+ GET_ONLY_OK_REPLY(reply);
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::GETNODESTATE_REPLY,
+ reply->getType());
+ nodeState.reset(new NodeState(
+ dynamic_cast<api::GetNodeStateReply&>(*reply).getNodeState()));
+ CPPUNIT_ASSERT_EQUAL(std::string("s:u b:58 t:1"),
+ nodeState->toString(false));
+ // We should also get it with wrong expected state
+ cmd.reset(new api::GetNodeStateCommand(lib::NodeState::UP(new NodeState(NodeType::STORAGE, State::INITIALIZING))));
+ _upper->sendDown(cmd);
+ GET_ONLY_OK_REPLY(reply);
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::GETNODESTATE_REPLY,
+ reply->getType());
+ nodeState.reset(new NodeState(
+ dynamic_cast<api::GetNodeStateReply&>(*reply).getNodeState()));
+ CPPUNIT_ASSERT_EQUAL(std::string("s:u b:58 t:1"),
+ nodeState->toString(false));
+ // With correct wanted state we should not get response right away
+ cmd.reset(new api::GetNodeStateCommand(
+ lib::NodeState::UP(new NodeState("s:u b:58 t:1", &NodeType::STORAGE))));
+ _upper->sendDown(cmd);
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _upper->getNumReplies());
+ // But when we update state, we get the reply
+ {
+ NodeStateUpdater::Lock::SP lock(_manager->grabStateChangeLock());
+ NodeState ns(*_manager->getReportedNodeState());
+ ns.setState(State::STOPPING);
+ ns.setDescription("Stopping node");
+ _manager->setReportedNodeState(ns);
+ }
+
+ GET_ONLY_OK_REPLY(reply);
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::GETNODESTATE_REPLY,
+ reply->getType());
+ nodeState.reset(new NodeState(
+ dynamic_cast<api::GetNodeStateReply&>(*reply).getNodeState()));
+ CPPUNIT_ASSERT_EQUAL(std::string("s:s b:58 t:1 m:Stopping\\x20node"),
+ nodeState->toString(false));
+
+ // Removing state listener, it stops getting updates
+ _manager->removeStateListener(stateListener);
+ // Do another update which listener should not get..
+ {
+ NodeStateUpdater::Lock::SP lock(_manager->grabStateChangeLock());
+ NodeState ns(*_manager->getReportedNodeState());
+ ns.setState(State::UP);
+ _manager->setReportedNodeState(ns);
+ }
+ std::string expectedEvents =
+ "s:i b:58 i:0 t:1 -> s:u b:58 t:1\n"
+ "s:u b:58 t:1 -> s:s b:58 t:1 m:Stopping\\x20node\n";
+ CPPUNIT_ASSERT_EQUAL(expectedEvents, stateListener.ost.str());
+}
+
+void
+StateManagerTest::testClusterStateVersion()
+{
+ ClusterState state(*_manager->getSystemState());
+ state.setVersion(123);
+ _manager->setClusterState(state);
+
+ std::string nodeInfoString(_manager->getNodeInfo());
+ vespalib::slime::Memory goldenMemory(nodeInfoString);
+ vespalib::Slime nodeInfo;
+ vespalib::slime::JsonFormat::decode(nodeInfoString, nodeInfo);
+
+ vespalib::slime::Symbol lookupSymbol =
+ nodeInfo.lookup("cluster-state-version");
+ if (lookupSymbol.undefined()) {
+ CPPUNIT_FAIL("No cluster-state-version was found in the node info");
+ }
+
+ auto& cursor = nodeInfo.get();
+ auto& clusterStateVersionCursor = cursor["cluster-state-version"];
+ if (!clusterStateVersionCursor.valid()) {
+ CPPUNIT_FAIL("No cluster-state-version was found in the node info");
+ }
+
+ if (clusterStateVersionCursor.type().getId() != vespalib::slime::LONG::ID) {
+ CPPUNIT_FAIL("No cluster-state-version was found in the node info");
+ }
+
+ int version = clusterStateVersionCursor.asLong();
+ CPPUNIT_ASSERT_EQUAL(123, version);
+}
+
+} // storage
+
diff --git a/storage/src/tests/storageserver/statereportertest.cpp b/storage/src/tests/storageserver/statereportertest.cpp
new file mode 100644
index 00000000000..ef1592bce80
--- /dev/null
+++ b/storage/src/tests/storageserver/statereportertest.cpp
@@ -0,0 +1,279 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <vespa/log/log.h>
+#include <vespa/storageframework/defaultimplementation/clock/fakeclock.h>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+#include <vespa/storage/storageserver/applicationgenerationfetcher.h>
+#include <vespa/storage/storageserver/statereporter.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/testhelper.h>
+#include <tests/common/dummystoragelink.h>
+
+LOG_SETUP(".test.statereporter");
+
+namespace storage {
+
+class DummyApplicationGenerationFether : public ApplicationGenerationFetcher {
+public:
+ virtual int64_t getGeneration() const { return 1; }
+ virtual std::string getComponentName() const { return "component"; }
+};
+
+struct StateReporterTest : public CppUnit::TestFixture {
+ FastOS_ThreadPool _threadPool;
+ framework::defaultimplementation::FakeClock* _clock;
+ std::unique_ptr<TestServiceLayerApp> _node;
+ std::unique_ptr<DummyStorageLink> _top;
+ DummyApplicationGenerationFether _generationFetcher;
+ std::unique_ptr<StateReporter> _stateReporter;
+ std::unique_ptr<vdstestlib::DirConfig> _config;
+ std::unique_ptr<metrics::MetricSet> _topSet;
+ std::unique_ptr<metrics::MetricManager> _metricManager;
+ std::shared_ptr<FileStorMetrics> _filestorMetrics;
+
+ StateReporterTest();
+
+ void setUp();
+ void tearDown();
+ void runLoad(uint32_t count = 1);
+
+ void testReportConfigGeneration();
+ void testReportHealth();
+ void testReportMetrics();
+
+ CPPUNIT_TEST_SUITE(StateReporterTest);
+ CPPUNIT_TEST(testReportConfigGeneration);
+ CPPUNIT_TEST(testReportHealth);
+ CPPUNIT_TEST(testReportMetrics);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(StateReporterTest);
+
+namespace {
+ struct MetricClock : public metrics::MetricManager::Timer
+ {
+ framework::Clock& _clock;
+ MetricClock(framework::Clock& c) : _clock(c) {}
+ virtual time_t getTime() const
+ { return _clock.getTimeInSeconds().getTime(); }
+ virtual time_t getTimeInMilliSecs() const
+ { return _clock.getTimeInMillis().getTime(); }
+ };
+}
+
+StateReporterTest::StateReporterTest()
+ : _threadPool(256*1024),
+ _clock(0),
+ _top(),
+ _stateReporter()
+{
+}
+
+void StateReporterTest::setUp() {
+ assert(system("rm -rf vdsroot") == 0);
+ _config.reset(new vdstestlib::DirConfig(getStandardConfig(true)));
+ try {
+ _node.reset(new TestServiceLayerApp(DiskCount(4), NodeIndex(0),
+ _config->getConfigId()));
+ _node->setupDummyPersistence();
+ _clock = &_node->getClock();
+ _clock->setAbsoluteTimeInSeconds(1000000);
+ _top.reset(new DummyStorageLink);
+ } catch (config::InvalidConfigException& e) {
+ fprintf(stderr, "%s\n", e.what());
+ }
+ _metricManager.reset(new metrics::MetricManager(
+ std::unique_ptr<metrics::MetricManager::Timer>(
+ new MetricClock(*_clock))));
+ _topSet.reset(new metrics::MetricSet("vds", "", ""));
+ {
+ metrics::MetricLockGuard guard(_metricManager->getMetricLock());
+ _metricManager->registerMetric(guard, *_topSet);
+ }
+
+ _stateReporter.reset(new StateReporter(
+ _node->getComponentRegister(),
+ *_metricManager,
+ _generationFetcher,
+ "status"));
+
+ uint16_t diskCount = _node->getPartitions().size();
+ documentapi::LoadTypeSet::SP loadTypes(_node->getLoadTypes());
+
+ _filestorMetrics.reset(new FileStorMetrics(
+ _node->getLoadTypes()->getMetricLoadTypes()));
+ _filestorMetrics->initDiskMetrics(
+ diskCount, loadTypes->getMetricLoadTypes(), 1);
+ _topSet->registerMetric(*_filestorMetrics);
+
+ _metricManager->init(_config->getConfigId(), _node->getThreadPool());
+}
+
+void StateReporterTest::tearDown() {
+ _metricManager->stop();
+ _stateReporter.reset(0);
+ _topSet.reset(0);
+ _metricManager.reset(0);
+ _top.reset(0);
+ _node.reset(0);
+ _config.reset(0);
+ _filestorMetrics.reset();
+}
+
+#define PARSE_JSON(jsonData) \
+vespalib::Slime slime; \
+{ \
+ using namespace vespalib::slime; \
+ size_t parsed = JsonFormat::decode(Memory(jsonData), slime); \
+ SimpleBuffer buffer; \
+ JsonFormat::encode(slime, buffer, false); \
+ if (jsonData.size() != parsed) { \
+ std::ostringstream error; \
+ error << "Failed to parse JSON: '\n" \
+ << jsonData << "'\n:" << buffer.get().make_string() << "\n"; \
+ CPPUNIT_ASSERT_EQUAL_MSG(error.str(), jsonData.size(), parsed); \
+ } \
+}
+
+#define ASSERT_GENERATION(jsonData, component, generation) \
+{ \
+ PARSE_JSON(jsonData); \
+ CPPUNIT_ASSERT_EQUAL( \
+ generation, \
+ slime.get()["config"][component]["generation"].asDouble()); \
+}
+
+#define ASSERT_NODE_STATUS(jsonData, code, message) \
+{ \
+ PARSE_JSON(jsonData); \
+ CPPUNIT_ASSERT_EQUAL( \
+ vespalib::string(code), \
+ slime.get()["status"]["code"].asString().make_string()); \
+ CPPUNIT_ASSERT_EQUAL( \
+ vespalib::string(message), \
+ slime.get()["status"]["message"].asString().make_string()); \
+}
+
+#define ASSERT_METRIC_GET_PUT(jsonData, expGetCount, expPutCount) \
+{ \
+ PARSE_JSON(jsonData); \
+ double getCount = -1; \
+ double putCount = -1; \
+ size_t metricCount = slime.get()["metrics"]["values"].children(); \
+ /*std::cerr << "\nmetric count=" << metricCount << "\n";*/ \
+ for (size_t j=0; j<metricCount; j++) { \
+ const vespalib::string name = slime.get()["metrics"]["values"][j]["name"] \
+ .asString().make_string(); \
+ if (name.compare("vds.filestor.alldisks.allthreads." \
+ "get.sum.count") == 0) \
+ { \
+ getCount = slime.get()["metrics"]["values"][j]["values"]["count"] \
+ .asDouble(); \
+ } else if (name.compare("vds.filestor.alldisks.allthreads." \
+ "put.sum.count") == 0) \
+ { \
+ putCount = slime.get()["metrics"]["values"][j]["values"]["count"] \
+ .asDouble(); \
+ } \
+ } \
+ CPPUNIT_ASSERT_EQUAL(expGetCount, getCount); \
+ CPPUNIT_ASSERT_EQUAL(expPutCount, putCount); \
+ CPPUNIT_ASSERT(metricCount > 100); \
+}
+
+
+void StateReporterTest::testReportConfigGeneration() {
+ std::ostringstream ost;
+ framework::HttpUrlPath path("/state/v1/config");
+ _stateReporter->reportStatus(ost, path);
+ std::string jsonData = ost.str();
+ //std::cerr << "\nConfig: " << jsonData << "\n";
+ ASSERT_GENERATION(jsonData, "component", 1.0);
+}
+
+void StateReporterTest::testReportHealth() {
+ const int stateCount = 7;
+ const lib::NodeState nodeStates[stateCount] = {
+ lib::NodeState(lib::NodeType::STORAGE, lib::State::UNKNOWN),
+ lib::NodeState(lib::NodeType::STORAGE, lib::State::MAINTENANCE),
+ lib::NodeState(lib::NodeType::STORAGE, lib::State::DOWN),
+ lib::NodeState(lib::NodeType::STORAGE, lib::State::STOPPING),
+ lib::NodeState(lib::NodeType::STORAGE, lib::State::INITIALIZING),
+ lib::NodeState(lib::NodeType::STORAGE, lib::State::RETIRED),
+ lib::NodeState(lib::NodeType::STORAGE, lib::State::UP)
+ };
+ const char* codes[stateCount] = {
+ "down",
+ "down",
+ "down",
+ "down",
+ "down",
+ "down",
+ "up"
+ };
+ const char* messages[stateCount] = {
+ "Node state: Unknown",
+ "Node state: Maintenance",
+ "Node state: Down",
+ "Node state: Stopping",
+ "Node state: Initializing, init progress 0",
+ "Node state: Retired",
+ ""
+ };
+
+ framework::HttpUrlPath path("/state/v1/health");
+ for (int i=0; i<stateCount; i++) {
+ _node->getStateUpdater().setCurrentNodeState(nodeStates[i]);
+ std::ostringstream ost;
+ _stateReporter->reportStatus(ost, path);
+ std::string jsonData = ost.str();
+ //std::cerr << "\nHealth " << i << ":" << jsonData << "\n";
+ ASSERT_NODE_STATUS(jsonData, codes[i], messages[i]);
+ }
+}
+
+void StateReporterTest::testReportMetrics() {
+ FileStorDiskMetrics& disk0(*_filestorMetrics->disks[0]);
+ FileStorThreadMetrics& thread0(*disk0.threads[0]);
+
+ LOG(info, "Adding to get metric");
+
+ using documentapi::LoadType;
+ thread0.get[LoadType::DEFAULT].count.inc(1);
+
+ LOG(info, "Waiting for 5 minute snapshot to be taken");
+ // Wait until active metrics have been added to 5 min snapshot and reset
+ for (uint32_t i=0; i<6; ++i) {
+ _clock->addSecondsToTime(60);
+ _metricManager->timeChangedNotification();
+ while (
+ uint64_t(_metricManager->getLastProcessedTime())
+ < _clock->getTimeInSeconds().getTime())
+ {
+ FastOS_Thread::Sleep(1);
+ }
+ }
+ LOG(info, "5 minute snapshot should have been taken. Adding put count");
+
+ thread0.put[LoadType::DEFAULT].count.inc(1);
+
+ const int pathCount = 2;
+ const char* paths[pathCount] = {
+ "/state/v1/metrics",
+ "/state/v1/metrics?consumer=status"
+ };
+
+ for (int i=0; i<pathCount; i++) {
+ framework::HttpUrlPath path(paths[i]);
+ std::ostringstream ost;
+ _stateReporter->reportStatus(ost, path);
+ std::string jsonData = ost.str();
+ //std::cerr << "\nMetrics:" << jsonData << "\n";
+ ASSERT_METRIC_GET_PUT(jsonData, 1.0, 0.0);
+ }
+ }
+
+} // storage
diff --git a/storage/src/tests/storageserver/testvisitormessagesession.cpp b/storage/src/tests/storageserver/testvisitormessagesession.cpp
new file mode 100644
index 00000000000..e814f6cf229
--- /dev/null
+++ b/storage/src/tests/storageserver/testvisitormessagesession.cpp
@@ -0,0 +1,78 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <tests/storageserver/testvisitormessagesession.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+
+namespace storage {
+
+TestVisitorMessageSession::~TestVisitorMessageSession()
+{
+}
+
+TestVisitorMessageSession::TestVisitorMessageSession(VisitorThread& t,
+ Visitor& v,
+ const mbus::Error& autoReplyError,
+ bool autoReply)
+ : _autoReplyError(autoReplyError),
+ _autoReply(autoReply),
+ thread(t),
+ visitor(v),
+ pendingCount(0)
+{
+}
+
+void
+TestVisitorMessageSession::reply(mbus::Reply::UP rep) {
+ {
+ vespalib::MonitorGuard guard(_waitMonitor);
+ pendingCount--;
+ }
+ thread.handleMessageBusReply(std::move(rep), visitor);
+}
+
+mbus::Result
+TestVisitorMessageSession::send(
+ std::unique_ptr<documentapi::DocumentMessage> message)
+{
+ vespalib::MonitorGuard guard(_waitMonitor);
+ if (_autoReply) {
+ pendingCount++;
+ mbus::Reply::UP rep = message->createReply();
+ rep->setMessage(mbus::Message::UP(message.release()));
+ if (_autoReplyError.getCode() == mbus::ErrorCode::NONE) {
+ reply(std::move(rep));
+ return mbus::Result();
+ } else {
+ return mbus::Result(_autoReplyError,
+ std::unique_ptr<mbus::Message>(message.release()));
+ }
+ } else {
+ pendingCount++;
+ sentMessages.push_back(
+ vespalib::LinkedPtr<documentapi::DocumentMessage>(
+ message.release()));
+ guard.broadcast();
+ return mbus::Result();
+ }
+}
+
+void
+TestVisitorMessageSession::waitForMessages(unsigned int msgCount) {
+ framework::defaultimplementation::RealClock clock;
+ framework::MilliSecTime endTime(
+ clock.getTimeInMillis() + framework::MilliSecTime(60 * 1000));
+
+ vespalib::MonitorGuard guard(_waitMonitor);
+ while (sentMessages.size() < msgCount) {
+ if (clock.getTimeInMillis() > endTime) {
+ throw vespalib::IllegalStateException(
+ vespalib::make_string("Timed out waiting for %u messages "
+ "in test visitor session", msgCount),
+ VESPA_STRLOC);
+ }
+ guard.wait(1000);
+ }
+};
+
+}
diff --git a/storage/src/tests/storageserver/testvisitormessagesession.h b/storage/src/tests/storageserver/testvisitormessagesession.h
new file mode 100644
index 00000000000..3ae6ccafb84
--- /dev/null
+++ b/storage/src/tests/storageserver/testvisitormessagesession.h
@@ -0,0 +1,79 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <deque>
+#include <vespa/storage/visiting/visitormessagesession.h>
+#include <vespa/storage/visiting/visitorthread.h>
+#include <vespa/documentapi/messagebus/messages/documentmessage.h>
+#include <vespa/storage/storageserver/priorityconverter.h>
+
+namespace storage {
+
+class TestVisitorMessageSession : public VisitorMessageSession
+{
+private:
+ vespalib::Monitor _waitMonitor;
+ mbus::Error _autoReplyError;
+ bool _autoReply;
+
+public:
+ typedef std::unique_ptr<TestVisitorMessageSession> UP;
+
+ VisitorThread& thread;
+ Visitor& visitor;
+ uint32_t pendingCount;
+
+ ~TestVisitorMessageSession();
+
+ std::deque<vespalib::LinkedPtr<documentapi::DocumentMessage> > sentMessages;
+
+ TestVisitorMessageSession(VisitorThread& t,
+ Visitor& v,
+ const mbus::Error& autoReplyError,
+ bool autoReply);
+
+ void reply(mbus::Reply::UP rep);
+
+ uint32_t pending() { return pendingCount; }
+
+ mbus::Result send(std::unique_ptr<documentapi::DocumentMessage> message);
+
+ void waitForMessages(unsigned int msgCount);
+
+ vespalib::Monitor& getMonitor() { return _waitMonitor; }
+};
+
+struct TestVisitorMessageSessionFactory : public VisitorMessageSessionFactory
+{
+ vespalib::Lock _accessLock;
+ std::vector<TestVisitorMessageSession*> _visitorSessions;
+ mbus::Error _autoReplyError;
+ bool _createAutoReplyVisitorSessions;
+ PriorityConverter _priConverter;
+
+ TestVisitorMessageSessionFactory(vespalib::stringref configId = "")
+ : _createAutoReplyVisitorSessions(false),
+ _priConverter(configId) {}
+
+ VisitorMessageSession::UP createSession(Visitor& v, VisitorThread& vt) {
+ vespalib::LockGuard lock(_accessLock);
+ TestVisitorMessageSession::UP session(
+ new TestVisitorMessageSession(
+ vt,
+ v,
+ _autoReplyError,
+ _createAutoReplyVisitorSessions));
+ _visitorSessions.push_back(session.get());
+ return VisitorMessageSession::UP(std::move(session));
+ }
+
+ documentapi::Priority::Value toDocumentPriority(uint8_t storagePriority) const
+ {
+ return _priConverter.toDocumentPriority(storagePriority);
+ }
+
+};
+
+} // storage
+
diff --git a/storage/src/tests/storageutil/.gitignore b/storage/src/tests/storageutil/.gitignore
new file mode 100644
index 00000000000..a080232d5f3
--- /dev/null
+++ b/storage/src/tests/storageutil/.gitignore
@@ -0,0 +1,13 @@
+*.So
+*.lo
+*.o
+.*.swp
+.config.log
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
+statefile*
+testrunner
+testrunner.core
diff --git a/storage/src/tests/storageutil/CMakeLists.txt b/storage/src/tests/storageutil/CMakeLists.txt
new file mode 100644
index 00000000000..a48895352e8
--- /dev/null
+++ b/storage/src/tests/storageutil/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_teststorageutil
+ SOURCES
+ functortest.cpp
+ charttest.cpp
+ palettetest.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/tests/storageutil/charttest.cpp b/storage/src/tests/storageutil/charttest.cpp
new file mode 100644
index 00000000000..d9ce3d6f1b4
--- /dev/null
+++ b/storage/src/tests/storageutil/charttest.cpp
@@ -0,0 +1,66 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageutil/piechart.h>
+
+#include <fstream>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+namespace storage {
+
+struct PieChartTest : public CppUnit::TestFixture
+{
+ void setUp() {}
+ void tearDown() {}
+
+ void testWriteHtmlFile();
+
+ CPPUNIT_TEST_SUITE(PieChartTest);
+ CPPUNIT_TEST(testWriteHtmlFile);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(PieChartTest);
+
+namespace {
+ void printHtmlFile(const std::string& filename, const PieChart& chart) {
+ std::ofstream out(filename.c_str());
+ out << "<html>\n"
+ << " <head>\n"
+ << " ";
+ PieChart::printHtmlHeadAdditions(out, " ");
+ out << "\n <title>Pie example</title>\n"
+ << " </head>\n"
+ << " <body>\n"
+ << " ";
+ chart.printCanvas(out, 500, 400);
+ out << "\n ";
+ chart.printScript(out, " ");
+ out << "\n </body>\n"
+ << "</html>\n";
+ out.close();
+ }
+}
+
+void
+PieChartTest::testWriteHtmlFile()
+{
+ {
+ PieChart chart("mypie");
+ chart.add(10, "put");
+ chart.add(20, "get");
+ chart.add(50, "free");
+
+ printHtmlFile("piefile.html", chart);
+ }
+ {
+ PieChart chart("mypie", PieChart::SCHEME_CUSTOM);
+ chart.add(10, "put", PieChart::RED);
+ chart.add(20, "get", PieChart::GREEN);
+ chart.add(50, "free", PieChart::BLUE);
+
+ printHtmlFile("piefile-customcols.html", chart);
+ }
+}
+
+} // storage
diff --git a/storage/src/tests/storageutil/functortest.cpp b/storage/src/tests/storageutil/functortest.cpp
new file mode 100644
index 00000000000..00b9f5450cb
--- /dev/null
+++ b/storage/src/tests/storageutil/functortest.cpp
@@ -0,0 +1,55 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <list>
+#include <string>
+#include <algorithm>
+#include <vespa/storage/storageutil/functor.h>
+
+class Functor_Test : public CppUnit::TestFixture {
+ CPPUNIT_TEST_SUITE(Functor_Test);
+ CPPUNIT_TEST(testReplace);
+ CPPUNIT_TEST(testDeletePointer);
+ CPPUNIT_TEST_SUITE_END();
+
+public:
+
+protected:
+ void testReplace();
+ void testDeletePointer();
+};
+
+using namespace storage;
+using namespace std;
+
+CPPUNIT_TEST_SUITE_REGISTRATION(Functor_Test);
+
+void Functor_Test::testReplace()
+{
+ string source("this.is.a.string.with.many.dots.");
+ for_each(source.begin(), source.end(), Functor::Replace<char>('.', '_'));
+ CPPUNIT_ASSERT_EQUAL(string("this_is_a_string_with_many_dots_"), source);
+}
+
+namespace {
+
+ static int instanceCounter = 0;
+
+ class TestClass {
+ public:
+ TestClass() { instanceCounter++; }
+ ~TestClass() { instanceCounter--; }
+ };
+}
+
+void Functor_Test::testDeletePointer()
+{
+ list<TestClass*> mylist;
+ mylist.push_back(new TestClass());
+ mylist.push_back(new TestClass());
+ mylist.push_back(new TestClass());
+ CPPUNIT_ASSERT_EQUAL(3, instanceCounter);
+ for_each(mylist.begin(), mylist.end(), Functor::DeletePointer());
+ CPPUNIT_ASSERT_EQUAL(0, instanceCounter);
+}
diff --git a/storage/src/tests/storageutil/palettetest.cpp b/storage/src/tests/storageutil/palettetest.cpp
new file mode 100644
index 00000000000..ffc2dd091ee
--- /dev/null
+++ b/storage/src/tests/storageutil/palettetest.cpp
@@ -0,0 +1,33 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageutil/palette.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+namespace storage {
+
+struct PaletteTest : public CppUnit::TestFixture {
+ void setUp() {}
+ void tearDown() {}
+
+ void testNormalUsage();
+
+ CPPUNIT_TEST_SUITE(PaletteTest);
+ CPPUNIT_TEST(testNormalUsage);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(PaletteTest);
+
+void
+PaletteTest::testNormalUsage()
+{
+ std::ofstream out("palette.html");
+ out << "<html><body>\n";
+ Palette palette(75);
+ palette.printHtmlTablePalette(out);
+ out << "</body></html>\n";
+ out.close();
+}
+
+} // storage
diff --git a/storage/src/tests/storageutil/recordflatfiletest.cpp b/storage/src/tests/storageutil/recordflatfiletest.cpp
new file mode 100644
index 00000000000..e08dd88dc67
--- /dev/null
+++ b/storage/src/tests/storageutil/recordflatfiletest.cpp
@@ -0,0 +1,314 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <cppunit/extensions/HelperMacros.h>
+#include <iostream>
+#include <string>
+#include <vespa/storage/storageutil/recordflatfile.h>
+
+using namespace document;
+using namespace storage;
+using namespace std;
+using namespace document;
+
+class RecordFlatFile_Test : public CppUnit::TestFixture {
+ CPPUNIT_TEST_SUITE(RecordFlatFile_Test);
+ CPPUNIT_TEST(testAdd);
+ CPPUNIT_TEST(testUpdate);
+ CPPUNIT_TEST(testRemove);
+ CPPUNIT_TEST(testExists);
+ CPPUNIT_TEST(testGetRecord);
+ CPPUNIT_TEST(testClear);
+ CPPUNIT_TEST(testSimpleUsage);
+ CPPUNIT_TEST(testValid);
+ CPPUNIT_TEST_SUITE_END();
+
+ string _testFile;
+ unsigned int _chunkSize;
+
+ void setupTestFile();
+
+public:
+ void setUp();
+
+ RecordFlatFile_Test(void)
+ : _testFile(),
+ _chunkSize(0)
+ {
+ }
+
+protected:
+ void testAdd();
+ void testUpdate();
+ void testRemove();
+ void testExists();
+ void testGetRecord();
+ void testClear();
+ void testSimpleUsage();
+ void testValid();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(RecordFlatFile_Test);
+
+namespace {
+
+ const bool debug = false;
+
+ class MyRecord {
+ private:
+ unsigned int _id;
+ unsigned int _value;
+ unsigned int _valid;
+
+ public:
+ MyRecord(void)
+ : _id(0u),
+ _value(0u),
+ _valid(0u)
+ {
+ }
+ MyRecord(unsigned int id, unsigned int value, bool valid = true)
+ : _id(id), _value(value), _valid(valid ? 0 : 0xFFFFFFFF) {}
+
+ const unsigned int& getId() const { return _id; }
+ unsigned int getValue() const { return _value; }
+ void setValue(unsigned int value) { _value = value; }
+ bool isValid() const { return (_valid == 0); }
+
+ bool operator==(const MyRecord& record) const {
+ return (_id == record._id && _value == record._value);
+ }
+ };
+
+ ostream& operator<<(ostream& out, MyRecord record) {
+ out << "MyRecord(" << record.getId() << ", " << record.getValue()
+ << ")";
+ return out;
+ }
+
+ class BlockMessage {
+ private:
+ string _name;
+ static unsigned int _indent;
+
+ public:
+ BlockMessage(const string& name) : _name(name) {
+ if (debug) {
+ for (unsigned int i=0; i<_indent; i++) cout << " ";
+ cout << "Block started: " << _name << "\n" << flush;
+ }
+ _indent++;
+ }
+ ~BlockMessage() {
+ _indent--;
+ if (debug) {
+ for (unsigned int i=0; i<_indent; i++) cout << " ";
+ cout << "Block completed: " << _name << "\n" << flush;
+ }
+ }
+ };
+
+ unsigned int BlockMessage::_indent(0);
+
+}
+
+void RecordFlatFile_Test::setUp() {
+ _testFile = "recordflatfile.testfile";
+ _chunkSize = 4;
+}
+
+void RecordFlatFile_Test::setupTestFile() {
+ BlockMessage message("setupTestFile()");
+ RecordFlatFile<MyRecord, unsigned int> flatfile(_testFile, _chunkSize);
+ flatfile.clear();
+ for (unsigned int i=1; i<=8; ++i) {
+ flatfile.add(MyRecord(i, 10+i));
+ }
+ CPPUNIT_ASSERT_EQUAL(8u, flatfile.getSize());
+ for (unsigned int i=1; i<=8; ++i) {
+ CPPUNIT_ASSERT_EQUAL(MyRecord(i, 10+i), *flatfile[i-1]);
+ }
+}
+
+
+void RecordFlatFile_Test::testAdd() {
+ BlockMessage message("testAdd()");
+ setupTestFile();
+ RecordFlatFile<MyRecord, unsigned int> flatfile(_testFile, _chunkSize);
+ flatfile.add(MyRecord(9, 19));
+ CPPUNIT_ASSERT_EQUAL(9u, flatfile.getSize());
+ CPPUNIT_ASSERT_EQUAL(MyRecord(1, 11), *flatfile[0]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(2, 12), *flatfile[1]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(7, 17), *flatfile[6]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(8, 18), *flatfile[7]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(9, 19), *flatfile[8]);
+}
+
+void RecordFlatFile_Test::testUpdate() {
+ BlockMessage message("testUpdate()");
+ setupTestFile();
+ RecordFlatFile<MyRecord, unsigned int> flatfile(_testFile, _chunkSize);
+ CPPUNIT_ASSERT(!flatfile.update(MyRecord(0, 20)));
+ CPPUNIT_ASSERT(flatfile.update(MyRecord(4, 19)));
+ CPPUNIT_ASSERT_EQUAL(8u, flatfile.getSize());
+ CPPUNIT_ASSERT_EQUAL(MyRecord(1, 11), *flatfile[0]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(3, 13), *flatfile[2]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(4, 19), *flatfile[3]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(5, 15), *flatfile[4]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(8, 18), *flatfile[7]);
+}
+
+void RecordFlatFile_Test::testRemove() {
+ BlockMessage message("testRemove()");
+ setupTestFile();
+ RecordFlatFile<MyRecord, unsigned int> flatfile(_testFile, _chunkSize);
+ flatfile.remove(3);
+ CPPUNIT_ASSERT_EQUAL(7u, flatfile.getSize());
+ CPPUNIT_ASSERT_EQUAL(MyRecord(1, 11), *flatfile[0]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(2, 12), *flatfile[1]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(8, 18), *flatfile[2]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(4, 14), *flatfile[3]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(5, 15), *flatfile[4]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(6, 16), *flatfile[5]);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(7, 17), *flatfile[6]);
+}
+
+void RecordFlatFile_Test::testExists() {
+ BlockMessage message("testExists()");
+ setupTestFile();
+ RecordFlatFile<MyRecord, unsigned int> flatfile(_testFile, _chunkSize);
+ CPPUNIT_ASSERT(flatfile.exists(3));
+ CPPUNIT_ASSERT(flatfile.exists(1));
+ CPPUNIT_ASSERT(!flatfile.exists(11));
+ CPPUNIT_ASSERT(flatfile.exists(6));
+ CPPUNIT_ASSERT(flatfile.exists(5));
+ CPPUNIT_ASSERT(!flatfile.exists(0));
+}
+
+void RecordFlatFile_Test::testGetRecord() {
+ BlockMessage message("testGetRecord()");
+ setupTestFile();
+ RecordFlatFile<MyRecord, unsigned int> flatfile(_testFile, _chunkSize);
+ CPPUNIT_ASSERT_EQUAL(MyRecord(4, 14), *flatfile.getRecord(4));
+ CPPUNIT_ASSERT(flatfile.getRecord(0).get() == 0);
+}
+
+void RecordFlatFile_Test::testClear() {
+ try{
+ BlockMessage message("testClear()");
+ setupTestFile();
+ RecordFlatFile<MyRecord, unsigned int> flatfile(_testFile, _chunkSize);
+ flatfile.clear();
+ struct stat filestats;
+ CPPUNIT_ASSERT(stat(_testFile.c_str(), &filestats) == -1);
+ } catch (exception& e) {
+ cerr << "Caught exception '" << e.what() << "' in testClear()" << endl;
+ throw;
+ }
+}
+
+void RecordFlatFile_Test::testSimpleUsage()
+{
+ BlockMessage message("testSimpleUsage()");
+ RecordFlatFile<MyRecord, unsigned int> flatfile("recordflatfile.testfile");
+ flatfile.clear();
+
+ CPPUNIT_ASSERT_EQUAL(false, flatfile.exists(34u));
+ CPPUNIT_ASSERT_EQUAL((MyRecord*) 0, flatfile.getRecord(23u).get());
+
+ MyRecord record1(12, 54);
+ MyRecord record2(34, 62);
+
+ flatfile.add(record1);
+ flatfile.add(record2);
+
+ CPPUNIT_ASSERT_EQUAL(true, flatfile.exists(12u));
+ CPPUNIT_ASSERT_EQUAL((MyRecord*) 0, flatfile.getRecord(23u).get());
+ unique_ptr<MyRecord> result(flatfile.getRecord(34u));
+ CPPUNIT_ASSERT(result.get() != 0);
+ CPPUNIT_ASSERT_EQUAL(62u, result->getValue());
+
+ record2.setValue(67);
+ flatfile.update(record2);
+
+ unique_ptr<MyRecord> result2(flatfile.getRecord(34u));
+ CPPUNIT_ASSERT(result2.get() != 0);
+ CPPUNIT_ASSERT_EQUAL(67u, result2->getValue());
+
+ flatfile.remove(12);
+ CPPUNIT_ASSERT_EQUAL(false, flatfile.exists(12u));
+
+ flatfile.clear();
+ CPPUNIT_ASSERT_EQUAL(false, flatfile.exists(34u));
+}
+
+void RecordFlatFile_Test::testValid()
+{
+ BlockMessage message("testValid()");
+ RecordFlatFile<MyRecord, unsigned int> flatfile("recordflatfile.testfile");
+ flatfile.clear();
+
+ MyRecord record1(12, 54, true);
+ MyRecord record2(34, 62, false);
+ MyRecord record3(15, 69, true);
+ MyRecord record4(50, 93, false);
+
+ // Test that valid entries doesn't generate errors
+ flatfile.add(record1);
+ CPPUNIT_ASSERT(!flatfile.errorsFound());
+ CPPUNIT_ASSERT_EQUAL((size_t) 0, flatfile.getErrors().size());
+
+ // Test that invalid entries do
+ flatfile.add(record2);
+ CPPUNIT_ASSERT(flatfile.errorsFound());
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, flatfile.getErrors().size());
+ string expected("Adding invalid record '34' to file "
+ "recordflatfile.testfile.");
+ CPPUNIT_ASSERT_EQUAL(expected, *flatfile.getErrors().begin());
+
+ // Checking that errors are kept if not cleared
+ flatfile.add(record3);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, flatfile.getErrors().size());
+ CPPUNIT_ASSERT_EQUAL(expected, *flatfile.getErrors().begin());
+
+ // Checking that clearing errors work
+ flatfile.clearErrors();
+ CPPUNIT_ASSERT_EQUAL((size_t) 0, flatfile.getErrors().size());
+
+ flatfile.add(record4);
+ flatfile.clearErrors();
+
+ // Checking that entries read in get method generates warning
+ unique_ptr<MyRecord> result(flatfile.getRecord(12));
+ CPPUNIT_ASSERT_EQUAL((size_t) 0, flatfile.getErrors().size());
+ result = flatfile.getRecord(15);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, flatfile.getErrors().size());
+ expected = "Found corrupted entry in file recordflatfile.testfile";
+ CPPUNIT_ASSERT_EQUAL(expected, *flatfile.getErrors().begin());
+ flatfile.clearErrors();
+
+ // Checking that reading invalid entries generate exception
+ try{
+ result = flatfile.getRecord(50);
+ CPPUNIT_FAIL("Expected exception");
+ } catch (IoException& e) {
+ expected = "IoException(): Entry requested '50' is corrupted in file "
+ "recordflatfile.testfile at getRecord in";
+ string actual(e.what());
+ if (actual.size() > expected.size())
+ actual = actual.substr(0, expected.size());
+ CPPUNIT_ASSERT_EQUAL(expected, actual);
+ }
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, flatfile.getErrors().size());
+ expected = "Found corrupted entry in file recordflatfile.testfile";
+ CPPUNIT_ASSERT_EQUAL(expected, *flatfile.getErrors().begin());
+ flatfile.clearErrors();
+
+ // Check that you get warning when deleting if last entry is invalid
+ flatfile.remove(12);
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, flatfile.getErrors().size());
+ expected = "Last entry in file recordflatfile.testfile is invalid";
+ CPPUNIT_ASSERT_EQUAL(expected, *flatfile.getErrors().begin());
+
+ flatfile.clear();
+}
diff --git a/storage/src/tests/subscriptions/.gitignore b/storage/src/tests/subscriptions/.gitignore
new file mode 100644
index 00000000000..04a221b8052
--- /dev/null
+++ b/storage/src/tests/subscriptions/.gitignore
@@ -0,0 +1,8 @@
+*.So
+*.lo
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
+features.h
diff --git a/storage/src/tests/systemtests/.gitignore b/storage/src/tests/systemtests/.gitignore
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/storage/src/tests/systemtests/.gitignore
diff --git a/storage/src/tests/testhelper.cpp b/storage/src/tests/testhelper.cpp
new file mode 100644
index 00000000000..c4074aa1ac6
--- /dev/null
+++ b/storage/src/tests/testhelper.cpp
@@ -0,0 +1,175 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <tests/testhelper.h>
+
+#include <vespa/log/log.h>
+#include <vespa/vespalib/io/fileutil.h>
+
+LOG_SETUP(".testhelper");
+
+namespace storage {
+
+namespace {
+ bool useNewStorageCore() {
+ if ( // Unit test directory
+ vespalib::fileExists("use_new_storage_core") ||
+ // src/cpp directory
+ vespalib::fileExists("../use_new_storage_core") ||
+ // Top build directory where storage-HEAD remains
+ vespalib::fileExists("../../../../use_new_storage_core"))
+ {
+ std::cerr << "Using new storage core for unit tests\n";
+ return true;
+ }
+ return false;
+ }
+ bool newStorageCore(useNewStorageCore());
+}
+
+void addStorageDistributionConfig(vdstestlib::DirConfig& dc)
+{
+ vdstestlib::DirConfig::Config* config;
+ config = &dc.getConfig("stor-distribution", true);
+ config->clear();
+ config->set("group[1]");
+ config->set("group[0].name", "invalid");
+ config->set("group[0].index", "invalid");
+ config->set("group[0].nodes[50]");
+ config->set("redundancy", "2");
+
+ for (uint32_t i = 0; i < 50; i++) {
+ std::ostringstream key; key << "group[0].nodes[" << i << "].index";
+ std::ostringstream val; val << i;
+ config->set(key.str(), val.str());
+ }
+}
+
+vdstestlib::DirConfig getStandardConfig(bool storagenode) {
+ std::string clusterName("storage");
+ vdstestlib::DirConfig dc;
+ vdstestlib::DirConfig::Config* config;
+ config = &dc.addConfig("fleetcontroller");
+ config->set("cluster_name", clusterName);
+ config->set("index", "0");
+ config->set("zookeeper_server", "\"\"");
+ config->set("total_distributor_count", "10");
+ config->set("total_storage_count", "10");
+ config = &dc.addConfig("upgrading");
+ config = &dc.addConfig("load-type");
+ config = &dc.addConfig("bucket");
+ config = &dc.addConfig("messagebus");
+ config = &dc.addConfig("stor-prioritymapping");
+ config = &dc.addConfig("stor-bucketdbupdater");
+ config = &dc.addConfig("stor-bucket-init");
+ config = &dc.addConfig("metricsmanager");
+ config->set("consumer[1]");
+ config->set("consumer[0].name", "\"status\"");
+ config->set("consumer[0].addedmetrics[1]");
+ config->set("consumer[0].addedmetrics[0]", "\"*\"");
+ config = &dc.addConfig("stor-communicationmanager");
+ config->set("rpcport", "0");
+ config->set("mbusport", "0");
+ config = &dc.addConfig("stor-bucketdb");
+ config->set("chunklevel", "0");
+ config = &dc.addConfig("stor-distributormanager");
+ config->set("splitcount", "1000000");
+ config->set("splitsize", "1000000");
+ config->set("joincount", "0");
+ config->set("joinsize", "0");
+ config = &dc.addConfig("stor-opslogger");
+ config = &dc.addConfig("persistence");
+ config->set("abort_operations_with_changed_bucket_ownership", "true");
+ config = &dc.addConfig("stor-filestor");
+ // Easier to see what goes wrong with only 1 thread per disk.
+ config->set("minimum_file_meta_slots", "2");
+ config->set("minimum_file_header_block_size", "368");
+ config->set("minimum_file_size", "4096");
+ config->set("threads[1]");
+ config->set("threads[0].lowestpri 255");
+ config->set("dir_spread", "4");
+ config->set("dir_levels", "0");
+ config->set("use_new_core", newStorageCore ? "true" : "false");
+ config->set("maximum_versions_of_single_document_stored", "0");
+ //config->set("enable_slotfile_cache", "false");
+ // Unit tests typically use fake low time values, so don't complain
+ // about them or compact/delete them by default. Override in tests testing that
+ // behavior
+ config->set("time_future_limit", "5");
+ config->set("time_past_limit", "2000000000");
+ config->set("keep_remove_time_period", "2000000000");
+ config->set("revert_time_period", "2000000000");
+ // Don't want test to call exit()
+ config->set("fail_disk_after_error_count", "0");
+ config = &dc.addConfig("stor-bouncer");
+ config = &dc.addConfig("stor-integritychecker");
+ config = &dc.addConfig("stor-bucketmover");
+ config = &dc.addConfig("stor-messageforwarder");
+ config = &dc.addConfig("stor-server");
+ config->set("cluster_name", clusterName);
+ config->set("enable_dead_lock_detector", "false");
+ config->set("enable_dead_lock_detector_warnings", "false");
+ config->set("max_merges_per_node", "25");
+ config->set("max_merge_queue_size", "20");
+ config->set("root_folder",
+ (storagenode ? "vdsroot" : "vdsroot.distributor"));
+ config->set("is_distributor",
+ (storagenode ? "false" : "true"));
+ config = &dc.addConfig("stor-devices");
+ config->set("root_folder",
+ (storagenode ? "vdsroot" : "vdsroot.distributor"));
+ config = &dc.addConfig("stor-status");
+ config->set("httpport", "0");
+ config = &dc.addConfig("stor-visitor");
+ config->set("defaultdocblocksize", "8192");
+ // By default, need "old" behaviour of maxconcurrent
+ config->set("maxconcurrentvisitors_fixed", "4");
+ config->set("maxconcurrentvisitors_variable", "0");
+ config = &dc.addConfig("stor-visitordispatcher");
+ addFileConfig(dc, "documenttypes", "config-doctypes.cfg");
+ addStorageDistributionConfig(dc);
+ return dc;
+}
+
+void addSlobrokConfig(vdstestlib::DirConfig& dc,
+ const mbus::Slobrok& slobrok)
+{
+ std::ostringstream ost;
+ ost << "tcp/localhost:" << slobrok.port();
+ vdstestlib::DirConfig::Config* config;
+ config = &dc.getConfig("slobroks", true);
+ config->clear();
+ config->set("slobrok[1]");
+ config->set("slobrok[0].connectionspec", ost.str());
+}
+
+void addFileConfig(vdstestlib::DirConfig& dc,
+ const std::string& configDefName,
+ const std::string& fileName)
+{
+ vdstestlib::DirConfig::Config* config;
+ config = &dc.getConfig(configDefName, true);
+ config->clear();
+ std::ifstream in(fileName.c_str());
+ std::string line;
+ while (std::getline(in, line, '\n')) {
+ std::string::size_type pos = line.find(' ');
+ if (pos == std::string::npos) {
+ config->set(line);
+ } else {
+ config->set(line.substr(0, pos), line.substr(pos + 1));
+ }
+ }
+ in.close();
+}
+
+TestName::TestName(const std::string& n)
+ : name(n)
+{
+ LOG(debug, "Starting test %s", name.c_str());
+}
+
+TestName::~TestName() {
+ LOG(debug, "Done with test %s", name.c_str());
+}
+
+} // storage
diff --git a/storage/src/tests/testhelper.h b/storage/src/tests/testhelper.h
new file mode 100644
index 00000000000..be2c3e7ec66
--- /dev/null
+++ b/storage/src/tests/testhelper.h
@@ -0,0 +1,58 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+#include <vespa/vdstestlib/cppunit/dirconfig.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+
+#include <fstream>
+#include <vespa/fastos/fastos.h>
+#include <vespa/messagebus/testlib/slobrok.h>
+#include <sstream>
+
+#define ASSERT_REPLY_COUNT(count, dummylink) \
+ { \
+ std::ostringstream msgost; \
+ if ((dummylink).getNumReplies() != count) { \
+ for (uint32_t ijx=0; ijx<(dummylink).getNumReplies(); ++ijx) { \
+ msgost << (dummylink).getReply(ijx)->toString(true) << "\n"; \
+ } \
+ } \
+ CPPUNIT_ASSERT_EQUAL_MSG(msgost.str(), size_t(count), \
+ (dummylink).getNumReplies()); \
+ }
+#define ASSERT_COMMAND_COUNT(count, dummylink) \
+ { \
+ std::ostringstream msgost; \
+ if ((dummylink).getNumCommands() != count) { \
+ for (uint32_t ijx=0; ijx<(dummylink).getNumCommands(); ++ijx) { \
+ msgost << (dummylink).getCommand(ijx)->toString(true) << "\n"; \
+ } \
+ } \
+ CPPUNIT_ASSERT_EQUAL_MSG(msgost.str(), size_t(count), \
+ (dummylink).getNumCommands()); \
+ }
+
+namespace storage {
+
+void addFileConfig(vdstestlib::DirConfig& dc,
+ const std::string& configDefName,
+ const std::string& fileName);
+
+
+void addStorageDistributionConfig(vdstestlib::DirConfig& dc);
+
+vdstestlib::DirConfig getStandardConfig(bool storagenode);
+
+void addSlobrokConfig(vdstestlib::DirConfig& dc,
+ const mbus::Slobrok& slobrok);
+
+// Class used to print start and end of test. Enable debug when you want to see
+// which test creates what output or where we get stuck
+struct TestName {
+ std::string name;
+ TestName(const std::string& n);
+ ~TestName();
+};
+
+} // storage
+
diff --git a/storage/src/tests/testrunner.cpp b/storage/src/tests/testrunner.cpp
new file mode 100644
index 00000000000..5d8dc8d4c1f
--- /dev/null
+++ b/storage/src/tests/testrunner.cpp
@@ -0,0 +1,15 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <iostream>
+#include <vespa/log/log.h>
+#include <vespa/vdstestlib/cppunit/cppunittestrunner.h>
+
+LOG_SETUP("storagecppunittests");
+
+int
+main(int argc, char **argv)
+{
+ vdstestlib::CppUnitTestRunner testRunner;
+ return testRunner.run(argc, argv);
+}
diff --git a/storage/src/tests/visiting/.gitignore b/storage/src/tests/visiting/.gitignore
new file mode 100644
index 00000000000..184e5d1c936
--- /dev/null
+++ b/storage/src/tests/visiting/.gitignore
@@ -0,0 +1,12 @@
+*.So
+*.lo
+*.o
+.*.swp
+.config.log
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
+testrunner
+testrunner.core
diff --git a/storage/src/tests/visiting/CMakeLists.txt b/storage/src/tests/visiting/CMakeLists.txt
new file mode 100644
index 00000000000..60e130c003c
--- /dev/null
+++ b/storage/src/tests/visiting/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_testvisiting
+ SOURCES
+ commandqueuetest.cpp
+ visitormanagertest.cpp
+ visitortest.cpp
+ memory_bounded_trace_test.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/tests/visiting/commandqueuetest.cpp b/storage/src/tests/visiting/commandqueuetest.cpp
new file mode 100644
index 00000000000..5d6da5f7ea5
--- /dev/null
+++ b/storage/src/tests/visiting/commandqueuetest.cpp
@@ -0,0 +1,223 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storageframework/defaultimplementation/clock/fakeclock.h>
+#include <vespa/storage/visiting/commandqueue.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+
+using vespalib::string;
+
+namespace storage {
+
+struct CommandQueueTest : public CppUnit::TestFixture
+{
+ void testFIFO();
+ void testFIFOWithPriorities();
+ void testReleaseOldest();
+ void testReleaseLowestPriority();
+ void testDeleteIterator();
+
+ CPPUNIT_TEST_SUITE(CommandQueueTest);
+ CPPUNIT_TEST(testFIFO);
+ CPPUNIT_TEST(testFIFOWithPriorities);
+ CPPUNIT_TEST(testReleaseOldest);
+ CPPUNIT_TEST(testReleaseLowestPriority);
+ CPPUNIT_TEST(testDeleteIterator);
+ CPPUNIT_TEST_SUITE_END();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(CommandQueueTest);
+
+namespace {
+ std::shared_ptr<api::CreateVisitorCommand> getCommand(
+ const vespalib::stringref & name, int timeout,
+ uint8_t priority = 0)
+ {
+ vespalib::asciistream ost;
+ ost << name << " t=" << timeout << " p=" << static_cast<unsigned int>(priority);
+ // Piggyback name in document selection
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("", "", ost.str()));
+ cmd->setQueueTimeout(timeout);
+ cmd->setPriority(priority);
+ return cmd;
+ }
+
+ const vespalib::string &
+ getCommandString(const std::shared_ptr<api::CreateVisitorCommand>& cmd)
+ {
+ return cmd->getDocumentSelection();
+ }
+
+}
+
+void CommandQueueTest::testFIFO() {
+ framework::defaultimplementation::FakeClock clock;
+ CommandQueue<api::CreateVisitorCommand> queue(clock);
+ CPPUNIT_ASSERT(queue.empty());
+ // Use all default priorities, meaning what comes out should be in the same order
+ // as what went in
+ queue.add(getCommand("first", 1));
+ queue.add(getCommand("second", 10));
+ queue.add(getCommand("third", 5));
+ queue.add(getCommand("fourth", 0));
+ queue.add(getCommand("fifth", 3));
+ queue.add(getCommand("sixth", 14));
+ queue.add(getCommand("seventh", 7));
+
+ CPPUNIT_ASSERT(!queue.empty());
+ std::vector<std::shared_ptr<api::CreateVisitorCommand> > commands;
+ for (;;) {
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ queue.releaseNextCommand().first);
+ if (cmd.get() == 0) break;
+ commands.push_back(cmd);
+ }
+ CPPUNIT_ASSERT_EQUAL(size_t(7), commands.size());
+ CPPUNIT_ASSERT_EQUAL(string("first t=1 p=0"), getCommandString(commands[0]));
+ CPPUNIT_ASSERT_EQUAL(string("second t=10 p=0"), getCommandString(commands[1]));
+ CPPUNIT_ASSERT_EQUAL(string("third t=5 p=0"), getCommandString(commands[2]));
+ CPPUNIT_ASSERT_EQUAL(string("fourth t=0 p=0"), getCommandString(commands[3]));
+ CPPUNIT_ASSERT_EQUAL(string("fifth t=3 p=0"), getCommandString(commands[4]));
+ CPPUNIT_ASSERT_EQUAL(string("sixth t=14 p=0"), getCommandString(commands[5]));
+ CPPUNIT_ASSERT_EQUAL(string("seventh t=7 p=0"), getCommandString(commands[6]));
+}
+
+void CommandQueueTest::testFIFOWithPriorities() {
+ framework::defaultimplementation::FakeClock clock;
+ CommandQueue<api::CreateVisitorCommand> queue(clock);
+ CPPUNIT_ASSERT(queue.empty());
+
+ queue.add(getCommand("first", 1, 10));
+ CPPUNIT_ASSERT_EQUAL(string("first t=1 p=10"), getCommandString(queue.peekLowestPriorityCommand()));
+ queue.add(getCommand("second", 10, 22));
+ queue.add(getCommand("third", 5, 9));
+ CPPUNIT_ASSERT_EQUAL(string("second t=10 p=22"), getCommandString(queue.peekLowestPriorityCommand()));
+ queue.add(getCommand("fourth", 0, 22));
+ queue.add(getCommand("fifth", 3, 22));
+ CPPUNIT_ASSERT_EQUAL(string("fifth t=3 p=22"), getCommandString(queue.peekLowestPriorityCommand()));
+ queue.add(getCommand("sixth", 14, 50));
+ queue.add(getCommand("seventh", 7, 0));
+
+ CPPUNIT_ASSERT_EQUAL(string("sixth t=14 p=50"), getCommandString(queue.peekLowestPriorityCommand()));
+
+ CPPUNIT_ASSERT(!queue.empty());
+ std::vector<std::shared_ptr<api::CreateVisitorCommand> > commands;
+ for (;;) {
+ std::shared_ptr<api::CreateVisitorCommand> cmdPeek(queue.peekNextCommand());
+ std::shared_ptr<api::CreateVisitorCommand> cmd(queue.releaseNextCommand().first);
+ if (cmd.get() == 0 || cmdPeek != cmd) break;
+ commands.push_back(cmd);
+ }
+ CPPUNIT_ASSERT_EQUAL(size_t(7), commands.size());
+ CPPUNIT_ASSERT_EQUAL(string("seventh t=7 p=0"), getCommandString(commands[0]));
+ CPPUNIT_ASSERT_EQUAL(string("third t=5 p=9"), getCommandString(commands[1]));
+ CPPUNIT_ASSERT_EQUAL(string("first t=1 p=10"), getCommandString(commands[2]));
+ CPPUNIT_ASSERT_EQUAL(string("second t=10 p=22"), getCommandString(commands[3]));
+ CPPUNIT_ASSERT_EQUAL(string("fourth t=0 p=22"), getCommandString(commands[4]));
+ CPPUNIT_ASSERT_EQUAL(string("fifth t=3 p=22"), getCommandString(commands[5]));
+ CPPUNIT_ASSERT_EQUAL(string("sixth t=14 p=50"), getCommandString(commands[6]));
+}
+
+void CommandQueueTest::testReleaseOldest() {
+ framework::defaultimplementation::FakeClock clock(framework::defaultimplementation::FakeClock::FAKE_ABSOLUTE);
+ CommandQueue<api::CreateVisitorCommand> queue(clock);
+ CPPUNIT_ASSERT(queue.empty());
+ queue.add(getCommand("first", 10));
+ queue.add(getCommand("second", 100));
+ queue.add(getCommand("third", 1000));
+ queue.add(getCommand("fourth", 5));
+ queue.add(getCommand("fifth", 3000));
+ queue.add(getCommand("sixth", 400));
+ queue.add(getCommand("seventh", 700));
+ CPPUNIT_ASSERT_EQUAL(7u, queue.size());
+
+ typedef CommandQueue<api::CreateVisitorCommand>::CommandEntry CommandEntry;
+ std::list<CommandEntry> timedOut(queue.releaseTimedOut());
+ CPPUNIT_ASSERT(timedOut.empty());
+ clock.addMilliSecondsToTime(400 * 1000);
+ timedOut = queue.releaseTimedOut();
+ CPPUNIT_ASSERT_EQUAL(size_t(4), timedOut.size());
+ std::ostringstream ost;
+ for (std::list<CommandEntry>::const_iterator it = timedOut.begin();
+ it != timedOut.end(); ++it)
+ {
+ ost << getCommandString(it->_command) << "\n";
+ }
+ CPPUNIT_ASSERT_EQUAL(std::string(
+ "fourth t=5 p=0\n"
+ "first t=10 p=0\n"
+ "second t=100 p=0\n"
+ "sixth t=400 p=0\n"), ost.str());
+ CPPUNIT_ASSERT_EQUAL(3u, queue.size());
+}
+
+void CommandQueueTest::testReleaseLowestPriority() {
+ framework::defaultimplementation::FakeClock clock;
+ CommandQueue<api::CreateVisitorCommand> queue(clock);
+ CPPUNIT_ASSERT(queue.empty());
+
+ queue.add(getCommand("first", 1, 10));
+ queue.add(getCommand("second", 10, 22));
+ queue.add(getCommand("third", 5, 9));
+ queue.add(getCommand("fourth", 0, 22));
+ queue.add(getCommand("fifth", 3, 22));
+ queue.add(getCommand("sixth", 14, 50));
+ queue.add(getCommand("seventh", 7, 0));
+ CPPUNIT_ASSERT_EQUAL(7u, queue.size());
+
+ std::vector<std::shared_ptr<api::CreateVisitorCommand> > commands;
+ for (;;) {
+ std::shared_ptr<api::CreateVisitorCommand> cmdPeek(queue.peekLowestPriorityCommand());
+ std::pair<std::shared_ptr<api::CreateVisitorCommand>, uint64_t> cmd(
+ queue.releaseLowestPriorityCommand());
+ if (cmd.first.get() == 0 || cmdPeek != cmd.first) break;
+ commands.push_back(cmd.first);
+ }
+ CPPUNIT_ASSERT_EQUAL(size_t(7), commands.size());
+ CPPUNIT_ASSERT_EQUAL(string("sixth t=14 p=50"), getCommandString(commands[0]));
+ CPPUNIT_ASSERT_EQUAL(string("fifth t=3 p=22"), getCommandString(commands[1]));
+ CPPUNIT_ASSERT_EQUAL(string("fourth t=0 p=22"), getCommandString(commands[2]));
+ CPPUNIT_ASSERT_EQUAL(string("second t=10 p=22"), getCommandString(commands[3]));
+ CPPUNIT_ASSERT_EQUAL(string("first t=1 p=10"), getCommandString(commands[4]));
+ CPPUNIT_ASSERT_EQUAL(string("third t=5 p=9"), getCommandString(commands[5]));
+ CPPUNIT_ASSERT_EQUAL(string("seventh t=7 p=0"), getCommandString(commands[6]));
+}
+
+void CommandQueueTest::testDeleteIterator() {
+ framework::defaultimplementation::FakeClock clock;
+ CommandQueue<api::CreateVisitorCommand> queue(clock);
+ CPPUNIT_ASSERT(queue.empty());
+ queue.add(getCommand("first", 10));
+ queue.add(getCommand("second", 100));
+ queue.add(getCommand("third", 1000));
+ queue.add(getCommand("fourth", 5));
+ queue.add(getCommand("fifth", 3000));
+ queue.add(getCommand("sixth", 400));
+ queue.add(getCommand("seventh", 700));
+ CPPUNIT_ASSERT_EQUAL(7u, queue.size());
+
+ CommandQueue<api::CreateVisitorCommand>::iterator it = queue.begin();
+ ++it; ++it;
+ queue.erase(it);
+ CPPUNIT_ASSERT_EQUAL(6u, queue.size());
+
+ std::vector<std::shared_ptr<api::CreateVisitorCommand> > cmds;
+ for (;;) {
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ std::dynamic_pointer_cast<api::CreateVisitorCommand>(
+ queue.releaseNextCommand().first));
+ if (cmd.get() == 0) break;
+ cmds.push_back(cmd);
+ }
+ CPPUNIT_ASSERT_EQUAL(size_t(6), cmds.size());
+ CPPUNIT_ASSERT_EQUAL(string("first t=10 p=0"), getCommandString(cmds[0]));
+ CPPUNIT_ASSERT_EQUAL(string("second t=100 p=0"), getCommandString(cmds[1]));
+ CPPUNIT_ASSERT_EQUAL(string("fourth t=5 p=0"), getCommandString(cmds[2]));
+ CPPUNIT_ASSERT_EQUAL(string("fifth t=3000 p=0"), getCommandString(cmds[3]));
+ CPPUNIT_ASSERT_EQUAL(string("sixth t=400 p=0"), getCommandString(cmds[4]));
+ CPPUNIT_ASSERT_EQUAL(string("seventh t=700 p=0"), getCommandString(cmds[5]));
+}
+
+}
+
diff --git a/storage/src/tests/visiting/memory_bounded_trace_test.cpp b/storage/src/tests/visiting/memory_bounded_trace_test.cpp
new file mode 100644
index 00000000000..85eae12fc34
--- /dev/null
+++ b/storage/src/tests/visiting/memory_bounded_trace_test.cpp
@@ -0,0 +1,131 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/storage/visiting/memory_bounded_trace.h>
+
+namespace storage {
+
+class MemoryBoundedTraceTest : public CppUnit::TestFixture
+{
+ CPPUNIT_TEST_SUITE(MemoryBoundedTraceTest);
+ CPPUNIT_TEST(noMemoryReportedUsedWhenEmpty);
+ CPPUNIT_TEST(memoryUsedIsStringLengthForLeafNode);
+ CPPUNIT_TEST(memoryUsedIsAccumulatedRecursivelyForNonLeafNodes);
+ CPPUNIT_TEST(traceNodesCanBeMovedAndImplicitlyCleared);
+ CPPUNIT_TEST(movedTraceTreeIsMarkedAsStrict);
+ CPPUNIT_TEST(canNotAddMoreNodesWhenMemoryUsedExceedsUpperBound);
+ CPPUNIT_TEST(movedTreeIncludesStatsNodeWhenNodesOmitted);
+ CPPUNIT_TEST_SUITE_END();
+
+public:
+ void noMemoryReportedUsedWhenEmpty();
+ void memoryUsedIsStringLengthForLeafNode();
+ void memoryUsedIsAccumulatedRecursivelyForNonLeafNodes();
+ void traceNodesCanBeMovedAndImplicitlyCleared();
+ void movedTraceTreeIsMarkedAsStrict();
+ void canNotAddMoreNodesWhenMemoryUsedExceedsUpperBound();
+ void movedTreeIncludesStatsNodeWhenNodesOmitted();
+};
+
+CPPUNIT_TEST_SUITE_REGISTRATION(MemoryBoundedTraceTest);
+
+void
+MemoryBoundedTraceTest::noMemoryReportedUsedWhenEmpty()
+{
+ MemoryBoundedTrace trace(100);
+ CPPUNIT_ASSERT_EQUAL(size_t(0), trace.getApproxMemoryUsed());
+}
+
+void
+MemoryBoundedTraceTest::memoryUsedIsStringLengthForLeafNode()
+{
+ MemoryBoundedTrace trace(100);
+ CPPUNIT_ASSERT(trace.add(mbus::TraceNode("hello world", 0)));
+ CPPUNIT_ASSERT_EQUAL(size_t(11), trace.getApproxMemoryUsed());
+}
+
+void
+MemoryBoundedTraceTest::memoryUsedIsAccumulatedRecursivelyForNonLeafNodes()
+{
+ MemoryBoundedTrace trace(100);
+ mbus::TraceNode innerNode;
+ innerNode.addChild("hello world");
+ innerNode.addChild("goodbye moon");
+ CPPUNIT_ASSERT(trace.add(innerNode));
+ CPPUNIT_ASSERT_EQUAL(size_t(23), trace.getApproxMemoryUsed());
+}
+
+void
+MemoryBoundedTraceTest::traceNodesCanBeMovedAndImplicitlyCleared()
+{
+ MemoryBoundedTrace trace(100);
+ CPPUNIT_ASSERT(trace.add(mbus::TraceNode("hello world", 0)));
+ mbus::TraceNode target;
+ trace.moveTraceTo(target);
+ CPPUNIT_ASSERT_EQUAL(uint32_t(1), target.getNumChildren());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), trace.getApproxMemoryUsed());
+
+ mbus::TraceNode emptinessCheck;
+ trace.moveTraceTo(emptinessCheck);
+ CPPUNIT_ASSERT_EQUAL(uint32_t(0), emptinessCheck.getNumChildren());
+}
+
+/**
+ * We want trace subtrees to be strictly ordered so that the message about
+ * omitted traces will remain soundly as the last ordered node. There is no
+ * particular performance reason for not having strict mode enabled to the
+ * best of my knowledge, since the internal backing data structure is an
+ * ordered vector anyhow.
+ */
+void
+MemoryBoundedTraceTest::movedTraceTreeIsMarkedAsStrict()
+{
+ MemoryBoundedTrace trace(100);
+ CPPUNIT_ASSERT(trace.add(mbus::TraceNode("hello world", 0)));
+ mbus::TraceNode target;
+ trace.moveTraceTo(target);
+ CPPUNIT_ASSERT_EQUAL(uint32_t(1), target.getNumChildren());
+ CPPUNIT_ASSERT(target.getChild(0).isStrict());
+}
+
+void
+MemoryBoundedTraceTest::canNotAddMoreNodesWhenMemoryUsedExceedsUpperBound()
+{
+ // Note: we allow one complete node tree to exceed the bounds, but as soon
+ // as the bound is exceeded no further nodes can be added.
+ MemoryBoundedTrace trace(10);
+ CPPUNIT_ASSERT(trace.add(mbus::TraceNode("hello world", 0)));
+ CPPUNIT_ASSERT_EQUAL(size_t(11), trace.getApproxMemoryUsed());
+
+ CPPUNIT_ASSERT(!trace.add(mbus::TraceNode("the quick red fox runs across "
+ "the freeway", 0)));
+ CPPUNIT_ASSERT_EQUAL(size_t(11), trace.getApproxMemoryUsed());
+
+ mbus::TraceNode target;
+ trace.moveTraceTo(target);
+ // Twice nested node (root -> added trace tree -> leaf with txt).
+ CPPUNIT_ASSERT_EQUAL(uint32_t(1), target.getNumChildren());
+ CPPUNIT_ASSERT(target.getChild(0).getNumChildren() >= 1);
+ CPPUNIT_ASSERT_EQUAL(vespalib::string("hello world"),
+ target.getChild(0).getChild(0).getNote());
+}
+
+void
+MemoryBoundedTraceTest::movedTreeIncludesStatsNodeWhenNodesOmitted()
+{
+ MemoryBoundedTrace trace(5);
+ CPPUNIT_ASSERT(trace.add(mbus::TraceNode("abcdef", 0)));
+ CPPUNIT_ASSERT(!trace.add(mbus::TraceNode("ghijkjlmn", 0)));
+
+ mbus::TraceNode target;
+ trace.moveTraceTo(target);
+ CPPUNIT_ASSERT_EQUAL(uint32_t(1), target.getNumChildren());
+ CPPUNIT_ASSERT_EQUAL(uint32_t(2), target.getChild(0).getNumChildren());
+ vespalib::string expected("Trace too large; omitted 1 subsequent trace "
+ "trees containing a total of 9 bytes");
+ CPPUNIT_ASSERT_EQUAL(expected, target.getChild(0).getChild(1).getNote());
+}
+
+} // storage
+
diff --git a/storage/src/tests/visiting/visitormanagertest.cpp b/storage/src/tests/visiting/visitormanagertest.cpp
new file mode 100644
index 00000000000..d782abf7d54
--- /dev/null
+++ b/storage/src/tests/visiting/visitormanagertest.cpp
@@ -0,0 +1,1172 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/document/datatype/datatype.h>
+#include <vespa/document/fieldvalue/intfieldvalue.h>
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/document/fieldvalue/rawfieldvalue.h>
+#include <vespa/log/log.h>
+#include <vespa/storageapi/message/datagram.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <vector>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+#include <vespa/storage/visiting/visitormanager.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/testhelper.h>
+#include <tests/common/dummystoragelink.h>
+#include <tests/storageserver/testvisitormessagesession.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/vdslib/container/visitorordering.h>
+#include <vespa/documentapi/messagebus/messages/multioperationmessage.h>
+#include <vespa/documentapi/messagebus/messages/putdocumentmessage.h>
+#include <vespa/documentapi/messagebus/messages/removedocumentmessage.h>
+
+
+LOG_SETUP(".visitormanagertest");
+
+namespace storage {
+namespace {
+ typedef std::vector<api::StorageMessage::SP> msg_ptr_vector;
+}
+
+class VisitorManagerTest : public CppUnit::TestFixture
+{
+private:
+ CPPUNIT_TEST_SUITE(VisitorManagerTest);
+ CPPUNIT_TEST(testNormalUsage);
+ CPPUNIT_TEST(testResending);
+ CPPUNIT_TEST(testVisitEmptyBucket);
+ CPPUNIT_TEST(testMultiBucketVisit);
+ CPPUNIT_TEST(testNoBuckets);
+ CPPUNIT_TEST(testVisitPutsAndRemoves);
+ CPPUNIT_TEST(testVisitWithTimeframeAndSelection);
+ CPPUNIT_TEST(testVisitWithTimeframeAndBogusSelection);
+ CPPUNIT_TEST(testVisitorCallbacks);
+ CPPUNIT_TEST(testVisitorCleanup);
+ CPPUNIT_TEST(testAbortOnFailedVisitorInfo);
+ CPPUNIT_TEST(testAbortOnFieldPathError);
+ CPPUNIT_TEST(testVisitorQueueTimeout);
+ CPPUNIT_TEST(testVisitorProcessingTimeout);
+ CPPUNIT_TEST(testPrioritizedVisitorQueing);
+ CPPUNIT_TEST(testPrioritizedMaxConcurrentVisitors);
+ CPPUNIT_TEST(testVisitorQueingZeroQueueSize);
+ CPPUNIT_TEST(testHitCounter);
+ CPPUNIT_TEST(testStatusPage);
+ CPPUNIT_TEST_SUITE_END();
+
+ static uint32_t docCount;
+ std::vector<document::Document::SP > _documents;
+ std::unique_ptr<TestVisitorMessageSessionFactory> _messageSessionFactory;
+ std::unique_ptr<TestServiceLayerApp> _node;
+ std::unique_ptr<DummyStorageLink> _top;
+ VisitorManager* _manager;
+
+public:
+ VisitorManagerTest() : _node() {}
+
+ // Not using setUp since can't throw exception out of it.
+ void initializeTest();
+ void addSomeRemoves(bool removeAll = false);
+ void tearDown();
+ TestVisitorMessageSession& getSession(uint32_t n);
+ uint64_t verifyCreateVisitorReply(
+ api::ReturnCode::Result expectedResult,
+ int checkStatsDocsVisited = -1,
+ int checkStatsBytesVisited = -1);
+ void getMessagesAndReply(
+ int expectedCount,
+ TestVisitorMessageSession& session,
+ std::vector<document::Document::SP >& docs,
+ std::vector<document::DocumentId>& docIds,
+ api::ReturnCode::Result returnCode = api::ReturnCode::OK,
+ documentapi::Priority::Value priority = documentapi::Priority::PRI_NORMAL_4);
+ uint32_t getMatchingDocuments(std::vector<document::Document::SP >& docs);
+
+ void testNormalUsage();
+ void testResending();
+ void testVisitEmptyBucket();
+ void testMultiBucketVisit();
+ void testNoBuckets();
+ void testVisitPutsAndRemoves();
+ void testVisitWithTimeframeAndSelection();
+ void testVisitWithTimeframeAndBogusSelection();
+ void testVisitorCallbacks();
+ void testVisitorCleanup();
+ void testAbortOnFailedVisitorInfo();
+ void testAbortOnFieldPathError();
+ void testVisitorQueueTimeout();
+ void testVisitorProcessingTimeout();
+ void testPrioritizedVisitorQueing();
+ void testPrioritizedMaxConcurrentVisitors();
+ void testVisitorQueingZeroQueueSize();
+ void testHitCounter();
+ void testStatusPage();
+};
+
+uint32_t VisitorManagerTest::docCount = 10;
+
+CPPUNIT_TEST_SUITE_REGISTRATION(VisitorManagerTest);
+
+void
+VisitorManagerTest::initializeTest()
+{
+ LOG(debug, "Initializing test");
+ vdstestlib::DirConfig config(getStandardConfig(true));
+ config.getConfig("stor-visitor").set("visitorthreads", "1");
+
+ try {
+ _messageSessionFactory.reset(
+ new TestVisitorMessageSessionFactory(config.getConfigId()));
+ _node.reset(
+ new TestServiceLayerApp(config.getConfigId()));
+ _node->setupDummyPersistence();
+ _node->getStateUpdater().setClusterState(
+ lib::ClusterState::CSP(
+ new lib::ClusterState("storage:1 distributor:1")));
+ _top.reset(new DummyStorageLink());
+ _top->push_back(std::unique_ptr<StorageLink>(_manager
+ = new VisitorManager(
+ config.getConfigId(), _node->getComponentRegister(),
+ *_messageSessionFactory)));
+ _top->push_back(std::unique_ptr<StorageLink>(new FileStorManager(
+ config.getConfigId(), _node->getPartitions(), _node->getPersistenceProvider(), _node->getComponentRegister())));
+ _manager->setTimeBetweenTicks(10);
+ _top->open();
+ } catch (config::InvalidConfigException& e) {
+ fprintf(stderr, "%s\n", e.what());
+ }
+ // Adding some documents so database isn't empty
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ std::string content(
+ "To be, or not to be: that is the question:\n"
+ "Whether 'tis nobler in the mind to suffer\n"
+ "The slings and arrows of outrageous fortune,\n"
+ "Or to take arms against a sea of troubles,\n"
+ "And by opposing end them? To die: to sleep;\n"
+ "No more; and by a sleep to say we end\n"
+ "The heart-ache and the thousand natural shocks\n"
+ "That flesh is heir to, 'tis a consummation\n"
+ "Devoutly to be wish'd. To die, to sleep;\n"
+ "To sleep: perchance to dream: ay, there's the rub;\n"
+ "For in that sleep of death what dreams may come\n"
+ "When we have shuffled off this mortal coil,\n"
+ "Must give us pause: there's the respect\n"
+ "That makes calamity of so long life;\n"
+ "For who would bear the whips and scorns of time,\n"
+ "The oppressor's wrong, the proud man's contumely,\n"
+ "The pangs of despised love, the law's delay,\n"
+ "The insolence of office and the spurns\n"
+ "That patient merit of the unworthy takes,\n"
+ "When he himself might his quietus make\n"
+ "With a bare bodkin? who would fardels bear,\n"
+ "To grunt and sweat under a weary life,\n"
+ "But that the dread of something after death,\n"
+ "The undiscover'd country from whose bourn\n"
+ "No traveller returns, puzzles the will\n"
+ "And makes us rather bear those ills we have\n"
+ "Than fly to others that we know not of?\n"
+ "Thus conscience does make cowards of us all;\n"
+ "And thus the native hue of resolution\n"
+ "Is sicklied o'er with the pale cast of thought,\n"
+ "And enterprises of great pith and moment\n"
+ "With this regard their currents turn awry,\n"
+ "And lose the name of action. - Soft you now!\n"
+ "The fair Ophelia! Nymph, in thy orisons\n"
+ "Be all my sins remember'd.\n");
+ for (uint32_t i=0; i<docCount; ++i) {
+ std::ostringstream uri;
+ uri << "userdoc:test:" << i % 10 << ":http://www.ntnu.no/"
+ << i << ".html";
+
+ _documents.push_back(document::Document::SP(
+ _node->getTestDocMan().createDocument(content, uri.str())));
+ const document::DocumentType& type(_documents.back()->getType());
+ _documents.back()->setValue(type.getField("headerval"),
+ document::IntFieldValue(i % 4));
+ }
+ for (uint32_t i=0; i<10; ++i) {
+ document::BucketId bid(16, i);
+
+ std::shared_ptr<api::CreateBucketCommand> cmd(
+ new api::CreateBucketCommand(bid));
+ cmd->setAddress(address);
+ cmd->setSourceIndex(0);
+ _top->sendDown(cmd);
+ _top->waitForMessages(1, 60);
+ _top->reset();
+
+ StorBucketDatabase::WrappedEntry entry(
+ _node->getStorageBucketDatabase().get(bid, "",
+ StorBucketDatabase::CREATE_IF_NONEXISTING));
+ entry->disk = 0;
+ entry.write();
+ }
+ for (uint32_t i=0; i<docCount; ++i) {
+ document::BucketId bid(16, i);
+
+ std::shared_ptr<api::PutCommand> cmd(
+ new api::PutCommand(bid, _documents[i], i+1));
+ cmd->setAddress(address);
+ _top->sendDown(cmd);
+ _top->waitForMessages(1, 60);
+ const msg_ptr_vector replies = _top->getRepliesOnce();
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, replies.size());
+ std::shared_ptr<api::PutReply> reply(
+ std::dynamic_pointer_cast<api::PutReply>(
+ replies[0]));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode(api::ReturnCode::OK),
+ reply->getResult());
+ }
+ LOG(debug, "Done initializing test");
+}
+
+void
+VisitorManagerTest::addSomeRemoves(bool removeAll)
+{
+ framework::defaultimplementation::FakeClock clock;
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ for (uint32_t i=0; i<docCount; i += (removeAll ? 1 : 4)) {
+ // Add it to the database
+ document::BucketId bid(16, i % 10);
+ std::shared_ptr<api::RemoveCommand> cmd(
+ new api::RemoveCommand(
+ bid, _documents[i]->getId(), clock.getTimeInMicros().getTime() + docCount + i + 1));
+ cmd->setAddress(address);
+ _top->sendDown(cmd);
+ _top->waitForMessages(1, 60);
+ const msg_ptr_vector replies = _top->getRepliesOnce();
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, replies.size());
+ std::shared_ptr<api::RemoveReply> reply(
+ std::dynamic_pointer_cast<api::RemoveReply>(
+ replies[0]));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode(api::ReturnCode::OK),
+ reply->getResult());
+ }
+}
+
+void
+VisitorManagerTest::tearDown()
+{
+ if (_top.get() != 0) {
+ _top->close();
+ _top->flush();
+ _top.reset(0);
+ }
+ _node.reset(0);
+ _messageSessionFactory.reset(0);
+ _manager = 0;
+}
+
+TestVisitorMessageSession&
+VisitorManagerTest::getSession(uint32_t n)
+{
+ // Wait until we have started the visitor
+ const std::vector<TestVisitorMessageSession*>& sessions(
+ _messageSessionFactory->_visitorSessions);
+ framework::defaultimplementation::RealClock clock;
+ framework::MilliSecTime endTime(
+ clock.getTimeInMillis() + framework::MilliSecTime(30 * 1000));
+ while (true) {
+ {
+ vespalib::LockGuard lock(_messageSessionFactory->_accessLock);
+ if (sessions.size() > n) {
+ return *sessions[n];
+ }
+ }
+ if (clock.getTimeInMillis() > endTime) {
+ throw vespalib::IllegalStateException(
+ "Timed out waiting for visitor session", VESPA_STRLOC);
+ }
+ FastOS_Thread::Sleep(10);
+ }
+ throw std::logic_error("unreachable");
+}
+
+void
+VisitorManagerTest::getMessagesAndReply(
+ int expectedCount,
+ TestVisitorMessageSession& session,
+ std::vector<document::Document::SP >& docs,
+ std::vector<document::DocumentId>& docIds,
+ api::ReturnCode::Result result,
+ documentapi::Priority::Value priority)
+{
+ for (int i = 0; i < expectedCount; i++) {
+ session.waitForMessages(i + 1);
+ mbus::Reply::UP reply;
+ {
+ vespalib::MonitorGuard guard(session.getMonitor());
+
+ CPPUNIT_ASSERT_EQUAL(priority,
+ session.sentMessages[i]->getPriority());
+
+ switch (session.sentMessages[i]->getType()) {
+ case documentapi::DocumentProtocol::MESSAGE_PUTDOCUMENT:
+ docs.push_back(static_cast<documentapi::PutDocumentMessage&>(
+ *session.sentMessages[i]).getDocument());
+ break;
+ case documentapi::DocumentProtocol::MESSAGE_REMOVEDOCUMENT:
+ docIds.push_back(static_cast<documentapi::RemoveDocumentMessage&>(
+ *session.sentMessages[i]).getDocumentId());
+ break;
+ default:
+ break;
+ }
+
+ reply = session.sentMessages[i]->createReply();
+ reply->swapState(*session.sentMessages[i]);
+ reply->setMessage(
+ mbus::Message::UP(session.sentMessages[i].release()));
+
+ if (result != api::ReturnCode::OK) {
+ reply->addError(mbus::Error(result, "Generic error"));
+ }
+ }
+
+ session.reply(std::move(reply));
+ }
+}
+
+uint64_t
+VisitorManagerTest::verifyCreateVisitorReply(
+ api::ReturnCode::Result expectedResult,
+ int checkStatsDocsVisited,
+ int checkStatsBytesVisited)
+{
+ _top->waitForMessages(1, 60);
+ const msg_ptr_vector replies = _top->getRepliesOnce();
+ CPPUNIT_ASSERT_EQUAL(1, (int)replies.size());
+
+ std::shared_ptr<api::StorageMessage> msg(replies[0]);
+
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::VISITOR_CREATE_REPLY, msg->getType());
+
+ std::shared_ptr<api::CreateVisitorReply> reply(
+ std::dynamic_pointer_cast<api::CreateVisitorReply>(msg));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(expectedResult, reply->getResult().getResult());
+
+ if (checkStatsDocsVisited >= 0) {
+ CPPUNIT_ASSERT_EQUAL(checkStatsDocsVisited,
+ int(reply->getVisitorStatistics().getDocumentsVisited()));
+ }
+ if (checkStatsBytesVisited >= 0) {
+ CPPUNIT_ASSERT_EQUAL(checkStatsBytesVisited,
+ int(reply->getVisitorStatistics().getBytesVisited()));
+ }
+
+ return reply->getMsgId();
+}
+
+uint32_t
+VisitorManagerTest::getMatchingDocuments(std::vector<document::Document::SP >& docs) {
+ uint32_t equalCount = 0;
+ for (uint32_t i=0; i<docs.size(); ++i) {
+ for (uint32_t j=0; j<_documents.size(); ++j) {
+ if (docs[i]->getId() == _documents[j]->getId()
+ && *docs[i] == *_documents[j])
+
+ {
+ equalCount++;
+ }
+ }
+ }
+
+ return equalCount;
+}
+
+void
+VisitorManagerTest::testHitCounter()
+{
+ document::OrderingSpecification spec(document::OrderingSpecification::ASCENDING, 42, 7, 2);
+ Visitor::HitCounter hitCounter(&spec);
+
+ hitCounter.addHit(document::DocumentId("orderdoc(7,2):mail:1234:42:foo"), 450);
+ hitCounter.addHit(document::DocumentId("orderdoc(7,2):mail:1234:49:foo"), 450);
+ hitCounter.addHit(document::DocumentId("orderdoc(7,2):mail:1234:60:foo"), 450);
+ hitCounter.addHit(document::DocumentId("orderdoc(7,2):mail:1234:10:foo"), 450);
+ hitCounter.addHit(document::DocumentId("orderdoc(7,2):mail:1234:21:foo"), 450);
+
+ CPPUNIT_ASSERT_EQUAL(3, (int)hitCounter.getFirstPassHits());
+ CPPUNIT_ASSERT_EQUAL(1350, (int)hitCounter.getFirstPassBytes());
+ CPPUNIT_ASSERT_EQUAL(2, (int)hitCounter.getSecondPassHits());
+ CPPUNIT_ASSERT_EQUAL(900, (int)hitCounter.getSecondPassBytes());
+}
+
+namespace {
+
+int getTotalSerializedSize(const std::vector<document::Document::SP>& docs)
+{
+ int total = 0;
+ for (size_t i = 0; i < docs.size(); ++i) {
+ total += int(docs[i]->serialize()->getLength());
+ }
+ return total;
+}
+
+}
+
+void
+VisitorManagerTest::testNormalUsage()
+{
+ initializeTest();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", "testvis", ""));
+ cmd->addBucketToBeVisited(document::BucketId(16, 3));
+ cmd->setAddress(address);
+ cmd->setControlDestination("foo/bar");
+ _top->sendDown(cmd);
+ std::vector<document::Document::SP > docs;
+ std::vector<document::DocumentId> docIds;
+
+ // Should receive one multioperation message (bucket 3 has one document).
+ getMessagesAndReply(1, getSession(0), docs, docIds);
+
+ // All data has been replied to, expecting to get a create visitor reply
+ verifyCreateVisitorReply(api::ReturnCode::OK,
+ int(docs.size()),
+ getTotalSerializedSize(docs));
+
+ CPPUNIT_ASSERT_EQUAL(1u, getMatchingDocuments(docs));
+ CPPUNIT_ASSERT(!_manager->hasPendingMessageState());
+}
+
+void
+VisitorManagerTest::testResending()
+{
+ initializeTest();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", "testvis", ""));
+ cmd->addBucketToBeVisited(document::BucketId(16, 3));
+ cmd->setAddress(address);
+ cmd->setControlDestination("foo/bar");
+ _top->sendDown(cmd);
+ std::vector<document::Document::SP > docs;
+ std::vector<document::DocumentId> docIds;
+
+ TestVisitorMessageSession& session = getSession(0);
+ getMessagesAndReply(1, session, docs, docIds, api::ReturnCode::NOT_READY);
+
+ {
+ session.waitForMessages(2);
+
+ documentapi::DocumentMessage* msg = session.sentMessages[1].get();
+
+ mbus::Reply::UP reply = msg->createReply();
+
+ CPPUNIT_ASSERT_EQUAL((uint32_t)documentapi::DocumentProtocol::MESSAGE_VISITORINFO,
+ session.sentMessages[1]->getType());
+ reply->swapState(*session.sentMessages[1]);
+ reply->setMessage(mbus::Message::UP(session.sentMessages[1].release()));
+ session.reply(std::move(reply));
+ }
+
+ _node->getClock().addSecondsToTime(1);
+
+ {
+ session.waitForMessages(3);
+
+ documentapi::DocumentMessage* msg = session.sentMessages[2].get();
+
+ mbus::Reply::UP reply = msg->createReply();
+
+ reply->swapState(*session.sentMessages[2]);
+ reply->setMessage(mbus::Message::UP(session.sentMessages[2].release()));
+ session.reply(std::move(reply));
+ }
+
+ // All data has been replied to, expecting to get a create visitor reply
+ verifyCreateVisitorReply(api::ReturnCode::OK);
+}
+
+void
+VisitorManagerTest::testVisitEmptyBucket()
+{
+ initializeTest();
+ addSomeRemoves(true);
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", "testvis", ""));
+ cmd->addBucketToBeVisited(document::BucketId(16, 3));
+
+ cmd->setAddress(address);
+ _top->sendDown(cmd);
+
+ // All data has been replied to, expecting to get a create visitor reply
+ verifyCreateVisitorReply(api::ReturnCode::OK);
+}
+
+void
+VisitorManagerTest::testMultiBucketVisit()
+{
+ initializeTest();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", "testvis", ""));
+ for (uint32_t i=0; i<10; ++i) {
+ cmd->addBucketToBeVisited(document::BucketId(16, i));
+ }
+ cmd->setAddress(address);
+ cmd->setDataDestination("fooclient.0");
+ _top->sendDown(cmd);
+ std::vector<document::Document::SP > docs;
+ std::vector<document::DocumentId> docIds;
+
+ // Should receive one multioperation message for each bucket
+ getMessagesAndReply(10, getSession(0), docs, docIds);
+
+ // All data has been replied to, expecting to get a create visitor reply
+ verifyCreateVisitorReply(api::ReturnCode::OK);
+
+ CPPUNIT_ASSERT_EQUAL(docCount, getMatchingDocuments(docs));
+}
+
+void
+VisitorManagerTest::testNoBuckets()
+{
+ initializeTest();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", "testvis", ""));
+
+ cmd->setAddress(address);
+ _top->sendDown(cmd);
+
+ // Should get one reply; a CreateVisitorReply with error since no
+ // buckets where specified in the CreateVisitorCommand
+ _top->waitForMessages(1, 60);
+ const msg_ptr_vector replies = _top->getRepliesOnce();
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, replies.size());
+ std::shared_ptr<api::CreateVisitorReply> reply(
+ std::dynamic_pointer_cast<api::CreateVisitorReply>(
+ replies[0]));
+ // Verify that cast went ok => it was a CreateVisitorReply message
+ CPPUNIT_ASSERT(reply.get());
+ api::ReturnCode ret(api::ReturnCode::ILLEGAL_PARAMETERS,
+ "No buckets specified");
+ CPPUNIT_ASSERT_EQUAL(ret, reply->getResult());
+}
+
+void VisitorManagerTest::testVisitPutsAndRemoves()
+{
+ initializeTest();
+ addSomeRemoves();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", "testvis", ""));
+ cmd->setAddress(address);
+ cmd->setVisitRemoves();
+ for (uint32_t i=0; i<10; ++i) {
+ cmd->addBucketToBeVisited(document::BucketId(16, i));
+ }
+ _top->sendDown(cmd);
+ std::vector<document::Document::SP > docs;
+ std::vector<document::DocumentId> docIds;
+
+ getMessagesAndReply(10, getSession(0), docs, docIds);
+
+ verifyCreateVisitorReply(api::ReturnCode::OK);
+
+ CPPUNIT_ASSERT_EQUAL(
+ docCount - (docCount + 3) / 4,
+ getMatchingDocuments(docs));
+
+ CPPUNIT_ASSERT_EQUAL(
+ (size_t) (docCount + 3) / 4,
+ docIds.size());
+}
+
+void VisitorManagerTest::testVisitWithTimeframeAndSelection()
+{
+ initializeTest();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", "testvis",
+ "testdoctype1.headerval < 2"));
+ cmd->setFromTime(3);
+ cmd->setToTime(8);
+ for (uint32_t i=0; i<10; ++i) {
+ cmd->addBucketToBeVisited(document::BucketId(16, i));
+ }
+ cmd->setAddress(address);
+ _top->sendDown(cmd);
+ std::vector<document::Document::SP > docs;
+ std::vector<document::DocumentId> docIds;
+
+ getMessagesAndReply(2, getSession(0), docs, docIds);
+
+ verifyCreateVisitorReply(api::ReturnCode::OK);
+
+ CPPUNIT_ASSERT_EQUAL((size_t) 2, docs.size());
+ std::set<std::string> expected;
+ expected.insert("userdoc:test:4:http://www.ntnu.no/4.html");
+ expected.insert("userdoc:test:5:http://www.ntnu.no/5.html");
+ std::set<std::string> actual;
+ for (uint32_t i=0; i<docs.size(); ++i) {
+ actual.insert(docs[i]->getId().toString());
+ }
+ CPPUNIT_ASSERT_EQUAL(expected, actual);
+}
+
+void VisitorManagerTest::testVisitWithTimeframeAndBogusSelection()
+{
+ initializeTest();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", "testvis",
+ "DocType(testdoctype1---///---) XXX BAD Field(headerval) < 2"));
+ cmd->setFromTime(3);
+ cmd->setToTime(8);
+ for (uint32_t i=0; i<10; ++i) {
+ cmd->addBucketToBeVisited(document::BucketId(16, i));
+ }
+ cmd->setAddress(address);
+
+ _top->sendDown(cmd);
+ _top->waitForMessages(1, 60);
+ const msg_ptr_vector replies = _top->getRepliesOnce();
+ CPPUNIT_ASSERT_EQUAL((size_t) 1, replies.size());
+
+ api::StorageReply* reply = dynamic_cast<api::StorageReply*>(
+ replies.front().get());
+ CPPUNIT_ASSERT(reply);
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::ILLEGAL_PARAMETERS,
+ reply->getResult().getResult());
+}
+
+void
+VisitorManagerTest::testVisitorCallbacks()
+{
+ initializeTest();
+ std::ostringstream replydata;
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("TestVisitor", "testvis", ""));
+ cmd->addBucketToBeVisited(document::BucketId(16, 3));
+ cmd->addBucketToBeVisited(document::BucketId(16, 5));
+ cmd->setAddress(address);
+ _top->sendDown(cmd);
+
+ // Wait until we have started the visitor
+ TestVisitorMessageSession& session = getSession(0);
+
+ for (uint32_t i = 0; i < 6; i++) {
+ session.waitForMessages(i + 1);
+ mbus::Reply::UP reply;
+ {
+ vespalib::MonitorGuard guard(session.getMonitor());
+
+ CPPUNIT_ASSERT_EQUAL((uint32_t)documentapi::DocumentProtocol::MESSAGE_MAPVISITOR, session.sentMessages[i]->getType());
+
+ documentapi::MapVisitorMessage* mapvisitormsg(
+ static_cast<documentapi::MapVisitorMessage*>(session.sentMessages[i].get()));
+
+ replydata << mapvisitormsg->getData().get("msg");
+
+ reply = mapvisitormsg->createReply();
+ reply->swapState(*session.sentMessages[i]);
+ reply->setMessage(mbus::Message::UP(session.sentMessages[i].release()));
+ }
+ session.reply(std::move(reply));
+ }
+
+ // All data has been replied to, expecting to get a create visitor reply
+ verifyCreateVisitorReply(api::ReturnCode::OK);
+
+ CPPUNIT_ASSERT_SUBSTRING_COUNT(replydata.str(), 1, "Starting visitor");
+ CPPUNIT_ASSERT_SUBSTRING_COUNT(replydata.str(), 2, "Handling block of 1 documents");
+ CPPUNIT_ASSERT_SUBSTRING_COUNT(replydata.str(), 2, "completedBucket");
+ CPPUNIT_ASSERT_SUBSTRING_COUNT(replydata.str(), 1, "completedVisiting");
+}
+
+void
+VisitorManagerTest::testVisitorCleanup()
+{
+ initializeTest();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+
+ // Start a bunch of invalid visitors
+ for (uint32_t i=0; i<10; ++i) {
+ std::ostringstream ost;
+ ost << "testvis" << i;
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("InvalidVisitor", ost.str(), ""));
+ cmd->addBucketToBeVisited(document::BucketId(16, 3));
+ cmd->setAddress(address);
+ cmd->setQueueTimeout(0);
+ _top->sendDown(cmd);
+ _top->waitForMessages(i+1, 60);
+ }
+
+ // Start a bunch of visitors
+ for (uint32_t i=0; i<10; ++i) {
+ std::ostringstream ost;
+ ost << "testvis" << (i + 10);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", ost.str(), ""));
+ cmd->addBucketToBeVisited(document::BucketId(16, 3));
+ cmd->setAddress(address);
+ cmd->setQueueTimeout(0);
+ _top->sendDown(cmd);
+ }
+
+
+ // Should get 14 immediate replies - 10 failures and 4 busy
+ {
+ _top->waitForMessages(14, 60);
+ const msg_ptr_vector replies = _top->getRepliesOnce();
+
+ int failures = 0;
+ int busy = 0;
+
+ for (uint32_t i=0; i< 14; ++i) {
+ std::shared_ptr<api::StorageMessage> msg(replies[i]);
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::VISITOR_CREATE_REPLY, msg->getType());
+ std::shared_ptr<api::CreateVisitorReply> reply(
+ std::dynamic_pointer_cast<api::CreateVisitorReply>(msg));
+ CPPUNIT_ASSERT(reply.get());
+
+ if (i < 10) {
+ if (api::ReturnCode::ILLEGAL_PARAMETERS == reply->getResult().getResult()) {
+ failures++;
+ } else {
+ std::cerr << reply->getResult() << "\n";
+ }
+ } else {
+ if (api::ReturnCode::BUSY == reply->getResult().getResult()) {
+ busy++;
+ }
+ }
+ }
+
+ CPPUNIT_ASSERT_EQUAL(10, failures);
+ CPPUNIT_ASSERT_EQUAL(4, busy);
+ }
+
+ // Finish a visitor
+ std::vector<document::Document::SP > docs;
+ std::vector<document::DocumentId> docIds;
+
+ getMessagesAndReply(1, getSession(0), docs, docIds);
+
+ // Should get a reply for the visitor.
+ verifyCreateVisitorReply(api::ReturnCode::OK);
+
+ // Fail a visitor
+ getMessagesAndReply(1, getSession(1), docs, docIds, api::ReturnCode::INTERNAL_FAILURE);
+
+ // Should get a reply for the visitor.
+ verifyCreateVisitorReply(api::ReturnCode::INTERNAL_FAILURE);
+
+ while (_manager->getActiveVisitorCount() > 2) {
+ FastOS_Thread::Sleep(10);
+ }
+
+ // Start a bunch of more visitors
+ for (uint32_t i=0; i<10; ++i) {
+ std::ostringstream ost;
+ ost << "testvis" << (i + 24);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", ost.str(), ""));
+ cmd->addBucketToBeVisited(document::BucketId(16, 3));
+ cmd->setAddress(address);
+ cmd->setQueueTimeout(0);
+ _top->sendDown(cmd);
+ }
+
+ // Should now get 8 busy.
+ _top->waitForMessages(8, 60);
+ const msg_ptr_vector replies = _top->getRepliesOnce();
+ CPPUNIT_ASSERT_EQUAL(8, (int)replies.size());
+
+ for (uint32_t i=0; i< replies.size(); ++i) {
+ std::shared_ptr<api::StorageMessage> msg(replies[i]);
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::VISITOR_CREATE_REPLY, msg->getType());
+ std::shared_ptr<api::CreateVisitorReply> reply(
+ std::dynamic_pointer_cast<api::CreateVisitorReply>(msg));
+ CPPUNIT_ASSERT(reply.get());
+
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::BUSY, reply->getResult().getResult());
+ }
+}
+
+void
+VisitorManagerTest::testAbortOnFailedVisitorInfo()
+{
+ initializeTest();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+
+ {
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", "testvis", ""));
+ cmd->addBucketToBeVisited(document::BucketId(16, 3));
+ cmd->setAddress(address);
+ cmd->setQueueTimeout(0);
+ _top->sendDown(cmd);
+ }
+
+ uint32_t visitorRepliesReceived = 0;
+ uint32_t oki = 0;
+ uint32_t failed = 0;
+
+ std::vector<document::Document::SP > docs;
+ std::vector<document::DocumentId> docIds;
+
+ TestVisitorMessageSession& session = getSession(0);
+ getMessagesAndReply(1, session, docs, docIds, api::ReturnCode::NOT_READY);
+
+ {
+ session.waitForMessages(2);
+
+ documentapi::DocumentMessage* cmd = session.sentMessages[1].get();
+
+ mbus::Reply::UP reply = cmd->createReply();
+
+ CPPUNIT_ASSERT_EQUAL((uint32_t)documentapi::DocumentProtocol::MESSAGE_VISITORINFO, session.sentMessages[1]->getType());
+ reply->swapState(*session.sentMessages[1]);
+ reply->setMessage(mbus::Message::UP(session.sentMessages[1].release()));
+ reply->addError(mbus::Error(api::ReturnCode::NOT_CONNECTED, "Me no ready"));
+ session.reply(std::move(reply));
+ }
+
+ _top->waitForMessages(1, 60);
+ const msg_ptr_vector replies = _top->getRepliesOnce();
+ for (uint32_t i=0; i< replies.size(); ++i) {
+ std::shared_ptr<api::StorageMessage> msg(replies[i]);
+ if (msg->getType() == api::MessageType::VISITOR_CREATE_REPLY)
+ {
+ ++visitorRepliesReceived;
+ std::shared_ptr<api::CreateVisitorReply> reply(
+ std::dynamic_pointer_cast<api::CreateVisitorReply>(msg));
+ CPPUNIT_ASSERT(reply.get());
+ if (reply->getResult().success()) {
+ ++oki;
+ std::cerr << "\n" << reply->toString(true) << "\n";
+ } else {
+ ++failed;
+ }
+ }
+ }
+
+ std::ostringstream errmsg;
+ errmsg << "oki " << oki << ", failed " << failed;
+
+ CPPUNIT_ASSERT_EQUAL_MSG(errmsg.str(), 0u, oki);
+ CPPUNIT_ASSERT_EQUAL_MSG(errmsg.str(), 1u, failed);
+}
+
+void
+VisitorManagerTest::testAbortOnFieldPathError()
+{
+ initializeTest();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+
+ // Use bogus field path to force error to happen
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor",
+ "testvis",
+ "testdoctype1.headerval{bogus} == 1234"));
+ cmd->addBucketToBeVisited(document::BucketId(16, 3));
+ cmd->setAddress(address);
+ cmd->setQueueTimeout(0);
+ _top->sendDown(cmd);
+
+ verifyCreateVisitorReply(api::ReturnCode::ILLEGAL_PARAMETERS);
+}
+
+void
+VisitorManagerTest::testVisitorQueueTimeout()
+{
+ initializeTest();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ _manager->enforceQueueUsage();
+
+ {
+ vespalib::MonitorGuard guard(_manager->getThread(0).getQueueMonitor());
+
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", "testvis", ""));
+ cmd->addBucketToBeVisited(document::BucketId(16, 3));
+ cmd->setAddress(address);
+ cmd->setQueueTimeout(1);
+ cmd->setTimeout(100 * 1000 * 1000);
+ _top->sendDown(cmd);
+
+ _node->getClock().addSecondsToTime(1000);
+ }
+
+ // Don't answer any messages. Make sure we timeout anyways.
+ uint32_t visitorRepliesReceived = 0;
+
+ _top->waitForMessages(1, 60);
+ const msg_ptr_vector replies = _top->getRepliesOnce();
+ std::shared_ptr<api::StorageMessage> msg(replies[0]);
+
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::VISITOR_CREATE_REPLY, msg->getType());
+ ++visitorRepliesReceived;
+ std::shared_ptr<api::CreateVisitorReply> reply(
+ std::dynamic_pointer_cast<api::CreateVisitorReply>(msg));
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode(api::ReturnCode::BUSY,
+ "Visitor timed out in visitor queue"),
+ reply->getResult());
+}
+
+void
+VisitorManagerTest::testVisitorProcessingTimeout()
+{
+ initializeTest();
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", "testvis", ""));
+ cmd->addBucketToBeVisited(document::BucketId(16, 3));
+ cmd->setAddress(address);
+ cmd->setQueueTimeout(0);
+ cmd->setTimeout(100);
+ _top->sendDown(cmd);
+
+ // Wait for Put before increasing the clock
+ TestVisitorMessageSession& session = getSession(0);
+ session.waitForMessages(1);
+
+ _node->getClock().addSecondsToTime(1000);
+
+ // Don't answer any messages. Make sure we timeout anyways.
+ uint32_t visitorRepliesReceived = 0;
+
+ _top->waitForMessages(1, 60);
+ const msg_ptr_vector replies = _top->getRepliesOnce();
+ std::shared_ptr<api::StorageMessage> msg(replies[0]);
+
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::VISITOR_CREATE_REPLY, msg->getType());
+ ++visitorRepliesReceived;
+ std::shared_ptr<api::CreateVisitorReply> reply(
+ std::dynamic_pointer_cast<api::CreateVisitorReply>(msg));
+ CPPUNIT_ASSERT_EQUAL(api::ReturnCode::ABORTED,
+ reply->getResult().getResult());
+}
+
+namespace {
+ uint32_t nextVisitor = 0;
+
+ api::StorageMessage::Id
+ sendCreateVisitor(uint32_t timeout, DummyStorageLink& top, uint8_t priority = 127) {
+ std::ostringstream ost;
+ ost << "testvis" << ++nextVisitor;
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand("DumpVisitor", ost.str(), ""));
+ cmd->addBucketToBeVisited(document::BucketId(16, 3));
+ cmd->setAddress(address);
+ cmd->setQueueTimeout(timeout);
+ cmd->setPriority(priority);
+ top.sendDown(cmd);
+ return cmd->getMsgId();
+ }
+}
+
+void
+VisitorManagerTest::testPrioritizedVisitorQueing()
+{
+ framework::HttpUrlPath path("?verbose=true&allvisitors=true");
+ initializeTest();
+
+ _manager->setMaxConcurrentVisitors(4);
+ _manager->setMaxVisitorQueueSize(4);
+
+ api::StorageMessage::Id ids[10] = { 0 };
+
+ // First 4 should just start..
+ for (uint32_t i = 0; i < 4; ++i) {
+ ids[i] = sendCreateVisitor(i, *_top, i);
+ }
+
+ // Next ones should be queued - (Better not finish before we get here)
+ // Submit with higher priorities
+ for (uint32_t i = 0; i < 4; ++i) {
+ ids[i + 4] = sendCreateVisitor(1000, *_top, 100 - i);
+ }
+
+ // Queue is now full with a pri 100 visitor at its end
+ // Send a lower pri visitor that will be busy-returned immediately
+ ids[8] = sendCreateVisitor(1000, *_top, 130);
+
+ CPPUNIT_ASSERT_EQUAL(ids[8], verifyCreateVisitorReply(api::ReturnCode::BUSY));
+
+ // Send a higher pri visitor that will take the place of pri 100 visitor
+ ids[9] = sendCreateVisitor(1000, *_top, 60);
+
+ CPPUNIT_ASSERT_EQUAL(ids[4], verifyCreateVisitorReply(api::ReturnCode::BUSY));
+
+ // Finish the first visitor
+ std::vector<document::Document::SP > docs;
+ std::vector<document::DocumentId> docIds;
+ getMessagesAndReply(1, getSession(0), docs, docIds, api::ReturnCode::OK,
+ documentapi::Priority::PRI_HIGHEST);
+ verifyCreateVisitorReply(api::ReturnCode::OK);
+
+ // We should now start the highest priority visitor.
+ getMessagesAndReply(1, getSession(4), docs, docIds, api::ReturnCode::OK,
+ documentapi::Priority::PRI_VERY_HIGH);
+ CPPUNIT_ASSERT_EQUAL(ids[9], verifyCreateVisitorReply(api::ReturnCode::OK));
+}
+
+void
+VisitorManagerTest::testPrioritizedMaxConcurrentVisitors() {
+ framework::HttpUrlPath path("?verbose=true&allvisitors=true");
+ initializeTest();
+
+ api::StorageMessage::Id ids[17] = { 0 };
+
+ // Number of concurrent visitors is in [4, 8], depending on priority
+ // Max concurrent:
+ // [0, 1): 4
+ // [1, 64): 3
+ // [64, 128): 2
+ // [128, 192): 1
+ // [192, 256): 0
+ _manager->setMaxConcurrentVisitors(4, 4);
+ _manager->setMaxVisitorQueueSize(6);
+
+ // First 4 should just start..
+ for (uint32_t i = 0; i < 4; ++i) {
+ ids[i] = sendCreateVisitor(i, *_top, i);
+ }
+
+ // Low pri messages; get put into queue
+ for (uint32_t i = 0; i < 6; ++i) {
+ ids[i + 4] = sendCreateVisitor(1000, *_top, 203 - i);
+ }
+
+ // Higher pri message: fits happily into 1 extra concurrent slot
+ ids[10] = sendCreateVisitor(1000, *_top, 190);
+
+ // Should punch pri203 msg out of the queue -> busy
+ ids[11] = sendCreateVisitor(1000, *_top, 197);
+
+ CPPUNIT_ASSERT_EQUAL(ids[4], verifyCreateVisitorReply(api::ReturnCode::BUSY));
+
+ // No concurrency slots left for this message -> busy
+ ids[12] = sendCreateVisitor(1000, *_top, 204);
+
+ CPPUNIT_ASSERT_EQUAL(ids[12], verifyCreateVisitorReply(api::ReturnCode::BUSY));
+
+ // Gets a concurrent slot
+ ids[13] = sendCreateVisitor(1000, *_top, 80);
+
+ // Kicks pri 202 out of the queue -> busy
+ ids[14] = sendCreateVisitor(1000, *_top, 79);
+
+ CPPUNIT_ASSERT_EQUAL(ids[5], verifyCreateVisitorReply(api::ReturnCode::BUSY));
+
+ // Gets a concurrent slot
+ ids[15] = sendCreateVisitor(1000, *_top, 63);
+
+ // Very Important Visitor(tm) gets a concurrent slot
+ ids[16] = sendCreateVisitor(1000, *_top, 0);
+
+ std::vector<document::Document::SP > docs;
+ std::vector<document::DocumentId> docIds;
+
+ std::set<uint64_t> finishedVisitors;
+
+ // Verify that the correct visitors are running.
+ for (int i = 0; i < 8; i++) {
+ documentapi::Priority::Value priority =
+ documentapi::Priority::PRI_HIGHEST; // ids 0-3,16
+ if (i == 4) {
+ priority = documentapi::Priority::PRI_VERY_LOW; // ids 10
+ } else if (i == 5) {
+ priority = documentapi::Priority::PRI_HIGH_2; // ids 13
+ } else if (i == 6) {
+ priority = documentapi::Priority::PRI_HIGH_1; // ids 15
+ }
+ getMessagesAndReply(1, getSession(i), docs, docIds, api::ReturnCode::OK,
+ priority);
+ finishedVisitors.insert(verifyCreateVisitorReply(api::ReturnCode::OK));
+ }
+
+ for (int i = 0; i < 4; i++) {
+ CPPUNIT_ASSERT(finishedVisitors.find(ids[i]) != finishedVisitors.end());
+ }
+
+ CPPUNIT_ASSERT(finishedVisitors.find(ids[10]) != finishedVisitors.end());
+ CPPUNIT_ASSERT(finishedVisitors.find(ids[13]) != finishedVisitors.end());
+ CPPUNIT_ASSERT(finishedVisitors.find(ids[15]) != finishedVisitors.end());
+ CPPUNIT_ASSERT(finishedVisitors.find(ids[16]) != finishedVisitors.end());
+
+ finishedVisitors.clear();
+
+ for (int i = 8; i < 14; i++) {
+ documentapi::Priority::Value priority =
+ documentapi::Priority::PRI_LOWEST; // ids 6-9,11
+ if (i == 8) {
+ priority = documentapi::Priority::PRI_HIGH_2; // ids 14
+ }
+ getMessagesAndReply(1, getSession(i), docs, docIds, api::ReturnCode::OK,
+ priority);
+ uint64_t msgId = verifyCreateVisitorReply(api::ReturnCode::OK);
+ finishedVisitors.insert(msgId);
+ }
+
+ for (int i = 6; i < 10; i++) {
+ CPPUNIT_ASSERT(finishedVisitors.find(ids[i]) != finishedVisitors.end());
+ }
+
+ CPPUNIT_ASSERT(finishedVisitors.find(ids[11]) != finishedVisitors.end());
+ CPPUNIT_ASSERT(finishedVisitors.find(ids[14]) != finishedVisitors.end());
+}
+
+void
+VisitorManagerTest::testVisitorQueingZeroQueueSize() {
+ framework::HttpUrlPath path("?verbose=true&allvisitors=true");
+ initializeTest();
+
+ _manager->setMaxConcurrentVisitors(4);
+ _manager->setMaxVisitorQueueSize(0);
+
+ // First 4 should just start..
+ for (uint32_t i = 0; i < 4; ++i) {
+ sendCreateVisitor(i, *_top, i);
+ }
+ // Queue size is zero, all visitors will be busy-returned
+ for (uint32_t i = 0; i < 5; ++i) {
+ sendCreateVisitor(1000, *_top, 100 - i);
+ verifyCreateVisitorReply(api::ReturnCode::BUSY);
+ }
+}
+
+void
+VisitorManagerTest::testStatusPage() {
+ framework::HttpUrlPath path("?verbose=true&allvisitors=true");
+ initializeTest();
+
+ _manager->setMaxConcurrentVisitors(1, 1);
+ _manager->setMaxVisitorQueueSize(6);
+ // 1 running, 1 queued
+ sendCreateVisitor(1000000, *_top, 1);
+ sendCreateVisitor(1000000, *_top, 128);
+
+ TestVisitorMessageSession& session = getSession(0);
+ session.waitForMessages(1);
+
+ std::ostringstream ss;
+ static_cast<framework::HtmlStatusReporter&>(*_manager).reportHtmlStatus(ss, path);
+
+ std::string str(ss.str());
+ CPPUNIT_ASSERT(str.find("Currently running visitors") != std::string::npos);
+ // Should be propagated to visitor thread
+ CPPUNIT_ASSERT(str.find("Running 1 visitors") != std::string::npos); // 1 active
+ CPPUNIT_ASSERT(str.find("waiting visitors 1") != std::string::npos); // 1 queued
+ CPPUNIT_ASSERT(str.find("Visitor thread 0") != std::string::npos);
+ CPPUNIT_ASSERT(str.find("Disconnected visitor timeout") != std::string::npos); // verbose per thread
+ CPPUNIT_ASSERT(str.find("Message #1 <b>putdocumentmessage</b>") != std::string::npos); // 1 active
+}
+
+}
diff --git a/storage/src/tests/visiting/visitortest.cpp b/storage/src/tests/visiting/visitortest.cpp
new file mode 100644
index 00000000000..aed08a676b8
--- /dev/null
+++ b/storage/src/tests/visiting/visitortest.cpp
@@ -0,0 +1,1023 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/document/datatype/datatype.h>
+#include <vespa/document/fieldvalue/intfieldvalue.h>
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/document/fieldvalue/rawfieldvalue.h>
+#include <vespa/log/log.h>
+#include <vespa/storageapi/message/datagram.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+#include <vespa/storage/visiting/visitormanager.h>
+#include <tests/common/testhelper.h>
+#include <tests/common/teststorageapp.h>
+#include <tests/common/dummystoragelink.h>
+#include <tests/storageserver/testvisitormessagesession.h>
+#include <vespa/vdstestlib/cppunit/macros.h>
+#include <vespa/vdslib/container/visitorordering.h>
+#include <vespa/documentapi/messagebus/messages/multioperationmessage.h>
+#include <vespa/documentapi/messagebus/messages/putdocumentmessage.h>
+#include <vespa/documentapi/messagebus/messages/removedocumentmessage.h>
+#include <vector>
+#include <thread>
+#include <chrono>
+
+LOG_SETUP(".visitortest");
+
+using namespace std::chrono_literals;
+
+namespace storage {
+
+namespace {
+
+using msg_ptr_vector = std::vector<api::StorageMessage::SP>;
+
+struct TestParams
+{
+ TestParams& iteratorsPerBucket(uint32_t n) {
+ _iteratorsPerBucket = n;
+ return *this;
+ }
+ TestParams& maxVisitorMemoryUsage(uint32_t bytes) {
+ _maxVisitorMemoryUsage = bytes;
+ return *this;
+ }
+ TestParams& parallelBuckets(uint32_t n) {
+ _parallelBuckets = n;
+ return *this;
+ }
+ TestParams& autoReplyError(const mbus::Error& error) {
+ _autoReplyError = error;
+ return *this;
+ }
+
+ uint32_t _iteratorsPerBucket {1};
+ uint32_t _maxVisitorMemoryUsage {UINT32_MAX};
+ uint32_t _parallelBuckets {1};
+ mbus::Error _autoReplyError;
+};
+
+}
+
+class VisitorTest : public CppUnit::TestFixture
+{
+private:
+ CPPUNIT_TEST_SUITE(VisitorTest);
+ CPPUNIT_TEST(testNormalUsage);
+ CPPUNIT_TEST(testFailedCreateIterator);
+ CPPUNIT_TEST(testFailedGetIter);
+ CPPUNIT_TEST(testMultipleFailedGetIter);
+ CPPUNIT_TEST(testDocumentAPIClientError);
+ CPPUNIT_TEST(testNoDocumentAPIResendingForFailedVisitor);
+ CPPUNIT_TEST(testIteratorCreatedForFailedVisitor);
+ CPPUNIT_TEST(testFailedDocumentAPISend);
+ CPPUNIT_TEST(testNoVisitorNotificationForTransientFailures);
+ CPPUNIT_TEST(testNotificationSentIfTransientErrorRetriedManyTimes);
+ CPPUNIT_TEST(testNoMbusTracingIfTraceLevelIsZero);
+ CPPUNIT_TEST(testReplyContainsTraceIfTraceLevelAboveZero);
+ CPPUNIT_TEST(testNoMoreIteratorsSentWhileMemoryUsedAboveLimit);
+ CPPUNIT_TEST(testDumpVisitorInvokesStrongReadConsistencyIteration);
+ CPPUNIT_TEST(testTestVisitorInvokesWeakReadConsistencyIteration);
+ CPPUNIT_TEST_SUITE_END();
+
+ static uint32_t docCount;
+ std::vector<document::Document::SP > _documents;
+ std::unique_ptr<TestVisitorMessageSessionFactory> _messageSessionFactory;
+ std::unique_ptr<TestServiceLayerApp> _node;
+ std::unique_ptr<DummyStorageLink> _top;
+ DummyStorageLink* _bottom;
+ VisitorManager* _manager;
+
+public:
+ VisitorTest() : _node() {}
+
+ void testNormalUsage();
+ void testFailedCreateIterator();
+ void testFailedGetIter();
+ void testMultipleFailedGetIter();
+ void testDocumentAPIClientError();
+ void testNoDocumentAPIResendingForFailedVisitor();
+ void testIteratorCreatedForFailedVisitor();
+ void testFailedDocumentAPISend();
+ void testNoVisitorNotificationForTransientFailures();
+ void testNotificationSentIfTransientErrorRetriedManyTimes();
+ void testNoMbusTracingIfTraceLevelIsZero();
+ void testReplyContainsTraceIfTraceLevelAboveZero();
+ void testNoMoreIteratorsSentWhileMemoryUsedAboveLimit();
+ void testDumpVisitorInvokesStrongReadConsistencyIteration();
+ void testTestVisitorInvokesWeakReadConsistencyIteration();
+ // TODO:
+ void testVisitMultipleBuckets() {}
+
+ // Not using setUp since can't throw exception out of it.
+ void initializeTest(const TestParams& params = TestParams());
+
+ struct VisitorOptions {
+ std::string visitorType{"dumpvisitor"};
+
+ VisitorOptions() {}
+
+ VisitorOptions& withVisitorType(vespalib::stringref type) {
+ visitorType = type;
+ return *this;
+ }
+ };
+
+ std::shared_ptr<api::CreateVisitorCommand> makeCreateVisitor(
+ const VisitorOptions& options = VisitorOptions());
+ void tearDown();
+ bool waitUntilNoActiveVisitors();
+ TestVisitorMessageSession& getSession(uint32_t n);
+ uint64_t verifyCreateVisitorReply(
+ api::ReturnCode::Result expectedResult,
+ int checkStatsDocsVisited = -1,
+ int checkStatsBytesVisited = -1);
+ void getMessagesAndReply(
+ int expectedCount,
+ TestVisitorMessageSession& session,
+ std::vector<document::Document::SP >& docs,
+ std::vector<document::DocumentId>& docIds,
+ std::vector<std::string>& infoMessages,
+ api::ReturnCode::Result returnCode = api::ReturnCode::OK);
+ uint32_t getMatchingDocuments(std::vector<document::Document::SP >& docs);
+
+private:
+ void doTestVisitorInstanceHasConsistencyLevel(
+ vespalib::stringref visitorType,
+ spi::ReadConsistency expectedConsistency);
+
+ template <typename T>
+ std::vector<std::shared_ptr<T> >
+ fetchMultipleCommands(DummyStorageLink& link, size_t count);
+
+ template <typename T>
+ std::shared_ptr<T>
+ fetchSingleCommand(DummyStorageLink& link);
+
+ void sendGetIterReply(GetIterCommand& cmd,
+ const api::ReturnCode& result =
+ api::ReturnCode(api::ReturnCode::OK),
+ uint32_t maxDocuments = 0,
+ bool overrideCompleted = false);
+ void sendCreateIteratorReply(uint64_t iteratorId = 1234);
+ std::shared_ptr<api::CreateVisitorReply> doCompleteVisitingSession(
+ const std::shared_ptr<api::CreateVisitorCommand>& cmd);
+
+ void sendInitialCreateVisitorAndGetIterRound();
+
+ int64_t getFailedVisitorDestinationReplyCount() const {
+ // There's no metric manager attached to these tests, so even if the
+ // test should magically freeze here for 5+ minutes, nothing should
+ // come in and wipe our accumulated failure metrics.
+ // Only 1 visitor thread running, so we know it has the metrics.
+ const auto& metrics = _manager->getThread(0).getMetrics();
+ auto loadType = documentapi::LoadType::DEFAULT;
+ return metrics.visitorDestinationFailureReplies[loadType].getCount();
+ }
+};
+
+uint32_t VisitorTest::docCount = 10;
+
+CPPUNIT_TEST_SUITE_REGISTRATION(VisitorTest);
+
+void
+VisitorTest::initializeTest(const TestParams& params)
+{
+ LOG(debug, "Initializing test");
+ vdstestlib::DirConfig config(getStandardConfig(true));
+ config.getConfig("stor-visitor").set("visitorthreads", "1");
+ config.getConfig("stor-visitor").set(
+ "iterators_per_bucket",
+ std::to_string(params._iteratorsPerBucket));
+ config.getConfig("stor-visitor").set(
+ "defaultparalleliterators",
+ std::to_string(params._parallelBuckets));
+ config.getConfig("stor-visitor").set(
+ "visitor_memory_usage_limit",
+ std::to_string(params._maxVisitorMemoryUsage));
+
+ system("chmod 755 vdsroot 2>/dev/null");
+ system("rm -rf vdsroot* 2>/dev/null");
+ assert(system("mkdir -p vdsroot/disks/d0") == 0);
+ assert(system("mkdir -p vdsroot/disks/d1") == 0);
+
+ try {
+ _messageSessionFactory.reset(
+ new TestVisitorMessageSessionFactory(config.getConfigId()));
+ if (params._autoReplyError.getCode() != mbus::ErrorCode::NONE) {
+ _messageSessionFactory->_autoReplyError = params._autoReplyError;
+ _messageSessionFactory->_createAutoReplyVisitorSessions = true;
+ }
+ _node.reset(new TestServiceLayerApp(config.getConfigId()));
+ _top.reset(new DummyStorageLink());
+ _top->push_back(std::unique_ptr<StorageLink>(_manager
+ = new VisitorManager(
+ config.getConfigId(),
+ _node->getComponentRegister(), *_messageSessionFactory)));
+ _bottom = new DummyStorageLink();
+ _top->push_back(std::unique_ptr<StorageLink>(_bottom));
+ _manager->setTimeBetweenTicks(10);
+ _top->open();
+ } catch (config::InvalidConfigException& e) {
+ fprintf(stderr, "%s\n", e.what());
+ }
+ std::string content(
+ "To be, or not to be: that is the question:\n"
+ "Whether 'tis nobler in the mind to suffer\n"
+ "The slings and arrows of outrageous fortune,\n"
+ "Or to take arms against a sea of troubles,\n"
+ "And by opposing end them? To die: to sleep;\n"
+ "No more; and by a sleep to say we end\n"
+ "The heart-ache and the thousand natural shocks\n"
+ "That flesh is heir to, 'tis a consummation\n"
+ "Devoutly to be wish'd. To die, to sleep;\n"
+ "To sleep: perchance to dream: ay, there's the rub;\n"
+ "For in that sleep of death what dreams may come\n"
+ "When we have shuffled off this mortal coil,\n"
+ "Must give us pause: there's the respect\n"
+ "That makes calamity of so long life;\n"
+ "For who would bear the whips and scorns of time,\n"
+ "The oppressor's wrong, the proud man's contumely,\n"
+ "The pangs of despised love, the law's delay,\n"
+ "The insolence of office and the spurns\n"
+ "That patient merit of the unworthy takes,\n"
+ "When he himself might his quietus make\n"
+ "With a bare bodkin? who would fardels bear,\n"
+ "To grunt and sweat under a weary life,\n"
+ "But that the dread of something after death,\n"
+ "The undiscover'd country from whose bourn\n"
+ "No traveller returns, puzzles the will\n"
+ "And makes us rather bear those ills we have\n"
+ "Than fly to others that we know not of?\n"
+ "Thus conscience does make cowards of us all;\n"
+ "And thus the native hue of resolution\n"
+ "Is sicklied o'er with the pale cast of thought,\n"
+ "And enterprises of great pith and moment\n"
+ "With this regard their currents turn awry,\n"
+ "And lose the name of action. - Soft you now!\n"
+ "The fair Ophelia! Nymph, in thy orisons\n"
+ "Be all my sins remember'd.\n");
+ _documents.clear();
+ for (uint32_t i=0; i<docCount; ++i) {
+ std::ostringstream uri;
+ uri << "userdoc:test:" << i % 10 << ":http://www.ntnu.no/"
+ << i << ".html";
+
+ _documents.push_back(document::Document::SP(
+ _node->getTestDocMan().createDocument(content, uri.str())));
+ const document::DocumentType& type(_documents.back()->getType());
+ _documents.back()->setValue(type.getField("headerval"),
+ document::IntFieldValue(i % 4));
+ }
+ LOG(debug, "Done initializing test");
+}
+
+void
+VisitorTest::tearDown()
+{
+ if (_top.get() != 0) {
+ _top->close();
+ _top->flush();
+ _top.reset(0);
+ }
+ _node.reset(0);
+ _messageSessionFactory.reset(0);
+ _manager = 0;
+}
+
+bool
+VisitorTest::waitUntilNoActiveVisitors()
+{
+ int i = 0;
+ for (; i < 1000; ++i) {
+ if (_manager->getActiveVisitorCount() == 0) {
+ return true;
+ }
+ std::this_thread::sleep_for(10ms);
+ }
+ return false;
+}
+
+TestVisitorMessageSession&
+VisitorTest::getSession(uint32_t n)
+{
+ // Wait until we have started the visitor
+ const std::vector<TestVisitorMessageSession*>& sessions(
+ _messageSessionFactory->_visitorSessions);
+ framework::defaultimplementation::RealClock clock;
+ framework::MilliSecTime endTime(
+ clock.getTimeInMillis() + framework::MilliSecTime(30 * 1000));
+ while (true) {
+ {
+ vespalib::LockGuard lock(_messageSessionFactory->_accessLock);
+ if (sessions.size() > n) {
+ return *sessions[n];
+ }
+ }
+ if (clock.getTimeInMillis() > endTime) {
+ throw vespalib::IllegalStateException(
+ "Timed out waiting for visitor session", VESPA_STRLOC);
+ }
+ std::this_thread::sleep_for(10ms);
+ }
+ throw std::logic_error("unreachable");
+}
+
+void
+VisitorTest::getMessagesAndReply(
+ int expectedCount,
+ TestVisitorMessageSession& session,
+ std::vector<document::Document::SP >& docs,
+ std::vector<document::DocumentId>& docIds,
+ std::vector<std::string>& infoMessages,
+ api::ReturnCode::Result result)
+{
+ for (int i = 0; i < expectedCount; i++) {
+ session.waitForMessages(1);
+ mbus::Reply::UP reply;
+ {
+ vespalib::MonitorGuard guard(session.getMonitor());
+ CPPUNIT_ASSERT(!session.sentMessages.empty());
+ vespalib::LinkedPtr<documentapi::DocumentMessage> msg(
+ session.sentMessages.front());
+ CPPUNIT_ASSERT(msg->getPriority() < 16);
+
+ switch (msg->getType()) {
+ case documentapi::DocumentProtocol::MESSAGE_PUTDOCUMENT:
+ docs.push_back(
+ static_cast<documentapi::PutDocumentMessage&>(*msg)
+ .getDocument());
+ break;
+ case documentapi::DocumentProtocol::MESSAGE_REMOVEDOCUMENT:
+ docIds.push_back(
+ static_cast<documentapi::RemoveDocumentMessage&>(*msg)
+ .getDocumentId());
+ break;
+ case documentapi::DocumentProtocol::MESSAGE_VISITORINFO:
+ infoMessages.push_back(
+ static_cast<documentapi::VisitorInfoMessage&>(*msg)
+ .getErrorMessage());
+ break;
+ default:
+ break;
+ }
+
+ reply = msg->createReply();
+ reply->swapState(*msg);
+
+ session.sentMessages.pop_front(); // Release linked ptr ref.
+ reply->setMessage(mbus::Message::UP(msg.release()));
+
+ if (result != api::ReturnCode::OK) {
+ reply->addError(mbus::Error(result, "Generic error"));
+ }
+ }
+ session.reply(std::move(reply));
+ }
+}
+
+uint64_t
+VisitorTest::verifyCreateVisitorReply(
+ api::ReturnCode::Result expectedResult,
+ int checkStatsDocsVisited,
+ int checkStatsBytesVisited)
+{
+ _top->waitForMessages(1, 60);
+ const msg_ptr_vector replies = _top->getRepliesOnce();
+ CPPUNIT_ASSERT_EQUAL(1, (int)replies.size());
+
+ std::shared_ptr<api::StorageMessage> msg(replies[0]);
+
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::VISITOR_CREATE_REPLY, msg->getType());
+
+ std::shared_ptr<api::CreateVisitorReply> reply(
+ std::dynamic_pointer_cast<api::CreateVisitorReply>(msg));
+ CPPUNIT_ASSERT(reply.get());
+ CPPUNIT_ASSERT_EQUAL(expectedResult, reply->getResult().getResult());
+
+ if (checkStatsDocsVisited >= 0) {
+ CPPUNIT_ASSERT_EQUAL(checkStatsDocsVisited,
+ int(reply->getVisitorStatistics().getDocumentsVisited()));
+ }
+ if (checkStatsBytesVisited >= 0) {
+ CPPUNIT_ASSERT_EQUAL(checkStatsBytesVisited,
+ int(reply->getVisitorStatistics().getBytesVisited()));
+ }
+
+ return reply->getMsgId();
+}
+
+uint32_t
+VisitorTest::getMatchingDocuments(std::vector<document::Document::SP >& docs) {
+ uint32_t equalCount = 0;
+ for (uint32_t i=0; i<docs.size(); ++i) {
+ for (uint32_t j=0; j<_documents.size(); ++j) {
+ if (*docs[i] == *_documents[j] &&
+ docs[i]->getId() == _documents[j]->getId())
+ {
+ equalCount++;
+ }
+ }
+ }
+
+ return equalCount;
+}
+
+void
+VisitorTest::sendGetIterReply(GetIterCommand& cmd,
+ const api::ReturnCode& result,
+ uint32_t maxDocuments,
+ bool overrideCompleted)
+{
+ GetIterReply::SP reply(new GetIterReply(cmd));
+ if (result.failed()) {
+ reply->setResult(result);
+ _bottom->sendUp(reply);
+ return;
+ }
+ assert(maxDocuments < _documents.size());
+ size_t documentCount = maxDocuments != 0 ? maxDocuments : _documents.size();
+ for (size_t i = 0; i < documentCount; ++i) {
+ reply->getEntries().push_back(
+ spi::DocEntry::LP(
+ new spi::DocEntry(
+ spi::Timestamp(1000 + i),
+ spi::NONE,
+ document::Document::UP(_documents[i]->clone()))));
+ }
+ if (documentCount == _documents.size() || overrideCompleted) {
+ reply->setCompleted();
+ }
+ _bottom->sendUp(reply);
+}
+
+template <typename T>
+std::vector<std::shared_ptr<T> >
+VisitorTest::fetchMultipleCommands(DummyStorageLink& link, size_t count)
+{
+ link.waitForMessages(count, 60);
+ std::vector<api::StorageMessage::SP> msgs(link.getCommandsOnce());
+ std::vector<std::shared_ptr<T> > fetched;
+ if (msgs.size() != count) {
+ std::ostringstream oss;
+ oss << "Expected "
+ << count
+ << " messages, got "
+ << msgs.size()
+ << ":\n";
+ for (size_t i = 0; i < msgs.size(); ++i) {
+ oss << i << ": " << *msgs[i] << "\n";
+ }
+ CPPUNIT_FAIL(oss.str());
+ }
+ for (size_t i = 0; i < count; ++i) {
+ std::shared_ptr<T> ret(std::dynamic_pointer_cast<T>(msgs[i]));
+ if (!ret) {
+ std::ostringstream oss;
+ oss << "Expected message of type "
+ << typeid(T).name()
+ << ", but got "
+ << msgs[0]->toString();
+ CPPUNIT_FAIL(oss.str());
+ }
+ fetched.push_back(ret);
+ }
+ return fetched;
+}
+
+template <typename T>
+std::shared_ptr<T>
+VisitorTest::fetchSingleCommand(DummyStorageLink& link)
+{
+ std::vector<std::shared_ptr<T> > ret(
+ fetchMultipleCommands<T>(link, 1));
+ return ret[0];
+}
+
+std::shared_ptr<api::CreateVisitorCommand>
+VisitorTest::makeCreateVisitor(const VisitorOptions& options)
+{
+ api::StorageMessageAddress address("storage", lib::NodeType::STORAGE, 0);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ new api::CreateVisitorCommand(options.visitorType, "testvis", ""));
+ cmd->addBucketToBeVisited(document::BucketId(16, 3));
+ cmd->setAddress(address);
+ cmd->setMaximumPendingReplyCount(UINT32_MAX);
+ cmd->setControlDestination("foo/bar");
+ return cmd;
+}
+
+void
+VisitorTest::sendCreateIteratorReply(uint64_t iteratorId)
+{
+ CreateIteratorCommand::SP createCmd(
+ fetchSingleCommand<CreateIteratorCommand>(*_bottom));
+ spi::IteratorId id(iteratorId);
+ api::StorageReply::SP reply(
+ new CreateIteratorReply(*createCmd, id));
+ _bottom->sendUp(reply);
+}
+
+void
+VisitorTest::testNormalUsage()
+{
+ initializeTest();
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ makeCreateVisitor());
+ _top->sendDown(cmd);
+
+ CreateIteratorCommand::SP createCmd(
+ fetchSingleCommand<CreateIteratorCommand>(*_bottom));
+ CPPUNIT_ASSERT_EQUAL(uint8_t(0), createCmd->getPriority()); // Highest pri
+ spi::IteratorId id(1234);
+ api::StorageReply::SP reply(
+ new CreateIteratorReply(*createCmd, id));
+ _bottom->sendUp(reply);
+
+ GetIterCommand::SP getIterCmd(
+ fetchSingleCommand<GetIterCommand>(*_bottom));
+ CPPUNIT_ASSERT_EQUAL(spi::IteratorId(1234),
+ getIterCmd->getIteratorId());
+
+ sendGetIterReply(*getIterCmd);
+
+ std::vector<document::Document::SP> docs;
+ std::vector<document::DocumentId> docIds;
+ std::vector<std::string> infoMessages;
+ getMessagesAndReply(_documents.size(), getSession(0), docs, docIds, infoMessages);
+ CPPUNIT_ASSERT_EQUAL(size_t(0), infoMessages.size());
+ CPPUNIT_ASSERT_EQUAL(size_t(0), docIds.size());
+
+ DestroyIteratorCommand::SP destroyIterCmd(
+ fetchSingleCommand<DestroyIteratorCommand>(*_bottom));
+
+ verifyCreateVisitorReply(api::ReturnCode::OK);
+ CPPUNIT_ASSERT(waitUntilNoActiveVisitors());
+ CPPUNIT_ASSERT_EQUAL(0L, getFailedVisitorDestinationReplyCount());
+}
+
+void
+VisitorTest::testFailedCreateIterator()
+{
+ initializeTest();
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ makeCreateVisitor());
+ cmd->addBucketToBeVisited(document::BucketId(16, 4));
+ _top->sendDown(cmd);
+
+ CreateIteratorCommand::SP createCmd(
+ fetchSingleCommand<CreateIteratorCommand>(*_bottom));
+ spi::IteratorId id(0);
+ api::StorageReply::SP reply(
+ new CreateIteratorReply(*createCmd, id));
+ reply->setResult(api::ReturnCode(api::ReturnCode::INTERNAL_FAILURE));
+ _bottom->sendUp(reply);
+
+ verifyCreateVisitorReply(api::ReturnCode::INTERNAL_FAILURE, 0, 0);
+ CPPUNIT_ASSERT(waitUntilNoActiveVisitors());
+}
+
+void
+VisitorTest::testFailedGetIter()
+{
+ initializeTest();
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ makeCreateVisitor());
+ _top->sendDown(cmd);
+ sendCreateIteratorReply();
+
+ GetIterCommand::SP getIterCmd(
+ fetchSingleCommand<GetIterCommand>(*_bottom));
+ CPPUNIT_ASSERT_EQUAL(spi::IteratorId(1234),
+ getIterCmd->getIteratorId());
+
+ sendGetIterReply(*getIterCmd,
+ api::ReturnCode(api::ReturnCode::BUCKET_NOT_FOUND));
+
+ DestroyIteratorCommand::SP destroyIterCmd(
+ fetchSingleCommand<DestroyIteratorCommand>(*_bottom));
+
+ verifyCreateVisitorReply(api::ReturnCode::BUCKET_NOT_FOUND, 0, 0);
+ CPPUNIT_ASSERT(waitUntilNoActiveVisitors());
+}
+
+void
+VisitorTest::testMultipleFailedGetIter()
+{
+ initializeTest(TestParams().iteratorsPerBucket(2));
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ makeCreateVisitor());
+ _top->sendDown(cmd);
+ sendCreateIteratorReply();
+
+ std::vector<GetIterCommand::SP> getIterCmds(
+ fetchMultipleCommands<GetIterCommand>(*_bottom, 2));
+
+ sendGetIterReply(*getIterCmds[0],
+ api::ReturnCode(api::ReturnCode::BUCKET_NOT_FOUND));
+
+ // Wait for an "appropriate" amount of time so that wrongful logic
+ // will send a DestroyIteratorCommand before all pending GetIters
+ // have been replied to.
+ std::this_thread::sleep_for(100ms);
+
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _bottom->getNumCommands());
+
+ sendGetIterReply(*getIterCmds[1],
+ api::ReturnCode(api::ReturnCode::BUCKET_DELETED));
+
+ DestroyIteratorCommand::SP destroyIterCmd(
+ fetchSingleCommand<DestroyIteratorCommand>(*_bottom));
+
+ verifyCreateVisitorReply(api::ReturnCode::BUCKET_DELETED, 0, 0);
+ CPPUNIT_ASSERT(waitUntilNoActiveVisitors());
+}
+
+void
+VisitorTest::testDocumentAPIClientError()
+{
+ initializeTest();
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ makeCreateVisitor());
+ _top->sendDown(cmd);
+ sendCreateIteratorReply();
+
+ {
+ GetIterCommand::SP getIterCmd(
+ fetchSingleCommand<GetIterCommand>(*_bottom));
+ CPPUNIT_ASSERT_EQUAL(spi::IteratorId(1234),
+ getIterCmd->getIteratorId());
+
+ sendGetIterReply(*getIterCmd, api::ReturnCode(api::ReturnCode::OK), 1);
+ }
+
+ std::vector<document::Document::SP> docs;
+ std::vector<document::DocumentId> docIds;
+ std::vector<std::string> infoMessages;
+ getMessagesAndReply(1, getSession(0), docs, docIds, infoMessages,
+ api::ReturnCode::INTERNAL_FAILURE);
+ // INTERNAL_FAILURE is critical, so no visitor info sent
+ CPPUNIT_ASSERT_EQUAL(size_t(0), infoMessages.size());
+
+ std::this_thread::sleep_for(100ms);
+
+ {
+ GetIterCommand::SP getIterCmd(
+ fetchSingleCommand<GetIterCommand>(*_bottom));
+ CPPUNIT_ASSERT_EQUAL(spi::IteratorId(1234),
+ getIterCmd->getIteratorId());
+
+ sendGetIterReply(*getIterCmd);
+ }
+
+ DestroyIteratorCommand::SP destroyIterCmd(
+ fetchSingleCommand<DestroyIteratorCommand>(*_bottom));
+
+ verifyCreateVisitorReply(api::ReturnCode::INTERNAL_FAILURE);
+ CPPUNIT_ASSERT(waitUntilNoActiveVisitors());
+}
+
+void
+VisitorTest::testNoDocumentAPIResendingForFailedVisitor()
+{
+ initializeTest();
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ makeCreateVisitor());
+ _top->sendDown(cmd);
+ sendCreateIteratorReply();
+
+ {
+ GetIterCommand::SP getIterCmd(
+ fetchSingleCommand<GetIterCommand>(*_bottom));
+ CPPUNIT_ASSERT_EQUAL(spi::IteratorId(1234),
+ getIterCmd->getIteratorId());
+
+ sendGetIterReply(*getIterCmd, api::ReturnCode(api::ReturnCode::OK), 2, true);
+ }
+
+ std::vector<document::Document::SP> docs;
+ std::vector<document::DocumentId> docIds;
+ std::vector<std::string> infoMessages;
+ // Use non-critical result. Visitor info message should be received
+ // after we send a NOT_CONNECTED reply. Failing this message as well
+ // should cause the entire visitor to fail.
+ getMessagesAndReply(3, getSession(0), docs, docIds, infoMessages,
+ api::ReturnCode::NOT_CONNECTED);
+ CPPUNIT_ASSERT_EQUAL(size_t(1), infoMessages.size());
+ CPPUNIT_ASSERT_EQUAL(
+ std::string("[From content node 0] NOT_CONNECTED: Generic error"),
+ infoMessages[0]);
+
+ DestroyIteratorCommand::SP destroyIterCmd(
+ fetchSingleCommand<DestroyIteratorCommand>(*_bottom));
+
+ verifyCreateVisitorReply(api::ReturnCode::NOT_CONNECTED);
+ CPPUNIT_ASSERT(waitUntilNoActiveVisitors());
+ CPPUNIT_ASSERT_EQUAL(3L, getFailedVisitorDestinationReplyCount());
+}
+
+void
+VisitorTest::testIteratorCreatedForFailedVisitor()
+{
+ initializeTest(TestParams().iteratorsPerBucket(1).parallelBuckets(2));
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ makeCreateVisitor());
+ cmd->addBucketToBeVisited(document::BucketId(16, 4));
+ _top->sendDown(cmd);
+
+ std::vector<CreateIteratorCommand::SP> createCmds(
+ fetchMultipleCommands<CreateIteratorCommand>(*_bottom, 2));
+ {
+ spi::IteratorId id(0);
+ api::StorageReply::SP reply(
+ new CreateIteratorReply(*createCmds[0], id));
+ reply->setResult(api::ReturnCode(api::ReturnCode::INTERNAL_FAILURE));
+ _bottom->sendUp(reply);
+ }
+ {
+ spi::IteratorId id(1234);
+ api::StorageReply::SP reply(
+ new CreateIteratorReply(*createCmds[1], id));
+ _bottom->sendUp(reply);
+ }
+ // Want to immediately receive destroyiterator for newly created
+ // iterator, since we cannot use it anyway when the visitor has failed.
+ DestroyIteratorCommand::SP destroyCmd(
+ fetchSingleCommand<DestroyIteratorCommand>(*_bottom));
+
+ verifyCreateVisitorReply(api::ReturnCode::INTERNAL_FAILURE, 0, 0);
+ CPPUNIT_ASSERT(waitUntilNoActiveVisitors());
+}
+
+/**
+ * Test that if a visitor fails to send a document API message outright
+ * (i.e. a case where it will never get a reply), the session is failed
+ * and the visitor terminates cleanly without counting the failed message
+ * as pending.
+ */
+void
+VisitorTest::testFailedDocumentAPISend()
+{
+ initializeTest(TestParams().autoReplyError(
+ mbus::Error(mbus::ErrorCode::HANDSHAKE_FAILED,
+ "abandon ship!")));
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ makeCreateVisitor());
+ cmd->addBucketToBeVisited(document::BucketId(16, 4));
+ _top->sendDown(cmd);
+
+ sendCreateIteratorReply();
+ GetIterCommand::SP getIterCmd(
+ fetchSingleCommand<GetIterCommand>(*_bottom));
+ CPPUNIT_ASSERT_EQUAL(spi::IteratorId(1234),
+ getIterCmd->getIteratorId());
+ sendGetIterReply(*getIterCmd,
+ api::ReturnCode(api::ReturnCode::OK),
+ 2,
+ true);
+
+ DestroyIteratorCommand::SP destroyIterCmd(
+ fetchSingleCommand<DestroyIteratorCommand>(*_bottom));
+
+ verifyCreateVisitorReply(
+ static_cast<api::ReturnCode::Result>(
+ mbus::ErrorCode::HANDSHAKE_FAILED),
+ 0,
+ 0);
+ CPPUNIT_ASSERT(waitUntilNoActiveVisitors());
+ // We currently don't count failures to send in this metric; send failures
+ // indicate a message bus problem and already log a warning when they happen
+ CPPUNIT_ASSERT_EQUAL(0L, getFailedVisitorDestinationReplyCount());
+}
+
+void
+VisitorTest::sendInitialCreateVisitorAndGetIterRound()
+{
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ makeCreateVisitor());
+ _top->sendDown(cmd);
+ sendCreateIteratorReply();
+
+ {
+ GetIterCommand::SP getIterCmd(
+ fetchSingleCommand<GetIterCommand>(*_bottom));
+ sendGetIterReply(*getIterCmd, api::ReturnCode(api::ReturnCode::OK),
+ 1, true);
+ }
+}
+
+void
+VisitorTest::testNoVisitorNotificationForTransientFailures()
+{
+ initializeTest();
+ sendInitialCreateVisitorAndGetIterRound();
+
+ std::vector<document::Document::SP> docs;
+ std::vector<document::DocumentId> docIds;
+ std::vector<std::string> infoMessages;
+ // Have to make sure time increases in visitor thread so that resend
+ // times are reached.
+ _node->getClock().setFakeCycleMode();
+ // Should not get info message for BUCKET_DELETED, but resend of Put.
+ getMessagesAndReply(1, getSession(0), docs, docIds, infoMessages,
+ api::ReturnCode::BUCKET_DELETED);
+ CPPUNIT_ASSERT_EQUAL(size_t(0), infoMessages.size());
+ // Should not get info message for BUCKET_NOT_FOUND, but resend of Put.
+ getMessagesAndReply(1, getSession(0), docs, docIds, infoMessages,
+ api::ReturnCode::BUCKET_NOT_FOUND);
+ CPPUNIT_ASSERT_EQUAL(size_t(0), infoMessages.size());
+ // MessageBus error codes guaranteed to fit in return code result.
+ // Should not get info message for SESSION_BUSY, but resend of Put.
+ getMessagesAndReply(1, getSession(0), docs, docIds, infoMessages,
+ static_cast<api::ReturnCode::Result>(
+ mbus::ErrorCode::SESSION_BUSY));
+ CPPUNIT_ASSERT_EQUAL(size_t(0), infoMessages.size());
+ // WRONG_DISTRIBUTION should not be reported, as it will happen all the
+ // time when initiating remote migrations et al.
+ getMessagesAndReply(1, getSession(0), docs, docIds, infoMessages,
+ api::ReturnCode::WRONG_DISTRIBUTION);
+ CPPUNIT_ASSERT_EQUAL(size_t(0), infoMessages.size());
+
+ // Complete message successfully to finish the visitor.
+ getMessagesAndReply(1, getSession(0), docs, docIds, infoMessages,
+ api::ReturnCode::OK);
+ CPPUNIT_ASSERT_EQUAL(size_t(0), infoMessages.size());
+
+ fetchSingleCommand<DestroyIteratorCommand>(*_bottom);
+
+ verifyCreateVisitorReply(api::ReturnCode::OK);
+ CPPUNIT_ASSERT(waitUntilNoActiveVisitors());
+}
+
+void
+VisitorTest::testNotificationSentIfTransientErrorRetriedManyTimes()
+{
+ constexpr size_t retries(
+ Visitor::TRANSIENT_ERROR_RETRIES_BEFORE_NOTIFY);
+
+ initializeTest();
+ sendInitialCreateVisitorAndGetIterRound();
+
+ std::vector<document::Document::SP> docs;
+ std::vector<document::DocumentId> docIds;
+ std::vector<std::string> infoMessages;
+ // Have to make sure time increases in visitor thread so that resend
+ // times are reached.
+ _node->getClock().setFakeCycleMode();
+ for (size_t attempt = 0; attempt < retries; ++attempt) {
+ getMessagesAndReply(1, getSession(0), docs, docIds, infoMessages,
+ api::ReturnCode::WRONG_DISTRIBUTION);
+ CPPUNIT_ASSERT_EQUAL(size_t(0), infoMessages.size());
+ }
+ // Should now have a client notification along for the ride.
+ // This has to be ACKed as OK or the visitor will fail.
+ getMessagesAndReply(2, getSession(0), docs, docIds, infoMessages,
+ api::ReturnCode::OK);
+ CPPUNIT_ASSERT_EQUAL(size_t(1), infoMessages.size());
+ // TODO(vekterli) ideally we'd want to test that this happens only once
+ // per message, but this seems frustratingly complex to do currently.
+ fetchSingleCommand<DestroyIteratorCommand>(*_bottom);
+
+ verifyCreateVisitorReply(api::ReturnCode::OK);
+ CPPUNIT_ASSERT(waitUntilNoActiveVisitors());
+}
+
+std::shared_ptr<api::CreateVisitorReply>
+VisitorTest::doCompleteVisitingSession(
+ const std::shared_ptr<api::CreateVisitorCommand>& cmd)
+{
+ initializeTest();
+ _top->sendDown(cmd);
+ sendCreateIteratorReply();
+
+ GetIterCommand::SP getIterCmd(
+ fetchSingleCommand<GetIterCommand>(*_bottom));
+ sendGetIterReply(*getIterCmd,
+ api::ReturnCode(api::ReturnCode::OK),
+ 1,
+ true);
+
+ std::vector<document::Document::SP> docs;
+ std::vector<document::DocumentId> docIds;
+ std::vector<std::string> infoMessages;
+ getMessagesAndReply(1, getSession(0), docs, docIds, infoMessages);
+
+ DestroyIteratorCommand::SP destroyIterCmd(
+ fetchSingleCommand<DestroyIteratorCommand>(*_bottom));
+
+ _top->waitForMessages(1, 60);
+ const msg_ptr_vector replies = _top->getRepliesOnce();
+ CPPUNIT_ASSERT_EQUAL(size_t(1), replies.size());
+
+ std::shared_ptr<api::StorageMessage> msg(replies[0]);
+
+ CPPUNIT_ASSERT_EQUAL(api::MessageType::VISITOR_CREATE_REPLY,
+ msg->getType());
+ return std::dynamic_pointer_cast<api::CreateVisitorReply>(msg);
+}
+
+void
+VisitorTest::testNoMbusTracingIfTraceLevelIsZero()
+{
+ std::shared_ptr<api::CreateVisitorCommand> cmd(makeCreateVisitor());
+ cmd->getTrace().setLevel(0);
+ auto reply = doCompleteVisitingSession(cmd);
+ CPPUNIT_ASSERT(reply->getTrace().getRoot().isEmpty());
+}
+
+void
+VisitorTest::testReplyContainsTraceIfTraceLevelAboveZero()
+{
+ std::shared_ptr<api::CreateVisitorCommand> cmd(makeCreateVisitor());
+ cmd->getTrace().setLevel(1);
+ auto reply = doCompleteVisitingSession(cmd);
+ CPPUNIT_ASSERT(!reply->getTrace().getRoot().isEmpty());
+}
+
+void
+VisitorTest::testNoMoreIteratorsSentWhileMemoryUsedAboveLimit()
+{
+ initializeTest(TestParams().maxVisitorMemoryUsage(1)
+ .parallelBuckets(1)
+ .iteratorsPerBucket(1));
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ makeCreateVisitor());
+ _top->sendDown(cmd);
+ sendCreateIteratorReply();
+
+ GetIterCommand::SP getIterCmd(
+ fetchSingleCommand<GetIterCommand>(*_bottom));
+ sendGetIterReply(*getIterCmd,
+ api::ReturnCode(api::ReturnCode::OK),
+ 1);
+
+ // Pending Document API message towards client; memory usage should prevent
+ // visitor from sending down additional GetIter messages until the pending
+ // client message has been replied to and cleared from the internal state.
+ getSession(0).waitForMessages(1);
+ // Note that it's possible for this test to exhibit false negatives (but not
+ // false positives) since the _absence_ of a message means we don't have any
+ // kind of explicit barrier with which we can synchronize the test and the
+ // running visitor thread.
+ std::this_thread::sleep_for(100ms);
+ CPPUNIT_ASSERT_EQUAL(size_t(0), _bottom->getNumCommands());
+
+ std::vector<document::Document::SP> docs;
+ std::vector<document::DocumentId> docIds;
+ std::vector<std::string> infoMessages;
+ getMessagesAndReply(1, getSession(0), docs, docIds, infoMessages);
+
+ // 2nd round of GetIter now allowed. Send reply indicating completion.
+ getIterCmd = fetchSingleCommand<GetIterCommand>(*_bottom);
+ sendGetIterReply(*getIterCmd,
+ api::ReturnCode(api::ReturnCode::OK),
+ 1,
+ true);
+
+ getMessagesAndReply(1, getSession(0), docs, docIds, infoMessages);
+
+ DestroyIteratorCommand::SP destroyIterCmd(
+ fetchSingleCommand<DestroyIteratorCommand>(*_bottom));
+
+ verifyCreateVisitorReply(api::ReturnCode::OK);
+ CPPUNIT_ASSERT(waitUntilNoActiveVisitors());
+}
+
+void
+VisitorTest::doTestVisitorInstanceHasConsistencyLevel(
+ vespalib::stringref visitorType,
+ spi::ReadConsistency expectedConsistency)
+{
+ initializeTest();
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ makeCreateVisitor(VisitorOptions().withVisitorType(visitorType)));
+ _top->sendDown(cmd);
+
+ auto createCmd = fetchSingleCommand<CreateIteratorCommand>(*_bottom);
+ CPPUNIT_ASSERT_EQUAL(expectedConsistency,
+ createCmd->getReadConsistency());
+}
+
+void
+VisitorTest::testDumpVisitorInvokesStrongReadConsistencyIteration()
+{
+ doTestVisitorInstanceHasConsistencyLevel(
+ "dumpvisitor", spi::ReadConsistency::STRONG);
+}
+
+// NOTE: SearchVisitor cannot be tested here since it's in a separate module
+// which depends on _this_ module for compilation. Instead we let TestVisitor
+// use weak consistency, as this is just some internal stuff not used for/by
+// any external client use cases. Our primary concern is to test that each
+// visitor subclass might report its own read consistency requirement and that
+// this is carried along to the CreateIteratorCommand.
+void
+VisitorTest::testTestVisitorInvokesWeakReadConsistencyIteration()
+{
+ doTestVisitorInstanceHasConsistencyLevel(
+ "testvisitor", spi::ReadConsistency::WEAK);
+}
+
+} // namespace storage
diff --git a/storage/src/versiontag.mak b/storage/src/versiontag.mak
new file mode 100644
index 00000000000..ffd6a4c899f
--- /dev/null
+++ b/storage/src/versiontag.mak
@@ -0,0 +1,7 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+VTAG := $(shell $(VESPALIB_DIR)/bin/getversion -D $(TOP) )
+
+ifneq (X$(SPECIFIED_VTAG),XDISABLE)
+ VTAG += -DV_TAG='"$(SPECIFIED_VTAG)"'
+endif
diff --git a/storage/src/vespa/storage/.gitignore b/storage/src/vespa/storage/.gitignore
new file mode 100644
index 00000000000..107a86953a4
--- /dev/null
+++ b/storage/src/vespa/storage/.gitignore
@@ -0,0 +1,9 @@
+*.So
+*.lo
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
+features.h
+/libstorage.so.5.1
diff --git a/storage/src/vespa/storage/CMakeLists.txt b/storage/src/vespa/storage/CMakeLists.txt
new file mode 100644
index 00000000000..2adfe73db38
--- /dev/null
+++ b/storage/src/vespa/storage/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage
+ SOURCES
+ $<TARGET_OBJECTS:storage_bucketdb>
+ $<TARGET_OBJECTS:storage_common>
+ $<TARGET_OBJECTS:storage_storageconfig>
+ $<TARGET_OBJECTS:storage_hostreporter>
+ $<TARGET_OBJECTS:storage_distributoroperation>
+ $<TARGET_OBJECTS:storage_distributoroperationexternal>
+ $<TARGET_OBJECTS:storage_distributoroperationidealstate>
+ $<TARGET_OBJECTS:storage_distributormaintenance>
+ $<TARGET_OBJECTS:storage_filestorpersistence>
+ $<TARGET_OBJECTS:storage_spersistence>
+ $<TARGET_OBJECTS:storage_storageserver>
+ $<TARGET_OBJECTS:storage_storageutil>
+ $<TARGET_OBJECTS:storage_visitor>
+ $<TARGET_OBJECTS:storage_bucketmover>
+ $<TARGET_OBJECTS:storage_thread>
+ $<TARGET_OBJECTS:storage_status>
+ $<TARGET_OBJECTS:storage_memory>
+ $<TARGET_OBJECTS:storage_component>
+ INSTALL lib64
+ DEPENDS
+ storage_distributor
+)
diff --git a/storage/src/vespa/storage/bucketdb/.gitignore b/storage/src/vespa/storage/bucketdb/.gitignore
new file mode 100644
index 00000000000..54d8faa8201
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/.gitignore
@@ -0,0 +1,11 @@
+*.So
+*.lo
+.*.swp
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
+config-stor-bucketdb.*
+/config-stor-bucket-init.h
+/config-stor-bucket-init.cpp
diff --git a/storage/src/vespa/storage/bucketdb/CMakeLists.txt b/storage/src/vespa/storage/bucketdb/CMakeLists.txt
new file mode 100644
index 00000000000..189f773dcbe
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/CMakeLists.txt
@@ -0,0 +1,17 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_bucketdb OBJECT
+ SOURCES
+ storagebucketdbinitializer.cpp
+ distrbucketdb.cpp
+ storbucketdb.cpp
+ judyarray.cpp
+ bucketmanager.cpp
+ distribution_hash_normalizer.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
+vespa_generate_config(storage_bucketdb stor-bucketdb.def)
+install(FILES stor-bucketdb.def DESTINATION var/db/vespa/config_server/serverdb/classes)
+vespa_generate_config(storage_bucketdb stor-bucket-init.def)
+install(FILES stor-bucket-init.def DESTINATION var/db/vespa/config_server/serverdb/classes)
diff --git a/storage/src/vespa/storage/bucketdb/bucketmanager.cpp b/storage/src/vespa/storage/bucketdb/bucketmanager.cpp
new file mode 100644
index 00000000000..109db0a7c0f
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/bucketmanager.cpp
@@ -0,0 +1,871 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/bucketdb/bucketmanager.h>
+
+#include <iomanip>
+#include <vespa/log/log.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/vdslib/state/nodetype.h>
+#include <vespa/storage/bucketdb/minimumusedbitstracker.h>
+#include <vespa/storage/bucketdb/distribution_hash_normalizer.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/storageserver/storagemetricsset.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storageapi/message/stat.h>
+#include <vespa/storage/storageutil/distributorstatecache.h>
+#include <vespa/config/config.h>
+#include <unordered_map>
+
+LOG_SETUP(".storage.bucketdb.manager");
+
+namespace storage {
+
+BucketManager::BucketManager(const config::ConfigUri & configUri,
+ ServiceLayerComponentRegister& compReg)
+ : StorageLinkQueued("Bucket manager", compReg),
+ framework::StatusReporter("bucketdb", "Bucket database"),
+ _configUri(configUri),
+ _stateAccess(),
+ _bucketDBMemoryToken(),
+ _workerMonitor(),
+ _clusterStateLock(),
+ _queueProcessingLock(),
+ _queuedReplies(),
+ _firstEqualClusterStateVersion(0),
+ _lastClusterStateSeen(0),
+ _lastUnifiedClusterState(""),
+ _metrics(new BucketManagerMetrics),
+ _doneInitialized(false),
+ _requestsCurrentlyProcessing(0),
+ _component(compReg, "bucketmanager")
+{
+ const framework::MemoryAllocationType& allocType(
+ _component.getMemoryManager().registerAllocationType(
+ framework::MemoryAllocationType("DATABASE")));
+ _bucketDBMemoryToken = _component.getMemoryManager().allocate(
+ allocType, 0, 0, api::StorageMessage::HIGH);
+ assert(_bucketDBMemoryToken.get() != 0);
+ _metrics->setDisks(_component.getDiskCount());
+ _component.registerStatusPage(*this);
+ _component.registerMetric(*_metrics);
+ _component.registerMetricUpdateHook(*this, framework::SecondTime(300));
+
+ // Initialize min used bits to default value used here.
+ NodeStateUpdater::Lock::SP lock(
+ _component.getStateUpdater().grabStateChangeLock());
+ lib::NodeState ns(
+ *_component.getStateUpdater().getReportedNodeState());
+ ns.setMinUsedBits(58);
+ _component.getStateUpdater().setReportedNodeState(ns);
+}
+
+BucketManager::~BucketManager()
+{
+ if (_thread.get() != 0) {
+ LOG(error, "BucketManager deleted without calling close() first");
+ onClose();
+ }
+ LOG(debug, "Deleting link %s.", toString().c_str());
+ closeNextLink();
+}
+
+void BucketManager::onClose()
+{
+ // Stop internal thread such that we don't send any more messages down.
+ if (_thread.get() != 0) {
+ _thread->interruptAndJoin(&_workerMonitor);
+ _thread.reset(0);
+ }
+ StorageLinkQueued::onClose();
+}
+
+void
+BucketManager::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "BucketManager()";
+}
+
+namespace {
+
+ template<bool log>
+ class DistributorInfoGatherer
+ {
+ typedef api::RequestBucketInfoReply::EntryVector ResultArray;
+ DistributorStateCache _state;
+ std::unordered_map<uint16_t, ResultArray>& _result;
+ const document::BucketIdFactory& _factory;
+ std::shared_ptr<lib::Distribution> _storageDistribution;
+
+ public:
+ DistributorInfoGatherer(
+ const lib::ClusterState& systemState,
+ std::unordered_map<uint16_t, ResultArray>& result,
+ const document::BucketIdFactory& factory,
+ std::shared_ptr<lib::Distribution> distribution)
+ : _state(*distribution, systemState),
+ _result(result),
+ _factory(factory),
+ _storageDistribution(distribution)
+ {
+ }
+
+ StorBucketDatabase::Decision operator()(uint64_t bucketId,
+ StorBucketDatabase::Entry& data)
+ {
+ document::BucketId b(document::BucketId::keyToBucketId(bucketId));
+ try{
+ uint16_t i = _state.getOwner(b);
+ auto it = _result.find(i);
+ // Template parameter. This block should not be included
+ // in version not logging.
+ if (log) {
+ LOG(spam, "Bucket %s (reverse %" PRIu64 "), should be handled"
+ " by distributor %u which we are %sgenerating "
+ "state for.",
+ b.toString().c_str(), bucketId, i,
+ it == _result.end() ? "not " : "");
+ }
+ if (it != _result.end()) {
+ api::RequestBucketInfoReply::Entry entry;
+ entry._bucketId = b;
+ entry._info = data.getBucketInfo();
+ it->second.push_back(entry);
+ }
+ } catch (lib::TooFewBucketBitsInUseException& e) {
+ LOGBP(warning, "Cannot assign bucket %s to a distributor "
+ " as bucket only specifies %u bits.",
+ b.toString().c_str(),
+ b.getUsedBits());
+ } catch (lib::NoDistributorsAvailableException& e) {
+ LOGBP(warning, "No distributors available while processing "
+ "request bucket info. Distribution hash: %s, "
+ "cluster state: %s",
+ _state.getDistribution().getNodeGraph()
+ .getDistributionConfigHash().c_str(),
+ _state.getClusterState().toString().c_str());
+ }
+ return StorBucketDatabase::CONTINUE;
+ }
+
+ };
+
+ struct MetricsUpdater {
+ struct Count {
+ uint64_t docs;
+ uint64_t bytes;
+ uint64_t buckets;
+ uint64_t active;
+ uint64_t ready;
+
+ Count() : docs(0), bytes(0), buckets(0), active(0), ready(0) {}
+ };
+
+ uint16_t diskCount;
+ std::vector<Count> disk;
+ uint32_t lowestUsedBit;
+
+ MetricsUpdater(uint16_t diskCnt)
+ : diskCount(diskCnt), disk(diskCnt), lowestUsedBit(58) {}
+
+ StorBucketDatabase::Decision operator()(
+ document::BucketId::Type bucketId,
+ StorBucketDatabase::Entry& data)
+ {
+ document::BucketId bucket(
+ document::BucketId::keyToBucketId(bucketId));
+
+ if (data.valid()) {
+ assert(data.disk < diskCount);
+ ++disk[data.disk].buckets;
+ if (data.getBucketInfo().isActive()) {
+ ++disk[data.disk].active;
+ }
+ if (data.getBucketInfo().isReady()) {
+ ++disk[data.disk].ready;
+ }
+ disk[data.disk].docs += data.getBucketInfo().getDocumentCount();
+ disk[data.disk].bytes += data.getBucketInfo().getTotalDocumentSize();
+
+ if (bucket.getUsedBits() < lowestUsedBit) {
+ lowestUsedBit = bucket.getUsedBits();
+ }
+ }
+
+ return StorBucketDatabase::CONTINUE;
+ };
+ };
+
+} // End of anonymous namespace
+
+StorBucketDatabase::Entry
+BucketManager::getBucketInfo(const document::BucketId& id) const
+{
+ StorBucketDatabase::WrappedEntry entry(
+ _component.getBucketDatabase().get(id, "BucketManager::getBucketInfo"));
+ return *entry;
+}
+
+void
+BucketManager::updateMetrics(bool updateDocCount)
+{
+ LOG(debug, "Iterating bucket database to update metrics%s%s",
+ updateDocCount ? "" : ", minusedbits only",
+ _doneInitialized ? "" : ", server is not done initializing");
+ uint64_t dbMemSize = _component.getBucketDatabase().getMemoryUsage();
+ _bucketDBMemoryToken->resize(dbMemSize, dbMemSize);
+
+ uint32_t diskCount = _component.getDiskCount();
+ if (!updateDocCount || _doneInitialized) {
+ MetricsUpdater m(diskCount);
+ _component.getBucketDatabase().chunkedAll(
+ m, "BucketManager::updateMetrics");
+ if (updateDocCount) {
+ for (uint16_t i = 0; i< diskCount; i++) {
+ _metrics->disks[i]->buckets.addValue(m.disk[i].buckets);
+ _metrics->disks[i]->docs.addValue(m.disk[i].docs);
+ _metrics->disks[i]->bytes.addValue(m.disk[i].bytes);
+ _metrics->disks[i]->active.addValue(m.disk[i].active);
+ _metrics->disks[i]->ready.addValue(m.disk[i].ready);
+ }
+ }
+ }
+}
+
+void BucketManager::updateMinUsedBits()
+{
+ MetricsUpdater m(_component.getDiskCount());
+ _component.getBucketDatabase().chunkedAll(
+ m, "BucketManager::updateMetrics");
+ // When going through to get sizes, we also record min bits
+ MinimumUsedBitsTracker& bitTracker(_component.getMinUsedBitsTracker());
+ if (bitTracker.getMinUsedBits() != m.lowestUsedBit) {
+ NodeStateUpdater::Lock::SP lock(
+ _component.getStateUpdater().grabStateChangeLock());
+ lib::NodeState ns(
+ *_component.getStateUpdater().getReportedNodeState());
+ bitTracker.setMinUsedBits(m.lowestUsedBit);
+ ns.setMinUsedBits(m.lowestUsedBit);
+ _component.getStateUpdater().setReportedNodeState(ns);
+ }
+}
+
+// Responsible for sending on messages that was previously queued
+void BucketManager::run(framework::ThreadHandle& thread)
+{
+ const int64_t CHECK_MINUSEDBITS_INTERVAL = 1000*30;
+ framework::MilliSecTime timeToCheckMinUsedBits(0);
+ while (!thread.interrupted()) {
+ bool didWork = false;
+ BIList infoReqs;
+ {
+ vespalib::MonitorGuard monitor(_workerMonitor);
+ infoReqs.swap(_bucketInfoRequests);
+ }
+
+ didWork |= processRequestBucketInfoCommands(infoReqs);
+
+ {
+ vespalib::MonitorGuard monitor(_workerMonitor);
+ if (!infoReqs.empty()) {
+ infoReqs.insert(infoReqs.end(),
+ _bucketInfoRequests.begin(), _bucketInfoRequests.end());
+ _bucketInfoRequests.swap(infoReqs);
+ }
+ if (!didWork) {
+ monitor.wait(1000);
+ thread.registerTick(framework::WAIT_CYCLE);
+ } else {
+ thread.registerTick(framework::PROCESS_CYCLE);
+ }
+ }
+ if (timeToCheckMinUsedBits < _component.getClock().getTimeInMillis()) {
+ updateMinUsedBits();
+ timeToCheckMinUsedBits = _component.getClock().getTimeInMillis();
+ timeToCheckMinUsedBits += framework::MilliSecTime(CHECK_MINUSEDBITS_INTERVAL);
+ }
+ }
+}
+
+vespalib::string
+BucketManager::getReportContentType(const framework::HttpUrlPath& path) const
+{
+ bool showAll = path.hasAttribute("showall");
+ if (showAll) {
+ return "application/xml";
+ } else {
+ return "text/html";
+ }
+}
+
+namespace {
+ class BucketDBDumper {
+ vespalib::XmlOutputStream& _xos;
+ public:
+ BucketDBDumper(vespalib::XmlOutputStream& xos) : _xos(xos) {}
+
+ StorBucketDatabase::Decision operator()(
+ uint64_t bucketId, StorBucketDatabase::Entry& info)
+ {
+ using namespace vespalib::xml;
+ document::BucketId bucket(
+ document::BucketId::keyToBucketId(bucketId));
+
+ std::ostringstream ost;
+ ost << "0x" << std::hex << std::setw(16)
+ << std::setfill('0') << bucket.getId();
+
+ _xos << XmlTag("bucket")
+ << XmlAttribute("id", ost.str());
+ info.getBucketInfo().printXml(_xos);
+ _xos << XmlAttribute("disk", info.disk);
+ _xos << XmlEndTag();
+ return StorBucketDatabase::CONTINUE;
+ };
+ };
+}
+
+bool
+BucketManager::reportStatus(std::ostream& out,
+ const framework::HttpUrlPath& path) const
+{
+ bool showAll = path.hasAttribute("showall");
+ if (showAll) {
+ framework::PartlyXmlStatusReporter xmlReporter(*this, out, path);
+ xmlReporter << vespalib::xml::XmlTag("buckets");
+ BucketDBDumper dumper(xmlReporter.getStream());
+ _component.getBucketDatabase().chunkedAll(
+ dumper, "BucketManager::getStatus");
+ xmlReporter << vespalib::xml::XmlEndTag();
+ } else {
+ framework::PartlyHtmlStatusReporter htmlReporter(*this);
+ htmlReporter.reportHtmlHeader(out, path);
+ // Print menu
+ out << "<font size=\"-1\">[ <a href=\"/\">Back to top</a>"
+ << " | <a href=\"?showall\">Show all buckets</a> ]</font>";
+ htmlReporter.reportHtmlFooter(out, path);
+ }
+ return true;
+}
+
+void
+BucketManager::dump(std::ostream& out) const
+{
+ vespalib::XmlOutputStream xos(out);
+ BucketDBDumper dumper(xos);
+ _component.getBucketDatabase().chunkedAll(dumper, 0);
+}
+
+
+void BucketManager::onOpen()
+{
+ if (!_configUri.empty()) {
+ startWorkerThread();
+ }
+}
+
+void BucketManager::startWorkerThread()
+{
+ framework::MilliSecTime maxProcessingTime(30 * 1000);
+ framework::MilliSecTime waitTime(1000);
+ _thread = _component.startThread(*this, maxProcessingTime, waitTime);
+}
+
+void BucketManager::onFlush(bool downwards)
+{
+ StorageLinkQueued::onFlush(downwards);
+}
+
+// --------- Commands --------- //
+
+bool BucketManager::onRequestBucketInfo(
+ const std::shared_ptr<api::RequestBucketInfoCommand>& cmd)
+{
+ LOG(debug, "Got request bucket info command");
+ if (cmd->getBuckets().size() == 0 && cmd->hasSystemState()) {
+
+ vespalib::MonitorGuard monitor(_workerMonitor);
+ _bucketInfoRequests.push_back(cmd);
+ monitor.signal();
+ LOG(spam, "Scheduled request bucket info request for retrieval");
+ return true;
+ }
+
+ ScopedQueueDispatchGuard queueGuard(*this);
+
+ api::RequestBucketInfoReply::EntryVector info;
+ if (cmd->getBuckets().size()) {
+ typedef std::map<document::BucketId,
+ StorBucketDatabase::WrappedEntry> BucketMap;
+ for (uint32_t i = 0; i < cmd->getBuckets().size(); i++) {
+ BucketMap entries(_component.getBucketDatabase().getAll(
+ cmd->getBuckets()[i],
+ "BucketManager::onRequestBucketInfo"));
+ for (BucketMap::iterator it = entries.begin();
+ it != entries.end(); ++it)
+ {
+ info.push_back(api::RequestBucketInfoReply::Entry(
+ it->first, it->second->getBucketInfo()));
+ }
+ }
+ } else {
+ LOG(error, "We don't support fetching bucket info without bucket "
+ "list or system state");
+ assert(false);
+ }
+ _metrics->simpleBucketInfoRequestSize.addValue(info.size());
+ auto reply = std::make_shared<api::RequestBucketInfoReply>(*cmd);
+ reply->getBucketInfo().swap(info);
+ LOG(spam, "Sending %s", reply->toString().c_str());
+
+ LOG(spam, "Returning list of checksums:");
+ for (const auto & entry : reply->getBucketInfo()) {
+ LOG(spam, "%s: %s",
+ entry._bucketId.toString().c_str(),
+ entry._info.toString().c_str());
+ }
+ dispatchUp(reply);
+ // Remaining replies dispatched by queueGuard upon function exit.
+ return true;
+}
+
+namespace {
+ std::string unifyState(const lib::ClusterState& state) {
+ std::vector<char> distributors(
+ state.getNodeCount(lib::NodeType::DISTRIBUTOR), 'd');
+
+ uint32_t length = 0;
+ for (uint32_t i = 0; i < distributors.size(); ++i) {
+ const lib::NodeState& ns(state.getNodeState(
+ lib::Node(lib::NodeType::DISTRIBUTOR, i)));
+ if (ns.getState().oneOf("uirm")) {
+ distributors[i] = 'u';
+ length = i + 1;
+ }
+ }
+ return std::string(&distributors[0], length);
+ }
+}
+
+BucketManager::ScopedQueueDispatchGuard::ScopedQueueDispatchGuard(
+ BucketManager& mgr)
+ : _mgr(mgr)
+{
+ _mgr.enterQueueProtectedSection();
+}
+
+BucketManager::ScopedQueueDispatchGuard::~ScopedQueueDispatchGuard()
+{
+ _mgr.leaveQueueProtectedSection(*this);
+}
+
+void
+BucketManager::enterQueueProtectedSection()
+{
+ vespalib::LockGuard guard(_queueProcessingLock);
+ ++_requestsCurrentlyProcessing;
+}
+
+void
+BucketManager::leaveQueueProtectedSection(ScopedQueueDispatchGuard& queueGuard)
+{
+ (void) queueGuard; // Only used to enforce guard is held while calling.
+ vespalib::LockGuard guard(_queueProcessingLock);
+ assert(_requestsCurrentlyProcessing > 0);
+ // Full bucket info fetches may be concurrently interleaved with bucket-
+ // specific fetches outside of the processing thread. We only allow queued
+ // messages to go through once _all_ of these are done, since we do not
+ // keep per-bucket info request queues and thus cannot know which replies
+ // may alter the relevant state.
+ --_requestsCurrentlyProcessing;
+ if (_requestsCurrentlyProcessing == 0) {
+ for (auto& qr : _queuedReplies) {
+ dispatchUp(qr);
+ }
+ _queuedReplies.clear();
+ _conflictingBuckets.clear();
+ }
+}
+
+bool
+BucketManager::processRequestBucketInfoCommands(BIList& reqs)
+{
+ if (reqs.empty()) return false;
+
+ ScopedQueueDispatchGuard queueGuard(*this);
+
+ // - Fail all but the latest request for each node.
+ // - Fail all requests to a cluster state that after unification differs
+ // from the current cluster state.
+
+ std::set<uint16_t> seenDistributors;
+ typedef std::shared_ptr<api::RequestBucketInfoCommand> RBISP;
+ std::map<uint16_t, RBISP> requests;
+
+ lib::Distribution::SP distribution(_component.getDistribution());
+ lib::ClusterState::CSP clusterState(
+ _component.getStateUpdater().getSystemState());
+ assert(clusterState.get());
+
+ DistributionHashNormalizer normalizer;
+
+ const auto our_hash = normalizer.normalize(
+ distribution->getNodeGraph().getDistributionConfigHash());
+
+ LOG(debug, "Processing %" PRIu64 " queued request bucket info commands. "
+ "Using cluster state '%s' and distribution hash '%s'",
+ reqs.size(),
+ clusterState->toString().c_str(),
+ our_hash.c_str());
+
+ vespalib::LockGuard lock(_clusterStateLock);
+ for (BIList::reverse_iterator it = reqs.rbegin(); it != reqs.rend(); ++it) {
+ // Currently small requests should not be forwarded to worker thread
+ assert((*it)->hasSystemState());
+ const auto their_hash = normalizer.normalize(
+ (*it)->getDistributionHash());
+
+ std::ostringstream error;
+ if ((*it)->getSystemState().getVersion() > _lastClusterStateSeen) {
+ error << "Ignoring bucket info request for cluster state version "
+ << (*it)->getSystemState().getVersion() << " as newest "
+ << "version we know of is " << _lastClusterStateSeen;
+ } else if ((*it)->getSystemState().getVersion()
+ < _firstEqualClusterStateVersion)
+ {
+ error << "Ignoring bucket info request for cluster state version "
+ << (*it)->getSystemState().getVersion() << " as versions "
+ << "from version " << _firstEqualClusterStateVersion
+ << " differs from this state.";
+ } else if (!their_hash.empty() && their_hash != our_hash) {
+ // Empty hash indicates request from 4.2 protocol or earlier
+ error << "Distribution config has changed since request.";
+ }
+ if (error.str().empty()) {
+ std::pair<std::set<uint16_t>::iterator, bool> result(
+ seenDistributors.insert((*it)->getDistributor()));
+ if (result.second) {
+ requests[(*it)->getDistributor()] = *it;
+ continue;
+ } else {
+ error << "There is already a newer bucket info request for this"
+ << " node from distributor " << (*it)->getDistributor();
+ }
+ }
+
+ // If we get here, message should be failed
+ auto reply = std::make_shared<api::RequestBucketInfoReply>(**it);
+ reply->setResult(api::ReturnCode(
+ api::ReturnCode::REJECTED, error.str()));
+ LOG(debug, "Rejecting request from distributor %u: %s",
+ (*it)->getDistributor(),
+ error.str().c_str());
+ dispatchUp(reply);
+ }
+
+ if (requests.empty()) {
+ reqs.clear();
+ return true; // No need to waste CPU when no requests are left.
+ }
+
+ std::ostringstream distrList;
+ std::unordered_map<
+ uint16_t,
+ api::RequestBucketInfoReply::EntryVector
+ > result;
+ for (auto& nodeAndCmd : requests) {
+ result[nodeAndCmd.first];
+ if (LOG_WOULD_LOG(debug)) {
+ distrList << ' ' << nodeAndCmd.first;
+ }
+ }
+
+ _metrics->fullBucketInfoRequestSize.addValue(requests.size());
+ LOG(debug, "Processing %" PRIu64 " bucket info requests for "
+ "distributors %s, using system state %s",
+ requests.size(), distrList.str().c_str(),
+ clusterState->toString().c_str());
+ framework::MilliSecTimer runStartTime(_component.getClock());
+ // Don't allow logging to lower performance of inner loop.
+ // Call other type of instance if logging
+ const document::BucketIdFactory& idFac(_component.getBucketIdFactory());
+ if (LOG_WOULD_LOG(spam)) {
+ DistributorInfoGatherer<true> builder(
+ *clusterState, result, idFac, distribution);
+ _component.getBucketDatabase().chunkedAll(builder,
+ "BucketManager::processRequestBucketInfoCommands-1");
+ } else {
+ DistributorInfoGatherer<false> builder(
+ *clusterState, result, idFac, distribution);
+ _component.getBucketDatabase().chunkedAll(builder,
+ "BucketManager::processRequestBucketInfoCommands-2");
+ }
+ _metrics->fullBucketInfoLatency.addValue(runStartTime);
+ for (auto& nodeAndCmd : requests) {
+ auto reply(std::make_shared<api::RequestBucketInfoReply>(
+ *nodeAndCmd.second));
+ reply->getBucketInfo().swap(result[nodeAndCmd.first]);
+ dispatchUp(reply);
+ }
+
+ reqs.clear();
+
+ // Remaining replies dispatched by queueGuard upon function exit.
+ return true;
+}
+
+size_t
+BucketManager::bucketInfoRequestsCurrentlyProcessing() const noexcept
+{
+ vespalib::LockGuard guard(_queueProcessingLock);
+ return _requestsCurrentlyProcessing;
+}
+
+bool
+BucketManager::onUp(const std::shared_ptr<api::StorageMessage>& msg)
+{
+ if (!StorageLink::onUp(msg)) {
+ dispatchUp(msg);
+ }
+ return true;
+}
+
+bool
+BucketManager::verifyAndUpdateLastModified(api::StorageCommand& cmd,
+ const document::BucketId& bucketId,
+ uint64_t lastModified)
+{
+ LOG(spam, "Received operation %s with modification timestamp %zu",
+ cmd.toString().c_str(),
+ lastModified);
+
+ uint64_t prevLastModified = 0;
+
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _component.getBucketDatabase().get(bucketId, "BucketManager::verify"));
+
+ if (entry.exist()) {
+ prevLastModified = entry->info.getLastModified();
+
+ if (lastModified > prevLastModified) {
+ entry->info.setLastModified(lastModified);
+ entry.write();
+ return true;
+ }
+ } else {
+ return true;
+ }
+ }
+
+ api::StorageReply::UP reply = cmd.makeReply();
+ reply->setResult(api::ReturnCode(
+ api::ReturnCode::STALE_TIMESTAMP,
+ vespalib::make_string(
+ "Received command %s with a lower/equal timestamp "
+ " (%zu) than the last operation received for "
+ "bucket %s, with timestamp %zu",
+ cmd.toString().c_str(),
+ lastModified,
+ bucketId.toString().c_str(),
+ prevLastModified)));
+
+
+ sendUp(api::StorageMessage::SP(reply.release()));
+ return false;
+}
+
+bool
+BucketManager::onSetSystemState(
+ const std::shared_ptr<api::SetSystemStateCommand>& cmd)
+{
+ LOG(debug, "onSetSystemState(%s)", cmd->toString().c_str());
+ const lib::ClusterState& state(cmd->getSystemState());
+ std::string unified(unifyState(state));
+ vespalib::LockGuard lock(_clusterStateLock);
+ if (unified != _lastUnifiedClusterState
+ || state.getVersion() != _lastClusterStateSeen + 1)
+ {
+ _lastUnifiedClusterState = unified;
+ _firstEqualClusterStateVersion = state.getVersion();
+ }
+ _lastClusterStateSeen = state.getVersion();
+ return false;
+}
+
+bool
+BucketManager::onCreateBucket(const api::CreateBucketCommand::SP& cmd)
+{
+ MinimumUsedBitsTracker& bitTracker(_component.getMinUsedBitsTracker());
+ if (bitTracker.update(cmd->getBucketId())) {
+ NodeStateUpdater::Lock::SP lock(
+ _component.getStateUpdater().grabStateChangeLock());
+ lib::NodeState ns(
+ *_component.getStateUpdater().getReportedNodeState());
+ ns.setMinUsedBits(bitTracker.getMinUsedBits());
+ _component.getStateUpdater().setReportedNodeState(ns);
+ }
+
+ return false;
+}
+
+bool
+BucketManager::onMergeBucket(const api::MergeBucketCommand::SP& cmd)
+{
+ MinimumUsedBitsTracker& bitTracker(_component.getMinUsedBitsTracker());
+ if (bitTracker.update(cmd->getBucketId())) {
+ NodeStateUpdater::Lock::SP lock(
+ _component.getStateUpdater().grabStateChangeLock());
+ lib::NodeState ns(
+ *_component.getStateUpdater().getReportedNodeState());
+ ns.setMinUsedBits(bitTracker.getMinUsedBits());
+ _component.getStateUpdater().setReportedNodeState(ns);
+ }
+ return false;
+}
+
+bool
+BucketManager::onRemove(const api::RemoveCommand::SP& cmd)
+{
+ if (!verifyAndUpdateLastModified(*cmd,
+ cmd->getBucketId(),
+ cmd->getTimestamp())) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
+BucketManager::onRemoveReply(const api::RemoveReply::SP& reply)
+{
+ return enqueueIfBucketHasConflicts(reply);
+}
+
+bool
+BucketManager::onPut(const api::PutCommand::SP& cmd)
+{
+ if (!verifyAndUpdateLastModified(*cmd,
+ cmd->getBucketId(),
+ cmd->getTimestamp())) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
+BucketManager::onPutReply(const api::PutReply::SP& reply)
+{
+ return enqueueIfBucketHasConflicts(reply);
+}
+
+bool
+BucketManager::onUpdate(const api::UpdateCommand::SP& cmd)
+{
+ if (!verifyAndUpdateLastModified(*cmd,
+ cmd->getBucketId(),
+ cmd->getTimestamp())) {
+ return true;
+ }
+
+ return false;
+}
+
+bool
+BucketManager::onUpdateReply(const api::UpdateReply::SP& reply)
+{
+ return enqueueIfBucketHasConflicts(reply);
+}
+
+bool
+BucketManager::onNotifyBucketChangeReply(
+ const api::NotifyBucketChangeReply::SP& reply)
+{
+ (void) reply;
+ // Handling bucket change replies is a no-op.
+ return true;
+}
+
+bool
+BucketManager::enqueueIfBucketHasConflicts(const api::BucketReply::SP& reply)
+{
+ // Should very rarely contend, since persistence replies are all sent up
+ // via a single dispatcher thread.
+ vespalib::LockGuard guard(_queueProcessingLock);
+ if (_requestsCurrentlyProcessing == 0) {
+ return false; // Nothing to do here; pass through reply.
+ }
+ if (replyConflictsWithConcurrentOperation(*reply)) {
+ LOG(debug,
+ "Reply %s conflicted with a bucket that has been concurrently "
+ "modified while a RequestBucketInfo was active; enqueuing it.",
+ reply->toString().c_str());
+ _queuedReplies.push_back(reply);
+ return true;
+ }
+ return false; // No conflicting ops in queue.
+}
+
+bool
+BucketManager::replyConflictsWithConcurrentOperation(
+ const api::BucketReply& reply) const
+{
+ if (bucketHasConflicts(reply.getBucketId())) {
+ return true;
+ }
+ // A Put (or Update/Remove) scheduled towards a bucket that is split or
+ // joined will be "remapped" to a new bucket id that is the _result_ of
+ // said operation. This means that the bucket id for a split reply and
+ // a put reply originally for that bucket will differ and just checking
+ // on getBucketId() would not capture all true conflicts. However, replies
+ // know whether they've been remapped and we can get the non-remapped
+ // bucket from it (the "original" bucket).
+ return (reply.hasBeenRemapped()
+ && bucketHasConflicts(reply.getOriginalBucketId()));
+}
+
+bool
+BucketManager::enqueueAsConflictIfProcessingRequest(
+ const api::StorageReply::SP& reply)
+{
+ vespalib::LockGuard guard(_queueProcessingLock);
+ if (_requestsCurrentlyProcessing != 0) {
+ LOG(debug, "Enqueued %s due to concurrent RequestBucketInfo",
+ reply->toString().c_str());
+ _queuedReplies.push_back(reply);
+ _conflictingBuckets.insert(reply->getBucketId());
+ return true;
+ }
+ return false;
+}
+
+bool
+BucketManager::onSplitBucketReply(const api::SplitBucketReply::SP& reply)
+{
+ return enqueueAsConflictIfProcessingRequest(reply);
+}
+
+bool
+BucketManager::onJoinBucketsReply(const api::JoinBucketsReply::SP& reply)
+{
+ return enqueueAsConflictIfProcessingRequest(reply);
+}
+
+bool
+BucketManager::onDeleteBucketReply(const api::DeleteBucketReply::SP& reply)
+{
+ return enqueueAsConflictIfProcessingRequest(reply);
+}
+
+} // storage
+
diff --git a/storage/src/vespa/storage/bucketdb/bucketmanager.h b/storage/src/vespa/storage/bucketdb/bucketmanager.h
new file mode 100644
index 00000000000..cec059c0a8e
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/bucketmanager.h
@@ -0,0 +1,245 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::BucketManager
+ * @ingroup bucketdb
+ *
+ * @brief Storage link handling requests concerning buckets.
+ *
+ * @author H�kon Humberset
+ * @date 2006-01-16
+ * @version $Id$
+ */
+
+#pragma once
+
+#include <vespa/vespalib/util/printable.h>
+#include <vespa/vespalib/util/document_runnable.h>
+#include <memory>
+#include <vespa/storage/bucketdb/config-stor-bucketdb.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storage/common/servicelayercomponent.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storage/common/storagelinkqueued.h>
+#include <vespa/storage/bucketdb/bucketmanagermetrics.h>
+#include <vespa/storageframework/generic/memory/memorymanagerinterface.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <unordered_set>
+
+namespace storage {
+
+class BucketManager : public StorageLinkQueued,
+ public framework::StatusReporter,
+ private framework::Runnable,
+ private framework::MetricUpdateHook
+{
+ /** Type used for message queues */
+ typedef std::list<std::shared_ptr<api::StorageCommand> > CommandList;
+ typedef std::list<std::shared_ptr<api::RequestBucketInfoCommand> > BIList;
+
+ config::ConfigUri _configUri;
+
+ uint32_t _chunkLevel;
+ mutable vespalib::Lock _stateAccess;
+ framework::MemoryToken::UP _bucketDBMemoryToken;
+ BIList _bucketInfoRequests;
+
+ /**
+ * We have our own thread running, which we use to send messages down.
+ * Take worker monitor, add to list and signal for messages to be sent.
+ */
+ mutable vespalib::Monitor _workerMonitor;
+ /**
+ * Lock kept for access to 3 values below concerning cluster state.
+ */
+ vespalib::Lock _clusterStateLock;
+
+ vespalib::Lock _queueProcessingLock;
+ using ReplyQueue = std::vector<api::StorageReply::SP>;
+ using ConflictingBuckets = std::unordered_set<document::BucketId,
+ document::BucketId::hash>;
+ ReplyQueue _queuedReplies;
+ ConflictingBuckets _conflictingBuckets;
+ /**
+ * Keeps the version number of the first cluster state version seen that
+ * after distributor unification is equal to all cluster states seen after.
+ */
+ uint32_t _firstEqualClusterStateVersion;
+ /**
+ * The last cluster state version seen. We must ensure we dont answer to
+ * cluster states we haven't seen.
+ */
+ uint32_t _lastClusterStateSeen;
+ /**
+ * The unified version of the last cluster state.
+ */
+ std::string _lastUnifiedClusterState;
+ std::shared_ptr<BucketManagerMetrics> _metrics;
+ bool _doneInitialized;
+ size_t _requestsCurrentlyProcessing;
+ ServiceLayerComponent _component;
+ framework::Thread::UP _thread;
+
+ BucketManager(const BucketManager&);
+ BucketManager& operator=(const BucketManager&);
+
+ class ScopedQueueDispatchGuard {
+ BucketManager& _mgr;
+ public:
+ ScopedQueueDispatchGuard(BucketManager&);
+ ~ScopedQueueDispatchGuard();
+
+ ScopedQueueDispatchGuard(const ScopedQueueDispatchGuard&) = delete;
+ ScopedQueueDispatchGuard& operator=(const ScopedQueueDispatchGuard&) = delete;
+ };
+
+public:
+ explicit BucketManager(const config::ConfigUri&,
+ ServiceLayerComponentRegister&);
+ ~BucketManager();
+
+ void startWorkerThread();
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ /** Dump the whole database to the given output. Use for debugging. */
+ void dump(std::ostream& out) const;
+
+ /** Get info for given bucket (Used for whitebox testing) */
+ StorBucketDatabase::Entry getBucketInfo(const document::BucketId& id) const;
+
+private:
+ friend class BucketManagerTest;
+
+ void run(framework::ThreadHandle&);
+
+ // Status::Reporter implementation
+ vespalib::string getReportContentType(
+ const framework::HttpUrlPath&) const;
+ bool reportStatus(std::ostream&, const framework::HttpUrlPath&) const;
+
+ /** Event saying node is up and running. We can start to build cache. */
+ virtual void onOpen();
+ virtual void onDoneInit() { _doneInitialized = true; }
+ virtual void onClose();
+ virtual void onFlush(bool downwards);
+
+ void updateMetrics(bool updateDocCount);
+ void updateMetrics(const MetricLockGuard &) override { updateMetrics(true); }
+ void updateMinUsedBits();
+
+ bool onRequestBucketInfo(
+ const std::shared_ptr<api::RequestBucketInfoCommand>&);
+ bool processRequestBucketInfoCommands(BIList&);
+
+ /**
+ * Enqueue reply and add its bucket to the set of conflicting buckets iff
+ * a RequestBucketInfo command is currently being processed.
+ *
+ * Returns whether request was enqueued (and should thus not be forwarded
+ * by the caller).
+ */
+ bool enqueueAsConflictIfProcessingRequest(
+ const api::StorageReply::SP& reply);
+
+ /**
+ * Signals that code is entering a section where certain bucket tree
+ * modifying replies must be enqueued to prevent distributor bucket DB
+ * inconsistencies. This does not model a regular mutex; multiple threads
+ * concurrently calling this function will not be blocked on each other.
+ *
+ * A call must always be paired with exactly one subsequent call of
+ * leaveQueueProtectedSection()
+ *
+ * Calls to this function nest so that the queue dispatch only happens
+ * when a matching number of calls to leaveQueueProtectedSection have
+ * taken place.
+ */
+ void enterQueueProtectedSection();
+ /**
+ * Leaves the current protected section and atomically dispatches any and
+ * all queued replies iff no threads are in a protected section after this
+ * has been done.
+ *
+ * Precondition: enterQueueProtectedSection must have been called earlier.
+ */
+ void leaveQueueProtectedSection(ScopedQueueDispatchGuard&);
+
+ /**
+ * Used by tests to synchronize against worker thread, as it is not
+ * otherwise directly visible to other threads when it's processing
+ * requests.
+ *
+ * Function is thread safe.
+ *
+ * Precondition: _queueProcessingLock must NOT be held.
+ */
+ size_t bucketInfoRequestsCurrentlyProcessing() const noexcept;
+
+ /**
+ * A bucket is said to have conflicts if a reply has been received that
+ * somehow changes that bucket in the bucket tree (split, join or delete)
+ * while a bucket info request is ongoing. Such replies must be queued up
+ * in order to prevent them from arriving in the wrong order at the
+ * distributor relative to the conflicting reply.
+ *
+ * During bucket info requests, we maintain a temporary conflict set against
+ * which all put, remove and update replies are checked. These will be
+ * dequeued together with the reply that caused the conflict as soon as the
+ * bucket info request is done, ensuring replies are in the original
+ * execution order.
+ *
+ * Not thread safe.
+ */
+ bool bucketHasConflicts(const document::BucketId& bucket) const noexcept {
+ return (_conflictingBuckets.find(bucket) != _conflictingBuckets.end());
+ }
+
+ /**
+ * Checks whether at least one of the reply's bucket ID or the original
+ * (in case of remappings) bucket ID match a bucket in the conflict set.
+ *
+ * Not thread safe.
+ */
+ bool replyConflictsWithConcurrentOperation(
+ const api::BucketReply& reply) const;
+
+ bool enqueueIfBucketHasConflicts(const api::BucketReply::SP& reply);
+
+ bool onUp(const std::shared_ptr<api::StorageMessage>&) override;
+ bool onSetSystemState(
+ const std::shared_ptr<api::SetSystemStateCommand>&) override;
+ bool onCreateBucket(
+ const std::shared_ptr<api::CreateBucketCommand>&) override;
+ bool onMergeBucket(
+ const std::shared_ptr<api::MergeBucketCommand>&) override;
+ bool onRemove(
+ const std::shared_ptr<api::RemoveCommand>&) override;
+ bool onRemoveReply(
+ const std::shared_ptr<api::RemoveReply>&) override;
+ bool onPut(
+ const std::shared_ptr<api::PutCommand>&) override;
+ bool onPutReply(
+ const std::shared_ptr<api::PutReply>&) override;
+ bool onUpdate(
+ const std::shared_ptr<api::UpdateCommand>&) override;
+ bool onUpdateReply(
+ const std::shared_ptr<api::UpdateReply>&) override;
+ bool onNotifyBucketChangeReply(
+ const std::shared_ptr<api::NotifyBucketChangeReply>&) override;
+
+ bool verifyAndUpdateLastModified(api::StorageCommand& cmd,
+ const document::BucketId& bucketId,
+ uint64_t lastModified);
+ bool onSplitBucketReply(
+ const std::shared_ptr<api::SplitBucketReply>&) override;
+ bool onJoinBucketsReply(
+ const std::shared_ptr<api::JoinBucketsReply>&) override;
+ bool onDeleteBucketReply(
+ const std::shared_ptr<api::DeleteBucketReply>&) override;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/bucketdb/bucketmanagermetrics.h b/storage/src/vespa/storage/bucketdb/bucketmanagermetrics.h
new file mode 100644
index 00000000000..c831e3799fe
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/bucketmanagermetrics.h
@@ -0,0 +1,80 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/metrics/metrics.h>
+
+namespace storage {
+
+struct DataStoredMetrics : public metrics::MetricSet
+{
+ typedef std::shared_ptr<DataStoredMetrics> SP;
+
+ metrics::LongValueMetric buckets;
+ metrics::LongValueMetric docs;
+ metrics::LongValueMetric bytes;
+ metrics::LongValueMetric active;
+ metrics::LongValueMetric ready;
+
+ DataStoredMetrics(const std::string& name, metrics::MetricSet* owner)
+ : metrics::MetricSet(name, "partofsum yamasdefault", "", owner, "disk"),
+ buckets("buckets", "", "buckets managed", this),
+ docs("docs", "", "documents stored", this),
+ bytes("bytes", "", "bytes stored", this),
+ active("activebuckets", "", "Number of active buckets on the node",
+ this),
+ ready("readybuckets", "", "Number of ready buckets on the node",
+ this)
+ {
+ docs.logOnlyIfSet();
+ bytes.logOnlyIfSet();
+ active.logOnlyIfSet();
+ ready.logOnlyIfSet();
+ }
+};
+
+class BucketManagerMetrics : public metrics::MetricSet
+{
+public:
+ std::vector<std::shared_ptr<DataStoredMetrics> > disks;
+ metrics::SumMetric<metrics::MetricSet> total;
+ metrics::LongValueMetric simpleBucketInfoRequestSize;
+ metrics::LongAverageMetric fullBucketInfoRequestSize;
+ metrics::LongAverageMetric fullBucketInfoLatency;
+
+ BucketManagerMetrics()
+ : metrics::MetricSet("datastored", "", ""),
+ disks(),
+ total("alldisks", "sum",
+ "Sum of data stored metrics for all disks", this),
+ simpleBucketInfoRequestSize("simplebucketinforeqsize", "",
+ "Amount of buckets returned in simple bucket info requests",
+ this),
+ fullBucketInfoRequestSize("fullbucketinforeqsize", "",
+ "Amount of distributors answered at once in full bucket "
+ "info requests.", this),
+ fullBucketInfoLatency("fullbucketinfolatency", "",
+ "Amount of time spent to process a full bucket info request",
+ this)
+
+ {
+ }
+
+ void setDisks(uint16_t numDisks) {
+ assert(numDisks > 0);
+ if (!disks.empty()) {
+ throw vespalib::IllegalStateException(
+ "Cannot initialize disks twice", VESPA_STRLOC);
+ }
+ for (uint16_t i = 0; i<numDisks; i++) {
+ disks.push_back(DataStoredMetrics::SP(
+ new DataStoredMetrics(
+ vespalib::make_string("disk%d", i), this)));
+ total.addMetricToSum(*disks.back());
+ }
+ }
+};
+
+}
+
+
diff --git a/storage/src/vespa/storage/bucketdb/distrbucketdb.cpp b/storage/src/vespa/storage/bucketdb/distrbucketdb.cpp
new file mode 100644
index 00000000000..6bca384a076
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/distrbucketdb.cpp
@@ -0,0 +1,44 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/storageutil/utils.h>
+
+LOG_SETUP(".distributor.bucketdb");
+
+namespace storage {
+namespace bucketdb {
+
+void
+DistrBucketDatabase::insert(const document::BucketId& bucket,
+ const distributor::BucketInfo& entry,
+ const char* clientId)
+{
+ bool preExisted;
+#ifdef USE_JUDY
+ return LockableMap<JudyMultiMap<distributor::BucketInfo> >::insert(
+ bucket.toKey(), entry, clientId, preExisted);
+#else
+ return LockableMap<StdMapWrapper<document::BucketId::Type,
+ distributor::BucketInfo> >::insert(
+ bucket.toKey(), entry, clientId, preExisted);
+#endif
+}
+
+DistrBucketDatabase::WrappedEntry
+DistrBucketDatabase::get(const document::BucketId& bucket, const char* clientId,
+ bool createIfNonExisting)
+{
+#ifdef USE_JUDY
+ return LockableMap<JudyMultiMap<distributor::BucketInfo> >::get(
+ bucket.stripUnused().toKey(), clientId, createIfNonExisting);
+#else
+ return LockableMap<StdMapWrapper<document::BucketId::Type,
+ distributor::BucketInfo> >::get(
+ bucket.stripUnused().toKey(), clientId, createIfNonExisting);
+#endif
+}
+
+} // storage
+
+}
diff --git a/storage/src/vespa/storage/bucketdb/distrbucketdb.h b/storage/src/vespa/storage/bucketdb/distrbucketdb.h
new file mode 100644
index 00000000000..57bdfede403
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/distrbucketdb.h
@@ -0,0 +1,53 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/bucketdb/judymultimap.h>
+#include <vespa/storage/bucketdb/lockablemap.h>
+#include <vespa/storage/bucketdb/stdmapwrapper.h>
+#include <deque>
+#include <vespa/vespalib/util/printable.h>
+#include <inttypes.h>
+#include <map>
+#include <stdexcept>
+#include <vector>
+#include <vespa/vespalib/util/sync.h>
+#include <vespa/metrics/valuemetric.h>
+#include <vespa/storage/distributor/bucketdb/bucketinfo.h>
+
+#if __WORDSIZE == 64
+ #define USE_JUDY
+#endif
+
+//#undef USE_JUDY
+
+namespace storage {
+
+namespace bucketdb {
+
+class DistrBucketDatabase
+#ifdef USE_JUDY
+ : public LockableMap<JudyMultiMap<distributor::BucketInfo> >
+#else
+ : public LockableMap<StdMapWrapper<document::BucketId::Type,
+ distributor::BucketInfo> >
+#endif
+{
+public:
+ DistrBucketDatabase() {};
+
+ typedef distributor::BucketInfo Entry;
+
+ void insert(const document::BucketId&,
+ const distributor::BucketInfo&,
+ const char* clientId);
+
+ WrappedEntry get(const document::BucketId& bucket,
+ const char* clientId,
+ bool createIfNonExisting = false);
+};
+
+}
+
+}
+
+
diff --git a/storage/src/vespa/storage/bucketdb/distribution_hash_normalizer.cpp b/storage/src/vespa/storage/bucketdb/distribution_hash_normalizer.cpp
new file mode 100644
index 00000000000..a204ce112cd
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/distribution_hash_normalizer.cpp
@@ -0,0 +1,205 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "distribution_hash_normalizer.h"
+#include <vespa/log/log.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+#include <boost/spirit/include/qi.hpp>
+#include <boost/spirit/include/phoenix_core.hpp>
+#include <boost/spirit/include/phoenix_object.hpp>
+#include <boost/fusion/include/adapt_struct.hpp>
+#include <boost/optional.hpp>
+#include <boost/variant/recursive_wrapper.hpp>
+#include <vector>
+#include <algorithm>
+#include <iterator>
+#include <functional>
+
+LOG_SETUP(".storage.bucketdb.distribution_hash_normalizer");
+
+// TODO
+// This code can be removed once we have a model out which ensures consistent
+// ordering of nodes in the stor-distribution config.
+
+namespace qi = boost::spirit::qi;
+namespace ascii = boost::spirit::ascii;
+namespace phoenix = boost::phoenix;
+
+namespace {
+
+struct GroupSet;
+
+using Children = boost::variant<
+ std::vector<unsigned int>,
+ boost::recursive_wrapper<GroupSet>
+>;
+
+struct Group {
+ uint16_t index;
+ boost::optional<double> capacity;
+ Children children;
+};
+
+struct GroupSet {
+ std::string distribution_spec;
+ std::vector<Group> subgroups;
+};
+
+} // anon ns
+
+// Fusion adaptations must be in global scope.
+BOOST_FUSION_ADAPT_STRUCT(
+ ::Group,
+ (uint16_t, index)
+ (boost::optional<double>, capacity)
+ (::Children, children)
+)
+
+BOOST_FUSION_ADAPT_STRUCT(
+ ::GroupSet,
+ (std::string, distribution_spec)
+ (std::vector<Group>, subgroups)
+)
+
+namespace storage {
+namespace {
+
+// Boost.Spirit v2 grammar for parsing the output of lib::Group::getConfigHash.
+template <typename Iterator>
+struct HashGrammar
+ : qi::grammar<Iterator, Group()>
+{
+ HashGrammar()
+ : HashGrammar::base_type(group)
+ {
+ using qi::uint_;
+ using qi::double_;
+ using ascii::char_;
+ /*
+ * This grammar makes the (reasonable) assumption that you can't have
+ * empty groups.
+ *
+ * Quick Spirit PEG DSL syntax primer for any two arbitrary parsers
+ * a and b (all subcomponents of parsers are themselves parsers):
+ *
+ * 'X' : character literal match parser
+ * a >> b : a must be followed by b ("a b" in EBNF)
+ * -a : optional ("a?" in EBNF)
+ * a | b : a or b must match (same as in EBNF)
+ * +a : match 1 or more times ("a+" in EBNF)
+ * *a : kleene star; 0 or more times ("a*" in EBNF)
+ * a - b : difference; a but not b
+ *
+ * Please see Boost.Spirit docs on how these map to parser attributes
+ * (optional maps to boost::optional of nested attribute, + or kleene
+ * star maps to an iterable range (std::vector) of nested attributes,
+ * a | b maps to a boost::variant of the attributes of a and b,
+ * a >> b maps to a boost::tuple of the attributes and so on; usually
+ * fairly intuitive).
+ */
+ group =
+ '('
+ >> uint_
+ >> -('c' >> double_)
+ >> ( +(';' >> uint_)
+ | subgroups
+ )
+ >> ')';
+
+ subgroups = ('d' >> distr_spec >> +group);
+
+ distr_spec = +(char_ - '('); // Match everything until open paren.
+ }
+
+ qi::rule<Iterator, Group()> group;
+ qi::rule<Iterator, GroupSet()> subgroups;
+ qi::rule<Iterator, std::string()> distr_spec;
+};
+
+template <typename Range, typename Predicate>
+auto ordered_by(const Range& range, Predicate pred) {
+ std::vector<typename Range::value_type> copy(
+ std::begin(range), std::end(range));
+ std::sort(copy.begin(), copy.end(), pred);
+ return copy;
+}
+
+void emit_normalized_groups(vespalib::asciistream& out, const Group& g);
+
+struct InOrderGroupVisitor : boost::static_visitor<void> {
+ vespalib::asciistream& _out;
+ InOrderGroupVisitor(vespalib::asciistream& out)
+ : _out(out)
+ {
+ }
+
+ void operator()(const std::vector<unsigned int>& nodes) const {
+ for (uint16_t node : ordered_by(nodes, std::less<void>())) {
+ _out << ';' << node;
+ }
+ }
+
+ void operator()(const GroupSet& gs) const {
+ _out << 'd' << gs.distribution_spec;
+ auto index_less_than = [](auto& lhs, auto& rhs) {
+ return lhs.index < rhs.index;
+ };
+ // Ordering will also copy nested subgroups, but the number of known
+ // Vespa installations with nested subgroups is currently somewhere
+ // around the high end of zero.
+ for (auto& g : ordered_by(gs.subgroups, index_less_than)) {
+ emit_normalized_groups(_out, g);
+ }
+ }
+};
+
+void emit_normalized_groups(vespalib::asciistream& out, const Group& g) {
+ out << '(' << g.index;
+ if (g.capacity) {
+ out << 'c' << *g.capacity;
+ }
+ boost::apply_visitor(InOrderGroupVisitor(out), g.children);
+ out << ')';
+}
+
+} // anon ns
+
+// We keep the grammar around across multiple normalized() calls because
+// constructing the grammar object(s) isn't free.
+struct DistributionHashNormalizer::ParserImpl {
+ using Iterator = vespalib::string::const_iterator;
+ HashGrammar<Iterator> grammar;
+};
+
+DistributionHashNormalizer::DistributionHashNormalizer()
+ : _impl(std::make_unique<ParserImpl>())
+{
+}
+
+// Required here because of incomplete ParserImpl in header.
+DistributionHashNormalizer::~DistributionHashNormalizer()
+{
+}
+
+vespalib::string
+DistributionHashNormalizer::normalize(vespalib::stringref hash) const
+{
+ Group root;
+
+ auto iter = hash.begin();
+ const bool ok = qi::parse(iter, hash.end(), _impl->grammar, root);
+ if (!ok || iter != hash.end()) {
+ vespalib::string hash_str = hash; // stringref might not be zero-term'd.
+ LOGBT(warning, hash_str.c_str(),
+ "Unable to parse compact distribution config "
+ "representation: '%s'",
+ hash_str.c_str());
+ return hash; // Fallback to input on parse failure.
+ }
+
+ vespalib::asciistream out;
+ emit_normalized_groups(out, root);
+
+ return out.str();
+}
+
+} // storage
+
diff --git a/storage/src/vespa/storage/bucketdb/distribution_hash_normalizer.h b/storage/src/vespa/storage/bucketdb/distribution_hash_normalizer.h
new file mode 100644
index 00000000000..0056887c5ff
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/distribution_hash_normalizer.h
@@ -0,0 +1,28 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/vespalib/stllike/string.h>
+
+namespace storage {
+
+/**
+ * Utility class for "normalizing" a received distribution hash string into
+ * a representation that is ordering invariant across group and node indices.
+ *
+ * All group indices and node indices will be returned in increasing order.
+ *
+ * In the case of a parser error the original string will be returned verbatim.
+ */
+class DistributionHashNormalizer {
+ // PIMPL the parser to avoid Spirit deps in header file.
+ struct ParserImpl;
+ std::unique_ptr<ParserImpl> _impl;
+public:
+ DistributionHashNormalizer();
+ ~DistributionHashNormalizer();
+
+ vespalib::string normalize(vespalib::stringref hash) const;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/bucketdb/judyarray.cpp b/storage/src/vespa/storage/bucketdb/judyarray.cpp
new file mode 100644
index 00000000000..4f5d16b28c1
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/judyarray.cpp
@@ -0,0 +1,90 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/bucketdb/judyarray.h>
+
+namespace storage {
+
+JudyArray::~JudyArray()
+{
+ clear();
+}
+
+bool
+JudyArray::operator==(const JudyArray& array) const
+{
+ if (size() != array.size()) return false;
+ for (JudyArray::const_iterator it1 = begin(), it2 = array.begin();
+ it1 != end(); ++it1, ++it2)
+ {
+ if (*it1 != *it2) return false;
+ }
+ return true;
+}
+
+bool
+JudyArray::operator<(const JudyArray& array) const
+{
+ if (size() != array.size()) return (size() < array.size());
+ for (JudyArray::const_iterator it1 = begin(), it2 = array.begin();
+ it1 != end(); ++it1, ++it2)
+ {
+ if (*it1 != *it2) return (*it1 < *it2);
+ }
+ return false;
+}
+
+JudyArray::size_type
+JudyArray::size() const
+{
+ key_type lastIndex = 0;
+ --lastIndex; // Get last index in size independent way
+ return JudyLCount(_judyArray, 0, lastIndex, PJE0);
+}
+
+void
+JudyArray::swap(JudyArray& other)
+{
+ void* judyArray = _judyArray; // Save our variables
+ _judyArray = other._judyArray; // Assign others to ours
+ other._judyArray = judyArray; // Assign temporary to other
+}
+
+void
+JudyArray::print(std::ostream& out, bool, const std::string& indent) const
+{
+ out << "JudyArray(";
+ for (const_iterator i = begin(); i != end(); ++i) {
+ out << "\n" << indent << " Key: " << i.key()
+ << ", Value: " << i.value();
+ }
+ out << "\n" << indent << ")";
+}
+
+JudyArray::ConstIterator::ConstIterator(const JudyArray& arr)
+ : _key(0), _data(0), _parent(const_cast<JudyArray*>(&arr)) {}
+
+JudyArray::ConstIterator::ConstIterator(const JudyArray& arr, key_type mykey)
+ : _key(mykey), _data(0), _parent(const_cast<JudyArray*>(&arr))
+{
+ _data = reinterpret_cast<data_type*>(
+ JudyLFirst(_parent->_judyArray, &_key, PJE0));
+}
+
+void
+JudyArray::ConstIterator::print(std::ostream& out, bool, const std::string&) const
+{
+ if (dynamic_cast<const Iterator*>(this) == 0) {
+ out << "Const";
+ }
+ out << "Iterator(Key: " << _key << ", Valp: " << _data;
+ if (_data) out << ", Val: " << *_data;
+ out << ")";
+}
+
+JudyArray::Iterator::Iterator(JudyArray& arr)
+ : ConstIterator(arr) {}
+
+JudyArray::Iterator::Iterator(JudyArray& arr, key_type mykey)
+ : ConstIterator(arr, mykey) {}
+
+} // storage
diff --git a/storage/src/vespa/storage/bucketdb/judyarray.h b/storage/src/vespa/storage/bucketdb/judyarray.h
new file mode 100644
index 00000000000..963fdb86f98
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/judyarray.h
@@ -0,0 +1,266 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class JudyArray
+ *
+ * Implements a pair associative container on top of a judy array.
+ *
+ * NB: All iterators are invalidated after writing to judy array.
+ *
+ * NB: Using JudyArray's insert, one can only detect if the element already
+ * existed, if the element didn't have the value 0. Since we don't want to
+ * say that values cannot be 0, size is not counted outside of judy array, but
+ * rather counts elements in the judy array when asked.
+ *
+ * @author Haakon Humberset
+ */
+
+#pragma once
+
+#include <boost/operators.hpp>
+#include <vespa/vespalib/util/exceptions.h>
+#include <vespa/vespalib/util/printable.h>
+#include <vespa/fastos/fastos.h>
+#include <Judy.h>
+#include <sstream>
+
+namespace storage {
+
+class JudyArray : public vespalib::Printable, public boost::operators<JudyArray>
+{
+ JudyArray(const JudyArray&); // Deny copying
+ JudyArray& operator=(const JudyArray&);
+
+public:
+ class Iterator;
+ class ConstIterator;
+
+ typedef Iterator iterator;
+ typedef ConstIterator const_iterator;
+ typedef unsigned long key_type;
+ typedef unsigned long data_type;
+ typedef std::pair<const key_type, data_type> value_type;
+ typedef size_t size_type;
+ typedef value_type& reference;
+ typedef const value_type& const_reference;
+ typedef value_type* pointer;
+ typedef int difference_type;
+
+ JudyArray() : _judyArray(NULL) {}
+ virtual ~JudyArray();
+
+ bool operator==(const JudyArray& array) const;
+ bool operator<(const JudyArray& array) const;
+
+ /** Warning: Size may be a O(n) function (Unknown implementation in judy) */
+ size_type size() const;
+ bool empty() const { return (begin() == end()); }
+
+ iterator begin() { return Iterator(*this, 0); }
+ iterator end() { return Iterator(*this); }
+ const_iterator begin() const { return ConstIterator(*this, 0); }
+ const_iterator end() const { return ConstIterator(*this); }
+
+ void swap(JudyArray&);
+
+ const_iterator find(key_type key) const;
+ /**
+ * Get iterator to value with given key. If non-existing, returns end(),
+ * unless insert is true, in which case the element will be created.
+ */
+ iterator find(key_type key, bool insert, bool& preExisted);
+ iterator find(key_type key) { bool b; return find(key, false, b); }
+
+ const_iterator lower_bound(key_type key) const
+ { return ConstIterator(*this, key); }
+ iterator lower_bound(key_type key) { return Iterator(*this, key); }
+
+ size_type erase(key_type key);
+ void erase(iterator& iter) { iter.remove(); }
+
+ void insert(key_type key, data_type val);
+ void clear();
+
+ data_type& operator[](key_type key);
+ size_type getMemoryUsage() const;
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ class ConstIterator : public vespalib::Printable,
+ public boost::operators<ConstIterator>
+ {
+ public:
+ ConstIterator& operator--();
+ ConstIterator& operator++(); // Prefix, postfix provided by boost
+
+ bool operator==(const ConstIterator &cp) const; // != provided by boost
+ value_type operator*() const { return value_type(_key, *_data); }
+
+ bool end() const { return (_data == 0); }
+ key_type key() const { return _key; }
+ data_type value() const { return *_data; }
+
+ virtual void print(std::ostream& out,
+ bool verbose, const std::string& indent) const;
+
+ protected:
+ // For creating end() iterator
+ ConstIterator(const JudyArray&);
+ // Create iterator pointing to first element >= key.
+ ConstIterator(const JudyArray&, key_type);
+
+ key_type _key; // Key iterator currently points to
+ data_type* _data; // Pointer to member pointed to, or 0 if end().
+ JudyArray* _parent;
+ friend class JudyArray;
+ };
+
+ class Iterator : public ConstIterator,
+ public boost::operators<Iterator>
+ {
+ public:
+ Iterator& operator--()
+ { return static_cast<Iterator&>(ConstIterator::operator--()); }
+
+ Iterator& operator++()
+ { return static_cast<Iterator&>(ConstIterator::operator++()); }
+
+ void setValue(data_type val);
+ void remove();
+
+ private:
+ Iterator(JudyArray&);
+ Iterator(JudyArray&, key_type key);
+ friend class JudyArray;
+ };
+
+private:
+ void *_judyArray;
+ friend class Iterator;
+ friend class ConstIterator;
+};
+
+inline JudyArray::const_iterator
+JudyArray::find(key_type key) const
+{
+ ConstIterator iter(*this, key);
+ if (!iter.end() && iter.key() != key) {
+ iter = ConstIterator(*this);
+ }
+ return iter;
+}
+
+inline JudyArray::iterator
+JudyArray::find(key_type key, bool insertIfNonExisting, bool& preExisted)
+{
+ Iterator iter(*this, key);
+ if (insertIfNonExisting && (iter.end() || iter.key() != key)) {
+ preExisted = false;
+ insert(key, 0);
+ iter = Iterator(*this, key);
+ assert(iter.key() == key);
+ } else if (iter.key() != key) {
+ preExisted = false;
+ iter = Iterator(*this);
+ } else {
+ preExisted = true;
+ }
+ return iter;
+}
+
+inline JudyArray::size_type
+JudyArray::erase(key_type key)
+{
+ JError_t err;
+ size_type result = JudyLDel(&_judyArray, key, &err);
+ if (result == 0 || result == 1) {
+ return result;
+ }
+ std::ostringstream ost;
+ ost << "Judy error in erase(" << std::hex << key << "): " << err.je_Errno;
+ std::cerr << ost.str() << "\n";
+ assert(false);
+ return 0;
+}
+
+inline void
+JudyArray::insert(key_type key, data_type val)
+{
+ data_type* valp = reinterpret_cast<data_type*>(
+ JudyLIns(&_judyArray, key, PJE0));
+ *valp = val;
+}
+
+inline void
+JudyArray::clear()
+{
+ JudyLFreeArray(&_judyArray, PJE0);
+}
+
+inline JudyArray::data_type&
+JudyArray::operator[](key_type key)
+{
+ data_type* valp = reinterpret_cast<data_type*>(
+ JudyLGet(_judyArray, key, PJE0));
+ if (valp == 0) {
+ valp = reinterpret_cast<data_type*>(JudyLIns(&_judyArray, key, PJE0));
+ *valp = 0;
+ }
+ return *valp;
+}
+
+inline JudyArray::size_type
+JudyArray::getMemoryUsage() const
+{
+ return JudyLMemUsed(_judyArray);
+}
+
+inline JudyArray::ConstIterator&
+JudyArray::ConstIterator::operator--() // Prefix
+{
+ if (!_data) {
+ _data = reinterpret_cast<data_type*>(
+ JudyLLast(_parent->_judyArray, &_key, PJE0));
+ } else {
+ _data = reinterpret_cast<data_type*>(
+ JudyLPrev(_parent->_judyArray, &_key, PJE0));
+ }
+ return *this;
+}
+
+inline JudyArray::ConstIterator&
+JudyArray::ConstIterator::operator++() // Prefix
+{
+ _data = reinterpret_cast<data_type*>(
+ JudyLNext(_parent->_judyArray, &_key, PJE0));
+ return *this;
+}
+
+inline bool
+JudyArray::ConstIterator::operator==(const JudyArray::ConstIterator &cp) const
+{
+ return (_data == cp._data);
+}
+
+inline void
+JudyArray::Iterator::setValue(data_type val)
+{
+ if (_data == 0) {
+ throw vespalib::IllegalArgumentException(
+ "Cannot set value of end() iterator", VESPA_STRLOC);
+ }
+ *_data = val;
+}
+
+inline void
+JudyArray::Iterator::remove()
+{
+ if (_data == 0) {
+ throw vespalib::IllegalArgumentException(
+ "Cannot erase end() iterator", VESPA_STRLOC);
+ }
+ _parent->erase(_key);
+}
+
+} // storage
+
diff --git a/storage/src/vespa/storage/bucketdb/judymultimap.h b/storage/src/vespa/storage/bucketdb/judymultimap.h
new file mode 100644
index 00000000000..ea7c005dc24
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/judymultimap.h
@@ -0,0 +1,561 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class JudyMultiMap
+ *
+ * Layer on top of JudyArray, to create a map from the judy array key type,
+ * to any of a given set of array types.
+ *
+ * The value arrays in here all starts with an unused object at index 0.
+ * This is because 0 is used as unset value in judyarray, such that we can
+ * easily detect if we replace or insert new entry.
+ *
+ * NB: The order of the template parameters type must be ordered such that
+ * the types can include less and less.
+ *
+ * NB: All iterators are invalidated after writing to judy map.
+ *
+ * NB: Using JudyArray's insert, one can only detect if the element already
+ * existed, if the element didn't have the value 0. Since we don't want to
+ * say that values cannot be 0, size is not counted outside of judy array, but
+ * rather counts elements in the judy array when asked.
+ *
+ * @author Haakon Humberset<
+ */
+
+
+#pragma once
+
+#include <vespa/storage/bucketdb/judyarray.h>
+#include <vespa/vespalib/util/exceptions.h>
+#include <vespa/vespalib/util/array.h>
+#include <set>
+#include <vector>
+
+namespace storage {
+
+template<class Type0,
+ class Type1 = Type0,
+ class Type2 = Type1,
+ class Type3 = Type2 >
+class JudyMultiMap : public vespalib::Printable {
+public:
+ JudyMultiMap()
+ : _values0(1), _values1(1), _values2(1), _values3(1), _free(4) {}
+
+ class Iterator;
+ class ConstIterator;
+ class ValueType;
+
+ typedef Iterator iterator;
+ typedef ConstIterator const_iterator;
+ typedef JudyArray::key_type key_type;
+ typedef Type3 mapped_type;
+ typedef std::pair<const key_type, mapped_type> value_type;
+ typedef JudyArray::size_type size_type;
+
+ bool operator==(const JudyMultiMap& array) const;
+ bool operator<(const JudyMultiMap& array) const;
+
+ /** Warning: Size may be a O(n) function (Unknown implementation in judy) */
+ size_type size() const;
+ bool empty() const { return (begin() == end()); }
+
+ iterator begin() { return Iterator(*this, 0); }
+ iterator end() { return Iterator(*this); }
+ const_iterator begin() const { return ConstIterator(*this, 0); }
+ const_iterator end() const { return ConstIterator(*this); }
+
+ void swap(JudyMultiMap&);
+
+ const_iterator find(key_type key) const;
+ /**
+ * Get iterator to value with given key. If non-existing, returns end(),
+ * unless insert is true, in which case the element will be created.
+ */
+ iterator find(key_type key, bool insert, bool& preExisted);
+ iterator find(key_type key) { bool b; return find(key, false, b); }
+
+ const_iterator lower_bound(key_type key) const
+ { return ConstIterator(*this, key); }
+ iterator lower_bound(key_type key) { return Iterator(*this, key); }
+
+ size_type erase(key_type key);
+ void erase(iterator& iter) { iter.remove(); }
+
+ void insert(key_type key, const Type3& val, bool& preExisted)
+ {
+ JudyArray::iterator it(_judyArray.find(key, true, preExisted));
+ insert(it, val);
+ }
+ void clear();
+
+ const mapped_type operator[](key_type key);
+ size_type getMemoryUsage() const;
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ class ConstIterator : public vespalib::Printable,
+ public boost::operators<ConstIterator>
+ {
+ public:
+ ConstIterator& operator--() { --_iterator; return *this; }
+ ConstIterator& operator++() { ++_iterator; return *this; }
+
+ bool operator==(const ConstIterator &cp) const; // != provided by boost
+ value_type operator*() const;
+
+ inline bool end() const { return _iterator.end(); }
+ inline key_type key() const { return _iterator.key(); }
+ mapped_type value() const;
+
+ const std::pair<key_type, mapped_type>* operator->() const {
+ _pair = std::pair<key_type, mapped_type>(_iterator.key(), value());
+ return &_pair;
+ }
+
+ virtual void print(std::ostream& out,
+ bool verbose, const std::string& indent) const;
+
+ protected:
+ // For creating end() iterator
+ ConstIterator(const JudyMultiMap&);
+ // Create iterator pointing to first element >= key.
+ ConstIterator(const JudyMultiMap&, key_type);
+
+ JudyArray::ConstIterator _iterator;
+ JudyMultiMap* _parent;
+ friend class JudyMultiMap;
+ mutable std::pair<key_type, mapped_type> _pair;
+ };
+
+ class Iterator : public ConstIterator,
+ public boost::operators<Iterator>
+ {
+ public:
+ Iterator& operator--()
+ { return static_cast<Iterator&>(ConstIterator::operator--()); }
+
+ Iterator& operator++()
+ { return static_cast<Iterator&>(ConstIterator::operator++()); }
+
+ void setValue(const Type3& val);
+ void remove();
+
+ private:
+ Iterator(JudyMultiMap&);
+ Iterator(JudyMultiMap&, key_type key);
+ friend class JudyMultiMap;
+ };
+
+private:
+ JudyArray _judyArray;
+ typedef vespalib::Array<Type0, vespalib::DefaultAlloc> Type0Vector;
+ typedef vespalib::Array<Type1, vespalib::DefaultAlloc> Type1Vector;
+ typedef vespalib::Array<Type2, vespalib::DefaultAlloc> Type2Vector;
+ typedef vespalib::Array<Type3, vespalib::DefaultAlloc> Type3Vector;
+ Type0Vector _values0;
+ Type1Vector _values1;
+ Type2Vector _values2;
+ Type3Vector _values3;
+ std::vector<std::vector<typename Type0Vector::size_type> > _free;
+ friend class Iterator;
+ friend class ConstIterator;
+
+ inline static int getType(JudyArray::data_type index) {
+ return index >> (8 * sizeof(JudyArray::data_type) - 2);
+ }
+ inline static JudyArray::data_type getIndex(JudyArray::data_type index) {
+ return ((index << 2) >> 2);
+ }
+ inline static JudyArray::data_type getValue(JudyArray::data_type type,
+ JudyArray::data_type index)
+ {
+ return (type << (8 * sizeof(JudyArray::data_type) - 2) | index);
+ }
+ void insert(JudyArray::iterator& it, const Type3& val);
+};
+
+template<class T0, class T1, class T2, class T3>
+bool
+JudyMultiMap<T0, T1, T2, T3>::
+operator==(const JudyMultiMap<T0, T1, T2, T3>& map) const
+{
+ if (size() != map.size()) return false;
+ for (typename JudyMultiMap<T0, T1, T2, T3>::const_iterator
+ it1 = begin(), it2 = map.begin(); it1 != end(); ++it1, ++it2)
+ {
+ assert(it2 != end());
+ if (*it1 != *it2) return false;
+ }
+ return true;
+}
+
+template<class T0, class T1, class T2, class T3>
+bool
+JudyMultiMap<T0, T1, T2, T3>::
+operator<(const JudyMultiMap<T0, T1, T2, T3>& map) const
+{
+ if (size() != map.size()) return (size() < map.size());
+ for (typename JudyMultiMap<T0, T1, T2, T3>::const_iterator
+ it1 = begin(), it2 = map.begin(); it1 != end(); ++it1, ++it2)
+ {
+ if (it1.key() != it2.key()) return (it1.key() < it2.key());
+ if (it1.value() != it2.value()) return (it1.value() < it2.value());
+ }
+ return false;
+}
+
+template<class T0, class T1, class T2, class T3>
+inline typename JudyMultiMap<T0, T1, T2, T3>::size_type
+JudyMultiMap<T0, T1, T2, T3>::size() const
+{
+ // First elements in all vectors is bogus, because we use value 0
+ // to mean unset in judyarray. (To be able to detect if we overwrite)
+ return _values0.size() + _values1.size()
+ + _values2.size() + _values3.size() - 4
+ - _free[0].size() - _free[1].size()
+ - _free[2].size() - _free[3].size();
+}
+
+template<class T0, class T1, class T2, class T3>
+void
+JudyMultiMap<T0, T1, T2, T3>::
+swap(JudyMultiMap<T0, T1, T2, T3>& other)
+{
+ _judyArray.swap(other._judyArray);
+ _values0.swap(other._values0);
+ _values1.swap(other._values1);
+ _values2.swap(other._values2);
+ _values3.swap(other._values3);
+ _free.swap(other._free);
+}
+
+template<class T0, class T1, class T2, class T3>
+inline typename JudyMultiMap<T0, T1, T2, T3>::const_iterator
+JudyMultiMap<T0, T1, T2, T3>::find(key_type key) const
+{
+ ConstIterator iter(*this, key);
+ if (!iter.end() && iter.key() != key) {
+ iter = ConstIterator(*this);
+ }
+ return iter;
+}
+
+template<class T0, class T1, class T2, class T3>
+inline typename JudyMultiMap<T0, T1, T2, T3>::iterator
+JudyMultiMap<T0, T1, T2, T3>::find(key_type key, bool insertIfNonExisting,
+ bool& preExisted)
+{
+ Iterator iter(*this, key);
+ if (insertIfNonExisting && (iter.end() || iter.key() != key)) {
+ insert(key, T3(), preExisted);
+ iter = Iterator(*this, key);
+ assert(iter.key() == key);
+ } else if (iter.key() != key) {
+ preExisted = false;
+ iter = Iterator(*this);
+ } else {
+ preExisted = true;
+ }
+ return iter;
+}
+
+template<class T0, class T1, class T2, class T3>
+inline typename JudyMultiMap<T0, T1, T2, T3>::size_type
+JudyMultiMap<T0, T1, T2, T3>::erase(key_type key)
+{
+ JudyArray::iterator it = _judyArray.find(key);
+ if (it == _judyArray.end()) return 0;
+ _free[getType(it.value())].push_back(getIndex(it.value()));
+ _judyArray.erase(key);
+ return 1;
+}
+
+template<class T0, class T1, class T2, class T3>
+inline void
+JudyMultiMap<T0, T1, T2, T3>::clear()
+{
+ _judyArray.clear();
+ _values0.resize(1);
+ _values1.resize(1);
+ _values2.resize(1);
+ _values3.resize(1);
+ _free[0].clear();
+ _free[1].clear();
+ _free[2].clear();
+ _free[3].clear();
+}
+
+template<class T0, class T1, class T2, class T3>
+inline const typename JudyMultiMap<T0, T1, T2, T3>::mapped_type
+JudyMultiMap<T0, T1, T2, T3>::operator[](key_type key)
+{
+ bool preExisted;
+ JudyArray::iterator it = _judyArray.find(key, true, preExisted);
+ // If it doesn't already exist, insert
+ if (it.value() == 0) {
+ if (_free[0].empty()) {
+ it.setValue(getValue(0, _values0.size()));
+ _values0.push_back(T0());
+ } else {
+ it.setValue(getValue(0, _free[0].back()));
+ _values0[_free[0].back()] = T0();
+ _free[0].pop_back();
+ }
+ }
+ switch (getType(it.value())) {
+ case 0: return _values0[getIndex(it.value())];
+ case 1: return _values1[getIndex(it.value())];
+ case 2: return _values2[getIndex(it.value())];
+ case 3: return _values3[getIndex(it.value())];
+ default: assert(false);
+ }
+ return T0(); // Avoid warning of no return
+}
+
+template<class T0, class T1, class T2, class T3>
+inline typename JudyMultiMap<T0, T1, T2, T3>::size_type
+JudyMultiMap<T0, T1, T2, T3>::getMemoryUsage() const
+{
+ return _judyArray.getMemoryUsage()
+ + sizeof(T0) * _values0.capacity()
+ + sizeof(T1) * _values1.capacity()
+ + sizeof(T2) * _values2.capacity()
+ + sizeof(T3) * _values3.capacity()
+ + sizeof(typename Type0Vector::size_type)
+ * (_free[0].capacity() + _free[1].capacity() +
+ _free[2].capacity() + _free[3].capacity());
+}
+
+template<class T0, class T1, class T2, class T3>
+void
+JudyMultiMap<T0, T1, T2, T3>::
+print(std::ostream& out, bool verbose, const std::string& indent) const
+{
+ out << "JudyMultiMap(";
+
+ if (verbose) {
+ for (const_iterator i = begin(); i != end(); ++i) {
+ out << "\n" << indent << " ";
+ i.print(out, verbose, indent + " ");
+ }
+ }
+
+ if (_values0.size() > 1) {
+ std::set<typename Type0Vector::size_type> free(
+ _free[0].begin(), _free[0].end());
+ assert(free.size() == _free[0].size());
+ out << "\n" << indent << " Type0 " << (_values0.size()-1)
+ << " entries, " << free.size() << " free {";
+
+ if (verbose) {
+ for (uint32_t i=1; i<_values0.size(); ++i) {
+ out << "\n" << indent << " ";
+ if (free.find(i) != free.end()) { out << "free"; }
+ else { out << _values0[i]; }
+ }
+ }
+ out << "\n" << indent << " }";
+ }
+ if (_values1.size() > 1) {
+ std::set<typename Type0Vector::size_type> free(
+ _free[1].begin(), _free[1].end());
+ assert(free.size() == _free[1].size());
+ out << "\n" << indent << " Type1 " << (_values1.size()-1)
+ << " entries, " << free.size() << " free {";
+ if (verbose) {
+ for (uint32_t i=1; i<_values1.size(); ++i) {
+ out << "\n" << indent << " ";
+ if (free.find(i) != free.end()) { out << "free"; }
+ else { out << _values1[i]; }
+ }
+ }
+ out << "\n" << indent << " }";
+ }
+ if (_values2.size() > 1) {
+ std::set<typename Type0Vector::size_type> free(
+ _free[2].begin(), _free[2].end());
+ assert(free.size() == _free[2].size());
+ out << "\n" << indent << " Type2 " << (_values2.size()-1)
+ << " entries, " << free.size() << " free {";
+ if (verbose) {
+ for (uint32_t i=1; i<_values2.size(); ++i) {
+ out << "\n" << indent << " ";
+ if (free.find(i) != free.end()) { out << "free"; }
+ else { out << _values2[i]; }
+ }
+ }
+ out << "\n" << indent << " }";
+ }
+
+ if (_values3.size() > 1) {
+ std::set<typename Type0Vector::size_type> free(
+ _free[3].begin(), _free[3].end());
+ assert(free.size() == _free[3].size());
+ out << "\n" << indent << " Type3 " << (_values3.size()-1)
+ << " entries, " << free.size() << " free {";
+
+ if (verbose) {
+ for (uint32_t i=1; i<_values3.size(); ++i) {
+ out << "\n" << indent << " ";
+ if (free.find(i) != free.end()) { out << "free"; }
+ else { out << _values3[i]; }
+ }
+ }
+ out << "\n" << indent << " }";
+ }
+ if (!empty()) { out << "\n" << indent; }
+ out << ")";
+}
+
+template<class T0, class T1, class T2, class T3>
+JudyMultiMap<T0, T1, T2, T3>::
+ConstIterator::ConstIterator(const JudyMultiMap<T0, T1, T2, T3>& map)
+ : _iterator(map._judyArray.end()),
+ _parent(const_cast<JudyMultiMap<T0, T1, T2, T3>*>(&map))
+{
+}
+
+template<class T0, class T1, class T2, class T3>
+JudyMultiMap<T0, T1, T2, T3>::
+ConstIterator::ConstIterator(const JudyMultiMap<T0, T1, T2, T3>& map,
+ key_type mykey)
+ : _iterator(map._judyArray.lower_bound(mykey)),
+ _parent(const_cast<JudyMultiMap<T0, T1, T2, T3>*>(&map))
+{
+}
+
+template<class T0, class T1, class T2, class T3>
+inline bool
+JudyMultiMap<T0, T1, T2, T3>::
+ConstIterator::operator==(const JudyMultiMap::ConstIterator &cp) const
+{
+ return (_iterator == cp._iterator);
+}
+
+template<class T0, class T1, class T2, class T3>
+inline typename JudyMultiMap<T0, T1, T2, T3>::value_type
+JudyMultiMap<T0, T1, T2, T3>::ConstIterator::operator*() const
+{
+ switch (getType(_iterator.value())) {
+ case 0: return value_type(
+ _iterator.key(), _parent->_values0[getIndex(_iterator.value())]);
+ case 1: return value_type(
+ _iterator.key(), _parent->_values1[getIndex(_iterator.value())]);
+ case 2: return value_type(
+ _iterator.key(), _parent->_values2[getIndex(_iterator.value())]);
+ case 3: return value_type(
+ _iterator.key(), _parent->_values3[getIndex(_iterator.value())]);
+ default:
+ assert(false);
+ abort();
+ }
+}
+
+template<class T0, class T1, class T2, class T3>
+inline typename JudyMultiMap<T0, T1, T2, T3>::mapped_type
+JudyMultiMap<T0, T1, T2, T3>::ConstIterator::value() const
+{
+ switch (getType(_iterator.value())) {
+ default: assert(false);
+ case 0: return _parent->_values0[getIndex(_iterator.value())];
+ case 1: return _parent->_values1[getIndex(_iterator.value())];
+ case 2: return _parent->_values2[getIndex(_iterator.value())];
+ case 3: return _parent->_values3[getIndex(_iterator.value())];
+ }
+}
+
+template<class T0, class T1, class T2, class T3>
+void
+JudyMultiMap<T0, T1, T2, T3>::
+ConstIterator::print(std::ostream& out, bool, const std::string&) const
+{
+ if (dynamic_cast<const Iterator*>(this) == 0) {
+ out << "Const";
+ }
+ out << "Iterator(Key: " << _iterator.key() << ", Value: " << value() << ")";
+}
+
+template<class T0, class T1, class T2, class T3>
+JudyMultiMap<T0, T1, T2, T3>::
+Iterator::Iterator(JudyMultiMap<T0, T1, T2, T3>& map)
+ : ConstIterator(map) {}
+
+template<class T0, class T1, class T2, class T3>
+JudyMultiMap<T0, T1, T2, T3>::
+Iterator::Iterator(JudyMultiMap<T0, T1, T2, T3>& map, key_type mykey)
+ : ConstIterator(map, mykey) {}
+
+template<class T0, class T1, class T2, class T3>
+inline void
+JudyMultiMap<T0, T1, T2, T3>::Iterator::setValue(const T3& val)
+{
+ if (this->_iterator.end()) {
+ throw vespalib::IllegalArgumentException(
+ "Cannot set value of end() iterator", VESPA_STRLOC);
+ }
+ insert(this->iterator, val);
+}
+
+template<class T0, class T1, class T2, class T3>
+inline void
+JudyMultiMap<T0, T1, T2, T3>::Iterator::remove()
+{
+ if (this->_iterator.end()) {
+ throw vespalib::IllegalArgumentException(
+ "Cannot erase end() iterator", VESPA_STRLOC);
+ }
+ int type = getType(this->_iterator.value());
+ _free[type].push_back(getIndex(this->_iterator.value()));
+ this->_iterator.remove();
+}
+
+template<class T0, class T1, class T2, class T3>
+void
+JudyMultiMap<T0, T1, T2, T3>::insert(JudyArray::iterator& it, const T3& val)
+{
+ // Find the type we need to save 'val' as
+ int type;
+ if (T0::mayContain(val)) { type = 0; }
+ else if (T1::mayContain(val)) { type = 1; }
+ else if (T2::mayContain(val)) { type = 2; }
+ else { type = 3; }
+ // If already pointing to some value, free that resource.
+ int oldtype = getType(it.value());
+ int index = getIndex(it.value());
+ if (index != 0) {
+ _free[oldtype].push_back(index);
+ }
+ // Insert value into new spot
+ if (_free[type].empty()) {
+ switch (type) {
+ case 0: it.setValue(getValue(type, _values0.size()));
+ _values0.push_back(val);
+ break;
+ case 1: it.setValue(getValue(type, _values1.size()));
+ _values1.push_back(T1(val));
+ break;
+ case 2: it.setValue(getValue(type, _values2.size()));
+ _values2.push_back(T2(val));
+ break;
+ case 3: it.setValue(getValue(type, _values3.size()));
+ _values3.push_back(T3(val));
+ break;
+ default: assert(false);
+ }
+ } else {
+ it.setValue(getValue(type, _free[type].back()));
+ switch (type) {
+ case 0: _values0[_free[type].back()] = val; break;
+ case 1: _values1[_free[type].back()] = val; break;
+ case 2: _values2[_free[type].back()] = val; break;
+ case 3: _values3[_free[type].back()] = val; break;
+ default: assert(false);
+ }
+ _free[type].pop_back();
+ }
+}
+
+} // storage
+
diff --git a/storage/src/vespa/storage/bucketdb/lockablemap.h b/storage/src/vespa/storage/bucketdb/lockablemap.h
new file mode 100644
index 00000000000..bfc35f80f44
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/lockablemap.h
@@ -0,0 +1,1067 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * A map wrapper, adding locking to the map entries. It provides the
+ * following:
+ *
+ * - Guarantees thread safety.
+ * - Each returned value is given within a wrapper. As long as the
+ * wrapper for the value exist, this entry is locked in the map.
+ * This does not prevent other values from being used. Wrappers can
+ * be copied. Reference counting ensures value is locked until last
+ * wrapper copy dies.
+ * - Built in function for iterating taking a functor. Halts when
+ * encountering locked values.
+ */
+#pragma once
+
+#include <map>
+#include <vespa/vespalib/util/printable.h>
+#include <list>
+#include <vespa/vespalib/util/sync.h>
+#include <vespa/vespalib/stllike/hash_map.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/storage/common/bucketoperationlogger.h>
+#include <thread>
+
+namespace storage {
+
+template<typename Map>
+class LockableMap : public vespalib::Printable,
+ public boost::operators<LockableMap<Map> >
+{
+public:
+ typedef typename Map::key_type key_type;
+ typedef typename Map::mapped_type mapped_type;
+ typedef typename Map::value_type value_type;
+ typedef typename Map::size_type size_type;
+
+ /** Responsible for releasing lock in map when out of scope. */
+ class LockKeeper {
+ friend class LockableMap<Map>::WrappedEntry;
+ LockableMap<Map>& _map;
+ key_type _key;
+ bool _locked;
+
+ LockKeeper(LockableMap<Map>& map, key_type key)
+ : _map(map), _key(key), _locked(true) {}
+ void unlock() { _map.unlock(_key); _locked = false;}
+ public:
+ ~LockKeeper() { if (_locked) unlock(); }
+ };
+
+ struct WrappedEntry {
+ WrappedEntry() : _exists(false), _lockKeeper(), _value() {}
+
+ mapped_type* operator->() { return &_value; }
+ const mapped_type* operator->() const { return &_value; }
+ mapped_type& operator*() { return _value; }
+ const mapped_type& operator*() const { return _value; }
+ void write();
+ void remove();
+ void unlock();
+ bool exist() const { return _exists; }
+ bool preExisted() const { return _preExisted; }
+ bool locked() const { return _lockKeeper.get(); }
+ const key_type& getKey() const { return _lockKeeper->_key; };
+
+ document::BucketId getBucketId() const {
+ return document::BucketId(document::BucketId::keyToBucketId(getKey()));
+ }
+
+ protected:
+ WrappedEntry(LockableMap<Map>& map,
+ const key_type& key, const mapped_type& val,
+ const char* clientId, bool preExisted_)
+ : _exists(true),
+ _preExisted(preExisted_),
+ _lockKeeper(new LockKeeper(map, key)),
+ _value(val),
+ _clientId(clientId) {}
+ WrappedEntry(LockableMap<Map>& map, const key_type& key,
+ const char* clientId)
+ : _exists(false),
+ _preExisted(false),
+ _lockKeeper(new LockKeeper(map, key)),
+ _value(),
+ _clientId(clientId) {}
+
+ bool _exists;
+ bool _preExisted;
+ vespalib::LinkedPtr<LockKeeper> _lockKeeper;
+ mapped_type _value;
+ const char* _clientId;
+ friend class LockableMap<Map>;
+ };
+
+ struct LockId {
+ key_type _key;
+ const char* _owner;
+
+ LockId() : _key(0), _owner("none - empty token") {}
+ LockId(key_type key, const char* owner)
+ : _key(key), _owner(owner)
+ {
+ assert(_owner != 0);
+ }
+
+ size_t hash() const { return _key; }
+ size_t operator%(size_t val) const { return _key % val; }
+ bool operator==(const LockId& id) const { return (_key == id._key); }
+ operator key_type() const { return _key; }
+ };
+
+ LockableMap();
+ bool operator==(const LockableMap& other) const;
+ bool operator<(const LockableMap& other) const;
+ typename Map::size_type size() const;
+ size_type getMemoryUsage() const;
+ bool empty() const;
+ void swap(LockableMap&);
+
+ WrappedEntry get(const key_type& key, const char* clientId,
+ bool createIfNonExisting = false,
+ bool lockIfNonExistingAndNotCreating = false);
+ bool erase(const key_type& key, const char* clientId)
+ { return erase(key, clientId, false); }
+ void insert(const key_type& key, const mapped_type& value,
+ const char* clientId, bool& preExisted)
+ { return insert(key, value, clientId, false, preExisted); }
+ void clear();
+
+ enum Decision { ABORT, UPDATE, REMOVE, CONTINUE, DECISION_COUNT };
+
+ template<typename Functor>
+ void each(Functor& functor, const char* clientId,
+ const key_type& first = key_type(),
+ const key_type& last = key_type() - 1 );
+
+ template<typename Functor>
+ void each(const Functor& functor, const char* clientId,
+ const key_type& first = key_type(),
+ const key_type& last = key_type() - 1 );
+
+ template<typename Functor>
+ void all(Functor& functor, const char* clientId,
+ const key_type& first = key_type(),
+ const key_type& last = key_type()-1);
+
+ template<typename Functor>
+ void all(const Functor& functor, const char* clientId,
+ const key_type& first = key_type(),
+ const key_type& last = key_type() - 1 );
+
+ static constexpr uint32_t DEFAULT_CHUNK_SIZE = 10000;
+
+ /**
+ * Iterate over the entire database contents, holding the global database
+ * mutex for `chunkSize` processed entries at a time, yielding the current
+ * thread between each such such to allow other threads to get a chance
+ * at acquiring a bucket lock.
+ */
+ template <typename Functor>
+ void chunkedAll(Functor& functor,
+ const char* clientId,
+ uint32_t chunkSize = DEFAULT_CHUNK_SIZE);
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ /**
+ * Returns all buckets in the bucket database that can contain the given
+ * bucket. Usually, there should be only one such bucket, but in the case
+ * of inconsistent splitting, there may be more than one.
+ */
+ std::map<document::BucketId, WrappedEntry>
+ getContained(const document::BucketId& bucketId, const char* clientId);
+
+ WrappedEntry
+ createAppropriateBucket(uint16_t newBucketBits,
+ const char* clientId,
+ const document::BucketId& bucket);
+
+ typedef std::map<document::BucketId, WrappedEntry> EntryMap;
+
+ /**
+ * Returns all buckets in the bucket database that can contain the given
+ * bucket, and all buckets that that bucket contains.
+ *
+ * If sibling is != 0, also fetch that bucket if possible.
+ */
+ EntryMap getAll(
+ const document::BucketId& bucketId,
+ const char* clientId,
+ const document::BucketId& sibling = document::BucketId(0));
+
+ /**
+ * Returns true iff bucket has no superbuckets or sub-buckets in the
+ * database. Usage assumption is that any operation that can cause the
+ * bucket to become inconsistent will require taking its lock, so by
+ * requiring the lock to be provided here we avoid race conditions.
+ */
+ bool isConsistent(const WrappedEntry& entry);
+
+ void showLockClients(vespalib::asciistream & out) const;
+
+private:
+ struct hasher {
+ size_t operator () (const LockId & lid) const { return lid.hash(); }
+ };
+ class LockIdSet : public vespalib::hash_set<LockId, hasher> {
+ typedef vespalib::hash_set<LockId, hasher> Hash;
+ public:
+ LockIdSet() : Hash() { }
+ void print(std::ostream& out, bool verbose, const std::string& indent) const;
+ bool exist(const LockId & lid) const { return this->find(lid) != Hash::end(); }
+ size_t getMemoryUsage() const { return Hash::getMemoryConsumption(); }
+ };
+
+ class LockWaiters {
+ typedef vespalib::hash_map<size_t, LockId> WaiterMap;
+ public:
+ typedef size_t Key;
+ typedef typename WaiterMap::const_iterator const_iterator;
+ LockWaiters() : _id(0), _map() { }
+ Key insert(const LockId & lid) {
+ Key id(_id++);
+ _map.insert(typename WaiterMap::value_type(id, lid));
+ return id;
+ }
+ void erase(Key id) { _map.erase(id); }
+ const_iterator begin() const { return _map.begin(); }
+ const_iterator end() const { return _map.end(); }
+ private:
+ Key _id;
+ WaiterMap _map;
+ };
+
+ Map _map;
+ vespalib::Monitor _lock;
+ LockIdSet _lockedKeys;
+ LockWaiters _lockWaiters;
+
+ bool erase(const key_type& key, const char* clientId, bool haslock);
+ void insert(const key_type& key, const mapped_type& value,
+ const char* clientId, bool haslock, bool& preExisted);
+ void unlock(const key_type& key);
+ bool findNextKey(key_type& key, mapped_type& val, const char* clientId,
+ vespalib::MonitorGuard& guard);
+ bool handleDecision(key_type& key, mapped_type& val, Decision decision);
+ void ackquireKey(const LockId & lid, vespalib::MonitorGuard & guard);
+
+ /**
+ * Process up to `chunkSize` bucket database entries from--and possibly
+ * including--the bucket pointed to by `key`.
+ *
+ * Returns true if additional chunks may be processed after the call to
+ * this function has returned, false if iteration has completed or if
+ * `functor` returned an abort-decision.
+ *
+ * Modifies `key` in-place to point to the next key to process for the next
+ * invocation of this function.
+ */
+ template <typename Functor>
+ bool processNextChunk(Functor& functor,
+ key_type& key,
+ const char* clientId,
+ const uint32_t chunkSize);
+
+ /**
+ * Returns the given bucket, its super buckets and its sub buckets.
+ */
+ void getAllWithoutLocking(const document::BucketId& bucket,
+ const document::BucketId& sibling,
+ std::vector<document::BucketId::Type>& keys);
+
+ /**
+ * Retrieves the most specific bucket id (highest used bits) that matches
+ * the given bucket.
+ *
+ * If a match is found, result is set to the bucket id found, and keyResult
+ * is set to the corresponding key (reversed)
+ *
+ * If not found, nextKey is set to the key after one that could have
+ * matched and we return false.
+ */
+ bool getMostSpecificMatch(const document::BucketId& bucket,
+ document::BucketId& result,
+ document::BucketId::Type& keyResult,
+ document::BucketId::Type& nextKey);
+
+ /**
+ * Finds all buckets that can contain the given bucket, except for the
+ * bucket itself (that is, its super buckets)
+ */
+ void getAllContaining(const document::BucketId& bucket,
+ std::vector<document::BucketId::Type>& keys);
+
+ /**
+ * Find the given list of keys in the map and add them to the map of
+ * results, locking them in the process.
+ */
+ void addAndLockResults(const std::vector<document::BucketId::Type> keys,
+ const char* clientId,
+ std::map<document::BucketId, WrappedEntry>& results,
+ vespalib::MonitorGuard& guard);
+};
+
+template<typename Map>
+void
+LockableMap<Map>::WrappedEntry::write()
+{
+ assert(_lockKeeper->_locked);
+ assert(_value.verifyLegal());
+ bool b;
+ _lockKeeper->_map.insert(_lockKeeper->_key, _value, _clientId, true, b);
+ _lockKeeper->unlock();
+}
+
+template<typename Map>
+void
+LockableMap<Map>::WrappedEntry::remove()
+{
+ assert(_lockKeeper->_locked);
+ assert(_exists);
+ _lockKeeper->_map.erase(_lockKeeper->_key, _clientId, true);
+ _lockKeeper->unlock();
+}
+
+template<typename Map>
+void
+LockableMap<Map>::WrappedEntry::unlock()
+{
+ assert(_lockKeeper->_locked);
+ _lockKeeper->unlock();
+}
+
+template<typename Map>
+LockableMap<Map>::LockableMap()
+ : _map(),
+ _lock(),
+ _lockedKeys(),
+ _lockWaiters() {}
+
+template<typename Map>
+bool
+LockableMap<Map>::operator==(const LockableMap<Map>& other) const
+{
+ vespalib::LockGuard guard(_lock);
+ vespalib::LockGuard guard2(other._lock);
+ return (_map == other._map);
+}
+
+template<typename Map>
+bool
+LockableMap<Map>::operator<(const LockableMap<Map>& other) const
+{
+ vespalib::LockGuard guard(_lock);
+ vespalib::LockGuard guard2(other._lock);
+ return (_map < other._map);
+}
+
+template<typename Map>
+typename Map::size_type
+LockableMap<Map>::size() const
+{
+ vespalib::LockGuard guard(_lock);
+ return _map.size();
+}
+
+template<typename Map>
+typename Map::size_type
+LockableMap<Map>::getMemoryUsage() const
+{
+ vespalib::MonitorGuard guard(_lock);
+ return _map.getMemoryUsage()
+ + _lockedKeys.getMemoryUsage()
+ + sizeof(vespalib::Monitor);
+}
+
+template<typename Map>
+bool
+LockableMap<Map>::empty() const
+{
+ vespalib::LockGuard guard(_lock);
+ return _map.empty();
+}
+
+template<typename Map>
+void
+LockableMap<Map>::swap(LockableMap<Map>& other)
+{
+ vespalib::LockGuard guard(_lock);
+ vespalib::LockGuard guard2(other._lock);
+ return _map.swap(other._map);
+}
+
+template<typename Map>
+void LockableMap<Map>::ackquireKey(const LockId & lid, vespalib::MonitorGuard & guard)
+{
+ if (_lockedKeys.exist(lid)) {
+ typename LockWaiters::Key waitId(_lockWaiters.insert(lid));
+ while (_lockedKeys.exist(lid)) {
+ guard.wait();
+ }
+ _lockWaiters.erase(waitId);
+ }
+}
+
+template<typename Map>
+typename LockableMap<Map>::WrappedEntry
+LockableMap<Map>::get(const key_type& key, const char* clientId,
+ bool createIfNonExisting,
+ bool lockIfNonExistingAndNotCreating)
+{
+ LockId lid(key, clientId);
+ vespalib::MonitorGuard guard(_lock);
+ ackquireKey(lid, guard);
+ bool preExisted = false;
+ typename Map::iterator it =
+ _map.find(key, createIfNonExisting, preExisted);
+
+ if (it == _map.end()) {
+ if (lockIfNonExistingAndNotCreating) {
+ return WrappedEntry(*this, key, clientId);
+ } else {
+ return WrappedEntry();
+ }
+ }
+ _lockedKeys.insert(lid);
+ return WrappedEntry(*this, key, it->second, clientId, preExisted);
+}
+
+#ifdef ENABLE_BUCKET_OPERATION_LOGGING
+
+namespace bucketdb {
+struct StorageBucketInfo;
+struct BucketInfo;
+}
+
+namespace debug {
+
+template <typename T> struct TypeTag {};
+// Storage
+void logBucketDbInsert(uint64_t key, const bucketdb::StorageBucketInfo& entry);
+void logBucketDbErase(uint64_t key, const TypeTag<bucketdb::StorageBucketInfo>&);
+
+// Distributor
+void logBucketDbInsert(uint64_t key, const bucketdb::BucketInfo& entry);
+void logBucketDbErase(uint64_t key, const TypeTag<bucketdb::BucketInfo>&);
+
+template <typename DummyValue>
+inline void logBucketDbErase(uint64_t, const TypeTag<DummyValue>&) {}
+template <typename DummyKey, typename DummyValue>
+inline void logBucketDbInsert(const DummyKey&, const DummyValue&) {}
+
+}
+
+#endif // ENABLE_BUCKET_OPERATION_LOGGING
+
+template<typename Map>
+bool
+LockableMap<Map>::erase(const key_type& key, const char* clientId, bool haslock)
+{
+ LockId lid(key, clientId);
+ vespalib::MonitorGuard guard(_lock);
+ if (!haslock) {
+ ackquireKey(lid, guard);
+ }
+#ifdef ENABLE_BUCKET_OPERATION_LOGGING
+ debug::logBucketDbErase(key, debug::TypeTag<mapped_type>());
+#endif
+ return _map.erase(key);
+}
+
+template<typename Map>
+void
+LockableMap<Map>::insert(const key_type& key, const mapped_type& value,
+ const char* clientId, bool haslock, bool& preExisted)
+{
+ LockId lid(key, clientId);
+ vespalib::MonitorGuard guard(_lock);
+ if (!haslock) {
+ ackquireKey(lid, guard);
+ }
+#ifdef ENABLE_BUCKET_OPERATION_LOGGING
+ debug::logBucketDbInsert(key, value);
+#endif
+ _map.insert(key, value, preExisted);
+}
+
+template<typename Map>
+void
+LockableMap<Map>::clear()
+{
+ vespalib::LockGuard guard(_lock);
+ _map.clear();
+}
+
+template<typename Map>
+bool
+LockableMap<Map>::findNextKey(key_type& key, mapped_type& val,
+ const char* clientId,
+ vespalib::MonitorGuard& guard)
+{
+ // Wait for next value to unlock.
+ typename Map::iterator it(_map.lower_bound(key));
+ while (it != _map.end() && _lockedKeys.exist(LockId(it->first, ""))) {
+ typename LockWaiters::Key waitId(_lockWaiters.insert(LockId(it->first, clientId)));
+ guard.wait();
+ _lockWaiters.erase(waitId);
+ it = _map.lower_bound(key);
+ }
+ if (it == _map.end()) return true;
+ key = it->first;
+ val = it->second;
+ return false;
+}
+
+template<typename Map>
+bool
+LockableMap<Map>::handleDecision(key_type& key, mapped_type& val,
+ Decision decision)
+{
+ bool b;
+ switch (decision) {
+ case UPDATE: _map.insert(key, val, b);
+ break;
+ case REMOVE: _map.erase(key);
+ break;
+ case ABORT: return true;
+ case CONTINUE: break;
+ default: assert(false);
+ }
+ return false;
+}
+
+template<typename Map>
+template<typename Functor>
+void
+LockableMap<Map>::each(Functor& functor, const char* clientId,
+ const key_type& first, const key_type& last)
+{
+ key_type key = first;
+ mapped_type val;
+ Decision decision;
+ {
+ vespalib::MonitorGuard guard(_lock);
+ if (findNextKey(key, val, clientId, guard) || key > last) return;
+ _lockedKeys.insert(LockId(key, clientId));
+ }
+ try{
+ while (true) {
+ decision = functor(const_cast<const key_type&>(key), val);
+ vespalib::MonitorGuard guard(_lock);
+ _lockedKeys.erase(LockId(key, clientId));
+ guard.broadcast();
+ if (handleDecision(key, val, decision)) return;
+ ++key;
+ if (findNextKey(key, val, clientId, guard) || key > last) return;
+ _lockedKeys.insert(LockId(key, clientId));
+ }
+ } catch (...) {
+ // Assuming only the functor call can throw exceptions, we need
+ // to unlock the current key before exiting
+ vespalib::MonitorGuard guard(_lock);
+ _lockedKeys.erase(LockId(key, clientId));
+ guard.broadcast();
+ throw;
+ }
+}
+
+template<typename Map>
+template<typename Functor>
+void
+LockableMap<Map>::each(const Functor& functor, const char* clientId,
+ const key_type& first, const key_type& last)
+{
+ key_type key = first;
+ mapped_type val;
+ Decision decision;
+ {
+ vespalib::MonitorGuard guard(_lock);
+ if (findNextKey(key, val, clientId, guard) || key > last) return;
+ _lockedKeys.insert(LockId(key, clientId));
+ }
+ try{
+ while (true) {
+ decision = functor(const_cast<const key_type&>(key), val);
+ vespalib::MonitorGuard guard(_lock);
+ _lockedKeys.erase(LockId(key, clientId));
+ guard.broadcast();
+ if (handleDecision(key, val, decision)) return;
+ ++key;
+ if (findNextKey(key, val, clientId, guard) || key > last) return;
+ _lockedKeys.insert(LockId(key, clientId));
+ }
+ } catch (...) {
+ // Assuming only the functor call can throw exceptions, we need
+ // to unlock the current key before exiting
+ vespalib::MonitorGuard guard(_lock);
+ _lockedKeys.erase(LockId(key, clientId));
+ guard.broadcast();
+ throw;
+ }
+}
+
+template<typename Map>
+template<typename Functor>
+void
+LockableMap<Map>::all(Functor& functor, const char* clientId,
+ const key_type& first, const key_type& last)
+{
+ key_type key = first;
+ mapped_type val;
+ vespalib::MonitorGuard guard(_lock);
+ while (true) {
+ if (findNextKey(key, val, clientId, guard) || key > last) return;
+ Decision d(functor(const_cast<const key_type&>(key), val));
+ if (handleDecision(key, val, d)) return;
+ ++key;
+ }
+}
+
+template<typename Map>
+template<typename Functor>
+void
+LockableMap<Map>::all(const Functor& functor, const char* clientId,
+ const key_type& first, const key_type& last)
+{
+ key_type key = first;
+ mapped_type val;
+ vespalib::MonitorGuard guard(_lock);
+ while (true) {
+ if (findNextKey(key, val, clientId, guard) || key > last) return;
+ Decision d(functor(const_cast<const key_type&>(key), val));
+ assert(d == ABORT || d == CONTINUE);
+ if (handleDecision(key, val, d)) return;
+ ++key;
+ }
+}
+
+template <typename Map>
+template <typename Functor>
+bool
+LockableMap<Map>::processNextChunk(Functor& functor,
+ key_type& key,
+ const char* clientId,
+ const uint32_t chunkSize)
+{
+ mapped_type val;
+ vespalib::MonitorGuard guard(_lock);
+ for (uint32_t processed = 0; processed < chunkSize; ++processed) {
+ if (findNextKey(key, val, clientId, guard)) {
+ return false;
+ }
+ Decision d(functor(const_cast<const key_type&>(key), val));
+ if (handleDecision(key, val, d)) {
+ return false;
+ }
+ ++key;
+ }
+ return true;
+}
+
+template <typename Map>
+template <typename Functor>
+void
+LockableMap<Map>::chunkedAll(Functor& functor,
+ const char* clientId,
+ uint32_t chunkSize)
+{
+ key_type key{};
+ while (processNextChunk(functor, key, clientId, chunkSize)) {
+ std::this_thread::yield();
+ }
+}
+
+template<typename Map>
+void
+LockableMap<Map>::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ vespalib::LockGuard guard(_lock);
+ out << "LockableMap {\n" << indent << " ";
+
+ if (verbose) {
+ for (typename Map::const_iterator iter = _map.begin();
+ iter != _map.end();
+ iter++) {
+ out << "Key: " <<
+ document::BucketId(document::BucketId::keyToBucketId(iter->first))
+ << " Value: " << iter->second << "\n" << indent << " ";
+ }
+
+ out << "\n" << indent << " Locked keys: ";
+ _lockedKeys.print(out, verbose, indent + " ");
+ }
+ out << "} : ";
+
+ out << _map;
+}
+
+template<typename Map>
+void
+LockableMap<Map>::LockIdSet::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ out << "hash {";
+ for (typename Hash::const_iterator it(Hash::begin()), mt(Hash::end()); it != mt; it++) {
+ if (verbose) {
+ out << "\n" << indent << " ";
+ } else {
+ out << " ";
+ }
+
+ out << *it;
+ }
+ if (verbose) out << "\n" << indent;
+ out << " }";
+}
+
+template<typename Map>
+void
+LockableMap<Map>::unlock(const key_type& key)
+{
+ vespalib::MonitorGuard guard(_lock);
+ _lockedKeys.erase(LockId(key, ""));
+ guard.broadcast();
+}
+
+namespace {
+
+/**
+ * Check whether the given key contains the given bucket.
+ * Sets result to the bucket corresponding to the key, and keyResult
+ * to the key if true.
+ */
+bool
+checkContains(document::BucketId::Type key, const document::BucketId& bucket,
+ document::BucketId& result, document::BucketId::Type& keyResult)
+{
+ document::BucketId id = document::BucketId(
+ document::BucketId::keyToBucketId(key));
+ if (id.contains(bucket)) {
+ result = id;
+ keyResult = key;
+ return true;
+ }
+
+ return false;
+}
+
+} // anon namespace
+
+/**
+ * Retrieves the most specific bucket id (highest used bits) that contains
+ * the given bucket.
+ *
+ * If a match is found, result is set to the bucket id found, and keyResult is
+ * set to the corresponding key (reversed)
+ *
+ * If not found, nextKey is set to the key after one that could have matched
+ * and we return false.
+ */
+template<typename Map>
+bool
+LockableMap<Map>::getMostSpecificMatch(const document::BucketId& bucket,
+ document::BucketId& result,
+ document::BucketId::Type& keyResult,
+ document::BucketId::Type& nextKey)
+{
+ typename Map::const_iterator iter = _map.lower_bound(bucket.toKey());
+
+ nextKey = 0;
+
+ // We should now have either the bucket we are looking for
+ // (if the exact bucket exists), or one right after.
+ if (iter != _map.end()) {
+ nextKey = iter->first;
+
+ if (checkContains(iter->first, bucket, result, keyResult)) {
+ return true;
+ }
+ }
+
+ if (iter != _map.begin()) {
+ --iter; // If iter was map.end(), we should now end up at the last item in the map
+ nextKey = iter->first;
+
+ if (checkContains(iter->first, bucket, result, keyResult)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+/**
+ * Finds all buckets that can contain the given bucket, except for the bucket
+ * itself.
+ */
+template<typename Map>
+void
+LockableMap<Map>::getAllContaining(const document::BucketId& bucket,
+ std::vector<document::BucketId::Type>& keys)
+{
+ document::BucketId id = bucket;
+
+ // Find other buckets that contain this bucket.
+ // TODO: Optimize?
+ while (id.getUsedBits() > 1) {
+ id.setUsedBits(id.getUsedBits() - 1);
+ id = id.stripUnused();
+ document::BucketId::Type key = id.toKey();
+
+ typename Map::const_iterator iter = _map.find(key);
+ if (iter != _map.end()) {
+ keys.push_back(key);
+ }
+ }
+}
+
+template<typename Map>
+void
+LockableMap<Map>::addAndLockResults(
+ const std::vector<document::BucketId::Type> keys,
+ const char* clientId,
+ std::map<document::BucketId, WrappedEntry>& results,
+ vespalib::MonitorGuard& guard)
+{
+ // Wait until all buckets are free to be added, then add them all.
+ while (true) {
+ bool allOk = true;
+ key_type waitingFor(0);
+
+ for (uint32_t i=0; i<keys.size(); i++) {
+ if (_lockedKeys.exist(LockId(keys[i], clientId))) {
+ waitingFor = keys[i];
+ allOk = false;
+ break;
+ }
+ }
+
+ if (!allOk) {
+ typename LockWaiters::Key waitId(_lockWaiters.insert(LockId(waitingFor, clientId)));
+ guard.wait();
+ _lockWaiters.erase(waitId);
+ } else {
+ for (uint32_t i=0; i<keys.size(); i++) {
+ typename Map::iterator it = _map.find(keys[i]);
+ if (it != _map.end()) {
+ _lockedKeys.insert(LockId(keys[i], clientId));
+ results[document::BucketId(
+ document::BucketId::keyToBucketId(keys[i]))]
+ = WrappedEntry(*this, keys[i], it->second,
+ clientId, true);
+ }
+ }
+ break;
+ }
+ }
+}
+
+namespace {
+
+uint8_t getMinDiffBits(uint16_t minBits, const document::BucketId& a, const document::BucketId& b) {
+ for (uint32_t i = minBits; i <= std::min(a.getUsedBits(), b.getUsedBits()); i++) {
+ document::BucketId a1 = document::BucketId(i, a.getRawId());
+ document::BucketId b1 = document::BucketId(i, b.getRawId());
+ if (b1.getId() != a1.getId()) {
+ return i;
+ }
+ }
+ return minBits;
+};
+
+}
+
+template<typename Map>
+typename LockableMap<Map>::WrappedEntry
+LockableMap<Map>::createAppropriateBucket(
+ uint16_t newBucketBits,
+ const char* clientId,
+ const document::BucketId& bucket)
+{
+ vespalib::MonitorGuard guard(_lock);
+ typename Map::const_iterator iter = _map.lower_bound(bucket.toKey());
+
+ // Find the two buckets around the possible new bucket. The new
+ // bucket's used bits should be the highest used bits it can be while
+ // still being different from both of these.
+ if (iter != _map.end()) {
+ newBucketBits = getMinDiffBits(newBucketBits,
+ document::BucketId(document::BucketId::keyToBucketId(iter->first)), bucket);
+ }
+
+ if (iter != _map.begin()) {
+ --iter;
+ newBucketBits = getMinDiffBits(newBucketBits,
+ document::BucketId(document::BucketId::keyToBucketId(iter->first)), bucket);
+ }
+
+ document::BucketId newBucket(newBucketBits, bucket.getRawId());
+ newBucket.setUsedBits(newBucketBits);
+ document::BucketId::Type key = newBucket.stripUnused().toKey();
+
+ LockId lid(key, clientId);
+ ackquireKey(lid, guard);
+ bool preExisted;
+ typename Map::iterator it = _map.find(key, true, preExisted);
+ _lockedKeys.insert(LockId(key, clientId));
+ return WrappedEntry(*this, key, it->second, clientId, preExisted);
+}
+
+template<typename Map>
+std::map<document::BucketId, typename LockableMap<Map>::WrappedEntry>
+LockableMap<Map>::getContained(const document::BucketId& bucket,
+ const char* clientId)
+{
+ vespalib::MonitorGuard guard(_lock);
+ std::map<document::BucketId, WrappedEntry> results;
+
+ document::BucketId result;
+ document::BucketId::Type keyResult;
+ document::BucketId::Type nextKey;
+
+ std::vector<document::BucketId::Type> keys;
+
+ if (getMostSpecificMatch(bucket, result, keyResult, nextKey)) {
+ keys.push_back(keyResult);
+
+ // Find the super buckets for the most specific match
+ getAllContaining(result, keys);
+ } else {
+ // Find the super buckets for the input bucket
+ // because getMostSpecificMatch() might not find the most specific
+ // match in all cases of inconsistently split buckets
+ getAllContaining(bucket, keys);
+ }
+
+ if (!keys.empty()) {
+ addAndLockResults(keys, clientId, results, guard);
+ }
+
+ return results;
+}
+
+template<typename Map>
+void
+LockableMap<Map>::getAllWithoutLocking(const document::BucketId& bucket,
+ const document::BucketId& sibling,
+ std::vector<document::BucketId::Type>& keys)
+{
+ document::BucketId result;
+ document::BucketId::Type keyResult;
+ document::BucketId::Type nextKey;
+
+ typename Map::iterator it = _map.end();
+
+ if (getMostSpecificMatch(bucket, result, keyResult, nextKey)) {
+ keys.push_back(keyResult);
+
+ // Find the super buckets for the most specific match
+ getAllContaining(result, keys);
+
+ it = _map.find(keyResult);
+ if (it != _map.end()) {
+ // Skipping nextKey, since it was equal to keyResult
+ it++;
+ }
+ } else {
+ // Find the super buckets for the input bucket
+ // because getMostSpecificMatch() might not find the most specific
+ // match in all cases of inconsistently split buckets
+ getAllContaining(bucket, keys);
+
+ it = _map.find(nextKey);
+ if (it != _map.end()) {
+ // Nextkey might be contained in the imput bucket,
+ // e.g. if it is the first bucket in bucketdb
+ document::BucketId id = document::BucketId(
+ document::BucketId::keyToBucketId(it->first));
+ if (!bucket.contains(id)) {
+ it++;
+ }
+ }
+ }
+
+ // Buckets contained in the found bucket will come immediately after it.
+ // Traverse the map to find them.
+ for (; it != _map.end(); it++) {
+ document::BucketId id(
+ document::BucketId(document::BucketId::keyToBucketId(it->first)));
+
+ if (bucket.contains(id)) {
+ keys.push_back(it->first);
+ } else {
+ break;
+ }
+ }
+
+ if (sibling.getRawId() != 0) {
+ keys.push_back(sibling.toKey());
+ }
+}
+
+/**
+ * Returns the given bucket, its super buckets and its sub buckets.
+ */
+template<typename Map>
+std::map<document::BucketId, typename LockableMap<Map>::WrappedEntry>
+LockableMap<Map>::getAll(const document::BucketId& bucket, const char* clientId,
+ const document::BucketId& sibling)
+{
+ vespalib::MonitorGuard guard(_lock);
+
+ std::map<document::BucketId, WrappedEntry> results;
+ std::vector<document::BucketId::Type> keys;
+
+ getAllWithoutLocking(bucket, sibling, keys);
+
+ addAndLockResults(keys, clientId, results, guard);
+
+ return results;
+}
+
+template<typename Map>
+bool
+LockableMap<Map>::isConsistent(const typename LockableMap<Map>::WrappedEntry& entry)
+{
+ vespalib::MonitorGuard guard(_lock);
+
+ document::BucketId sibling(0);
+ std::vector<document::BucketId::Type> keys;
+
+ getAllWithoutLocking(entry.getBucketId(), sibling, keys);
+ assert(keys.size() >= 1);
+ assert(keys.size() != 1 || keys[0] == entry.getKey());
+
+ return keys.size() == 1;
+}
+
+template<typename Map>
+void
+LockableMap<Map>::showLockClients(vespalib::asciistream & out) const
+{
+ vespalib::MonitorGuard guard(_lock);
+ out << "Currently grabbed locks:";
+ for (typename LockIdSet::const_iterator it = _lockedKeys.begin();
+ it != _lockedKeys.end(); ++it)
+ {
+ out << "\n "
+ << document::BucketId(document::BucketId::keyToBucketId(it->_key))
+ << " - " << it->_owner;
+ }
+ out << "\nClients waiting for keys:";
+ for (typename LockWaiters::const_iterator it = _lockWaiters.begin();
+ it != _lockWaiters.end(); ++it)
+ {
+ out << "\n "
+ << document::BucketId(document::BucketId::keyToBucketId(it->second._key))
+ << " - " << it->second._owner;
+ }
+}
+
+} // storage
+
diff --git a/storage/src/vespa/storage/bucketdb/minimumusedbitstracker.h b/storage/src/vespa/storage/bucketdb/minimumusedbitstracker.h
new file mode 100644
index 00000000000..dadc7a6092f
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/minimumusedbitstracker.h
@@ -0,0 +1,43 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <algorithm>
+#include <vespa/document/bucket/bucketid.h>
+
+namespace storage {
+
+/**
+ * Utility class for keeping track of the lowest used bits count seen
+ * across a set of buckets.
+ *
+ * Not threadsafe by itself.
+ */
+class MinimumUsedBitsTracker
+{
+ uint32_t _minUsedBits;
+public:
+ MinimumUsedBitsTracker()
+ : _minUsedBits(58)
+ {}
+
+ /**
+ * Returns true if new bucket led to a decrease in the used bits count.
+ */
+ bool update(const document::BucketId& bucket) {
+ if (bucket.getUsedBits() < _minUsedBits) {
+ _minUsedBits = bucket.getUsedBits();
+ return true;
+ }
+ return false;
+ }
+
+ uint32_t getMinUsedBits() const {
+ return _minUsedBits;
+ }
+
+ void setMinUsedBits(uint32_t minUsedBits) {
+ _minUsedBits = minUsedBits;
+ }
+};
+
+}
diff --git a/storage/src/vespa/storage/bucketdb/stdmapwrapper.h b/storage/src/vespa/storage/bucketdb/stdmapwrapper.h
new file mode 100644
index 00000000000..55bbaa280db
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/stdmapwrapper.h
@@ -0,0 +1,94 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class StdMapWrapper
+ * @ingroup bucketdb
+ *
+ * @brief Wrapper for std::map to add functionality in JudyMultiMap.
+ *
+ * To remove the need for partial template specialization in lockablemap
+ */
+
+#pragma once
+
+#include <map>
+#include <vespa/vespalib/util/printable.h>
+
+namespace storage {
+
+template<typename Key, typename Value>
+class StdMapWrapper : public std::map<Key, Value>,
+ public vespalib::Printable
+{
+public:
+ StdMapWrapper() {}
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ typename std::map<Key, Value>::iterator find(Key key);
+
+ typename std::map<Key, Value>::iterator find(Key key, bool insert, bool&);
+
+ void insert(Key key, const Value& val, bool&);
+
+ uint32_t getMemoryUsage() const;
+};
+
+template<class Key, class Value>
+uint32_t
+StdMapWrapper<Key, Value>::getMemoryUsage() const
+{
+ Value val;
+
+ return (32 + sizeof(val)) * this->size();
+}
+
+template<class Key, class Value>
+void
+StdMapWrapper<Key, Value>::print(std::ostream& out,
+ bool,
+ const std::string& indent) const
+{
+ out << "StdMapWrapper(";
+ for (typename std::map<Key, Value>::const_iterator i = this->begin();
+ i != this->end(); ++i)
+ {
+ out << "\n" << indent << " " << "Key: " << i->first << ", Value: "
+ << i->second;
+ }
+ out << ")";
+}
+
+template<class Key, class Value>
+inline typename std::map<Key, Value>::iterator
+StdMapWrapper<Key, Value>::
+find(Key key)
+{
+ bool tmp;
+ return find(key, false, tmp);
+}
+
+template<class Key, class Value>
+inline typename std::map<Key, Value>::iterator
+StdMapWrapper<Key, Value>::
+find(Key key, bool insertIfNonExisting, bool&)
+{
+ if (insertIfNonExisting) {
+ std::pair<typename std::map<Key, Value>::iterator, bool> result
+ = std::map<Key, Value>::insert(std::pair<Key, Value>(key, Value()));
+ return result.first;
+ } else {
+ return std::map<Key, Value>::find(key);
+ }
+}
+
+template<class Key, class Value>
+void
+StdMapWrapper<Key, Value>::
+insert(Key key, const Value& val, bool&)
+{
+ this->operator[](key) = val;
+}
+
+}
+
diff --git a/storage/src/vespa/storage/bucketdb/stor-bucket-init.def b/storage/src/vespa/storage/bucketdb/stor-bucket-init.def
new file mode 100644
index 00000000000..70743a38a1c
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/stor-bucket-init.def
@@ -0,0 +1,35 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+# The maximum number of alien files found during init that should be stored in
+# memory so they can be viewed on status page.
+max_alien_files_logged int default=10 restart
+
+# The maximum number of pending info reads to each disk during initialization.
+max_pending_info_reads_per_disk int default=20 restart
+
+# The minimum number of pending info reads to each disk during initialization.
+# When pending falls below this, we will resume database scan to add more
+# pending up to the maximum setting.
+min_pending_info_reads_per_disk int default=4 restart
+
+# The priority of the read bucket info requests the initializer sends to the
+# persistence layer. Currently chosen so that such operations will not pre-
+# empt any regular external load or ideal state operations, but they will block
+# very low priority background operations such as periodic GC (default pri of
+# 200). A tradeoff must be made between fast initialization and the availability
+# of data on the initializing node.
+info_read_priority int default=185 restart
+
+# The priority of the list bucket requests the initializer sends to the
+# persistence layer. Should always be lower than the read priority to ensure
+# starting to read wont make listing wait. However, listing is currently pretty
+# much required to be done before starting anyhow, so this option does little
+# unless your directories are not hardware independent.
+list_priority int default=100 restart
+
+# Whether the initializer should complete listing before starting to read
+# bucket information. Shouldnt matter much performance wise so always set to
+# true as it is now. Setting it false, disks done listing first will start
+# to process info requests a bit earlier than otherwise.
+complete_list_before_starting_read bool default=false restart
diff --git a/storage/src/vespa/storage/bucketdb/stor-bucketdb.def b/storage/src/vespa/storage/bucketdb/stor-bucketdb.def
new file mode 100644
index 00000000000..41f3b8e4a9b
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/stor-bucketdb.def
@@ -0,0 +1,9 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+## Number of elements to retrieve in one bucket info chunk
+bucketinfobatchsize int default=128 restart
+
+## Chunk level. Set what level of the path which defines one chunk.
+## (See doxygen info in bucketmanager.h for more info)
+chunklevel int default=1 restart
diff --git a/storage/src/vespa/storage/bucketdb/storagebucketdbinitializer.cpp b/storage/src/vespa/storage/bucketdb/storagebucketdbinitializer.cpp
new file mode 100644
index 00000000000..74dac0c016c
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/storagebucketdbinitializer.cpp
@@ -0,0 +1,785 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/bucketdb/storagebucketdbinitializer.h>
+
+#include <iomanip>
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketdb/config-stor-bucket-init.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/storageserver/storagemetricsset.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/config-stor-filestor.h>
+#include <vespa/vespalib/io/fileutil.h>
+#include <vespa/config/config.h>
+
+LOG_SETUP(".storage.bucketdb.initializer");
+
+namespace storage {
+
+StorageBucketDBInitializer::Config::Config(const config::ConfigUri & configUri)
+ : _listPriority(0),
+ _infoReadPriority(255),
+ _minPendingInfoReadsPerDisk(16),
+ _maxPendingInfoReadsPerDisk(32)
+{
+ std::unique_ptr<vespa::config::content::core::StorBucketInitConfig> config =
+ config::ConfigGetter<vespa::config::content::core::StorBucketInitConfig>::getConfig(configUri.getConfigId(),
+ configUri.getContext());
+ _maxPendingInfoReadsPerDisk = config->maxPendingInfoReadsPerDisk;
+ _minPendingInfoReadsPerDisk = config->minPendingInfoReadsPerDisk;
+ _infoReadPriority = config->infoReadPriority;
+ _listPriority = config->listPriority;
+ if (config->completeListBeforeStartingRead) {
+ LOG(warning, "This config option is currently not honored. Info "
+ "reading will always start on a directory as soon as "
+ "it is done listing.");
+ }
+ LOG(debug, "Initializing bucket database: List priority %u, info priority "
+ "%u, min/max pending info per disk %u/%u.",
+ _listPriority, _infoReadPriority,
+ _minPendingInfoReadsPerDisk, _maxPendingInfoReadsPerDisk);
+}
+
+StorageBucketDBInitializer::System::System(
+ const spi::PartitionStateList& partitions,
+ DoneInitializeHandler& doneInitializeHandler,
+ ServiceLayerComponentRegister& compReg,
+ const Config&)
+ : _doneInitializeHandler(doneInitializeHandler),
+ _component(compReg, "storagebucketdbinitializer"),
+ _partitions(partitions),
+ _bucketDatabase(_component.getBucketDatabase()),
+ _nodeIndex(_component.getIndex()),
+ _distribution(*_component.getDistribution()),
+ _nodeState()
+{
+ // Is this correct? We should get the node state from the node state updater
+ // so it could work with disk capacities. Object is used to check for
+ // correct disk further down (in the case of internal join, deciding which
+ // should have it). Not that bad if wrong disk is picked though.
+ _nodeState.setDiskCount(_partitions.size());
+ for (uint32_t i=0; i<_partitions.size(); ++i) {
+ if (!_partitions[i].isUp()) {
+ _nodeState.setDiskState(i, lib::State::DOWN);
+ }
+ }
+}
+
+StorageBucketDBInitializer::Metrics::Metrics(framework::Component& component)
+ : metrics::MetricSet("dbinit", "",
+ "Metrics for the storage bucket database initializer"),
+ _wrongDisk("wrongdisk", "",
+ "Number of buckets found on non-ideal disk.", this),
+ _insertedCount("insertedcount", "",
+ "Number of buckets inserted into database in list step.", this),
+ _joinedCount("joinedcount", "",
+ "Number of buckets found in list step already found "
+ "(added from other disks).", this),
+ _infoReadCount("infocount", "",
+ "Number of buckets we have read bucket information from.", this),
+ _infoSetByLoad("infosetbyload", "",
+ "Number of buckets we did not need to request bucket info for "
+ "due to load already having updated them.", this),
+ _dirsListed("dirslisted", "",
+ "Directories listed in list step of initialization.", this),
+ _startTime(component.getClock()),
+ _listLatency("listlatency", "",
+ "Time used until list phase is done. (in ms)", this),
+ _initLatency("initlatency", "",
+ "Time used until initialization is complete. (in ms)", this)
+{
+ component.registerMetric(*this);
+}
+
+StorageBucketDBInitializer::StorageBucketDBInitializer(
+ const config::ConfigUri & configUri,
+ const spi::PartitionStateList& partitions,
+ DoneInitializeHandler& doneInitializeHandler,
+ ServiceLayerComponentRegister& compReg)
+ : StorageLink("StorageBucketDBInitializer"),
+ framework::HtmlStatusReporter("dbinit", "Bucket database initializer"),
+ _config(configUri),
+ _system(partitions, doneInitializeHandler, compReg, _config),
+ _metrics(_system._component),
+ _state(),
+ _readState(_system._partitions.size())
+{
+ // Initialize read state for disks being available
+ for (uint32_t i=0; i<_system._partitions.size(); ++i) {
+ if (!_system._partitions[i].isUp()) continue;
+ _readState[i] = BucketReadState::LP(new BucketReadState);
+ _state._dirsToList += 1;
+ }
+ _system._component.registerStatusPage(*this);
+}
+
+StorageBucketDBInitializer::~StorageBucketDBInitializer()
+{
+ if (_system._thread.get() != 0) {
+ LOG(error, "Deleted without calling close() first");
+ onClose();
+ }
+ closeNextLink();
+}
+
+void
+StorageBucketDBInitializer::onOpen()
+{
+ // Trigger bucket database initialization
+ for (uint32_t i=0; i<_system._partitions.size(); ++i) {
+ if (!_system._partitions[i].isUp()) continue;
+ ReadBucketList::SP msg(new ReadBucketList(spi::PartitionId(i)));
+ _state._lists[msg->getMsgId()] = msg;
+ sendDown(msg);
+ }
+ framework::MilliSecTime maxProcessingTime(10);
+ framework::MilliSecTime sleepTime(1000);
+ _system._thread = _system._component.startThread(
+ *this, maxProcessingTime, sleepTime);
+}
+
+void
+StorageBucketDBInitializer::onClose()
+{
+ if (_system._thread.get() != 0) {
+ _system._thread->interruptAndJoin(&_state._workerMonitor);
+ _system._thread.reset(0);
+ }
+}
+
+void
+StorageBucketDBInitializer::run(framework::ThreadHandle& thread)
+{
+ vespalib::MonitorGuard monitor(_state._workerMonitor);
+ while (!thread.interrupted() && !_state._doneInitializing) {
+ std::list<api::StorageMessage::SP> replies;
+ {
+ vespalib::LockGuard lock(_state._replyLock);
+ _state._replies.swap(replies);
+ }
+ for (std::list<api::StorageMessage::SP>::iterator it = replies.begin();
+ it != replies.end(); ++it)
+ {
+ api::InternalReply& reply(static_cast<api::InternalReply&>(**it));
+ if (reply.getType() == ReadBucketListReply::ID) {
+ handleReadBucketListReply(
+ static_cast<ReadBucketListReply&>(reply));
+ } else if (reply.getType() == ReadBucketInfoReply::ID) {
+ handleReadBucketInfoReply(
+ static_cast<ReadBucketInfoReply&>(reply));
+ } else if (reply.getType() == InternalBucketJoinReply::ID) {
+ handleInternalBucketJoinReply(
+ static_cast<InternalBucketJoinReply&>(reply));
+ }
+ }
+ if (_state._gottenInitProgress) {
+ _state._gottenInitProgress = false;
+ updateInitProgress();
+ }
+ if (replies.empty()) {
+ monitor.wait(10);
+ thread.registerTick(framework::WAIT_CYCLE);
+ } else {
+ thread.registerTick(framework::PROCESS_CYCLE);
+ }
+ }
+}
+
+void
+StorageBucketDBInitializer::print(
+ std::ostream& out, bool verbose, const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "StorageBucketDBInitializer()";
+}
+
+void
+StorageBucketDBInitializer::reportHtmlStatus(
+ std::ostream& out, const framework::HttpUrlPath&) const
+{
+ vespalib::Monitor monitor(_state._workerMonitor);
+ out << "\n <h2>Config</h2>\n"
+ << " <table>\n"
+ << " <tr><td>Max pending info reads per disk</td><td>"
+ << _config._maxPendingInfoReadsPerDisk << "</td></tr>\n"
+ << " <tr><td>Min pending info reads per disk</td><td>"
+ << _config._minPendingInfoReadsPerDisk << "</td></tr>\n"
+ << " <tr><td>List priority</td><td>"
+ << _config._listPriority << "</td></tr>\n"
+ << " <tr><td>Info read priority</td><td>"
+ << _config._infoReadPriority << "</td></tr>\n"
+ << " </table>\n";
+
+ out << "\n <h2>Init progress</h2>\n";
+ if (_state._doneListing) {
+ out << " Done listing.<br/>\n";
+ } else {
+ out << " Listed " << _state._dirsListed << " of "
+ << _state._dirsToList << " partitions.<br/>\n";
+ }
+ if (_state._lists.empty()) {
+ out << " No lists pending.<br/>\n";
+ } else {
+ out << " " << _state._lists.size() << " lists pending.<br/>\n";
+ }
+ if (_state._joins.empty()) {
+ out << " No internal joins pending.<br/>\n";
+ } else {
+ out << " " << _state._joins.size()
+ << " internal joins pending.<br/>\n";
+ }
+ if (_state._infoRequests.empty()) {
+ out << " No info requests pending.<br/>\n";
+ } else {
+ out << " " << _state._infoRequests.size()
+ << " info requests pending.<br/>\n";
+ }
+ uint32_t incompleteScan = 0;
+ for (uint32_t i=0; i<_readState.size(); ++i) {
+ if (_readState[i].get() != 0 && !_readState[i]->_done) ++incompleteScan;
+ }
+ if (incompleteScan == 0) {
+ out << " Done iterating bucket database to generate info "
+ << "requests.<br/>\n";
+ } else {
+ out << " " << incompleteScan << " partitions still have buckets "
+ << "that needs bucket info.<br/>\n";
+ }
+ out << " Init progress gotten after state update: "
+ << (_state._gottenInitProgress ? "true" : "false") << "<br/>\n";
+ if (_state._doneInitializing) {
+ out << " Initialization complete.\n";
+ } else {
+ out << " Initialization not completed.\n";
+ }
+
+ out << "\n <h2>Metrics</h2>\n";
+ out << " " << _metrics._insertedCount.toString(true) << "<br/>\n"
+ << " " << _metrics._joinedCount.toString(true) << "<br/>\n"
+ << " " << _metrics._infoReadCount.toString(true) << "<br/>\n"
+ << " " << _metrics._infoSetByLoad.toString(true) << "<br/>\n"
+ << " " << _metrics._dirsListed.toString(true) << "<br/>\n"
+ << " Dirs to list " << _state._dirsToList << "<br/>\n";
+ if (!_state._joins.empty()) {
+ out << "\n <h2>Pending internal bucket joins</h2>\n";
+ for (vespalib::hash_map<
+ api::StorageMessage::Id,
+ InternalBucketJoinCommand::SP>::const_iterator it
+ = _state._joins.begin();
+ it != _state._joins.end();
+ ++it)
+ {
+ out << " " << it->first << " - " << *it->second << "<br/>\n";
+ }
+ }
+ out << "\n <h2>Info read state</h2>\n";
+ std::map<Disk, uint32_t> pendingCounts;
+ for (IdDiskMap::const_iterator it = _state._infoRequests.begin();
+ it != _state._infoRequests.end(); ++it)
+ {
+ ++pendingCounts[it->second];
+ }
+ for (uint32_t i=0; i<_readState.size(); ++i) {
+ if (_readState[i].get() == 0) {
+ out << " <h3>Disk " << i << " is down</h3>\n";
+ continue;
+ }
+ BucketReadState& state(*_readState[i]);
+ out << " <h3>Disk " << i << "</h3>\n";
+ out << " Pending info requests: " << pendingCounts[i] << " (";
+ if (state._pending.empty()) {
+ out << "none";
+ } else {
+ bool first = true;
+ for (BucketSet::const_iterator it = state._pending.begin();
+ it != state._pending.end(); ++it)
+ {
+ if (!first) {
+ out << ", ";
+ } else {
+ first = false;
+ }
+ out << *it;
+ }
+ }
+ out << ")<br/>\n";
+ out << " Bucked database iterator: " << state._databaseIterator
+ << "<br/>\n";
+ out << " Done iterating bucket database. "
+ << (state._done ? "true" : "false") << "<br/>\n";
+ }
+ for (std::map<Disk, uint32_t>::iterator it = pendingCounts.begin();
+ it != pendingCounts.end(); ++it)
+ {
+ out << " Disk " << it->first << ": " << it->second << "<br/>\n";
+ }
+}
+
+// Always called from worker thread. Worker monitor already grabbed
+void
+StorageBucketDBInitializer::registerBucket(const document::BucketId& bucket,
+ spi::PartitionId partition,
+ api::BucketInfo bucketInfo)
+{
+ StorBucketDatabase::WrappedEntry entry(_system._bucketDatabase.get(
+ bucket, "StorageBucketDBInitializer::registerBucket",
+ StorBucketDatabase::CREATE_IF_NONEXISTING));
+ if (bucketInfo.valid()) {
+ if (entry.preExisted()) {
+ LOG(debug, "Had value %s for %s before registering",
+ entry->getBucketInfo().toString().c_str(),
+ bucket.toString().c_str());
+ }
+ LOG(debug, "Got new value %s from %s partition %u",
+ bucketInfo.toString().c_str(), bucket.toString().c_str(),
+ partition.getValue());
+ entry->setBucketInfo(bucketInfo);
+ } else {
+ LOG(debug, "Got invalid bucket info from %s partition %u: %s",
+ bucket.toString().c_str(), partition.getValue(),
+ bucketInfo.toString().c_str());
+ }
+ if (entry.preExisted()) {
+ if (entry->disk == partition) {
+ LOG(debug, "%s already existed in bucket database on disk %i. "
+ "Might have been moved from wrong directory prior to "
+ "listing this directory.",
+ bucket.toString().c_str(), int(partition));
+ return;
+ }
+ uint32_t keepOnDisk, joinFromDisk;
+ if (_system._distribution.getPreferredAvailableDisk(
+ _system._nodeState, _system._nodeIndex,
+ bucket.stripUnused()) == partition)
+ {
+ keepOnDisk = partition;
+ joinFromDisk = entry->disk;
+ } else {
+ keepOnDisk = entry->disk;
+ joinFromDisk = partition;
+ }
+ LOG(debug, "%s exist on both disk %u and disk %i. Joining two versions "
+ "onto disk %u.",
+ bucket.toString().c_str(), entry->disk, int(partition), keepOnDisk);
+ entry.unlock();
+ // Must not have bucket db lock while sending down
+ InternalBucketJoinCommand::SP cmd(new InternalBucketJoinCommand(
+ bucket, keepOnDisk, joinFromDisk));
+ {
+ _state._joins[cmd->getMsgId()] = cmd;
+ }
+ sendDown(cmd);
+ } else {
+ _system._component.getMinUsedBitsTracker().update(bucket);
+ LOG(spam, "Inserted %s on disk %i into bucket database",
+ bucket.toString().c_str(), int(partition));
+ entry->disk = partition;
+ entry.write();
+ uint16_t disk(_system._distribution.getIdealDisk(
+ _system._nodeState, _system._nodeIndex, bucket.stripUnused(),
+ lib::Distribution::IDEAL_DISK_EVEN_IF_DOWN));
+ if (disk != partition) {
+ ++_metrics._wrongDisk;
+ }
+
+ _metrics._insertedCount.inc();
+ ++_state._insertedCount;
+ }
+}
+
+namespace {
+ struct NextBucketOnDiskFinder {
+ typedef document::BucketId BucketId;
+
+ uint16_t _disk;
+ BucketId& _iterator;
+ uint16_t _count;
+ std::vector<BucketId> _next;
+ uint32_t _alreadySet;
+
+ NextBucketOnDiskFinder(uint16_t disk, BucketId& iterator,
+ uint16_t maxToFind)
+ : _disk(disk), _iterator(iterator), _count(maxToFind),
+ _next(), _alreadySet(0) {}
+
+ StorBucketDatabase::Decision operator()(
+ uint64_t revBucket, StorBucketDatabase::Entry& entry)
+ {
+ BucketId bucket(BucketId::keyToBucketId(revBucket));
+ if (bucket == _iterator) {
+ //LOG(spam, "Ignoring bucket %s as it has value of current "
+ // "iterator", bucket.toString().c_str());
+ return StorBucketDatabase::CONTINUE;
+ }
+ _iterator = bucket;
+ if (entry.disk != _disk) {
+ //LOG(spam, "Ignoring bucket %s as it is not on disk currently "
+ // "being processed", bucket.toString().c_str());
+ // Ignore. We only want to scan for one disk
+ } else if (entry.valid()) {
+ LOG(spam, "%s already initialized by load %s. "
+ "Not requesting info",
+ bucket.toString().c_str(),
+ entry.getBucketInfo().toString().c_str());
+ ++_alreadySet;
+ } else {
+ _next.push_back(_iterator);
+ if (_next.size() >= _count) {
+ LOG(spam, "Aborting iterating for disk %u as we have "
+ "enough results. Leaving iterator at %s",
+ uint32_t(_disk), _iterator.toString().c_str());
+ return StorBucketDatabase::ABORT;
+ }
+ }
+ return StorBucketDatabase::CONTINUE;
+ }
+ };
+}
+
+// Always called from worker thread. It holds worker monitor.
+void
+StorageBucketDBInitializer::sendReadBucketInfo(spi::PartitionId disk)
+{
+ BucketReadState& state(*_readState[disk]);
+ if (state._done
+ || state._pending.size() >= _config._maxPendingInfoReadsPerDisk)
+ {
+ LOG(spam, "No need to iterate further. Database has completed "
+ "iterating buckets for disk %u.", uint32_t(disk));
+ return;
+ }
+ uint32_t count(_config._maxPendingInfoReadsPerDisk - state._pending.size());
+ NextBucketOnDiskFinder finder(disk, state._databaseIterator, count);
+ LOG(spam, "Iterating bucket db further. Starting at iterator %s",
+ state._databaseIterator.toString().c_str());
+ _system._bucketDatabase.all(finder,
+ "StorageBucketDBInitializer::readBucketInfo",
+ state._databaseIterator.stripUnused().toKey());
+ if (finder._alreadySet > 0) {
+ _metrics._infoSetByLoad.inc(finder._alreadySet);
+ _state._infoSetByLoad += finder._alreadySet;
+ }
+ for (uint32_t i=0; i<finder._next.size(); ++i) {
+ ReadBucketInfo::SP cmd(new ReadBucketInfo(finder._next[i]));
+ cmd->setPriority(_config._infoReadPriority);
+ state._pending.insert(finder._next[i]);
+ _state._infoRequests[cmd->getMsgId()] = disk;
+ LOG(spam, "Requesting bucket info from %s on disk %u.",
+ finder._next[i].toString().c_str(), uint32_t(disk));
+ sendDown(cmd);
+ }
+ state._done |= finder._next.empty();
+ _state._gottenInitProgress = true;
+ checkIfDone();
+}
+
+bool
+StorageBucketDBInitializer::onDown(
+ const std::shared_ptr<api::StorageMessage>& msg)
+{
+ // If we're done listing, load can go as normal.
+ // Rationale behind memory_order_relaxed: _doneListing is initially false
+ // and is ever only written once. Since the behavior for temporarily
+ // reading a stale default is safe (block the message) and we do not
+ // access any other shared state dependent on _doneListing, relaxed
+ // semantics should be fine here.
+ if (_state._doneListing.load(std::memory_order_relaxed)) {
+ return StorageLink::onDown(msg);
+ }
+
+ // If we're not done listing, block most types of load
+
+ // There are no known replies, but if there are to come any, they should
+ // likely not be blocked.
+ if (msg->getType().isReply()) return false;
+
+ switch (msg->getType().getId()) {
+ // Don't want to block communication with state manager
+ case api::MessageType::SETSYSTEMSTATE_ID:
+ case api::MessageType::GETNODESTATE_ID:
+ return StorageLink::onDown(msg);
+ default:
+ break;
+ }
+ // Fail everything else
+ std::ostringstream ost;
+ ost << "Cannot perform operation " << msg->getType() << " now because "
+ << "we are still listing buckets from disk.";
+ LOGBP(warning, ost.str().c_str());
+ std::unique_ptr<api::StorageReply> reply(
+ static_cast<api::StorageCommand&>(*msg).makeReply());
+ reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED, ost.str()));
+ sendUp(std::shared_ptr<api::StorageReply>(reply.release()));
+ return true;
+}
+
+// Called from disk threads. Just push replies to reply list so worker thread
+// can handle it. This minimizes locking needed. Disk reads should be the
+// limiting factor, so don't need to update initializer state in multiple
+// threads.
+bool
+StorageBucketDBInitializer::onInternalReply(
+ const std::shared_ptr<api::InternalReply>& reply)
+{
+ switch(reply->getType()) {
+ case ReadBucketListReply::ID:
+ case ReadBucketInfoReply::ID:
+ case InternalBucketJoinReply::ID:
+ {
+ vespalib::LockGuard lock(_state._replyLock);
+ _state._replies.push_back(reply);
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+// Always called from worker thread. It holds worker monitor.
+void
+StorageBucketDBInitializer::handleReadBucketListReply(
+ ReadBucketListReply& reply)
+{
+ vespalib::hash_map<api::StorageMessage::Id,
+ ReadBucketList::SP>::iterator it(
+ _state._lists.find(reply.getMsgId()));
+ if (it == _state._lists.end()) {
+ LOGBP(warning, "Got bucket list reply for partition %u, request "
+ "%" PRIu64 ", that was not registered pending.",
+ reply.getPartition().getValue(), reply.getMsgId());
+ } else {
+ _state._lists.erase(it);
+ }
+ // We don't handle failed bucket listings. Kill process. Bucket lists are
+ // essential for storage node operations
+ if (reply.getResult().failed()) {
+ LOG(debug, "Got failing bucket list reply. Requesting shutdown");
+ _system._component.requestShutdown(
+ "Failed to list buckets. Cannot run without bucket list: "
+ + reply.getResult().toString());
+ return;
+ }
+ _metrics._dirsListed.inc();
+ _state._gottenInitProgress = true;
+ const spi::BucketIdListResult::List& list(reply.getBuckets());
+ api::BucketInfo info;
+ assert(!info.valid());
+ for (uint32_t i=0, n=list.size(); i<n; ++i) {
+ registerBucket(list[i], reply.getPartition(), info);
+ }
+ if (++_state._dirsListed == _state._dirsToList) {
+ handleListingCompleted();
+ }
+ checkIfDone();
+ sendReadBucketInfo(reply.getPartition());
+}
+
+// Always called from worker thread. It holds worker monitor.
+void
+StorageBucketDBInitializer::handleReadBucketInfoReply(
+ ReadBucketInfoReply& reply)
+{
+ if (reply.getResult().failed()) {
+ LOGBP(warning, "Deleting %s from bucket database. Cannot use it as we "
+ "failed to read bucket info for it: %s",
+ reply.getBucketId().toString().c_str(),
+ reply.getResult().toString().c_str());
+ _system._bucketDatabase.erase(reply.getBucketId(),
+ "dbinit.failedreply");
+ }
+ _metrics._infoReadCount.inc();
+ ++_state._infoReadCount;
+ _state._gottenInitProgress = true;
+ vespalib::hash_map<api::StorageMessage::Id, Disk>::iterator it(
+ _state._infoRequests.find(reply.getMsgId()));
+ if (it == _state._infoRequests.end()) {
+ LOGBP(warning, "Got bucket info reply for %s, request %" PRIu64 ", that "
+ "was not registered pending.",
+ reply.getBucketId().toString().c_str(), reply.getMsgId());
+ checkIfDone();
+ } else {
+ uint32_t disk(it->second);
+ _state._infoRequests.erase(it->first);
+ BucketReadState& state(*_readState[disk]);
+ BucketSet::iterator it2(state._pending.find(reply.getBucketId()));
+ if (it2 == state._pending.end()) {
+ LOGBP(warning, "Got bucket info reply for %s that was registered "
+ "in global state but not in disk %u's state.",
+ reply.getBucketId().toString().c_str(), disk);
+ } else {
+ state._pending.erase(reply.getBucketId());
+ LOG(spam, "Got info reply for %s: %s",
+ reply.getBucketId().toString().c_str(),
+ _system._bucketDatabase.get(
+ reply.getBucketId(), "dbinit.inforeply")
+ ->getBucketInfo().toString().c_str());
+ }
+ checkIfDone();
+ sendReadBucketInfo(spi::PartitionId(disk));
+ }
+}
+
+// Always called from worker thread. It holds worker monitor.
+void
+StorageBucketDBInitializer::handleInternalBucketJoinReply(
+ InternalBucketJoinReply& reply)
+{
+ _metrics._joinedCount.inc();
+ vespalib::hash_map<api::StorageMessage::Id,
+ InternalBucketJoinCommand::SP>::iterator it(
+ _state._joins.find(reply.getMsgId()));
+ if (reply.getResult().failed()) {
+ LOGBP(warning, "Failed to join multiple copies of %s. One of the "
+ "versions will not be available: %s",
+ reply.getBucketId().toString().c_str(),
+ reply.getResult().toString().c_str());
+ }
+ if (it != _state._joins.end()) {
+ _state._joins.erase(reply.getMsgId());
+ LOG(debug, "Completed internal bucket join for %s. Got bucket info %s",
+ reply.getBucketId().toString().c_str(),
+ reply.getBucketInfo().toString().c_str());
+ StorBucketDatabase::WrappedEntry entry(_system._bucketDatabase.get(
+ reply.getBucketId(),
+ "StorageBucketDBInitializer::onInternalBucketJoinReply"));
+ entry->setBucketInfo(reply.getBucketInfo());
+ entry.write();
+ } else {
+ LOGBP(warning, "Got internal join reply for %s which was not "
+ "registered to be pending.",
+ reply.getBucketId().toString().c_str());
+ }
+ checkIfDone();
+}
+
+// Always called from worker thread. It holds worker monitor.
+void
+StorageBucketDBInitializer::checkIfDone()
+{
+ if (_state._dirsListed < _state._dirsToList) return;
+ if (!_state._infoRequests.empty()) return;
+ if (!_state._joins.empty()) return;
+ for (uint32_t i=0; i<_readState.size(); ++i) {
+ if (_readState[i].get() != 0 && !_readState[i]->_done) return;
+ }
+ _state._doneInitializing = true;
+ _system._doneInitializeHandler.notifyDoneInitializing();
+ _metrics._initLatency.addValue(_metrics._startTime);
+ LOG(debug, "Completed initializing");
+}
+
+double
+StorageBucketDBInitializer::calculateMinProgressFromDiskIterators() const
+{
+ double minProgress = 1.0;
+ for (size_t disk = 0; disk < _readState.size(); ++disk) {
+ if (_readState[disk].get() == 0) {
+ continue;
+ }
+ const BucketReadState& state(*_readState[disk]);
+ document::BucketId bid(state._databaseIterator);
+
+ double progress;
+ if (!state._done) {
+ progress = BucketProgressCalculator::calculateProgress(bid);
+ } else {
+ progress = 1.0;
+ }
+
+ minProgress = std::min(minProgress, progress);
+ }
+ //std::cerr << "minProgress: " << minProgress << "\n";
+ return minProgress;
+}
+
+// Always called from worker thread. It holds worker monitor.
+double
+StorageBucketDBInitializer::calcInitProgress() const
+{
+ double listProgress(_state._dirsToList == 0
+ ? 0 : _state._dirsListed / _state._dirsToList);
+ // Do sanity check
+ if (_state._dirsListed > _state._dirsToList) {
+ LOG(error, "%" PRIu64 " of %u dirs are reported listed. This is a bug.",
+ _state._dirsListed, _state._dirsToList);
+ listProgress = 1.0;
+ }
+ double infoProgress(calculateMinProgressFromDiskIterators());
+ if (_state._dirsToList > _state._dirsListed
+ && infoProgress > 0)
+ {
+ LOG(debug, "Not done with list step yet. (%" PRIu64 " of %u done). "
+ "Need to nullify info part of progress so fleetcontroller "
+ "doesn't think listing is completed.",
+ _state._dirsListed, _state._dirsToList);
+ infoProgress = 0;
+
+ // Currently we never honor complete_list_before_starting_read option.
+ // We might want to do that later, in order to be able to enforce
+ // waiting to read. For instance, if we have usecase where several
+ // directories map to the same disk, such that reading info is slowing
+ // down directory listing to such an extent that quick restart aint
+ // quick enough anymore. If we do, revert to make this an error if that
+ // config option is enabled
+ }
+
+ double listLimit = lib::NodeState::getListingBucketsInitProgressLimit();
+ double progress(listLimit * listProgress
+ + (1.0 - listLimit) * infoProgress);
+ assert(progress < 1.000000001);
+ return progress;
+}
+
+// Always called from worker thread. It holds worker monitor.
+void
+StorageBucketDBInitializer::updateInitProgress() const
+{
+ double progress = calcInitProgress();
+ NodeStateUpdater::Lock::SP lock(
+ _system._component.getStateUpdater().grabStateChangeLock());
+ lib::NodeState ns(
+ *_system._component.getStateUpdater().getReportedNodeState());
+ LOG(debug, "Reporting node init progress as %g", progress);
+ ns.setInitProgress(progress);
+ ns.setMinUsedBits(_system._component.getMinUsedBitsTracker()
+ .getMinUsedBits());
+ _system._component.getStateUpdater().setReportedNodeState(ns);
+}
+
+// Always called from worker thread. It holds worker monitor.
+void
+StorageBucketDBInitializer::handleListingCompleted()
+{
+ assert(!_state._doneListing);
+ _state._doneListing = true;
+ if (_state._dirsToList != _state._dirsListed) {
+ LOG(warning, "After list phase completed, counters indicate we've "
+ "listed %" PRIu64 " of %u directories. This is a bug.",
+ _state._dirsListed, _state._dirsToList);
+ }
+ LOG(info, "Completed listing buckets from disk. Minimum used bits is %u",
+ _system._component.getMinUsedBitsTracker().getMinUsedBits());
+ _metrics._listLatency.addValue(_metrics._startTime);
+}
+
+double
+StorageBucketDBInitializer::
+BucketProgressCalculator::calculateProgress(const document::BucketId& bid)
+{
+ uint64_t revBucket(document::BucketId::bucketIdToKey(bid.getId()));
+
+ // Remove unused bits
+ uint64_t progressBits(revBucket >> (64 - bid.getUsedBits()));
+/*
+ std::cerr << bid << ":\n";
+ std::cerr << "revBucket: " << std::hex << revBucket << ", progressBits: " << progressBits
+ << ", divisor: " << (1ULL << bid.getUsedBits())
+ << ", result= " << (static_cast<double>(progressBits) / (1ULL << bid.getUsedBits()))
+ << "\n";
+*/
+ return static_cast<double>(progressBits) / (1ULL << bid.getUsedBits());
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/bucketdb/storagebucketdbinitializer.h b/storage/src/vespa/storage/bucketdb/storagebucketdbinitializer.h
new file mode 100644
index 00000000000..365d1df89e7
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/storagebucketdbinitializer.h
@@ -0,0 +1,229 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::StorageBucketDBInitializer
+ * \ingroup bucketdb
+ *
+ * \brief Initializes the bucket database on the storage node.
+ *
+ * The storage bucket DB is responsible for initializing the bucket database on
+ * the storage node. This used to be the task of the bucket manager, but to
+ * make the implementation cleaner, the logic for this has been separated.
+ *
+ * This works as follows:
+ *
+ * 1. When component is started (onOpen), partition states should already have
+ * been aquired from the SPI and made available to this class. Requests for
+ * listing buckets will be sent to all partitions. Background thread will be
+ * started to avoid doing processes in thread sending replies.
+ *
+ * 2. Upon receiving bucket lists into background thread, the bucket database
+ * will be populated with buckets. Bucket information may at this point be
+ * invalid or not, depending on persistence provider. Providers that can list
+ * cheaply but where getting info is more expensive, will likely want to return
+ * invalid entries as the node can start handling load as fast as bucket lists
+ * is known. Providers who gets info and bucket lists equally cheap will likely
+ * prefer to give info at once to avoid the read step.
+ *
+ * 3. Upon receiving the last bucket list, the background thread will be started
+ * to do remaining work.
+ *
+ * 4. Background thread will iterate through the bucket database, issuing
+ * bucket info requests for all buckets that have invalid bucket info. Once the
+ * whole bucket database has been iterated and there are no longer pending
+ * operations, initialization is complete, and node will be tagged in up state.
+ */
+
+#pragma once
+
+#include <atomic>
+#include <vespa/vespalib/util/document_runnable.h>
+#include <vespa/metrics/metrics.h>
+#include <vespa/persistence/spi/persistenceprovider.h>
+#include <vespa/storage/bucketdb/minimumusedbitstracker.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storage/common/doneinitializehandler.h>
+#include <vespa/storage/common/servicelayercomponent.h>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/vespalib/stllike/hash_map.h>
+#include <vespa/vespalib/util/sync.h>
+
+namespace storage {
+
+class StorageBucketDBInitializer : public StorageLink,
+ public framework::HtmlStatusReporter,
+ private framework::Runnable
+{
+ typedef uint16_t Disk;
+ typedef vespalib::hash_set<document::BucketId,
+ document::BucketId::hash> BucketSet;
+ typedef vespalib::hash_map<api::StorageMessage::Id, Disk> IdDiskMap;
+
+ struct Config {
+ // List priority should be larger than info priority.
+ uint16_t _listPriority;
+ uint16_t _infoReadPriority;
+ // When going below this amount of pending, send more until we reach max
+ uint16_t _minPendingInfoReadsPerDisk;
+ uint16_t _maxPendingInfoReadsPerDisk;
+
+ Config(const config::ConfigUri & configUri);
+ };
+ struct System {
+ DoneInitializeHandler& _doneInitializeHandler;
+ ServiceLayerComponent _component;
+ const spi::PartitionStateList& _partitions;
+ StorBucketDatabase& _bucketDatabase;
+ uint32_t _nodeIndex;
+ lib::Distribution& _distribution;
+ lib::NodeState _nodeState; // Disk info for ideal state calculations
+ framework::Thread::UP _thread;
+
+ System(const spi::PartitionStateList&,
+ DoneInitializeHandler& doneInitializeHandler,
+ ServiceLayerComponentRegister&,
+ const Config&);
+ };
+ struct Metrics : public metrics::MetricSet {
+ metrics::LongCountMetric _wrongDisk;
+ metrics::LongCountMetric _insertedCount;
+ metrics::LongCountMetric _joinedCount;
+ metrics::LongCountMetric _infoReadCount;
+ metrics::LongCountMetric _infoSetByLoad;
+ metrics::LongCountMetric _dirsListed;
+ framework::MilliSecTimer _startTime;
+ metrics::LongAverageMetric _listLatency;
+ metrics::LongAverageMetric _initLatency;
+
+ Metrics(framework::Component&);
+ };
+ struct GlobalState {
+ vespalib::hash_map<api::StorageMessage::Id,
+ ReadBucketList::SP> _lists;
+ vespalib::hash_map<api::StorageMessage::Id,
+ InternalBucketJoinCommand::SP> _joins;
+ IdDiskMap _infoRequests;
+ std::list<api::StorageMessage::SP> _replies;
+ uint64_t _insertedCount;
+ uint64_t _infoReadCount;
+ uint64_t _infoSetByLoad;
+ uint64_t _dirsListed;
+ uint32_t _dirsToList;
+ bool _gottenInitProgress;
+ std::atomic<bool> _doneListing;
+ bool _doneInitializing;
+ // This lock is held while the worker thread is working, such that
+ // status retrieval can lock it. Listing part only grabs it when
+ // needed to supporting listing in multiple threads
+ vespalib::Monitor _workerMonitor;
+ // This lock protects the reply list.
+ vespalib::Monitor _replyLock;
+
+ GlobalState()
+ : _insertedCount(0), _infoReadCount(0),
+ _infoSetByLoad(0), _dirsListed(0), _dirsToList(0),
+ _gottenInitProgress(false), _doneListing(false),
+ _doneInitializing(false) {}
+ };
+ struct BucketReadState {
+ typedef vespalib::LinkedPtr<BucketReadState> LP;
+
+ BucketSet _pending;
+ document::BucketId _databaseIterator;
+ bool _done;
+
+ BucketReadState() : _done(false) {}
+ };
+
+ Config _config;
+ System _system;
+ Metrics _metrics;
+ GlobalState _state;
+ std::vector<BucketReadState::LP> _readState;
+
+public:
+ StorageBucketDBInitializer(const config::ConfigUri&,
+ const spi::PartitionStateList&,
+ DoneInitializeHandler&,
+ ServiceLayerComponentRegister&);
+ ~StorageBucketDBInitializer();
+
+ virtual void print(std::ostream& out,
+ bool verbose, const std::string& indent) const;
+
+ virtual void onOpen();
+ virtual void onClose();
+
+ virtual void run(framework::ThreadHandle&);
+
+ bool onDown(const std::shared_ptr<api::StorageMessage>&);
+ bool onInternalReply(const std::shared_ptr<api::InternalReply>&);
+
+ void handleReadBucketListReply(ReadBucketListReply&);
+ void handleReadBucketInfoReply(ReadBucketInfoReply&);
+ void handleInternalBucketJoinReply(InternalBucketJoinReply&);
+
+ /** Status implementation. */
+ void reportHtmlStatus(std::ostream&, const framework::HttpUrlPath&) const;
+
+ // The below functions should only be called by the class itself, but they
+ // are left public for easability of access for unit tests and anonymous
+ // classes defined in implementation.
+
+ /** Get the path of a given directory. */
+ std::string getPathName(std::vector<uint32_t>& path,
+ const document::BucketId* = 0) const;
+ /** Process a given file found through listing files on disk */
+ bool processFile(std::vector<uint32_t>& path, const std::string& pathName,
+ const std::string& name);
+ /**
+ * Find what bucket identifier file corresponds to.
+ * Invalid bucket indicates none. (Invalid file name)
+ */
+ document::BucketId extractBucketId(const std::string& name) const;
+ /**
+ * Handle that the bucket might have been found in the wrong position.
+ * Returns true if we should attepmt to register the bucket.
+ */
+ bool handleBadLocation(const document::BucketId&,
+ std::vector<uint32_t>& path);
+ /** Register a bucket in the bucket database. */
+ void registerBucket(const document::BucketId&,
+ spi::PartitionId,
+ api::BucketInfo bucketInfo);
+ /**
+ * Sends more read bucket info to a given disk. Lock must already be taken.
+ * Will be released by function prior to sending messages down.
+ */
+ void sendReadBucketInfo(spi::PartitionId);
+ /** Check whether initialization is complete. Should hold lock to call it.*/
+ void checkIfDone();
+
+ /** Calculate minimum progress from all disks' bucket db iterators */
+ double calculateMinProgressFromDiskIterators() const;
+ /** Calculate how far we have progressed initializing. */
+ double calcInitProgress() const;
+ /** Update node state if init progress have changed enough. */
+ void updateInitProgress() const;
+ /** Handle that we're done listing buckets. */
+ void handleListingCompleted();
+
+ /** Used for unit tests to see that stuff has happened. */
+ virtual const Metrics& getMetrics() const { return _metrics; }
+
+
+ class BucketProgressCalculator
+ {
+ public:
+ /**
+ * Estimate progress into the total bucket space.
+ * Done by taking reverse bucket key, shifting away unused bits and
+ * dividing the result by 2**used bits to get approximate progress.
+ * @param bid Current bucket space iterator/cursor.
+ */
+ static double calculateProgress(const document::BucketId& bid);
+ };
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/bucketdb/storbucketdb.cpp b/storage/src/vespa/storage/bucketdb/storbucketdb.cpp
new file mode 100644
index 00000000000..6afe2152cd4
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/storbucketdb.cpp
@@ -0,0 +1,66 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include "storbucketdb.h"
+#include <vespa/storage/common/bucketoperationlogger.h>
+
+namespace storage {
+namespace bucketdb {
+
+void
+StorageBucketInfo::
+print(std::ostream& out, bool, const std::string&) const
+{
+ out << info << ", disk " << disk;
+}
+
+} // bucketdb
+
+void
+StorBucketDatabase::insert(const document::BucketId& bucket,
+ const bucketdb::StorageBucketInfo& entry,
+ const char* clientId)
+{
+ assert(entry.disk != 0xff);
+ bool preExisted;
+#if __WORDSIZE == 64
+ return LockableMap<JudyMultiMap<Entry> >::insert(
+ bucket.toKey(), entry, clientId, preExisted);
+#else
+ return LockableMap<StdMapWrapper<document::BucketId::Type, Entry> >::insert(
+ bucket.toKey(), entry, clientId, preExisted);
+#endif
+}
+
+bool
+StorBucketDatabase::erase(const document::BucketId& bucket,
+ const char* clientId)
+{
+#if __WORDSIZE == 64
+ return LockableMap<JudyMultiMap<Entry> >::erase(
+ bucket.stripUnused().toKey(), clientId);
+#else
+ return LockableMap<StdMapWrapper<document::BucketId::Type, Entry> >::erase(
+ bucket.stripUnused().toKey(), clientId);
+#endif
+}
+
+StorBucketDatabase::WrappedEntry
+StorBucketDatabase::get(const document::BucketId& bucket,
+ const char* clientId,
+ Flag flags)
+{
+ bool createIfNonExisting = (flags & CREATE_IF_NONEXISTING);
+ bool lockIfNonExisting = (flags & LOCK_IF_NONEXISTING_AND_NOT_CREATING);
+#if __WORDSIZE == 64
+ return LockableMap<JudyMultiMap<Entry> >::get(
+ bucket.stripUnused().toKey(), clientId, createIfNonExisting,
+ lockIfNonExisting);
+#else
+ return LockableMap<StdMapWrapper<document::BucketId::Type, Entry> >::get(
+ bucket.stripUnused().toKey(), clientId,
+ createIfNonExisting, lockIfNonExisting);
+#endif
+}
+
+
+} // storage
diff --git a/storage/src/vespa/storage/bucketdb/storbucketdb.h b/storage/src/vespa/storage/bucketdb/storbucketdb.h
new file mode 100644
index 00000000000..e66928b361d
--- /dev/null
+++ b/storage/src/vespa/storage/bucketdb/storbucketdb.h
@@ -0,0 +1,83 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class StorageBucketInfo
+ * \ingroup bucketdb
+ *
+ * \brief An entry in the storage bucket database.
+ *
+ * \class StorBucketDatabase
+ * \ingroup bucketdb
+ *
+ * \brief The storage bucket database.
+ */
+#pragma once
+
+#include <map>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/storage/bucketdb/judymultimap.h>
+#include <vespa/storage/bucketdb/lockablemap.h>
+#include <vespa/storage/bucketdb/stdmapwrapper.h>
+#include <vespa/storageapi/buckets/bucketinfo.h>
+#include <vespa/storageapi/defs.h>
+
+namespace storage {
+
+namespace bucketdb {
+
+struct StorageBucketInfo {
+ api::BucketInfo info;
+ unsigned disk : 8; // The disk containing the bucket
+
+ StorageBucketInfo() : info(), disk(0xff) {}
+ static bool mayContain(const StorageBucketInfo&) { return true; }
+ void print(std::ostream&, bool verbose, const std::string& indent) const;
+ bool valid() const { return info.valid(); }
+ void setBucketInfo(const api::BucketInfo& i) { info = i; }
+ const api::BucketInfo& getBucketInfo() const { return info; }
+ void setEmptyWithMetaData() {
+ info.setChecksum(1);
+ info.setMetaCount(1);
+ info.setDocumentCount(0);
+ info.setTotalDocumentSize(0);
+ }
+ bool verifyLegal() const { return (disk != 0xff); }
+ uint32_t getMetaCount() { return info.getMetaCount(); }
+ void setChecksum(uint32_t crc) { info.setChecksum(crc); }
+};
+
+inline std::ostream& operator<<(std::ostream& out,
+ const StorageBucketInfo& info)
+ { info.print(out, false, ""); return out; }
+
+} // bucketdb
+
+
+class StorBucketDatabase
+#if __WORDSIZE == 64
+ : public LockableMap<JudyMultiMap<bucketdb::StorageBucketInfo> >
+#else
+# warning Bucket database cannot use Judy on non-64 bit platforms
+ : public LockableMap<StdMapWrapper<document::BucketId::Type, bucketdb::StorageBucketInfo> >
+#endif
+{
+public:
+ enum Flag {
+ NONE = 0,
+ CREATE_IF_NONEXISTING = 1,
+ LOCK_IF_NONEXISTING_AND_NOT_CREATING = 2
+ };
+ typedef bucketdb::StorageBucketInfo Entry;
+
+ StorBucketDatabase() {};
+
+ void insert(const document::BucketId&, const bucketdb::StorageBucketInfo&,
+ const char* clientId);
+
+ bool erase(const document::BucketId&, const char* clientId);
+
+ WrappedEntry get(const document::BucketId& bucket, const char* clientId,
+ Flag flags = NONE);
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/bucketmover/CMakeLists.txt b/storage/src/vespa/storage/bucketmover/CMakeLists.txt
new file mode 100644
index 00000000000..637934cb592
--- /dev/null
+++ b/storage/src/vespa/storage/bucketmover/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_bucketmover OBJECT
+ SOURCES
+ move.cpp
+ runstatistics.cpp
+ run.cpp
+ bucketmover.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/bucketmover/bucketmover.cpp b/storage/src/vespa/storage/bucketmover/bucketmover.cpp
new file mode 100644
index 00000000000..dab6f2ebdc0
--- /dev/null
+++ b/storage/src/vespa/storage/bucketmover/bucketmover.cpp
@@ -0,0 +1,541 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <boost/lexical_cast.hpp>
+#include <iomanip>
+#include <vespa/storage/bucketmover/bucketmover.h>
+#include <vespa/storage/bucketmover/htmltable.h>
+#include <vespa/storage/config/config-stor-server.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/storageutil/log.h>
+#include <vespa/vdslib/distribution/distribution.h>
+
+LOG_SETUP(".bucketmover");
+
+namespace storage {
+namespace bucketmover {
+
+BucketMover::BucketMover(const config::ConfigUri & configUri,
+ ServiceLayerComponentRegister& reg)
+ : StorageLink("Bucket mover"),
+ Runnable(),
+ framework::HtmlStatusReporter("diskbalancer", "Disk balancer"),
+ _component(reg, "diskbalancer"),
+ _config(new vespa::config::content::core::StorBucketmoverConfig()),
+ _cycleCount(0),
+ _nextRun(0),
+ _configFetcher(configUri.getContext()),
+ _diskDistribution(_component.getDistribution()->getDiskDistribution()),
+ _maxSleepTime(60 * 60)
+{
+ if (!configUri.empty()) {
+ using vespa::config::content::core::StorBucketmoverConfig;
+ _configFetcher.subscribe<StorBucketmoverConfig>(
+ configUri.getConfigId(), this);
+ _configFetcher.start();
+ }
+ _component.registerStatusPage(*this);
+}
+
+BucketMover::~BucketMover()
+{
+ if (_thread.get() != 0) {
+ LOG(error, "BucketMover deleted without calling close() first");
+ onClose();
+ }
+ closeNextLink();
+}
+
+
+void BucketMover::onDoneInit()
+{
+ framework::MilliSecTime maxProcessingTime(60 * 1000);
+ framework::MilliSecTime waitTime(_maxSleepTime * 1000);
+ _thread = _component.startThread(*this, maxProcessingTime, waitTime);
+}
+
+void
+BucketMover::onClose()
+{
+ // Avoid getting config during shutdown
+ _configFetcher.close();
+ // Close thread to ensure we don't send anything more down after
+ if (_thread.get()) {
+ _thread->interruptAndJoin(&_wait);
+ LOG(debug, "Bucket mover worker thread closed.");
+ _thread.reset(0);
+ }
+}
+
+void
+BucketMover::signal()
+{
+ vespalib::MonitorGuard monitor(_wait);
+ monitor.signal();
+}
+
+framework::SecondTime
+BucketMover::calculateWaitTimeOfNextRun() const
+{
+ // _wait lock should have been taken by caller
+
+ // If we haven't tried running at all, run fast to get statistics
+ if (_history.empty()) {
+ return framework::SecondTime(_config->minimumRecheckIntervalInSeconds);
+ }
+
+ // If we have a previous run, assuming our situation haven't changed
+ // much from that one. Use it to calculate time.
+ const RunStatistics& lastRun(_history.front());
+
+ // If there are few buckets in wrong place, don't bother rechecking
+ // often.
+ if (lastRun.getWronglyPlacedRatio() < 0.01) {
+ return framework::SecondTime(_config->maximumRecheckIntervalInSeconds);
+ }
+
+ // If a disk was disabled, wait for a good while.
+ for (uint32_t i = 0; i < lastRun._diskData.size(); ++i) {
+ if (lastRun._diskData[i]._diskDisabled) {
+ return framework::SecondTime(_config->maximumRecheckIntervalInSeconds / 2);
+ }
+ }
+
+ return framework::SecondTime(_config->minimumRecheckIntervalInSeconds);
+}
+
+void
+BucketMover::startNewRun()
+{
+ // If not in a run but time to start another one, do so
+ LOG(debug, "Starting new move cycle at time %s.",
+ _component.getClock().getTimeInSeconds().toString().c_str());
+ _currentRun.reset(new bucketmover::Run(
+ _component.getBucketDatabase(),
+ _component.getDistribution(),
+ *_component.getStateUpdater().getReportedNodeState(),
+ _component.getIndex(),
+ _component.getClock()));
+}
+
+void
+BucketMover::queueNewMoves()
+{
+ // If we have too few pending, send some new moves, if there are more
+ // moves to perform.
+ while (_pendingMoves.size() < uint32_t(_config->maxPending))
+ {
+ Move nextMove = _currentRun->getNextMove();
+
+ // If no more moves to do, stop attempting to send more.
+ if (!nextMove.isDefined()) {
+ break;
+ }
+ _pendingMoves.push_back(nextMove);
+ std::shared_ptr<BucketDiskMoveCommand> cmd(
+ new BucketDiskMoveCommand(nextMove.getBucketId(),
+ nextMove.getSourceDisk(),
+ nextMove.getTargetDisk()));
+ cmd->setPriority(nextMove.getPriority());
+ _newMoves.push_back(cmd);
+ }
+}
+
+void
+BucketMover::finishCurrentRun()
+{
+ RunStatistics stats = _currentRun->getStatistics();
+ if (_currentRun->aborted()) {
+ LOG(debug, "Completed aborted bucket move run: %s",
+ stats.toString().c_str());
+ } else {
+ // If current run is completed, note so in log, and move
+ // run to history track.
+ LOG(debug, "Completed bucket move run: %s",
+ stats.toString().c_str());
+
+ _history.push_front(stats);
+ if (_history.size() > uint32_t(_config->maxHistorySize)) {
+ _history.pop_back();
+ }
+ _nextRun = _component.getClock().getTimeInSeconds() +
+ calculateWaitTimeOfNextRun();
+ }
+
+ _currentRun.reset();
+ ++_cycleCount;
+}
+
+void
+BucketMover::sendNewMoves()
+{
+ for (std::list<BucketDiskMoveCommand::SP>::iterator it
+ = _newMoves.begin(); it != _newMoves.end(); ++it)
+ {
+ LOG(debug, "Moving bucket: %s", (**it).toString().c_str());
+ sendDown(*it);
+
+ // Be able to sleep a bit between moves for debugging to see
+ // what is happening. (Cannot use wait() here as reply of
+ // message sent will signal the monitor)
+ if (_config->operationDelay != 0) {
+ FastOS_Thread::Sleep(_config->operationDelay);
+ }
+ }
+
+ _newMoves.clear();
+}
+
+bool
+BucketMover::tick()
+{
+ {
+ vespalib::MonitorGuard monitor(_wait);
+
+ framework::SecondTime currentTime(
+ _component.getClock().getTimeInSeconds());
+
+ if (_currentRun.get() == 0) {
+ if (currentTime >= _nextRun) {
+ startNewRun();
+ } else {
+ return false;
+ }
+ }
+
+ queueNewMoves();
+
+ if (_newMoves.empty()) {
+ if (_pendingMoves.empty()) {
+ finishCurrentRun();
+ return true;
+ } else {
+ return false;
+ }
+ }
+ }
+
+ // Send delayed after monitor has been unlocked, such that
+ // incoming responses can grab lock. (Response might come back
+ // in this thread on errors)
+ sendNewMoves();
+ return true;
+}
+
+void
+BucketMover::run(framework::ThreadHandle& thread)
+{
+ while (!thread.interrupted()) {
+ thread.registerTick(framework::PROCESS_CYCLE);
+ if (!tick()) {
+ vespalib::MonitorGuard monitor(_wait);
+ monitor.wait(1000);
+ }
+ }
+}
+
+void
+BucketMover::configure(std::unique_ptr<vespa::config::content::core::StorBucketmoverConfig> config)
+{
+ vespalib::MonitorGuard monitor(_wait);
+ if (config->minimumRecheckIntervalInSeconds < 0) {
+ throw config::InvalidConfigException(
+ "Minimum recheck interval must be a positive value",
+ VESPA_STRLOC);
+ }
+ if (config->maximumRecheckIntervalInSeconds
+ < config->minimumRecheckIntervalInSeconds) {
+ throw config::InvalidConfigException(
+ "Maximum recheck interval must be equal or greater "
+ "to minimum recheck interval",
+ VESPA_STRLOC);
+ }
+ if (config->bucketIterationChunk < 1) {
+ throw config::InvalidConfigException(
+ "Bucket iteration chunk must be a positive number",
+ VESPA_STRLOC);
+ }
+ if (config->maxTargetFillRateAboveAverage < 0
+ || config->maxTargetFillRateAboveAverage > 1.0)
+ {
+ throw config::InvalidConfigException(
+ "Max target fill rate above average must be in the range 0-1",
+ VESPA_STRLOC);
+ }
+ if (config->maxPending < 1) {
+ throw config::InvalidConfigException(
+ "Cannot have less than 1 max pending", VESPA_STRLOC);
+ }
+ if (config->maxHistorySize < 1) {
+ throw config::InvalidConfigException(
+ "Cannot have less than 1 max history size", VESPA_STRLOC);
+ }
+ if (config->operationDelay > 0) {
+ LOG(warning, "Operation delay debug option enabled. Slows down bucket "
+ "moving. Should only be used in testing where we want to "
+ "slow down the operation to manually inspect it during "
+ "the run.");
+ }
+ _config = std::move(config);
+ while (_history.size() > uint32_t(_config->maxHistorySize)) {
+ _history.pop_back();
+ }
+}
+
+bool
+BucketMover::onInternalReply(
+ const std::shared_ptr<api::InternalReply>& internalReply)
+{
+ // We only care about move disk bucket replies
+ std::shared_ptr<BucketDiskMoveReply> reply(
+ std::dynamic_pointer_cast<BucketDiskMoveReply>(internalReply));
+ if (!reply.get()) return false;
+
+ // Warn if we see move replies outside of a run. Should not be possible.
+ vespalib::MonitorGuard monitor(_wait);
+ if (_currentRun.get() == 0) {
+ LOG(warning, "Got a bucket disk move reply while no run is active. "
+ "This should not happen, as runs should stay active until "
+ "all requests are answered.");
+ return true;
+ }
+ // Match move against pending ones
+ Move move;
+ for (std::list<Move>::iterator it = _pendingMoves.begin();
+ it != _pendingMoves.end(); ++it)
+ {
+ if (it->getBucketId() == reply->getBucketId()
+ && it->getSourceDisk() == reply->getSrcDisk()
+ && it->getTargetDisk() == reply->getDstDisk())
+ {
+ move = *it;
+ _pendingMoves.erase(it);
+ break;
+ }
+ }
+ // Warn if it wasn't supposed to be active
+ if (!move.isDefined()) {
+ LOG(warning, "Got a bucket disk move reply which wasn't registered "
+ "as pending. This should not happen.");
+ return true;
+ }
+ // Tag move completed in run.
+ if (reply->getResult().success()) {
+ _currentRun->moveOk(move);
+ } else if (reply->getResult().getResult()
+ == api::ReturnCode::BUCKET_NOT_FOUND
+ || reply->getResult().getResult()
+ == api::ReturnCode::BUCKET_DELETED)
+ {
+ _currentRun->moveFailedBucketNotFound(move);
+ } else {
+ _currentRun->moveFailed(move);
+ LOGBP(debug, "Failed %s: %s",
+ move.toString().c_str(), reply->getResult().toString().c_str());
+ }
+ monitor.broadcast();
+ return true;
+}
+
+void
+BucketMover::storageDistributionChanged()
+{
+ lib::Distribution::SP distribution = _component.getDistribution();
+
+ // Verify that the actual disk distribution changed, if not ignore
+ vespa::config::content::StorDistributionConfig::DiskDistribution newDistr(
+ distribution->getDiskDistribution());
+
+ if (_diskDistribution == newDistr) return;
+
+ vespalib::MonitorGuard monitor(_wait);
+ if (_currentRun.get() != 0) {
+ LOG(info, "Aborting bucket mover run as disk distribution changed "
+ "from %s to %s.",
+ vespa::config::content::StorDistributionConfig::getDiskDistributionName(
+ _diskDistribution).c_str(),
+ vespa::config::content::StorDistributionConfig::getDiskDistributionName(
+ newDistr).c_str());
+ _currentRun->abort();
+ } else {
+ LOG(info, "Regathering state as disk distribution changed "
+ "from %s to %s.",
+ vespa::config::content::StorDistributionConfig::getDiskDistributionName(
+ _diskDistribution).c_str(),
+ vespa::config::content::StorDistributionConfig::getDiskDistributionName(
+ newDistr).c_str());
+ }
+ _diskDistribution = newDistr;
+ _nextRun = framework::SecondTime(0);
+}
+
+bool BucketMover::isWorkingOnCycle() const {
+ vespalib::MonitorGuard monitor(_wait);
+ return (_currentRun.get() != 0);
+}
+
+uint32_t BucketMover::getCycleCount() const {
+ vespalib::MonitorGuard monitor(_wait);
+ return _cycleCount;
+}
+
+void
+BucketMover::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ vespalib::MonitorGuard monitor(_wait);
+ out << "BucketMover() {";
+ if (_currentRun.get() != 0) {
+ out << "\n" << indent << " ";
+ _currentRun->print(out, verbose, indent + " ");
+ } else {
+ out << "\n" << indent << " No current run.";
+ }
+ if (verbose && !_history.empty()) {
+ out << "\n" << indent << " History:";
+ for (std::list<RunStatistics>::const_iterator it = _history.begin();
+ it != _history.end(); ++it)
+ {
+ out << "\n" << indent << " ";
+ it->print(out, true, indent + " ");
+ }
+ }
+ out << "\n" << indent << "}";
+}
+
+void
+BucketMover::reportHtmlStatus(std::ostream& out,
+ const framework::HttpUrlPath&) const
+{
+ vespalib::MonitorGuard monitor(_wait);
+ if (_history.empty()) {
+ out << "<h2>Status after last run</h2>\n";
+ out << "<p>No run completed yet. Current status unknown.</p>\n";
+ } else {
+ printCurrentStatus(out, *_history.begin());
+ }
+ out << "<h2>Current move cycle</h2>\n";
+ if (_currentRun.get() != 0) {
+ printRunHtml(out, *_currentRun);
+ if (_currentRun->getPendingMoves().empty()) {
+ out << "<blockquote>No pending moves.</blockquote>\n";
+ } else {
+ out << "<blockquote>Pending bucket moves:<ul>\n";
+ for (std::list<Move>::const_iterator it
+ = _currentRun->getPendingMoves().begin();
+ it != _currentRun->getPendingMoves().end(); ++it)
+ {
+ out << "<li>" << *it << "</li>\n";
+ }
+ out << "</ul></blockquote>\n";
+ }
+ } else {
+ out << "<p>\n"
+ << "No bucket move cycle currently running. ";
+ framework::SecondTime currentTime(
+ _component.getClock().getTimeInSeconds());
+ if (_nextRun <= currentTime) {
+ if (_thread.get() != 0) {
+ out << "Next run to start immediately.";
+ // Wake up thread, so user sees it starts immediately :)
+ monitor.signal();
+ } else {
+ out << "Waiting for node to finish initialization before "
+ << "starting run.";
+ }
+ } else {
+ out << "Next run scheduled to run";
+ framework::SecondTime diff(_nextRun - currentTime);
+ if (diff < framework::SecondTime(24 * 60 * 60)) {
+ out << " in " << diff.toString(framework::DIFFERENCE);
+ } else {
+ out << " at time " << _nextRun;
+ }
+ out << ".";
+ }
+ out << "\n</p>\n";
+ }
+ if (!_history.empty()) {
+ out << "<h2>Statistics from previous bucket mover cycles</h2>\n";
+ for (std::list<RunStatistics>::const_iterator it = _history.begin();
+ it != _history.end(); ++it)
+ {
+ printRunStatisticsHtml(out, *it);
+ }
+ }
+}
+
+void
+BucketMover::printCurrentStatus(std::ostream& out,
+ const RunStatistics& rs) const
+{
+ framework::SecondTime currentTime(_component.getClock().getTimeInSeconds());
+ out << "<h2>Status after last run ("
+ << (currentTime - rs._endTime).toString(framework::DIFFERENCE)
+ << " ago)</h2>\n"
+ << "<p>Disk distribution: "
+ << vespa::config::content::StorDistributionConfig::getDiskDistributionName(
+ _diskDistribution)
+ << "</p>\n";
+ out << "<p>This is the status from the last completed bucket database scan "
+ << "done by the bucket mover. After starting storage, or after "
+ << "configuration changes, a single scan is always done without "
+ << "actually attempting to move anything, just to get status updated "
+ << "quickly. During a move cycle, the data shown for the current cycle "
+ << "will be more recently updated, but will only represent a part of "
+ << "the bucket database.</p>\n";
+ HtmlTable table("Disk");
+ table.addColumnHeader("Real partition byte usage", 3);
+ ByteSizeColumn diskSpaceUsed("Used", &table);
+ ByteSizeColumn diskSpaceTotal("Total", &table);
+ DoubleColumn diskSpaceFillRate("Fill rate", " %", &table);
+ diskSpaceFillRate.addColorLimit(85, Column::LIGHT_GREEN);
+ diskSpaceFillRate.addColorLimit(95, Column::LIGHT_YELLOW);
+ diskSpaceFillRate.addColorLimit(100, Column::LIGHT_RED);
+ diskSpaceFillRate.setTotalAsAverage();
+ table.addColumnHeader("Buckets in directory", 2);
+ LongColumn bucketCount("Count", "", &table);
+ PercentageColumn bucketCountPart("Part", 0, &table);
+ table.addColumnHeader("Total document size directory", 2);
+ ByteSizeColumn documentSize("Size", &table);
+ PercentageColumn documentSizePart("Part", 0, &table);
+ table.addColumnHeader("Buckets on correct disk", 2);
+ LongColumn bucketsCorrectDisk("Count", "", &table);
+ DoubleColumn bucketsCorrectDiskPart("Part", " %", &table);
+ bucketsCorrectDiskPart.setTotalAsAverage();
+ bucketsCorrectDiskPart.addColorLimit(95, Column::LIGHT_YELLOW);
+ bucketsCorrectDiskPart.addColorLimit(100, Column::LIGHT_GREEN);
+ for (uint32_t i=0; i<rs._diskData.size(); ++i) {
+ table.addRow(i);
+ // Ignore disks down
+ bucketCount[i] = rs.getBucketCount(i, true);
+ bucketCountPart[i] = bucketCount[i];
+ documentSize[i] = rs._diskData[i]._bucketSize;
+ documentSizePart[i] = documentSize[i];
+ bucketsCorrectDisk[i] = rs.getBucketCount(i, false);
+ bucketsCorrectDiskPart[i] = 100.0 * rs.getBucketCount(i, false)
+ / rs.getBucketCount(i, true);
+ }
+ table.addTotalRow("Total");
+ table.print(out);
+
+ MATRIX_PRINT("Buckets on wrong disk", _bucketsLeftOnWrongDisk, rs);
+}
+
+void
+BucketMover::printRunHtml(std::ostream& out, const bucketmover::Run& runner) const
+{
+ printRunStatisticsHtml(out, runner.getStatistics());
+}
+
+void
+BucketMover::printRunStatisticsHtml(std::ostream& out,
+ const RunStatistics& rs) const
+{
+ rs.print(out, true, "");
+}
+
+} // bucketmover
+} // storage
diff --git a/storage/src/vespa/storage/bucketmover/bucketmover.h b/storage/src/vespa/storage/bucketmover/bucketmover.h
new file mode 100644
index 00000000000..d54b1c60de8
--- /dev/null
+++ b/storage/src/vespa/storage/bucketmover/bucketmover.h
@@ -0,0 +1,99 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::BucketMover
+ * @ingroup storageserver
+ *
+ * @brief This class moves buckets between disks for reducing node skew. Highly
+ * inspired from BucketIntegrityChecker.
+ *
+ * It uses DiskMonitor class to monitor disk info (space available, space used,
+ * etc), but also to monitor the number of pending moves for each disk.
+ * It also uses BucketMoverHeuristic class to decide on which buckets should be
+ * moved and to what disk.
+ *
+ * @version $Id:
+ */
+
+#pragma once
+
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storage/config/config-stor-bucketmover.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storage/bucketmover/run.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/config-stor-distribution.h>
+#include <vespa/storage/common/servicelayercomponent.h>
+
+namespace storage {
+
+class BucketDiskMoveCommand;
+class Clock;
+
+namespace bucketmover {
+
+class BucketMover : public StorageLink,
+ private framework::Runnable,
+ public framework::HtmlStatusReporter,
+ private config::IFetcherCallback<vespa::config::content::core::StorBucketmoverConfig>
+{
+ ServiceLayerComponent _component;
+ std::unique_ptr<vespa::config::content::core::StorBucketmoverConfig> _config;
+ uint32_t _cycleCount;
+ framework::SecondTime _nextRun;
+ std::unique_ptr<bucketmover::Run> _currentRun;
+ std::list<Move> _pendingMoves;
+ std::list<std::shared_ptr<BucketDiskMoveCommand> > _newMoves;
+ std::list<RunStatistics> _history;
+ vespalib::Monitor _wait;
+ config::ConfigFetcher _configFetcher;
+ vespa::config::content::StorDistributionConfig::DiskDistribution _diskDistribution;
+ uint32_t _maxSleepTime;
+ framework::Thread::UP _thread;
+
+public:
+ BucketMover(const config::ConfigUri & configUri, ServiceLayerComponentRegister&);
+ ~BucketMover();
+
+ virtual void onDoneInit();
+ virtual void onClose();
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ bool isWorkingOnCycle() const;
+ uint32_t getCycleCount() const;
+ void signal();
+ framework::SecondTime getNextRunTime() const { return _nextRun; }
+
+ // Useful for unit testing
+ vespa::config::content::core::StorBucketmoverConfig& getConfig() { return *_config; }
+ RunStatistics& getLastRunStats() { return *_history.begin(); }
+
+private:
+ friend class BucketMoverTest;
+
+ void startNewRun();
+ void queueNewMoves();
+ void sendNewMoves();
+ void finishCurrentRun();
+ bool tick();
+
+ virtual void configure(std::unique_ptr<vespa::config::content::core::StorBucketmoverConfig>);
+ virtual void run(framework::ThreadHandle&);
+ bool onInternalReply(const std::shared_ptr<api::InternalReply>&);
+ virtual void storageDistributionChanged();
+
+ framework::SecondTime calculateWaitTimeOfNextRun() const;
+
+ virtual void reportHtmlStatus(std::ostream&,
+ const framework::HttpUrlPath&) const;
+ void printCurrentStatus(std::ostream&, const RunStatistics&) const;
+ void printRunHtml(std::ostream&, const bucketmover::Run&) const;
+ void printRunStatisticsHtml(std::ostream&, const RunStatistics&) const;
+
+};
+
+} // bucketmover
+} // storage
+
diff --git a/storage/src/vespa/storage/bucketmover/htmltable.h b/storage/src/vespa/storage/bucketmover/htmltable.h
new file mode 100644
index 00000000000..ae9f3d7635b
--- /dev/null
+++ b/storage/src/vespa/storage/bucketmover/htmltable.h
@@ -0,0 +1,318 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <iomanip>
+#include <map>
+#include <sstream>
+#include <string>
+#include <vector>
+#include <memory>
+
+namespace storage {
+
+class HtmlTable;
+
+struct Column {
+ enum Color { DEFAULT_COLOR, LIGHT_GREEN, LIGHT_RED, LIGHT_YELLOW };
+ enum Alignment { DEFAULT_ALIGNMENT, LEFT, CENTER, RIGHT };
+ std::map<uint16_t, Color> _colors;
+ std::string _colName;
+ Alignment _alignment;
+ enum { TOTAL = 0xffff };
+
+ Column(const std::string& colName, HtmlTable* table = 0);
+ virtual ~Column() {}
+
+ virtual void finalize() {} // Called before print is issued
+
+ static void printTdColor(std::ostream& out, Color c) {
+ switch (c) {
+ case LIGHT_GREEN: out << " bgcolor=\"#a0ffa0\""; break;
+ case LIGHT_RED: out << " bgcolor=\"#ffa0a0\""; break;
+ case LIGHT_YELLOW: out << " bgcolor=\"#ffffa0\""; break;
+ case DEFAULT_COLOR: break;
+ }
+ }
+
+ virtual void printElementStart(std::ostream& out, uint16_t row) {
+ std::map<uint16_t, Color>::iterator color(_colors.find(row));
+ out << "<td";
+ if (color != _colors.end()) printTdColor(out, color->second);
+ switch (_alignment) {
+ case LEFT: out << " align=\"left\""; break;
+ case CENTER: out << " align=\"center\""; break;
+ case RIGHT: out << " align=\"right\""; break;
+ case DEFAULT_ALIGNMENT: break;
+ }
+ out << ">";
+ }
+ virtual void printElementStop(std::ostream& out, uint16_t row) {
+ std::map<uint16_t, Color>::iterator color(_colors.find(row));
+ out << "</td>";
+ }
+
+ virtual void printElement(std::ostream& out, uint16_t row) {
+ printElementStart(out, row);
+ printValue(out, row);
+ printElementStop(out, row);
+ }
+
+ virtual void printValue(std::ostream& out, uint16_t) {
+ out << "&nbsp;";
+ }
+};
+
+struct ColHeader {
+ std::string _name;
+ uint32_t _span;
+
+ ColHeader(const std::string& name, uint32_t span)
+ : _name(name), _span(span) {}
+};
+
+struct RowHeader {
+ std::string _name;
+ Column::Color _backgroundColor;
+
+ RowHeader(const std::string& s)
+ : _name(s), _backgroundColor(Column::DEFAULT_COLOR) {}
+};
+
+class HtmlTable {
+ std::string _rowId;
+ std::vector<Column*> _columns;
+ std::vector<RowHeader> _rows;
+ std::vector<ColHeader> _colHeaders;
+ std::unique_ptr<std::string> _totalRow;
+
+public:
+ HtmlTable(const std::string& rowId)
+ : _rowId(rowId), _columns(), _rows() {}
+
+ void addTotalRow(const std::string& name)
+ { _totalRow.reset(new std::string(name)); }
+ void addColumnHeader(const std::string& name, uint32_t span)
+ { _colHeaders.push_back(ColHeader(name, span)); }
+ void addColumn(Column& col) { _columns.push_back(&col); }
+ void addRow(const std::string& rowName) { _rows.push_back(rowName); }
+ void addRow(uint64_t id)
+ { std::ostringstream ost; ost << id; _rows.push_back(ost.str()); }
+ void setRowHeaderColor(Column::Color c)
+ { _rows.back()._backgroundColor = c; }
+ uint32_t getRowCount() const { return _rows.size(); }
+
+ inline void print(std::ostream& out);
+};
+
+inline Column::Column(const std::string& colName, HtmlTable* table)
+ : _colName(colName), _alignment(RIGHT)
+{
+ if (table != 0) table->addColumn(*this);
+}
+
+/** Writes content just as you supply it. */
+template<typename T>
+struct ValueColumn : public Column {
+ std::map<uint16_t, T> _values;
+ std::string _denomination;
+ // Show all values <=T as color.
+ std::map<T, Color> _colorLimits;
+ std::ostringstream _valuePrinter;
+ bool _totalIsAvg;
+
+ ValueColumn(const std::string& colName,
+ const std::string& denomination = "",
+ HtmlTable* table = 0)
+ : Column(colName, table), _values(), _denomination(denomination),
+ _colorLimits(), _totalIsAvg(false)
+ {
+ _valuePrinter << std::fixed << std::setprecision(2);
+ }
+
+ T& operator[](uint16_t row) { return _values[row]; }
+
+ ValueColumn<T>& setPrecision(int precision)
+ { _valuePrinter << std::setprecision(precision); return *this; }
+ ValueColumn<T>& setTotalAsAverage(bool setAsAvg = true)
+ { _totalIsAvg = setAsAvg; return *this; }
+
+ void addColorLimit(T limit, Color c) {
+ _colorLimits[limit] = c;
+ }
+
+ virtual void finalize() {
+ for (typename std::map<uint16_t, T>::iterator val = _values.begin();
+ val != _values.end(); ++val)
+ {
+ Color c = DEFAULT_COLOR;
+ for (typename std::map<T, Color>::iterator it
+ = _colorLimits.begin(); it != _colorLimits.end(); ++it)
+ {
+ if (val->second <= it->first) {
+ c = it->second;
+ break;
+ }
+ }
+ _colors[val->first] = c;
+ }
+ // Set color for total too.
+ T total = getTotalValue();
+ Color c = DEFAULT_COLOR;
+ for (typename std::map<T, Color>::iterator it
+ = _colorLimits.begin(); it != _colorLimits.end(); ++it)
+ {
+ if (total <= it->first) {
+ c = it->second;
+ break;
+ }
+ }
+ _colors[TOTAL] = c;
+ }
+
+ virtual T getTotalValue() {
+ T value = 0;
+ for (typename std::map<uint16_t, T>::iterator val = _values.begin();
+ val != _values.end(); ++val)
+ {
+ value += val->second;
+ }
+ if (_totalIsAvg) value /= _values.size();
+ return value;
+ }
+
+ virtual void printValue(std::ostream& out, uint16_t row) {
+ T value;
+ if (row == TOTAL) {
+ value = getTotalValue();
+ } else {
+ typename std::map<uint16_t, T>::iterator val = _values.find(row);
+ if (val == _values.end()) {
+ Column::printValue(out, row);
+ return;
+ }
+ value = val->second;
+ }
+ _valuePrinter.str("");
+ _valuePrinter << value << _denomination;
+ out << _valuePrinter.str();
+ }
+};
+
+/** Writes content as percentage of a total */
+struct PercentageColumn : public ValueColumn<double> {
+ uint64_t _total;
+ std::map<uint16_t, uint64_t> _values;
+
+ PercentageColumn(const std::string& colName, uint64_t total = 0,
+ HtmlTable* table = 0)
+ : ValueColumn<double>(colName, " %", table), _total(total),
+ _values()
+ {
+ if (total != 0) _totalIsAvg = true;
+ }
+
+ virtual void finalize() {
+ uint64_t total = _total;
+ if (total == 0) {
+ for (std::map<uint16_t, uint64_t>::iterator it = _values.begin();
+ it != _values.end(); ++it)
+ {
+ total += it->second;
+ }
+ }
+ for (std::map<uint16_t, uint64_t>::iterator it = _values.begin();
+ it != _values.end(); ++it)
+ {
+ ValueColumn<double>::_values[it->first]
+ = 100.0 * it->second / total;
+ }
+ ValueColumn<double>::finalize();
+ }
+
+ uint64_t& operator[](uint16_t row) { return _values[row]; }
+};
+
+/** Writes content as a byte size, using an appropriate size. */
+struct ByteSizeColumn : public ValueColumn<uint64_t> {
+ std::pair<const char*, uint64_t> _denomination;
+
+ ByteSizeColumn(const std::string& colName, HtmlTable* table = 0)
+ : ValueColumn<uint64_t>(colName, "", table) {}
+
+ uint64_t& operator[](uint16_t row) { return _values[row]; }
+
+ virtual void finalize() {
+ uint64_t max = 0;
+ for (std::map<uint16_t, uint64_t>::iterator it = _values.begin();
+ it != _values.end(); ++it)
+ {
+ max = std::max(max, it->second);
+ }
+ uint64_t oldMax = max;
+ const char* type = "B";
+ if (max > 10 * 1024) { max /= 1024; type = "kB"; }
+ if (max > 10 * 1024) { max /= 1024; type = "MB"; }
+ if (max > 10 * 1024) { max /= 1024; type = "GB"; }
+ if (max > 10 * 1024) { max /= 1024; type = "TB"; }
+ _denomination = std::pair<const char*, uint64_t>(
+ type, max == 0 ? 1 : oldMax / max);
+ ValueColumn<uint64_t>::finalize();
+ }
+
+ virtual void printValue(std::ostream& out, uint16_t row) {
+ uint64_t value;
+ if (row == TOTAL) {
+ value = getTotalValue();
+ } else {
+ std::map<uint16_t, uint64_t>::iterator val(_values.find(row));
+ if (val == _values.end()) {
+ Column::printValue(out, row);
+ return;
+ }
+ value = val->second;
+ }
+ out << (value / _denomination.second) << ' ' << _denomination.first;
+ }
+};
+
+typedef ValueColumn<int64_t> LongColumn;
+typedef ValueColumn<double> DoubleColumn;
+
+inline void HtmlTable::print(std::ostream& out) {
+ out << "<table border=\"1\" cellpadding=\"2\" cellspacing=\"0\">\n<tr><th";
+ if (!_colHeaders.empty()) out << " rowspan=\"2\"";
+ out << ">" << _rowId << "</th>";
+ if (!_colHeaders.empty()) {
+ for (uint32_t i=0; i<_colHeaders.size(); ++i) {
+ out << "<th colspan=\"" << _colHeaders[i]._span << "\">"
+ << _colHeaders[i]._name << "</th>";
+ }
+ out << "</tr>\n";
+ }
+ for (uint32_t i=0; i<_columns.size(); ++i) {
+ _columns[i]->finalize();
+ out << "<th>" << _columns[i]->_colName << "</th>";
+ }
+ out << "</tr>\n";
+ for (uint32_t i=0; i<_rows.size(); ++i) {
+ out << "<tr><td";
+ Column::printTdColor(out, _rows[i]._backgroundColor);
+ out << ">" << _rows[i]._name << "</td>";
+ for (uint32_t j=0; j<_columns.size(); ++j) {
+ _columns[j]->printElement(out, i);
+ }
+ out << "</tr>\n";
+ }
+ if (_totalRow.get()) {
+ out << "<tr><td>" << *_totalRow << "</td>";
+ for (uint32_t j=0; j<_columns.size(); ++j) {
+ _columns[j]->printElement(out, Column::TOTAL);
+ }
+ out << "</tr>\n";
+ }
+ out << "</table>\n";
+}
+
+} // storage
+
diff --git a/storage/src/vespa/storage/bucketmover/move.cpp b/storage/src/vespa/storage/bucketmover/move.cpp
new file mode 100644
index 00000000000..d570eb48c5e
--- /dev/null
+++ b/storage/src/vespa/storage/bucketmover/move.cpp
@@ -0,0 +1,45 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/bucketmover/move.h>
+
+#include <vespa/log/log.h>
+
+LOG_SETUP(".bucketmover.move");
+
+namespace storage {
+namespace bucketmover {
+
+Move::Move()
+ : _sourceDisk(0),
+ _targetDisk(0),
+ _bucket(0),
+ _totalDocSize(0),
+ _priority(255)
+{
+}
+
+Move::Move(uint16_t source, uint16_t target, const document::BucketId& bucket,
+ uint32_t totalDocSize)
+ : _sourceDisk(source),
+ _targetDisk(target),
+ _bucket(bucket),
+ _totalDocSize(totalDocSize),
+ _priority(255)
+{
+}
+
+void
+Move::print(std::ostream& out, bool verbose, const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ if (!isDefined()) {
+ out << "Move(none)";
+ return;
+ }
+ out << "Move(" << _bucket << ", " << _sourceDisk << " -> " << _targetDisk
+ << ", pri " << (uint16_t) _priority << ")";
+}
+
+} // bucketmover
+} // storage
diff --git a/storage/src/vespa/storage/bucketmover/move.h b/storage/src/vespa/storage/bucketmover/move.h
new file mode 100644
index 00000000000..b032c5f3c51
--- /dev/null
+++ b/storage/src/vespa/storage/bucketmover/move.h
@@ -0,0 +1,44 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::bucketmover::Move
+ * \ingroup bucketmover
+ *
+ * \brief Class representing a bucket move between disks.
+ */
+
+#pragma once
+
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/vespalib/util/printable.h>
+
+namespace storage {
+namespace bucketmover {
+
+class Move : public vespalib::Printable {
+ uint16_t _sourceDisk;
+ uint16_t _targetDisk;
+ document::BucketId _bucket;
+ uint32_t _totalDocSize;
+ uint8_t _priority;
+
+public:
+ Move();
+ Move(uint16_t source, uint16_t target, const document::BucketId& bucket,
+ uint32_t totalDocSize);
+
+ /** False if invalid move. (Empty constructor) Indicates end of run. */
+ bool isDefined() const { return (_bucket.getRawId() != 0); }
+
+ // Only valid to call if move is defined
+ uint16_t getSourceDisk() const { return _sourceDisk; }
+ uint16_t getTargetDisk() const { return _targetDisk; }
+ const document::BucketId& getBucketId() const { return _bucket; }
+ uint8_t getPriority() const { return _priority; }
+ uint32_t getTotalDocSize() const { return _totalDocSize; }
+
+ void print(std::ostream& out, bool verbose, const std::string& indent) const;
+};
+
+} // bucketmover
+} // storage
+
diff --git a/storage/src/vespa/storage/bucketmover/run.cpp b/storage/src/vespa/storage/bucketmover/run.cpp
new file mode 100644
index 00000000000..de5f954d8b9
--- /dev/null
+++ b/storage/src/vespa/storage/bucketmover/run.cpp
@@ -0,0 +1,250 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/bucketmover/run.h>
+
+#include <iomanip>
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+
+LOG_SETUP(".bucketmover.run");
+
+namespace storage {
+namespace bucketmover {
+
+Run::Run(StorBucketDatabase& db,
+ lib::Distribution::SP distribution,
+ const lib::NodeState& nodeState,
+ uint16_t nodeIndex,
+ framework::Clock& clock)
+ : _bucketDatabase(db),
+ _distribution(distribution),
+ _nodeState(nodeState),
+ _nodeIndex(nodeIndex),
+ _entries(),
+ _iterationDone(false),
+ _statistics(distribution->getDiskDistribution(), clock, nodeState),
+ _aborted(false)
+{
+}
+
+namespace {
+ struct BucketIterator {
+ const lib::Distribution& _distribution;
+ const lib::NodeState& _nodeState;
+ RunStatistics& _statistics;
+ std::list<Move>& _entries;
+ uint16_t _nodeIndex;
+ uint32_t _maxBucketsToIterateAtOnce;
+ uint32_t _bucketsVisited;
+ document::BucketId _firstBucket;
+
+ BucketIterator(const lib::Distribution& d, const lib::NodeState& ns,
+ uint16_t nodeIndex, RunStatistics& stats,
+ std::list<Move>& entries)
+ : _distribution(d),
+ _nodeState(ns),
+ _statistics(stats),
+ _entries(entries),
+ _nodeIndex(nodeIndex),
+ _maxBucketsToIterateAtOnce(10000),
+ _bucketsVisited(0),
+ _firstBucket(stats._lastBucketVisited)
+ {
+ }
+
+ StorBucketDatabase::Decision
+ operator()(document::BucketId::Type revId,
+ StorBucketDatabase::Entry& entry)
+ {
+ document::BucketId bucket(document::BucketId::keyToBucketId(revId));
+ if (bucket == _firstBucket) {
+ return StorBucketDatabase::CONTINUE;
+ }
+ uint16_t idealDisk = _distribution.getIdealDisk(
+ _nodeState, _nodeIndex, bucket,
+ lib::Distribution::IDEAL_DISK_EVEN_IF_DOWN);
+ RunStatistics::DiskData& diskData(
+ _statistics._diskData[entry.disk]);
+ bool idealDiskDown(
+ _statistics._diskData[idealDisk]._diskDisabled);
+ if (entry.disk == idealDisk || idealDiskDown) {
+ diskData._bucketSize += entry.getBucketInfo().getTotalDocumentSize();
+ ++diskData._bucketsFoundOnCorrectDisk;
+ } else {
+ _entries.push_back(Move(
+ entry.disk, idealDisk, bucket, entry.getBucketInfo().getTotalDocumentSize()));
+ }
+ _statistics._lastBucketVisited = bucket;
+ if (++_bucketsVisited >= _maxBucketsToIterateAtOnce) {
+ return StorBucketDatabase::ABORT;
+ }
+ return StorBucketDatabase::CONTINUE;
+ }
+ };
+}
+
+Move
+Run::getNextMove()
+{
+ if (_aborted) {
+ LOG(debug, "Run aborted. Returning undefined move.");
+ return Move();
+ }
+ if (_iterationDone) {
+ LOG(debug, "Run completed. End time set. Returning undefined move.");
+ return Move();
+ }
+ while (true) {
+ // Process cached entries until we either found one to move, or
+ // we have no more
+ while (!_entries.empty()) {
+ Move e(_entries.front());
+ _entries.pop_front();
+
+ if (!_statistics._diskData[e.getTargetDisk()]._diskDisabled) {
+ _pending.push_back(e);
+ _statistics._lastBucketProcessed = e.getBucketId();
+ _statistics._lastBucketProcessedTime
+ = _statistics._clock->getTimeInSeconds();
+ return e;
+ }
+ }
+
+ // Cache more entries
+ BucketIterator it(*_distribution, _nodeState, _nodeIndex, _statistics,
+ _entries);
+ _bucketDatabase.all(it, "bucketmover::Run",
+ _statistics._lastBucketVisited.toKey());
+ if (it._bucketsVisited == 0) {
+ _iterationDone = true;
+ if (_pending.empty()) {
+ finalize();
+ }
+ LOG(debug, "Last bucket visited. Done iterating buckets in run.");
+ return Move();
+ }
+ }
+}
+
+void
+Run::depleteMoves()
+{
+ while (true) {
+ // Cache more entries
+ BucketIterator bi(*_distribution, _nodeState, _nodeIndex, _statistics,
+ _entries);
+ _bucketDatabase.all(bi, "bucketmover::depleteMoves",
+ _statistics._lastBucketVisited.toKey());
+ if (bi._bucketsVisited == 0) {
+ break;
+ }
+ for (std::list<Move>::const_iterator it = _entries.begin();
+ it != _entries.end(); ++it)
+ {
+ ++_statistics._diskData[it->getSourceDisk()][it->getTargetDisk()]
+ ._bucketsLeftOnWrongDisk;
+ uint32_t size = it->getTotalDocSize();
+ _statistics._diskData[it->getSourceDisk()]._bucketSize += size;
+ }
+ _entries.clear();
+ }
+ finalize();
+}
+
+void
+Run::finalize()
+{
+ _statistics._endTime = _statistics._clock->getTimeInSeconds();
+}
+
+void
+Run::removePending(Move& move)
+{
+ bool foundPending = false;
+ for (std::list<Move>::iterator it = _pending.begin(); it != _pending.end();
+ ++it)
+ {
+ if (it->getBucketId() == move.getBucketId()) {
+ _pending.erase(it);
+ foundPending = true;
+ break;
+ }
+ }
+ if (!foundPending) {
+ LOG(warning, "Got answer for %s that was not in the pending list.",
+ move.getBucketId().toString().c_str());
+ return;
+ }
+ if (_iterationDone && _pending.empty()) {
+ finalize();
+ }
+}
+
+void
+Run::moveOk(Move& move)
+{
+ ++_statistics._diskData[move.getSourceDisk()][move.getTargetDisk()]
+ ._bucketsMoved;
+ removePending(move);
+ uint32_t size = move.getTotalDocSize();
+
+ _statistics._diskData[move.getSourceDisk()]._bucketSize -= size;
+ _statistics._diskData[move.getTargetDisk()]._bucketSize += size;
+}
+
+void
+Run::moveFailedBucketNotFound(Move& move)
+{
+ ++_statistics._diskData[move.getSourceDisk()][move.getTargetDisk()]
+ ._bucketsNotFoundAtExecutionTime;
+ removePending(move);
+}
+
+void
+Run::moveFailed(Move& move)
+{
+ ++_statistics._diskData[move.getSourceDisk()][move.getTargetDisk()]
+ ._bucketsFailedMoving;
+ _statistics._diskData[move.getTargetDisk()]._diskDisabled = true;
+ removePending(move);
+}
+
+void
+Run::print(std::ostream& out, bool verbose, const std::string& indent) const
+{
+ out << "Run(";
+ if (_aborted) {
+ out << "Aborted";
+ } else if (_statistics._endTime.isSet()) {
+ if (_entries.empty()) {
+ out << "Completed";
+ } else {
+ out << "Iteration done";
+ }
+ }
+ out << ") {\n" << indent << " ";
+ _statistics.print(out, verbose, indent + " ");
+ if (!_entries.empty()) {
+ out << "\n" << indent << " Pending possible moves:";
+ uint32_t i = 0;
+ for (std::list<Move>::const_iterator it = _entries.begin();
+ it != _entries.end() && ++i <= 10; ++it)
+ {
+ out << "\n" << indent << " " << *it;
+ }
+ uint32_t size = _entries.size();
+ if (size > 10) {
+ out << "\n" << indent << " ... and " << (size - 10)
+ << " more.";
+ }
+ }
+ if (!_statistics._endTime.isSet()) {
+ out << "\n" << indent << " Bucket iterator: "
+ << _statistics._lastBucketVisited;
+ }
+ out << "\n" << indent << "}";
+}
+
+} // bucketmover
+} // storage
diff --git a/storage/src/vespa/storage/bucketmover/run.h b/storage/src/vespa/storage/bucketmover/run.h
new file mode 100644
index 00000000000..31f11d74e44
--- /dev/null
+++ b/storage/src/vespa/storage/bucketmover/run.h
@@ -0,0 +1,104 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::bucketmover::Run
+ * \ingroup storageserver
+ *
+ * \brief The run class takes care of creating operations for a single run.
+ *
+ * The run class keeps a matrix that keeps track of up to a max number of
+ * buckets that are located on the wrong disk. The first index of the matrix is
+ * the source disk and the second index is the ideal disk.
+ *
+ * The Run does not care about about pending or wait for that. The caller of
+ * getNextMove() have to regulate max pending for themselevs.
+ */
+
+#pragma once
+
+#include <boost/utility.hpp>
+#include <list>
+#include <map>
+#include <vespa/storage/bucketmover/move.h>
+#include <vespa/storage/bucketmover/runstatistics.h>
+#include <vespa/vespalib/util/linkedptr.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/vdslib/state/nodestate.h>
+
+namespace storage {
+
+class MountPointList;
+class PartitionMonitor;
+class StorBucketDatabase;
+class Clock;
+
+namespace bucketmover {
+
+class Run : public document::Printable, boost::noncopyable {
+ StorBucketDatabase& _bucketDatabase;
+ lib::Distribution::SP _distribution;
+ lib::NodeState _nodeState;
+ uint16_t _nodeIndex;
+ uint32_t _maxEntriesToKeep;
+ std::list<Move> _entries;
+ std::list<Move> _pending;
+ bool _iterationDone;
+ RunStatistics _statistics;
+ bool _aborted;
+ std::map<uint16_t, bool> _diskDisabled;
+
+public:
+ Run(StorBucketDatabase&,
+ lib::Distribution::SP,
+ const lib::NodeState&,
+ uint16_t nodeIndex,
+ framework::Clock&);
+
+ /**
+ * If disk distribution change during runs, they get aborted. We want to
+ * track this in run, as we want run to exist until all pending requests
+ * have been answered.
+ */
+ void abort() { _aborted = true; }
+ bool aborted() { return _aborted; }
+
+ /**
+ * Get next move does the following:
+ * - Sort disks in order of fillrate. (PartitionMonitor keeps cache, so
+ * this will only stat once in a while)
+ * - If the matrix contains a possible move from above average fill rate
+ * to below average fill rate, do that move. Prioritizing moving away
+ * from fullest disk.
+ * - Otherwise, continue visiting bucket database to fill up matrix.
+ * - If any moves left, do next, prioritizing moving away from fullest
+ * disk.
+ *
+ * @return A Move object. If isDefined() returns false, run is complete.
+ * The whole database have been iterated through.
+ */
+ Move getNextMove();
+
+ /**
+ * Run through the database not doing any moves. Useful to do a run only
+ * to gather statistics of current state.
+ */
+ void depleteMoves();
+
+ void moveOk(Move& move);
+ void moveFailedBucketNotFound(Move& move);
+ void moveFailed(Move& move);
+
+ const std::list<Move>& getPendingMoves() const { return _pending; }
+
+ RunStatistics& getStatistics() { return _statistics; }
+ const RunStatistics& getStatistics() const { return _statistics; }
+
+ virtual void print(std::ostream&, bool verbose,
+ const std::string& indent) const;
+private:
+ void removePending(Move&);
+ void finalize();
+};
+
+} // bucketmover
+} // storage
+
diff --git a/storage/src/vespa/storage/bucketmover/runstatistics.cpp b/storage/src/vespa/storage/bucketmover/runstatistics.cpp
new file mode 100644
index 00000000000..a0cd0617f9a
--- /dev/null
+++ b/storage/src/vespa/storage/bucketmover/runstatistics.cpp
@@ -0,0 +1,197 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/bucketmover/runstatistics.h>
+
+#include <iomanip>
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketmover/htmltable.h>
+#include <vespa/vespalib/util/linkedptr.h>
+
+LOG_SETUP(".bucketmover.run.statistics");
+
+namespace storage {
+namespace bucketmover {
+
+RunStatistics::DiskMatrix::DiskMatrix()
+ : _bucketsMoved(0),
+ _bucketsFailedMoving(0),
+ _bucketsLeftOnWrongDisk(0),
+ _bucketsNotFoundAtExecutionTime(0)
+{
+}
+
+RunStatistics::DiskData::DiskData(uint16_t diskCount)
+ : _targetDisks(diskCount),
+ _bucketsFoundOnCorrectDisk(0),
+ _bucketSize(0),
+ _diskDisabled(false)
+{
+}
+
+double
+RunStatistics::DiskData::getWronglyPlacedRatio() const
+{
+ uint64_t wrong = 0;
+ for (uint32_t i=0; i<_targetDisks.size(); ++i) {
+ wrong += _targetDisks[i]._bucketsLeftOnWrongDisk
+ + _targetDisks[i]._bucketsFailedMoving;
+ }
+ uint64_t total = wrong + _bucketsFoundOnCorrectDisk;
+ return static_cast<double>(wrong) / total;
+}
+
+RunStatistics::RunStatistics(DiskDistribution d, framework::Clock& clock,
+ const lib::NodeState& ns)
+ : _clock(&clock),
+ _distribution(d),
+ _lastBucketProcessed(0),
+ _lastBucketVisited(0),
+ _diskData(ns.getDiskCount(), DiskData(ns.getDiskCount())),
+ _startTime(_clock->getTimeInSeconds()),
+ _endTime(0),
+ _lastBucketProcessedTime(0)
+{
+ for (uint32_t i=0; i<ns.getDiskCount(); ++i) {
+ if (!ns.getDiskState(i).getState().oneOf("uis")) {
+ _diskData[i]._diskDisabled = true;
+ }
+ }
+}
+
+void
+RunStatistics::print(std::ostream& out, bool verbose,
+ const std::string& ind) const
+{
+ (void) verbose; (void) ind;
+ bool completed(_endTime.isSet());
+ framework::SecondTime currentTime = _clock->getTimeInSeconds();
+ if (completed) {
+ out << "<h3>Run from " << _startTime << " to " << _endTime;
+ } else {
+ out << "<h3>Run started "
+ << currentTime.getDiff(_startTime).toString(framework::DIFFERENCE)
+ << " ago";
+ }
+ out << " with distribution "
+ << vespa::config::content::StorDistributionConfig::getDiskDistributionName(
+ _distribution)
+ << "</h3>\n<blockquote>";
+ if (!completed) {
+ std::ostringstream progress;
+ progress << std::fixed << std::setprecision(4)
+ << (100.0 * getProgress());
+ out << "<p>Progress: " << progress.str() << " % &nbsp; &nbsp;";
+ if (_lastBucketProcessedTime.isSet()) {
+ out << "<font color=\"gray\" size=\"-1\">Last move for "
+ << _lastBucketProcessed << " "
+ << currentTime.getDiff(_lastBucketProcessedTime)
+ .toString(framework::DIFFERENCE)
+ << " ago</font>";
+ }
+ out << "</p>\n";
+ }
+
+ HtmlTable table("Disk");
+ table.addColumnHeader(completed ? "Buckets in directory after run"
+ : "Processed buckets in directory", 2);
+ LongColumn bucketCount("Count", "", &table);
+ PercentageColumn bucketCountPart("Part", 0, &table);
+
+ table.addColumnHeader(completed
+ ? "Total document size in directory after run"
+ : "Total document size of processed buckets in directory", 2);
+ ByteSizeColumn documentSize("Size", &table);
+ PercentageColumn documentSizePart("Part", 0, &table);
+
+ table.addColumnHeader(completed ? "Buckets on correct disk after run"
+ : "Processed buckets on correct disk", 2);
+ LongColumn bucketsCorrectDisk("Count", "", &table);
+ DoubleColumn bucketsCorrectDiskPart("Part", " %", &table);
+ bucketsCorrectDiskPart.setTotalAsAverage();
+ bucketsCorrectDiskPart.addColorLimit(95, Column::LIGHT_YELLOW);
+ bucketsCorrectDiskPart.addColorLimit(100, Column::LIGHT_GREEN);
+
+ for (uint32_t i=0; i<_diskData.size(); ++i) {
+ table.addRow(i);
+ if (_diskData[i]._diskDisabled) {
+ table.setRowHeaderColor(Column::LIGHT_RED);
+ }
+
+ bucketCount[i] = getBucketCount(i, true);
+ bucketCountPart[i] = bucketCount[i];
+
+ documentSize[i] = _diskData[i]._bucketSize;
+ documentSizePart[i] = documentSize[i];
+
+ bucketsCorrectDisk[i] = getBucketCount(i, false);
+ bucketsCorrectDiskPart[i]
+ = 100.0 * getBucketCount(i, false) / getBucketCount(i, true);
+ }
+ table.addTotalRow("Total");
+ table.print(out);
+
+ MATRIX_PRINT("Buckets left on wrong disk", _bucketsLeftOnWrongDisk, *this);
+ MATRIX_PRINT("Buckets moved", _bucketsMoved, *this);
+ MATRIX_PRINT("Buckets not found at move time",
+ _bucketsNotFoundAtExecutionTime, *this);
+ MATRIX_PRINT("Buckets failed moving for other reasons",
+ _bucketsFailedMoving, *this);
+
+ out << "</blockquote>\n";
+}
+
+double
+RunStatistics::getWronglyPlacedRatio() const
+{
+ uint64_t wrong = 0, total = 0;
+ for (uint32_t i=0; i<_diskData.size(); ++i) {
+ for (uint32_t j=0; j<_diskData.size(); ++j) {
+ wrong += _diskData[i][j]._bucketsLeftOnWrongDisk
+ + _diskData[i][j]._bucketsFailedMoving;
+ }
+ total += _diskData[i]._bucketsFoundOnCorrectDisk;
+ }
+ total += wrong;
+ return static_cast<double>(wrong) / total;
+}
+
+double
+RunStatistics::getProgress() const
+{
+ if (_endTime.isSet()) return 1.0;
+ double result = 0;
+ double weight = 0.5;
+ uint64_t key = _lastBucketProcessed.toKey();
+ for (uint16_t i=0; i<64; ++i) {
+ uint64_t flag = uint64_t(1) << (63 - i);
+ if ((key & flag) == flag) {
+ result += weight;
+ }
+ weight /= 2;
+ }
+ return result;
+}
+
+uint64_t
+RunStatistics::getBucketCount(uint16_t disk, bool includeWrongLocation) const
+{
+ uint64_t total = 0;
+ for (uint32_t i=0; i<_diskData.size(); ++i) {
+ if (disk == i) total += _diskData[i]._bucketsFoundOnCorrectDisk;
+ for (uint32_t j=0; j<_diskData.size(); ++j) {
+ if (disk == i) {
+ if (includeWrongLocation) {
+ total += _diskData[i][j]._bucketsLeftOnWrongDisk;
+ total += _diskData[i][j]._bucketsFailedMoving;
+ }
+ } else if (disk == j) {
+ total += _diskData[i][j]._bucketsMoved;
+ }
+ }
+ }
+ return total;
+}
+
+} // bucketmover
+} // storage
diff --git a/storage/src/vespa/storage/bucketmover/runstatistics.h b/storage/src/vespa/storage/bucketmover/runstatistics.h
new file mode 100644
index 00000000000..161dcacfbe3
--- /dev/null
+++ b/storage/src/vespa/storage/bucketmover/runstatistics.h
@@ -0,0 +1,102 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::bucketmover::RunStatistics
+ * \ingroup bucketmover
+ *
+ * \brief Statistics gathered from a bucket mover cycle.
+ */
+
+#pragma once
+
+#define MATRIX_PRINT(desc, var, rs) \
+{ \
+ bool anyset = false; \
+ for (uint32_t i=0; i<(rs)._diskData.size(); ++i) { \
+ for (uint32_t j=0; j<(rs)._diskData.size(); ++j) { \
+ anyset |= ((rs)._diskData[i][j].var > 0); \
+ } \
+ } \
+ if (anyset) { \
+ out << "<h4>" << desc << "</h4>\n"; \
+ HtmlTable matrixTable("Source \\ Target"); \
+ typedef vespalib::LinkedPtr<LongColumn> LCLP; \
+ std::vector<LCLP> matrixData((rs)._diskData.size()); \
+ for (uint32_t i=0; i<(rs)._diskData.size(); ++i) { \
+ std::ostringstream index; \
+ index << "Disk " << i; \
+ matrixData[i].reset(new LongColumn(index.str(), "", &matrixTable));\
+ matrixTable.addRow(index.str()); \
+ } \
+ for (uint32_t i=0; i<(rs)._diskData.size(); ++i) { \
+ for (uint32_t j=0; j<(rs)._diskData.size(); ++j) { \
+ (*matrixData[j])[i] = (rs)._diskData[i][j].var; \
+ } \
+ } \
+ matrixTable.print(out); \
+ } \
+}
+
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/vespalib/util/printable.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vector>
+#include <vespa/vdslib/state/nodestate.h>
+#include <vespa/config-stor-distribution.h>
+
+namespace storage {
+
+class Clock;
+
+namespace bucketmover {
+
+struct RunStatistics : public document::Printable {
+ typedef vespa::config::content::StorDistributionConfig::DiskDistribution
+ DiskDistribution;
+
+ /** Data kept as targets for moves for each disk. */
+ struct DiskMatrix {
+ uint32_t _bucketsMoved;
+ uint32_t _bucketsFailedMoving;
+ uint32_t _bucketsLeftOnWrongDisk;
+ uint32_t _bucketsNotFoundAtExecutionTime;
+
+ DiskMatrix();
+ };
+
+ /** Data kept per disk. */
+ struct DiskData {
+ std::vector<DiskMatrix> _targetDisks;
+ uint32_t _bucketsFoundOnCorrectDisk;
+ uint64_t _bucketSize;
+ bool _diskDisabled;
+
+ DiskData(uint16_t diskCount);
+
+ DiskMatrix& operator[](uint16_t index) { return _targetDisks[index]; }
+ const DiskMatrix& operator[](uint16_t index) const
+ { return _targetDisks[index]; }
+ double getWronglyPlacedRatio() const;
+ };
+
+ framework::Clock* _clock;
+ DiskDistribution _distribution;
+ document::BucketId _lastBucketProcessed;
+ document::BucketId _lastBucketVisited; // Invalid bucket for starting point
+ std::vector<DiskData> _diskData;
+ framework::SecondTime _startTime;
+ framework::SecondTime _endTime;
+ framework::SecondTime _lastBucketProcessedTime;
+
+ RunStatistics(DiskDistribution, framework::Clock&, const lib::NodeState&);
+
+ double getWronglyPlacedRatio() const;
+ double getProgress() const;
+ uint64_t getBucketCount(uint16_t disk, bool includeWrongLocation) const;
+
+ void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+};
+
+} // bucketmover
+} // storage
+
diff --git a/storage/src/vespa/storage/common/.gitignore b/storage/src/vespa/storage/common/.gitignore
new file mode 100644
index 00000000000..333f254ba10
--- /dev/null
+++ b/storage/src/vespa/storage/common/.gitignore
@@ -0,0 +1,8 @@
+*.So
+*.lo
+.*.swp
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
diff --git a/storage/src/vespa/storage/common/CMakeLists.txt b/storage/src/vespa/storage/common/CMakeLists.txt
new file mode 100644
index 00000000000..e699f055d02
--- /dev/null
+++ b/storage/src/vespa/storage/common/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_common OBJECT
+ SOURCES
+ statusmetricconsumer.cpp
+ storagelink.cpp
+ storagelinkqueued.cpp
+ vtag.cpp
+ bucketoperationlogger.cpp
+ messagebucketid.cpp
+ messagesender.cpp
+ storagecomponent.cpp
+ servicelayercomponent.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/common/bucketmessages.h b/storage/src/vespa/storage/common/bucketmessages.h
new file mode 100644
index 00000000000..81b46642b46
--- /dev/null
+++ b/storage/src/vespa/storage/common/bucketmessages.h
@@ -0,0 +1,477 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/persistence/spi/persistenceprovider.h>
+#include <vespa/storageapi/message/internal.h>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/storageapi/buckets/bucketinfo.h>
+#include <vector>
+#include <set>
+
+namespace storage {
+
+/**
+ * @class ReadBucketList
+ * @ingroup common
+ *
+ * @brief List buckets existing on a partition.
+ */
+class ReadBucketList : public api::InternalCommand {
+ spi::PartitionId _partition;
+
+public:
+ typedef std::unique_ptr<ReadBucketList> UP;
+ static const uint32_t ID = 2003;
+
+ ReadBucketList(spi::PartitionId partition)
+ : api::InternalCommand(ID), _partition(partition)
+ {
+ }
+
+ spi::PartitionId getPartition() const { return _partition; }
+
+ std::unique_ptr<api::StorageReply> makeReply();
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+ {
+ out << "ReadBucketList(" << _partition << ")";
+
+ if (verbose) {
+ out << " : ";
+ InternalCommand::print(out, true, indent);
+ }
+ }
+};
+
+
+/**
+ * @class ReadBucketListReply
+ * @ingroup common
+ */
+class ReadBucketListReply : public api::InternalReply {
+ spi::PartitionId _partition;
+ spi::BucketIdListResult::List _buckets;
+
+public:
+ typedef std::unique_ptr<ReadBucketListReply> UP;
+ typedef std::shared_ptr<ReadBucketListReply> SP;
+ static const uint32_t ID = 2004;
+
+ ReadBucketListReply(const ReadBucketList& cmd)
+ : api::InternalReply(ID, cmd),
+ _partition(cmd.getPartition())
+ {
+ }
+
+ spi::PartitionId getPartition() const { return _partition; }
+
+ spi::BucketIdListResult::List& getBuckets() { return _buckets; }
+ const spi::BucketIdListResult::List& getBuckets() const {
+ return _buckets;
+ }
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+ {
+ out << "ReadBucketListReply(" << _buckets.size() << " buckets)";
+ if (verbose) {
+ out << " : ";
+ InternalReply::print(out, true, indent);
+ }
+ }
+};
+
+inline std::unique_ptr<api::StorageReply> ReadBucketList::makeReply() {
+ return std::unique_ptr<api::StorageReply>(
+ new ReadBucketListReply(*this));
+}
+
+/**
+ * @class ReadBucketInfo
+ * @ingroup common
+ *
+ * @brief Get more detailed information about a set of buckets.
+ *
+ * The distributor wants some information for each bucket, that one
+ * have to open the bucket and read its headers to find. This class is
+ * used to retrieve such information.
+ */
+class ReadBucketInfo : public api::InternalCommand {
+ document::BucketId _bucketId;
+
+public:
+ static const uint32_t ID = 2005;
+
+ ReadBucketInfo(const document::BucketId& bucketId)
+ : api::InternalCommand(ID), _bucketId(bucketId)
+ {
+ }
+
+ document::BucketId getBucketId() const { return _bucketId; }
+ virtual bool hasSingleBucketId() const { return true; }
+
+ std::unique_ptr<api::StorageReply> makeReply();
+
+ virtual void print(std::ostream& out, bool verbose, const std::string& indent) const
+ {
+ out << "ReadBucketInfo(" << _bucketId << ")";
+
+ if (verbose) {
+ out << " : ";
+ InternalCommand::print(out, true, indent);
+ }
+ }
+private:
+ virtual vespalib::string getSummary() const {
+ vespalib::string s("ReadBucketInfo(");
+ s.append(_bucketId.toString());
+ s.append(')');
+ return s;
+ }
+
+};
+
+
+/**
+ * @class ReadBucketInfoReply
+ * @ingroup common
+ */
+class ReadBucketInfoReply : public api::InternalReply {
+ document::BucketId _bucketId;
+
+public:
+ static const uint32_t ID = 2006;
+
+ ReadBucketInfoReply(const ReadBucketInfo& cmd)
+ : api::InternalReply(ID, cmd),
+ _bucketId(cmd.getBucketId())
+ {
+ }
+
+ document::BucketId getBucketId() const { return _bucketId; }
+ virtual bool hasSingleBucketId() const { return true; }
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+ {
+ out << "ReadBucketInfoReply()";
+ if (verbose) {
+ out << " : ";
+ InternalReply::print(out, true, indent);
+ }
+ }
+};
+
+inline std::unique_ptr<api::StorageReply> ReadBucketInfo::makeReply() {
+ return std::unique_ptr<api::StorageReply>(
+ new ReadBucketInfoReply(*this));
+}
+
+
+/**
+ * @class RepairBucketCommand
+ * @ingroup common
+ *
+ * @brief Repair a given bucket (if it contain errors).
+ *
+ * This message is sent continually by the bucket integrity checker.
+ * Errors found are reported back.
+ */
+class RepairBucketCommand : public api::InternalCommand {
+ document::BucketId _bucket;
+ uint16_t _disk;
+ bool _verifyBody; // Optional as it is expensive
+ bool _moveToIdealDisk; // Optional as it is expensive
+
+public:
+ typedef std::unique_ptr<RepairBucketCommand> UP;
+
+ static const uint32_t ID = 2007;
+
+ RepairBucketCommand(const document::BucketId& bucket, uint16_t disk)
+ : api::InternalCommand(ID),
+ _bucket(bucket),
+ _disk(disk),
+ _verifyBody(false),
+ _moveToIdealDisk(false)
+ {
+ setPriority(LOW);
+ }
+
+ virtual bool hasSingleBucketId() const { return true; }
+ document::BucketId getBucketId() const { return _bucket; }
+
+ uint16_t getDisk() const { return _disk; }
+ bool verifyBody() const { return _verifyBody; }
+ bool moveToIdealDisk() const { return _moveToIdealDisk; }
+
+ void setBucketId(const document::BucketId& id) { _bucket = id; }
+ void verifyBody(bool doIt) { _verifyBody = doIt; }
+ void moveToIdealDisk(bool doIt) { _moveToIdealDisk = doIt; }
+
+ std::unique_ptr<api::StorageReply> makeReply();
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+ {
+ out << getSummary();
+ if (verbose) {
+ out << " : ";
+ InternalCommand::print(out, true, indent);
+ }
+ }
+private:
+ virtual vespalib::string getSummary() const {
+ vespalib::asciistream s;
+ s << "ReadBucketInfo(" << _bucket.toString() << ", disk " << _disk
+ << (_verifyBody ? ", verifying body" : "")
+ << (_moveToIdealDisk ? ", moving to ideal disk" : "")
+ << ")";
+ return s.str();
+ }
+};
+
+/**
+ * @class RepairBucketReply
+ * @ingroup common
+ */
+class RepairBucketReply : public api::InternalReply {
+ document::BucketId _bucket;
+ api::BucketInfo _bucketInfo;
+ uint16_t _disk;
+ bool _altered;
+
+public:
+ typedef std::unique_ptr<RepairBucketReply> UP;
+ static const uint32_t ID = 2008;
+
+ RepairBucketReply(const RepairBucketCommand& cmd,
+ const api::BucketInfo& bucketInfo = api::BucketInfo())
+ : api::InternalReply(ID, cmd),
+ _bucket(cmd.getBucketId()),
+ _bucketInfo(bucketInfo),
+ _disk(cmd.getDisk()),
+ _altered(false)
+ {
+ }
+
+ document::BucketId getBucketId() const { return _bucket; }
+ virtual bool hasSingleBucketId() const { return true; }
+
+ const api::BucketInfo& getBucketInfo() const { return _bucketInfo; }
+ uint16_t getDisk() const { return _disk; }
+
+ bool bucketAltered() const { return _altered; }
+ void setAltered(bool altered) { _altered = altered; }
+
+ virtual void print(std::ostream& out, bool verbose, const std::string& indent) const
+ {
+ out << "RepairBucketReply()";
+
+ if (verbose) {
+ out << " : ";
+ InternalReply::print(out, true, indent);
+ }
+ }
+};
+
+inline std::unique_ptr<api::StorageReply> RepairBucketCommand::makeReply() {
+ return std::unique_ptr<api::StorageReply>(
+ new RepairBucketReply(*this));
+}
+
+/**
+ * @class BucketDiskMoveCommand
+ * @ingroup common
+ *
+ * @brief Move a given bucket (from src disk to dst disk).
+ *
+ * This message is sent continually by the bucket mover.
+ * Size of the bucket moved is reported back.
+ */
+class BucketDiskMoveCommand : public api::InternalCommand {
+ document::BucketId _bucket;
+ uint16_t _srcDisk;
+ uint16_t _dstDisk;
+
+public:
+ typedef std::shared_ptr<BucketDiskMoveCommand> SP;
+ static const uint32_t ID = 2012;
+
+ BucketDiskMoveCommand(const document::BucketId& bucket,
+ uint16_t srcDisk, uint16_t dstDisk)
+ : api::InternalCommand(ID),
+ _bucket(bucket),
+ _srcDisk(srcDisk),
+ _dstDisk(dstDisk)
+ {
+ setPriority(LOW);
+ }
+
+ document::BucketId getBucketId() const { return _bucket; }
+ virtual bool hasSingleBucketId() const { return true; }
+
+ uint16_t getSrcDisk() const { return _srcDisk; }
+ uint16_t getDstDisk() const { return _dstDisk; }
+
+ void setBucketId(const document::BucketId& id) { _bucket = id; }
+
+ std::unique_ptr<api::StorageReply> makeReply();
+
+ virtual void print(std::ostream& out, bool, const std::string&) const
+ {
+ out << "BucketDiskMoveCommand(" << _bucket << ", source " << _srcDisk
+ << ", target " << _dstDisk << ")";
+ }
+
+};
+
+/**
+ * @class BucketDiskMoveReply
+ * @ingroup common
+ */
+class BucketDiskMoveReply : public api::InternalReply {
+ document::BucketId _bucket;
+ api::BucketInfo _bucketInfo;
+ uint64_t _fileSizeOnSrc;
+ uint64_t _fileSizeOnDst;
+ uint16_t _srcDisk;
+ uint16_t _dstDisk;
+
+public:
+ typedef std::shared_ptr<BucketDiskMoveReply> SP;
+ static const uint32_t ID = 2013;
+
+ BucketDiskMoveReply(const BucketDiskMoveCommand& cmd,
+ const api::BucketInfo& bucketInfo = api::BucketInfo(),
+ uint32_t sourceFileSize = 0,
+ uint32_t destinationFileSize = 0)
+ : api::InternalReply(ID, cmd),
+ _bucket(cmd.getBucketId()),
+ _bucketInfo(bucketInfo),
+ _fileSizeOnSrc(sourceFileSize),
+ _fileSizeOnDst(destinationFileSize),
+ _srcDisk(cmd.getSrcDisk()),
+ _dstDisk(cmd.getDstDisk())
+ {
+ }
+
+ document::BucketId getBucketId() const { return _bucket; }
+ virtual bool hasSingleBucketId() const { return true; }
+
+ const api::BucketInfo& getBucketInfo() const { return _bucketInfo; }
+ void setFileSizeOnSrc(uint64_t fileSize) { _fileSizeOnSrc = fileSize; }
+ void setFileSizeOnDst(uint64_t fileSize) { _fileSizeOnDst = fileSize; }
+ uint64_t getFileSizeOnSrc() const { return _fileSizeOnSrc; }
+ uint64_t getFileSizeOnDst() const { return _fileSizeOnDst; }
+ uint16_t getSrcDisk() const { return _srcDisk; }
+ uint16_t getDstDisk() const { return _dstDisk; }
+
+ void print(std::ostream& out, bool, const std::string&) const
+ {
+ out << "BucketDiskMoveReply(" << _bucket << ", source " << _srcDisk
+ << ", target " << _dstDisk << ", " << _bucketInfo << ", "
+ << getResult() << ")";
+ }
+};
+
+inline std::unique_ptr<api::StorageReply> BucketDiskMoveCommand::makeReply()
+{
+ return std::unique_ptr<api::StorageReply>(
+ new BucketDiskMoveReply(*this));
+}
+
+/**
+ * @class InternalBucketJoinCommand
+ * @ingroup common
+ *
+ * @brief Joins multiple versions of the same bucket.
+ *
+ * In case disks are reintroduced, we might have several copies of the same
+ * bucket on multiple disks. In such cases we should join these buckets during
+ * initialization as we cannot cope with multiple versions of the same bucket
+ * while storage is running.
+ */
+class InternalBucketJoinCommand : public api::InternalCommand {
+ document::BucketId _bucket;
+ uint16_t _keepOnDisk;
+ uint16_t _joinFromDisk;
+
+public:
+ static const uint32_t ID = 2015;
+
+ InternalBucketJoinCommand(const document::BucketId& bucket,
+ uint16_t keepOnDisk, uint16_t joinFromDisk)
+ : api::InternalCommand(ID),
+ _bucket(bucket),
+ _keepOnDisk(keepOnDisk),
+ _joinFromDisk(joinFromDisk)
+ {
+ setPriority(HIGH); // To not get too many pending of these, prioritize
+ // them higher than getting more bucket info lists.
+ }
+
+ document::BucketId getBucketId() const { return _bucket; }
+ virtual bool hasSingleBucketId() const { return true; }
+
+ uint16_t getDiskOfInstanceToKeep() const { return _keepOnDisk; }
+ uint16_t getDiskOfInstanceToJoin() const { return _joinFromDisk; }
+
+ std::unique_ptr<api::StorageReply> makeReply();
+
+ virtual void print(std::ostream& out, bool verbose, const std::string& indent) const
+ {
+ out << "InternalBucketJoinCommand()";
+
+ if (verbose) {
+ out << " : ";
+ InternalCommand::print(out, true, indent);
+ }
+ }
+};
+
+/**
+ * @class InternalBucketJoinReply
+ * @ingroup common
+ */
+class InternalBucketJoinReply : public api::InternalReply {
+ document::BucketId _bucket;
+ api::BucketInfo _bucketInfo;
+
+public:
+ static const uint32_t ID = 2016;
+
+ InternalBucketJoinReply(const InternalBucketJoinCommand& cmd,
+ const api::BucketInfo& info = api::BucketInfo())
+ : api::InternalReply(ID, cmd),
+ _bucket(cmd.getBucketId()),
+ _bucketInfo(info)
+ {
+ }
+
+ document::BucketId getBucketId() const { return _bucket; }
+ virtual bool hasSingleBucketId() const { return true; }
+
+ const api::BucketInfo& getBucketInfo() const { return _bucketInfo; }
+
+ virtual void print(std::ostream& out, bool verbose, const std::string& indent) const
+ {
+ out << "InternalBucketJoinReply()";
+
+ if (verbose) {
+ out << " : ";
+ InternalReply::print(out, true, indent);
+ }
+ }
+};
+
+inline std::unique_ptr<api::StorageReply>
+InternalBucketJoinCommand::makeReply()
+{
+ return std::unique_ptr<api::StorageReply>(
+ new InternalBucketJoinReply(*this));
+}
+
+} // storage
+
diff --git a/storage/src/vespa/storage/common/bucketoperationlogger.cpp b/storage/src/vespa/storage/common/bucketoperationlogger.cpp
new file mode 100644
index 00000000000..7ec6e2df599
--- /dev/null
+++ b/storage/src/vespa/storage/common/bucketoperationlogger.cpp
@@ -0,0 +1,331 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/common/bucketoperationlogger.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/log/log.h>
+#include <vespa/storageapi/buckets/bucketinfo.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+#include <vespa/vespalib/util/backtrace.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+
+#ifdef ENABLE_BUCKET_OPERATION_LOGGING
+
+LOG_SETUP(".debuglogger");
+
+namespace storage {
+
+namespace debug {
+
+BucketOperationLogger opLogger;
+
+void
+BucketOperationLogger::log(const document::BucketId& id,
+ const vespalib::string& text,
+ bool requireLock,
+ State::LockUpdate lockUpdate)
+{
+ LogEntry entry;
+ framework::defaultimplementation::RealClock rclock;
+ entry._frameCount = vespalib::getStackTraceFrames(entry._stackFrames, MAX_STACK_FRAMES);
+ entry._text = text;
+ entry._timestamp = rclock.getTimeInMicros();
+ entry._threadId = FastOS_Thread::GetCurrentThreadId() & 0xffff;
+ uint32_t lockedByThread = 0;
+ bool hasError = false;
+
+ {
+ vespalib::LockGuard lock(_logLock);
+ BucketMapType::iterator i = _bucketMap.lower_bound(id);
+ if (i != _bucketMap.end() && i->first == id) {
+ if (i->second._history.size() >= MAX_ENTRIES) {
+ i->second._history.pop_front();
+ }
+ i->second._history.push_back(entry);
+ if (lockUpdate == State::BUCKET_LOCKED) {
+ if (i->second._lockedByThread != 0) {
+ LOG(warning, "Attempting to acquire lock, but lock "
+ "is already held by thread %u", i->second._lockedByThread);
+ hasError = true;
+ }
+ i->second._lockedByThread = entry._threadId;
+ }
+ lockedByThread = i->second._lockedByThread;
+ if (lockUpdate == State::BUCKET_UNLOCKED) {
+ if (i->second._lockedByThread == 0) {
+ LOG(warning, "Attempting to release lock, but lock "
+ "is not held");
+ hasError = true;
+ }
+ i->second._lockedByThread = 0;
+ }
+ } else {
+ State addState;
+ addState._lockedByThread = 0;
+ addState._history.push_back(entry);
+ if (lockUpdate == State::BUCKET_LOCKED) {
+ addState._lockedByThread = entry._threadId;
+ } else if (lockUpdate == State::BUCKET_UNLOCKED) {
+ LOG(warning, "Attempting to release lock, but lock "
+ "is not held");
+ hasError = true;
+ }
+ _bucketMap.insert(i, BucketMapType::value_type(id, addState));
+ }
+ }
+
+ if (requireLock && !lockedByThread) {
+ LOG(warning, "Operation '%s' requires lock, but lock is "
+ "not registered as held", text.c_str());
+ hasError = true;
+ }
+ if (hasError) {
+ LOG(warning, "%s", getHistory(id).c_str());
+ }
+}
+
+namespace {
+
+// Must hold logger lock
+template <typename LineHandler>
+void
+processHistory(const BucketOperationLogger& opLogger,
+ const document::BucketId& id, LineHandler& handler)
+{
+ BucketOperationLogger::BucketMapType::const_iterator i(
+ opLogger._bucketMap.find(id));
+ if (i == opLogger._bucketMap.end()) {
+ vespalib::asciistream ss;
+ ss << "No history recorded for bucket '"
+ << id.toString() << "'";
+ handler(ss.str());
+ return;
+ }
+
+ {
+ vespalib::asciistream ss;
+ ss << "Showing last " << i->second._history.size() << " operations on "
+ << "bucket " << id.toString() << " (newest first):";
+ handler(ss.str());
+ }
+ for (BucketOperationLogger::State::LogEntryListType::const_reverse_iterator j(
+ i->second._history.rbegin()), end(i->second._history.rend());
+ j != end; ++j)
+ {
+ vespalib::asciistream ss;
+ ss << storage::framework::getTimeString(
+ j->_timestamp.getTime(),
+ storage::framework::DATETIME_WITH_MICROS)
+ << " " << j->_threadId << " "
+ << j->_text << ". "
+ << vespalib::getStackTrace(1, j->_stackFrames, j->_frameCount);
+ handler(ss.str());
+ }
+}
+
+struct LogWarnAppender
+{
+ void operator()(const vespalib::string& line)
+ {
+ LOG(warning, "%s", line.c_str());
+ }
+};
+
+struct LogStringBuilder
+{
+ vespalib::asciistream ss;
+ void operator()(const vespalib::string& line)
+ {
+ ss << line << "\n";
+ }
+};
+
+}
+
+void
+BucketOperationLogger::dumpHistoryToLog(const document::BucketId& id) const
+{
+ LogWarnAppender handler;
+ vespalib::LockGuard lock(_logLock);
+ processHistory(*this, id, handler);
+}
+
+vespalib::string
+BucketOperationLogger::getHistory(const document::BucketId& id) const
+{
+ LogStringBuilder handler;
+ vespalib::LockGuard lock(_logLock);
+ processHistory(*this, id, handler);
+ return handler.ss.str();
+}
+
+vespalib::string
+BucketOperationLogger::searchBucketHistories(
+ const vespalib::string& sub,
+ const vespalib::string& urlPrefix) const
+{
+ vespalib::asciistream ss;
+ ss << "<ul>\n";
+ // This may block for a while... Assuming such searches run when system
+ // is otherwise idle.
+ vespalib::LockGuard lock(_logLock);
+ for (BucketMapType::const_iterator
+ bIt(_bucketMap.begin()), bEnd(_bucketMap.end());
+ bIt != bEnd; ++bIt)
+ {
+ for (State::LogEntryListType::const_iterator
+ sIt(bIt->second._history.begin()),
+ sEnd(bIt->second._history.end());
+ sIt != sEnd; ++sIt)
+ {
+ if (sIt->_text.find(sub.c_str()) != vespalib::string::npos) {
+ ss << "<li><a href=\"" << urlPrefix
+ << "0x" << vespalib::hex << bIt->first.getId()
+ << vespalib::dec << "\">" << bIt->first.toString()
+ << "</a>:\n";
+ ss << sIt->_text << "</li>\n";
+ }
+ }
+ }
+ ss << "</ul>\n";
+ return ss.str();
+}
+
+BucketOperationLogger&
+BucketOperationLogger::getInstance()
+{
+ return opLogger;
+}
+
+// Storage node
+void logBucketDbInsert(uint64_t key, const bucketdb::StorageBucketInfo& entry)
+{
+ LOG_BUCKET_OPERATION_NO_LOCK(
+ document::BucketId(document::BucketId::keyToBucketId(key)),
+ vespalib::make_vespa_string(
+ "bucketdb insert Bucket(crc=%x, docs=%u, size=%u, "
+ "metacount=%u, usedfilesize=%u, ready=%s, "
+ "active=%s, lastModified=%zu) disk=%u",
+ entry.info.getChecksum(),
+ entry.info.getDocumentCount(),
+ entry.info.getTotalDocumentSize(),
+ entry.info.getMetaCount(),
+ entry.info.getUsedFileSize(),
+ (entry.info.isReady() ? "true" : "false"),
+ (entry.info.isActive() ? "true" : "false"),
+ entry.info.getLastModified(),
+ entry.disk));
+}
+
+void logBucketDbErase(uint64_t key, const TypeTag<bucketdb::StorageBucketInfo>&)
+{
+ LOG_BUCKET_OPERATION_NO_LOCK(
+ document::BucketId(document::BucketId::keyToBucketId(key)),
+ "bucketdb erase");
+}
+
+// Distributor
+void
+checkAllConsistentNodesImpliesTrusted(
+ const document::BucketId& bucket,
+ const distributor::BucketInfo& entry)
+{
+ // If all copies are consistent, they should also be trusted
+ if (entry.validAndConsistent() && entry.getNodeCount() > 1) {
+ for (std::size_t i = 0; i < entry.getNodeCount(); ++i) {
+ const distributor::BucketCopy& copy = entry.getNodeRef(i);
+ if (copy.trusted() == false) {
+ LOG(warning, "Bucket DB entry %s for %s is consistent, but "
+ "contains non-trusted copy %s", entry.toString().c_str(),
+ bucket.toString().c_str(), copy.toString().c_str());
+ DUMP_LOGGED_BUCKET_OPERATIONS(bucket);
+ }
+ }
+ }
+}
+
+std::size_t
+firstTrustedNode(const distributor::BucketInfo& entry)
+{
+ for (std::size_t i = 0; i < entry.getNodeCount(); ++i) {
+ const distributor::BucketCopy& copy = entry.getNodeRef(i);
+ if (copy.trusted()) {
+ return i;
+ }
+ }
+ return std::numeric_limits<std::size_t>::max();
+}
+
+void
+checkNotInSyncImpliesNotTrusted(
+ const document::BucketId& bucket,
+ const distributor::BucketInfo& entry)
+{
+ // If there are copies out of sync, different copies should not
+ // be set to trusted
+ std::size_t trustedNode = firstTrustedNode(entry);
+ if (trustedNode != std::numeric_limits<std::size_t>::max()) {
+ // Ensure all other trusted copies match the metadata of the
+ // first trusted bucket
+ const distributor::BucketCopy& trustedCopy = entry.getNodeRef(trustedNode);
+ for (std::size_t i = 0; i < entry.getNodeCount(); ++i) {
+ if (i == trustedNode) {
+ continue;
+ }
+ const distributor::BucketCopy& copy = entry.getNodeRef(i);
+ const api::BucketInfo& copyInfo = copy.getBucketInfo();
+ const api::BucketInfo& trustedInfo = trustedCopy.getBucketInfo();
+ if (copy.trusted()
+ && ((copyInfo.getChecksum() != trustedInfo.getChecksum())))
+ //|| (copyInfo.getTotalDocumentSize() != trustedInfo.getTotalDocumentSize())))
+ {
+ LOG(warning, "Bucket DB entry %s for %s has trusted node copy "
+ "with differing metadata %s", entry.toString().c_str(),
+ bucket.toString().c_str(), copy.toString().c_str());
+ DUMP_LOGGED_BUCKET_OPERATIONS(bucket);
+ }
+ }
+ }
+}
+
+void
+checkInvalidImpliesNotTrusted(
+ const document::BucketId& bucket,
+ const distributor::BucketInfo& entry)
+{
+ for (std::size_t i = 0; i < entry.getNodeCount(); ++i) {
+ const distributor::BucketCopy& copy = entry.getNodeRef(i);
+ if (!copy.valid() && copy.trusted()) {
+ LOG(warning, "Bucket DB entry %s for %s has invalid copy %s "
+ "marked as trusted", entry.toString().c_str(),
+ bucket.toString().c_str(), copy.toString().c_str());
+ DUMP_LOGGED_BUCKET_OPERATIONS(bucket);
+ }
+ }
+}
+
+void
+logBucketDbInsert(uint64_t key, const distributor::BucketInfo& entry)
+{
+ document::BucketId bucket(document::BucketId::keyToBucketId(key));
+ LOG_BUCKET_OPERATION_NO_LOCK(
+ bucket, vespalib::make_vespa_string(
+ "bucketdb insert of %s", entry.toString().c_str()));
+ // Do some sanity checking of the inserted entry
+ checkAllConsistentNodesImpliesTrusted(bucket, entry);
+ checkNotInSyncImpliesNotTrusted(bucket, entry);
+ checkInvalidImpliesNotTrusted(bucket, entry);
+}
+
+void
+logBucketDbErase(uint64_t key, const TypeTag<distributor::BucketInfo>&)
+{
+ document::BucketId bucket(document::BucketId::keyToBucketId(key));
+ LOG_BUCKET_OPERATION_NO_LOCK(bucket, "bucketdb erase");
+}
+
+} // namespace debug
+
+} // namespace storage
+
+#endif // ENABLE_BUCKET_OPERATION_LOGGING
diff --git a/storage/src/vespa/storage/common/bucketoperationlogger.h b/storage/src/vespa/storage/common/bucketoperationlogger.h
new file mode 100644
index 00000000000..12a07ccf539
--- /dev/null
+++ b/storage/src/vespa/storage/common/bucketoperationlogger.h
@@ -0,0 +1,126 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <string>
+#include <map>
+#include <list>
+#include <vespa/vespalib/stllike/string.h>
+#include <vespa/vespalib/util/sync.h>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/vespalib/util/vstringfmt.h>
+#include <vespa/storageframework/storageframework.h>
+
+/**
+ * Enable this to log most slotfile operations (such as all mutations) as
+ * well as common bucket operations such as splitting, joining and bucket db
+ * updates. Each log entry contains the stack frames for the logging callsite,
+ * a timestamp, the ID of the thread performing the operation as well as a
+ * message. The stack trace is cheaply acquired and does thus not affect runtime
+ * performance to a great degree. Expect some overhead from the logging itself
+ * since it requires a global mutex around the log state.
+ *
+ * All relevant bucket/slotfile operations are checked to ensure that the
+ * filestor lock is held during the operation and that the thread performing
+ * it is the same as the one that acquired the lock.
+ *
+ * Similarly, code has been added to distributor bucket database and ideal
+ * state handling to log these.
+ *
+ * In the case of an invariant violation (such as a locking bug), the last
+ * BUCKET_OPERATION_LOG_ENTRIES log entries will be dumped to the vespalog.
+ * Code may also dump the logged history for a bucket by calling
+ * DUMP_LOGGED_BUCKET_OPERATIONS(bucketid)
+ */
+//#define ENABLE_BUCKET_OPERATION_LOGGING
+#define BUCKET_OPERATION_LOG_ENTRIES 40
+
+#ifdef ENABLE_BUCKET_OPERATION_LOGGING
+#define LOG_BUCKET_OPERATION_NO_LOCK(bucket, string) \
+debug::BucketOperationLogger::getInstance().log( \
+ (bucket), (string), false)
+
+#define LOG_BUCKET_OPERATION(bucket, string) \
+debug::BucketOperationLogger::getInstance().log( \
+ (bucket), (string), true)
+
+#define LOG_BUCKET_OPERATION_SPECIFY_LOCKED(bucket, string, require_locked) \
+debug::BucketOperationLogger::getInstance().log( \
+ (bucket), (string), (require_locked))
+
+#define LOG_BUCKET_OPERATION_SET_LOCK_STATE(bucket, string, require_locked, new_state) \
+debug::BucketOperationLogger::getInstance().log( \
+ (bucket), (string), (require_locked), (new_state))
+
+#define DUMP_LOGGED_BUCKET_OPERATIONS(bucket) \
+ debug::BucketOperationLogger::getInstance().dumpHistoryToLog(bucket)
+
+namespace storage {
+
+// Debug stuff for tracking the last n operations to buckets
+namespace debug {
+
+struct BucketOperationLogger
+{
+ static const std::size_t MAX_ENTRIES = BUCKET_OPERATION_LOG_ENTRIES;
+ static const std::size_t MAX_STACK_FRAMES = 25;
+
+ struct LogEntry
+ {
+ void* _stackFrames[MAX_STACK_FRAMES];
+ vespalib::string _text;
+ framework::MicroSecTime _timestamp;
+ int _frameCount;
+ int32_t _threadId;
+ };
+
+ struct State
+ {
+ typedef std::list<LogEntry> LogEntryListType;
+ enum LockUpdate
+ {
+ NO_UPDATE = 0,
+ BUCKET_LOCKED = 1,
+ BUCKET_UNLOCKED = 2
+ };
+ LogEntryListType _history;
+ uint32_t _lockedByThread;
+ };
+
+ typedef std::map<document::BucketId, State> BucketMapType;
+
+ vespalib::Lock _logLock;
+ BucketMapType _bucketMap;
+
+ void log(const document::BucketId& id,
+ const vespalib::string& text,
+ bool requireLock = true,
+ State::LockUpdate update = State::NO_UPDATE);
+
+ vespalib::string getHistory(const document::BucketId& id) const;
+ void dumpHistoryToLog(const document::BucketId& id) const;
+ //void dumpAllBucketHistoriesToFile(const vespalib::string& filename) const;
+ /**
+ * Search through all bucket history entry descriptions to find substring,
+ * creating a itemized list of buckets containing it as well as a preview.
+ * @param sub the exact substring to search for.
+ * @param urlPrefix the URL used for creating bucket links.
+ */
+ vespalib::string searchBucketHistories(const vespalib::string& sub,
+ const vespalib::string& urlPrefix) const;
+ static BucketOperationLogger& getInstance();
+};
+
+}
+
+}
+
+#else
+
+#define LOG_BUCKET_OPERATION_NO_LOCK(bucket, string)
+#define LOG_BUCKET_OPERATION(bucket, string)
+#define LOG_BUCKET_OPERATION_SPECIFY_LOCKED(bucket, string, require_locked)
+#define DUMP_LOGGED_BUCKET_OPERATIONS(bucket)
+#define LOG_BUCKET_OPERATION_SET_LOCK_STATE(bucket, string, require_locked, new_state)
+
+#endif
+
diff --git a/storage/src/vespa/storage/common/distributorcomponent.h b/storage/src/vespa/storage/common/distributorcomponent.h
new file mode 100644
index 00000000000..586bbf61890
--- /dev/null
+++ b/storage/src/vespa/storage/common/distributorcomponent.h
@@ -0,0 +1,125 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::DistributorComponent
+ * \ingroup common
+ *
+ * \brief Component class including some service layer specific information.
+ */
+
+/**
+ * \class storage::DistributorComponentRegister
+ * \ingroup common
+ *
+ * \brief Specialization of ComponentRegister handling service layer components.
+ */
+
+/**
+ * \class storage::DistributorManagedComponent
+ * \ingroup common
+ *
+ * \brief Specialization of StorageManagedComponent.
+ *
+ * A service layer component register will use this interface in order to set
+ * the service layer functionality parts.
+ */
+
+#pragma once
+
+#include <vespa/storageapi/defs.h>
+#include <vespa/storage/common/storagecomponent.h>
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/storage/distributor/distributorconfiguration.h>
+#include <vespa/storage/config/config-stor-distributormanager.h>
+#include <vespa/storage/config/config-stor-visitordispatcher.h>
+
+namespace storage {
+
+namespace bucketdb {
+ class DistrBucketDatabase;
+}
+namespace lib {
+ class IdealNodeCalculator;
+}
+
+typedef vespa::config::content::core::internal::InternalStorDistributormanagerType DistributorConfig;
+typedef vespa::config::content::core::internal::InternalStorVisitordispatcherType VisitorConfig;
+
+struct UniqueTimeCalculator {
+ virtual ~UniqueTimeCalculator() {}
+ virtual api::Timestamp getUniqueTimestamp() = 0;
+};
+
+struct DistributorManagedComponent
+{
+ virtual ~DistributorManagedComponent() {}
+
+ virtual void setIdealNodeCalculator(lib::IdealNodeCalculator&) = 0;
+ virtual void setTimeCalculator(UniqueTimeCalculator&) = 0;
+ virtual void setBucketDatabase(distributor::BucketDatabase&) = 0;
+ virtual void setDistributorConfig(const DistributorConfig&)= 0;
+ virtual void setVisitorConfig(const VisitorConfig&) = 0;
+};
+
+struct DistributorComponentRegister : public virtual StorageComponentRegister
+{
+ virtual void registerDistributorComponent(
+ DistributorManagedComponent&) = 0;
+};
+
+class DistributorComponent : public StorageComponent,
+ private DistributorManagedComponent
+{
+ lib::IdealNodeCalculator* _idealNodeCalculator;
+ distributor::BucketDatabase* _bucketDatabase;
+ mutable UniqueTimeCalculator* _timeCalculator;
+ DistributorConfig _distributorConfig;
+ VisitorConfig _visitorConfig;
+ distributor::DistributorConfiguration _totalConfig;
+
+ // DistributorManagedComponent implementation
+ virtual void setBucketDatabase(distributor::BucketDatabase& db)
+ { _bucketDatabase = &db; }
+ virtual void setIdealNodeCalculator(lib::IdealNodeCalculator& c)
+ { _idealNodeCalculator = &c; }
+ virtual void setTimeCalculator(UniqueTimeCalculator& utc)
+ { _timeCalculator = &utc; }
+ virtual void setDistributorConfig(const DistributorConfig& c)
+ { _distributorConfig = c; _totalConfig.configure(c); }
+ virtual void setVisitorConfig(const VisitorConfig& c)
+ { _visitorConfig = c; _totalConfig.configure(c); }
+
+public:
+ typedef std::unique_ptr<DistributorComponent> UP;
+
+ DistributorComponent(DistributorComponentRegister& compReg,
+ vespalib::stringref name)
+ : StorageComponent(compReg, name),
+ _bucketDatabase(0), _timeCalculator(0),
+ _totalConfig(*this)
+ {
+ compReg.registerDistributorComponent(*this);
+ }
+
+ api::Timestamp getUniqueTimestamp() const {
+ assert(_timeCalculator); return _timeCalculator->getUniqueTimestamp();
+ }
+ const DistributorConfig& getDistributorConfig() const {
+ return _distributorConfig;
+ }
+ const VisitorConfig& getVisitorConfig() const {
+ return _visitorConfig;
+ }
+ const distributor::DistributorConfiguration&
+ getTotalDistributorConfig() const {
+ return _totalConfig;
+ }
+ distributor::BucketDatabase& getBucketDatabase() {
+ assert(_bucketDatabase); return *_bucketDatabase;
+ }
+ lib::IdealNodeCalculator& getIdealNodeCalculator() const {
+ assert(_idealNodeCalculator); return *_idealNodeCalculator;
+ }
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/common/doneinitializehandler.h b/storage/src/vespa/storage/common/doneinitializehandler.h
new file mode 100644
index 00000000000..f6ee489bb7b
--- /dev/null
+++ b/storage/src/vespa/storage/common/doneinitializehandler.h
@@ -0,0 +1,21 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::DoneInitializeHandler
+ *
+ * \brief Interface for handler needing to know when initializing is done.
+ *
+ * Every type of node will have one component responsible for calling this
+ * handler.
+ */
+
+#pragma once
+
+namespace storage {
+
+struct DoneInitializeHandler {
+ virtual ~DoneInitializeHandler() {}
+ virtual void notifyDoneInitializing() = 0;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/common/hostreporter/CMakeLists.txt b/storage/src/vespa/storage/common/hostreporter/CMakeLists.txt
new file mode 100644
index 00000000000..89aaaa60f10
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/CMakeLists.txt
@@ -0,0 +1,14 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_hostreporter OBJECT
+ SOURCES
+ cpureporter.cpp
+ kernelmetrictool.cpp
+ memreporter.cpp
+ networkreporter.cpp
+ hostinfo.cpp
+ diskreporter.cpp
+ versionreporter.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/common/hostreporter/cpureporter.cpp b/storage/src/vespa/storage/common/hostreporter/cpureporter.cpp
new file mode 100644
index 00000000000..5ed6d560975
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/cpureporter.cpp
@@ -0,0 +1,150 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include "cpureporter.h"
+
+#include <vespa/log/log.h>
+#include <vespa/vespalib/util/linkedptr.h>
+#include <vespa/vespalib/io/fileutil.h>
+#include <vespa/vespalib/stllike/string.h>
+#include <vespa/vespalib/text/stringtokenizer.h>
+#include "kernelmetrictool.h"
+
+#include <boost/array.hpp>
+
+LOG_SETUP(".cpureporter");
+
+namespace storage {
+namespace {
+
+using Object = vespalib::JsonStream::Object;
+using End = vespalib::JsonStream::End;
+using kernelmetrictool::readFile;
+using kernelmetrictool::getLine;
+using kernelmetrictool::getTokenCount;
+using kernelmetrictool::getToken;
+using kernelmetrictool::toLong;
+
+const int proprityLevels = 7;
+const vespalib::string priorityText[proprityLevels] =
+ { "user", "nice", "system", "idle", "iowait", "ira", "softirq"};
+
+struct CpuInfo {
+ int _cpuIndex;
+ boost::array<uint64_t, proprityLevels> _usage;
+ CpuInfo(int index) : _cpuIndex(index) {}
+
+ uint64_t getTotalUsage() const {
+ uint64_t total = 0;
+ for (uint32_t i=0; i<_usage.size(); ++i) total += _usage[i];
+ return total;
+ }
+};
+
+struct CpuReport {
+ std::vector<CpuInfo> _cpuInfo;
+ uint64_t _contextSwitches = 0;
+ int64_t _swappedIn = 0;
+ int64_t _swappedOut = 0;
+ uint64_t _processesCreated = 0;
+ uint64_t _processesBlocked = 0;
+ uint64_t _processesRunning = 0;
+
+ CpuInfo getTotalCpuInfo() const {
+ CpuInfo total(0);
+ for (uint32_t i=0; i < 7; ++i) total._usage[i] = 0;
+ for (uint32_t i=0; i < _cpuInfo.size(); ++i) {
+ for (uint32_t j=0; j < _cpuInfo[i]._usage.size(); ++j) {
+ total._usage[j] += _cpuInfo[i]._usage[j];
+ }
+ }
+ return total;
+ }
+};
+
+long getValueWithLog(
+ const vespalib::string &content,
+ const vespalib::string &lineStart,
+ int pos) {
+ vespalib::string line = getLine(lineStart, content);
+ if (!line.empty()) {
+ return toLong(getToken(pos, line));
+ } else {
+ LOGBP(debug, "Line not found in /proc/stat : '%s'\nLine start: %s",
+ content.c_str(), lineStart.c_str());
+ }
+ return 0;
+}
+
+void populateCpus(const vespalib::string &content, std::vector<CpuInfo> &cpuInfo) {
+ for (uint32_t i=0; true; ++i) {
+ vespalib::string line = getLine("cpu" + std::to_string(i), content);
+ if (line.empty()) break;
+ if (getTokenCount(line) < 8) {
+ LOGBP(warning, "Unexpected line found in /proc/stat. Expected at "
+ "least 8 tokens in cpu line: '%s'", line.c_str());
+ continue;
+ }
+ CpuInfo info(i);
+ for (uint32_t j=0; j<info._usage.size(); ++j) {
+ info._usage[j] = toLong(getToken(j + 1, line));
+ }
+ cpuInfo.push_back(info);
+ }
+}
+
+void populate(CpuReport& cpu) {
+ /*
+ * Parse /proc/stat. Expected format:
+ * cpu 82190434 7180 85600255 12799031291 18183765 36669 458570
+ * cpu0 10564061 448 10381577 1598933932 3065407 36668 206231
+ * cpu1 10763472 763 10191606 1599538223 2655481 0 38988
+ * cpu2 10206570 720 9845299 1600695947 2402795 0 37218
+ * cpu3 10051762 966 9993106 1600750639 2354533 0 37565
+ * cpu4 10176554 961 10818954 1600288785 1871033 0 32228
+ * cpu5 10261736 845 11475459 1599497420 1917617 0 35456
+ * cpu6 10244739 1050 11095848 1599960998 1851423 0 34488
+ * cpu7 9921536 1422 11798403 1599365345 2065473 0 36392
+ * intr 16439148517 3349609784 9 0 6 17 0 0 0 54121 0 0 0 3 0 0 0 204582604 0 0 0 0 0 85 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+ * swap 49238 9161900
+ * ctxt 17421122498
+ * btime 1296732462
+ * processes 83383001
+ * procs_running 1
+ * procs_blocked 0
+ */
+ vespalib::string content(readFile("/proc/stat"));
+
+ populateCpus(content, cpu._cpuInfo);
+ cpu._contextSwitches = getValueWithLog(content, "ctxt", 1);
+ cpu._swappedIn = getValueWithLog(content, "swap", 1);
+ cpu._swappedOut = getValueWithLog(content, "swap", 2);
+ cpu._processesCreated = getValueWithLog(content, "processes", 1);
+ cpu._processesRunning = getValueWithLog(content, "procs_running", 1);
+ cpu._processesBlocked = getValueWithLog(content, "procs_blocked", 1);
+}
+}
+
+void CpuReporter::report(vespalib::JsonStream& jsonreport) {
+ jsonreport << "cpu" << Object();
+ CpuReport current;
+ populate(current);
+ CpuInfo currTotal = current.getTotalCpuInfo();
+
+ jsonreport << "context switches" << current._contextSwitches;
+ jsonreport << "pages swapped in"<< current._swappedIn;
+ jsonreport << "pages swapped out" << current._swappedOut;
+
+
+ for (uint32_t i=0; i<=current._cpuInfo.size(); ++i) {
+ const CpuInfo& post(i == 0 ? currTotal : current._cpuInfo[i-1]);
+ jsonreport << (i == 0 ? "cputotal" : "cpu" + std::to_string(post._cpuIndex))
+ << Object();
+ for (uint32_t j=0; j < proprityLevels; ++j) {
+ double total = post.getTotalUsage();
+ jsonreport << priorityText[j] << (total < 0.00001 ? 0 : total);
+ }
+ jsonreport << End();
+ }
+ jsonreport << End();
+}
+} /* namespace storage */
diff --git a/storage/src/vespa/storage/common/hostreporter/cpureporter.h b/storage/src/vespa/storage/common/hostreporter/cpureporter.h
new file mode 100644
index 00000000000..b5b1914328a
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/cpureporter.h
@@ -0,0 +1,19 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#ifndef STORAGE_SRC_CPP_STORAGE_COMMON_HOSTINFO_CPUREPORTER_H_
+#define STORAGE_SRC_CPP_STORAGE_COMMON_HOSTINFO_CPUREPORTER_H_
+
+#include "hostreporter.h"
+
+namespace storage {
+
+class CpuReporter: public HostReporter {
+public:
+ void report(vespalib::JsonStream& jsonreport) override;
+
+ CpuReporter() {}
+ ~CpuReporter() override {}
+};
+
+} /* namespace storage */
+
+#endif /* STORAGE_SRC_CPP_STORAGE_COMMON_HOSTINFO_CPUREPORTER_H_ */
diff --git a/storage/src/vespa/storage/common/hostreporter/diskreporter.cpp b/storage/src/vespa/storage/common/hostreporter/diskreporter.cpp
new file mode 100644
index 00000000000..23db7137033
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/diskreporter.cpp
@@ -0,0 +1,68 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+
+#include "diskreporter.h"
+#include "kernelmetrictool.h"
+
+#include <vespa/vespalib/text/stringtokenizer.h>
+#include <vespa/vespalib/stllike/string.h>
+#include <iostream>
+
+namespace storage {
+namespace {
+using Object = vespalib::JsonStream::Object;
+using End = vespalib::JsonStream::End;
+using kernelmetrictool::readFile;
+using kernelmetrictool::getLine;
+using kernelmetrictool::getToken;
+using kernelmetrictool::toLong;
+using kernelmetrictool::stripWhitespace;
+}
+
+DiskReporter::DiskReporter() {}
+
+DiskReporter::~DiskReporter() {}
+
+void DiskReporter::report(vespalib::JsonStream& jsonreport) {
+ vespalib::string content = readFile("/proc/diskstats");
+ vespalib::StringTokenizer st(vespalib::StringTokenizer(content.c_str(), "\n", ""));
+ jsonreport << "disk" << Object();
+ for (uint32_t i=2; i<st.size(); ++i) {
+ vespalib::string line(st[i]);
+ /*
+ * The /proc/diskstats file displays the I/O statistics
+ * of block devices.
+ * 0 - major number
+ * 1 - minor mumber
+ * 2 - device name
+ * 3 - reads completed successfully
+ * 4 - reads merged
+ * 5 - sectors read
+ * 6 - time spent reading (ms)
+ * 7 - writes completed
+ * 8 - writes merged
+ * 9 - sectors written
+ * 10 - time spent writing (ms)
+ * 11 - I/Os currently in progress
+ * 12 - time spent doing I/Os (ms)
+ * 13 - weighted time spent doing I/Os (ms)
+ */
+ vespalib::string name = getToken(2, line);
+ if (name.substr(0, 3) == "ram" || name.substr(0, 3) == "dm-"
+ || name.substr(0, 4) == "loop") {
+ continue;
+ }
+ jsonreport << name << Object();
+ jsonreport << "reads merged" << toLong(getToken(4, line));
+ jsonreport << "writes merged" << toLong(getToken(8, line));
+ jsonreport << "reads" << toLong(getToken(3, line));
+ jsonreport << "writes" << toLong(getToken(7, line));
+ jsonreport << "in progress" << toLong(getToken(11, line));
+ jsonreport << "sectors read" << toLong(getToken(5, line));
+ jsonreport << "sectores written" << toLong(getToken(9, line));
+ jsonreport << "time spent" << toLong(getToken(12, line));
+ jsonreport << End();
+ }
+ jsonreport << End();
+}
+} /* namespace storage */
diff --git a/storage/src/vespa/storage/common/hostreporter/diskreporter.h b/storage/src/vespa/storage/common/hostreporter/diskreporter.h
new file mode 100644
index 00000000000..c6840f7b4d9
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/diskreporter.h
@@ -0,0 +1,19 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#ifndef STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_DISKREPORTER_H_
+#define STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_DISKREPORTER_H_
+
+#include "hostreporter.h"
+
+namespace storage {
+
+class DiskReporter: public HostReporter {
+public:
+ DiskReporter();
+ ~DiskReporter() override;
+
+ void report(vespalib::JsonStream& jsonreport) override;
+};
+
+} /* namespace storage */
+
+#endif /* STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_DISKREPORTER_H_ */
diff --git a/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp b/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp
new file mode 100644
index 00000000000..f34b459fc51
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/hostinfo.cpp
@@ -0,0 +1,29 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+
+#include "hostinfo.h"
+#include "hostreporter.h"
+
+namespace storage {
+
+HostInfo::HostInfo() {
+ registerReporter(&cpuReporter);
+ registerReporter(&diskReporter);
+ registerReporter(&memReporter);
+ registerReporter(&networkReporter);
+ registerReporter(&versionReporter);
+}
+
+HostInfo::~HostInfo() {
+}
+
+void HostInfo::printReport(vespalib::JsonStream& report) {
+ for (HostReporter* reporter : customReporters) {
+ reporter->report(report);
+ }
+}
+
+void HostInfo::registerReporter(HostReporter *reporter) {
+ customReporters.push_back(reporter);
+}
+} /* namespace storage */
diff --git a/storage/src/vespa/storage/common/hostreporter/hostinfo.h b/storage/src/vespa/storage/common/hostreporter/hostinfo.h
new file mode 100644
index 00000000000..340ffdc73c7
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/hostinfo.h
@@ -0,0 +1,40 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#ifndef STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_HOSTINFO_H_
+#define STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_HOSTINFO_H_
+
+#include <vespa/vespalib/util/linkedptr.h>
+#include <vespa/vespalib/util/jsonstream.h>
+
+#include "cpureporter.h"
+#include "diskreporter.h"
+#include "memreporter.h"
+#include "networkreporter.h"
+#include "versionreporter.h"
+
+namespace storage {
+
+/**
+ * Reports status about this host. It has a set of default reporters and additional
+ * reporters might be added.
+ */
+class HostInfo {
+public:
+ HostInfo();
+ ~HostInfo();
+ void printReport(vespalib::JsonStream& report);
+
+ // Does not take ownership.
+ void registerReporter(HostReporter* reporter);
+
+private:
+ std::vector<HostReporter*> customReporters;
+ CpuReporter cpuReporter;
+ DiskReporter diskReporter;
+ MemReporter memReporter;
+ NetworkReporter networkReporter;
+ VersionReporter versionReporter;
+};
+
+} /* namespace storage */
+
+#endif /* STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_HOSTINFO_H_ */
diff --git a/storage/src/vespa/storage/common/hostreporter/hostreporter.h b/storage/src/vespa/storage/common/hostreporter/hostreporter.h
new file mode 100644
index 00000000000..ec9c9d24508
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/hostreporter.h
@@ -0,0 +1,17 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/vespalib/util/jsonstream.h>
+
+namespace storage {
+/**
+ * Interface for reporters. Each implementation should add a json entry, e.g. for
+ * cpu it should be named "cpu".
+ */
+class HostReporter {
+public:
+ virtual void report(vespalib::JsonStream& jsonreport) = 0;
+ virtual ~HostReporter() {}
+};
+}
+
diff --git a/storage/src/vespa/storage/common/hostreporter/kernelmetrictool.cpp b/storage/src/vespa/storage/common/hostreporter/kernelmetrictool.cpp
new file mode 100644
index 00000000000..8bf345de9a8
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/kernelmetrictool.cpp
@@ -0,0 +1,75 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+
+#include "kernelmetrictool.h"
+#include <vespa/vespalib/io/fileutil.h>
+#include <vespa/vespalib/text/stringtokenizer.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+#include <cctype>
+
+namespace storage {
+namespace kernelmetrictool {
+
+
+vespalib::string readFile(const char* fileName) {
+ return vespalib::asciistream::createFromDevice(fileName).str();
+}
+
+vespalib::string stripWhitespace(const vespalib::string& s) {
+ vespalib::string::size_type start(0);
+ vespalib::string::size_type stop(s.size() - 1);
+ while (true) {
+ if (start == s.size()) return vespalib::string("");
+ if (!std::isspace(s[start])) break;
+ ++start;
+ }
+ while (true) {
+ if (!std::isspace(s[stop])) break;
+ --stop;
+ }
+ return s.substr(start, stop - start + 1);
+}
+
+vespalib::string getLine(const vespalib::stringref& key,
+ const vespalib::stringref& content)
+{
+ vespalib::string::size_type start(0);
+ vespalib::string::size_type stop(content.find('\n'));
+ while (true) {
+ bool last = (stop == vespalib::string::npos);
+ vespalib::stringref line(content.substr(start, stop - start));
+ for (uint32_t i=0, n=line.size(); i<n; ++i) {
+ if (std::isspace(line[i])) {
+ vespalib::stringref s(line.substr(0, i));
+ if (s == key) return line;
+ }
+ }
+ if (last) break;
+ start = stop + 1;
+ stop = content.find('\n', start);
+ }
+ return "";
+}
+
+vespalib::string getToken(uint32_t index, const vespalib::string& line) {
+ vespalib::StringTokenizer st(line, " \t\n", "");
+ st.removeEmptyTokens();
+ return (index >= st.size() ? "" : st[index]);
+}
+
+uint32_t getTokenCount(const vespalib::string& line) {
+ vespalib::StringTokenizer st(line, " \t\n", "");
+ st.removeEmptyTokens();
+ return st.size();
+}
+
+uint64_t toLong(const vespalib::stringref& s, int base) {
+ char* endptr;
+ uint64_t result(strtoull(s.c_str(), &endptr, base));
+ if ((s.c_str() + s.size()) != endptr) {
+ throw vespalib::IllegalArgumentException("Parsing '" + s + "' as a long.");
+ }
+ return result;
+}
+}
+} /* namespace storage */
diff --git a/storage/src/vespa/storage/common/hostreporter/kernelmetrictool.h b/storage/src/vespa/storage/common/hostreporter/kernelmetrictool.h
new file mode 100644
index 00000000000..4c44f73f07b
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/kernelmetrictool.h
@@ -0,0 +1,30 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/*
+ * This file contains various tools for use by reporters when fetching os information.
+ */
+
+#ifndef STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_KERNELMETRICTOOL_H_
+#define STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_KERNELMETRICTOOL_H_
+
+#include <vespa/vespalib/stllike/string.h>
+
+namespace storage {
+namespace kernelmetrictool {
+
+vespalib::string readFile(const char* fileName);
+
+vespalib::string stripWhitespace(const vespalib::string& s);
+
+vespalib::string getLine(const vespalib::stringref& key,
+ const vespalib::stringref& content);
+
+vespalib::string getToken(uint32_t index, const vespalib::string& line);
+
+uint32_t getTokenCount(const vespalib::string& line);
+
+uint64_t toLong(const vespalib::stringref& s, int base = 0) ;
+
+} /* namespace kernelmetrictool */
+} /* namespace storage */
+
+#endif /* STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_KERNELMETRICTOOL_H_ */
diff --git a/storage/src/vespa/storage/common/hostreporter/memreporter.cpp b/storage/src/vespa/storage/common/hostreporter/memreporter.cpp
new file mode 100644
index 00000000000..862e1c6a8ed
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/memreporter.cpp
@@ -0,0 +1,72 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+
+#include "memreporter.h"
+#include "kernelmetrictool.h"
+
+namespace storage {
+namespace {
+using Object = vespalib::JsonStream::Object;
+using End = vespalib::JsonStream::End;
+using kernelmetrictool::readFile;
+using kernelmetrictool::getLine;
+using kernelmetrictool::getToken;
+using kernelmetrictool::toLong;}
+
+MemReporter::MemReporter() {}
+MemReporter::~MemReporter() {}
+
+void MemReporter::report(vespalib::JsonStream& jsonreport) {
+ /* Parse /proc/meminfo. Expected format
+ * MemTotal: 36969940 kB
+ * MemFree: 13856316 kB
+ * Buffers: 612476 kB
+ * Cached: 18603000 kB
+ * SwapCached: 71064 kB
+ * Active: 13504144 kB
+ * Inactive: 7781768 kB
+ * HighTotal: 0 kB
+ * HighFree: 0 kB
+ * LowTotal: 36969940 kB
+ * LowFree: 13856316 kB
+ * SwapTotal: 33554424 kB
+ * SwapFree: 33465824 kB
+ * Dirty: 1416 kB
+ * Writeback: 0 kB
+ * Mapped: 1225592 kB
+ * Slab: 1669252 kB
+ * CommitLimit: 52039392 kB
+ * Committed_AS: 2337076 kB
+ * PageTables: 12992 kB
+ * VmallocTotal: 536870908 kB
+ * VmallocUsed: 377960 kB
+ * VmallocChunk: 536492708 kB
+ */
+ vespalib::string content = readFile("/proc/meminfo");
+ // Usable RAM - Physical memory minus reserved bits and kernel code
+ uint64_t memTotal = toLong(getToken(1, getLine("MemTotal:", content))) * 1024;
+ // LowFree + HighFree
+ uint64_t memFree = toLong(getToken(1, getLine("MemFree:", content))) * 1024;
+ // Disk data cached in memory
+ uint64_t cached = toLong(getToken(1, getLine("Cached:", content))) * 1024;
+ // Memory used recently.
+ uint64_t active = toLong(getToken(1, getLine("Active:", content))) * 1024;
+ uint64_t inActive = toLong(getToken(1, getLine("Inactive:", content))) * 1024;
+ uint64_t swapTotal = toLong(getToken(1, getLine("SwapTotal:", content))) * 1024;
+ uint64_t swapFree = toLong(getToken(1, getLine("SwapFree:", content))) * 1024;
+ // Bytes that may need to be written to disk soon. Swap or disk.
+ uint64_t dirty = toLong(getToken(1, getLine("Dirty:", content))) * 1024;
+
+ jsonreport << "memory" << Object()
+ << "total memory" << memTotal
+ << "free memory" << memFree
+ << "disk cache" << cached
+ << "active memory" << active
+ << "inactive memory" << inActive
+ << "swap total" << swapTotal
+ << "swap free" << swapFree
+ << "dirty" << dirty
+ << End();
+}
+
+} /* namespace storage */
diff --git a/storage/src/vespa/storage/common/hostreporter/memreporter.h b/storage/src/vespa/storage/common/hostreporter/memreporter.h
new file mode 100644
index 00000000000..983f36be907
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/memreporter.h
@@ -0,0 +1,19 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#ifndef STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_MEMREPORTER_H_
+#define STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_MEMREPORTER_H_
+
+#include "hostreporter.h"
+
+namespace storage {
+
+class MemReporter: public HostReporter {
+public:
+ MemReporter();
+ ~MemReporter() override;
+
+ void report(vespalib::JsonStream& jsonreport) override;
+};
+
+} /* namespace storage */
+
+#endif /* STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_MEMREPORTER_H_ */
diff --git a/storage/src/vespa/storage/common/hostreporter/networkreporter.cpp b/storage/src/vespa/storage/common/hostreporter/networkreporter.cpp
new file mode 100644
index 00000000000..04a10699d0f
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/networkreporter.cpp
@@ -0,0 +1,48 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include "networkreporter.h"
+#include "kernelmetrictool.h"
+#include <vespa/vespalib/text/stringtokenizer.h>
+
+namespace storage {
+namespace {
+using Object = vespalib::JsonStream::Object;
+using End = vespalib::JsonStream::End;
+using kernelmetrictool::readFile;
+using kernelmetrictool::getLine;
+using kernelmetrictool::getToken;
+using kernelmetrictool::toLong;
+using kernelmetrictool::stripWhitespace;
+}
+
+void NetworkReporter::report(vespalib::JsonStream& jsonreport) {
+
+ vespalib::string content = readFile("/proc/net/dev");
+ vespalib::StringTokenizer st(content.c_str(), "\n", "");
+
+ jsonreport << "network" << Object();
+
+ for (uint32_t i=2; i<st.size(); ++i) {
+ vespalib::string line = st[i];
+ vespalib::string::size_type pos = line.find(':');
+ if (pos == vespalib::string::npos) {
+ continue;
+ }
+ jsonreport << stripWhitespace(line.substr(0, pos)) << Object();
+ vespalib::string data(line.substr(pos+1));
+ jsonreport << "input" << Object();
+ jsonreport << "bytes" << toLong(getToken(0, data));
+ jsonreport << "packets" << toLong(getToken(1, data));
+ jsonreport << "errors" << toLong(getToken(2, data));
+ jsonreport << "drops" << toLong(getToken(3, data));
+ jsonreport << End() << "output" << Object();
+ jsonreport << "bytes" << toLong(getToken(8, data));
+ jsonreport << "packets" << toLong(getToken(9, data));
+ jsonreport << "errors" << toLong(getToken(10, data));
+ jsonreport << "drops" << toLong(getToken(11, data));
+ jsonreport << End();
+ jsonreport << End();
+ }
+ jsonreport << End();
+}
+} /* namespace storage */
diff --git a/storage/src/vespa/storage/common/hostreporter/networkreporter.h b/storage/src/vespa/storage/common/hostreporter/networkreporter.h
new file mode 100644
index 00000000000..8a3b855c933
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/networkreporter.h
@@ -0,0 +1,19 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#ifndef STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_NETWORKREPORTER_H_
+#define STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_NETWORKREPORTER_H_
+
+#include "hostreporter.h"
+
+namespace storage {
+
+class NetworkReporter: public HostReporter {
+public:
+ NetworkReporter() {};
+ ~NetworkReporter() override {};
+
+ void report(vespalib::JsonStream& jsonreport) override;
+};
+
+} /* namespace storage */
+
+#endif /* STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_NETWORKREPORTER_H_ */
diff --git a/storage/src/vespa/storage/common/hostreporter/versionreporter.cpp b/storage/src/vespa/storage/common/hostreporter/versionreporter.cpp
new file mode 100644
index 00000000000..110df2c2b12
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/versionreporter.cpp
@@ -0,0 +1,19 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include "versionreporter.h"
+#include <vespa/storage/common/vtag.h>
+#include <vespa/vespalib/component/version.h>
+
+namespace storage {
+
+namespace {
+using Object = vespalib::JsonStream::Object;
+using End = vespalib::JsonStream::End;
+}
+void VersionReporter::report(vespalib::JsonStream& jsonreport) {
+ jsonreport << "vtag" << Object()
+ << "version" << Vtag::currentVersion.toString()
+ << End();
+}
+
+} /* namespace storage */
diff --git a/storage/src/vespa/storage/common/hostreporter/versionreporter.h b/storage/src/vespa/storage/common/hostreporter/versionreporter.h
new file mode 100644
index 00000000000..e80a56f0a47
--- /dev/null
+++ b/storage/src/vespa/storage/common/hostreporter/versionreporter.h
@@ -0,0 +1,20 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#ifndef STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_VERSIONREPORTER_H_
+#define STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_VERSIONREPORTER_H_
+
+#include "hostreporter.h"
+
+namespace storage {
+
+// Reports Vtag.
+class VersionReporter: public HostReporter {
+public:
+ VersionReporter() {}
+ ~VersionReporter() override {}
+
+ void report(vespalib::JsonStream& jsonreport) override;
+};
+
+} /* namespace storage */
+
+#endif /* STORAGE_SRC_CPP_STORAGE_COMMON_HOSTREPORTER_VERSIONREPORTER_H_ */
diff --git a/storage/src/vespa/storage/common/messagebucketid.cpp b/storage/src/vespa/storage/common/messagebucketid.cpp
new file mode 100644
index 00000000000..b3ccc1a1e0b
--- /dev/null
+++ b/storage/src/vespa/storage/common/messagebucketid.cpp
@@ -0,0 +1,100 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/common/messagebucketid.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/removelocation.h>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/storageapi/message/stat.h>
+#include <vespa/storageapi/message/batch.h>
+#include <vespa/storage/common/statusmessages.h>
+#include <vespa/storage/common/bucketmessages.h>
+
+namespace storage {
+
+document::BucketId
+getStorageMessageBucketId(const api::StorageMessage& msg)
+{
+ switch (msg.getType().getId()) {
+ case api::MessageType::GET_ID:
+ return static_cast<const api::GetCommand&>(msg).getBucketId();
+ case api::MessageType::PUT_ID:
+ return static_cast<const api::PutCommand&>(msg).getBucketId();
+ case api::MessageType::UPDATE_ID:
+ return static_cast<const api::UpdateCommand&>(msg).getBucketId();
+ case api::MessageType::REMOVE_ID:
+ return static_cast<const api::RemoveCommand&>(msg).getBucketId();
+ case api::MessageType::REVERT_ID:
+ return static_cast<const api::RevertCommand&>(msg).getBucketId();
+ case api::MessageType::STATBUCKET_ID:
+ return static_cast<const api::StatBucketCommand&>(msg).getBucketId();
+ case api::MessageType::MULTIOPERATION_ID:
+ return static_cast<const api::MultiOperationCommand&>(msg)
+ .getBucketId();
+ case api::MessageType::BATCHPUTREMOVE_ID:
+ return static_cast<const api::BatchPutRemoveCommand&>(msg)
+ .getBucketId();
+ case api::MessageType::REMOVELOCATION_ID:
+ return static_cast<const api::RemoveLocationCommand&>(msg)
+ .getBucketId();
+ case api::MessageType::CREATEBUCKET_ID:
+ return static_cast<const api::CreateBucketCommand&>(msg).getBucketId();
+ case api::MessageType::DELETEBUCKET_ID:
+ return static_cast<const api::DeleteBucketCommand&>(msg).getBucketId();
+ case api::MessageType::MERGEBUCKET_ID:
+ return static_cast<const api::MergeBucketCommand&>(msg).getBucketId();
+ case api::MessageType::GETBUCKETDIFF_ID:
+ return static_cast<const api::GetBucketDiffCommand&>(msg).getBucketId();
+ case api::MessageType::GETBUCKETDIFF_REPLY_ID:
+ return static_cast<const api::GetBucketDiffReply&>(msg).getBucketId();
+ case api::MessageType::APPLYBUCKETDIFF_ID:
+ return static_cast<const api::ApplyBucketDiffCommand&>(msg)
+ .getBucketId();
+ case api::MessageType::APPLYBUCKETDIFF_REPLY_ID:
+ return static_cast<const api::ApplyBucketDiffReply&>(msg).getBucketId();
+
+ case api::MessageType::JOINBUCKETS_ID:
+ return static_cast<const api::JoinBucketsCommand&>(msg).getBucketId();
+ case api::MessageType::SPLITBUCKET_ID:
+ return static_cast<const api::SplitBucketCommand&>(msg).getBucketId();
+ case api::MessageType::SETBUCKETSTATE_ID:
+ return static_cast<const api::SetBucketStateCommand&>(msg).getBucketId();
+
+ case api::MessageType::INTERNAL_ID:
+ switch(static_cast<const api::InternalCommand&>(msg).getType()) {
+ case RequestStatusPage::ID:
+ return document::BucketId();
+ case GetIterCommand::ID:
+ return static_cast<const GetIterCommand&>(msg).getBucketId();
+ case CreateIteratorCommand::ID:
+ return static_cast<const CreateIteratorCommand&>(msg)
+ .getBucketId();
+ case ReadBucketList::ID:
+ return document::BucketId();
+ case ReadBucketInfo::ID:
+ return static_cast<const ReadBucketInfo&>(msg).getBucketId();
+ case RepairBucketCommand::ID:
+ return static_cast<const RepairBucketCommand&>(msg).getBucketId();
+ case BucketDiskMoveCommand::ID:
+ return static_cast<const BucketDiskMoveCommand&>(msg).getBucketId();
+ case InternalBucketJoinCommand::ID:
+ return static_cast<const InternalBucketJoinCommand&>(msg)
+ .getBucketId();
+ case RecheckBucketInfoCommand::ID:
+ return static_cast<const RecheckBucketInfoCommand&>(msg).getBucketId();
+ default:
+ break;
+ }
+ default:
+ break;
+ }
+ throw vespalib::IllegalArgumentException(
+ "Message of type " + msg.toString() + " was not expected. Don't "
+ "know how to calculate bucket this message operates on.",
+ VESPA_STRLOC);
+}
+
+}
diff --git a/storage/src/vespa/storage/common/messagebucketid.h b/storage/src/vespa/storage/common/messagebucketid.h
new file mode 100644
index 00000000000..e4b4343e213
--- /dev/null
+++ b/storage/src/vespa/storage/common/messagebucketid.h
@@ -0,0 +1,22 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/document/bucket/bucketid.h>
+
+namespace storage {
+
+namespace api {
+class StorageMessage;
+}
+
+/**
+ * @return msg's relevant bucket id. May be an internal message.
+ * @throws vespalib::IllegalArgumentException if msg does not
+ * have a bucket id.
+ */
+document::BucketId getStorageMessageBucketId(
+ const api::StorageMessage& msg);
+
+}
+
+
diff --git a/storage/src/vespa/storage/common/messagesender.cpp b/storage/src/vespa/storage/common/messagesender.cpp
new file mode 100644
index 00000000000..77fa024ab13
--- /dev/null
+++ b/storage/src/vespa/storage/common/messagesender.cpp
@@ -0,0 +1,20 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/common/messagesender.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/storageapi/messageapi/storagereply.h>
+#include <vespa/storageapi/messageapi/storagecommand.h>
+
+namespace storage {
+
+void
+MessageSender::send(const std::shared_ptr<api::StorageMessage>& msg)
+{
+ if (msg->getType().isReply()) {
+ sendReply(std::static_pointer_cast<api::StorageReply>(msg));
+ } else {
+ sendCommand(std::static_pointer_cast<api::StorageCommand>(msg));
+ }
+}
+
+}
diff --git a/storage/src/vespa/storage/common/messagesender.h b/storage/src/vespa/storage/common/messagesender.h
new file mode 100644
index 00000000000..025695dff29
--- /dev/null
+++ b/storage/src/vespa/storage/common/messagesender.h
@@ -0,0 +1,44 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::MessageSender
+ * @ingroup common
+ *
+ * @brief Interface to implement for classes which send messages on for others.
+ *
+ * Used for instance by filestor manager. Filestor threads needs to send
+ * messages through the filemanager. The filestor manager thus implements this
+ * interface and gives to the filestor thread.
+ *
+ * @author H�kon Humberset
+ * @date 2006-03-22
+ * @version $Id$
+ */
+
+#pragma once
+
+
+namespace storage {
+namespace api {
+ class StorageCommand;
+ class StorageReply;
+ class StorageMessage;
+}
+
+struct MessageSender {
+ virtual ~MessageSender() {}
+
+ virtual void sendCommand(const std::shared_ptr<api::StorageCommand>&) = 0;
+ virtual void sendReply(const std::shared_ptr<api::StorageReply>&) = 0;
+
+ void send(const std::shared_ptr<api::StorageMessage>&);
+};
+
+struct ChainedMessageSender {
+ virtual ~ChainedMessageSender() {}
+ virtual void sendUp(const std::shared_ptr<api::StorageMessage>&) = 0;
+ virtual void sendDown(const std::shared_ptr<api::StorageMessage>&) = 0;
+};
+
+} // storage
+
+
diff --git a/storage/src/vespa/storage/common/nodestateupdater.h b/storage/src/vespa/storage/common/nodestateupdater.h
new file mode 100644
index 00000000000..31c127d6664
--- /dev/null
+++ b/storage/src/vespa/storage/common/nodestateupdater.h
@@ -0,0 +1,70 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::NodeStateUpdater
+ * @ingroup common
+ *
+ * @brief Interface to implement for handler of state updates
+ *
+ * This component is responsible for keeping the node and system state, and
+ * make it available to all components that want to access it. For thread
+ * safety it returns shared pointers to states, such that state objects
+ * retrieved are still valid after changes.
+ *
+ * If you're using the state so much that copying the shared pointer is too
+ * much, you can instead add yourself as a state listener, and keep your own
+ * copy of the state.
+ *
+ * When you set a new reported state, pending get node state requests will be
+ * answered, so do all your updates in one call.
+ *
+ * This interface exist so the storage server interface is not implementation
+ * dependent, and such that the state updater can be easily faked in tests.
+ *
+ */
+#pragma once
+
+#include <string>
+#include <vespa/vdslib/state/nodestate.h>
+#include <vespa/vdslib/state/clusterstate.h>
+
+namespace storage {
+
+struct StateListener {
+ virtual ~StateListener() {}
+ virtual void handleNewState() = 0;
+};
+
+struct NodeStateUpdater {
+ typedef std::unique_ptr<NodeStateUpdater> UP;
+
+ virtual ~NodeStateUpdater() {}
+
+ virtual lib::NodeState::CSP getReportedNodeState() const = 0;
+ virtual lib::NodeState::CSP getCurrentNodeState() const = 0;
+ virtual lib::ClusterState::CSP getSystemState() const = 0;
+
+ virtual void addStateListener(StateListener&) = 0;
+ virtual void removeStateListener(StateListener&) = 0;
+
+ /**
+ * Multiple components typically request state, changes something and sets
+ * it back. To prevent race conditions here, they should grab this lock
+ * before altering the state.
+ */
+ struct Lock {
+ typedef std::shared_ptr<Lock> SP;
+ virtual ~Lock() {}
+ };
+ virtual Lock::SP grabStateChangeLock() = 0;
+
+ /**
+ * Sets the node state. Remember that other components might be setting
+ * parts of the node state you don't care about. Thus, when you alter the
+ * nodestate, first retrieve it and only change the parts you want to.
+ */
+ virtual void setReportedNodeState(const lib::NodeState& state) = 0;
+};
+
+} // storage
+
+
diff --git a/storage/src/vespa/storage/common/servicelayercomponent.cpp b/storage/src/vespa/storage/common/servicelayercomponent.cpp
new file mode 100644
index 00000000000..56977c029e8
--- /dev/null
+++ b/storage/src/vespa/storage/common/servicelayercomponent.cpp
@@ -0,0 +1,26 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "servicelayercomponent.h"
+
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/vdslib/distribution/distribution.h>
+
+namespace storage {
+
+uint16_t
+ServiceLayerComponent::getIdealPartition(const document::BucketId& bucket) const
+{
+ return getDistribution()->getIdealDisk(
+ *getStateUpdater().getReportedNodeState(), getIndex(), bucket,
+ lib::Distribution::IDEAL_DISK_EVEN_IF_DOWN);
+}
+
+uint16_t
+ServiceLayerComponent::getPreferredAvailablePartition(
+ const document::BucketId& bucket) const
+{
+ return getDistribution()->getPreferredAvailableDisk(
+ *getStateUpdater().getReportedNodeState(), getIndex(), bucket);
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/common/servicelayercomponent.h b/storage/src/vespa/storage/common/servicelayercomponent.h
new file mode 100644
index 00000000000..8feccdbf954
--- /dev/null
+++ b/storage/src/vespa/storage/common/servicelayercomponent.h
@@ -0,0 +1,93 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::ServiceLayerComponent
+ * \ingroup common
+ *
+ * \brief Component class including some service layer specific information.
+ */
+
+/**
+ * \class storage::ServiceLayerComponentRegister
+ * \ingroup common
+ *
+ * \brief Specialization of ComponentRegister handling service layer components.
+ */
+
+/**
+ * \class storage::ServiceLayerManagedComponent
+ * \ingroup common
+ *
+ * \brief Specialization of StorageManagedComponent.
+ *
+ * A service layer component register will use this interface in order to set
+ * the service layer functionality parts.
+ */
+
+#pragma once
+
+#include <vespa/storage/common/storagecomponent.h>
+
+namespace storage {
+
+class MinimumUsedBitsTracker;
+class StorBucketDatabase;
+
+struct ServiceLayerManagedComponent
+{
+ virtual ~ServiceLayerManagedComponent() {}
+
+ virtual void setDiskCount(uint16_t count) = 0;
+ virtual void setBucketDatabase(StorBucketDatabase&) = 0;
+ virtual void setMinUsedBitsTracker(MinimumUsedBitsTracker&) = 0;
+};
+
+struct ServiceLayerComponentRegister : public virtual StorageComponentRegister
+{
+ virtual void registerServiceLayerComponent(
+ ServiceLayerManagedComponent&) = 0;
+};
+
+class ServiceLayerComponent : public StorageComponent,
+ private ServiceLayerManagedComponent
+{
+ uint16_t _diskCount;
+ StorBucketDatabase* _bucketDatabase;
+ MinimumUsedBitsTracker* _minUsedBitsTracker;
+
+ // ServiceLayerManagedComponent implementation
+ virtual void setDiskCount(uint16_t count) { _diskCount = count; }
+ virtual void setBucketDatabase(StorBucketDatabase& db)
+ { _bucketDatabase = &db; }
+ virtual void setMinUsedBitsTracker(MinimumUsedBitsTracker& tracker) {
+ _minUsedBitsTracker = &tracker;
+ }
+public:
+ typedef std::unique_ptr<ServiceLayerComponent> UP;
+
+ ServiceLayerComponent(ServiceLayerComponentRegister& compReg,
+ vespalib::stringref name)
+ : StorageComponent(compReg, name),
+ _diskCount(0),
+ _bucketDatabase(0),
+ _minUsedBitsTracker(0)
+ {
+ compReg.registerServiceLayerComponent(*this);
+ }
+
+ uint16_t getDiskCount() const { return _diskCount; }
+ StorBucketDatabase& getBucketDatabase() const
+ { assert(_bucketDatabase != 0); return *_bucketDatabase; }
+ MinimumUsedBitsTracker& getMinUsedBitsTracker() {
+ assert(_minUsedBitsTracker != 0);
+ return *_minUsedBitsTracker;
+ }
+ const MinimumUsedBitsTracker& getMinUsedBitsTracker() const {
+ assert(_minUsedBitsTracker != 0);
+ return *_minUsedBitsTracker;
+ }
+ uint16_t getIdealPartition(const document::BucketId&) const;
+ uint16_t getPreferredAvailablePartition(const document::BucketId&) const;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/common/statusmessages.h b/storage/src/vespa/storage/common/statusmessages.h
new file mode 100644
index 00000000000..2c138349892
--- /dev/null
+++ b/storage/src/vespa/storage/common/statusmessages.h
@@ -0,0 +1,99 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * Internal command used by visitor and filestor framework to gather partial
+ * status from message processing threads.
+ */
+
+#pragma once
+
+#include <vespa/storageapi/message/internal.h>
+#include <vespa/storageframework/storageframework.h>
+
+namespace storage {
+
+/**
+ * @class RequestStatusPage
+ * @ingroup visiting
+ *
+ * @brief Used to retrieve status page from threads.
+ */
+class RequestStatusPage : public api::InternalCommand {
+ framework::HttpUrlPath _path;
+ std::string _sortToken; // Used if sending multiple messages, to set order
+ // in which results should be sorted on status page.
+ // (Used by filestor threads)
+public:
+ static const uint32_t ID = 2100;
+
+ RequestStatusPage(const framework::HttpUrlPath& path)
+ : api::InternalCommand(ID),
+ _path(path),
+ _sortToken() {}
+
+ const std::string& getSortToken() const { return _sortToken; }
+ void setSortToken(const std::string& token) { _sortToken = token; }
+
+ std::unique_ptr<api::StorageReply> makeReply();
+
+ const framework::HttpUrlPath& getPath() const { return _path; }
+
+ virtual void print(std::ostream& out, bool verbose, const std::string& indent) const
+ {
+ out << "RequestStatusPage()";
+
+ if (verbose) {
+ out << " : ";
+ InternalCommand::print(out, true, indent);
+ }
+ }
+};
+
+/**
+ * @class RequestStatusPageReply
+ * @ingroup visiting
+ */
+class RequestStatusPageReply : public api::InternalReply {
+ std::string _status;
+ std::string _sortToken;
+public:
+ static const uint32_t ID = 2101;
+
+ RequestStatusPageReply(const RequestStatusPage& cmd,
+ const std::string& status)
+ : api::InternalReply(ID, cmd),
+ _status(status),
+ _sortToken(cmd.getSortToken())
+ {
+ }
+
+ const std::string& getStatus() const { return _status; }
+ const std::string& getSortToken() const { return _sortToken; }
+
+ virtual void print(std::ostream& out, bool verbose, const std::string& indent) const
+ {
+ out << "RequestStatusPageReply()";
+
+ if (verbose) {
+ out << " : ";
+ InternalReply::print(out, true, indent);
+ }
+ }
+};
+
+inline std::unique_ptr<api::StorageReply>
+RequestStatusPage::makeReply()
+{
+ return std::unique_ptr<api::StorageReply>(
+ new RequestStatusPageReply(*this, ""));
+}
+
+struct StatusReqSorter {
+ bool operator()(const std::shared_ptr<RequestStatusPageReply>& a,
+ const std::shared_ptr<RequestStatusPageReply>& b)
+ {
+ return (a->getSortToken() < b->getSortToken());
+ }
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/common/statusmetricconsumer.cpp b/storage/src/vespa/storage/common/statusmetricconsumer.cpp
new file mode 100644
index 00000000000..4f94fbb0be8
--- /dev/null
+++ b/storage/src/vespa/storage/common/statusmetricconsumer.cpp
@@ -0,0 +1,1081 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/common/statusmetricconsumer.h>
+
+#include <boost/assign.hpp>
+#include <boost/lexical_cast.hpp>
+#include <vespa/log/log.h>
+#include <vespa/metrics/printutils.h>
+#include <vespa/metrics/jsonwriter.h>
+#include <vespa/metrics/textwriter.h>
+#include <vespa/metrics/xmlwriter.h>
+#include <vespa/storage/storageserver/storagemetricsset.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/vespalib/text/stringtokenizer.h>
+ // For setlocale to print . separated numbers
+#include <locale.h>
+#include <stdio.h>
+
+LOG_SETUP(".status.metricreporter");
+
+namespace storage {
+
+StatusMetricConsumer::StatusMetricConsumer(
+ StorageComponentRegister& compReg, metrics::MetricManager& manager,
+ const std::string& name)
+ : framework::StatusReporter("metrics", "Performance metrics"),
+ _manager(manager),
+ _component(compReg, "statusmetricsconsumer"),
+ _name(name),
+ _startTime(_component.getClock().getTimeInSeconds()),
+ _processedTime(0),
+ _metricMemoryToken()
+{
+ const framework::MemoryAllocationType& allocType(
+ _component.getMemoryManager().registerAllocationType(
+ framework::MemoryAllocationType(
+ "METRICS", framework::MemoryAllocationType::FORCE_ALLOCATE)
+ ));
+ _metricMemoryToken = _component.getMemoryManager().allocate(
+ allocType, 0, 0, api::StorageMessage::HIGH);
+ assert(_metricMemoryToken.get() != 0);
+ LOG(debug, "Started metrics consumer");
+ setlocale(LC_NUMERIC, "");
+ _component.registerMetricUpdateHook(*this, framework::SecondTime(3600));
+ _component.registerStatusPage(*this);
+}
+
+StatusMetricConsumer::~StatusMetricConsumer()
+{
+}
+
+void
+StatusMetricConsumer::updateMetrics(const MetricLockGuard & guard)
+{
+ metrics::MemoryConsumption::UP mc(_manager.getMemoryConsumption(guard));
+ uint32_t usage = mc->getTotalMemoryUsage();
+ _metricMemoryToken->resize(usage, usage);
+}
+
+vespalib::string
+StatusMetricConsumer::getReportContentType(
+ const framework::HttpUrlPath& path) const
+{
+ if (!path.hasAttribute("format")) {
+ return "text/html";
+ }
+
+ if (path.getAttribute("format") == "xml") {
+ return "application/xml";
+ }
+
+ if (path.getAttribute("format") == "text") {
+ return "text/plain";
+ }
+
+ if (path.getAttribute("format") == "json") {
+ return "application/json";
+ }
+
+ return "text/html";
+}
+
+namespace {
+ void addSnapshotAsTableRow(const metrics::MetricSnapshot& snapshot,
+ const metrics::MetricSnapshotSet* building,
+ std::ostream& out,
+ int32_t interval = -3)
+ {
+ if (interval == -3) interval = snapshot.getPeriod();
+ std::string name(snapshot.getName());
+ if (interval == -1) {
+ name = "Clone of total metrics with active metrics added";
+ }
+ std::vector<char> buffer(40);
+ //FastOS_Time::GMT_timestr(&buffer[0], data.first)
+ out << " <tr>\n"
+ << " <td>" << name << "</td>\n";
+ //if (snapshot.getToTime() != 0 || interval < 0 || building != 0)
+ for (uint32_t format=0; format<4; ++format) {
+ // 0 XML - 1 HTML - 2 text, 3 JSON
+ out << " <td>";
+ bool linked = false;
+ for (uint32_t tmp=0; tmp<2; ++tmp) { // 0 last - 1 temp
+ if (tmp == 1) {
+ if (building == 0 || !building->hasTemporarySnapshot()
+ || building->getBuilderCount() == 0)
+ {
+ continue;
+ }
+ } else {
+ if ((snapshot.getToTime() == 0 && interval >= 0)
+ || snapshot.getToTime() == snapshot.getFromTime())
+ {
+ continue;
+ }
+ }
+ if (tmp == 1) out << "&nbsp;&nbsp;";
+
+ const char* formatStr = "xml";
+ const char* consumer = "status";
+ switch (format) {
+ case 0:
+ formatStr = "xml";
+ break;
+ case 1:
+ formatStr = "html";
+ break;
+ case 2:
+ formatStr = "text";
+ break;
+ case 3:
+ formatStr = "json";
+ consumer = "yamas";
+ break;
+ }
+
+ linked = true;
+ out << "<a href=\""
+ << "?interval=" << interval
+ << "&format=" << formatStr
+ << "&consumer=" << consumer
+ << "&verbosity=0"
+ << "&pattern=.*"
+ << "&callsnapshothooks=0"
+ << "&tmpsnapshot=" << tmp
+ << "\">"
+ << (tmp == 0 ? (interval > 0 ? "Last complete"
+ : "Current")
+ : "Building")
+ << "</a>";
+ }
+ if (!linked) {
+ out << "None taken yet";
+ }
+ out << "</td>\n";
+ }
+ out << "</tr>\n";
+ }
+
+}
+
+bool
+StatusMetricConsumer::reportStatus(std::ostream& out,
+ const framework::HttpUrlPath& path) const
+{
+ // Update metrics unless 'dontcallupdatehooks' is 1. Update
+ // snapshot metrics too, if callsnapshothooks is set to 1.
+ if (path.get("dontcallupdatehooks", 0) == 0) {
+ bool updateSnapshotHooks = path.get("callsnapshothooks", 0) == 1;
+ LOG(debug, "Updating metrics ahead of status page view%s",
+ updateSnapshotHooks ? ", calling snapshot hooks too" : ".");
+ _manager.updateMetrics(updateSnapshotHooks);
+ } else {
+ LOG(debug, "Not calling update hooks as dontcallupdatehooks option "
+ "has been given");
+ }
+ framework::SecondTime currentTime(_component.getClock().getTimeInSeconds());
+ bool html = (!path.hasAttribute("format")
+ || path.getAttribute("format") == "html");
+ bool xml = (!html && path.getAttribute("format") == "xml");
+ bool json = (!html && path.getAttribute("format") == "json");
+
+ int verbosity(path.get("verbosity", 0));
+ // We have to copy unset values if using HTML as HTML version gathers
+ // metrics for calculations and thus needs unset values.
+ bool copyUnset = (html || verbosity >= 2);
+ bool temporarySnap = (path.get("tmpsnapshot", 0) == 1);
+
+ framework::PartlyHtmlStatusReporter htmlReporter(*this);
+ if (html) {
+ htmlReporter.reportHtmlHeader(out, path);
+ }
+
+ if (path.hasAttribute("task") && path.getAttribute("task") == "reset") {
+ {
+ vespalib::MonitorGuard sync(_waiter);
+ _manager.reset(currentTime.getTime());
+ }
+ if (html) {
+ out << "<p>Metrics reset at " << currentTime << ".</p>\n";
+ }
+ }
+
+ if (html) {
+ out << "<p><div align=\"right\"><a href=\"?task=reset\">"
+ << "Reset all metrics</a></div></p>\n"
+ << "<p>Metrics available at " << framework::SecondTime(currentTime)
+ << "</p>\n"
+ << "<table border=\"1\" cellspacing=\"3\">\n"
+ << " <tr><td>Display metrics from</td>\n"
+ << " <td>XML raw data</td>\n"
+ << " <td>HTML presentation</td>\n"
+ << " <td>Text output</td>\n"
+ << " <td>JSON output</td></tr>\n";
+
+ metrics::MetricLockGuard metricLock(_manager.getMetricLock());
+ std::vector<uint32_t> intervals(
+ _manager.getSnapshotPeriods(metricLock));
+ addSnapshotAsTableRow(
+ _manager.getActiveMetrics(metricLock), 0, out, -2);
+ for (uint32_t i=0; i<intervals.size(); ++i) {
+ addSnapshotAsTableRow(
+ _manager.getMetricSnapshot(metricLock, intervals[i]),
+ &_manager.getMetricSnapshotSet(metricLock, intervals[i]),
+ out);
+ }
+ addSnapshotAsTableRow(
+ _manager.getTotalMetricSnapshot(metricLock), 0, out);
+ addSnapshotAsTableRow(
+ _manager.getTotalMetricSnapshot(metricLock), 0, out, -1);
+ out << "</table>\n";
+
+ out << "<h3>Metrics explanation</h3>\n"
+ << "<p>\n"
+ << "The active metrics are currently being updated. The snapshots "
+ << "update at given time intervals, such that you can always view "
+ << "activity for a full time window. A total snapshot is also "
+ << "available. It is updated each time the shortest interval "
+ << "snapshot is updated. In addition it is possible to view a "
+ << "truly total metric set for all metrics since start, but this "
+ << "is a bit more expensive as we need to take a copy of the total "
+ << "snapshot to add the active metrics to it.\n"
+ << "</p><p>\n"
+ << "The XML view has a verbosity option that can be adjusted to "
+ << "control the amount of detail shown. With the default verbosity "
+ << "only the critical parts are shown. Currently, verbosity 1 adds "
+ << "descriptions. Verbosity 2 adds unused metrics and total values "
+ << "for value metrics. Verbosity 3 add tags.\n"
+ << "</p><p>\n"
+ << "The Text view uses a pattern that has a regular expression. "
+ << "Only metrics whose name path matches the regular expression "
+ << "will be shown.\n"
+ << "</p><p>\n"
+ << "Both XML, json and text view use consumer identifiers to detect "
+ << "what metrics to show. The default status consumer hides "
+ << "metrics that are part of sums and shows everything else. Use an "
+ << "empty consumer string to see all metrics. The 'log' consumer "
+ << "can be used to see metrics logged. JSON uses the yamas consumer as "
+ << "default.\n"
+ << "</p>\n";
+ }
+
+ if (path.hasAttribute("interval")) {
+ // Grab the snapshot we want to view more of
+ int32_t interval(
+ boost::lexical_cast<int32_t>(path.getAttribute("interval")));
+ metrics::MetricLockGuard metricLock(_manager.getMetricLock());
+ std::unique_ptr<metrics::MetricSnapshot> generated;
+ const metrics::MetricSnapshot* snapshot;
+ if (interval == -2) {
+ snapshot = &_manager.getActiveMetrics(metricLock);
+ _manager.getActiveMetrics(metricLock).setToTime(
+ currentTime.getTime());
+ } else if (interval == -1) {
+ // "Prime" the metric structure by first fetching the set of active
+ // metrics (complete with structure) and resetting these. This
+ // leaves us with an empty metrics set to which we can (in order)
+ // add the total and the active metrics. If this is not done, non-
+ // written metrics won't be included even if copyUnset is true.
+ generated.reset(new metrics::MetricSnapshot(
+ "Total metrics from start until current time", 0,
+ _manager.getActiveMetrics(metricLock).getMetrics(),
+ copyUnset));
+ generated->reset(0);
+ _manager.getTotalMetricSnapshot(metricLock).addToSnapshot(
+ *generated, currentTime.getTime());
+ _manager.getActiveMetrics(metricLock).addToSnapshot(
+ *generated, currentTime.getTime());
+ generated->setFromTime(
+ _manager.getTotalMetricSnapshot(metricLock).getFromTime());
+ snapshot = generated.get();
+ } else if (interval == 0) {
+ if (copyUnset) {
+ generated.reset(new metrics::MetricSnapshot(
+ _manager.getTotalMetricSnapshot(metricLock).getName(),
+ 0,
+ _manager.getActiveMetrics(metricLock).getMetrics(),
+ true));
+ generated->reset(0);
+ _manager.getTotalMetricSnapshot(metricLock).addToSnapshot(
+ *generated, currentTime.getTime());
+ snapshot = generated.get();
+ } else {
+ snapshot = &_manager.getTotalMetricSnapshot(metricLock);
+ }
+ } else {
+ if (copyUnset) {
+ generated.reset(new metrics::MetricSnapshot(
+ _manager.getMetricSnapshot(metricLock, interval)
+ .getName(), 0,
+ _manager.getActiveMetrics(metricLock).getMetrics(),
+ true));
+ generated->reset(0);
+ _manager.getMetricSnapshot(metricLock, interval, temporarySnap)
+ .addToSnapshot(*generated, currentTime.getTime());
+ snapshot = generated.get();
+ } else {
+ snapshot = &_manager.getMetricSnapshot(
+ metricLock, interval, temporarySnap);
+ }
+ }
+
+ std::string consumer = path.getAttribute("consumer", "");
+ if (html) {
+ printHtmlMetricsReport(out, *snapshot,
+ path.hasAttribute("includenotused"));
+ } else if (xml) {
+ out << "<?xml version=\"1.0\"?>\n";
+ vespalib::XmlOutputStream xos(out);
+ metrics::XmlWriter xmlWriter(xos, snapshot->getPeriod(), verbosity);
+ _manager.visit(metricLock, *snapshot, xmlWriter, consumer);
+ out << "\n";
+ } else if (json) {
+ vespalib::asciistream jsonStreamData;
+ vespalib::JsonStream stream(jsonStreamData, true);
+ stream << Object() << "metrics";
+ metrics::JsonWriter metricJsonWriter(stream);
+ _manager.visit(metricLock, *snapshot, metricJsonWriter, consumer);
+ stream << End();
+ stream.finalize();
+ out << jsonStreamData.str();
+ } else {
+ std::string pattern = path.getAttribute("pattern", ".*");
+ metrics::TextWriter textWriter(out, snapshot->getPeriod(),
+ pattern, verbosity > 0);
+ _manager.visit(metricLock, *snapshot, textWriter, consumer);
+ }
+ }
+ if (html) {
+ htmlReporter.reportHtmlFooter(out, path);
+ }
+ return true;
+}
+
+void
+StatusMetricConsumer::waitUntilTimeProcessed(framework::SecondTime t) const
+{
+ return; // Return straight away as thread is not running now.
+ // This is used in unit testing to wait for internal thread to have
+ // generated snapshots. Wait aggressively and signal other thread to
+ // make it do it quick (as it uses fake timer)
+ vespalib::MonitorGuard sync(_waiter);
+ while (_processedTime < t) {
+ sync.signal();
+ sync.wait(1);
+ }
+}
+
+void
+StatusMetricConsumer::writeXmlTags(std::ostream& out,
+ const vespalib::StringTokenizer& name,
+ std::vector<std::string>& xmlTags) const
+{
+ // Find how many common elements exist.
+ uint32_t equalUpToIndex = 0;
+ if (name.size() != 0) {
+ uint32_t n = std::min((size_t)name.size() - 1, xmlTags.size());
+ while (equalUpToIndex < n &&
+ name[equalUpToIndex] == xmlTags[equalUpToIndex])
+ {
+ ++equalUpToIndex;
+ }
+ }
+
+ // End old tags that aren't common.
+ for (uint32_t i=0, n=xmlTags.size() - equalUpToIndex; i<n; ++i) {
+ for (uint32_t j=0; j<xmlTags.size(); ++j) out << " ";
+ std::string xmlname(xmlTags.back());
+ std::replace(xmlname.begin(), xmlname.end(), ' ', '_');
+ out << "</" << xmlname << ">\n";
+ xmlTags.pop_back();
+ }
+
+ // Create new tags that aren't common.
+ for (uint32_t i=equalUpToIndex; i + 1 <name.size(); ++i) {
+ std::string xmlname(name[i]);
+ if (xmlname.size() > 0) {
+ for (uint32_t j=0; j<=xmlTags.size(); ++j) out << " ";
+ xmlTags.push_back(xmlname);
+ std::replace(xmlname.begin(), xmlname.end(), ' ', '_');
+ out << "<" << xmlname << ">\n";
+ }
+ }
+}
+
+namespace {
+ struct UnusedMetricPrinter : public metrics::MetricVisitor {
+ const std::map<metrics::Metric::String,
+ metrics::Metric::SP>& _usedMetrics;
+ std::ostream& _out;
+
+ UnusedMetricPrinter(const std::map<metrics::Metric::String,
+ metrics::Metric::SP>& used,
+ std::ostream& out)
+ : _usedMetrics(used), _out(out) {}
+
+ bool visitMetric(const metrics::Metric& metric, bool) {
+ std::map<metrics::Metric::String,
+ metrics::Metric::SP>::const_iterator it(
+ _usedMetrics.find(metric.getPath()));
+ if (it == _usedMetrics.end()) {
+ std::string result = metric.toString();
+ std::string::size_type pos1 = result.find(' ');
+ std::string::size_type pos2 = result.rfind('"');
+ _out << metric.getPath() << result.substr(pos1, pos2 + 1 - pos1)
+ << "\n";
+ }
+ return true;
+ }
+ };
+}
+
+void
+StatusMetricConsumer::printHtmlMetricsReport(
+ std::ostream& out, const metrics::MetricSnapshot& data,
+ bool includeNotUsed) const
+{
+ using namespace boost::assign;
+
+ /*
+ std::cerr << "All metrics available:\n";
+ for (MetricSnapshot::const_iterator it = data.begin();
+ it != data.end(); ++it)
+ {
+ std::cerr << " '" << it->first << "' => '";
+ it->second->printXml(std::cerr);
+ std::cerr << "'.\n";
+ }
+ */
+
+ std::map<String, Metric::SP> usedMetrics;
+
+ out << "<h2>Metrics report for the last "
+ << framework::SecondTime(data.getLength())
+ .toString(framework::DIFFERENCE)
+ << ", from " << framework::SecondTime(data.getFromTime()) << " to "
+ << framework::SecondTime(data.getFromTime() + data.getLength())
+ << "</h2>\n";
+
+ out << "<p>\n"
+ << "Note that min/max values are currently always from start of "
+ << "process, as current procedure for gathering data doesn't let us "
+ << "know min/max for windows.\n"
+ << "</p>\n";
+
+ // Storage node metrics
+ if (_component.getNodeType() == lib::NodeType::STORAGE) {
+ printStorageHtmlReport(out, usedMetrics, data);
+ printOperationHtmlReport(out, usedMetrics, data);
+ printMaintOpHtmlReport(out, usedMetrics, data);
+ out << "<br>\n"
+ << "<table cellpadding=\"0\" cellspacing=\"0\">\n"
+ << "<td valign=\"top\">\n";
+ out << "</td><td>&nbsp;&nbsp;</td>\n"
+ << "<td valign=\"top\">\n";
+ out << "</td></table><br>\n";
+ printMergeHtmlReport(out, usedMetrics, data);
+ printVisitHtmlReport(out, usedMetrics, data);
+ }
+
+ if (includeNotUsed) {
+ out << "<h2>Metrics not used by user friendly overview above</h2>\n";
+ out << "<pre>\n";
+ UnusedMetricPrinter metricPrinter(usedMetrics, out);
+ data.getMetrics().visit(metricPrinter);
+ out << "</pre>\n";
+ }
+}
+
+void
+StatusMetricConsumer::printStorageHtmlReport(
+ std::ostream& out,
+ std::map<metrics::Metric::String, metrics::Metric::SP>& usedMetrics,
+ const metrics::MetricSnapshot& snapshot) const
+{
+ using namespace boost::assign;
+ using namespace metrics::printutils;
+
+ MetricSource ms(snapshot, "vds.datastored", &usedMetrics);
+ HttpTable table("Disk storage utilization", "Disk");
+ table.colNames += "Stored data", "Document count";
+
+ std::vector<std::string> diskMetrics(ms.getPathsMatchingPrefix("disk_"));
+ std::map<uint32_t, std::string> indexMap;
+ for(std::vector<std::string>::const_iterator it = diskMetrics.begin();
+ it != diskMetrics.end(); ++it)
+ {
+ std::string::size_type pos = it->find('_');
+ indexMap[boost::lexical_cast<uint32_t>(it->substr(pos + 1))] = *it;
+ }
+
+ std::map<uint32_t, std::string>::const_iterator it = indexMap.begin();
+ for (uint32_t i=0; i<indexMap.size(); ++i) {
+ std::ostringstream ost;
+ ost << it->first;
+ table.rowNames += ost.str();
+ table[i][0] = getByteValueString(
+ getLongMetric(it->second + ".bytes.value", ms));
+ table[i][1] = getValueString(
+ getLongMetric(it->second + ".docs.value", ms));
+ ++it;
+ }
+ table.rowNames += "Total";
+ table[diskMetrics.size()][0] = getByteValueString(
+ getLongMetric("alldisks.bytes.value", ms));
+ table[diskMetrics.size()][1] = getValueString(
+ getLongMetric("alldisks.docs.value", ms));
+
+ table.print(out);
+}
+
+void
+StatusMetricConsumer::printOperationHtmlReport(
+ std::ostream& out,
+ std::map<metrics::Metric::String, metrics::Metric::SP>& usedMetrics,
+ const metrics::MetricSnapshot& snapshot) const
+{
+ using namespace boost::assign;
+ using namespace metrics::printutils;
+
+ LVW timePeriod(snapshot.getLength());
+
+ MetricSource ms(snapshot, "vds.filestor.alldisks.allthreads", &usedMetrics);
+ HttpTable table("External load", "Operation");
+ table.colNames += "Ops/s", "Ops", "Failed", "Not found",
+ "Min latency", "Avg latency", "Max latency";
+ LongValue failed;
+
+ table.rowNames += "Put";
+ table[0][0] = getValueString(
+ DVW(1.0) * getLongMetric("put.sum.count.count", ms) / timePeriod,
+ "%'3.2f");
+ table[0][1] = getValueString(getLongMetric("put.sum.count.count", ms));
+ LongValue failedPuts = getLongMetric("put.sum.count.count", ms)
+ - getLongMetric("put.sum.latency.count", ms);
+ table[0][2] = getValueString(failedPuts) + getValueString(
+ DVW(100) * failedPuts / getLongMetric("put.sum.count.count", ms),
+ " (%'3.2f %%)");
+ table[0][4] = getValueString(
+ getDoubleMetric("put.sum.latency.min", ms), "%'3.2f ms");
+ table[0][5] = getValueString(
+ getDoubleMetric("put.sum.latency.average", ms), "%'3.2f ms");
+ table[0][6] = getValueString(
+ getDoubleMetric("put.sum.latency.max", ms), "%'3.2f ms");
+
+ table.rowNames += "Update";
+ table[1][0] = getValueString(
+ DVW(1.0) * getLongMetric("update.sum.count.count", ms) / LVW(timePeriod),
+ "%'3.2f");
+ table[1][1] = getValueString(getLongMetric("update.sum.count.count", ms));
+ LongValue failedUpdates = getLongMetric("update.sum.count.count", ms)
+ - getLongMetric("update.sum.latency.count", ms);
+ table[1][2] = getValueString(failedUpdates) + getValueString(
+ DVW(100) * failedUpdates / getLongMetric("update.sum.count.count", ms),
+ " (%'3.2f %%)");
+ LongValue notFoundUpdates = getLongMetric("update.sum.not_found.count", ms);
+ table[1][3] = getValueString(notFoundUpdates) + getValueString(
+ DVW(100) * notFoundUpdates / getLongMetric("update.sum.count.count", ms),
+ " (%'3.2f %%)");
+ table[1][4] = getValueString(
+ getDoubleMetric("update.sum.latency.min", ms), "%'3.2f ms");
+ table[1][5] = getValueString(
+ getDoubleMetric("update.sum.latency.average", ms), "%'3.2f ms");
+ table[1][6] = getValueString(
+ getDoubleMetric("update.sum.latency.max", ms), "%'3.2f ms");
+
+ table.rowNames += "Remove";
+ table[2][0] = getValueString(
+ DVW(1.0) * getLongMetric("remove.sum.count.count", ms) / LVW(timePeriod),
+ "%'3.2f");
+ table[2][1] = getValueString(getLongMetric("remove.sum.count.count", ms));
+ LongValue failedRemoves = getLongMetric("remove.sum.count.count", ms)
+ - getLongMetric("remove.sum.latency.count", ms);
+ table[2][2] = getValueString(failedRemoves) + getValueString(
+ DVW(100) * failedRemoves / getLongMetric("remove.sum.count.count", ms),
+ " (%'3.2f %%)");
+ LongValue notFoundRemoves = getLongMetric("remove.sum.not_found.count", ms);
+ table[2][3] = getValueString(notFoundRemoves) + getValueString(
+ DVW(100) * notFoundRemoves / getLongMetric("remove.sum.count.count", ms),
+ " (%'3.2f %%)");
+ table[2][4] = getValueString(
+ getDoubleMetric("remove.sum.latency.min", ms), "%'3.2f ms");
+ table[2][5] = getValueString(
+ getDoubleMetric("remove.sum.latency.average", ms), "%'3.2f ms");
+ table[2][6] = getValueString(
+ getDoubleMetric("remove.sum.latency.max", ms), "%'3.2f ms");
+
+ table.rowNames += "Get";
+ table[3][0] = getValueString(
+ DVW(1.0) * getLongMetric("get.sum.count.count", ms) / LVW(timePeriod),
+ "%'3.2f");
+ table[3][1] = getValueString(getLongMetric("get.sum.count.count", ms));
+ LongValue failedGets = getLongMetric("get.sum.count.count", ms)
+ - getLongMetric("get.sum.latency.count", ms);
+ table[3][2] = getValueString(failedGets) + getValueString(
+ DVW(100) * failedGets / getLongMetric("get.sum.count.count", ms),
+ " (%'3.2f %%)");
+ LongValue notFoundGets = getLongMetric("get.sum.not_found.count", ms);
+ table[3][3] = getValueString(notFoundGets) + getValueString(
+ DVW(100) * notFoundGets / getLongMetric("get.sum.count.count", ms),
+ " (%'3.2f %%)");
+ table[3][4] = getValueString(
+ getDoubleMetric("get.sum.latency.min", ms), "%'3.2f ms");
+ table[3][5] = getValueString(
+ getDoubleMetric("get.sum.latency.average", ms), "%'3.2f ms");
+ table[3][6] = getValueString(
+ getDoubleMetric("get.sum.latency.max", ms), "%'3.2f ms");
+
+ table.rowNames += "Visit";
+ std::string visPrefix = "../../../visitor.allthreads.";
+ LongValue completedVis = getLongMetric(visPrefix + "completed.sum.count", ms);
+ failed = getLongMetric(visPrefix + "failed.sum.count", ms);
+ LongValue totalVis = failed + completedVis;
+ // Not adding aborted to this count for now
+ // + getLongMetric(visPrefix + "aborted.count", ms)
+ table[4][0] = getValueString(DVW(1.0) * completedVis / timePeriod, "%'3.2f");
+ table[4][1] = getValueString(completedVis);
+ table[4][2] = getValueString(failed)
+ + getValueString(DVW(100) * failed / totalVis, " (%'3.2f %%)");
+ table[4][4] = getValueString(
+ getDoubleMetric(visPrefix + "averagevisitorlifetime.sum.min", ms),
+ "%'3.2f ms");
+ table[4][5] = getValueString(
+ getDoubleMetric(visPrefix + "averagevisitorlifetime.sum.average", ms),
+ "%'3.2f ms");
+ table[4][6] = getValueString(
+ getDoubleMetric(visPrefix + "averagevisitorlifetime.sum.max", ms),
+ "%'3.2f ms");
+
+ /*
+ table.rowNames += "Stat";
+ table[3][0] = getValueString(
+ getQuotient(getLongMetric("stats.count", ms),
+ LVW(timePeriod)), "%'3.2f");
+ table[3][1] = getValueString(getLongMetric("stats.count", ms));
+ LongValue failedStats = getDiff(
+ getLongMetric("stats.count", ms),
+ getLongMetric("statlatencytotal.count", ms));
+ table[3][2] = getValueString(failedStats) + getValueString(
+ getProduct(LVW(100), getQuotient(failedStats,
+ getLongMetric("stats.count", ms))),
+ " (%'3.2f %%)");
+ LongValue notFoundStats = getLongMetric("statsnotfound.count", ms);
+ table[3][3] = getValueString(notFoundStats) + getValueString(
+ getProduct(LVW(100), getQuotient(notFoundStat,
+ getLongMetric("stats.count", ms))),
+ " (%'3.2f %%)");
+ table[3][4] = getValueString(
+ getDoubleMetric("statlatencytotal.average", ms), "%'3.2f ms");
+ */
+
+ table.rowNames += "Revert";
+ table[5][0] = getValueString(
+ DVW(1.0) * getLongMetric("revert.sum.count.count", ms) / LVW(timePeriod),
+ "%'3.2f");
+ table[5][1] = getValueString(getLongMetric("revert.sum.count.count", ms));
+ LongValue failedReverts = getLongMetric("revert.sum.count.count", ms)
+ - getLongMetric("revert.sum.latency.count", ms);
+ table[5][2] = getValueString(failedReverts) + getValueString(
+ DVW(100) * failedReverts / getLongMetric("revert.sum.count.count", ms),
+ " (%'3.2f %%)");
+ LongValue notFoundReverts = getLongMetric("revert.sum.not_found.count", ms);
+ table[5][3] = getValueString(notFoundReverts) + getValueString(
+ DVW(100) * notFoundReverts / getLongMetric("revert.sum.count.count", ms),
+ " (%'3.2f %%)");
+ table[5][4] = getValueString(
+ getDoubleMetric("revert.sum.latency.min", ms), "%'3.2f ms");
+ table[5][5] = getValueString(
+ getDoubleMetric("revert.sum.latency.average", ms), "%'3.2f ms");
+ table[5][6] = getValueString(
+ getDoubleMetric("revert.sum.latency.max", ms), "%'3.2f ms");
+
+ table.print(out);
+}
+
+void
+StatusMetricConsumer::printMaintOpHtmlReport(
+ std::ostream& out,
+ std::map<metrics::Metric::String, metrics::Metric::SP>& usedMetrics,
+ const metrics::MetricSnapshot& snapshot) const
+{
+ using namespace boost::assign;
+ using namespace metrics::printutils;
+
+ MetricSource ms(snapshot, "vds.filestor.alldisks.allthreads", &usedMetrics);
+ HttpTable table("Maintenance load", "Operation");
+ table.colNames += "Ops/s", "Ops", "Failed", "Not found",
+ "Min latency", "Avg latency", "Max latency", "Notes";
+ LongValue failed;
+
+ LVW timePeriod(snapshot.getLength());
+
+ table.rowNames += "Merge bucket";
+ table[0][0] = getValueString(
+ DVW(1.0) * getLongMetric("mergebuckets.count.count", ms)
+ / timePeriod, "%'3.2f");
+ table[0][1] = getValueString(getLongMetric("mergebuckets.count.count", ms));
+ failed = getLongMetric("mergebuckets.count.count", ms);
+ table[0][2] = getValueString(failed) + getValueString(
+ DVW(100) * failed / getLongMetric("mergebuckets.count.count", ms),
+ " (%'3.2f %%)");
+ table[0][4] = getValueString(
+ getDoubleMetric("mergelatencytotal.min", ms), "%'3.2f ms");
+ table[0][5] = getValueString(
+ getDoubleMetric("mergelatencytotal.average", ms), "%'3.2f ms");
+ table[0][6] = getValueString(
+ getDoubleMetric("mergelatencytotal.max", ms), "%'3.2f ms");
+
+ table.rowNames += "Split bucket";
+ table[1][0] = getValueString(
+ DVW(1.0) * getLongMetric("splitbuckets.count.count", ms)
+ / timePeriod, "%'3.2f");
+ table[1][1] = getValueString(getLongMetric("splitbuckets.count.count", ms));
+ failed = getLongMetric("splitbuckets.failed.count", ms);
+ table[1][2] = getValueString(failed) + getValueString(
+ DVW(100) * failed / getLongMetric("splitbuckets.count.count", ms),
+ " (%'3.2f %%)");
+ table[1][4] = getValueString(
+ getDoubleMetric("splitbuckets.latency.min", ms), "%'3.2f ms");
+ table[1][5] = getValueString(
+ getDoubleMetric("splitbuckets.latency.average", ms), "%'3.2f ms");
+ table[1][6] = getValueString(
+ getDoubleMetric("splitbuckets.latency.max", ms), "%'3.2f ms");
+
+ table.rowNames += "Delete bucket";
+ table[2][0] = getValueString(
+ DVW(1.0) * getLongMetric("deletebuckets.count.count", ms)
+ / timePeriod, "%'3.2f");
+ table[2][1] = getValueString(getLongMetric("deletebuckets.count.count", ms));
+ failed = getLongMetric("deletebuckets.failed.count", ms);
+ table[2][2] = getValueString(failed) + getValueString(
+ DVW(100) * failed / getLongMetric("deletebuckets.count.count", ms),
+ " (%'3.2f %%)");
+ table[2][4] = getValueString(
+ getDoubleMetric("deletebuckets.latency.min", ms), "%'3.2f ms");
+ table[2][5] = getValueString(
+ getDoubleMetric("deletebuckets.latency.average", ms), "%'3.2f ms");
+ table[2][6] = getValueString(
+ getDoubleMetric("deletebuckets.latency.max", ms), "%'3.2f ms");
+
+ table.rowNames += "Buckets verified";
+ table[3][0] = getValueString(
+ DVW(1.0) * getLongMetric("bucketverified.count.count", ms)
+ / timePeriod, "%'3.2f");
+ table[3][1] = getValueString(getLongMetric("bucketverified.count.count", ms));
+ failed = getLongMetric("bucketverified.failed.count", ms);
+ table[3][2] = getValueString(failed) + getValueString(
+ DVW(100) * failed / getLongMetric("bucketverified.count.count", ms),
+ " (%'3.2f %%)");
+ table[3][4] = getValueString(
+ getDoubleMetric("bucketverified.latency.min", ms), "%'3.2f ms");
+ table[3][5] = getValueString(
+ getDoubleMetric("bucketverified.latency.average", ms), "%'3.2f ms");
+ table[3][6] = getValueString(
+ getDoubleMetric("bucketverified.latency.max", ms), "%'3.2f ms");
+ table[3][7] = "Buckets repaired: "
+ + getValueString(getLongMetric("bucketfixed.count", ms))
+ + getValueString(DVW(100)
+ * getLongMetric("bucketfixed.count", ms)
+ / getLongMetric("bucketverified.count.count", ms),
+ " (%'3.2f %%)");
+
+ table.rowNames += "List buckets";
+ table[4][0] = getValueString(
+ DVW(1.0) * getLongMetric("readbucketlist.count.count", ms)
+ / timePeriod, "%'3.2f");
+ table[4][1] = getValueString(
+ getLongMetric("readbucketlist.count.count", ms));
+ failed = getLongMetric("readbucketlist.failed.count", ms);
+ table[4][2] = getValueString(failed) + getValueString(
+ DVW(100) * failed / getLongMetric("readbucketlist.count.count", ms),
+ " (%'3.2f %%)");
+ table[4][4] = getValueString(
+ getDoubleMetric("readbucketlist.latency.min", ms),
+ "%'3.2f ms");
+ table[4][5] = getValueString(
+ getDoubleMetric("readbucketlist.latency.average", ms),
+ "%'3.2f ms");
+ table[4][6] = getValueString(
+ getDoubleMetric("readbucketlist.latency.max", ms),
+ "%'3.2f ms");
+
+ table.rowNames += "Read bucket info";
+ table[5][0] = getValueString(
+ DVW(1.0) * getLongMetric("readbucketinfo.count.count", ms)
+ / timePeriod, "%'3.2f");
+ table[5][1] = getValueString(getLongMetric("readbucketinfo.count.count", ms));
+ failed = getLongMetric("readbucketinfo.failed.count", ms);
+ table[5][2] = getValueString(failed) + getValueString(
+ DVW(100) * failed / getLongMetric("readbucketinfo.count.count", ms),
+ " (%'3.2f %%)");
+ table[5][4] = getValueString(
+ getDoubleMetric("readbucketinfo.latency.min", ms),
+ "%'3.2f ms");
+ table[5][5] = getValueString(
+ getDoubleMetric("readbucketinfo.latency.average", ms),
+ "%'3.2f ms");
+ table[5][6] = getValueString(
+ getDoubleMetric("readbucketinfo.latency.max", ms),
+ "%'3.2f ms");
+
+ table.rowNames += "Bucket move";
+ table[6][0] = getValueString(
+ DVW(1.0) * getLongMetric("movedbuckets.count.count", ms)
+ / timePeriod, "%'3.2f");
+ table[6][1] = getValueString(getLongMetric("movedbuckets.count.count", ms));
+ failed = getLongMetric("movedbuckets.failed.count", ms);
+ table[6][2] = getValueString(failed) + getValueString(
+ DVW(100) * failed / getLongMetric("movedbuckets.count.count", ms),
+ " (%'3.2f %%)");
+ table[6][4] = getValueString(
+ getDoubleMetric("movedbuckets.latency.min", ms), "%'3.2f ms");
+ table[6][5] = getValueString(
+ getDoubleMetric("movedbuckets.latency.average", ms), "%'3.2f ms");
+ table[6][6] = getValueString(
+ getDoubleMetric("movedbuckets.latency.max", ms), "%'3.2f ms");
+
+ table.rowNames += "Internal join";
+ table[7][0] = getValueString(
+ DVW(1.0) * getLongMetric("internaljoin.count.count", ms)
+ / timePeriod, "%'3.2f");
+ table[7][1] = getValueString(getLongMetric("internaljoin.count.count", ms));
+ failed = getLongMetric("internaljoin.failed.count", ms);
+ table[7][2] = getValueString(failed) + getValueString(
+ DVW(100) * failed / getLongMetric("internaljoin.count.count", ms),
+ " (%'3.2f %%)");
+ table[7][4] = getValueString(
+ getDoubleMetric("internaljoin.latency.min", ms),
+ "%'3.2f ms");
+ table[7][5] = getValueString(
+ getDoubleMetric("internaljoin.latency.average", ms),
+ "%'3.2f ms");
+ table[7][6] = getValueString(
+ getDoubleMetric("internaljoin.latency.max", ms),
+ "%'3.2f ms");
+
+ table.print(out);
+
+}
+
+void
+StatusMetricConsumer::printMergeHtmlReport(
+ std::ostream& out,
+ std::map<metrics::Metric::String, metrics::Metric::SP>& usedMetrics,
+ const metrics::MetricSnapshot& snapshot) const
+{
+ using namespace boost::assign;
+ using namespace metrics::printutils;
+
+ LVW timePeriod(snapshot.getLength());
+
+ MetricSource ms(snapshot, "vds.filestor.alldisks.allthreads", &usedMetrics);
+ HttpTable table("Merge frequency and partial latencies", "Type");
+ table.colNames += "Ops/s", "Ops", "Failed",
+ "Min latency", "Avg latency", "Max latency";
+ LongValue failed;
+
+ table.rowNames += "Complete merge";
+ table[0][0] = getValueString(
+ DVW(1.0) * getLongMetric("mergebuckets.count.count", ms)
+ / timePeriod, "%'3.2f");
+ table[0][1] = getValueString(getLongMetric("mergebuckets.count.count", ms));
+ failed = getLongMetric("mergebuckets.count.count", ms)
+ - getLongMetric("mergelatencytotal.count", ms);
+ table[0][2] = getValueString(failed) + getValueString(
+ DVW(100.0) * failed / getLongMetric("mergebuckets.count.count", ms),
+ " (%'3.2f %%)");
+ table[0][3] = getValueString(
+ getDoubleMetric("mergelatencytotal.min", ms), "%'3.2f ms");
+ table[0][4] = getValueString(
+ getDoubleMetric("mergelatencytotal.average", ms), "%'3.2f ms");
+ table[0][5] = getValueString(
+ getDoubleMetric("mergelatencytotal.max", ms), "%'3.2f ms");
+
+ table.rowNames += "Metadata read";
+ LongValue metadataCount = getLongMetric("getbucketdiff.count.count", ms)
+ + getLongMetric("mergebuckets.count.count", ms);
+ table[1][0] = getValueString(DVW(1.0) * metadataCount
+ / timePeriod, "%'3.2f");
+ table[1][1] = getValueString(metadataCount);
+ failed = metadataCount
+ - getLongMetric("mergemetadatareadlatency.count", ms);
+ table[1][2] = getValueString(failed) + getValueString(
+ DVW(100.0) * failed / metadataCount, " (%'3.2f %%)");
+ table[1][3] = getValueString(
+ getDoubleMetric("mergemetadatareadlatency.min", ms),
+ "%'3.2f ms");
+ table[1][4] = getValueString(
+ getDoubleMetric("mergemetadatareadlatency.average", ms),
+ "%'3.2f ms");
+ table[1][5] = getValueString(
+ getDoubleMetric("mergemetadatareadlatency.max", ms),
+ "%'3.2f ms");
+
+ table.rowNames += "Successful data reads";
+ table[2][0] = getValueString(
+ DVW(1.0) * getLongMetric("mergedatareadlatency.count", ms)
+ / timePeriod, "%'3.2f");
+ table[2][1] = getValueString(
+ getLongMetric("mergedatareadlatency.count", ms));
+ table[2][3] = getValueString(
+ getDoubleMetric("mergedatareadlatency.min", ms),
+ "%'3.2f ms");
+ table[2][4] = getValueString(
+ getDoubleMetric("mergedatareadlatency.average", ms),
+ "%'3.2f ms");
+ table[2][5] = getValueString(
+ getDoubleMetric("mergedatareadlatency.max", ms),
+ "%'3.2f ms");
+
+ table.rowNames += "Successful data writes";
+ table[3][0] = getValueString(
+ DVW(1.0) * getLongMetric("mergedatawritelatency.count", ms)
+ / timePeriod, "%'3.2f");
+ table[3][1] = getValueString(
+ getLongMetric("mergedatawritelatency.count", ms));
+ table[3][3] = getValueString(
+ getDoubleMetric("mergedatawritelatency.min", ms),
+ "%'3.2f ms");
+ table[3][4] = getValueString(
+ getDoubleMetric("mergedatawritelatency.average", ms),
+ "%'3.2f ms");
+ table[3][5] = getValueString(
+ getDoubleMetric("mergedatawritelatency.max", ms),
+ "%'3.2f ms");
+
+ HttpTable table2("Other merge properties", "Property");
+ table2.colNames += "Per merge value", "Total value";
+
+ table2.rowNames += "Bytes merged";
+ table2[0][0] = getByteValueString(
+ DVW(1.0) * getLongMetric("bytesmerged.count", ms)
+ / getLongMetric("mergebuckets.count.count", ms));
+ table2[0][1] = getByteValueString(getLongMetric("bytesmerged.count", ms));
+
+ table2.rowNames += "Network efficiency";
+ table2[1][1] = getValueString(
+ DVW(100)
+ * getDoubleMetric("mergeavgdatareceivedneeded.average", ms),
+ "%'3.2f %%");
+
+ table2.rowNames += "Average merges pending";
+ table2[2][1] = getValueString(
+ getDoubleMetric("../pendingmerge.average", ms), "%'3.2f");
+
+ table2.rowNames += "Data transfers";
+ table2[3][0] = getValueString(
+ DVW(1.0) * getLongMetric("applybucketdiff.count.count", ms)
+ / getLongMetric("mergebuckets.count.count", ms), "%'3.2f");
+ table2[3][1] = getValueString(
+ getLongMetric("applybucketdiff.count.count", ms));
+
+ table2.rowNames += "Merge master percentage";
+ table2[4][1] = getValueString(
+ DVW(100) * getLongMetric("mergebuckets.count.count", ms)
+ / (getLongMetric("getbucketdiff.count.count", ms)
+ + getLongMetric("mergebuckets.count.count", ms)), "%'3.2f %%");
+
+ out << "<table cellpadding=\"0\" cellspacing=\"0\">\n"
+ << "<td valign=\"top\" width=\"60%\">\n";
+ table.print(out);
+ out << "<p><font size=\"-1\"><div align=\"left\">\n"
+ << "Complete merge lists amount of merges this node has been master of. Metadata read shows how many merges this node has taken part of in total. By looking at the data reads and writes one can see how many of these are done per merge if one compares it to the amount of metadata reads.\n"
+ << "</div></font></p>\n";
+ out << "</td><td width=\"1%\">&nbsp;&nbsp;</td>\n"
+ << "<td valign=\"top\" width=\"39%\">\n";
+ table2.print(out);
+ out << "<p><font size=\"-1\"><div align=\"left\">\n"
+ << "Bytes merged sets how many bytes have been merged into this node. The network efficiency states how big percent the byte merged was of the total data received during the merge. (A low efficiency indicates that data being merged between two nodes are routed through this node, which doesn't need it). Pending merges are the number of merges this node is master for that is currently being processed. Lastly, a percentage is shown, giving how many of total merges gone through this node the node has actually been master of.\n"
+ << "</div></font></p>\n";
+ out << "</td></table>\n";
+}
+
+void
+StatusMetricConsumer::printVisitHtmlReport(
+ std::ostream& out,
+ std::map<metrics::Metric::String, metrics::Metric::SP>& usedMetrics,
+ const metrics::MetricSnapshot& snapshot) const
+{
+ using namespace boost::assign;
+ using namespace metrics::printutils;
+
+ LVW timePeriod(snapshot.getLength());
+
+ MetricSource ms(snapshot, "vds.visitor.allthreads", &usedMetrics);
+ HttpTable table("Visiting", "Type");
+ table.colNames += "Value", "Percentage";
+ std::string persPrefix = "../../filestor.alldisks.allthreads.";
+ std::string diskPrefix = "../../filestor.alldisks.";
+ LongValue completedVisitors = getLongMetric("completed.sum.count", ms);
+ LongValue failedVisitors = getLongMetric("failed.sum.count", ms);
+ LongValue abortedVisitors = getLongMetric("aborted.sum.count", ms);
+ LongValue totalVisitors = completedVisitors + failedVisitors
+ + abortedVisitors;
+ DoubleValue bucketsPerVisitor(
+ DVW(1) * getLongMetric(persPrefix + "visit.sum.count.count", ms)
+ / completedVisitors);
+ LongValue totalTime = getLongMetric("averagevisitorlifetime.sum.average", ms);
+
+ table.rowNames += "Pending visitors";
+ table[0][0] = getValueString(
+ getLongMetric("created.sum.count", ms) - completedVisitors);
+
+ table.rowNames += "Completed visitors";
+ table[1][0] = getValueString(completedVisitors);
+ table[1][1] = getValueString(
+ DVW(100.0) * completedVisitors / totalVisitors, "%'3.2f %%");
+
+ table.rowNames += "Failed visitors";
+ table[2][0] = getValueString(failedVisitors);
+ table[2][1] = getValueString(
+ DVW(100.0) * failedVisitors / totalVisitors, "%'3.2f %%");
+
+ table.rowNames += "Aborted visitors";
+ table[3][0] = getValueString(abortedVisitors);
+ table[3][1] = getValueString(
+ DVW(100.0) * abortedVisitors / totalVisitors, "%'3.2f %%");
+
+ table.rowNames += "Buckets visited per visitor";
+ table[4][0] = getValueString(bucketsPerVisitor, "%'3.2f");
+
+ table.rowNames += "Average size of visitor manager queue (min/average/max)";
+ table[5][0] = getValueString(getLongMetric("queuesize.min", ms),
+ "%'llu")
+ + getValueString(getLongMetric("queuesize.average", ms),
+ " / %'llu")
+ + getValueString(getLongMetric("queuesize.max", ms),
+ " / %'llu");
+
+ table.rowNames += "Total latency of completed visitors (min/average/max)";
+ table[6][0] = getValueString(getLongMetric(
+ + "averagevisitorlifetime.sum.min", ms), "%'llu ms")
+ + getValueString(getLongMetric(
+ "averagevisitorlifetime.sum.average", ms), " / %'llu ms")
+ + getValueString(getLongMetric(
+ "averagevisitorlifetime.sum.max", ms), " / %'llu ms");
+ table[6][1] = getValueString(DVW(100), "%'3.2f %%");
+
+
+ table.rowNames += "Time spent in visitor manager queue (min/average/max)";
+ LongValue queueWait = getLongMetric("averagequeuewait.sum.average", ms);
+ table[7][0] = getValueString(getLongMetric("averagequeuewait.sum.min", ms))
+ + getValueString(queueWait, " ms / %'llu ms / ")
+ + getValueString(
+ getLongMetric("averagequeuewait.sum.max", ms), "%'llu ms");
+ table[7][1] = getValueString(DVW(100) * queueWait / totalTime, "%'3.2f %%");
+
+ table.rowNames += "Time spent processing in visitor thread";
+ LongValue cpuTime = getLongMetric("averageprocessingtime.sum.average", ms);
+ table[8][0] = getValueString(cpuTime, "%" PRId64 " ms");
+ table[8][1] = getValueString(DVW(100) * cpuTime / totalTime, "%'3.2f %%");
+
+ table.print(out);
+
+ out << "<p><font size=\"-1\"><div align=\"left\">\n"
+ << "Note that the buckets and blocks per visitor values assume that there are no aborted visitors. If there is a considerable amount of aborted visitors that has done persistence requests, these calculated numbers will be too large. Average time spent per visitor waiting in peristence layer, reading metadata or data, and the time used processing in visitor threads are also calculated values. If visiting consistently happens while cluster does not have average load, these values may also be a bit off.\n"
+ << "</div></font></p>\n";
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/common/statusmetricconsumer.h b/storage/src/vespa/storage/common/statusmetricconsumer.h
new file mode 100644
index 00000000000..fac9296032e
--- /dev/null
+++ b/storage/src/vespa/storage/common/statusmetricconsumer.h
@@ -0,0 +1,83 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+/**
+ * @class storage::StatusMetricConsumer
+ * @ingroup common
+ *
+ * @brief Writes metrics to status page.
+ */
+
+#pragma once
+
+#include <vespa/vespalib/util/document_runnable.h>
+#include <vespa/vespalib/util/sync.h>
+#include <map>
+#include <vespa/metrics/metrics.h>
+#include <vespa/storage/common/storagecomponent.h>
+#include <vespa/storageframework/storageframework.h>
+
+namespace vespalib {
+ class StringTokenizer;
+}
+
+namespace storage {
+
+class StatusMetricConsumer : public framework::StatusReporter,
+ private framework::MetricUpdateHook,
+ private vespalib::JsonStreamTypes
+{
+public:
+ StatusMetricConsumer(
+ StorageComponentRegister&,
+ metrics::MetricManager&,
+ const std::string& name = "status");
+ ~StatusMetricConsumer();
+
+ vespalib::string getReportContentType(
+ const framework::HttpUrlPath&) const;
+ bool reportStatus(std::ostream& out, const framework::HttpUrlPath&) const;
+
+ void waitUntilTimeProcessed(framework::SecondTime t) const;
+
+ void updateMetrics(const MetricLockGuard & guard) override;
+
+private:
+ typedef metrics::Metric Metric;
+ typedef metrics::Metric::String String;
+
+ metrics::MetricManager& _manager;
+ StorageComponent _component;
+ std::string _name;
+ vespalib::Monitor _waiter;
+ framework::SecondTime _startTime;
+ framework::SecondTime _processedTime;
+ framework::MemoryToken::UP _metricMemoryToken;
+
+ void writeXmlTags(std::ostream& out,
+ const vespalib::StringTokenizer& name,
+ std::vector<std::string>& xmlTags) const;
+
+ void printHtmlMetricsReport(std::ostream& out,
+ const metrics::MetricSnapshot& data,
+ bool includeNotUsed) const;
+
+ void printStorageHtmlReport(std::ostream& out,
+ std::map<String, Metric::SP>& usedMetrics,
+ const metrics::MetricSnapshot&) const;
+ void printOperationHtmlReport(std::ostream& out,
+ std::map<String, Metric::SP>& usedMetrics,
+ const metrics::MetricSnapshot&) const;
+ void printMaintOpHtmlReport(std::ostream& out,
+ std::map<String, Metric::SP>& usedMetrics,
+ const metrics::MetricSnapshot&) const;
+ void printMergeHtmlReport(std::ostream& out,
+ std::map<String, Metric::SP>& usedMetrics,
+ const metrics::MetricSnapshot& snapshot) const;
+ void printVisitHtmlReport(std::ostream& out,
+ std::map<String, Metric::SP>& usedMetrics,
+ const metrics::MetricSnapshot&) const;
+
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/common/storagecomponent.cpp b/storage/src/vespa/storage/common/storagecomponent.cpp
new file mode 100644
index 00000000000..69dc81d3bac
--- /dev/null
+++ b/storage/src/vespa/storage/common/storagecomponent.cpp
@@ -0,0 +1,136 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/common/storagecomponent.h>
+#include <vespa/storage/storageserver/prioritymapper.h>
+
+#include <vespa/vespalib/util/exceptions.h>
+
+#include <vespa/vdslib/distribution/distribution.h>
+
+namespace storage {
+
+// Defined in cpp file to allow unique pointers of unknown type in header.
+StorageComponent::~StorageComponent()
+{
+}
+
+void
+StorageComponent::setNodeInfo(vespalib::stringref clusterName,
+ const lib::NodeType& nodeType,
+ uint16_t index)
+{
+ // Assumed to not be set dynamically.
+ _clusterName = clusterName;
+ _nodeType = &nodeType;
+ _index = index;
+}
+
+void
+StorageComponent::setDocumentTypeRepo(DocumentTypeRepoSP repo)
+{
+ vespalib::LockGuard guard(_lock);
+ _docTypeRepo = repo;
+}
+
+void
+StorageComponent::setLoadTypes(LoadTypeSetSP loadTypes)
+{
+ vespalib::LockGuard guard(_lock);
+ _loadTypes = loadTypes;
+}
+
+
+void
+StorageComponent::setPriorityConfig(const PriorityConfig& c)
+{
+ // Priority mapper is already thread safe.
+ _priorityMapper->setConfig(c);
+}
+
+void
+StorageComponent::setBucketIdFactory(const document::BucketIdFactory& factory)
+{
+ // Assumed to not be set dynamically.
+ _bucketIdFactory = factory;
+}
+
+void
+StorageComponent::setDistribution(DistributionSP distribution)
+{
+ vespalib::LockGuard guard(_lock);
+ _distribution = distribution;
+}
+
+void
+StorageComponent::setNodeStateUpdater(NodeStateUpdater& updater)
+{
+ vespalib::LockGuard guard(_lock);
+ if (_nodeStateUpdater != 0) {
+ throw vespalib::IllegalStateException(
+ "Node state updater is already set", VESPA_STRLOC);
+ }
+ _nodeStateUpdater = &updater;
+}
+
+StorageComponent::StorageComponent(StorageComponentRegister& compReg,
+ vespalib::stringref name)
+ : Component(compReg, name),
+ _clusterName(),
+ _nodeType(0),
+ _index(0),
+ _priorityMapper(new PriorityMapper),
+ _nodeStateUpdater(0)
+{
+ compReg.registerStorageComponent(*this);
+}
+
+NodeStateUpdater&
+StorageComponent::getStateUpdater() const
+{
+ vespalib::LockGuard guard(_lock);
+ if (_nodeStateUpdater == 0) {
+ throw vespalib::IllegalStateException(
+ "Component need node state updater at this time, but it has "
+ "not been initialized.", VESPA_STRLOC);
+ }
+ return *_nodeStateUpdater;
+}
+
+vespalib::string
+StorageComponent::getIdentity() const
+{
+ vespalib::asciistream name;
+ name << "storage/cluster." << _clusterName << "/"
+ << _nodeType->serialize() << "/" << _index;
+ return name.str();
+}
+
+uint8_t
+StorageComponent::getPriority(const documentapi::LoadType& lt) const
+{
+ return _priorityMapper->getPriority(lt);
+}
+
+StorageComponent::DocumentTypeRepoSP
+StorageComponent::getTypeRepo() const
+{
+ vespalib::LockGuard guard(_lock);
+ return _docTypeRepo;
+}
+
+StorageComponent::LoadTypeSetSP
+StorageComponent::getLoadTypes() const
+{
+ vespalib::LockGuard guard(_lock);
+ return _loadTypes;
+}
+
+StorageComponent::DistributionSP
+StorageComponent::getDistribution() const
+{
+ vespalib::LockGuard guard(_lock);
+ return _distribution;
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/common/storagecomponent.h b/storage/src/vespa/storage/common/storagecomponent.h
new file mode 100644
index 00000000000..6b561619542
--- /dev/null
+++ b/storage/src/vespa/storage/common/storagecomponent.h
@@ -0,0 +1,125 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::StorageComponent
+ * \ingroup common
+ *
+ * \brief Component class including some storage specific information.
+ *
+ * The storage framework defines components with generic functionality.
+ * The storage component inherits from this and adds some storage specific
+ * components. Further, the distributor component and service layer component
+ * will inherit from this to also include distributor and service layer specific
+ * implementations.
+ */
+
+/**
+ * \class storage::StorageComponentRegister
+ * \ingroup common
+ *
+ * \brief Specialization of ComponentRegister handling storage components.
+ */
+
+/**
+ * \class storage::StorageManagedComponent
+ * \ingroup common
+ *
+ * \brief Specialization of ManagedComponent to set storage functionality.
+ *
+ * A storage component register will use this interface in order to set the
+ * storage functionality parts.
+ */
+
+#pragma once
+
+#include <vespa/document/bucket/bucketidfactory.h>
+#include <vespa/storageframework/generic/component/component.h>
+#include <vespa/storageframework/generic/component/componentregister.h>
+#include <vespa/vdslib/state/node.h>
+#include <vespa/vespalib/util/sync.h>
+
+namespace vespa { namespace config { namespace content { namespace core {
+namespace internal {
+ class InternalStorPrioritymappingType;
+} } } } }
+namespace document {
+ class DocumentTypeRepo;
+}
+namespace documentapi {
+ class LoadType;
+ class LoadTypeSet;
+}
+
+namespace storage {
+namespace lib {
+ class Distribution;
+}
+class NodeStateUpdater;
+class PriorityMapper;
+class StorageComponentRegister;
+
+class StorageComponent : public framework::Component {
+public:
+ typedef std::unique_ptr<StorageComponent> UP;
+ typedef vespa::config::content::core::internal::InternalStorPrioritymappingType PriorityConfig;
+ typedef std::shared_ptr<document::DocumentTypeRepo> DocumentTypeRepoSP;
+ typedef std::shared_ptr<documentapi::LoadTypeSet> LoadTypeSetSP;
+ typedef std::shared_ptr<lib::Distribution> DistributionSP;
+
+ /**
+ * Node type is supposed to be set immediately, and never be updated.
+ * Thus it does not need to be threadsafe. Should never be used before set.
+ */
+ void setNodeInfo(vespalib::stringref clusterName,
+ const lib::NodeType& nodeType,
+ uint16_t index);
+
+ /**
+ * Node state updater is supposed to be set immediately, and never be
+ * updated. Thus it does not need to be threadsafe. Should never be used
+ * before set.
+ */
+ void setNodeStateUpdater(NodeStateUpdater& updater);
+ void setDocumentTypeRepo(DocumentTypeRepoSP);
+ void setLoadTypes(LoadTypeSetSP);
+ void setPriorityConfig(const PriorityConfig&);
+ void setBucketIdFactory(const document::BucketIdFactory&);
+ void setDistribution(DistributionSP);
+
+ StorageComponent(StorageComponentRegister&, vespalib::stringref name);
+ virtual ~StorageComponent();
+
+ vespalib::string getClusterName() const { return _clusterName; }
+ const lib::NodeType& getNodeType() const { return *_nodeType; }
+ uint16_t getIndex() const { return _index; }
+ lib::Node getNode() const { return lib::Node(*_nodeType, _index); }
+
+ vespalib::string getIdentity() const;
+
+ DocumentTypeRepoSP getTypeRepo() const;
+ LoadTypeSetSP getLoadTypes() const;
+ const document::BucketIdFactory& getBucketIdFactory() const
+ { return _bucketIdFactory; }
+ uint8_t getPriority(const documentapi::LoadType&) const;
+ DistributionSP getDistribution() const;
+ NodeStateUpdater& getStateUpdater() const;
+
+private:
+ vespalib::string _clusterName;
+ const lib::NodeType* _nodeType;
+ uint16_t _index;
+ DocumentTypeRepoSP _docTypeRepo;
+ LoadTypeSetSP _loadTypes;
+ std::unique_ptr<PriorityMapper> _priorityMapper;
+ document::BucketIdFactory _bucketIdFactory;
+ DistributionSP _distribution;
+ NodeStateUpdater* _nodeStateUpdater;
+ vespalib::Lock _lock;
+};
+
+struct StorageComponentRegister : public virtual framework::ComponentRegister
+{
+ virtual void registerStorageComponent(StorageComponent&) = 0;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/common/storagelink.cpp b/storage/src/vespa/storage/common/storagelink.cpp
new file mode 100644
index 00000000000..3a48573c4ce
--- /dev/null
+++ b/storage/src/vespa/storage/common/storagelink.cpp
@@ -0,0 +1,306 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/common/storagelink.h>
+
+#include <vespa/log/log.h>
+#include <sstream>
+#include <vespa/storageapi/messageapi/storagecommand.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/vespalib/util/backtrace.h>
+
+LOG_SETUP(".application.link");
+
+using std::shared_ptr;
+using std::ostringstream;
+using namespace storage;
+using namespace storage::api;
+
+StorageLink::~StorageLink()
+{
+}
+
+void StorageLink::push_back(StorageLink::UP link)
+{
+ if (_state != CREATED) {
+ LOG(error, "Attempted to alter chain by adding link %s after link %s "
+ "while state is %s",
+ link->toString().c_str(),
+ toString().c_str(),
+ stateToString(_state));
+ assert(false);
+ }
+ assert(link.get());
+ if (isBottom()) {
+ link->_up = this;
+ _down = std::move(link);
+ } else {
+ _down->push_back(std::move(link));
+ }
+}
+
+void StorageLink::open()
+{
+ // First tag all states as opened, as components are allowed to send
+ // messages both ways in onOpen call, in case any component send message
+ // up, the link receiving them should have their state as opened.
+ StorageLink* link = this;
+ while (true) {
+ if (link->_state != CREATED) {
+ LOG(error, "During open(), link %s should be in CREATED state, "
+ "not in state %s.",
+ toString().c_str(),
+ stateToString(link->_state));
+ assert(false);
+ }
+ link->_state = OPENED;
+ if (link->_down.get() == 0) break;
+ link = link->_down.get();
+ }
+ // When give all links an onOpen call, bottoms up. Do it bottoms up, as
+ // links are more likely to send messages down in their onOpen() call
+ // than up. Thus, chances are best that the component is ready to
+ // receive messages sent during onOpen().
+ while (link != 0) {
+ link->onOpen();
+ link = link->_up;
+ }
+}
+
+void StorageLink::doneInit()
+{
+ StorageLink* link = this;
+ while (true) {
+ link->onDoneInit();
+ if (link->_down.get() == 0) break;
+ link = link->_down.get();
+ }
+}
+
+void StorageLink::close()
+{
+ _state = CLOSING;
+ onClose();
+ if (!isBottom()) {
+ _down->close();
+ }
+}
+
+void StorageLink::closeNextLink() {
+ _down.reset(0);
+}
+
+void StorageLink::flush()
+{
+ if (_state != CLOSING) {
+ LOG(error, "During flush(), link %s should be in CLOSING state, "
+ "not in state %s.",
+ toString().c_str(),
+ stateToString(_state));
+ assert(false);
+ }
+ // First flush down to get all requests out of the system.
+ _state = FLUSHINGDOWN;
+ LOG(debug, "Flushing link %s on the way down.", toString().c_str());
+ onFlush(true);
+ LOG(debug, "Flushed link %s on the way down.", toString().c_str());
+ if (!isBottom()) {
+ _down->flush();
+ // Then flush up to get replies out of the system
+ LOG(debug, "Flushing link %s on the way back up.", toString().c_str());
+ _state = FLUSHINGUP;
+ onFlush(false);
+ LOG(debug, "Flushed link %s on the way back up.", toString().c_str());
+ } else {
+ // Then flush up to get replies out of the system
+ LOG(debug, "Flushing link %s on the way back up.", toString().c_str());
+ _state = FLUSHINGUP;
+ onFlush(false);
+ LOG(debug, "Flushed link %s on the way back up.", toString().c_str());
+ }
+ _state = CLOSED;
+ LOG(debug, "Link %s is now closed and should do nothing more.",
+ toString().c_str());
+}
+
+void StorageLink::sendDown(const StorageMessage::SP& msg)
+{
+ // Verify acceptable state to send messages down
+ switch(_state) {
+ case OPENED:
+ case CLOSING:
+ case FLUSHINGDOWN:
+ break;
+ default:
+ LOG(error,
+ "Link %s trying to send %s down while in state %s",
+ toString().c_str(),
+ msg->toString().c_str(),
+ stateToString(_state));
+ assert(false);
+ }
+ assert(msg.get());
+ LOG(spam, "Storage Link %s to handle %s",
+ toString().c_str(), msg->toString().c_str());
+ if (isBottom()) {
+ LOG(spam, "Storage link %s at bottom of chain got message %s.",
+ toString().c_str(), msg->toString().c_str());
+ /*
+ if (isFlush(msg)) {
+ StorageCommand& cmd = static_cast<StorageCommand&>(*msg);
+ shared_ptr<StorageReply> reply(cmd.makeReply().release());
+
+ if (reply.get()) {
+ sendUp(reply);
+ }
+ } else {
+ */
+ ostringstream ost;
+ ost << "Unhandled message at bottom of chain "
+ << *msg << " (message type "
+ << msg->getType().getName()
+ << "). "
+ << vespalib::getStackTrace(0);
+ if (!msg->getType().isReply()) {
+ //if (!_closed) {
+ LOGBP(warning, ost.str().c_str());
+ //}
+ StorageCommand& cmd = static_cast<StorageCommand&>(*msg);
+ shared_ptr<StorageReply> reply(cmd.makeReply().release());
+
+ if (reply.get()) {
+ reply->setResult(ReturnCode(ReturnCode::NOT_IMPLEMENTED,
+ msg->getType().getName()));
+ sendUp(reply);
+ }
+ } else {
+ ost << " Return code: "
+ << static_cast<StorageReply&>(*msg).getResult();
+ //if (!_closed) {
+ LOGBP(warning, ost.str().c_str());
+ //}
+ }
+ //}
+ } else if (!_down->onDown(msg)) {
+ //LOG(spam, "Storage link %s forwarding message %s.",
+ // toString().c_str(), msg->toString().c_str());
+ _down->sendDown(msg);
+ } else {
+ LOG(spam, "Storage link %s handled message %s.",
+ _down->toString().c_str(), msg->toString().c_str());
+ }
+}
+
+void StorageLink::sendUp(const shared_ptr<StorageMessage> & msg)
+{
+ // Verify acceptable state to send messages up
+ switch(_state) {
+ case OPENED:
+ case CLOSING:
+ case FLUSHINGDOWN:
+ case FLUSHINGUP:
+ break;
+ default:
+ LOG(error,
+ "Link %s trying to send %s up while in state %s",
+ toString().c_str(),
+ msg->toString(true).c_str(),
+ stateToString(_state));
+ assert(false);
+ }
+ assert(msg.get());
+ if (isTop()) {
+ /*
+ if (isFlush(msg)) {
+ StorageCommand& cmd = static_cast<StorageCommand&>(*msg);
+ shared_ptr<StorageReply> reply(cmd.makeReply().release());
+
+ if (reply.get()) {
+ sendDown(reply);
+ }
+ } else {
+ */
+ ostringstream ost;
+ ost << "Unhandled message at top of chain " << *msg << ".";
+ ost << vespalib::getStackTrace(0);
+ if (!msg->getType().isReply()) {
+ //if (!_closed) {
+ LOGBP(warning, ost.str().c_str());
+ //}
+ StorageCommand& cmd = static_cast<StorageCommand&>(*msg);
+ shared_ptr<StorageReply> reply(cmd.makeReply().release());
+
+ if (reply.get()) {
+ reply->setResult(ReturnCode(ReturnCode::NOT_IMPLEMENTED,
+ msg->getType().getName()));
+ sendDown(reply);
+ }
+ } else {
+ ost << " Return code: "
+ << static_cast<StorageReply&>(*msg).getResult();
+ //if (!_closed) {
+ LOGBP(warning, ost.str().c_str());
+ //}
+ }
+ //}
+ } else if (!_up->onUp(msg)) {
+ _up->sendUp(msg);
+ }
+}
+
+void StorageLink::printChain(std::ostream& out, std::string indent) const {
+ out << indent << "StorageChain(" << size();
+ if (!isTop()) out << ", not top";
+ out << ")";
+ const StorageLink* lastlink = _up;
+ for (const StorageLink* link = this; link != 0; link = link->_down.get()) {
+ out << "\n";
+ link->print(out, false, indent + " ");
+ if (link->_up != lastlink) out << ", broken linkage";
+ lastlink = link;
+ }
+}
+
+bool StorageLink::onDown(const shared_ptr<StorageMessage> & msg)
+{
+ //LOG(spam, "Checking if storage link %s handles %s.",
+ // toString().c_str(), msg->toString().c_str());
+ bool result = msg->callHandler(*this, msg);
+ /*
+ if (result) {
+ LOG(spam, "Storage link %s handled message %s.",
+ toString().c_str(), msg->toString().c_str());
+ } else {
+ LOG(spam, "Storage link %s did not handle message %s.",
+ toString().c_str(), msg->toString().c_str());
+ }
+ */
+ return result;
+}
+
+bool StorageLink::onUp(const shared_ptr<StorageMessage> & msg)
+{
+ return msg->callHandler(*this, msg);
+}
+
+const char*
+StorageLink::stateToString(State state)
+{
+ switch (state) {
+ case CREATED:
+ return "CREATED";
+ case OPENED:
+ return "OPENED";
+ case CLOSING:
+ return "CLOSING";
+ case FLUSHINGDOWN:
+ return "FLUSHINGDOWN";
+ case FLUSHINGUP:
+ return "FLUSHINGUP";
+ case CLOSED:
+ return "CLOSED";
+ default:
+ assert(false);
+ return 0;
+ }
+}
diff --git a/storage/src/vespa/storage/common/storagelink.h b/storage/src/vespa/storage/common/storagelink.h
new file mode 100644
index 00000000000..a7b68c59ee5
--- /dev/null
+++ b/storage/src/vespa/storage/common/storagelink.h
@@ -0,0 +1,198 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::StorageLink
+ * @ingroup common
+ *
+ * @brief Base class for StorageServer modules.
+ *
+ * Base class for StorageServer modules. Each module receives commands from
+ * "upstream" and replies from "downstream". It can choose to intercept both
+ * these streams via the onDown and onUp methods. The base class methods
+ * calls the hooks from MessageHandler. The handlers should return true if the
+ * message has been handled and should not be sent to the next module.
+ *
+ * Replies to messages should not be dispatched from within onDown. Create a
+ * separate thread and dispatch messages from this (or use StorageChainQueued).
+ *
+ * @version $Id$
+ */
+
+#pragma once
+
+#include <boost/utility.hpp>
+#include <vespa/vespalib/util/printable.h>
+#include <vespa/fastos/fastos.h>
+#include <memory>
+#include <vespa/storageapi/messageapi/messagehandler.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <string>
+#include <vespa/storage/common/messagesender.h>
+#include <vespa/storage/common/storagecomponent.h>
+
+namespace storage {
+
+class FileStorManagerTest;
+
+class StorageLink : public document::Printable,
+ public ChainedMessageSender,
+ private boost::noncopyable,
+ protected api::MessageHandler
+{
+public:
+ typedef std::unique_ptr<StorageLink> UP;
+
+ enum State { CREATED, OPENED, CLOSING, FLUSHINGDOWN, FLUSHINGUP, CLOSED };
+
+private:
+ StorageLink(const StorageLink&);
+ StorageLink& operator=(const StorageLink&);
+
+ std::string _name;
+ StorageLink* _up;
+ std::unique_ptr<StorageLink> _down;
+ State _state;
+
+public:
+ StorageLink(const std::string& name)
+ : _name(name), _up(0), _down(), _state(CREATED) {}
+ virtual ~StorageLink();
+
+ const std::string& getName() const { return _name; }
+ bool isTop() const { return (_up == 0); }
+ bool isBottom() const { return (_down.get() == 0); }
+ unsigned int size() const { return (isBottom() ? 1 : _down->size() + 1); }
+
+ /** Adds the link to the end of the chain. */
+ void push_back(StorageLink::UP);
+
+ /** Get the current state of the storage link. */
+ State getState() const { return _state; }
+
+ /**
+ * Called by storage server after the storage chain have been created.
+ */
+ void open();
+
+ void doneInit();
+
+ /**
+ * Mark this link as closed. After close is called, the link should not
+ * accept requests from external sources. (Internal sources still ok)
+ */
+ void close();
+
+ /**
+ * Flush messages through this link. Allways called after close() and
+ * before deletion, to remove any queued up messages.
+ */
+ void flush();
+
+ /** Send message down the storage chain. */
+ virtual void sendDown(const api::StorageMessage::SP&);
+
+ /** Send message up the storage chain. */
+ virtual void sendUp(const api::StorageMessage::SP&);
+
+ void printChain(std::ostream&, std::string indent = "") const;
+
+ /** Used for debugging/testing. */
+ StorageLink* getNextLink() { return _down.get(); }
+ void addTestLinkOnTop(StorageLink* up) { _up = up; }
+
+ virtual void storageDistributionChanged() {}
+
+ /**
+ * Called for each command message. Default implementation calls hooks
+ * from MessageHandler. Either overload this or the MessageHandler
+ * hooks to implement the module. In most cases, if you return true,
+ * you should create and dispatch a reply message.
+ *
+ * This function should only be called by storage links sendDown, or
+ * from storage links implementing it to default to default behavior.
+ *
+ * @return True if message is handled, false if it should be passed
+ * to the next module.
+ */
+ virtual bool onDown(const api::StorageMessage::SP&);
+
+ /**
+ * Called for each reply message. Default implementation calls hooks
+ * from MessageHandler. Either overload this or the MessageHandler
+ * hooks to implement the module. If you intercept and return true for
+ * a reply, it should either be a reply to a command your module sent,
+ * or you should construct a new reply message and dispatch that.
+ *
+ * This function should only be called by storage links sendUp, or
+ * from storage links implementing it to default to default behavior.
+ *
+ * @return True if message is handled, false if it should be passed
+ * to the next module.
+ */
+ virtual bool onUp(const api::StorageMessage::SP&);
+
+ virtual void print(std::ostream& out, bool,
+ const std::string&) const {
+ out << getName();
+ }
+
+ static const char* stateToString(State state);
+
+protected:
+ /**
+ * To ensure that the storage chain is deleted bottom-up, each storage
+ * link must call closeNextLink first in it's destructor, such that all
+ * links below are deleted before it deletes itself.
+ *
+ * This function should only be called from storage link destructor.
+ */
+ void closeNextLink();
+
+private:
+ /**
+ * Called from open(), after all links in the chain have been set up and
+ * initialized. In onOpen() and after, links are allowed to send messages
+ * both up and down. (Though should likely only send down)
+ */
+ virtual void onOpen() {}
+
+ /**
+ * Called from doneInit(), after node is done initializing.
+ */
+ virtual void onDoneInit() {}
+
+ /**
+ * Called from close. Override if you need to react to close calls.
+ * After close, no new operations can be requested. RPC servers should no
+ * longer accept incoming messages, web server taking HTTP requests should
+ * be shut down or no longer accept requests, background task schedulers
+ * should no longer schedule tasks, etc.
+ */
+ virtual void onClose() {}
+
+ /**
+ * Called from flush. Override if your class contains anything flushable.
+ * Flush is called twice after onClose() (and never at any other time).
+ * First time it is called on the way down the storage chain. Second time
+ * it is called on the way up the storage chain. On the way down, link
+ * must flush all operations scheduled to be sent down. Since chain is
+ * closed while this is happening. No new requests should happen until
+ * flush is called upwards. At that time links must flush all messages going
+ * upwards the chain. After this has been done, no messages/operations
+ * should remain in the process.
+ */
+ virtual void onFlush(bool downwards) { (void) downwards; }
+
+ /**
+ * Some unit tests wants access to private functions. They can do this
+ * through the storage link test.
+ */
+ friend class StorageLinkTest;
+};
+
+inline std::ostream& operator<<(std::ostream& out, StorageLink& link) {
+ link.printChain(out);
+ return out;
+}
+
+}
+
diff --git a/storage/src/vespa/storage/common/storagelinkqueued.cpp b/storage/src/vespa/storage/common/storagelinkqueued.cpp
new file mode 100644
index 00000000000..49d1d8f9592
--- /dev/null
+++ b/storage/src/vespa/storage/common/storagelinkqueued.cpp
@@ -0,0 +1,64 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include "storagelinkqueued.h"
+#include <vespa/log/log.h>
+
+LOG_SETUP(".application.link.queued");
+
+namespace storage {
+
+StorageLinkQueued::~StorageLinkQueued()
+{
+ if (_closeState != 7) {
+ LOG(error, "Link %s has closing state %u at destruction. Has likely "
+ "implemented onFlush/onClose without calling storage link "
+ "queued's implementations. This is a bug which can cause "
+ "crashes on shutdown.",
+ getName().c_str(), _closeState);
+ }
+}
+
+void StorageLinkQueued::dispatchDown(
+ const std::shared_ptr<api::StorageMessage>& msg)
+{
+ // Verify acceptable state to send messages down
+ switch(getState()) {
+ case OPENED:
+ case CLOSING:
+ case FLUSHINGDOWN:
+ break;
+ default:
+ LOG(error, "Link %s trying to dispatch %s down while in state %u",
+ toString().c_str(), msg->toString().c_str(), getState());
+ assert(false);
+ }
+ _commandDispatcher.add(msg);
+}
+
+void StorageLinkQueued::dispatchUp(
+ const std::shared_ptr<api::StorageMessage>& msg)
+{
+ // Verify acceptable state to send messages up
+ switch(getState()) {
+ case OPENED:
+ case CLOSING:
+ case FLUSHINGDOWN:
+ case FLUSHINGUP:
+ break;
+ default:
+ LOG(error, "Link %s trying to dispatch %s up while in state %u",
+ toString().c_str(), msg->toString().c_str(), getState());
+ assert(false);
+ }
+ _replyDispatcher.add(msg);
+}
+
+void StorageLinkQueued::logError(const char* err) {
+ LOG(error, "%s", err);
+};
+
+void StorageLinkQueued::logDebug(const char* err) {
+ LOG(info, "%s", err);
+};
+
+} // storage
diff --git a/storage/src/vespa/storage/common/storagelinkqueued.h b/storage/src/vespa/storage/common/storagelinkqueued.h
new file mode 100644
index 00000000000..cec29d1b3b5
--- /dev/null
+++ b/storage/src/vespa/storage/common/storagelinkqueued.h
@@ -0,0 +1,247 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::StorageLinkQueued
+ * @ingroup common
+ *
+ * @brief Storage link with a message queue.
+ *
+ * Storage link implementing separate threads for dispatching messages.
+ * Using this class you can use dispatchReply instead of sendReply to have the
+ * replies sent through another thread.
+ *
+ * @version $Id$
+ */
+
+#pragma once
+
+#include <vespa/vespalib/util/document_runnable.h>
+#include <deque>
+#include <iostream>
+#include <memory>
+#include <list>
+#include <limits>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storage/common/storagelink.h>
+
+namespace storage {
+
+class StorageLinkQueued : public StorageLink {
+public:
+ StorageLinkQueued(const std::string& name, framework::ComponentRegister& cr)
+ : StorageLink(name),
+ _compReg(cr),
+ _replyDispatcher(*this),
+ _commandDispatcher(*this),
+ _closeState(0) {}
+ virtual ~StorageLinkQueued();
+
+ /**
+ * Add message to internal queue, to be dispatched downstream
+ * in separate thread.
+ */
+ void dispatchDown(const std::shared_ptr<api::StorageMessage>&);
+
+ /**
+ * Add message to internal queue, to be dispatched downstream
+ * in separate thread.
+ */
+ void dispatchUp(const std::shared_ptr<api::StorageMessage>&);
+
+ /** Remember to call this method if you override it. */
+ virtual void onClose() {
+ _commandDispatcher.flush();
+ _closeState |= 1;
+ }
+
+ /** Remember to call this method if you override it. */
+ virtual void onFlush(bool downwards) {
+ if (downwards) {
+ _commandDispatcher.flush();
+ _closeState |= 2;
+ } else {
+ _replyDispatcher.flush();
+ _closeState |= 4;
+ }
+ }
+
+ void logError(const char* error);
+ void logDebug(const char* error);
+
+ framework::ComponentRegister& getComponentRegister() { return _compReg; }
+
+private:
+ /** Common class to prevent need for duplicate code. */
+ template<typename Message>
+ class Dispatcher : public framework::Runnable
+ {
+ protected:
+ StorageLinkQueued& _parent;
+ unsigned int _maxQueueSize;
+ vespalib::Monitor _sync;
+ std::deque< std::shared_ptr<Message> > _messages;
+ bool _replyDispatcher;
+ framework::Component::UP _component;
+ framework::Thread::UP _thread;
+ void terminate() {
+ if (_thread.get()) {
+ _thread->interrupt();
+ {
+ vespalib::MonitorGuard sync(_sync);
+ sync.signal();
+ }
+ _thread->join();
+ _thread.reset(0);
+ }
+ }
+
+ public:
+ Dispatcher(StorageLinkQueued& parent, unsigned int maxQueueSize,
+ bool replyDispatcher)
+ : _parent(parent),
+ _maxQueueSize(maxQueueSize),
+ _sync(),
+ _messages(),
+ _replyDispatcher(replyDispatcher)
+ {
+ std::ostringstream name;
+ name << "Queued storage " << (_replyDispatcher ? "up" : "down")
+ << "link - " << _parent.getName();
+ _component.reset(new framework::Component(
+ parent.getComponentRegister(),
+ name.str()));
+ }
+
+ virtual ~Dispatcher() {
+ terminate();
+ }
+
+ void start();
+ void run(framework::ThreadHandle&);
+
+ void add(const std::shared_ptr<Message>&);
+ void flush();
+ // You can use the given functions if you need to keep the
+ // dispatcher thread locked while you process a message. Bucket
+ // manager does this during bucket dumps
+ vespalib::Monitor& getMonitor() { return _sync; }
+ void addWithoutLocking(const std::shared_ptr<Message>&);
+
+ virtual void send(const std::shared_ptr<Message> & ) = 0;
+ };
+
+ class ReplyDispatcher : public Dispatcher<api::StorageMessage>
+ {
+ public:
+ ReplyDispatcher(StorageLinkQueued& parent)
+ : Dispatcher<api::StorageMessage>(
+ parent, std::numeric_limits<unsigned int>::max(), true)
+ {
+ }
+ void send(const std::shared_ptr<api::StorageMessage> & reply)
+ { _parent.sendUp(reply); }
+ virtual ~ReplyDispatcher() { terminate(); }
+ };
+
+ class CommandDispatcher : public Dispatcher<api::StorageMessage>
+ {
+ public:
+ CommandDispatcher(StorageLinkQueued& parent)
+ : Dispatcher<api::StorageMessage>(
+ parent, std::numeric_limits<unsigned int>::max(), false)
+ {
+ }
+ virtual ~CommandDispatcher() { terminate(); }
+ void send(const std::shared_ptr<api::StorageMessage> & command)
+ { _parent.sendDown(command); }
+ };
+
+ framework::ComponentRegister& _compReg;
+ framework::Thread::UP _replyThread;
+ framework::Thread::UP _commandThread;
+ ReplyDispatcher _replyDispatcher;
+ CommandDispatcher _commandDispatcher;
+ uint16_t _closeState;
+
+protected:
+ ReplyDispatcher& getReplyDispatcher() { return _replyDispatcher; }
+
+};
+
+template<typename Message>
+void StorageLinkQueued::Dispatcher<Message>::start()
+{
+ assert(_thread.get() == 0);
+ framework::MilliSecTime maxProcessTime(5 * 1000);
+ framework::MilliSecTime waitTime(100);
+ _thread = _component->startThread(*this, maxProcessTime, waitTime);
+}
+
+template<typename Message>
+void StorageLinkQueued::Dispatcher<Message>::add(
+ const std::shared_ptr<Message>& m)
+{
+ vespalib::MonitorGuard sync(_sync);
+
+ if (_thread.get() == 0) start();
+ while ((_messages.size() > _maxQueueSize) && !_thread->interrupted()) {
+ sync.wait(100);
+ }
+ _messages.push_back(m);
+ sync.signal();
+}
+
+template<typename Message>
+void StorageLinkQueued::Dispatcher<Message>::addWithoutLocking(
+ const std::shared_ptr<Message>& m)
+{
+ if (_thread.get() == 0) start();
+ _messages.push_back(m);
+}
+
+template<typename Message>
+void StorageLinkQueued::Dispatcher<Message>::run(framework::ThreadHandle& h)
+{
+ while (!h.interrupted()) {
+ h.registerTick(framework::PROCESS_CYCLE);
+ std::shared_ptr<Message> message;
+ {
+ vespalib::MonitorGuard sync(_sync);
+ while (!h.interrupted() && _messages.empty()) {
+ sync.wait(100);
+ h.registerTick(framework::WAIT_CYCLE);
+ }
+ if (h.interrupted()) break;
+ message.swap(_messages.front());
+ }
+ try {
+ send(message);
+ } catch (std::exception& e) {
+ _parent.logError(vespalib::make_string(
+ "When running command %s, caught exception %s. "
+ "Discarding message",
+ message->toString().c_str(),
+ e.what()).c_str());
+ }
+
+ {
+ // Since flush() only waits for stack to be empty, we must
+ // pop stack AFTER send have been called.
+ vespalib::MonitorGuard sync(_sync);
+ _messages.pop_front();
+ sync.signal();
+ }
+ }
+ _parent.logDebug("Finished storage link queued thread");
+}
+
+template<typename Message>
+void StorageLinkQueued::Dispatcher<Message>::flush()
+{
+ vespalib::MonitorGuard sync(_sync);
+ while (!_messages.empty()) {
+ sync.wait(100);
+ }
+}
+
+}
+
diff --git a/storage/src/vespa/storage/common/vectorprinter.h b/storage/src/vespa/storage/common/vectorprinter.h
new file mode 100644
index 00000000000..46e71a14d09
--- /dev/null
+++ b/storage/src/vespa/storage/common/vectorprinter.h
@@ -0,0 +1,48 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vector>
+#include <iostream>
+
+namespace storage {
+
+template <typename T>
+class VectorPrinter
+{
+public:
+ VectorPrinter(const std::vector<T>& vec,
+ const char* separator)
+ : _vec(&vec),
+ _separator(separator)
+ {}
+
+ void print(std::ostream& os) const {
+ for (uint32_t i = 0; i < _vec->size(); ++i) {
+ if (i != 0) {
+ os << _separator;
+ }
+ os << (*_vec)[i];
+ }
+ }
+private:
+ const std::vector<T>* _vec;
+ const char* _separator;
+};
+
+template <typename T>
+inline std::ostream&
+operator<<(std::ostream& os, const VectorPrinter<T>& printer)
+{
+ printer.print(os);
+ return os;
+}
+
+template <typename T>
+inline VectorPrinter<T>
+commaSeparated(const std::vector<T>& vec)
+{
+ return VectorPrinter<T>(vec, ",");
+}
+
+}
+
diff --git a/storage/src/vespa/storage/common/visitorfactory.h b/storage/src/vespa/storage/common/visitorfactory.h
new file mode 100644
index 00000000000..ac03057d466
--- /dev/null
+++ b/storage/src/vespa/storage/common/visitorfactory.h
@@ -0,0 +1,40 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::VisitorFactory
+ *
+ * New visitor implementations must implement this interface and register it in
+ * the storage server, in order for the visitor threads to be able to create
+ * instances of the visitor.
+ */
+#pragma once
+
+#include <boost/shared_ptr.hpp>
+#include <vespa/vdslib/container/parameters.h>
+
+namespace storage {
+
+class Visitor;
+
+class VisitorEnvironment {
+public:
+ typedef std::unique_ptr<VisitorEnvironment> UP;
+ VisitorEnvironment() {}
+ virtual ~VisitorEnvironment() {}
+};
+
+class VisitorFactory {
+public:
+ typedef std::shared_ptr<VisitorFactory> SP;
+ typedef std::map<std::string, std::shared_ptr<VisitorFactory> > Map;
+
+ virtual ~VisitorFactory() {};
+
+ virtual VisitorEnvironment::UP makeVisitorEnvironment(StorageComponent&) = 0;
+
+ virtual storage::Visitor *makeVisitor(
+ StorageComponent&, VisitorEnvironment& env,
+ const vdslib::Parameters& params) = 0;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/common/vtag.cpp b/storage/src/vespa/storage/common/vtag.cpp
new file mode 100644
index 00000000000..cf352006904
--- /dev/null
+++ b/storage/src/vespa/storage/common/vtag.cpp
@@ -0,0 +1,81 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <string.h>
+#include <stdio.h>
+#include <vespa/storage/common/vtag.h>
+#include <vespa/vespalib/component/version.h>
+
+#ifndef V_TAG
+#define V_TAG "NOTAG"
+#define V_TAG_DATE "NOTAG"
+#define V_TAG_SYSTEM "NOTAG"
+#define V_TAG_SYSTEM_REV "NOTAG"
+#define V_TAG_BUILDER "NOTAG"
+#define V_TAG_VERSION "0"
+#define V_TAG_ARCH "NOTAG"
+#endif
+
+namespace storage {
+
+char VersionTag[] = V_TAG;
+char VersionTagDate[] = V_TAG_DATE;
+char VersionTagSystem[] = V_TAG_SYSTEM;
+char VersionTagSystemRev[] = V_TAG_SYSTEM_REV;
+char VersionTagBuilder[] = V_TAG_BUILDER;
+char VersionTagPkg[] = V_TAG_PKG;
+char VersionTagComponent[] = V_TAG_COMPONENT;
+char VersionTagArch[] = V_TAG_ARCH;
+
+vespalib::Version Vtag::currentVersion(VersionTagComponent);
+
+void
+Vtag::printVersionNice()
+{
+ char *s = VersionTag;
+ bool needdate = true;
+ if (strncmp(VersionTag, "V_", 2) == 0) {
+ s += 2;
+ do {
+ while (strchr("0123456789", *s) != NULL) {
+ printf("%c", *s++);
+ }
+ if (strncmp(s, "_RELEASE", 8) == 0) {
+ needdate = false;
+ break;
+ }
+ if (strncmp(s, "_RC", 3) == 0) {
+ char *e = strchr(s, '-');
+ if (e == NULL) {
+ printf("%s", s);
+ } else {
+ printf("%.*s", (int)(e-s), s);
+ }
+ needdate = false;
+ break;
+ }
+ if (*s == '_' && strchr("0123456789", *++s)) {
+ printf(".");
+ } else {
+ break;
+ }
+ } while (*s && *s != '-');
+ } else {
+ char *e = strchr(s, '-');
+ if (e == NULL) {
+ printf("%s", s);
+ } else {
+ printf("%.*s", (int)(e-s), s);
+ }
+ }
+ if (needdate) {
+ s = VersionTagDate;
+ char *e = strchr(s, '-');
+ if (e == NULL) {
+ printf("-%s", s);
+ } else {
+ printf("-%.*s", (int)(e-s), s);
+ }
+ }
+}
+
+} // namespace storage
diff --git a/storage/src/vespa/storage/common/vtag.h b/storage/src/vespa/storage/common/vtag.h
new file mode 100644
index 00000000000..26236f13a41
--- /dev/null
+++ b/storage/src/vespa/storage/common/vtag.h
@@ -0,0 +1,24 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+namespace vespalib {
+ class Version;
+}
+
+namespace storage {
+
+extern char VersionTag[];
+extern char VersionTagDate[];
+extern char VersionTagSystem[];
+extern char VersionTagSystemRev[];
+extern char VersionTagBuilder[];
+
+class Vtag {
+public:
+ static vespalib::Version currentVersion;
+ static void printVersionNice();
+};
+
+} // namespace messagebus
+
diff --git a/storage/src/vespa/storage/config/.gitignore b/storage/src/vespa/storage/config/.gitignore
new file mode 100644
index 00000000000..621aa4f624a
--- /dev/null
+++ b/storage/src/vespa/storage/config/.gitignore
@@ -0,0 +1,11 @@
+*.So
+*.lo
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
+config-stor-*.cpp
+config-stor-*.h
+/config-rpc-provider.cpp
+/config-rpc-provider.h
diff --git a/storage/src/vespa/storage/config/CMakeLists.txt b/storage/src/vespa/storage/config/CMakeLists.txt
new file mode 100644
index 00000000000..863b29e6fd6
--- /dev/null
+++ b/storage/src/vespa/storage/config/CMakeLists.txt
@@ -0,0 +1,29 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_storageconfig OBJECT
+ SOURCES
+ DEPENDS
+)
+vespa_generate_config(storage_storageconfig stor-communicationmanager.def)
+install(FILES stor-communicationmanager.def DESTINATION var/db/vespa/config_server/serverdb/classes)
+vespa_generate_config(storage_storageconfig stor-distributormanager.def)
+install(FILES stor-distributormanager.def DESTINATION var/db/vespa/config_server/serverdb/classes)
+vespa_generate_config(storage_storageconfig stor-server.def)
+install(FILES stor-server.def DESTINATION var/db/vespa/config_server/serverdb/classes)
+vespa_generate_config(storage_storageconfig stor-status.def)
+install(FILES stor-status.def DESTINATION var/db/vespa/config_server/serverdb/classes)
+vespa_generate_config(storage_storageconfig stor-messageforwarder.def)
+install(FILES stor-messageforwarder.def DESTINATION var/db/vespa/config_server/serverdb/classes)
+vespa_generate_config(storage_storageconfig stor-opslogger.def)
+install(FILES stor-opslogger.def DESTINATION var/db/vespa/config_server/serverdb/classes)
+vespa_generate_config(storage_storageconfig stor-visitordispatcher.def)
+install(FILES stor-visitordispatcher.def DESTINATION var/db/vespa/config_server/serverdb/classes)
+vespa_generate_config(storage_storageconfig stor-integritychecker.def)
+install(FILES stor-integritychecker.def DESTINATION var/db/vespa/config_server/serverdb/classes)
+vespa_generate_config(storage_storageconfig stor-bucketmover.def)
+install(FILES stor-bucketmover.def DESTINATION var/db/vespa/config_server/serverdb/classes)
+vespa_generate_config(storage_storageconfig stor-bouncer.def)
+install(FILES stor-bouncer.def DESTINATION var/db/vespa/config_server/serverdb/classes)
+vespa_generate_config(storage_storageconfig stor-prioritymapping.def)
+install(FILES stor-prioritymapping.def DESTINATION var/db/vespa/config_server/serverdb/classes)
+vespa_generate_config(storage_storageconfig rpc-provider.def)
+install(FILES rpc-provider.def DESTINATION var/db/vespa/config_server/serverdb/classes)
diff --git a/storage/src/vespa/storage/config/rpc-provider.def b/storage/src/vespa/storage/config/rpc-provider.def
new file mode 100644
index 00000000000..b1d712b83cc
--- /dev/null
+++ b/storage/src/vespa/storage/config/rpc-provider.def
@@ -0,0 +1,4 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+connectspec string default="tcp/localhost:17777" restart
diff --git a/storage/src/vespa/storage/config/stor-bouncer.def b/storage/src/vespa/storage/config/stor-bouncer.def
new file mode 100644
index 00000000000..1e61367c43c
--- /dev/null
+++ b/storage/src/vespa/storage/config/stor-bouncer.def
@@ -0,0 +1,30 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+## Whether or not the bouncer should stop external load from
+## entering node when the cluster state is down.
+stop_external_load_when_cluster_down bool default=true
+
+## Sets what node states the node will allow incoming commands
+## in.
+stop_all_load_when_nodestate_not_in string default="uri"
+
+## Sets whether to just use (self) reported node state or to use wanted state
+## if wanted state is worse than the current reported state.
+use_wanted_state_if_possible bool default=true
+
+## The maximum clock skew allowed in the system. Any messages received
+## that have a timestamp longer in the future than this will be failed.
+max_clock_skew_seconds int default=5
+
+## If this config value is != -1, the node will reject any external feed
+## operations with a priority lower than that specified here. Note that since
+## we map priorities in such a way that 0 is the _highest_ priority and 255 the
+## _lowest_ priority, for two operations A and B, if B has a lower priority
+## than A it will have a higher priority _integer_ value.
+##
+## Only mutating external feed operations will be blocked. Read-only operations
+## and internal operations are always let through.
+##
+## Default is -1 (i.e. rejection is disabled and load is allowed through)
+feed_rejection_priority_threshold int default=-1
diff --git a/storage/src/vespa/storage/config/stor-bucketmover.def b/storage/src/vespa/storage/config/stor-bucketmover.def
new file mode 100644
index 00000000000..d554ae68c57
--- /dev/null
+++ b/storage/src/vespa/storage/config/stor-bucketmover.def
@@ -0,0 +1,37 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+## Minimum time between bucket database iterations in the bucket mover. The
+## minumum time is used when disks starts to get pretty full and we have plenty
+## stuff we can move.
+## restart flag was added automatically and needs to be verified.
+minimum_recheck_interval_in_seconds int default=60 restart
+
+## Maximum time between bucket database iterations in the bucket mover. The
+## maximum time is used when disks have plenty free space, so moving data is
+## not critical.
+## restart flag was added automatically and needs to be verified.
+maximum_recheck_interval_in_seconds int default=3600 restart
+
+## Number of buckets to cache at a time when reading the bucket database
+## restart flag was added automatically and needs to be verified.
+bucket_iteration_chunk int default=1000 restart
+
+## Maximum fill rate above average fill rate for a target disk to be eligible
+## as a target for a bucket move operation.
+## restart flag was added automatically and needs to be verified.
+max_target_fill_rate_above_average double default=0.01 restart
+
+## Number of bucket mover runs to keep in history vector
+## restart flag was added automatically and needs to be verified.
+max_history_size int default=10 restart
+
+## Max concurrent pending bucket move operations scheduled in total.
+## restart flag was added automatically and needs to be verified.
+max_pending int default=5 restart
+
+## Operation delay. If set, the bucket mover will wait for this amount of
+## milliseconds between each operation. Useful in testing to make move run go
+## slow enough to view without that much data.
+## restart flag was added automatically and needs to be verified.
+operation_delay int default=0 restart
diff --git a/storage/src/vespa/storage/config/stor-communicationmanager.def b/storage/src/vespa/storage/config/stor-communicationmanager.def
new file mode 100644
index 00000000000..da64d5ad28e
--- /dev/null
+++ b/storage/src/vespa/storage/config/stor-communicationmanager.def
@@ -0,0 +1,19 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+mbusport int default=-1 restart
+
+rpcport int default=6000 restart
+
+# Limits for max pending concurrent number of operations towards a node before
+# MessageBus starts busy-bouncing messages. Distributor and content nodes are
+# treated separately. 0 means no limit.
+mbus_distributor_node_max_pending_count int default=5000
+mbus_content_node_max_pending_count int default=0
+
+# Limits for max total amount of memory (in bytes) used by operations towards
+# a node before MessageBus starts busy-bouncing messages. Distributor and
+# content nodes are treated separately. 0 means no limit.
+mbus_distributor_node_max_pending_size int default=0
+mbus_content_node_max_pending_size int default=0
+
diff --git a/storage/src/vespa/storage/config/stor-distributormanager.def b/storage/src/vespa/storage/config/stor-distributormanager.def
new file mode 100644
index 00000000000..44f93867335
--- /dev/null
+++ b/storage/src/vespa/storage/config/stor-distributormanager.def
@@ -0,0 +1,169 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+## Maximum number of ideal state operations scheduled by a distributor.
+maxpendingidealstateoperations int default=100
+
+## The total size of unique documents in a bucket before we split it due to
+## being too big. By default this is now 32 MB.
+splitsize int default=33544432
+
+## The maximum amount of entries in a file before we should attempt to split it.
+## A meta data entry in a slotfile currently uses 40 bytes. It is probably
+## good to have the split size, such that all meta data entries are normally
+## read when you do the initial read. With the default of 1024, meta data will
+## take up around 40 kB, and the default initial read is 64 kB, allowing the
+## file to grow a bit above max and still all be read in initial read.
+splitcount int default=1024
+
+## The maximum size of unique documents that allows the system to reduce
+## the number of split bits on the bucket, or join two buckets together.
+## The size must be lower than this number, and the count must be lower than
+## joincount.
+joinsize int default=16000000
+
+## The maximum number of unique documents that allows for joining (see
+## joinsize).
+joincount int default=512
+
+## Minimum level of splitting for buckets
+minsplitcount int default=16
+
+## If non-empty, continuously delete all the documents matching this selection.
+garbagecollection.selectiontoremove string default=""
+
+## The interval with which each bucket is purged using the selection above.
+## If 0, no garbage collection is done.
+garbagecollection.interval int default=0
+
+## If false, dont do splits inline with feeding.
+inlinebucketsplitting bool default=true
+
+## List of state checkers (ideal state generators) that should be ignored in the cluster.
+## One or more of the following (case insensitive):
+##
+## SynchronizeAndMove
+## DeleteExtraCopies
+## JoinBuckets
+## SplitBucket
+## SplitInconsistentBuckets
+## SetBucketState
+## GarbageCollection
+blockedstatecheckers[] string restart
+
+## Whether or not distributor should issue reverts when operations partially
+## fail.
+enable_revert bool default=true
+
+## Maximum nodes involved in a merge operation. Currently, this can not be more
+## than 16 nodes due to protocol limitations. However, decreasing the max may
+## be useful if 16 node merges ends up too expensive.
+maximum_nodes_per_merge int default=16
+
+## For internal in process debugging, it may be useful to not start the
+## distributor thread to be able to call tick() manually and run single threaded
+start_distributor_thread bool default=true restart
+
+## The number of ticks calls done before a wait is done. This can be
+## set higher than 1 for the distributor to improve speed of bucket iterations
+## while still keep CPU load low/moderate.
+ticks_before_wait int default=1
+
+## The sleep time between ticks if there are no more queed tasks.
+ticks_wait_time_ms int default=5
+
+## Max processing time used by deadlock detector.
+max_process_time_ms int default=5000
+
+## Allow overriding default priorities of certain maintenance operations.
+## This is an advanced feature, do not touch this unless you have a very good
+## reason to do so! Configuring these values wrongly may cause starvation of
+## important operations, leading to unpredictable behavior and/or data loss.
+##
+## Merge used to move data to ideal location
+priority_merge_move_to_ideal_node int default=165
+
+## Merge for copies that have gotten out of sync with each other
+priority_merge_out_of_sync_copies int default=120
+
+## Merge for restoring redundancy of copies
+priority_merge_too_few_copies int default=120
+
+## Copy activation when there are no other active copies (likely causing
+## lack of search coverage for that bucket)
+priority_activate_no_existing_active int default=100
+
+## Copy activation when there is already an active copy for the bucket.
+priority_activate_with_existing_active int default=100
+
+## Deletion of bucket copy. Cheap on VDS, not necessarily so on indexed search.
+priority_delete_bucket_copy int default=100
+
+## Joining caused by bucket siblings getting sufficiently small to fit into a
+## single bucket.
+priority_join_buckets int default=155
+
+## Splitting caused by system increasing its minimum distribution bit count.
+priority_split_distribution_bits int default=200
+
+## Splitting due to bucket exceeding max document count or byte size (see
+## splitcount and splitsize config values)
+priority_split_large_bucket int default=175
+
+## Splitting due to buckets being inconsistently split. Should be higher
+## priority than the vast majority of external load.
+priority_split_inconsistent_bucket int default=110
+
+## Background garbage collection. Should be lower priority than external load
+## and other ideal state operations (aside from perhaps minimum bit splitting).
+priority_garbage_collection int default=200
+
+## The distributor can send joins that "lift" a bucket without any siblings
+## higher up in the bucket tree hierarchy. The assumption is that if this
+## is done for all sibling-less buckets, they will all eventually reach a
+## level in the tree where they do in fact have a sibling and may (if their
+## sizes allow) be joined into a single bucket.
+enable_join_for_sibling_less_buckets bool default=false
+
+## There exists a distribution edge case where bucket siblings end up having
+## non-equal ideal locations. This will normally inhibit join operations, as
+## these are only allowed when all nodes have all source buckets involved in
+## the join operation. Setting this property to true means such buckets may
+## still be joined at the cost of transient inconsistencies for the buckets
+## being joined into.
+enable_inconsistent_join bool default=false
+
+## The distributor host info reporter may be disabled entirely, in which case
+## no per-node statistics for merges, latencies or bucket replication factors
+## will be reported back to the cluster controller. Disabling this may make
+## sense in large clusters that do not make use of these reports directly or
+## indirectly, as it causes potentially significant processing overhead on the
+## cluster controller.
+## This host reporter must never be disabled on a Hosted Vespa system, or
+## automatic upgrades will stall.
+enable_host_info_reporting bool default=true
+
+## For each available node, the distributor will report back to the cluster
+## controller a value which indicates the minimum replication factor for any
+## bucket contained on said node. This config exposes a way to alter how this
+## replication factor is computed.
+##
+## Valid enum values and their semantics:
+##
+## TRUSTED - only trusted replicas are counted.
+## ANY - any replica present is counted. This may return an overly optimistic
+## view of the system. E.g. if there are 3 replicas, 1 having 1000 docs
+## and 2 having 1 doc, all being out of sync, counting with ANY will still
+## treat this as a minimum replication factor of 3. Conversely, with
+## TRUSTED such a bucket would most likely have a factor of 0 (or 1 iff
+## the trusted status for the replica with 1000 docs is known).
+minimum_replica_counting_mode enum { TRUSTED, ANY } default=TRUSTED
+
+## Bucket activation only makes sense for indexed search clusters, but Proton
+## may also be run in store-only or streaming mode, in which case it does not
+## actually require any activations. If the model infers that Proton is running
+## in such a mode, activation will be explicitly disabled.
+##
+## Activation is always disabled entirely for clusters using VDS as their
+## engine, regardless of the value of this setting.
+disable_bucket_activation bool default=false
diff --git a/storage/src/vespa/storage/config/stor-integritychecker.def b/storage/src/vespa/storage/config/stor-integritychecker.def
new file mode 100644
index 00000000000..ae2dc82bf0b
--- /dev/null
+++ b/storage/src/vespa/storage/config/stor-integritychecker.def
@@ -0,0 +1,38 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+## Minutes after midnight when integrity checker is allowed to start running.
+## 0 means it will start/continue run at midnight.
+dailycyclestart int default=0
+
+## Minutes after midnight when integrity checker is not allowed to run anymore.
+## If this equals dailycyclestart it is allowed to run all day. dailycyclestop
+## is allowed to be less than dailycyclestart.
+dailycyclestop int default=0
+
+## Status of what is allowed done on what weekdays. Should be a string with
+## seven characters, where the first represent sunday, the seventh saturday.
+## The possible options are RrCc- which means:
+## R - If state becomes R, and current cycle does not verify file content,
+## abort current cycle, otherwise continue it. Start new cycle verifying
+## all content of all files.
+## r - Continue current cycle. Start new cycle using cheap partial file
+## verification.
+## c - Continue current cycle. Dont start a new cycle.
+weeklycycle string default="Rrrrrrr"
+
+## Max concurrent pending bucket verifications. For max speed, each disk thread
+## should have one to work with all the time. Default is 1, to ensure little
+## resources are consumed by this process by default. Once request priority
+## has been introduced, this default may become higher.
+maxpending int default=2
+
+## Minimum time since last cycle before starting a new one in minutes.
+## Defaults to 24 hours.
+mincycletime int default=1440
+
+## Minimum time in seconds between each request. To throttle the system even
+## slower if continuous one pending puts on more load on the system than you
+## want. Works with multiple pending messages, though it doesnt make much sense
+## unless maxpending equals 1.
+requestdelay int default=0
diff --git a/storage/src/vespa/storage/config/stor-messageforwarder.def b/storage/src/vespa/storage/config/stor-messageforwarder.def
new file mode 100644
index 00000000000..3896733f7bb
--- /dev/null
+++ b/storage/src/vespa/storage/config/stor-messageforwarder.def
@@ -0,0 +1,4 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+route string default="" restart
diff --git a/storage/src/vespa/storage/config/stor-opslogger.def b/storage/src/vespa/storage/config/stor-opslogger.def
new file mode 100644
index 00000000000..9913249c381
--- /dev/null
+++ b/storage/src/vespa/storage/config/stor-opslogger.def
@@ -0,0 +1,4 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+targetfile string default="" restart
diff --git a/storage/src/vespa/storage/config/stor-prioritymapping.def b/storage/src/vespa/storage/config/stor-prioritymapping.def
new file mode 100644
index 00000000000..1863cf15c35
--- /dev/null
+++ b/storage/src/vespa/storage/config/stor-prioritymapping.def
@@ -0,0 +1,20 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+highest int default=50
+very_high int default=60
+high_1 int default=70
+high_2 int default=80
+high_3 int default=90
+normal_1 int default=100
+normal_2 int default=110
+normal_3 int default=120
+normal_4 int default=130
+normal_5 int default=140
+normal_6 int default=150
+low_1 int default=160
+low_2 int default=170
+low_3 int default=180
+very_low int default=190
+lowest int default=200
+
diff --git a/storage/src/vespa/storage/config/stor-server.def b/storage/src/vespa/storage/config/stor-server.def
new file mode 100644
index 00000000000..343348cd378
--- /dev/null
+++ b/storage/src/vespa/storage/config/stor-server.def
@@ -0,0 +1,78 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+## Root directory for all files related to this storage node.
+## Will typically be "$VESPA_HOME/var/db/vespa/vds/<cluster>/<nodetype>/<index>
+root_folder string restart
+
+## VDS cluster
+cluster_name string default="storage" restart
+
+## The index of this node. Each node of the same type in the same cluster need
+## to have unique indexes. This should not be changed, as this is what we use
+## to identify the node, and to decide what data should be on it.
+node_index int default=0 restart
+
+## The maximum amount of memory to use in the storage node.
+## Currently default is 2 GB.
+## DEPRECATED!
+memorytouse long default=2147483647 restart
+
+## Set whether this is a distributor or a storage node. This will decide what
+## storage links are set up.
+is_distributor bool restart
+
+## Capacity of the node. How much data and load this node will get relative to
+## other nodes.
+node_capacity double default=1.0 restart
+
+## Capacity of the disks on this node. How much data and load will each disk
+## get relative to the other disks on this node.
+disk_capacity[] double restart
+
+## Reliability of this node. How much of the cluster redundancy factor can this
+## node make up for.
+node_reliability int default=1 restart
+
+## The upper bound of merges that any storage node can have active.
+## A merge operation will be chained through all nodes involved in the
+## merge, only actually starting the operation when every node has
+## allowed it to pass through.
+max_merges_per_node int default=16
+max_merge_queue_size int default=1024
+
+## Whether the deadlock detector should be enabled or not. If disabled, it will
+## still run, but it will never actually abort the process it is running in.
+enable_dead_lock_detector bool default=false restart
+
+## Whether to enable deadlock detector warnings in log or not. If enabled,
+## warnings will be written even if dead lock detecting is not enabled.
+enable_dead_lock_detector_warnings bool default=true restart
+
+## Each thread registers how often it will at minimum register ticks (given that
+## the system is not overloaded. If you are running Vespa on overloaded nodes,
+## you can use this slack timeout to add to the thread timeouts in order to
+## allow for more slack before dead lock detector kicks in. The value is in seconds.
+dead_lock_detector_timeout_slack double default=240 restart
+
+## If set to 0, storage will attempt to auto-detect the number of VDS mount
+## points to use. If set to a number, force this number. This number only makes
+## sense on a storage node of course
+disk_count int default=0 restart
+
+## Configure persistence provider. Temporary here to test.
+persistence_provider.type enum {STORAGE, DUMMY, RPC } default=STORAGE restart
+persistence_provider.rpc.connectspec string default="tcp/localhost:27777" restart
+
+## Whether or not to use the new metadata flow implementation. Default to not
+## as it is currently in development and not even functional
+switch_new_meta_data_flow bool default=false restart
+
+## When the content layer receives a set of changed buckets from the persistence
+## layer, it must recheck all of these. Each such recheck results in an
+## operation scheduled against the persistence queust and since the total
+## number of buckets to recheck may reach hundreds of thousands in a large
+## system, we send these in chunks to avoid saturating the queues with
+## operations.
+bucket_rechecking_chunk_size int default=100
+
diff --git a/storage/src/vespa/storage/config/stor-status.def b/storage/src/vespa/storage/config/stor-status.def
new file mode 100644
index 00000000000..b3ff1378d3c
--- /dev/null
+++ b/storage/src/vespa/storage/config/stor-status.def
@@ -0,0 +1,4 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+httpport int default=0 restart
diff --git a/storage/src/vespa/storage/config/stor-visitordispatcher.def b/storage/src/vespa/storage/config/stor-visitordispatcher.def
new file mode 100644
index 00000000000..fcbafdd2aee
--- /dev/null
+++ b/storage/src/vespa/storage/config/stor-visitordispatcher.def
@@ -0,0 +1,13 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+maxvisitorspernodeperclientvisitor int default=4
+minbucketspervisitor int default=5
+maxbucketspervisitor int default=100
+minpendingperstoragevisitor int default=2
+minsuperbucketsactive int default=50
+minsubbucketsactive int default=10000
+storagenetworklatency int default=2
+progresstimeout int default=180000
+highprioritylimit int default=600000
+veryhighprioritylimit int default=60000
diff --git a/storage/src/vespa/storage/distributor/.gitignore b/storage/src/vespa/storage/distributor/.gitignore
new file mode 100644
index 00000000000..77ca1a026a6
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/.gitignore
@@ -0,0 +1,4 @@
+*.So
+.depend
+Makefile
+dbclient
diff --git a/storage/src/vespa/storage/distributor/CMakeLists.txt b/storage/src/vespa/storage/distributor/CMakeLists.txt
new file mode 100644
index 00000000000..eb7850c3439
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/CMakeLists.txt
@@ -0,0 +1,35 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_distributor
+ SOURCES
+ distributor.cpp
+ operationowner.cpp
+ distributorcomponent.cpp
+ statechecker.cpp
+ persistencemessagetracker.cpp
+ messagetracker.cpp
+ externaloperationhandler.cpp
+ bucketlistmerger.cpp
+ idealstatemanager.cpp
+ statecheckers.cpp
+ bucketdbupdater.cpp
+ pendingmessagetracker.cpp
+ pendingclusterstate.cpp
+ nodeinfo.cpp
+ sentmessagemap.cpp
+ operationtargetresolverimpl.cpp
+ distributorconfiguration.cpp
+ throttlingoperationstarter.cpp
+ blockingoperationstarter.cpp
+ distributormessagesender.cpp
+ clusterinformation.cpp
+ activecopy.cpp
+ statusreporterdelegate.cpp
+ bucketgctimecalculator.cpp
+ distributor_host_info_reporter.cpp
+ latency_statistics_provider.cpp
+ $<TARGET_OBJECTS:storage_bucketdatabase>
+ INSTALL lib64
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/distributor/activecopy.cpp b/storage/src/vespa/storage/distributor/activecopy.cpp
new file mode 100644
index 00000000000..6c2fba75d2a
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/activecopy.cpp
@@ -0,0 +1,182 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/storage/distributor/activecopy.h>
+
+#include <vespa/storage/storageutil/utils.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <algorithm>
+
+namespace std {
+ template<typename T>
+ std::ostream& operator<<(std::ostream& out, const std::vector<T>& v) {
+ out << "[";
+ for (uint32_t i=0; i<v.size(); ++i) {
+ out << "\n " << v[i];
+ }
+ if (!v.empty()) out << "\n";
+ return out << "]";
+ }
+}
+
+namespace storage {
+namespace distributor {
+
+namespace {
+ struct Entry {
+ uint16_t _nodeIndex;
+ uint16_t _ideal;
+ bool _ready;
+ bool _trusted;
+ bool _active;
+
+ Entry(uint16_t node, BucketDatabase::Entry& e,
+ const std::vector<uint16_t>& idealState)
+ : _nodeIndex(node),
+ _ideal(0xffff)
+ {
+ const BucketCopy* copy = e->getNode(node);
+ assert(copy != 0);
+ _ready = copy->ready();
+ _trusted = copy->trusted();
+ _active = copy->active();
+ for (uint32_t i=0; i<idealState.size(); ++i) {
+ if (idealState[i] == node) _ideal = i;
+ }
+ }
+
+ vespalib::string getReason() {
+ if (_ready && _trusted && _ideal < 0xffff) {
+ vespalib::asciistream ost;
+ ost << "copy is ready, trusted and ideal state priority "
+ << _ideal;
+ return ost.str();
+ } else if (_ready && _trusted) {
+ return "copy is ready and trusted";
+ } else if (_ready) {
+ return "copy is ready";
+ } else if (_trusted && _ideal < 0xffff) {
+ vespalib::asciistream ost;
+ ost << "copy is trusted and ideal state priority " << _ideal;
+ return ost.str();
+ } else if (_trusted) {
+ return "copy is trusted";
+ } else if (_ideal < 0xffff) {
+ vespalib::asciistream ost;
+ ost << "copy is ideal state priority " << _ideal;
+ return ost.str();
+ } else {
+ return "first available copy";
+ }
+ }
+
+ friend std::ostream& operator<<(std::ostream& out, const Entry& e) {
+ out << "Entry(Node " << e._nodeIndex;
+ if (e._ready) out << ", ready";
+ if (e._trusted) out << ", trusted";
+ if (e._ideal < 0xffff) out << ", ideal pri " << e._ideal;
+ out << ")";
+ return out;
+ }
+ };
+
+ struct ActiveStateOrder {
+ bool operator()(const Entry& e1, const Entry& e2) {
+ if (e1._ready != e2._ready) return e1._ready;
+ if (e1._trusted != e2._trusted) return e1._trusted;
+ if (e1._ideal != e2._ideal) return e1._ideal < e2._ideal;
+ if (e1._active != e2._active) return e1._active;
+ return e1._nodeIndex < e2._nodeIndex;
+ }
+ };
+
+ void buildValidNodeIndexList(BucketDatabase::Entry& e,
+ std::vector<uint16_t>& result)
+ {
+ for (uint32_t i=0, n=e->getNodeCount(); i < n; ++i) {
+ const BucketCopy& cp = e->getNodeRef(i);
+ if (!cp.valid()) continue;
+ result.push_back(cp.getNode());
+ }
+ }
+
+ void buildNodeList(BucketDatabase::Entry& e,
+ const std::vector<uint16_t>& nodeIndexes,
+ const std::vector<uint16_t>& idealState,
+ std::vector<Entry>& result)
+ {
+ for (uint32_t i=0; i<nodeIndexes.size(); ++i) {
+ result.push_back(Entry(nodeIndexes[i], e, idealState));
+ }
+ }
+}
+
+#undef DEBUG
+#if 0
+#define DEBUG(a) a
+#else
+#define DEBUG(a)
+#endif
+
+ActiveList
+ActiveCopy::calculate(const std::vector<uint16_t>& idealState,
+ const lib::Distribution& distribution,
+ BucketDatabase::Entry& e)
+{
+ DEBUG(std::cerr << "Ideal state is " << idealState << "\n");
+ std::vector<ActiveCopy> result;
+ std::vector<uint16_t> validNodesWithCopy;
+ buildValidNodeIndexList(e, validNodesWithCopy);
+ if (validNodesWithCopy.empty()) {
+ return result;
+ }
+ typedef std::vector<uint16_t> IndexList;
+ std::vector<IndexList> groups;
+ if (distribution.activePerGroup()) {
+ groups = distribution.splitNodesIntoLeafGroups(validNodesWithCopy);
+ } else {
+ groups.push_back(validNodesWithCopy);
+ }
+ for (uint32_t i=0; i<groups.size(); ++i) {
+ std::vector<Entry> entries;
+ buildNodeList(e, groups[i], idealState, entries);
+ DEBUG(std::cerr << "Finding active for group " << entries << "\n");
+ auto best = std::min_element(entries.begin(), entries.end(),
+ ActiveStateOrder());
+ DEBUG(std::cerr << "Best copy " << *best << "\n");
+ result.push_back(ActiveCopy(best->_nodeIndex,
+ best->getReason()));
+ }
+ return ActiveList(result);
+}
+
+void
+ActiveList::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ out << "[";
+ if (verbose) {
+ for (size_t i=0; i<_v.size(); ++i) {
+ out << "\n" << indent << " "
+ << _v[i].nodeIndex << " " << _v[i].reason;
+ }
+ if (!_v.empty()) out << "\n" << indent;
+ } else {
+ if (!_v.empty()) out << _v[0].nodeIndex;
+ for (size_t i=1; i<_v.size(); ++i) {
+ out << " " << _v[i].nodeIndex;
+ }
+ }
+ out << "]";
+}
+
+bool
+ActiveList::contains(uint16_t node) const
+{
+ for (uint32_t i=0; i<_v.size(); ++i) {
+ if (node == _v[i].nodeIndex) return true;
+ }
+ return false;
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/activecopy.h b/storage/src/vespa/storage/distributor/activecopy.h
new file mode 100644
index 00000000000..73e135af352
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/activecopy.h
@@ -0,0 +1,44 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+
+namespace storage {
+namespace lib {
+ class Distribution;
+}
+namespace distributor {
+
+class ActiveList;
+
+struct ActiveCopy {
+ uint16_t nodeIndex;
+ vespalib::string reason;
+
+ ActiveCopy() : nodeIndex(0xffff), reason(0) {}
+ ActiveCopy(uint16_t index, vespalib::stringref r)
+ : nodeIndex(index), reason(r) {}
+
+ static ActiveList calculate(
+ const std::vector<uint16_t>& idealState,
+ const lib::Distribution&, BucketDatabase::Entry&);
+};
+
+class ActiveList : public vespalib::Printable {
+ std::vector<ActiveCopy> _v;
+
+public:
+ ActiveList() {}
+ ActiveList(std::vector<ActiveCopy>& v) { _v.swap(v); }
+
+ ActiveCopy& operator[](size_t i) { return _v[i]; }
+ const ActiveCopy& operator[](size_t i) const { return _v[i]; }
+ bool contains(uint16_t) const;
+ bool empty() const { return _v.empty(); }
+ size_t size() const { return _v.size(); }
+ void print(std::ostream&, bool verbose, const std::string& indent) const;
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/blockingoperationstarter.cpp b/storage/src/vespa/storage/distributor/blockingoperationstarter.cpp
new file mode 100644
index 00000000000..3be6c73704c
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/blockingoperationstarter.cpp
@@ -0,0 +1,20 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/blockingoperationstarter.h>
+
+namespace storage {
+namespace distributor {
+
+bool
+BlockingOperationStarter::start(const std::shared_ptr<Operation>& operation,
+ Priority priority)
+{
+ if (operation->isBlocked(_messageTracker)) {
+ return true;
+ }
+ return _starterImpl.start(operation, priority);
+}
+
+}
+}
diff --git a/storage/src/vespa/storage/distributor/blockingoperationstarter.h b/storage/src/vespa/storage/distributor/blockingoperationstarter.h
new file mode 100644
index 00000000000..ffd94c7d743
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/blockingoperationstarter.h
@@ -0,0 +1,34 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/operationstarter.h>
+#include <vespa/storage/distributor/operations/operation.h>
+
+namespace storage {
+namespace distributor {
+
+class PendingMessageTracker;
+
+class BlockingOperationStarter : public OperationStarter
+{
+public:
+ BlockingOperationStarter(PendingMessageTracker& messageTracker,
+ OperationStarter& starterImpl)
+ : _messageTracker(messageTracker),
+ _starterImpl(starterImpl)
+ {}
+
+ virtual bool start(const std::shared_ptr<Operation>& operation,
+ Priority priority);
+
+private:
+ BlockingOperationStarter(const BlockingOperationStarter&);
+ BlockingOperationStarter& operator=(const BlockingOperationStarter&);
+
+ PendingMessageTracker& _messageTracker;
+ OperationStarter& _starterImpl;
+};
+
+}
+}
+
diff --git a/storage/src/vespa/storage/distributor/bucketdb/CMakeLists.txt b/storage/src/vespa/storage/distributor/bucketdb/CMakeLists.txt
new file mode 100644
index 00000000000..b9f5f926a65
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdb/CMakeLists.txt
@@ -0,0 +1,13 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_bucketdatabase OBJECT
+ SOURCES
+ bucketinfo.cpp
+ bucketcopy.cpp
+ bucketdatabase.cpp
+ judybucketdatabase.cpp
+ mapbucketdatabase.cpp
+ bucketdbmetricupdater.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/distributor/bucketdb/bucketcopy.cpp b/storage/src/vespa/storage/distributor/bucketdb/bucketcopy.cpp
new file mode 100644
index 00000000000..762fa290109
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdb/bucketcopy.cpp
@@ -0,0 +1,24 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/bucketdb/bucketcopy.h>
+
+namespace storage {
+
+namespace distributor {
+
+void
+BucketCopy::print(std::ostream& out, bool /*verbose*/, const std::string&) const
+{
+ out << "node("
+ << "idx=" << _node
+ << ",crc=" << std::hex << "0x" << getChecksum() << std::dec
+ << ",docs=" << getDocumentCount() << "/" << getMetaCount()
+ << ",bytes=" << getTotalDocumentSize() << "/" << getUsedFileSize()
+ << ",trusted=" << (trusted() ? "true" : "false")
+ << ",active=" << (active() ? "true" : "false")
+ << ")";
+}
+
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/bucketdb/bucketcopy.h b/storage/src/vespa/storage/distributor/bucketdb/bucketcopy.h
new file mode 100644
index 00000000000..5d7c2067fe1
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdb/bucketcopy.h
@@ -0,0 +1,113 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storageapi/buckets/bucketinfo.h>
+
+namespace storage {
+
+namespace distributor {
+
+class BucketCopy {
+private:
+ uint64_t _timestamp;
+ api::BucketInfo _info;
+ uint16_t _flags;
+ uint16_t _node;
+
+public:
+ static const int TRUSTED = 1;
+
+ BucketCopy()
+ : _timestamp(0), _flags(0), _node(0xffff) {}
+
+ BucketCopy(uint64_t timestamp,
+ uint16_t nodeIdx,
+ const api::BucketInfo& info)
+ : _timestamp(timestamp),
+ _info(info),
+ _flags(0),
+ _node(nodeIdx)
+ {
+ }
+
+ bool trusted() const { return _flags & TRUSTED; }
+
+ BucketCopy& setTrusted(bool val = true) {
+ if (!val) {
+ clearTrusted();
+ } else {
+ _flags |= TRUSTED;
+ }
+
+ return *this;
+ }
+
+ void clearTrusted() { _flags &= ~TRUSTED; }
+
+ bool valid() const { return getBucketInfo().valid(); }
+ bool empty() const { return getBucketInfo().empty(); }
+ bool wasRecentlyCreated() const {
+ return (getChecksum() == 1
+ && getDocumentCount() == 0
+ && getTotalDocumentSize() == 0);
+ }
+
+ static BucketCopy recentlyCreatedCopy(uint64_t timestamp, uint16_t nodeIdx)
+ {
+ return BucketCopy(timestamp, nodeIdx, api::BucketInfo(1, 0, 0, 0, 0));
+ }
+
+ uint16_t getNode() const { return _node; }
+ uint64_t getTimestamp() const { return _timestamp; }
+
+ uint32_t getChecksum() const { return _info.getChecksum(); }
+ uint32_t getDocumentCount() const { return _info.getDocumentCount(); }
+ uint32_t getTotalDocumentSize() const
+ { return _info.getTotalDocumentSize(); }
+ uint32_t getMetaCount() const { return _info.getMetaCount(); }
+ uint32_t getUsedFileSize() const { return _info.getUsedFileSize(); }
+ bool active() const { return _info.isActive(); }
+ bool ready() const { return _info.isReady(); }
+
+ const api::BucketInfo& getBucketInfo() const { return _info; }
+
+ void setBucketInfo(uint64_t timestamp, const api::BucketInfo& bInfo) {
+ _info = bInfo;
+ _timestamp = timestamp;
+ }
+
+ void setActive(bool setactive) {
+ _info.setActive(setactive);
+ }
+
+ bool consistentWith(const BucketCopy& other,
+ bool countInvalidAsConsistent = false) const
+ {
+ // If both are valid, check checksum and doc count.
+ if (valid() && other.valid()) {
+ return (getChecksum() == other.getChecksum()
+ && getDocumentCount() == other.getDocumentCount());
+ }
+
+ return countInvalidAsConsistent;
+ }
+
+ void print(std::ostream&, bool verbose, const std::string& indent) const;
+
+ std::string toString() const {
+ std::ostringstream ost;
+ print(ost, true, "");
+ return ost.str();
+ }
+
+ bool operator==(const BucketCopy& other) const {
+ return
+ getBucketInfo() == other.getBucketInfo() &&
+ _flags == other._flags;
+ }
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/bucketdb/bucketdatabase.cpp b/storage/src/vespa/storage/distributor/bucketdb/bucketdatabase.cpp
new file mode 100644
index 00000000000..902b6d88ab0
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdb/bucketdatabase.cpp
@@ -0,0 +1,56 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+
+namespace storage {
+namespace distributor {
+
+namespace {
+ struct GetNextEntryProcessor : public BucketDatabase::EntryProcessor {
+ BucketDatabase::Entry _entry;
+
+ bool process(const BucketDatabase::Entry& e) {
+ _entry = e;
+ return false;
+ }
+ };
+}
+
+BucketDatabase::Entry
+BucketDatabase::getNext(const document::BucketId& last) const
+{
+ return upperBound(last);
+}
+
+BucketDatabase::Entry
+BucketDatabase::createAppropriateBucket(
+ uint16_t minBits, const document::BucketId& bid)
+{
+ document::BucketId newBid(getAppropriateBucket(minBits, bid));
+
+ Entry e(newBid);
+ update(e);
+ return e;
+}
+
+std::ostream& operator<<(std::ostream& o, const BucketDatabase::Entry& e)
+{
+ if (!e.valid()) {
+ o << "NONEXISTING";
+ } else {
+ o << e.getBucketId() << " : " << e.getBucketInfo();
+ }
+ return o;
+}
+
+std::string
+BucketDatabase::Entry::toString() const
+{
+ std::ostringstream ost;
+ ost << *this;
+ return ost.str();
+}
+
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/bucketdb/bucketdatabase.h b/storage/src/vespa/storage/distributor/bucketdb/bucketdatabase.h
new file mode 100644
index 00000000000..a85b5aa5edf
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdb/bucketdatabase.h
@@ -0,0 +1,121 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * Interface for bucket database implementations in the distributor.
+ */
+#pragma once
+
+#include <vespa/vespalib/util/printable.h>
+#include <vespa/storage/distributor/bucketdb/bucketinfo.h>
+#include <vespa/document/bucket/bucketid.h>
+
+namespace storage {
+namespace distributor {
+
+class BucketDatabase : public vespalib::Printable
+{
+public:
+ class Entry {
+ document::BucketId _bucketId;
+ BucketInfo _info;
+
+ public:
+ Entry() : _bucketId(0) {} // Invalid entry
+ Entry(const document::BucketId& bId, const BucketInfo& bucketInfo)
+ : _bucketId(bId), _info(bucketInfo) {}
+ explicit Entry(const document::BucketId& bId) : _bucketId(bId) {}
+
+ bool operator==(const Entry& other) const {
+ return (_bucketId == other._bucketId && _info == other._info);
+ }
+ bool valid() const { return (_bucketId.getRawId() != 0); }
+ std::string toString() const;
+
+ const document::BucketId& getBucketId() const { return _bucketId; }
+ const BucketInfo& getBucketInfo() const { return _info; }
+ BucketInfo& getBucketInfo() { return _info; }
+ BucketInfo* operator->() { return &_info; }
+ const BucketInfo* operator->() const { return &_info; }
+
+ static Entry createInvalid() {
+ return Entry();
+ }
+ };
+
+ template<typename T> struct Processor {
+ virtual ~Processor() {}
+ /** Return false to stop iterating. */
+ virtual bool process(T& e) = 0;
+ };
+ typedef Processor<const Entry> EntryProcessor;
+ typedef Processor<Entry> MutableEntryProcessor;
+
+ virtual ~BucketDatabase() {}
+
+ virtual Entry get(const document::BucketId& bucket) const = 0;
+ virtual void remove(const document::BucketId& bucket) = 0;
+
+ /**
+ * Puts all entries that are can contain the given bucket id
+ * into the given entry vector, including itself if found.
+ */
+ virtual void getParents(const document::BucketId& childBucket,
+ std::vector<Entry>& entries) const = 0;
+
+ /**
+ * Puts the sum of entries from getParents() and getChildren() into
+ * the given vector.
+ */
+ virtual void getAll(const document::BucketId& bucket,
+ std::vector<Entry>& entries) const = 0;
+
+ /**
+ * Updates the entry for the given bucket. Adds the bucket to the bucket
+ * database if it wasn't found.
+ */
+ virtual void update(const Entry& newEntry) = 0;
+
+ virtual void forEach(
+ EntryProcessor&,
+ const document::BucketId& after = document::BucketId()) const = 0;
+ virtual void forEach(
+ MutableEntryProcessor&,
+ const document::BucketId& after = document::BucketId()) = 0;
+
+ /**
+ * Get the first bucket that does _not_ compare less than or equal to
+ * value in standard reverse bucket bit order (i.e. the next bucket in
+ * DB iteration order after value).
+ *
+ * If no such bucket exists, an invalid (empty) entry should be returned.
+ * If upperBound is used as part of database iteration, such a return value
+ * in effect signals that the end of the database has been reached.
+ */
+ virtual Entry upperBound(const document::BucketId& value) const = 0;
+
+ Entry getNext(const document::BucketId& last) const;
+
+ virtual uint64_t size() const = 0;
+ virtual void clear() = 0;
+
+ // FIXME: make const as soon as Judy distributor bucket database
+ // has been removed, as it has no such function and will always
+ // mutate its internal database!
+ virtual document::BucketId getAppropriateBucket(
+ uint16_t minBits,
+ const document::BucketId& bid) = 0;
+ /**
+ * Based on the minimum split bits and the existing buckets,
+ * creates the correct new bucket in the bucket database,
+ * and returns the resulting entry.
+ */
+ BucketDatabase::Entry createAppropriateBucket(
+ uint16_t minBits,
+ const document::BucketId& bid);
+
+ virtual uint32_t childCount(const document::BucketId&) const = 0;
+};
+
+std::ostream& operator<<(std::ostream& o, const BucketDatabase::Entry& e);
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/bucketdb/bucketdbmetricupdater.cpp b/storage/src/vespa/storage/distributor/bucketdb/bucketdbmetricupdater.cpp
new file mode 100644
index 00000000000..1b82cdfeaf3
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdb/bucketdbmetricupdater.cpp
@@ -0,0 +1,139 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/storage/distributor/bucketdb/bucketdbmetricupdater.h>
+
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storage/distributor/idealstatemetricsset.h>
+
+#include <algorithm>
+
+namespace storage {
+namespace distributor {
+
+BucketDBMetricUpdater::BucketDBMetricUpdater()
+ : _workingStats(),
+ _lastCompleteStats(),
+ _replicaCountingMode(ReplicaCountingMode::TRUSTED),
+ _hasCompleteStats(false)
+{
+}
+
+void
+BucketDBMetricUpdater::resetStats()
+{
+ _workingStats = Stats();
+}
+
+void
+BucketDBMetricUpdater::visit(const BucketDatabase::Entry& entry,
+ uint32_t redundancy)
+{
+ if (entry->getNodeCount() == 0) {
+ // We used to have an assert on >0 but that caused some crashes, see
+ // ticket 7275624. Why? Until that gets sorted out, we're disabling the
+ // assert and return, which should be fine since it was the old
+ // behavior.
+ return;
+ }
+
+ ++_workingStats._totalBuckets;
+
+ uint32_t docCount = 0;
+ uint32_t byteCount = 0;
+ uint32_t trustedCopies = 0;
+
+ for (uint32_t i = 0; i < entry->getNodeCount(); i++) {
+ if (entry->getNodeRef(i).trusted()) {
+ if (trustedCopies == 0) {
+ docCount = entry->getNodeRef(i).getDocumentCount();
+ byteCount = entry->getNodeRef(i).getTotalDocumentSize();
+ }
+
+ trustedCopies++;
+ }
+ }
+
+ // If there was no trusted, pick the largest one.
+ if (trustedCopies == 0) {
+ for (uint32_t i = 0; i < entry->getNodeCount(); i++) {
+ uint32_t curr = entry->getNodeRef(i).getDocumentCount();
+
+ if (curr > docCount) {
+ docCount = curr;
+ byteCount = entry->getNodeRef(i).getTotalDocumentSize();
+ }
+ }
+ }
+
+ _workingStats._docCount += docCount;
+ _workingStats._byteCount += byteCount;
+
+ if (trustedCopies < redundancy) {
+ ++_workingStats._tooFewCopies;
+ } else if (trustedCopies > redundancy) {
+ ++_workingStats._tooManyCopies;
+ }
+ if (trustedCopies == 0) {
+ ++_workingStats._noTrusted;
+ }
+ updateMinReplicationStats(entry, trustedCopies);
+}
+
+void
+BucketDBMetricUpdater::updateMinReplicationStats(
+ const BucketDatabase::Entry& entry,
+ uint32_t trustedCopies)
+{
+ auto& minBucketReplica = _workingStats._minBucketReplica;
+ for (uint32_t i = 0; i < entry->getNodeCount(); i++) {
+ const uint16_t node = entry->getNodeRef(i).getNode();
+ // Note: currently we assume there are only 2 counting modes.
+ // Either we only count the trusted replicas, or we count any and all
+ // available replicas without caring about whether or not they are in
+ // sync across each other.
+ // Regardless of counting mode we still have to take the minimum
+ // replica count across all buckets present on any given node.
+ const uint32_t countedReplicas(
+ (_replicaCountingMode == ReplicaCountingMode::TRUSTED)
+ ? trustedCopies : entry->getNodeCount());
+ auto it = minBucketReplica.find(node);
+ if (it == minBucketReplica.end()) {
+ minBucketReplica[node] = countedReplicas;
+ } else {
+ it->second = std::min(it->second, countedReplicas);
+ }
+ }
+}
+
+void
+BucketDBMetricUpdater::completeRound(bool resetWorkingStats)
+{
+ _lastCompleteStats = _workingStats;
+ _hasCompleteStats = true;
+ if (resetWorkingStats) {
+ resetStats();
+ }
+}
+
+void
+BucketDBMetricUpdater::Stats::propagateMetrics(
+ IdealStateMetricSet& idealStateMetrics,
+ DistributorMetricSet& distributorMetrics)
+{
+ distributorMetrics.docsStored.set(_docCount);
+ distributorMetrics.bytesStored.set(_byteCount);
+
+ idealStateMetrics.buckets_toofewcopies.set(_tooFewCopies);
+ idealStateMetrics.buckets_toomanycopies.set(_tooManyCopies);
+ idealStateMetrics.buckets_notrusted.set(_noTrusted);
+ idealStateMetrics.buckets.set(_totalBuckets);
+}
+
+void
+BucketDBMetricUpdater::reset()
+{
+ resetStats();
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/bucketdb/bucketdbmetricupdater.h b/storage/src/vespa/storage/distributor/bucketdb/bucketdbmetricupdater.h
new file mode 100644
index 00000000000..1fb7b5672f5
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdb/bucketdbmetricupdater.h
@@ -0,0 +1,104 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/storage/config/config-stor-distributormanager.h>
+
+#include <unordered_map>
+
+namespace storage {
+
+class DistributorMetricSet;
+
+namespace distributor {
+
+class IdealStateMetricSet;
+
+class BucketDBMetricUpdater {
+public:
+ /** Bucket statistics for a single database iteration */
+ struct Stats {
+ uint64_t _docCount {0};
+ uint64_t _byteCount {0};
+ uint64_t _tooFewCopies {0};
+ uint64_t _tooManyCopies {0};
+ uint64_t _noTrusted {0};
+ uint64_t _totalBuckets {0};
+
+ /**
+ * For each node N, look at all the buckets that have or should have a
+ * bucket copy on that node. For each of these buckets, there is a
+ * number of trusted copies. Take the bucket with the least number of
+ * trusted copies C. _minBucketReplica[N] equals this C.
+ *
+ * C can be used to determine the effect on replication if storage node
+ * N is taken out for maintenance.
+ *
+ * If we could rely 100% on our concept of "trusted copies", then a more
+ * accurate measure for any effect on replication would be to only look
+ * at the buckets for which node N has a trusted copy.
+ *
+ * Note: If no buckets have been found for a node, that node is not in
+ * this map.
+ */
+ std::unordered_map<uint16_t, uint32_t> _minBucketReplica;
+
+ /**
+ * Propagate state values to the appropriate metric values.
+ */
+ void propagateMetrics(IdealStateMetricSet&, DistributorMetricSet&);
+ };
+
+ using ReplicaCountingMode = vespa::config::content::core::StorDistributormanagerConfig::MinimumReplicaCountingMode;
+
+private:
+ Stats _workingStats;
+ Stats _lastCompleteStats;
+ ReplicaCountingMode _replicaCountingMode;
+ bool _hasCompleteStats;
+
+public:
+ BucketDBMetricUpdater();
+
+ void setMinimumReplicaCountingMode(ReplicaCountingMode mode) noexcept {
+ _replicaCountingMode = mode;
+ }
+ ReplicaCountingMode getMinimumReplicaCountingMode() const noexcept {
+ return _replicaCountingMode;
+ }
+
+ void visit(const BucketDatabase::Entry& e, uint32_t redundancy);
+ /**
+ * Reset all values in current working state to zero.
+ */
+ void reset();
+ /**
+ * Called after an entire DB iteration round has been completed. Updates
+ * last complete state with current working state.
+ *
+ * If reset==true, resets current working state to all zero. Using anything
+ * but true here is primarily for unit testing.
+ */
+ void completeRound(bool resetWorkingStats = true);
+
+ /**
+ * Returns true iff completeRound() has been called at least once.
+ */
+ bool hasCompletedRound() const {
+ return _hasCompleteStats;
+ }
+
+ Stats getLastCompleteStats() const {
+ return _lastCompleteStats;
+ }
+
+private:
+ void updateMinReplicationStats(const BucketDatabase::Entry& entry,
+ uint32_t trustedCopies);
+
+ void resetStats();
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/bucketdb/bucketinfo.cpp b/storage/src/vespa/storage/distributor/bucketdb/bucketinfo.cpp
new file mode 100644
index 00000000000..ccb3702bbbd
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdb/bucketinfo.cpp
@@ -0,0 +1,316 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/bucketdb/bucketinfo.h>
+#include <vespa/storage/storageutil/utils.h>
+
+namespace storage {
+
+namespace distributor {
+
+BucketInfo::BucketInfo()
+ : _lastGarbageCollection(0)
+{
+}
+
+bool
+BucketInfo::emptyAndConsistent() const {
+ for (uint32_t i = 0; i < _nodes.size(); i++) {
+ if (!_nodes[i].empty()) return false;
+ }
+ return consistentNodes();
+}
+
+bool
+BucketInfo::validAndConsistent() const {
+ for (uint32_t i = 0; i < _nodes.size(); i++) {
+ if (!_nodes[i].valid()) return false;
+ }
+ return consistentNodes();
+}
+
+bool
+BucketInfo::hasInvalidCopy() const
+{
+ for (uint32_t i = 0; i < _nodes.size(); i++) {
+ if (!_nodes[i].valid()) return true;
+ }
+ return false;
+}
+
+void
+BucketInfo::updateTrusted() {
+ if (validAndConsistent()) {
+ for (uint32_t i = 0; i < _nodes.size(); i++) {
+ _nodes[i].setTrusted();
+ }
+ }
+
+ int trusted = -1;
+ for (uint32_t i = 0; i < _nodes.size(); i++) {
+ if (_nodes[i].trusted()) {
+ trusted = i;
+ break;
+ }
+ }
+
+ if (trusted != -1) {
+ for (uint32_t i = 0; i < _nodes.size(); i++) {
+ if (_nodes[i].consistentWith(_nodes[trusted])) {
+ _nodes[i].setTrusted();
+ } else if (_nodes[i].trusted()) {
+ resetTrusted();
+ return;
+ }
+ }
+ }
+}
+
+void
+BucketInfo::resetTrusted() {
+ for (uint32_t i = 0; i < _nodes.size(); i++) {
+ _nodes[i].clearTrusted();
+ }
+ updateTrusted();
+}
+
+uint16_t
+BucketInfo::getTrustedCount() const {
+ uint32_t trustedCount = 0;
+ for (uint32_t i = 0; i < _nodes.size(); i++) {
+ if (_nodes[i].trusted()) {
+ trustedCount++;
+ }
+ }
+ return trustedCount;
+}
+
+bool
+BucketInfo::consistentNodes(bool countInvalidAsConsistent) const
+{
+ int compareIndex = 0;
+ for (uint32_t i=1; i<_nodes.size(); i++) {
+ if (!_nodes[i].consistentWith(_nodes[compareIndex],
+ countInvalidAsConsistent)) return false;
+ }
+ return true;
+}
+
+void
+BucketInfo::print(std::ostream& out, bool verbose, const std::string& indent) const
+{
+ if (_nodes.empty()) {
+ out << "no nodes";
+ }
+ for (uint32_t i=0; i<_nodes.size(); ++i) {
+ if (i != 0) out << ", ";
+ _nodes[i].print(out, verbose, indent);
+ }
+}
+
+namespace {
+
+struct Sorter {
+ const std::vector<uint16_t>& _order;
+
+ Sorter(const std::vector<uint16_t>& recommendedOrder) :
+ _order(recommendedOrder) {}
+
+ bool operator() (const BucketCopy& a, const BucketCopy& b) {
+ int order_a = -1;
+ for (uint32_t i = 0; i < _order.size(); i++) {
+ if (_order[i] == a.getNode()) {
+ order_a = i;
+ break;
+ }
+ }
+ int order_b = -1;
+ for (uint32_t i = 0; i < _order.size(); i++) {
+ if (_order[i] == b.getNode()) {
+ order_b = i;
+ break;
+ }
+ }
+
+ if (order_b == -1 && order_a == -1) {
+ return a.getNode() < b.getNode();
+ }
+ if (order_b == -1) {
+ return true;
+ }
+ if (order_a == -1) {
+ return false;
+ }
+
+ return order_a < order_b;
+ }
+};
+
+}
+
+void
+BucketInfo::updateNode(const BucketCopy& newCopy)
+{
+ BucketCopy* found = getNodeInternal(newCopy.getNode());
+
+ if (found) {
+ *found = newCopy;
+ updateTrusted();
+ }
+}
+
+void
+BucketInfo::addNodes(const std::vector<BucketCopy>& newCopies,
+ const std::vector<uint16_t>& recommendedOrder)
+{
+ for (uint32_t i = 0; i < newCopies.size(); ++i) {
+ BucketCopy* found = getNodeInternal(newCopies[i].getNode());
+
+ if (found) {
+ if (found->getTimestamp() < newCopies[i].getTimestamp()) {
+ found->setBucketInfo(newCopies[i].getTimestamp(),
+ newCopies[i].getBucketInfo());
+ }
+ } else {
+ _nodes.push_back(newCopies[i]);
+ }
+ }
+
+ std::sort(_nodes.begin(), _nodes.end(), Sorter(recommendedOrder));
+
+ updateTrusted();
+}
+
+void
+BucketInfo::addNode(const BucketCopy& newCopy,
+ const std::vector<uint16_t>& recommendedOrder)
+{
+ addNodes(toVector<BucketCopy>(newCopy),
+ recommendedOrder);
+}
+
+bool
+BucketInfo::removeNode(unsigned short node)
+{
+ for (std::vector<BucketCopy>::iterator iter = _nodes.begin();
+ iter != _nodes.end();
+ iter++) {
+ if (iter->getNode() == node) {
+ _nodes.erase(iter);
+ updateTrusted();
+ return true;
+ }
+ }
+ return false;
+}
+
+const BucketCopy*
+BucketInfo::getNode(uint16_t node) const
+{
+ for (std::vector<BucketCopy>::const_iterator iter = _nodes.begin();
+ iter != _nodes.end();
+ iter++) {
+ if (iter->getNode() == node) {
+ return &*iter;
+ }
+ }
+ return 0;
+}
+
+BucketCopy*
+BucketInfo::getNodeInternal(uint16_t node)
+{
+ for (std::vector<BucketCopy>::iterator iter = _nodes.begin();
+ iter != _nodes.end();
+ iter++) {
+ if (iter->getNode() == node) {
+ return &*iter;
+ }
+ }
+ return 0;
+}
+
+std::vector<uint16_t>
+BucketInfo::getNodes() const {
+ std::vector<uint16_t> result;
+
+ for (uint32_t i = 0; i < _nodes.size(); i++) {
+ result.push_back(_nodes[i].getNode());
+ }
+
+ return result;
+}
+
+uint32_t
+BucketInfo::getHighestDocumentCount() const
+{
+ uint32_t highest = 0;
+ for (uint32_t i = 0; i < _nodes.size(); ++i) {
+ highest = std::max(highest, _nodes[i].getDocumentCount());
+ }
+ return highest;
+}
+
+uint32_t
+BucketInfo::getHighestTotalDocumentSize() const
+{
+ uint32_t highest = 0;
+ for (uint32_t i = 0; i < _nodes.size(); ++i) {
+ highest = std::max(highest, _nodes[i].getTotalDocumentSize());
+ }
+ return highest;
+}
+
+uint32_t
+BucketInfo::getHighestMetaCount() const
+{
+ uint32_t highest = 0;
+ for (uint32_t i = 0; i < _nodes.size(); ++i) {
+ highest = std::max(highest, _nodes[i].getMetaCount());
+ }
+ return highest;
+}
+
+uint32_t
+BucketInfo::getHighestUsedFileSize() const
+{
+ uint32_t highest = 0;
+ for (uint32_t i = 0; i < _nodes.size(); ++i) {
+ highest = std::max(highest, _nodes[i].getUsedFileSize());
+ }
+ return highest;
+}
+
+bool
+BucketInfo::hasRecentlyCreatedEmptyCopy() const
+{
+ for (uint32_t i = 0; i < _nodes.size(); ++i) {
+ if (_nodes[i].wasRecentlyCreated()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool
+BucketInfo::operator==(const BucketInfo& other) const
+{
+ if (_nodes.size() != other._nodes.size()) {
+ return false;
+ }
+
+ for (uint32_t i = 0; i < _nodes.size(); ++i) {
+ if (_nodes[i].getNode() != other._nodes[i].getNode()) {
+ return false;
+ }
+
+ if (!(_nodes[i] == other._nodes[i])) {
+ return false;
+ }
+ }
+
+ return true;
+};
+
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/bucketdb/bucketinfo.h b/storage/src/vespa/storage/distributor/bucketdb/bucketinfo.h
new file mode 100644
index 00000000000..4e450994f8a
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdb/bucketinfo.h
@@ -0,0 +1,177 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/bucketdb/bucketcopy.h>
+
+namespace storage {
+
+namespace distributor {
+
+class BucketInfo
+{
+private:
+ uint32_t _lastGarbageCollection;
+ std::vector<BucketCopy> _nodes;
+
+public:
+ BucketInfo();
+
+ /**
+ * @return Returns the last time when this bucket was "garbage collected".
+ */
+ uint32_t getLastGarbageCollectionTime() const { return _lastGarbageCollection; }
+
+ /**
+ * Sets the last time the bucket was "garbage collected".
+ */
+ void setLastGarbageCollectionTime(uint32_t timestamp) {
+ _lastGarbageCollection = timestamp;
+ }
+
+ /**
+ Update trusted flags if bucket is now complete and consistent.
+ */
+ void updateTrusted();
+
+ /**
+ Removes any historical information on trustedness, and sets the bucket copies to
+ trusted if they are now complete and consistent.
+ */
+ void resetTrusted();
+
+ /** True if the bucket contains no documents and is consistent. */
+ bool emptyAndConsistent() const;
+
+ /**
+ Check that all copies have complete bucket information and are
+ consistent with eachother.
+ */
+ bool validAndConsistent() const;
+
+ /**
+ * True if the bucket contains at least one invalid copy
+ */
+ bool hasInvalidCopy() const;
+
+ /**
+ * Returns the number of trusted nodes this entry has.
+ */
+ uint16_t getTrustedCount() const;
+
+ bool hasTrusted() const {
+ return getTrustedCount() != 0;
+ }
+
+ /**
+ * Check that all of the nodes have the same checksums.
+ *
+ * @param countInCompleteAsInconsistent If false, nodes that are incomplete
+ * are always counted as consistent with complete nodes.
+ */
+ bool consistentNodes(bool countInvalidAsConsistent = false) const;
+
+ static bool mayContain(const BucketInfo&) { return true; }
+ void print(std::ostream&, bool verbose, const std::string& indent) const;
+
+ /**
+ Adds the given node.
+
+ @param recommendedOrder A recommended ordering of nodes.
+ All nodes in this list will be ordered first, in the order
+
+ listed. Any nodes not in this list will be order numerically afterward.
+ @param replace If replace is true, replaces old ones that may exist.
+ */
+ void addNodes(const std::vector<BucketCopy>& newCopies,
+ const std::vector<uint16_t>& recommendedOrder);
+
+ /**
+ Simplified API for the common case of inserting one node. See addNodes().
+ */
+ void addNode(const BucketCopy& newCopy,
+ const std::vector<uint16_t>& recommendedOrder);
+
+ /**
+ Updates bucket information for a node. Does nothing if the node
+ doesn't already exist.
+ */
+ void updateNode(const BucketCopy& newCopy);
+
+ /**
+ Returns true if the node existed and was removed.
+ */
+ bool removeNode(uint16_t node);
+
+ /**
+ * Returns the bucket copy struct for the given node, null if nonexisting
+ */
+ const BucketCopy* getNode(uint16_t node) const;
+
+ /**
+ * Returns the number of nodes this entry has.
+ */
+ uint32_t getNodeCount() const { return _nodes.size(); }
+
+ /**
+ * Returns a list of the nodes this entry has.
+ */
+ std::vector<uint16_t> getNodes() const;
+
+ /**
+ Returns a reference to the node with the given index in the node
+ array. This operation has undefined behaviour if the index given
+ is not within the node count.
+ */
+ const BucketCopy& getNodeRef(uint16_t idx) const {
+ return _nodes[idx];
+ }
+
+ void clearTrusted(uint16_t nodeIdx) {
+ getNodeInternal(nodeIdx)->clearTrusted();
+ }
+
+ /**
+ Clears all nodes from the bucket information.
+ */
+ void clear() { _nodes.clear(); }
+
+ std::string toString() const {
+ std::ostringstream ost;
+ print(ost, true, "");
+ return ost.str();
+ };
+
+ bool verifyLegal() const { return true; }
+
+ uint32_t getHighestDocumentCount() const;
+ uint32_t getHighestTotalDocumentSize() const;
+ uint32_t getHighestMetaCount() const;
+ uint32_t getHighestUsedFileSize() const;
+
+ bool hasRecentlyCreatedEmptyCopy() const;
+
+ bool operator==(const BucketInfo& other) const;
+
+private:
+ friend class DistrBucketDBTest;
+ friend class DistributorTestUtil;
+
+ /**
+ * Returns the bucket copy struct for the given node, null if nonexisting
+ */
+ BucketCopy* getNodeInternal(uint16_t node);
+
+ const BucketCopy& getNodeRefInternal(uint16_t idx) const {
+ return _nodes[idx];
+ }
+
+ void addNodeManual(const BucketCopy& newCopy) { _nodes.push_back(newCopy); }
+};
+
+inline std::ostream& operator<<(std::ostream& out, const BucketInfo& info)
+ { info.print(out, false, ""); return out; }
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/bucketdb/judybucketdatabase.cpp b/storage/src/vespa/storage/distributor/bucketdb/judybucketdatabase.cpp
new file mode 100644
index 00000000000..62dc47f0881
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdb/judybucketdatabase.cpp
@@ -0,0 +1,187 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/bucketdb/judybucketdatabase.h>
+
+namespace storage {
+
+using bucketdb::DistrBucketDatabase;
+
+namespace distributor {
+
+BucketDatabase::Entry
+JudyBucketDatabase::get(const document::BucketId& bucket) const
+{
+ DistrBucketDatabase::WrappedEntry wrp = _db.get(bucket, "", false);
+ if (!wrp.exist()) {
+ return BucketDatabase::Entry();
+ } else {
+ return BucketDatabase::Entry(bucket, *wrp);
+ }
+}
+
+void
+JudyBucketDatabase::remove(const document::BucketId& bucket)
+{
+ DistrBucketDatabase::WrappedEntry wrp = _db.get(bucket, "", false);
+ if (wrp.exist()) {
+ wrp.remove();
+ }
+}
+
+void
+JudyBucketDatabase::getParents(const document::BucketId& childBucket,
+ std::vector<Entry>& entries) const
+{
+ bucketdb::DistrBucketDatabase::EntryMap e = _db.getContained(childBucket, "");
+
+ for (bucketdb::DistrBucketDatabase::EntryMap::iterator iter = e.begin();
+ iter != e.end();
+ ++iter) {
+ entries.push_back(BucketDatabase::Entry(iter->first,
+ *iter->second));
+ }
+}
+
+void
+JudyBucketDatabase::getAll(const document::BucketId& bucket,
+ std::vector<Entry>& entries) const
+{
+ bucketdb::DistrBucketDatabase::EntryMap e = _db.getAll(bucket, "");
+
+ for (bucketdb::DistrBucketDatabase::EntryMap::iterator iter = e.begin();
+ iter != e.end();
+ ++iter) {
+ entries.push_back(BucketDatabase::Entry(iter->first,
+ *iter->second));
+ }
+}
+
+void
+JudyBucketDatabase::update(const Entry& newEntry)
+{
+ DistrBucketDatabase::WrappedEntry wrp = _db.get(newEntry.getBucketId(), "", true);
+ (*wrp) = newEntry.getBucketInfo();
+ wrp.write();
+}
+
+namespace {
+
+class Iterator {
+public:
+ document::BucketId lastBucketId;
+ BucketDatabase::Entry lastEntry;
+
+
+ Iterator(const document::BucketId& b)
+ : lastBucketId(b) {};
+
+ DistrBucketDatabase::Decision operator()(document::BucketId::Type key,
+ DistrBucketDatabase::Entry& info)
+ {
+ document::BucketId bucketId(document::BucketId::keyToBucketId(key));
+
+ if (lastBucketId == bucketId) {
+ return DistrBucketDatabase::CONTINUE;
+ }
+
+ lastEntry = BucketDatabase::Entry(bucketId, info);
+ return DistrBucketDatabase::ABORT;
+ }
+
+};
+
+}
+
+void
+JudyBucketDatabase::forEach(EntryProcessor& processor,
+ const document::BucketId& last) const
+{
+ document::BucketId curr = last;
+
+ JudyBucketDatabase& mutableSelf(const_cast<JudyBucketDatabase&>(*this));
+ Entry currEntry;
+ while ((currEntry = mutableSelf.getNextEntry(curr)).valid()) {
+
+ bool continueProcessing = processor.process(currEntry);
+ if (!continueProcessing) {
+ break;
+ }
+ curr = currEntry.getBucketId();
+ }
+}
+
+BucketDatabase::Entry
+JudyBucketDatabase::getNextEntry(const document::BucketId& curr)
+{
+ return upperBound(curr);
+}
+
+void
+JudyBucketDatabase::forEach(MutableEntryProcessor& processor,
+ const document::BucketId& last)
+{
+ document::BucketId curr = last;
+
+ Entry currEntry;
+ while ((currEntry = getNextEntry(curr)).valid()) {
+
+ Entry lastEntry = currEntry;
+ bool continueProcessing = processor.process(currEntry);
+ if (!(currEntry.getBucketInfo() == lastEntry.getBucketInfo())) {
+ update(currEntry);
+ }
+
+ if (!continueProcessing) {
+ break;
+ }
+ curr = currEntry.getBucketId();
+ }
+}
+
+uint64_t
+JudyBucketDatabase::size() const
+{
+ return _db.size();
+}
+
+void
+JudyBucketDatabase::clear()
+{
+ _db.clear();
+}
+
+// FIXME: mutates database! No read-only functionality for this in LocakableMap!
+document::BucketId
+JudyBucketDatabase::getAppropriateBucket(
+ uint16_t minBits,
+ const document::BucketId& bid)
+{
+ DistrBucketDatabase::WrappedEntry wrp =
+ _db.createAppropriateBucket(minBits, "", bid);
+ return wrp.getBucketId();
+}
+
+uint32_t
+JudyBucketDatabase::childCount(const document::BucketId&) const
+{
+ // Not implemented! Judy map for distributor is deprecated.
+ abort();
+}
+
+BucketDatabase::Entry
+JudyBucketDatabase::upperBound(const document::BucketId& value) const
+{
+ Iterator iter(value);
+ _db.all(iter, "", value.toKey());
+ return iter.lastEntry;
+}
+
+void
+JudyBucketDatabase::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) out; (void) verbose; (void) indent;
+}
+
+}
+}
diff --git a/storage/src/vespa/storage/distributor/bucketdb/judybucketdatabase.h b/storage/src/vespa/storage/distributor/bucketdb/judybucketdatabase.h
new file mode 100644
index 00000000000..13cddd72cdf
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdb/judybucketdatabase.h
@@ -0,0 +1,47 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+
+namespace storage {
+namespace distributor {
+
+class JudyBucketDatabase : public BucketDatabase
+{
+public:
+ virtual Entry get(const document::BucketId& bucket) const;
+ virtual void remove(const document::BucketId& bucket);
+ virtual void getParents(const document::BucketId& childBucket,
+ std::vector<Entry>& entries) const;
+ virtual void getAll(const document::BucketId& bucket,
+ std::vector<Entry>& entries) const;
+ virtual void update(const Entry& newEntry);
+ virtual void forEach(EntryProcessor&,
+ const document::BucketId& after) const;
+ virtual void forEach(MutableEntryProcessor&,
+ const document::BucketId& after);
+ uint64_t size() const;
+ void clear();
+
+ // FIXME: remove! mutates internal database!
+ document::BucketId getAppropriateBucket(
+ uint16_t minBits,
+ const document::BucketId& bid);
+
+ uint32_t childCount(const document::BucketId&) const override;
+
+ Entry upperBound(const document::BucketId& bucket) const override;
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+private:
+ mutable bucketdb::DistrBucketDatabase _db;
+
+ Entry getNextEntry(const document::BucketId& id);
+};
+
+}
+}
+
diff --git a/storage/src/vespa/storage/distributor/bucketdb/mapbucketdatabase.cpp b/storage/src/vespa/storage/distributor/bucketdb/mapbucketdatabase.cpp
new file mode 100644
index 00000000000..0abf24e5821
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdb/mapbucketdatabase.cpp
@@ -0,0 +1,515 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/bucketdb/mapbucketdatabase.h>
+#include <vespa/storage/common/bucketoperationlogger.h>
+
+namespace storage {
+
+namespace distributor {
+
+MapBucketDatabase::MapBucketDatabase()
+{
+ // Allocate the root element.
+ allocate();
+}
+
+MapBucketDatabase::E::~E()
+{
+}
+
+uint32_t
+MapBucketDatabase::allocate()
+{
+ if (!_free.empty()) {
+ uint32_t retVal = _free[_free.size() - 1];
+ _free.pop_back();
+ return retVal;
+ }
+
+ _db.push_back(E());
+ return _db.size() - 1;
+}
+
+uint32_t
+MapBucketDatabase::allocateValue(const document::BucketId& bid)
+{
+ if (!_freeValues.empty()) {
+ uint32_t retVal = _freeValues[_freeValues.size() - 1];
+ _freeValues.pop_back();
+ return retVal;
+ }
+
+ _values.push_back(BucketDatabase::Entry(bid));
+ return _values.size() - 1;
+}
+
+BucketDatabase::Entry*
+MapBucketDatabase::find(int index,
+ uint8_t bitCount,
+ const document::BucketId& bid,
+ bool create)
+{
+ if (index == -1) {
+ return NULL;
+ }
+
+ E& e = _db[index];
+ if (bitCount == bid.getUsedBits()) {
+ if (e.value == -1) {
+ if (create) {
+ e.value = allocateValue(bid);
+ } else {
+ return NULL;
+ }
+ }
+
+ return &_values[e.value];
+ }
+
+ // Must reference _db[index] rather than E, since the address of E may change
+ // in allocate().
+ if (bid.getBit(bitCount) == 0) {
+ if (e.e_0 == -1 && create) {
+ int val = allocate();
+ _db[index].e_0 = val;
+ }
+
+ return find(_db[index].e_0, bitCount + 1, bid, create);
+ } else {
+ if (e.e_1 == -1 && create) {
+ int val = allocate();
+ _db[index].e_1 = val;
+ }
+
+ return find(_db[index].e_1, bitCount + 1, bid, create);
+ }
+}
+
+BucketDatabase::Entry
+MapBucketDatabase::get(const document::BucketId& bucket) const
+{
+ MapBucketDatabase& mutableSelf(const_cast<MapBucketDatabase&>(*this));
+ Entry* found = mutableSelf.find(0, 0, bucket, false);
+ if (found) {
+ return *found;
+ } else {
+ return BucketDatabase::Entry();
+ }
+}
+
+bool
+MapBucketDatabase::remove(int index,
+ uint8_t bitCount,
+ const document::BucketId& bid)
+{
+ if (index == -1) {
+ return false;
+ }
+
+ E& e = _db[index];
+ if (bitCount == bid.getUsedBits()) {
+ if (e.value != -1) {
+ _freeValues.push_back(e.value);
+ e.value = -1;
+ }
+ }
+
+ if (bid.getBit(bitCount) == 0) {
+ if (remove(e.e_0, bitCount + 1, bid)) {
+ e.e_0 = -1;
+ }
+ } else {
+ if (remove(e.e_1, bitCount + 1, bid)) {
+ e.e_1 = -1;
+ }
+ }
+
+ if (e.empty() && index > 0) {
+ _free.push_back(index);
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void
+MapBucketDatabase::remove(const document::BucketId& bucket)
+{
+ LOG_BUCKET_OPERATION_NO_LOCK(bucket, "REMOVING from bucket db!");
+ remove(0, 0, bucket);
+}
+
+void
+MapBucketDatabase::update(const Entry& newEntry)
+{
+ assert(newEntry.valid());
+ LOG_BUCKET_OPERATION_NO_LOCK(
+ newEntry.getBucketId(),
+ vespalib::make_vespa_string(
+ "bucketdb insert of %s", newEntry.toString().c_str()));
+
+ Entry* found = find(0, 0, newEntry.getBucketId(), true);
+ assert(found);
+ *found = newEntry;
+}
+
+void
+MapBucketDatabase::findParents(int index,
+ uint8_t bitCount,
+ const document::BucketId& bid,
+ std::vector<Entry>& entries) const
+{
+ if (index == -1) {
+ return;
+ }
+
+ const E& e = _db[index];
+ if (e.value != -1) {
+ entries.push_back(_values[e.value]);
+ }
+
+ if (bitCount >= bid.getUsedBits()) {
+ return;
+ }
+
+ if (bid.getBit(bitCount) == 0) {
+ findParents(e.e_0, bitCount + 1, bid, entries);
+ } else {
+ findParents(e.e_1, bitCount + 1, bid, entries);
+ }
+}
+
+
+void
+MapBucketDatabase::getParents(const document::BucketId& childBucket,
+ std::vector<Entry>& entries) const
+{
+ findParents(0, 0, childBucket, entries);
+}
+
+void
+MapBucketDatabase::findAll(int index,
+ uint8_t bitCount,
+ const document::BucketId& bid,
+ std::vector<Entry>& entries) const
+{
+ if (index == -1) {
+ return;
+ }
+
+ const E& e = _db[index];
+ if (e.value != -1) {
+ entries.push_back(_values[e.value]);
+ }
+
+ if (bitCount >= bid.getUsedBits()) {
+ findAll(e.e_0, bitCount + 1, bid, entries);
+ findAll(e.e_1, bitCount + 1, bid, entries);
+ } else {
+ if (bid.getBit(bitCount) == 0) {
+ findAll(e.e_0, bitCount + 1, bid, entries);
+ } else {
+ findAll(e.e_1, bitCount + 1, bid, entries);
+ }
+ }
+}
+
+void
+MapBucketDatabase::getAll(const document::BucketId& bucket,
+ std::vector<Entry>& entries) const
+{
+ findAll(0, 0, bucket, entries);
+}
+
+/**
+ * Any child bucket under a bucket held in an inner node will be ordered after
+ * (i.e. be greater than) the inner node bucket. This is because in bucket key
+ * order these have the same bit prefix but are guaranteed to have a suffix that
+ * make them greater. From our bucket ordering spec, a bucket with 5 bits of
+ * 00000 is greater than a bucket of 3 bits of 000 because the suffix logically
+ * takes into account the number of used bucket bits (meaning the actual
+ * values are more akin to 000000000:5 and 00000000:3). When traversing the bit
+ * tree, we mirror this behavior since all child nodes by definition have a
+ * higher used bit value from their depth in the tree.
+ */
+int
+MapBucketDatabase::findFirstInOrderNodeInclusive(int index) const
+{
+ if (index == -1) {
+ return -1;
+ }
+
+ int follow = index;
+ while (true) {
+ const E& e = _db[follow];
+ if (e.value != -1) {
+ return follow;
+ }
+ // In-order 0 bits sort before 1 bits so we follow the 0 branch if
+ // at all possible. It is illegal for a branch to exist without there
+ // existing a leaf somewhere underneath it, so we're destined to hit
+ // something if it exists.
+ follow = (e.e_0 != -1 ? e.e_0 : e.e_1);
+ if (follow == -1) {
+ return -1;
+ }
+ }
+}
+
+/**
+ * Follow the bit tree as far as we can based on upper bound `value`. To get a
+ * bucket with an ID greater than `value` we must try to follow the bit tree
+ * as far down as possible, taking the branches that correspond to our input
+ * value:
+ * 1) If input value has a 0 bit in the `depth` position but no such branch
+ * exists at the current node we look in its 1 branch (if one exists),
+ * returning the first in-order child.
+ * 2) If we've reached a node that equals the input value (current depth
+ * equals used bits), look for the first in-order child under the node
+ * in question.
+ * 3) Otherwise, keep recursing down the same bit prefix subtree.
+ */
+int
+MapBucketDatabase::upperBoundImpl(int index,
+ uint8_t depth,
+ const document::BucketId& value) const
+{
+ if (index == -1) {
+ return -1; // Branch with no children; bail out and up.
+ }
+
+ const E& e = _db[index];
+ if (depth < value.getUsedBits()) {
+ if (value.getBit(depth) == 0) {
+ int candidate = upperBoundImpl(e.e_0, depth + 1, value);
+ if (candidate != -1) {
+ return candidate;
+ }
+ // No choice but to try to follow 1-branch.
+ return findFirstInOrderNodeInclusive(e.e_1);
+ } else {
+ return upperBoundImpl(e.e_1, depth + 1, value);
+ }
+ } else {
+ // We've hit a node whose bucket ID corresponds exactly to that given
+ // in `value`. Find the first in-order child node, if one exists.
+ // Please see findFirstInOrderNodeInclusive() comments for an
+ // explanation of why this satisfies the upper bound ordering
+ // requirements.
+ // Due to Funky Business(tm) inside BucketId, asking for getBit beyond
+ // usedBits returns potentially undefined values, so we have to treat
+ // this case by itself.
+ int candidate = findFirstInOrderNodeInclusive(e.e_0);
+ if (candidate == -1) {
+ candidate = findFirstInOrderNodeInclusive(e.e_1);
+ }
+ return candidate;
+ }
+}
+
+BucketDatabase::Entry
+MapBucketDatabase::upperBound(const document::BucketId& value) const
+{
+ int index = upperBoundImpl(0, 0, value);
+ if (index != -1) {
+ assert(_db[index].value != -1);
+ return _values[_db[index].value];
+ }
+ return Entry::createInvalid();
+}
+
+template <typename EntryProcessorType>
+bool
+MapBucketDatabase::forEach(int index,
+ EntryProcessorType& processor,
+ uint8_t bitCount,
+ const document::BucketId& lowerBound,
+ bool& process)
+{
+ if (index == -1) {
+ return true;
+ }
+
+ E& e = _db[index];
+ if (e.value != -1 && process && !processor.process(_values[e.value])) {
+ return false;
+ }
+
+ // We have followed the bucket to where we want to start,
+ // start processing.
+ if (!process && bitCount >= lowerBound.getUsedBits()) {
+ process = true;
+ }
+
+ if (process || lowerBound.getBit(bitCount) == 0) {
+ if (!forEach(e.e_0, processor, bitCount + 1, lowerBound, process)) {
+ return false;
+ }
+ }
+
+ if (process || lowerBound.getBit(bitCount) != 0) {
+ if (!forEach(e.e_1, processor, bitCount + 1, lowerBound, process)) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void
+MapBucketDatabase::forEach(EntryProcessor& processor,
+ const document::BucketId& after) const
+{
+ bool process = false;
+ MapBucketDatabase& mutableSelf(const_cast<MapBucketDatabase&>(*this));
+ mutableSelf.forEach(0, processor, 0, after, process);
+}
+
+void
+MapBucketDatabase::forEach(MutableEntryProcessor& processor,
+ const document::BucketId& after)
+{
+ bool process = false;
+ forEach(0, processor, 0, after, process);
+}
+
+void
+MapBucketDatabase::clear()
+{
+ _db.clear();
+ _values.clear();
+ _free.clear();
+ _freeValues.clear();
+ allocate();
+}
+
+uint8_t
+MapBucketDatabase::getHighestSplitBit(int index,
+ uint8_t bitCount,
+ const document::BucketId& bid,
+ uint8_t minCount)
+{
+ if (index == -1) {
+ return minCount;
+ }
+
+ E& e = _db[index];
+ if (bitCount == bid.getUsedBits()) {
+ return minCount;
+ }
+
+ if (bid.getBit(bitCount) == 0) {
+ if (e.e_0 != -1) {
+ minCount = getHighestSplitBit(e.e_0,
+ bitCount + 1,
+ bid,
+ minCount);
+ }
+
+ if (e.e_1 != -1) {
+ return std::max((int)minCount, bitCount + 1);
+ }
+ } else {
+ if (e.e_1 != -1) {
+ minCount = getHighestSplitBit(e.e_1,
+ bitCount + 1,
+ bid,
+ minCount);
+ }
+
+ if (e.e_0 != -1) {
+ return std::max((int)minCount, bitCount + 1);
+ }
+ }
+
+ return minCount;
+
+}
+
+document::BucketId
+MapBucketDatabase::getAppropriateBucket(
+ uint16_t minBits,
+ const document::BucketId& bid)
+{
+ return document::BucketId(getHighestSplitBit(0, 0, bid, minBits),
+ bid.getRawId());
+}
+
+uint32_t
+MapBucketDatabase::childCountImpl(int index,
+ uint8_t bitCount,
+ const document::BucketId& b) const
+{
+ if (index == -1) {
+ // A non-existing node cannot have any subtrees (obviously).
+ return 0;
+ }
+ const E& e(_db[index]);
+ if (bitCount == b.getUsedBits()) {
+ // If a child has a valid index, it counts as a subtree.
+ return ((e.e_0 != -1) + (e.e_1 != -1));
+ }
+ if (b.getBit(bitCount) == 0) {
+ return childCountImpl(e.e_0, bitCount + 1, b);
+ } else {
+ return childCountImpl(e.e_1, bitCount + 1, b);
+ }
+}
+
+uint32_t
+MapBucketDatabase::childCount(const document::BucketId& b) const
+{
+ return childCountImpl(0, 0, b);
+}
+
+
+namespace {
+ struct Writer : public BucketDatabase::EntryProcessor {
+ std::ostream& _ost;
+ Writer(std::ostream& ost) : _ost(ost) {}
+ virtual bool process(const BucketDatabase::Entry& e) {
+ _ost << e.toString() << "\n";
+ return true;
+ }
+ };
+}
+
+void
+MapBucketDatabase::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) indent;
+ if (verbose) {
+ Writer writer(out);
+ forEach(writer);
+ /* Write out all the gory details to debug
+ out << "Entries {";
+ for (uint32_t i=0, n=_db.size(); i<n; ++i) {
+ out << "\n" << indent << " " << _db[i].e_0 << "," << _db[i].e_1
+ << "," << _db[i].value;
+ }
+ out << "\n" << indent << "}";
+ out << "Free {";
+ for (uint32_t i=0, n=_free.size(); i<n; ++i) {
+ out << "\n" << indent << " " << _free[i];
+ }
+ out << "\n" << indent << "}";
+ out << "Entries {";
+ for (uint32_t i=0, n=_values.size(); i<n; ++i) {
+ out << "\n" << indent << " " << _values[i];
+ }
+ out << "\n" << indent << "}";
+ out << "Free {";
+ for (uint32_t i=0, n=_freeValues.size(); i<n; ++i) {
+ out << "\n" << indent << " " << _freeValues[i];
+ }
+ out << "\n" << indent << "}";
+ */
+ } else {
+ out << "Size(" << size() << ") Nodes("
+ << (_db.size() - _free.size() - 1) << ")";
+ }
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/bucketdb/mapbucketdatabase.h b/storage/src/vespa/storage/distributor/bucketdb/mapbucketdatabase.h
new file mode 100644
index 00000000000..9574d4fe114
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdb/mapbucketdatabase.h
@@ -0,0 +1,111 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <map>
+
+namespace storage {
+
+namespace distributor {
+
+class MapBucketDatabase : public BucketDatabase
+{
+public:
+ MapBucketDatabase();
+
+ virtual Entry get(const document::BucketId& bucket) const;
+ virtual void remove(const document::BucketId& bucket);
+ virtual void getParents(const document::BucketId& childBucket,
+ std::vector<Entry>& entries) const;
+ virtual void getAll(const document::BucketId& bucket,
+ std::vector<Entry>& entries) const;
+ virtual void update(const Entry& newEntry);
+ virtual void forEach(
+ EntryProcessor&,
+ const document::BucketId& after = document::BucketId()) const;
+ virtual void forEach(
+ MutableEntryProcessor&,
+ const document::BucketId& after = document::BucketId());
+ uint64_t size() const { return _values.size() - _freeValues.size(); };
+ void clear();
+
+ uint32_t childCount(const document::BucketId&) const override;
+
+ Entry upperBound(const document::BucketId& value) const override;
+
+ document::BucketId getAppropriateBucket(
+ uint16_t minBits,
+ const document::BucketId& bid);
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+private:
+ struct E {
+ E() : value(-1), e_0(-1), e_1(-1) {};
+ ~E();
+
+ bool empty() {
+ return (value == -1 && e_0 == -1 && e_1 == -1);
+ };
+
+ int value;
+ int e_0;
+ int e_1;
+ };
+
+ BucketDatabase::Entry* find(int idx,
+ uint8_t bitCount,
+ const document::BucketId& bid,
+ bool create);
+
+ bool remove(int index,
+ uint8_t bitCount,
+ const document::BucketId& bId);
+
+ int findFirstInOrderNodeInclusive(int index) const;
+
+ int upperBoundImpl(int index,
+ uint8_t depth,
+ const document::BucketId& value) const;
+
+ template <typename EntryProcessorType>
+ bool forEach(int index,
+ EntryProcessorType& processor,
+ uint8_t bitCount,
+ const document::BucketId& lowerBound,
+ bool& process);
+
+ void findParents(int index,
+ uint8_t bitCount,
+ const document::BucketId& bid,
+ std::vector<Entry>& entries) const;
+
+ void findAll(int index,
+ uint8_t bitCount,
+ const document::BucketId& bid,
+ std::vector<Entry>& entries) const;
+
+ uint8_t getHighestSplitBit(int index,
+ uint8_t bitCount,
+ const document::BucketId& bid,
+ uint8_t minCount);
+
+ uint32_t childCountImpl(int index,
+ uint8_t bitCount,
+ const document::BucketId& b) const;
+
+ uint32_t allocate();
+ uint32_t allocateValue(const document::BucketId& bid);
+
+ std::vector<E> _db;
+ std::vector<uint32_t> _free;
+
+ std::vector<BucketDatabase::Entry> _values;
+ std::vector<uint32_t> _freeValues;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.cpp b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
new file mode 100644
index 00000000000..6ecb8dd9276
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdbupdater.cpp
@@ -0,0 +1,746 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/bucketdbupdater.h>
+#include <vespa/storageapi/messageapi/storagereply.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/removelocation.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <vespa/storage/common/bucketoperationlogger.h>
+#include <vespa/storage/storageutil/utils.h>
+#include <vespa/storage/distributor/distributor.h>
+#include <vespa/storage/distributor/simpleclusterinformation.h>
+
+LOG_SETUP(".distributor.bucketdb.updater");
+
+using storage::lib::Node;
+using storage::lib::NodeType;
+
+namespace storage {
+namespace distributor {
+
+BucketDBUpdater::BucketDBUpdater(Distributor& owner,
+ DistributorMessageSender& sender,
+ DistributorComponentRegister& compReg)
+ : framework::StatusReporter("bucketdb", "Bucket DB Updater"),
+ _distributorComponent(owner, compReg, "Bucket DB Updater"),
+ _sender(sender)
+{
+}
+
+BucketDBUpdater::~BucketDBUpdater()
+{
+}
+
+void
+BucketDBUpdater::flush()
+{
+ for (std::map<uint64_t, BucketRequest>::iterator
+ i(_sentMessages.begin()), end(_sentMessages.end());
+ i != end; ++i)
+ {
+ // Cannot sendDown MergeBucketReplies during flushing, since
+ // all lower links have been closed
+ if (i->second._mergeReplyGuard.get()) {
+ i->second._mergeReplyGuard->resetReply();
+ }
+ }
+ _sentMessages.clear();
+}
+
+void
+BucketDBUpdater::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "BucketDBUpdater";
+}
+
+bool
+BucketDBUpdater::hasPendingClusterState() const
+{
+ return _pendingClusterState.get() != nullptr;
+}
+
+BucketOwnership
+BucketDBUpdater::checkOwnershipInPendingState(const document::BucketId& b) const
+{
+ if (hasPendingClusterState()) {
+ const lib::ClusterState& state(
+ _pendingClusterState->getNewClusterState());
+ const lib::Distribution& distribution(
+ _pendingClusterState->getDistribution());
+ if (!_distributorComponent.ownsBucketInState(distribution, state, b)) {
+ return BucketOwnership::createNotOwnedInState(state);
+ }
+ }
+ return BucketOwnership::createOwned();
+}
+
+void
+BucketDBUpdater::clearPending(uint16_t node)
+{
+ for (std::map<uint64_t, BucketRequest>::iterator iter(
+ _sentMessages.begin()); iter != _sentMessages.end();)
+ {
+ if (iter->second.targetNode == node) {
+ std::map<uint64_t, BucketRequest>::iterator del = iter;
+ iter++;
+ _sentMessages.erase(del);
+ } else {
+ iter++;
+ }
+ }
+}
+
+void
+BucketDBUpdater::sendRequestBucketInfo(
+ uint16_t node,
+ const document::BucketId& bucket,
+ const std::shared_ptr<MergeReplyGuard>& mergeReplyGuard)
+{
+ if (!_distributorComponent.storageNodeIsUp(node)) {
+ return;
+ }
+
+ std::vector<document::BucketId> buckets;
+ buckets.push_back(bucket);
+
+ std::shared_ptr<api::RequestBucketInfoCommand> msg(
+ new api::RequestBucketInfoCommand(buckets));
+
+ LOG(debug,
+ "Sending request bucket info command %lu for "
+ "bucket %s to node %u",
+ msg->getMsgId(),
+ bucket.toString().c_str(),
+ node);
+
+ msg->setPriority(50);
+ msg->setAddress(_distributorComponent.nodeAddress(node));
+
+ _sentMessages[msg->getMsgId()] =
+ BucketRequest(node, _distributorComponent.getUniqueTimestamp(),
+ bucket, mergeReplyGuard);
+ _sender.sendCommand(msg);
+}
+
+void
+BucketDBUpdater::recheckBucketInfo(uint32_t nodeIdx,
+ const document::BucketId& bid)
+{
+ sendRequestBucketInfo(nodeIdx, bid, std::shared_ptr<MergeReplyGuard>());
+}
+
+void
+BucketDBUpdater::removeSuperfluousBuckets(
+ const lib::Distribution& newDistribution,
+ const lib::ClusterState& newState)
+{
+ // Remove all buckets not belonging to this distributor, or
+ // being on storage nodes that are no longer up.
+ NodeRemover proc(
+ _distributorComponent.getClusterState(),
+ newState,
+ _distributorComponent.getBucketIdFactory(),
+ _distributorComponent.getIndex(),
+ newDistribution,
+ _distributorComponent.getDistributor().getStorageNodeUpStates());
+
+ _distributorComponent.getBucketDatabase().forEach(proc);
+
+ for (uint32_t i = 0; i < proc.getBucketsToRemove().size(); ++i) {
+ _distributorComponent.getBucketDatabase()
+ .remove(proc.getBucketsToRemove()[i]);
+ }
+}
+
+void
+BucketDBUpdater::storageDistributionChanged(
+ const lib::Distribution& distribution)
+{
+ removeSuperfluousBuckets(distribution,
+ _distributorComponent.getClusterState());
+
+ ClusterInformation::CSP clusterInfo(new SimpleClusterInformation(
+ _distributorComponent.getIndex(),
+ distribution,
+ _distributorComponent.getClusterState(),
+ _distributorComponent.getDistributor().getStorageNodeUpStates()));
+ _pendingClusterState = PendingClusterState::createForDistributionChange(
+ _distributorComponent.getClock(),
+ std::move(clusterInfo),
+ _sender,
+ _distributorComponent.getUniqueTimestamp());
+ _outdatedNodes = _pendingClusterState->getOutdatedNodeSet();
+}
+
+void
+BucketDBUpdater::replyToPreviousPendingClusterStateIfAny()
+{
+ if (_pendingClusterState.get() &&
+ _pendingClusterState->getCommand().get())
+ {
+ _distributorComponent.sendUp(
+ std::make_shared<api::SetSystemStateReply>(*_pendingClusterState->getCommand()));
+ }
+}
+
+bool
+BucketDBUpdater::onSetSystemState(
+ const std::shared_ptr<api::SetSystemStateCommand>& cmd)
+{
+ LOG(debug,
+ "Received new cluster state %s",
+ cmd->getSystemState().toString().c_str());
+
+ lib::ClusterState oldState = _distributorComponent.getClusterState();
+ const lib::ClusterState& state = cmd->getSystemState();
+
+ if (state == oldState) {
+ return false;
+ }
+
+ removeSuperfluousBuckets(
+ _distributorComponent.getDistribution(),
+ cmd->getSystemState());
+ replyToPreviousPendingClusterStateIfAny();
+
+ ClusterInformation::CSP clusterInfo(
+ new SimpleClusterInformation(
+ _distributorComponent.getIndex(),
+ _distributorComponent.getDistribution(),
+ _distributorComponent.getClusterState(),
+ _distributorComponent.getDistributor()
+ .getStorageNodeUpStates()));
+ _pendingClusterState = PendingClusterState::createForClusterStateChange(
+ _distributorComponent.getClock(),
+ std::move(clusterInfo),
+ _sender,
+ cmd,
+ _outdatedNodes,
+ _distributorComponent.getUniqueTimestamp());
+ _outdatedNodes = _pendingClusterState->getOutdatedNodeSet();
+
+ if (isPendingClusterStateCompleted()) {
+ processCompletedPendingClusterState();
+ }
+ return true;
+}
+
+BucketDBUpdater::MergeReplyGuard::~MergeReplyGuard()
+{
+ if (_reply.get()) {
+ _updater.getDistributorComponent().getDistributor()
+ .handleCompletedMerge(_reply);
+ }
+}
+
+bool
+BucketDBUpdater::onMergeBucketReply(
+ const std::shared_ptr<api::MergeBucketReply>& reply)
+{
+ std::shared_ptr<MergeReplyGuard> replyGuard(
+ new MergeReplyGuard(*this, reply));
+
+ // In case the merge was unsuccessful somehow, or some nodes weren't
+ // actually merged (source-only nodes?) we request the bucket info of the
+ // bucket again to make sure it's ok.
+ for (uint32_t i = 0; i < reply->getNodes().size(); i++) {
+ sendRequestBucketInfo(reply->getNodes()[i].index,
+ reply->getBucketId(),
+ replyGuard);
+ }
+
+ return true;
+}
+
+void
+BucketDBUpdater::enqueueRecheckUntilPendingStateEnabled(
+ uint16_t node,
+ const document::BucketId& bucket)
+{
+ LOG(spam,
+ "DB updater has a pending cluster state, enqueuing recheck "
+ "of bucket %s on node %u until state is done processing",
+ bucket.toString().c_str(),
+ node);
+ _enqueuedRechecks.insert(EnqueuedBucketRecheck(node, bucket));
+}
+
+void
+BucketDBUpdater::sendAllQueuedBucketRechecks()
+{
+ LOG(spam,
+ "Sending %zu queued bucket rechecks previously received "
+ "via NotifyBucketChange commands",
+ _enqueuedRechecks.size());
+
+ typedef std::set<EnqueuedBucketRecheck>::const_iterator const_iterator;
+ for (const_iterator it(_enqueuedRechecks.begin()),
+ e(_enqueuedRechecks.end()); it != e; ++it)
+ {
+ sendRequestBucketInfo(it->node,
+ it->bucket,
+ std::shared_ptr<MergeReplyGuard>());
+ }
+ _enqueuedRechecks.clear();
+}
+
+bool
+BucketDBUpdater::onNotifyBucketChange(
+ const std::shared_ptr<api::NotifyBucketChangeCommand>& cmd)
+{
+ // Immediately schedule reply to ensure it is sent.
+ _sender.sendReply(std::shared_ptr<api::StorageReply>(
+ new api::NotifyBucketChangeReply(*cmd)));
+
+ if (!cmd->getBucketInfo().valid()) {
+ LOG(error,
+ "Received invalid bucket info for bucket %s from notify bucket "
+ "change! Not updating bucket.",
+ cmd->getBucketId().toString().c_str());
+ return true;
+ }
+ LOG(debug,
+ "Received notify bucket change from node %u for bucket %s with %s.",
+ cmd->getSourceIndex(),
+ cmd->getBucketId().toString().c_str(),
+ cmd->getBucketInfo().toString().c_str());
+
+ if (hasPendingClusterState()) {
+ enqueueRecheckUntilPendingStateEnabled(cmd->getSourceIndex(),
+ cmd->getBucketId());
+ } else {
+ sendRequestBucketInfo(cmd->getSourceIndex(),
+ cmd->getBucketId(),
+ std::shared_ptr<MergeReplyGuard>());
+ }
+
+ return true;
+}
+
+bool sort_pred(const BucketListMerger::BucketEntry& left,
+ const BucketListMerger::BucketEntry& right)
+{
+ return left.first < right.first;
+}
+
+bool
+BucketDBUpdater::onRequestBucketInfoReply(
+ const std::shared_ptr<api::RequestBucketInfoReply> & repl)
+{
+ if (pendingClusterStateAccepted(repl)) {
+ return true;
+ }
+ return processSingleBucketInfoReply(repl);
+}
+
+bool
+BucketDBUpdater::pendingClusterStateAccepted(
+ const std::shared_ptr<api::RequestBucketInfoReply> & repl)
+{
+ if (_pendingClusterState.get()
+ && _pendingClusterState->onRequestBucketInfoReply(repl))
+ {
+ if (isPendingClusterStateCompleted()) {
+ processCompletedPendingClusterState();
+ }
+ return true;
+ }
+ LOG(spam,
+ "Reply %s was not accepted by pending cluster state",
+ repl->toString().c_str());
+ return false;
+}
+
+void
+BucketDBUpdater::handleSingleBucketInfoFailure(
+ const std::shared_ptr<api::RequestBucketInfoReply>& repl,
+ const BucketRequest& req)
+{
+ LOG(debug, "Request bucket info failed towards node %d: error was %s",
+ req.targetNode, repl->getResult().toString().c_str());
+
+ if (req.bucket != document::BucketId(0)) {
+ framework::MilliSecTime sendTime(_distributorComponent.getClock());
+ sendTime += framework::MilliSecTime(100);
+ _delayedRequests.push_back(std::make_pair(sendTime, req));
+ }
+}
+
+void
+BucketDBUpdater::resendDelayedMessages()
+{
+ if (_pendingClusterState.get()) {
+ _pendingClusterState->resendDelayedMessages();
+ }
+ if (_delayedRequests.empty()) return; // Don't fetch time if not needed
+ framework::MilliSecTime currentTime(_distributorComponent.getClock());
+ while (!_delayedRequests.empty()
+ && currentTime >= _delayedRequests.front().first)
+ {
+ BucketRequest& req(_delayedRequests.front().second);
+ sendRequestBucketInfo(req.targetNode,
+ req.bucket,
+ std::shared_ptr<MergeReplyGuard>());
+ _delayedRequests.pop_front();
+ }
+}
+
+void
+BucketDBUpdater::convertBucketInfoToBucketList(
+ const std::shared_ptr<api::RequestBucketInfoReply>& repl,
+ uint16_t targetNode,
+ BucketListMerger::BucketList& newList)
+{
+ for (uint32_t i = 0; i < repl->getBucketInfo().size(); i++) {
+ LOG(debug,
+ "Received bucket information from node %u for bucket %s: %s",
+ targetNode,
+ repl->getBucketInfo()[i]._bucketId.toString().c_str(),
+ repl->getBucketInfo()[i]._info.toString().c_str());
+
+ newList.push_back(BucketListMerger::BucketEntry(
+ repl->getBucketInfo()[i]._bucketId,
+ repl->getBucketInfo()[i]._info));
+ }
+}
+
+void
+BucketDBUpdater::mergeBucketInfoWithDatabase(
+ const std::shared_ptr<api::RequestBucketInfoReply>& repl,
+ const BucketRequest& req)
+{
+ BucketListMerger::BucketList existing;
+ BucketListMerger::BucketList newList;
+
+ findRelatedBucketsInDatabase(req.targetNode, req.bucket, existing);
+ convertBucketInfoToBucketList(repl, req.targetNode, newList);
+
+ std::sort(existing.begin(), existing.end(), sort_pred);
+ std::sort(newList.begin(), newList.end(), sort_pred);
+
+ BucketListMerger merger(newList, existing, req.timestamp);
+ updateDatabase(req.targetNode, merger);
+}
+
+bool
+BucketDBUpdater::processSingleBucketInfoReply(
+ const std::shared_ptr<api::RequestBucketInfoReply> & repl)
+{
+ std::map<uint64_t, BucketRequest>::iterator iter =
+ _sentMessages.find(repl->getMsgId());
+
+ // Has probably been deleted for some reason earlier.
+ if (iter == _sentMessages.end()) {
+ return true;
+ }
+
+ BucketRequest req = iter->second;
+ _sentMessages.erase(iter);
+
+ if (!_distributorComponent.storageNodeIsUp(req.targetNode)) {
+ // Ignore replies from nodes that are down.
+ return true;
+ }
+ if (repl->getResult().getResult() != api::ReturnCode::OK) {
+ handleSingleBucketInfoFailure(repl, req);
+ return true;
+ }
+ mergeBucketInfoWithDatabase(repl, req);
+ return true;
+}
+
+void
+BucketDBUpdater::addBucketInfoForNode(
+ const BucketDatabase::Entry& e,
+ uint16_t node,
+ BucketListMerger::BucketList& existing) const
+{
+ const BucketCopy* copy(e->getNode(node));
+ if (copy) {
+ existing.push_back(BucketListMerger::BucketEntry(
+ e.getBucketId(), copy->getBucketInfo()));
+ }
+}
+
+void
+BucketDBUpdater::findRelatedBucketsInDatabase(
+ uint16_t node,
+ const document::BucketId& bucketId,
+ BucketListMerger::BucketList& existing)
+{
+ std::vector<BucketDatabase::Entry> entries;
+ _distributorComponent.getBucketDatabase().getAll(bucketId, entries);
+
+ for (uint32_t j = 0; j < entries.size(); ++j) {
+ addBucketInfoForNode(entries[j], node, existing);
+ }
+}
+
+void
+BucketDBUpdater::updateDatabase(uint16_t node, BucketListMerger& merger)
+{
+ for (uint32_t i = 0; i < merger.getRemovedEntries().size(); i++) {
+ _distributorComponent.removeNodeFromDB(merger.getRemovedEntries()[i], node);
+ }
+
+ for (uint32_t i = 0; i < merger.getAddedEntries().size(); i++) {
+ const BucketListMerger::BucketEntry& entry(
+ merger.getAddedEntries()[i]);
+
+ _distributorComponent.updateBucketDatabase(
+ entry.first,
+ BucketCopy(merger.getTimestamp(), node, entry.second),
+ DatabaseUpdate::CREATE_IF_NONEXISTING);
+ }
+}
+
+bool
+BucketDBUpdater::isPendingClusterStateCompleted() const
+{
+ return _pendingClusterState.get() && _pendingClusterState->done();
+}
+
+void
+BucketDBUpdater::processCompletedPendingClusterState()
+{
+ _pendingClusterState->mergeInto(_distributorComponent.getBucketDatabase());
+
+ if (_pendingClusterState->getCommand().get()) {
+ enableCurrentClusterStateInDistributor();
+ _distributorComponent.getDistributor().getMessageSender().sendDown(
+ _pendingClusterState->getCommand());
+ addCurrentStateToClusterStateHistory();
+ } else {
+ _distributorComponent.getDistributor().notifyDistributionChangeEnabled();
+ }
+
+ _pendingClusterState.reset();
+ _outdatedNodes.clear();
+ sendAllQueuedBucketRechecks();
+}
+
+void
+BucketDBUpdater::enableCurrentClusterStateInDistributor()
+{
+ const lib::ClusterState& state(
+ _pendingClusterState->getCommand()->getSystemState());
+
+ LOG(debug,
+ "BucketDBUpdater finished processing state %s",
+ state.toString().c_str());
+
+ _distributorComponent.getDistributor().enableClusterState(state);
+}
+
+void
+BucketDBUpdater::addCurrentStateToClusterStateHistory()
+{
+ _history.push_back(_pendingClusterState->getSummary());
+
+ if (_history.size() > 50) {
+ _history.pop_front();
+ }
+}
+
+vespalib::string
+BucketDBUpdater::getReportContentType(const framework::HttpUrlPath&) const
+{
+ return "text/xml";
+}
+
+bool
+BucketDBUpdater::reportStatus(std::ostream& out,
+ const framework::HttpUrlPath& path) const
+{
+ using namespace vespalib::xml;
+ XmlOutputStream xos(out);
+ // FIXME(vekterli): have to do this manually since we cannot inherit
+ // directly from XmlStatusReporter due to data races when BucketDBUpdater
+ // gets status requests directly.
+ xos << XmlTag("status")
+ << XmlAttribute("id", "bucketdb")
+ << XmlAttribute("name", "Bucket Database Updater");
+ reportXmlStatus(xos, path);
+ xos << XmlEndTag();
+ return true;
+}
+
+vespalib::string
+BucketDBUpdater::reportXmlStatus(vespalib::xml::XmlOutputStream& xos,
+ const framework::HttpUrlPath&) const
+{
+ using namespace vespalib::xml;
+ xos << XmlTag("bucketdb")
+ << XmlTag("systemstate_active")
+ << XmlContent(_distributorComponent.getClusterState().toString())
+ << XmlEndTag();
+ if (_pendingClusterState.get() != 0) {
+ xos << *_pendingClusterState;
+ }
+ xos << XmlTag("systemstate_history");
+ typedef std::list<PendingClusterState::Summary>::const_reverse_iterator HistoryIter;
+ for (HistoryIter i(_history.rbegin()), e(_history.rend()); i != e; ++i) {
+ xos << XmlTag("change")
+ << XmlAttribute("from", i->_prevClusterState)
+ << XmlAttribute("to", i->_newClusterState)
+ << XmlAttribute("processingtime", i->_processingTime)
+ << XmlEndTag();
+ }
+ xos << XmlEndTag()
+ << XmlTag("single_bucket_requests");
+ for (std::map<uint64_t, BucketRequest>::const_iterator iter
+ = _sentMessages.begin(); iter != _sentMessages.end(); iter++)
+ {
+ xos << XmlTag("storagenode")
+ << XmlAttribute("index", iter->second.targetNode);
+ if (iter->second.bucket.getRawId() == 0) {
+ xos << XmlAttribute("bucket", "all");
+ } else {
+ xos << XmlAttribute("bucket", iter->second.bucket.getId(),
+ XmlAttribute::HEX);
+ }
+ xos << XmlAttribute("sendtimestamp", iter->second.timestamp)
+ << XmlEndTag();
+ }
+ xos << XmlEndTag() << XmlEndTag();
+ return "";
+}
+
+bool
+BucketDBUpdater::BucketListGenerator::process(BucketDatabase::Entry& e)
+{
+ document::BucketId bucketId(e.getBucketId());
+
+ const BucketCopy* copy(e->getNode(_node));
+ if (copy) {
+ _entries.push_back(
+ BucketListMerger::BucketEntry(
+ bucketId,
+ copy->getBucketInfo()));
+ }
+ return true;
+}
+
+void
+BucketDBUpdater::NodeRemover::logRemove(const document::BucketId& bucketId,
+ const char* msg) const
+{
+ LOG(spam, "Removing bucket %s: %s", bucketId.toString().c_str(), msg);
+ LOG_BUCKET_OPERATION_NO_LOCK(bucketId, msg);
+}
+
+bool
+BucketDBUpdater::NodeRemover::distributorOwnsBucket(
+ const document::BucketId& bucketId) const
+{
+ try {
+ uint16_t distributor(
+ _distribution.getIdealDistributorNode(_state, bucketId, "uim"));
+ if (distributor != _localIndex) {
+ logRemove(bucketId, "bucket now owned by another distributor");
+ return false;
+ }
+ return true;
+ } catch (lib::TooFewBucketBitsInUseException& exc) {
+ logRemove(bucketId, "using too few distribution bits now");
+ } catch (lib::NoDistributorsAvailableException& exc) {
+ logRemove(bucketId, "no distributors are available");
+ }
+ return false;
+}
+
+void
+BucketDBUpdater::NodeRemover::setCopiesInEntry(
+ BucketDatabase::Entry& e,
+ const std::vector<BucketCopy>& copies) const
+{
+ e->clear();
+
+ std::vector<uint16_t> order =
+ _distribution.getIdealStorageNodes(_state, e.getBucketId(), _upStates);
+
+ e->addNodes(copies, order);
+
+ LOG(debug, "Changed %s", e->toString().c_str());
+ LOG_BUCKET_OPERATION_NO_LOCK(
+ e.getBucketId(),
+ vespalib::make_vespa_string("updated bucketdb entry to %s",
+ e->toString().c_str()));
+}
+
+void
+BucketDBUpdater::NodeRemover::removeEmptyBucket(const document::BucketId& bucketId)
+{
+ _removedBuckets.push_back(bucketId);
+
+ LOG(debug,
+ "After system state change %s, bucket %s now has no copies.",
+ _oldState.getTextualDifference(_state).c_str(),
+ bucketId.toString().c_str());
+ LOG_BUCKET_OPERATION_NO_LOCK(bucketId, "bucket now has no copies");
+}
+
+bool
+BucketDBUpdater::NodeRemover::process(BucketDatabase::Entry& e)
+{
+ const document::BucketId& bucketId(e.getBucketId());
+
+ LOG(spam, "Check for remove: bucket %s", e.toString().c_str());
+ if (e->getNodeCount() == 0) {
+ removeEmptyBucket(e.getBucketId());
+ return true;
+ }
+ if (!distributorOwnsBucket(bucketId)) {
+ _removedBuckets.push_back(bucketId);
+ return true;
+ }
+
+ std::vector<BucketCopy> remainingCopies;
+ for (uint16_t i = 0; i < e->getNodeCount(); i++) {
+ Node n(NodeType::STORAGE, e->getNodeRef(i).getNode());
+
+ if (_state.getNodeState(n).getState().oneOf(_upStates)) {
+ remainingCopies.push_back(e->getNodeRef(i));
+ }
+ }
+
+ if (remainingCopies.size() == e->getNodeCount()) {
+ return true;
+ }
+
+ if (remainingCopies.empty()) {
+ removeEmptyBucket(bucketId);
+ } else {
+ setCopiesInEntry(e, remainingCopies);
+ }
+
+ return true;
+}
+
+BucketDBUpdater::NodeRemover::~NodeRemover()
+{
+ if (_removedBuckets.size() > 0) {
+ std::ostringstream ost;
+ ost << "After system state change "
+ << _oldState.getTextualDifference(_state) << ", we removed "
+ << "buckets. Data is unavailable until node comes back up. "
+ << _removedBuckets.size() << " buckets removed:";
+ for (uint32_t i=0; i < 10 && i < _removedBuckets.size(); ++i) {
+ ost << " " << _removedBuckets[i];
+ }
+ if (_removedBuckets.size() >= 10) {
+ ost << " ...";
+ }
+ LOGBM(info, ost.str().c_str());
+ }
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/bucketdbupdater.h b/storage/src/vespa/storage/distributor/bucketdbupdater.h
new file mode 100644
index 00000000000..01fae03f44d
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketdbupdater.h
@@ -0,0 +1,267 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/fastos/fastos.h>
+#include <set>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/storageapi/messageapi/returncode.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storage/distributor/bucketlistmerger.h>
+#include <vespa/storage/distributor/messageguard.h>
+#include <vespa/storage/distributor/distributorcomponent.h>
+#include <vespa/storage/distributor/distributormessagesender.h>
+#include <vespa/storage/distributor/pendingclusterstate.h>
+#include <vespa/storageframework/generic/memory/memorymanagerinterface.h>
+#include <vespa/storageapi/messageapi/messagehandler.h>
+
+namespace storage
+{
+
+namespace distributor
+{
+
+class Distributor;
+
+class BucketDBUpdater : public framework::StatusReporter,
+ public api::MessageHandler
+{
+public:
+ BucketDBUpdater(Distributor& owner,
+ DistributorMessageSender& sender,
+ DistributorComponentRegister& compReg);
+ ~BucketDBUpdater();
+
+ void flush();
+
+ BucketOwnership checkOwnershipInPendingState(const document::BucketId&) const;
+
+ void recheckBucketInfo(uint32_t nodeIdx, const document::BucketId& bid);
+
+ bool onSetSystemState(const std::shared_ptr<api::SetSystemStateCommand>& cmd);
+
+ bool onRequestBucketInfoReply(
+ const std::shared_ptr<api::RequestBucketInfoReply> & repl);
+
+ bool onMergeBucketReply(const std::shared_ptr<api::MergeBucketReply>& reply);
+
+ bool onNotifyBucketChange(const std::shared_ptr<api::NotifyBucketChangeCommand>&);
+
+ void resendDelayedMessages();
+
+ void storageDistributionChanged(const lib::Distribution&);
+
+ vespalib::string reportXmlStatus(vespalib::xml::XmlOutputStream&,
+ const framework::HttpUrlPath&) const;
+
+ vespalib::string getReportContentType(
+ const framework::HttpUrlPath&) const;
+ bool reportStatus(std::ostream&, const framework::HttpUrlPath&) const;
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ DistributorComponent& getDistributorComponent() { return _distributorComponent; }
+
+private:
+ DistributorComponent _distributorComponent;
+ class MergeReplyGuard {
+ public:
+ MergeReplyGuard(BucketDBUpdater& updater,
+ const std::shared_ptr<api::MergeBucketReply>& reply)
+ : _updater(updater), _reply(reply) {}
+
+ ~MergeReplyGuard();
+
+ // Used when we're flushing and simply want to drop the reply rather
+ // than send it down
+ void resetReply() { _reply.reset(); }
+ private:
+ BucketDBUpdater& _updater;
+ std::shared_ptr<api::MergeBucketReply> _reply;
+ };
+
+ struct BucketRequest {
+ BucketRequest()
+ : targetNode(0), bucket(0), timestamp(0) {};
+
+ BucketRequest(uint16_t t,
+ uint64_t currentTime,
+ const document::BucketId& b,
+ const std::shared_ptr<MergeReplyGuard>& guard)
+ : targetNode(t),
+ bucket(b),
+ timestamp(currentTime),
+ _mergeReplyGuard(guard) {};
+
+ uint16_t targetNode;
+ document::BucketId bucket;
+ uint64_t timestamp;
+
+ std::shared_ptr<MergeReplyGuard> _mergeReplyGuard;
+ };
+
+ struct EnqueuedBucketRecheck {
+ uint16_t node;
+ document::BucketId bucket;
+
+ EnqueuedBucketRecheck() : node(0), bucket() {}
+
+ EnqueuedBucketRecheck(uint16_t _node,
+ const document::BucketId& _bucket)
+ : node(_node),
+ bucket(_bucket)
+ {}
+
+ bool operator<(const EnqueuedBucketRecheck& o) const {
+ if (node != o.node) {
+ return node < o.node;
+ }
+ return bucket < o.bucket;
+ }
+ bool operator==(const EnqueuedBucketRecheck& o) const {
+ return node == o.node && bucket == o.bucket;
+ }
+ };
+
+ bool hasPendingClusterState() const;
+
+ void clearPending(uint16_t node);
+
+ bool pendingClusterStateAccepted(
+ const std::shared_ptr<api::RequestBucketInfoReply>& repl);
+ bool bucketOwnedAccordingToPendingState(
+ const document::BucketId& bucketId) const;
+ bool processSingleBucketInfoReply(
+ const std::shared_ptr<api::RequestBucketInfoReply>& repl);
+ void handleSingleBucketInfoFailure(
+ const std::shared_ptr<api::RequestBucketInfoReply>& repl,
+ const BucketRequest& req);
+ bool isPendingClusterStateCompleted() const;
+ void processCompletedPendingClusterState();
+ void mergeBucketInfoWithDatabase(
+ const std::shared_ptr<api::RequestBucketInfoReply>& repl,
+ const BucketRequest& req);
+ void convertBucketInfoToBucketList(
+ const std::shared_ptr<api::RequestBucketInfoReply>& repl,
+ uint16_t targetNode,
+ BucketListMerger::BucketList& newList);
+ void sendRequestBucketInfo(
+ uint16_t node,
+ const document::BucketId& bucket,
+ const std::shared_ptr<MergeReplyGuard>& mergeReply);
+ void addBucketInfoForNode(
+ const BucketDatabase::Entry& e,
+ uint16_t node,
+ BucketListMerger::BucketList& existing) const;
+ /**
+ * Adds all buckets contained in the bucket database
+ * that are either contained
+ * in bucketId, or that bucketId is contained in, that have copies
+ * on the given node.
+ */
+ void findRelatedBucketsInDatabase(
+ uint16_t node,
+ const document::BucketId& bucketId,
+ BucketListMerger::BucketList& existing);
+
+ /**
+ Updates the bucket database from the information generated by the given
+ bucket list merger.
+ */
+ void updateDatabase(uint16_t node, BucketListMerger& merger);
+
+ void updateState(const lib::ClusterState& oldState,
+ const lib::ClusterState& newState);
+
+ void removeSuperfluousBuckets(const lib::Distribution& newDistribution,
+ const lib::ClusterState& newState);
+
+ void replyToPreviousPendingClusterStateIfAny();
+
+ void enableCurrentClusterStateInDistributor();
+ void addCurrentStateToClusterStateHistory();
+ void enqueueRecheckUntilPendingStateEnabled(uint16_t node,
+ const document::BucketId&);
+ void sendAllQueuedBucketRechecks();
+
+ friend class BucketDBUpdater_Test;
+ friend class MergeOperation_Test;
+
+ class BucketListGenerator
+ {
+ public:
+ BucketListGenerator(uint16_t node,
+ BucketListMerger::BucketList& entries)
+ : _node(node), _entries(entries) {};
+
+ bool process(BucketDatabase::Entry&);
+
+ private:
+ uint16_t _node;
+ BucketListMerger::BucketList& _entries;
+ };
+
+ /**
+ Removes all copies of buckets that are on nodes that are down.
+ */
+ class NodeRemover : public BucketDatabase::MutableEntryProcessor
+ {
+ public:
+ NodeRemover(const lib::ClusterState& oldState,
+ const lib::ClusterState& s,
+ const document::BucketIdFactory& factory,
+ uint16_t localIndex,
+ const lib::Distribution& distribution,
+ const char* upStates)
+ : _oldState(oldState),
+ _state(s),
+ _factory(factory),
+ _localIndex(localIndex),
+ _distribution(distribution),
+ _upStates(upStates) {}
+
+ ~NodeRemover();
+
+ virtual bool process(BucketDatabase::Entry& e);
+
+ void logRemove(const document::BucketId& bucketId,
+ const char* msg) const;
+
+ bool distributorOwnsBucket(const document::BucketId&) const;
+
+ const std::vector<document::BucketId>& getBucketsToRemove() const {
+ return _removedBuckets;
+ }
+ private:
+ void setCopiesInEntry(BucketDatabase::Entry& e,
+ const std::vector<BucketCopy>& copies) const;
+ void removeEmptyBucket(const document::BucketId& bucketId);
+
+ const lib::ClusterState _oldState;
+ const lib::ClusterState _state;
+ std::vector<document::BucketId> _removedBuckets;
+
+ const document::BucketIdFactory& _factory;
+ uint16_t _localIndex;
+ const lib::Distribution& _distribution;
+ const char* _upStates;
+ };
+
+ std::deque<std::pair<framework::MilliSecTime,
+ BucketRequest> > _delayedRequests;
+ std::map<uint64_t, BucketRequest> _sentMessages;
+ std::unique_ptr<PendingClusterState> _pendingClusterState;
+ std::list<PendingClusterState::Summary> _history;
+ DistributorMessageSender& _sender;
+ std::set<EnqueuedBucketRecheck> _enqueuedRechecks;
+ std::unordered_set<uint16_t> _outdatedNodes;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/bucketgctimecalculator.cpp b/storage/src/vespa/storage/distributor/bucketgctimecalculator.cpp
new file mode 100644
index 00000000000..85627ddf38f
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketgctimecalculator.cpp
@@ -0,0 +1,33 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/bucketgctimecalculator.h>
+
+namespace storage {
+namespace distributor {
+
+bool
+BucketGcTimeCalculator::shouldGc(const document::BucketId& b,
+ std::chrono::seconds currentTime,
+ std::chrono::seconds lastRunAt) const
+{
+ if (_checkInterval.count() == 0) {
+ return false;
+ }
+ std::chrono::seconds gcPoint(_hasher.hash(b) % _checkInterval.count());
+ std::chrono::seconds currentPeriodStart(currentTime
+ - (currentTime % _checkInterval));
+ std::chrono::seconds newestValid(currentPeriodStart + gcPoint);
+
+ // Should GC have been started in current period?
+ if (currentTime >= newestValid && lastRunAt < newestValid) {
+ return true;
+ }
+ // Not in current; did it miss the previous period?
+ return lastRunAt < (newestValid - _checkInterval);
+}
+
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/bucketgctimecalculator.h b/storage/src/vespa/storage/distributor/bucketgctimecalculator.h
new file mode 100644
index 00000000000..d01c0c9083f
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketgctimecalculator.h
@@ -0,0 +1,56 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <chrono>
+#include <vespa/document/bucket/bucketid.h>
+
+namespace storage {
+namespace distributor {
+
+/**
+ * Semantics are basically as follows:
+ * We divide the timeline into periods based on the configured check
+ * interval, with each bucket having a start point in this period
+ * based on its hash. If the current time is at least that of the start
+ * point and the bucket has not been checked after this point, it is
+ * scheduled for GC. Otherwise, the bucket is checked iff there
+ * has been at least one missed start point in a previous period.
+ *
+ * If the check period is zero, this is considered to mean GC is disabled.
+ */
+
+class BucketGcTimeCalculator
+{
+public:
+ class BucketIdHasher {
+ virtual size_t doHash(const document::BucketId&) const = 0;
+ public:
+ virtual ~BucketIdHasher() {}
+ size_t hash(const document::BucketId& b) const { return doHash(b); }
+ };
+
+ class BucketIdIdentityHasher : public BucketIdHasher {
+ size_t doHash(const document::BucketId& b) const override {
+ return b.getId();
+ }
+ };
+
+ BucketGcTimeCalculator(const BucketIdHasher& hasher,
+ std::chrono::seconds checkInterval)
+ : _hasher(hasher),
+ _checkInterval(checkInterval)
+ {
+ }
+
+ bool shouldGc(const document::BucketId&,
+ std::chrono::seconds currentTime,
+ std::chrono::seconds lastRunAt) const;
+
+private:
+ const BucketIdHasher& _hasher;
+ std::chrono::seconds _checkInterval;
+};
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/bucketlistmerger.cpp b/storage/src/vespa/storage/distributor/bucketlistmerger.cpp
new file mode 100644
index 00000000000..894a8717c8a
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketlistmerger.cpp
@@ -0,0 +1,35 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/bucketlistmerger.h>
+
+using namespace storage::distributor;
+
+BucketListMerger::BucketListMerger(const BucketList& newList,
+ const BucketList& oldList,
+ uint64_t timestamp)
+ : _timestamp(timestamp)
+{
+ uint32_t i = 0;
+ uint32_t j = 0;
+
+ while (i < newList.size() || j < oldList.size()) {
+ if (i >= newList.size()) {
+ _removedEntries.push_back(oldList[j].first);
+ j++;
+ } else if (j >= oldList.size()) {
+ _addedEntries.push_back(newList[i]);
+ i++;
+ } else if (newList[i].first.getId() > oldList[j].first.getId()) {
+ _removedEntries.push_back(oldList[j].first);
+ j++;
+ } else if (newList[i].first.getId() < oldList[j].first.getId()) {
+ _addedEntries.push_back(newList[i]);
+ i++;
+ } else {
+ if (!(newList[i].second == oldList[j].second)) {
+ _addedEntries.push_back(newList[i]);
+ }
+ i++; j++;
+ }
+ }
+}
diff --git a/storage/src/vespa/storage/distributor/bucketlistmerger.h b/storage/src/vespa/storage/distributor/bucketlistmerger.h
new file mode 100644
index 00000000000..f7737ad0000
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketlistmerger.h
@@ -0,0 +1,44 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/storageapi/buckets/bucketinfo.h>
+
+namespace storage {
+
+namespace distributor {
+
+/**
+ Merges two sorted lists of buckets.
+
+ Creates two lists:
+ - One list containing buckets missing from the old list, or that are in both and have different checksums (to get updated bucket information)
+ - One list containing buckets missing from the new list (to be deleted).
+*/
+class BucketListMerger
+{
+public:
+ typedef std::pair<document::BucketId, api::BucketInfo> BucketEntry;
+ typedef std::vector<BucketEntry> BucketList;
+
+ BucketListMerger(const BucketList& newList, const BucketList& oldList,
+ uint64_t timestamp);
+
+ const std::vector<BucketEntry>& getAddedEntries()
+ { return _addedEntries; }
+
+ const std::vector<document::BucketId>& getRemovedEntries()
+ { return _removedEntries; }
+
+ uint64_t getTimestamp() const { return _timestamp; }
+
+private:
+ std::vector<BucketEntry> _addedEntries;
+ std::vector<document::BucketId> _removedEntries;
+ uint64_t _timestamp;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/bucketownership.h b/storage/src/vespa/storage/distributor/bucketownership.h
new file mode 100644
index 00000000000..c3125c262ed
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/bucketownership.h
@@ -0,0 +1,49 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/vdslib/state/clusterstate.h>
+
+namespace storage {
+namespace distributor {
+
+class BucketOwnership
+{
+ const lib::ClusterState* _checkedState;
+ bool _owned;
+
+ BucketOwnership(const lib::ClusterState& checkedState)
+ : _checkedState(&checkedState),
+ _owned(false)
+ {
+ }
+
+ BucketOwnership() : _checkedState(nullptr), _owned(true) {}
+
+public:
+ bool isOwned() const { return _owned; }
+ /**
+ * Cluster state in which the ownership check failed. Lifetime of returned
+ * reference depends on when the active or pending cluster state of the
+ * distributor may be altered, so it should be used immediately and not
+ * stored away. Since the distributor is single threaded, immediate use
+ * should be safe.
+ *
+ * Precondition: isOwned() == false
+ */
+ const lib::ClusterState& getNonOwnedState() {
+ assert(!isOwned());
+ return *_checkedState;
+ }
+
+ static BucketOwnership createOwned() {
+ return BucketOwnership();
+ }
+
+ static BucketOwnership createNotOwnedInState(const lib::ClusterState& s) {
+ return BucketOwnership(s);
+ }
+};
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/clusterinformation.cpp b/storage/src/vespa/storage/distributor/clusterinformation.cpp
new file mode 100644
index 00000000000..b6d0b19cb2b
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/clusterinformation.cpp
@@ -0,0 +1,56 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/clusterinformation.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/vdslib/state/clusterstate.h>
+
+namespace storage {
+namespace distributor {
+
+bool
+ClusterInformation::ownsBucket(const document::BucketId& bucketId) const
+{
+ try {
+ uint16_t distributor(getDistribution().getIdealDistributorNode(
+ getClusterState(), bucketId));
+
+ return (getDistributorIndex() == distributor);
+ } catch (lib::TooFewBucketBitsInUseException& e) {
+ return false;
+ } catch (lib::NoDistributorsAvailableException& e) {
+ return false;
+ }
+}
+
+bool
+ClusterInformation::nodeInSameGroupAsSelf(uint16_t otherNode) const
+{
+ return (getDistribution().getNodeGraph().getGroupForNode(otherNode)
+ == getDistribution().getNodeGraph().getGroupForNode(getDistributorIndex()));
+}
+
+vespalib::string
+ClusterInformation::getDistributionHash() const
+{
+ return getDistribution().getNodeGraph().getDistributionConfigHash();
+}
+
+std::vector<uint16_t>
+ClusterInformation::getIdealStorageNodesForState(
+ const lib::ClusterState& clusterState,
+ const document::BucketId& bucketId) const
+{
+ return getDistribution().getIdealStorageNodes(
+ clusterState,
+ bucketId,
+ getStorageUpStates());
+}
+
+uint16_t
+ClusterInformation::getStorageNodeCount() const
+{
+ return getClusterState().getNodeCount(lib::NodeType::STORAGE);
+}
+
+}
+}
diff --git a/storage/src/vespa/storage/distributor/clusterinformation.h b/storage/src/vespa/storage/distributor/clusterinformation.h
new file mode 100644
index 00000000000..4f88f98df54
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/clusterinformation.h
@@ -0,0 +1,50 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <stdint.h>
+#include <vector>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/vespalib/stllike/string.h>
+
+namespace storage {
+
+namespace lib {
+
+class Distribution;
+class ClusterState;
+
+}
+
+namespace distributor {
+
+class ClusterInformation
+{
+public:
+ typedef std::shared_ptr<const ClusterInformation> CSP;
+
+ virtual ~ClusterInformation() {}
+
+ virtual uint16_t getDistributorIndex() const = 0;
+
+ virtual const lib::Distribution& getDistribution() const = 0;
+
+ virtual const lib::ClusterState& getClusterState() const = 0;
+
+ virtual const char* getStorageUpStates() const = 0;
+
+ bool ownsBucket(const document::BucketId& bucketId) const;
+
+ bool nodeInSameGroupAsSelf(uint16_t otherNode) const;
+
+ vespalib::string getDistributionHash() const;
+
+ std::vector<uint16_t> getIdealStorageNodesForState(
+ const lib::ClusterState& clusterState,
+ const document::BucketId& bucketId) const;
+
+ uint16_t getStorageNodeCount() const;
+};
+
+}
+}
+
diff --git a/storage/src/vespa/storage/distributor/common/.gitignore b/storage/src/vespa/storage/distributor/common/.gitignore
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/common/.gitignore
diff --git a/storage/src/vespa/storage/distributor/delegatedstatusrequest.h b/storage/src/vespa/storage/distributor/delegatedstatusrequest.h
new file mode 100644
index 00000000000..ca0fb239f76
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/delegatedstatusrequest.h
@@ -0,0 +1,29 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storageframework/storageframework.h>
+
+namespace storage {
+namespace distributor {
+
+struct DelegatedStatusRequest
+{
+ const framework::StatusReporter& reporter;
+ const framework::HttpUrlPath& path;
+ std::ostream& outputStream;
+
+ DelegatedStatusRequest(const framework::StatusReporter& _reporter,
+ const framework::HttpUrlPath& _path,
+ std::ostream& _outputStream)
+ : reporter(_reporter),
+ path(_path),
+ outputStream(_outputStream)
+ {}
+
+private:
+ DelegatedStatusRequest(const DelegatedStatusRequest&);
+ DelegatedStatusRequest& operator=(const DelegatedStatusRequest&);
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/distributor.cpp b/storage/src/vespa/storage/distributor/distributor.cpp
new file mode 100644
index 00000000000..2c806901de5
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/distributor.cpp
@@ -0,0 +1,759 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/distributor/distributor.h>
+#include <vespa/storage/distributor/bucketdb/mapbucketdatabase.h>
+#include <vespa/storage/distributor/maintenance/simplemaintenancescanner.h>
+#include <vespa/storage/distributor/maintenance/simplebucketprioritydatabase.h>
+#include <vespa/storage/distributor/blockingoperationstarter.h>
+#include <vespa/storage/distributor/throttlingoperationstarter.h>
+#include <vespa/storage/distributor/idealstatemetricsset.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/common/hostreporter/hostinfo.h>
+
+LOG_SETUP(".distributor-main");
+
+namespace storage {
+namespace distributor {
+
+class Distributor::Status {
+ const DelegatedStatusRequest& _request;
+ vespalib::Monitor _monitor;
+ bool _done;
+
+public:
+ Status(const DelegatedStatusRequest& request)
+ : _request(request),
+ _monitor(),
+ _done(false)
+ {}
+
+ std::ostream& getStream() {
+ return _request.outputStream;
+ }
+ const framework::HttpUrlPath& getPath() const {
+ return _request.path;
+ }
+ const framework::StatusReporter& getReporter() const {
+ return _request.reporter;
+ }
+
+ void notifyCompleted() {
+ vespalib::MonitorGuard guard(_monitor);
+ _done = true;
+ guard.broadcast();
+ }
+ void waitForCompletion() {
+ vespalib::MonitorGuard guard(_monitor);
+ while (!_done) {
+ guard.wait();
+ }
+ }
+};
+
+Distributor::Distributor(DistributorComponentRegister& compReg,
+ framework::TickingThreadPool& threadPool,
+ DoneInitializeHandler& doneInitHandler,
+ bool manageActiveBucketCopies,
+ HostInfo& hostInfoReporterRegistrar,
+ ChainedMessageSender* messageSender)
+ : StorageLink("distributor"),
+ DistributorInterface(),
+ framework::StatusReporter("distributor", "Distributor"),
+ _compReg(compReg),
+ _component(compReg, "distributor"),
+ _metrics(new DistributorMetricSet(
+ _component.getLoadTypes()->getMetricLoadTypes())),
+ _operationOwner(*this, _component.getClock()),
+ _maintenanceOperationOwner(*this, _component.getClock()),
+ _pendingMessageTracker(compReg),
+ _bucketDBUpdater(*this, *this, compReg),
+ _distributorStatusDelegate(compReg, *this, *this),
+ _bucketDBStatusDelegate(compReg, *this, _bucketDBUpdater),
+ _idealStateManager(*this, compReg,
+ manageActiveBucketCopies),
+ _externalOperationHandler(*this, _idealStateManager, compReg),
+ _threadPool(threadPool),
+ _initializingIsUp(true),
+ _doneInitializeHandler(doneInitHandler),
+ _doneInitializing(false),
+ _messageSender(messageSender),
+ _bucketPriorityDb(new SimpleBucketPriorityDatabase()),
+ _scanner(new SimpleMaintenanceScanner(*_bucketPriorityDb,
+ _idealStateManager,
+ getBucketDatabase())),
+ _throttlingStarter(new ThrottlingOperationStarter(
+ _maintenanceOperationOwner)),
+ _blockingStarter(new BlockingOperationStarter(_pendingMessageTracker,
+ *_throttlingStarter)),
+ _scheduler(new MaintenanceScheduler(_idealStateManager,
+ *_bucketPriorityDb,
+ *_blockingStarter)),
+ _schedulingMode(MaintenanceScheduler::NORMAL_SCHEDULING_MODE),
+ _recoveryTimeStarted(_component.getClock()),
+ _tickResult(framework::ThreadWaitInfo::NO_MORE_CRITICAL_WORK_KNOWN),
+ _clusterName(_component.getClusterName()),
+ _bucketIdHasher(new BucketGcTimeCalculator::BucketIdIdentityHasher()),
+ _metricUpdateHook(*this),
+ _metricLock(),
+ _maintenanceStats(),
+ _bucketDbStats(),
+ _hostInfoReporter(_pendingMessageTracker.getLatencyStatisticsProvider(),
+ *this)
+{
+ _component.registerMetric(*_metrics);
+ _component.registerMetricUpdateHook(_metricUpdateHook,
+ framework::SecondTime(0));
+ _distributorStatusDelegate.registerStatusPage();
+ _bucketDBStatusDelegate.registerStatusPage();
+ hostInfoReporterRegistrar.registerReporter(&_hostInfoReporter);
+};
+
+Distributor::~Distributor()
+{
+ // XXX: why is there no _component.unregisterMetricUpdateHook()?
+ closeNextLink();
+}
+
+int
+Distributor::getDistributorIndex() const
+{
+ return _component.getIndex();
+}
+
+const std::string&
+Distributor::getClusterName() const
+{
+ return _clusterName;
+}
+
+const PendingMessageTracker&
+Distributor::getPendingMessageTracker() const
+{
+ return _pendingMessageTracker;
+}
+
+BucketOwnership
+Distributor::checkOwnershipInPendingState(const document::BucketId& b) const
+{
+ return _bucketDBUpdater.checkOwnershipInPendingState(b);
+}
+
+void
+Distributor::sendCommand(const std::shared_ptr<api::StorageCommand>& cmd)
+{
+ if (cmd->getType() == api::MessageType::MERGEBUCKET) {
+ api::MergeBucketCommand& merge(
+ static_cast<api::MergeBucketCommand&>(*cmd));
+ _idealStateManager.getMetrics().nodesPerMerge.addValue(
+ merge.getNodes().size());
+ }
+ sendUp(cmd);
+}
+
+void
+Distributor::sendReply(const std::shared_ptr<api::StorageReply>& reply)
+{
+ sendUp(reply);
+}
+
+void
+Distributor::setNodeStateUp()
+{
+ NodeStateUpdater::Lock::SP lock(
+ _component.getStateUpdater().grabStateChangeLock());
+ lib::NodeState ns(
+ *_component.getStateUpdater().getReportedNodeState());
+ ns.setState(lib::State::UP);
+ _component.getStateUpdater().setReportedNodeState(ns);
+}
+
+void
+Distributor::onOpen()
+{
+ LOG(debug, "Distributor::onOpen invoked");
+ setNodeStateUp();
+ framework::MilliSecTime maxProcessingTime(60 * 1000);
+ framework::MilliSecTime waitTime(1000);
+ if (_component.getDistributorConfig().startDistributorThread) {
+ _threadPool.addThread(*this);
+ _threadPool.start(_component.getThreadPool());
+ } else {
+ LOG(warning, "Not starting distributor thread as it's configured to "
+ "run. Unless you are just running a test tool, this is a "
+ "fatal error.");
+ }
+}
+
+void
+Distributor::onClose()
+{
+ for (uint32_t i=0; i<_messageQueue.size(); ++i) {
+ std::shared_ptr<api::StorageMessage> msg = _messageQueue[i];
+ if (!msg->getType().isReply()) {
+ api::StorageReply::UP reply(
+ std::dynamic_pointer_cast<api::StorageCommand>(msg)
+ ->makeReply());
+ reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED,
+ "Distributor is shutting down"));
+ sendUp(std::shared_ptr<api::StorageMessage>(reply.release()));
+ }
+ }
+ _messageQueue.clear();
+
+ LOG(debug, "Distributor::onFlush invoked");
+ _bucketDBUpdater.flush();
+ _operationOwner.onClose();
+ _maintenanceOperationOwner.onClose();
+}
+
+void
+Distributor::sendUp(const std::shared_ptr<api::StorageMessage>& msg)
+{
+ _pendingMessageTracker.insert(msg);
+ if (_messageSender != 0) {
+ _messageSender->sendUp(msg);
+ } else {
+ StorageLink::sendUp(msg);
+ }
+}
+
+void
+Distributor::sendDown(const std::shared_ptr<api::StorageMessage>& msg)
+{
+ if (_messageSender != 0) {
+ _messageSender->sendDown(msg);
+ } else {
+ StorageLink::sendDown(msg);
+ }
+}
+
+bool
+Distributor::onDown(const std::shared_ptr<api::StorageMessage>& msg)
+{
+ framework::TickingLockGuard guard(_threadPool.freezeCriticalTicks());
+ MBUS_TRACE(msg->getTrace(), 9,
+ "Distributor: Added to message queue. Thread state: "
+ + _threadPool.getStatus());
+ _messageQueue.push_back(msg);
+ guard.broadcast();
+ return true;
+}
+
+void
+Distributor::handleCompletedMerge(
+ const std::shared_ptr<api::MergeBucketReply>& reply)
+{
+ _maintenanceOperationOwner.handleReply(reply);
+}
+
+bool
+Distributor::isMaintenanceReply(const api::StorageReply& reply) const
+{
+ switch (reply.getType().getId()) {
+ case api::MessageType::CREATEBUCKET_REPLY_ID:
+ case api::MessageType::MERGEBUCKET_REPLY_ID:
+ case api::MessageType::DELETEBUCKET_REPLY_ID:
+ case api::MessageType::REQUESTBUCKETINFO_REPLY_ID:
+ case api::MessageType::SPLITBUCKET_REPLY_ID:
+ case api::MessageType::JOINBUCKETS_REPLY_ID:
+ case api::MessageType::SETBUCKETSTATE_REPLY_ID:
+ case api::MessageType::REMOVELOCATION_REPLY_ID:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool
+Distributor::handleReply(const std::shared_ptr<api::StorageReply>& reply)
+{
+ document::BucketId bid = _pendingMessageTracker.reply(*reply);
+
+ if (reply->getResult().getResult() == api::ReturnCode::BUCKET_NOT_FOUND &&
+ bid != document::BucketId(0) &&
+ reply->getAddress())
+ {
+ recheckBucketInfo(reply->getAddress()->getIndex(), bid);
+ }
+
+ if (reply->callHandler(_bucketDBUpdater, reply)) {
+ return true;
+ }
+
+ if (_operationOwner.handleReply(reply)) {
+ return true;
+ }
+
+ if (_maintenanceOperationOwner.handleReply(reply)) {
+ _scanner->prioritizeBucket(bid);
+ return true;
+ }
+
+ // If it's a maintenance operation reply, it's most likely a reply to an
+ // operation whose state was flushed from the distributor when its node
+ // went down in the cluster state. Just swallow the reply to avoid getting
+ // warnings about unhandled messages at the bottom of the link chain.
+ return isMaintenanceReply(*reply);
+}
+
+bool
+Distributor::generateOperation(
+ const std::shared_ptr<api::StorageMessage>& msg,
+ Operation::SP& operation)
+{
+ return _externalOperationHandler.handleMessage(msg, operation);
+}
+
+bool
+Distributor::handleMessage(const std::shared_ptr<api::StorageMessage>& msg)
+{
+ if (msg->getType().isReply()) {
+ std::shared_ptr<api::StorageReply> reply =
+ std::dynamic_pointer_cast<api::StorageReply>(msg);
+
+ if (handleReply(reply)) {
+ return true;
+ }
+ }
+
+ if (msg->callHandler(_bucketDBUpdater, msg)) {
+ return true;
+ }
+
+ Operation::SP operation;
+ if (generateOperation(msg, operation)) {
+ if (operation.get()) {
+ _operationOwner.start(operation, msg->getPriority());
+ }
+ return true;
+ }
+
+ return false;
+}
+
+void
+Distributor::enableClusterState(const lib::ClusterState& state)
+{
+ lib::ClusterState oldState = _clusterState;
+ _clusterState = state;
+
+ lib::Node myNode(lib::NodeType::DISTRIBUTOR, _component.getIndex());
+
+ if (!_doneInitializing &&
+ getClusterState().getNodeState(myNode).getState() == lib::State::UP)
+ {
+ scanAllBuckets();
+ _doneInitializing = true;
+ _doneInitializeHandler.notifyDoneInitializing();
+ } else {
+ enterRecoveryMode();
+ }
+
+ // Clear all active messages on nodes that are down.
+ for (uint16_t i = 0; i < state.getNodeCount(lib::NodeType::STORAGE); ++i) {
+ if (!state.getNodeState(lib::Node(lib::NodeType::STORAGE, i)).getState()
+ .oneOf(getStorageNodeUpStates()))
+ {
+ std::vector<uint64_t> msgIds(
+ _pendingMessageTracker.clearMessagesForNode(i));
+
+ LOG(debug,
+ "Node %d is down, clearing %d pending maintenance operations",
+ (int)i,
+ (int)msgIds.size());
+
+ for (uint32_t j = 0; j < msgIds.size(); ++j) {
+ _maintenanceOperationOwner.erase(msgIds[j]);
+ }
+ }
+ }
+}
+
+void
+Distributor::notifyDistributionChangeEnabled()
+{
+ LOG(debug, "Pending cluster state for distribution change has been enabled");
+ // Trigger a re-scan of bucket database, just like we do when a new cluster
+ // state has been enabled.
+ enterRecoveryMode();
+}
+
+void
+Distributor::enterRecoveryMode()
+{
+ LOG(debug, "Entering recovery mode");
+ _schedulingMode = MaintenanceScheduler::RECOVERY_SCHEDULING_MODE;
+ _scanner->reset();
+ _bucketDBMetricUpdater.reset();
+ _recoveryTimeStarted = framework::MilliSecTimer(_component.getClock());
+}
+
+void
+Distributor::leaveRecoveryMode()
+{
+ if (isInRecoveryMode()) {
+ LOG(debug, "Leaving recovery mode");
+ _metrics->recoveryModeTime.addValue(_recoveryTimeStarted);
+ }
+ _schedulingMode = MaintenanceScheduler::NORMAL_SCHEDULING_MODE;
+}
+
+void
+Distributor::storageDistributionChanged()
+{
+ if (!_distribution.get()
+ || *_component.getDistribution() != *_distribution)
+ {
+ LOG(debug,
+ "Distribution changed to %s, must refetch bucket information",
+ _component.getDistribution()->toString().c_str());
+
+ _nextDistribution = _component.getDistribution();
+ } else {
+ LOG(debug,
+ "Got distribution change, but the distribution %s was the same as "
+ "before: %s",
+ _component.getDistribution()->toString().c_str(),
+ _distribution->toString().c_str());
+ }
+}
+
+void
+Distributor::recheckBucketInfo(uint16_t nodeIdx, const document::BucketId& bid) {
+ _bucketDBUpdater.recheckBucketInfo(nodeIdx, bid);
+}
+
+namespace {
+
+class MaintenanceChecker : public PendingMessageTracker::Checker
+{
+public:
+ bool found;
+
+ MaintenanceChecker() : found(false) {};
+
+ bool check(uint32_t msgType, uint16_t node, uint8_t pri) {
+ (void) node;
+ (void) pri;
+ for (uint32_t i = 0;
+ IdealStateOperation::MAINTENANCE_MESSAGE_TYPES[i] != 0;
+ ++i)
+ {
+ if (msgType == IdealStateOperation::MAINTENANCE_MESSAGE_TYPES[i]) {
+ found = true;
+ return false;
+ }
+ }
+ return true;
+ }
+};
+
+class SplitChecker : public PendingMessageTracker::Checker
+{
+public:
+ bool found;
+ uint8_t maxPri;
+
+ SplitChecker(uint8_t maxP) : found(false), maxPri(maxP) {};
+
+ bool check(uint32_t msgType, uint16_t node, uint8_t pri) {
+ (void) node;
+ (void) pri;
+ if (msgType == api::MessageType::SPLITBUCKET_ID && pri <= maxPri) {
+ found = true;
+ return false;
+ }
+
+ return true;
+ }
+};
+
+}
+
+void
+Distributor::checkBucketForSplit(const BucketDatabase::Entry& e,
+ uint8_t priority)
+{
+ if (!getConfig().doInlineSplit()) {
+ return;
+ }
+
+ // Verify that there are no existing pending splits at the
+ // appropriate priority.
+ SplitChecker checker(priority);
+ for (uint32_t i = 0; i < e->getNodeCount(); ++i) {
+ _pendingMessageTracker.checkPendingMessages(e->getNodeRef(i).getNode(),
+ e.getBucketId(),
+ checker);
+ if (checker.found) {
+ return;
+ }
+ }
+
+ Operation::SP operation =
+ _idealStateManager.generateInterceptingSplit(e, priority);
+
+ if (operation.get()) {
+ _maintenanceOperationOwner.start(operation, priority);
+ }
+}
+
+const lib::Distribution&
+Distributor::getDistribution() const
+{
+ if (!_distribution.get()) {
+ _distribution = _component.getDistribution();
+ }
+
+ return *_distribution;
+}
+
+void
+Distributor::enableNextDistribution()
+{
+ if (_nextDistribution.get()) {
+ _distribution = _nextDistribution;
+ _nextDistribution = std::shared_ptr<lib::Distribution>();
+ _bucketDBUpdater.storageDistributionChanged(getDistribution());
+ }
+}
+
+void
+Distributor::signalWorkWasDone()
+{
+ _tickResult = framework::ThreadWaitInfo::MORE_WORK_ENQUEUED;
+}
+
+bool
+Distributor::workWasDone()
+{
+ return !_tickResult.waitWanted();
+}
+
+void
+Distributor::startExternalOperations()
+{
+ for (uint32_t i=0; i<_fetchedMessages.size(); ++i) {
+ MBUS_TRACE(_fetchedMessages[i]->getTrace(), 9,
+ "Distributor: Grabbed from queue to be processed.");
+ if (!handleMessage(_fetchedMessages[i])) {
+ MBUS_TRACE(_fetchedMessages[i]->getTrace(), 9,
+ "Distributor: Not handling it. Sending further down.");
+ sendDown(_fetchedMessages[i]);
+ }
+ }
+ if (!_fetchedMessages.empty()) {
+ signalWorkWasDone();
+ }
+ _fetchedMessages.clear();
+}
+
+std::unordered_map<uint16_t, uint32_t>
+Distributor::getMinReplica() const
+{
+ vespalib::LockGuard guard(_metricLock);
+ return _bucketDbStats._minBucketReplica;
+}
+
+void
+Distributor::propagateInternalScanMetricsToExternal()
+{
+ vespalib::LockGuard guard(_metricLock);
+
+ // All shared values are written when _metricLock is held, so no races.
+ if (_bucketDBMetricUpdater.hasCompletedRound()) {
+ _bucketDbStats.propagateMetrics(_idealStateManager.getMetrics(),
+ getMetrics());
+ _idealStateManager.getMetrics().setPendingOperations(
+ _maintenanceStats.global.pending);
+ }
+}
+
+void
+Distributor::updateInternalMetricsForCompletedScan()
+{
+ vespalib::LockGuard guard(_metricLock);
+
+ _bucketDBMetricUpdater.completeRound();
+ _bucketDbStats = _bucketDBMetricUpdater.getLastCompleteStats();
+ _maintenanceStats = _scanner->getPendingMaintenanceStats();
+
+}
+
+void
+Distributor::scanAllBuckets()
+{
+ enterRecoveryMode();
+ while (!scanNextBucket().isDone()) {}
+}
+
+MaintenanceScanner::ScanResult
+Distributor::scanNextBucket()
+{
+ MaintenanceScanner::ScanResult scanResult(_scanner->scanNext());
+ if (scanResult.isDone()) {
+ leaveRecoveryMode();
+ updateInternalMetricsForCompletedScan();
+ _scanner->reset();
+ } else {
+ _bucketDBMetricUpdater.visit(
+ scanResult.getEntry(),
+ _component.getDistribution()->getRedundancy());
+ }
+ return scanResult;
+}
+
+void
+Distributor::startNextMaintenanceOperation()
+{
+ _throttlingStarter->setMaxPendingRange(getConfig().getMinPendingMaintenanceOps(),
+ getConfig().getMaxPendingMaintenanceOps());
+ _scheduler->tick(_schedulingMode);
+}
+
+framework::ThreadWaitInfo
+Distributor::doCriticalTick(framework::ThreadIndex)
+{
+ _tickResult = framework::ThreadWaitInfo::NO_MORE_CRITICAL_WORK_KNOWN;
+ enableNextDistribution();
+ enableNextConfig();
+ fetchStatusRequests();
+ fetchExternalMessages();
+ return _tickResult;
+}
+
+framework::ThreadWaitInfo
+Distributor::doNonCriticalTick(framework::ThreadIndex)
+{
+ _tickResult = framework::ThreadWaitInfo::NO_MORE_CRITICAL_WORK_KNOWN;
+ handleStatusRequests();
+ startExternalOperations();
+ if (!initializing()) {
+ scanNextBucket();
+ startNextMaintenanceOperation();
+ if (isInRecoveryMode()) {
+ signalWorkWasDone();
+ }
+ }
+ _bucketDBUpdater.resendDelayedMessages();
+ return _tickResult;
+}
+
+void
+Distributor::enableNextConfig()
+{
+ _hostInfoReporter.enableReporting(
+ getConfig().getEnableHostInfoReporting());
+ _bucketDBMetricUpdater.setMinimumReplicaCountingMode(
+ getConfig().getMinimumReplicaCountingMode());
+}
+
+void
+Distributor::fetchStatusRequests()
+{
+ if (_fetchedStatusRequests.empty()) {
+ _fetchedStatusRequests.swap(_statusToDo);
+ }
+}
+
+void
+Distributor::fetchExternalMessages()
+{
+ assert(_fetchedMessages.empty());
+ _fetchedMessages.swap(_messageQueue);
+}
+
+void
+Distributor::handleStatusRequests()
+{
+ uint32_t sz = _fetchedStatusRequests.size();
+ for (uint32_t i = 0; i < sz; ++i) {
+ Status& s(*_fetchedStatusRequests[i]);
+ s.getReporter().reportStatus(s.getStream(), s.getPath());
+ s.notifyCompleted();
+ }
+ _fetchedStatusRequests.clear();
+ if (sz > 0) {
+ signalWorkWasDone();
+ }
+}
+
+vespalib::string
+Distributor::getReportContentType(const framework::HttpUrlPath& path) const
+{
+ if (path.hasAttribute("page")) {
+ if (path.getAttribute("page") == "buckets") {
+ return "text/html";
+ } else {
+ return "application/xml";
+ }
+ } else {
+ return "text/html";
+ }
+}
+
+std::string
+Distributor::getActiveIdealStateOperations() const
+{
+ return _maintenanceOperationOwner.toString();
+}
+
+std::string
+Distributor::getActiveOperations() const
+{
+ return _operationOwner.toString();
+}
+
+bool
+Distributor::reportStatus(std::ostream& out,
+ const framework::HttpUrlPath& path) const
+{
+ if (!path.hasAttribute("page") || path.getAttribute("page") == "buckets") {
+ framework::PartlyHtmlStatusReporter htmlReporter(*this);
+ htmlReporter.reportHtmlHeader(out, path);
+ if (!path.hasAttribute("page")) {
+ out << "<a href=\"?page=pending\">Count of pending messages to "
+ << "storage nodes</a><br><a href=\"?page=maintenance&show=50\">"
+ << "List maintenance queue (adjust show parameter to see more "
+ << "operations, -1 for all)</a><br>\n<a href=\"?page=buckets\">"
+ << "List all buckets, highlight non-ideal state</a><br>\n";
+ } else {
+ const_cast<IdealStateManager&>(_idealStateManager)
+ .getBucketStatus(out);
+ }
+ htmlReporter.reportHtmlFooter(out, path);
+ } else {
+ framework::PartlyXmlStatusReporter xmlReporter(*this, out, path);
+ using namespace vespalib::xml;
+ std::string page(path.getAttribute("page"));
+
+ if (page == "pending") {
+ xmlReporter << XmlTag("pending")
+ << XmlAttribute("externalload", _operationOwner.size())
+ << XmlAttribute("maintenance",
+ _maintenanceOperationOwner.size())
+ << XmlEndTag();
+ } else if (page == "maintenance") {
+ // Need new page
+ }
+ }
+
+ return true;
+}
+
+bool
+Distributor::handleStatusRequest(const DelegatedStatusRequest& request) const
+{
+ auto wrappedRequest = std::make_shared<Status>(request);
+ {
+ framework::TickingLockGuard guard(_threadPool.freezeCriticalTicks());
+ _statusToDo.push_back(wrappedRequest);
+ guard.broadcast();
+ }
+ wrappedRequest->waitForCompletion();
+ return true;
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/distributor.h b/storage/src/vespa/storage/distributor/distributor.h
new file mode 100644
index 00000000000..06f9dc4995e
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/distributor.h
@@ -0,0 +1,304 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/config/config.h>
+#include <vespa/storage/common/distributorcomponent.h>
+#include <vespa/storage/common/doneinitializehandler.h>
+#include <vespa/storage/common/messagesender.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storage/distributor/bucketdbupdater.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+#include <vespa/storage/distributor/externaloperationhandler.h>
+#include <vespa/storage/distributor/bucketdb/bucketdbmetricupdater.h>
+#include <vespa/storage/distributor/bucketdbupdater.h>
+#include <vespa/storage/distributor/maintenancebucket.h>
+#include <vespa/storage/distributor/min_replica_provider.h>
+#include <vespa/storage/distributor/distributorinterface.h>
+#include <vespa/storage/distributor/maintenance/maintenancescheduler.h>
+#include <vespa/storage/distributor/statusreporterdelegate.h>
+#include <vespa/storage/distributor/distributor_host_info_reporter.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storageframework/generic/thread/tickingthread.h>
+#include <vespa/vespalib/util/sync.h>
+
+#include <unordered_map>
+
+namespace storage {
+
+class DoneInitializeHandler;
+class HostInfo;
+
+namespace distributor {
+
+class SimpleMaintenanceScanner;
+class BlockingOperationStarter;
+class ThrottlingOperationStarter;
+class BucketPriorityDatabase;
+
+class Distributor : public StorageLink,
+ public DistributorInterface,
+ public StatusDelegator,
+ public framework::StatusReporter,
+ public framework::TickingThread,
+ public MinReplicaProvider
+{
+public:
+ Distributor(DistributorComponentRegister&,
+ framework::TickingThreadPool&,
+ DoneInitializeHandler&,
+ bool manageActiveBucketCopies,
+ HostInfo& hostInfoReporterRegistrar,
+ ChainedMessageSender* = nullptr);
+
+ ~Distributor();
+
+ void onOpen();
+
+ void onClose();
+
+ bool onDown(const std::shared_ptr<api::StorageMessage>&);
+
+ void sendUp(const std::shared_ptr<api::StorageMessage>&);
+
+ void sendDown(const std::shared_ptr<api::StorageMessage>&);
+
+ virtual ChainedMessageSender& getMessageSender() {
+ return (_messageSender == 0 ? *this : *_messageSender);
+ }
+
+ DistributorMetricSet& getMetrics() { return *_metrics; }
+
+ PendingMessageTracker& getPendingMessageTracker() {
+ return _pendingMessageTracker;
+ }
+
+ BucketOwnership checkOwnershipInPendingState(const document::BucketId&) const override;
+
+ /**
+ * Enables a new cluster state. Called after the bucket db updater has
+ * retrieved all bucket info related to the change.
+ */
+ void enableClusterState(const lib::ClusterState& clusterState);
+
+ /**
+ * Invoked when a pending cluster state for a distribution (config)
+ * change has been enabled. An invocation of storageDistributionChanged
+ * will eventually cause this method to be called, assuming the pending
+ * cluster state completed successfully.
+ */
+ void notifyDistributionChangeEnabled();
+
+ void storageDistributionChanged();
+
+ void recheckBucketInfo(uint16_t nodeIdx, const document::BucketId& bid);
+
+ bool handleReply(const std::shared_ptr<api::StorageReply>& reply);
+
+ // StatusReporter implementation
+ vespalib::string getReportContentType(
+ const framework::HttpUrlPath&) const;
+ bool reportStatus(std::ostream&, const framework::HttpUrlPath&) const;
+
+ bool handleStatusRequest(const DelegatedStatusRequest& request) const;
+
+ uint32_t pendingMaintenanceCount() const;
+
+ std::string getActiveIdealStateOperations() const;
+ std::string getActiveOperations() const;
+
+ virtual framework::ThreadWaitInfo doCriticalTick(framework::ThreadIndex);
+ virtual framework::ThreadWaitInfo doNonCriticalTick(framework::ThreadIndex);
+
+ /**
+ * Checks whether a bucket needs to be split, and sends a split
+ * if so.
+ */
+ void checkBucketForSplit(const BucketDatabase::Entry& e,
+ uint8_t priority);
+
+ const lib::Distribution& getDistribution() const;
+
+ const lib::ClusterState& getClusterState() const {
+ return _clusterState;
+ }
+
+ /**
+ * @return Returns the states in which the distributors consider
+ * storage nodes to be up.
+ */
+ const char* getStorageNodeUpStates() const
+ { return _initializingIsUp ? "uri" : "ur"; }
+
+ /**
+ * Called by bucket db updater after a merge has finished, and all the
+ * request bucket info operations have been performed as well. Passes the
+ * merge back to the operation that created it.
+ */
+ void handleCompletedMerge(const std::shared_ptr<api::MergeBucketReply>& reply);
+
+
+ bool initializing() const {
+ return !_doneInitializing;
+ }
+
+ BucketDatabase& getBucketDatabase() {
+ return _component.getBucketDatabase();
+ }
+ const BucketDatabase& getBucketDatabase() const {
+ return const_cast<Distributor&>(*this).getBucketDatabase();
+ }
+
+ const DistributorConfiguration& getConfig() const {
+ return _component.getTotalDistributorConfig();
+ }
+
+ bool isInRecoveryMode() const {
+ return _schedulingMode == MaintenanceScheduler::RECOVERY_SCHEDULING_MODE;
+ }
+
+ int getDistributorIndex() const;
+
+ const std::string& getClusterName() const;
+
+ const PendingMessageTracker& getPendingMessageTracker() const;
+
+ virtual void sendCommand(const std::shared_ptr<api::StorageCommand>&);
+ virtual void sendReply(const std::shared_ptr<api::StorageReply>&);
+
+ const BucketGcTimeCalculator::BucketIdHasher&
+ getBucketIdHasher() const override {
+ return *_bucketIdHasher;
+ }
+
+private:
+ friend class Distributor_Test;
+ friend class BucketDBUpdaterTest;
+ friend class DistributorTestUtil;
+ friend class ExternalOperationHandler_Test;
+ friend class Operation_Test;
+ friend class MetricUpdateHook;
+
+ class MetricUpdateHook : public framework::MetricUpdateHook
+ {
+ public:
+ MetricUpdateHook(Distributor& self)
+ : _self(self)
+ {
+ }
+
+ void updateMetrics(const MetricLockGuard &) override {
+ _self.propagateInternalScanMetricsToExternal();
+ }
+
+ private:
+ Distributor& _self;
+ };
+
+ void setNodeStateUp();
+
+ bool handleMessage(const std::shared_ptr<api::StorageMessage>& msg);
+ bool isMaintenanceReply(const api::StorageReply& reply) const;
+
+ void handleStatusRequests();
+ void startExternalOperations();
+
+ /**
+ * Return a copy of the latest min replica data, see MinReplicaProvider.
+ */
+ std::unordered_map<uint16_t, uint32_t> getMinReplica() const override;
+
+ /**
+ * Atomically publish internal metrics to external ideal state metrics.
+ * Takes metric lock.
+ */
+ void propagateInternalScanMetricsToExternal();
+ /**
+ * Atomically updates internal metrics (not externally visible metrics;
+ * these are not changed until a snapshot triggers
+ * propagateIdealStateMetrics()).
+ *
+ * Takes metric lock.
+ */
+ void updateInternalMetricsForCompletedScan();
+ void scanAllBuckets();
+ MaintenanceScanner::ScanResult scanNextBucket();
+ void enableNextConfig();
+ void fetchStatusRequests();
+ void fetchExternalMessages();
+ void startNextMaintenanceOperation();
+ void signalWorkWasDone();
+ bool workWasDone();
+
+ void enterRecoveryMode();
+ void leaveRecoveryMode();
+
+ // Tries to generate an operation from the given message. Returns true
+ // if we either returned an operation, or the message was otherwise handled
+ // (for instance, wrong distribution).
+ bool generateOperation(const std::shared_ptr<api::StorageMessage>& msg,
+ Operation::SP& operation);
+
+ void enableNextDistribution();
+
+ lib::ClusterState _clusterState;
+
+ DistributorComponentRegister& _compReg;
+ storage::DistributorComponent _component;
+ std::shared_ptr<DistributorMetricSet> _metrics;
+
+ OperationOwner _operationOwner;
+ OperationOwner _maintenanceOperationOwner;
+
+ PendingMessageTracker _pendingMessageTracker;
+ BucketDBUpdater _bucketDBUpdater;
+ StatusReporterDelegate _distributorStatusDelegate;
+ StatusReporterDelegate _bucketDBStatusDelegate;
+ IdealStateManager _idealStateManager;
+ ExternalOperationHandler _externalOperationHandler;
+
+ mutable std::shared_ptr<lib::Distribution> _distribution;
+ std::shared_ptr<lib::Distribution> _nextDistribution;
+
+ typedef std::vector<std::shared_ptr<api::StorageMessage> > MessageQueue;
+ MessageQueue _messageQueue;
+ MessageQueue _fetchedMessages;
+ framework::TickingThreadPool& _threadPool;
+ vespalib::Monitor _statusMonitor;
+
+ class Status;
+ mutable std::vector<std::shared_ptr<Status>> _statusToDo;
+ mutable std::vector<std::shared_ptr<Status>> _fetchedStatusRequests;
+
+ bool _initializingIsUp;
+
+ DoneInitializeHandler& _doneInitializeHandler;
+ bool _doneInitializing;
+
+ ChainedMessageSender* _messageSender;
+
+ std::unique_ptr<BucketPriorityDatabase> _bucketPriorityDb;
+ std::unique_ptr<SimpleMaintenanceScanner> _scanner;
+ std::unique_ptr<ThrottlingOperationStarter> _throttlingStarter;
+ std::unique_ptr<BlockingOperationStarter> _blockingStarter;
+ std::unique_ptr<MaintenanceScheduler> _scheduler;
+ MaintenanceScheduler::SchedulingMode _schedulingMode;
+ framework::MilliSecTimer _recoveryTimeStarted;
+ framework::ThreadWaitInfo _tickResult;
+ const std::string _clusterName;
+ BucketDBMetricUpdater _bucketDBMetricUpdater;
+ std::unique_ptr<BucketGcTimeCalculator::BucketIdHasher> _bucketIdHasher;
+ MetricUpdateHook _metricUpdateHook;
+ vespalib::Lock _metricLock;
+ /**
+ * Maintenance stats for last completed database scan iteration.
+ * Access must be protected by _metricLock as it is read by metric
+ * manager thread but written by distributor thread.
+ */
+ SimpleMaintenanceScanner::PendingMaintenanceStats _maintenanceStats;
+ BucketDBMetricUpdater::Stats _bucketDbStats;
+ DistributorHostInfoReporter _hostInfoReporter;
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/distributor_host_info_reporter.cpp b/storage/src/vespa/storage/distributor/distributor_host_info_reporter.cpp
new file mode 100644
index 00000000000..0c739c65425
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/distributor_host_info_reporter.cpp
@@ -0,0 +1,103 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/distributor_host_info_reporter.h>
+#include <vespa/storage/distributor/min_replica_provider.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+
+#include <set>
+
+using std::set;
+using std::unordered_map;
+
+namespace storage {
+namespace distributor {
+
+using Object = vespalib::JsonStream::Object;
+using Array = vespalib::JsonStream::Array;
+using End = vespalib::JsonStream::End;
+
+DistributorHostInfoReporter::DistributorHostInfoReporter(
+ LatencyStatisticsProvider& latencyProvider,
+ MinReplicaProvider& minReplicaProvider)
+ : _latencyProvider(latencyProvider),
+ _minReplicaProvider(minReplicaProvider),
+ _enabled(true)
+{
+}
+
+namespace {
+
+void
+writeOperationStats(vespalib::JsonStream& stream,
+ const OperationStats& stats)
+{
+ stream << "put" << Object()
+ << "latency-ms-sum" << stats.totalLatency.count()
+ << "count" << stats.numRequests
+ << End();
+}
+
+void
+outputStorageNodes(vespalib::JsonStream& output,
+ const unordered_map<uint16_t, NodeStats>& nodeStats,
+ const unordered_map<uint16_t, uint32_t>& minReplica)
+{
+ set<uint16_t> nodes;
+ for (auto& element : nodeStats) {
+ nodes.insert(element.first);
+ }
+ for (auto& element : minReplica) {
+ nodes.insert(element.first);
+ }
+
+ for (uint16_t node : nodes) {
+ output << Object();
+ {
+ output << "node-index" << node;
+
+ auto nodeStatsIt = nodeStats.find(node);
+ if (nodeStatsIt != nodeStats.end()) {
+ output << "ops-latency" << Object();
+ {
+ writeOperationStats(output, nodeStatsIt->second.puts);
+ }
+ output << End();
+ }
+
+ auto minReplicaIt = minReplica.find(node);
+ if (minReplicaIt != minReplica.end()) {
+ output << "min-current-replication-factor"
+ << minReplicaIt->second;
+ }
+ }
+ output << End();
+ }
+}
+
+} // anonymous namespace
+
+void
+DistributorHostInfoReporter::report(vespalib::JsonStream& output)
+{
+ if (!isReportingEnabled()) {
+ return;
+ }
+
+ NodeStatsSnapshot nodeStats = _latencyProvider.getLatencyStatistics();
+ std::unordered_map<uint16_t, uint32_t> minReplica =
+ _minReplicaProvider.getMinReplica();
+
+ output << "distributor" << Object();
+ {
+ output << "storage-nodes" << Array();
+
+ outputStorageNodes(output, nodeStats.nodeToStats, minReplica);
+
+ output << End();
+ }
+ output << End();
+}
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/distributor_host_info_reporter.h b/storage/src/vespa/storage/distributor/distributor_host_info_reporter.h
new file mode 100644
index 00000000000..219ec35da1d
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/distributor_host_info_reporter.h
@@ -0,0 +1,51 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/common/hostreporter/hostreporter.h>
+#include <atomic>
+
+namespace storage {
+namespace distributor {
+
+class LatencyStatisticsProvider;
+class MinReplicaProvider;
+struct OperationStats;
+
+class DistributorHostInfoReporter : public HostReporter
+{
+public:
+ DistributorHostInfoReporter(LatencyStatisticsProvider& latencyProvider,
+ MinReplicaProvider& minReplicaProvider);
+
+ DistributorHostInfoReporter(const DistributorHostInfoReporter&) = delete;
+ DistributorHostInfoReporter& operator=(
+ const DistributorHostInfoReporter&) = delete;
+
+ void report(vespalib::JsonStream& output) override;
+
+ /**
+ * Set wether per-node latency, replication factors, merge stats etc are
+ * to be included in the generated JSON report.
+ *
+ * Thread safe.
+ */
+ void enableReporting(bool enabled) noexcept {
+ _enabled.store(enabled, std::memory_order_relaxed);
+ }
+
+ /**
+ * Thread safe.
+ */
+ bool isReportingEnabled() const noexcept {
+ return _enabled.load(std::memory_order_relaxed);
+ }
+
+private:
+ LatencyStatisticsProvider& _latencyProvider;
+ MinReplicaProvider& _minReplicaProvider;
+ std::atomic<bool> _enabled;
+};
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/distributorcomponent.cpp b/storage/src/vespa/storage/distributor/distributorcomponent.cpp
new file mode 100644
index 00000000000..a8e5a380b8e
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/distributorcomponent.cpp
@@ -0,0 +1,356 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/distributorcomponent.h>
+#include <vespa/log/log.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/storageapi/messageapi/storagereply.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/common/bucketoperationlogger.h>
+
+LOG_SETUP(".distributorstoragelink");
+
+namespace storage {
+
+namespace distributor {
+
+DistributorComponent::DistributorComponent(
+ DistributorInterface& distributor,
+ DistributorComponentRegister& compReg,
+ const std::string& name)
+ : storage::DistributorComponent(compReg, name),
+ _distributor(distributor)
+{
+}
+
+void
+DistributorComponent::sendDown(const api::StorageMessage::SP& msg)
+{
+ _distributor.getMessageSender().sendDown(msg);
+}
+
+void
+DistributorComponent::sendUp(const api::StorageMessage::SP& msg)
+{
+ _distributor.getMessageSender().sendUp(msg);
+}
+
+const lib::ClusterState&
+DistributorComponent::getClusterState() const
+{
+ return _distributor.getClusterState();
+};
+
+std::vector<uint16_t>
+DistributorComponent::getIdealNodes(const document::BucketId& bid) const
+{
+ return getDistribution().getIdealStorageNodes(
+ getClusterState(),
+ bid,
+ _distributor.getStorageNodeUpStates());
+}
+
+BucketOwnership
+DistributorComponent::checkOwnershipInPendingAndGivenState(
+ const lib::Distribution& distribution,
+ const lib::ClusterState& clusterState,
+ const document::BucketId& bucket) const
+{
+ try {
+ BucketOwnership pendingRes(
+ _distributor.checkOwnershipInPendingState(bucket));
+ if (!pendingRes.isOwned()) {
+ return pendingRes;
+ }
+ uint16_t distributor = distribution.getIdealDistributorNode(
+ clusterState, bucket);
+
+ if (getIndex() == distributor) {
+ return BucketOwnership::createOwned();
+ } else {
+ return BucketOwnership::createNotOwnedInState(clusterState);
+ }
+ } catch (lib::TooFewBucketBitsInUseException& e) {
+ return BucketOwnership::createNotOwnedInState(clusterState);
+ } catch (lib::NoDistributorsAvailableException& e) {
+ return BucketOwnership::createNotOwnedInState(clusterState);
+ }
+}
+
+BucketOwnership
+DistributorComponent::checkOwnershipInPendingAndCurrentState(
+ const document::BucketId& bucket) const
+{
+ return checkOwnershipInPendingAndGivenState(
+ getDistribution(), getClusterState(), bucket);
+}
+
+bool
+DistributorComponent::ownsBucketInState(
+ const lib::Distribution& distribution,
+ const lib::ClusterState& clusterState,
+ const document::BucketId& bucket) const
+{
+ LOG(spam, "checking bucket %s in state %s with distr %s",
+ bucket.toString().c_str(), clusterState.toString().c_str(),
+ distribution.getNodeGraph().getDistributionConfigHash().c_str());
+ try {
+ uint16_t distributor = distribution.getIdealDistributorNode(
+ clusterState, bucket);
+
+ return (getIndex() == distributor);
+ } catch (lib::TooFewBucketBitsInUseException& e) {
+ return false;
+ } catch (lib::NoDistributorsAvailableException& e) {
+ return false;
+ }
+}
+
+bool
+DistributorComponent::ownsBucketInState(
+ const lib::ClusterState& clusterState,
+ const document::BucketId& bucket) const
+{
+ return ownsBucketInState(getDistribution(), clusterState, bucket);
+}
+
+bool
+DistributorComponent::ownsBucketInCurrentState(
+ const document::BucketId& bucket) const
+{
+ return ownsBucketInState(getDistribution(), getClusterState(), bucket);
+}
+
+api::StorageMessageAddress
+DistributorComponent::nodeAddress(uint16_t nodeIndex) const
+{
+ return api::StorageMessageAddress(
+ getClusterName(),
+ lib::NodeType::STORAGE,
+ nodeIndex);
+}
+
+uint16_t
+DistributorComponent::getRedundancy() const {
+ return getDistribution().getRedundancy();
+}
+
+bool
+DistributorComponent::checkDistribution(
+ api::StorageCommand &cmd,
+ const document::BucketId& bid)
+{
+ BucketOwnership bo(checkOwnershipInPendingAndCurrentState(bid));
+ if (!bo.isOwned()) {
+ std::string systemStateStr = bo.getNonOwnedState().toString();
+ LOG(debug,
+ "Got message with wrong distribution, "
+ "bucketid %s sending back state '%s'",
+ bid.toString().c_str(),
+ systemStateStr.c_str());
+
+ api::StorageReply::UP reply(cmd.makeReply());
+ api::ReturnCode ret(
+ api::ReturnCode::WRONG_DISTRIBUTION,
+ systemStateStr);
+ reply->setResult(ret);
+ sendUp(std::shared_ptr<api::StorageMessage>(reply.release()));
+ return false;
+ }
+ return true;
+}
+
+void
+DistributorComponent::removeNodesFromDB(const document::BucketId& bucketId,
+ const std::vector<uint16_t>& nodes)
+{
+ BucketDatabase::Entry dbentry = getBucketDatabase().get(bucketId);
+
+ if (dbentry.valid()) {
+ for (uint32_t i = 0; i < nodes.size(); ++i) {
+ if (dbentry->removeNode(nodes[i])) {
+ LOG(debug,
+ "Removed node %d from bucket %s. %u copies remaining",
+ nodes[i],
+ bucketId.toString().c_str(),
+ dbentry->getNodeCount());
+ }
+ }
+
+ if (dbentry->getNodeCount() != 0) {
+ getBucketDatabase().update(dbentry);
+ } else {
+ LOG(debug,
+ "After update, bucket %s now has no copies. "
+ "Removing from database.",
+ bucketId.toString().c_str());
+
+ getBucketDatabase().remove(bucketId);
+ }
+ }
+}
+
+std::vector<uint16_t>
+DistributorComponent::enumerateDownNodes(
+ const lib::ClusterState& s,
+ const document::BucketId& bucket,
+ const std::vector<BucketCopy>& candidates) const
+{
+ std::vector<uint16_t> downNodes;
+ for (uint32_t i = 0; i < candidates.size(); ++i) {
+ const BucketCopy& copy(candidates[i]);
+ const lib::NodeState& ns(
+ s.getNodeState(lib::Node(lib::NodeType::STORAGE,
+ copy.getNode())));
+ if (ns.getState() == lib::State::DOWN) {
+ LOG(debug,
+ "Trying to add a bucket copy to %s whose node is marked as "
+ "down in the cluster state: %s. Ignoring it since no zombies "
+ "are allowed!",
+ bucket.toString().c_str(),
+ copy.toString().c_str());
+ downNodes.push_back(copy.getNode());
+ }
+ }
+ return downNodes;
+}
+
+void
+DistributorComponent::updateBucketDatabase(
+ const document::BucketId& bucketId,
+ const std::vector<BucketCopy>& changedNodes,
+ uint32_t updateFlags)
+{
+ assert(!(bucketId == document::BucketId()));
+ BucketDatabase::Entry dbentry = getBucketDatabase().get(bucketId);
+
+ BucketOwnership ownership(checkOwnershipInPendingAndCurrentState(bucketId));
+ if (!ownership.isOwned()) {
+ LOG(debug,
+ "Trying to add %s to database that we do not own according to "
+ "cluster state '%s' - ignoring!",
+ bucketId.toString().c_str(),
+ ownership.getNonOwnedState().toString().c_str());
+ LOG_BUCKET_OPERATION_NO_LOCK(bucketId, "Ignoring database insert since "
+ "we do not own the bucket");
+ return;
+ }
+
+ if (!dbentry.valid()) {
+ if (updateFlags & DatabaseUpdate::CREATE_IF_NONEXISTING) {
+ dbentry = BucketDatabase::Entry(bucketId, BucketInfo());
+ } else {
+ return;
+ }
+ }
+
+ // 0 implies bucket was just added. Since we don't know if any other
+ // distributor has run GC on it, we just have to assume this and set the
+ // timestamp to the current time to avoid duplicate work.
+ if (dbentry->getLastGarbageCollectionTime() == 0) {
+ dbentry->setLastGarbageCollectionTime(
+ getClock().getTimeInSeconds().getTime());
+ }
+
+ // Ensure that we're not trying to bring any zombie copies into the
+ // bucket database (i.e. copies on nodes that are actually down).
+ std::vector<uint16_t> downNodes(
+ enumerateDownNodes(getClusterState(), bucketId, changedNodes));
+ // Optimize for common case where we don't have to create a new
+ // bucket copy vector
+ if (downNodes.empty()) {
+ dbentry->addNodes(changedNodes, getIdealNodes(bucketId));
+ } else {
+ std::vector<BucketCopy> upNodes;
+ for (uint32_t i = 0; i < changedNodes.size(); ++i) {
+ const BucketCopy& copy(changedNodes[i]);
+ if (std::find(downNodes.begin(), downNodes.end(),
+ copy.getNode())
+ == downNodes.end())
+ {
+ upNodes.push_back(copy);
+ }
+ }
+ dbentry->addNodes(upNodes, getIdealNodes(bucketId));
+ }
+ if (updateFlags & DatabaseUpdate::RESET_TRUSTED) {
+ dbentry->resetTrusted();
+ }
+ getBucketDatabase().update(dbentry);
+}
+
+void
+DistributorComponent::recheckBucketInfo(uint16_t nodeIdx,
+ const document::BucketId& bid)
+{
+ _distributor.recheckBucketInfo(nodeIdx, bid);
+}
+
+document::BucketId
+DistributorComponent::getBucketId(const document::DocumentId& docId) const
+{
+ document::BucketId id(getBucketIdFactory().getBucketId(docId));
+
+ id.setUsedBits(_distributor.getConfig().getMinimalBucketSplit());
+ return id.stripUnused();
+}
+
+bool
+DistributorComponent::storageNodeIsUp(uint32_t nodeIndex) const
+{
+ const lib::NodeState& ns = getClusterState().getNodeState(
+ lib::Node(lib::NodeType::STORAGE, nodeIndex));
+
+ return ns.getState().oneOf(_distributor.getStorageNodeUpStates());
+}
+
+document::BucketId
+DistributorComponent::getSibling(const document::BucketId& bid) const {
+ document::BucketId zeroBucket;
+ document::BucketId oneBucket;
+
+ if (bid.getUsedBits() == 1) {
+ zeroBucket = document::BucketId(1, 0);
+ oneBucket = document::BucketId(1, 1);
+ } else {
+ document::BucketId joinedBucket = document::BucketId(
+ bid.getUsedBits() - 1,
+ bid.getId());
+
+ zeroBucket = document::BucketId(
+ bid.getUsedBits(),
+ joinedBucket.getId());
+
+ uint64_t hiBit = 1;
+ hiBit <<= (bid.getUsedBits() - 1);
+ oneBucket = document::BucketId(
+ bid.getUsedBits(),
+ joinedBucket.getId() | hiBit);
+ }
+
+ return (zeroBucket == bid) ? oneBucket : zeroBucket;
+};
+
+BucketDatabase::Entry
+DistributorComponent::createAppropriateBucket(const document::BucketId& bid)
+{
+ return getBucketDatabase().createAppropriateBucket(
+ _distributor.getConfig().getMinimalBucketSplit(),
+ bid);
+}
+
+document::BucketId
+DistributorComponent::getAppropriateBucket(const document::BucketId& bid)
+{
+ return getBucketDatabase().getAppropriateBucket(
+ _distributor.getConfig().getMinimalBucketSplit(),
+ bid);
+}
+
+bool
+DistributorComponent::initializing() const {
+ return _distributor.initializing();
+}
+
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/distributorcomponent.h b/storage/src/vespa/storage/distributor/distributorcomponent.h
new file mode 100644
index 00000000000..fbf773e7fff
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/distributorcomponent.h
@@ -0,0 +1,206 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/distributorinterface.h>
+#include <vespa/storage/distributor/operationowner.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/storageapi/messageapi/storagecommand.h>
+#include <vespa/storageapi/buckets/bucketinfo.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/common/distributorcomponent.h>
+#include <vespa/storage/distributor/statechecker.h>
+#include <vespa/storage/storageutil/utils.h>
+#include <vespa/storageframework/storageframework.h>
+
+namespace storage {
+
+namespace distributor {
+
+struct DatabaseUpdate {
+ enum UpdateFlags {
+ CREATE_IF_NONEXISTING = 1,
+ RESET_TRUSTED = 2
+ };
+};
+
+/**
+ * Takes care of subscribing to document manager config and
+ * making those values available to other subcomponents.
+ */
+class DistributorComponent : public storage::DistributorComponent
+{
+public:
+ DistributorComponent(DistributorInterface& distributor,
+ DistributorComponentRegister& compReg,
+ const std::string& name);
+
+ virtual ~DistributorComponent() {}
+
+ /**
+ * Returns the ownership status of a bucket as decided with the given
+ * distribution and cluster state -and- that of the pending cluster
+ * state and distribution (if any pending exists).
+ */
+ BucketOwnership checkOwnershipInPendingAndGivenState(
+ const lib::Distribution& distribution,
+ const lib::ClusterState& clusterState,
+ const document::BucketId& bucket) const;
+
+ BucketOwnership checkOwnershipInPendingAndCurrentState(
+ const document::BucketId& bucket) const;
+
+ bool ownsBucketInState(const lib::Distribution& distribution,
+ const lib::ClusterState& clusterState,
+ const document::BucketId& bucket) const;
+
+ /**
+ * Returns true if this distributor owns the given bucket in the
+ * given cluster and current distribution config.
+ */
+ bool ownsBucketInState(const lib::ClusterState& clusterState,
+ const document::BucketId& bucket) const;
+
+ /**
+ * Returns true if this distributor owns the given bucket with the current
+ * cluster state and distribution config.
+ */
+ bool ownsBucketInCurrentState(const document::BucketId&) const;
+
+ /**
+ * Returns a reference to the current system state. Valid until the next
+ * time the distributor main thread processes its message queue.
+ */
+ const lib::ClusterState& getClusterState() const;
+
+ /**
+ * Returns the ideal nodes for the given bucket.
+ */
+ std::vector<uint16_t> getIdealNodes(const document::BucketId& bucketId) const;
+
+ /**
+ * Returns the slobrok address of the given storage node.
+ */
+ api::StorageMessageAddress nodeAddress(uint16_t nodeIndex) const;
+
+ /**
+ * Returns true if the given storage node is in an "up state".
+ */
+ bool storageNodeIsUp(uint32_t nodeIndex) const;
+
+ /**
+ * Returns the current desired redundancy level.
+ */
+ uint16_t getRedundancy() const;
+
+ /**
+ * Verifies that the given command has been received at the
+ * correct distributor based on the current system state.
+ */
+ bool checkDistribution(
+ api::StorageCommand& cmd,
+ const document::BucketId& bid);
+
+ /**
+ * Removes the given bucket copies from the bucket database.
+ * If the resulting bucket is empty afterwards, removes the entire
+ * bucket entry from the bucket database.
+ */
+ void removeNodesFromDB(const document::BucketId& id,
+ const std::vector<uint16_t>& nodes);
+
+ /**
+ * Removes a copy from the given bucket from the bucket database.
+ * If the resulting bucket is empty afterwards, removes the entire
+ * bucket entry from the bucket database.
+ */
+ void removeNodeFromDB(const document::BucketId& id, uint16_t node) {
+ removeNodesFromDB(id, toVector<uint16_t>(node));
+ }
+
+ /**
+ * Adds the given copies to the bucket database.
+ */
+ void updateBucketDatabase(
+ const document::BucketId& bid,
+ const std::vector<BucketCopy>& changedNodes,
+ uint32_t updateFlags = 0);
+
+ /**
+ * Simple API for the common case of modifying a single node.
+ */
+ void updateBucketDatabase(
+ const document::BucketId& bid,
+ const BucketCopy& changedNode,
+ uint32_t updateFlags = 0)
+ {
+ updateBucketDatabase(bid,
+ toVector<BucketCopy>(changedNode),
+ updateFlags);
+ }
+
+ /**
+ * Fetch bucket info about the given bucket from the given node.
+ * Used when we get BUCKET_NOT_FOUND.
+ */
+ void recheckBucketInfo(uint16_t nodeIdx,
+ const document::BucketId& id);
+
+ /**
+ * Returns the bucket id corresponding to the given document id.
+ */
+ document::BucketId getBucketId(const document::DocumentId& docId) const;
+
+ void sendDown(const api::StorageMessage::SP&);
+ void sendUp(const api::StorageMessage::SP&);
+
+ DistributorInterface& getDistributor() { return _distributor; }
+
+ const DistributorInterface& getDistributor() const {
+ return _distributor;
+ }
+
+ virtual BucketDatabase& getBucketDatabase()
+ { return _distributor.getBucketDatabase(); }
+
+ virtual const BucketDatabase& getBucketDatabase() const
+ { return _distributor.getBucketDatabase(); }
+
+ const lib::Distribution& getDistribution() const
+ { return _distributor.getDistribution(); };
+
+ /**
+ * Finds a bucket that has the same direct parent as the given bucket
+ * (i.e. split one bit less), but different bit in the most used bit.
+ */
+ document::BucketId getSibling(const document::BucketId& bid) const;
+
+ /**
+ * Gets a bucket that is split correctly according to other buckets that
+ * are in the bucket database. For instance, if you have a sibling bucket of
+ * the bucket, a similarly split bucket should be created.
+ */
+ document::BucketId getAppropriateBucket(const document::BucketId& bid);
+
+ BucketDatabase::Entry createAppropriateBucket(const document::BucketId& bid);
+
+ /**
+ * Returns true if the node is currently initializing.
+ */
+ bool initializing() const;
+
+private:
+ std::vector<uint16_t> enumerateDownNodes(
+ const lib::ClusterState& s,
+ const document::BucketId& bucket,
+ const std::vector<BucketCopy>& candidates) const;
+ DistributorInterface& _distributor;
+
+protected:
+
+ vespalib::Lock _sync;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/distributorconfiguration.cpp b/storage/src/vespa/storage/distributor/distributorconfiguration.cpp
new file mode 100644
index 00000000000..0ac1851fdc4
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/distributorconfiguration.cpp
@@ -0,0 +1,176 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/storage/distributor/distributorconfiguration.h>
+#include <vespa/log/log.h>
+#include <vespa/document/select/parser.h>
+#include <vespa/document/select/node.h>
+#include <vespa/document/select/traversingvisitor.h>
+#include <vespa/vespalib/util/exceptions.h>
+
+LOG_SETUP(".distributorconfiguration");
+
+namespace storage {
+
+namespace distributor {
+
+DistributorConfiguration::DistributorConfiguration(StorageComponent& component)
+ : _component(component),
+ _byteCountSplitLimit(0xffffffff),
+ _docCountSplitLimit(0xffffffff),
+ _byteCountJoinLimit(0),
+ _docCountJoinLimit(0),
+ _minimalBucketSplit(16),
+ _maxIdealStateOperations(100),
+ _idealStateChunkSize(1000),
+ _maxNodesPerMerge(16),
+ _lastGarbageCollectionChange(0),
+ _garbageCollectionInterval(0),
+ _minPendingMaintenanceOps(100),
+ _maxPendingMaintenanceOps(1000),
+ _maxVisitorsPerNodePerClientVisitor(4),
+ _minBucketsPerVisitor(5),
+ _minTimeLeftToResend(20),
+ _doInlineSplit(true),
+ _enableJoinForSiblingLessBuckets(false),
+ _enableInconsistentJoin(false),
+ _enableHostInfoReporting(true),
+ _disableBucketActivation(false),
+ _minimumReplicaCountingMode(ReplicaCountingMode::TRUSTED)
+{
+
+}
+
+namespace {
+
+class TimeVisitor : public document::select::TraversingVisitor {
+public:
+ bool hasCurrentTime;
+
+ TimeVisitor() : hasCurrentTime(false) {}
+
+ void visitCurrentTimeValueNode(const document::select::CurrentTimeValueNode&) {
+ hasCurrentTime = true;
+ }
+};
+
+}
+
+bool
+DistributorConfiguration::containsTimeStatement(const std::string& documentSelection) const
+{
+ TimeVisitor visitor;
+ try {
+ document::select::Parser parser(*_component.getTypeRepo(),
+ _component.getBucketIdFactory());
+
+ std::unique_ptr<document::select::Node> node = parser.parse(documentSelection);
+ node->visit(visitor);
+ } catch (std::exception& e) {
+ LOG(error,
+ "Caught exception during config-time processing of GC "
+ "selection '%s', terminating process to force full "
+ "reconfiguration: %s",
+ documentSelection.c_str(),
+ e.what());
+ std::terminate();
+ }
+ return visitor.hasCurrentTime;
+}
+
+void
+DistributorConfiguration::configureMaintenancePriorities(
+ const vespa::config::content::core::StorDistributormanagerConfig& cfg)
+{
+ MaintenancePriorities& mp(_maintenancePriorities);
+ mp.mergeMoveToIdealNode = cfg.priorityMergeMoveToIdealNode;
+ mp.mergeOutOfSyncCopies = cfg.priorityMergeOutOfSyncCopies;
+ mp.mergeTooFewCopies = cfg.priorityMergeTooFewCopies;
+ mp.activateNoExistingActive = cfg.priorityActivateNoExistingActive;
+ mp.activateWithExistingActive = cfg.priorityActivateWithExistingActive;
+ mp.deleteBucketCopy = cfg.priorityDeleteBucketCopy;
+ mp.joinBuckets = cfg.priorityJoinBuckets;
+ mp.splitDistributionBits = cfg.prioritySplitDistributionBits;
+ mp.splitLargeBucket = cfg.prioritySplitLargeBucket;
+ mp.splitInconsistentBucket = cfg.prioritySplitInconsistentBucket;
+ mp.garbageCollection = cfg.priorityGarbageCollection;
+}
+
+void
+DistributorConfiguration::configure(const vespa::config::content::core::StorDistributormanagerConfig& config)
+{
+ if ((config.splitsize != 0 && config.joinsize > config.splitsize)
+ || (config.splitcount != 0 && config.joincount > config.splitcount))
+ {
+ std::ostringstream ost;
+ ost << "Split limits must be higher than join limits (both count and "
+ << "size). Values gotten are size(join(" << config.joinsize
+ << ")/split(" << config.splitsize << ")) count(join("
+ << config.joincount << ")/split(" << config.splitcount << "))";
+ throw vespalib::IllegalArgumentException(ost.str(), VESPA_STRLOC);
+ }
+
+ _maxIdealStateOperations = config.maxpendingidealstateoperations;
+ _byteCountSplitLimit = config.splitsize;
+ _docCountSplitLimit = config.splitcount;
+ _byteCountJoinLimit = config.joinsize;
+ _docCountJoinLimit = config.joincount;
+ _minimalBucketSplit = config.minsplitcount;
+ _maxNodesPerMerge = config.maximumNodesPerMerge;
+
+ _garbageCollectionInterval = config.garbagecollection.interval;
+
+ if (containsTimeStatement(config.garbagecollection.selectiontoremove)) {
+ // Always changes.
+ _lastGarbageCollectionChange = 1;
+ } else if (_garbageCollectionSelection != config.garbagecollection.selectiontoremove) {
+ _lastGarbageCollectionChange = time(NULL);
+ }
+
+ _garbageCollectionSelection = config.garbagecollection.selectiontoremove;
+
+ // Don't garbage collect with empty selection.
+ if (_garbageCollectionSelection.empty()) {
+ _garbageCollectionInterval = 0;
+ }
+
+ _blockedStateCheckers.clear();
+ for (uint32_t i = 0; i < config.blockedstatecheckers.size(); ++i) {
+ _blockedStateCheckers.insert(config.blockedstatecheckers[i]);
+ }
+
+ _doInlineSplit = config.inlinebucketsplitting;
+ _enableJoinForSiblingLessBuckets = config.enableJoinForSiblingLessBuckets;
+ _enableInconsistentJoin = config.enableInconsistentJoin;
+
+ _enableHostInfoReporting = config.enableHostInfoReporting;
+ _disableBucketActivation = config.disableBucketActivation;
+
+ _minimumReplicaCountingMode = config.minimumReplicaCountingMode;
+
+ configureMaintenancePriorities(config);
+
+ LOG(debug,
+ "Distributor now using new configuration parameters. Split limits: %d docs/%d bytes. "
+ "Join limits: %d docs/%d bytes. Minimal bucket split %d. "
+ "Documents to garbage collect: %s (check every %d seconds). "
+ "Maximum pending ideal state operations: %d",
+ (int)_docCountSplitLimit,
+ (int)_byteCountSplitLimit,
+ (int)_docCountJoinLimit,
+ (int)_byteCountJoinLimit,
+ (int)_minimalBucketSplit,
+ _garbageCollectionSelection.c_str(),
+ (int)_garbageCollectionInterval,
+ (int)_maxIdealStateOperations);
+}
+
+void
+DistributorConfiguration::configure(const vespa::config::content::core::StorVisitordispatcherConfig& config)
+{
+ _minTimeLeftToResend = config.storagenetworklatency;
+ _minBucketsPerVisitor = config.minbucketspervisitor;
+ _maxVisitorsPerNodePerClientVisitor = config.maxvisitorspernodeperclientvisitor;
+}
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/distributorconfiguration.h b/storage/src/vespa/storage/distributor/distributorconfiguration.h
new file mode 100644
index 00000000000..efc57fe534b
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/distributorconfiguration.h
@@ -0,0 +1,276 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/config/config-stor-distributormanager.h>
+#include <vespa/storage/config/config-stor-visitordispatcher.h>
+#include <vespa/vespalib/stllike/hash_set.h>
+#include <vespa/storage/common/storagecomponent.h>
+
+namespace storage {
+namespace distributor {
+
+
+class DistributorConfiguration {
+public:
+ DistributorConfiguration(StorageComponent& component);
+
+ struct MaintenancePriorities
+ {
+ // Defaults for these are chosen as those used as the current (non-
+ // configurable) values at the time of implementation.
+ uint8_t mergeMoveToIdealNode {120};
+ uint8_t mergeOutOfSyncCopies {120};
+ uint8_t mergeTooFewCopies {120};
+ uint8_t activateNoExistingActive {100};
+ uint8_t activateWithExistingActive {100};
+ uint8_t deleteBucketCopy {100};
+ uint8_t joinBuckets {155};
+ uint8_t splitDistributionBits {200};
+ uint8_t splitLargeBucket {175};
+ uint8_t splitInconsistentBucket {110};
+ uint8_t garbageCollection {200};
+ };
+
+ using DistrConfig = vespa::config::content::core::StorDistributormanagerConfig;
+
+ void configure(const DistrConfig& config);
+
+ void configure(const vespa::config::content::core::StorVisitordispatcherConfig& config);
+
+ void setIdealStateChunkSize(uint32_t chunkSize) {
+ _idealStateChunkSize = chunkSize;
+ }
+
+ uint32_t getIdealStateChunkSize() {
+ return _idealStateChunkSize;
+ }
+
+ uint32_t lastGarbageCollectionChangeTime() const {
+ return _lastGarbageCollectionChange;
+ }
+
+ const std::string& getGarbageCollectionSelection() const {
+ return _garbageCollectionSelection;
+ }
+
+ uint32_t getGarbageCollectionInterval() const {
+ return _garbageCollectionInterval;
+ }
+
+ void setGarbageCollection(const std::string& selection, uint32_t interval) {
+ _garbageCollectionSelection = selection;
+ _garbageCollectionInterval = interval;
+ }
+
+ void setLastGarbageCollectionChangeTime(uint32_t lastChangeTime) {
+ _lastGarbageCollectionChange = lastChangeTime;
+ }
+
+ bool stateCheckerIsActive(const vespalib::stringref & stateCheckerName) const {
+ return _blockedStateCheckers.find(stateCheckerName) == _blockedStateCheckers.end();
+ }
+
+ void disableStateChecker(const vespalib::stringref & stateCheckerName) {
+ _blockedStateCheckers.insert(stateCheckerName);
+ }
+
+ void setDoInlineSplit(bool value) {
+ _doInlineSplit = value;
+ }
+
+ bool doInlineSplit() const {
+ return _doInlineSplit;
+ }
+
+ /**
+ Sets the number of documents needed for a bucket to be split.
+
+ @param count The minimum number of documents a bucket needs to have to be split.
+ */
+ void setSplitCount(uint32_t count) { _docCountSplitLimit = count; }
+
+ /**
+ Sets the number of bytes needed for a bucket to be split.
+
+ @param sz The minimum size (in bytes) a bucket needs to have in order to be split.
+ */
+ void setSplitSize(uint32_t sz) { _byteCountSplitLimit = sz; }
+
+ /**
+ Sets the maximum number of documents two buckets can have in order to be joined. The sum
+ of the documents in the two buckets need to be below this limit for join to occur.
+
+ @param count The maximum number of documents two buckets need to have in order to be joined.
+ */
+ void setJoinCount(uint32_t count) { _docCountJoinLimit = count; }
+
+ /**
+ Sets the maximum number of stored bytes two buckets can have in order to be joined. The sum
+ of the sizes of the two buckets need to be below this limit for join to occur.
+
+ @param count The maximum size the two buckets need to have in order to be joined.
+ */
+ void setJoinSize(uint32_t sz) { _byteCountJoinLimit = sz; }
+
+ /**
+ Sets the minimal bucket split level we want buckets to have. Buckets that have fewer used bits
+ than this are automatically split.
+
+ @param splitBits The minimal bucket split level.
+ */
+ void setMinimalBucketSplit(int splitBits) { _minimalBucketSplit = splitBits; };
+
+ /**
+ Sets the maximum number of ideal state operations a distributor should
+ schedule to each storage node.
+
+ @param numOps The number of operations to schedule.
+ */
+ void setMaxIdealStateOperations(uint32_t numOps) {
+ _maxIdealStateOperations = numOps;
+ };
+
+ uint32_t getMaxIdealStateOperations() {
+ return _maxIdealStateOperations;
+ }
+
+ void setMaintenancePriorities(const MaintenancePriorities& mp) {
+ _maintenancePriorities = mp;
+ }
+
+ const MaintenancePriorities& getMaintenancePriorities() const {
+ return _maintenancePriorities;
+ }
+
+ /**
+ @see setSplitCount
+ */
+ uint32_t getSplitCount() const { return _docCountSplitLimit; }
+
+ /**
+ @see setSplitSize
+ */
+ uint32_t getSplitSize() const { return _byteCountSplitLimit; }
+
+ /**
+ @see setJoinCount
+ */
+ uint32_t getJoinCount() const { return _docCountJoinLimit; }
+
+ /**
+ @see setJoinSize
+ */
+ uint32_t getJoinSize() const { return _byteCountJoinLimit; }
+
+ /**
+ @see setMinimalBucketSplit
+ */
+ uint32_t getMinimalBucketSplit() const { return _minimalBucketSplit; };
+
+ uint32_t getMinPendingMaintenanceOps() const {
+ return _minPendingMaintenanceOps;
+ }
+ void setMinPendingMaintenanceOps(uint32_t minPendingMaintenanceOps) {
+ _minPendingMaintenanceOps = minPendingMaintenanceOps;
+ }
+ uint32_t getMaxPendingMaintenanceOps() const {
+ return _maxPendingMaintenanceOps;
+ }
+ void setMaxPendingMaintenanceOps(uint32_t maxPendingMaintenanceOps) {
+ _maxPendingMaintenanceOps = maxPendingMaintenanceOps;
+ }
+
+ uint32_t getMaxVisitorsPerNodePerClientVisitor() const {
+ return _maxVisitorsPerNodePerClientVisitor;
+ }
+ uint32_t getMinBucketsPerVisitor() const {
+ return _minBucketsPerVisitor;
+ }
+ int64_t getMinTimeLeftToResend() const {
+ return _minTimeLeftToResend;
+ }
+
+ void setMaxVisitorsPerNodePerClientVisitor(uint32_t n) {
+ _maxVisitorsPerNodePerClientVisitor = n;
+ }
+ void setMinBucketsPerVisitor(uint32_t n) {
+ _minBucketsPerVisitor = n;
+ }
+ void setMinTimeLeftToResend(int64_t minTime) {
+ _minTimeLeftToResend = minTime;
+ }
+ uint32_t getMaxNodesPerMerge() const {
+ return _maxNodesPerMerge;
+ }
+ bool getEnableJoinForSiblingLessBuckets() const {
+ return _enableJoinForSiblingLessBuckets;
+ }
+ bool getEnableInconsistentJoin() const noexcept {
+ return _enableInconsistentJoin;
+ }
+
+ bool getEnableHostInfoReporting() const noexcept {
+ return _enableHostInfoReporting;
+ }
+
+ using ReplicaCountingMode = DistrConfig::MinimumReplicaCountingMode;
+ void setMinimumReplicaCountingMode(ReplicaCountingMode mode) noexcept {
+ _minimumReplicaCountingMode = mode;
+ }
+ ReplicaCountingMode getMinimumReplicaCountingMode() const noexcept {
+ return _minimumReplicaCountingMode;
+ }
+ bool isBucketActivationDisabled() const noexcept {
+ return _disableBucketActivation;
+ }
+
+private:
+ DistributorConfiguration(const DistributorConfiguration& other);
+ DistributorConfiguration& operator=(const DistributorConfiguration& other);
+
+ StorageComponent& _component;
+
+ uint32_t _byteCountSplitLimit;
+ uint32_t _docCountSplitLimit;
+ uint32_t _byteCountJoinLimit;
+ uint32_t _docCountJoinLimit;
+ uint32_t _minimalBucketSplit;
+ uint32_t _maxIdealStateOperations;
+ uint32_t _idealStateChunkSize;
+ uint32_t _maxNodesPerMerge;
+
+ std::string _garbageCollectionSelection;
+
+ uint32_t _lastGarbageCollectionChange;
+ uint32_t _garbageCollectionInterval;
+
+ uint32_t _minPendingMaintenanceOps;
+ uint32_t _maxPendingMaintenanceOps;
+
+ vespalib::hash_set<vespalib::string> _blockedStateCheckers;
+
+ uint32_t _maxVisitorsPerNodePerClientVisitor;
+ uint32_t _minBucketsPerVisitor;
+ int64_t _minTimeLeftToResend;
+
+ MaintenancePriorities _maintenancePriorities;
+
+ bool _doInlineSplit;
+ bool _enableJoinForSiblingLessBuckets;
+ bool _enableInconsistentJoin;
+ bool _enableHostInfoReporting;
+ bool _disableBucketActivation;
+
+ DistrConfig::MinimumReplicaCountingMode _minimumReplicaCountingMode;
+
+ friend class Distributor_Test;
+
+ bool containsTimeStatement(const std::string& documentSelection) const;
+ void configureMaintenancePriorities(
+ const vespa::config::content::core::StorDistributormanagerConfig&);
+};
+
+}
+}
+
+
diff --git a/storage/src/vespa/storage/distributor/distributorinterface.h b/storage/src/vespa/storage/distributor/distributorinterface.h
new file mode 100644
index 00000000000..5562670c35e
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/distributorinterface.h
@@ -0,0 +1,79 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/common/distributorcomponent.h>
+#include <vespa/storage/common/messagesender.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storage/distributor/maintenancebucket.h>
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/storage/distributor/bucketgctimecalculator.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storage/distributor/distributorconfiguration.h>
+#include <vespa/storage/distributor/distributormessagesender.h>
+#include <vespa/storage/distributor/bucketownership.h>
+
+namespace storage {
+
+namespace distributor {
+
+class DistributorInterface : public DistributorMessageSender
+{
+public:
+ virtual PendingMessageTracker& getPendingMessageTracker() = 0;
+ virtual const lib::Distribution& getDistribution() const = 0;
+
+ virtual DistributorMetricSet& getMetrics() = 0;
+
+ virtual void enableClusterState(const lib::ClusterState& state) = 0;
+
+ virtual BucketOwnership checkOwnershipInPendingState(const document::BucketId&) const = 0;
+
+ virtual void notifyDistributionChangeEnabled() = 0;
+
+ /**
+ * Requests that we send a requestBucketInfo for the given bucket to the given
+ * node. Should be called whenever we receive a BUCKET_NOT_FOUND result.
+ */
+ virtual void recheckBucketInfo(uint16_t nodeIdx, const document::BucketId& bid) = 0;
+
+ virtual bool handleReply(const std::shared_ptr<api::StorageReply>& reply) = 0;
+
+ /**
+ * Checks whether a bucket needs to be split, and sends a split
+ * if so.
+ *
+ * @param e The bucket to check.
+ * @param pri The priority the split should be sent at.
+ */
+ virtual void checkBucketForSplit(const BucketDatabase::Entry& e, uint8_t pri) = 0;
+
+ /**
+ * @return Returns the current cluster state.
+ */
+ virtual const lib::ClusterState& getClusterState() const = 0;
+
+ /**
+ * Returns true if the node is currently initializing.
+ */
+ virtual bool initializing() const = 0;
+
+ virtual void handleCompletedMerge(const std::shared_ptr<api::MergeBucketReply>&) = 0;
+
+ virtual BucketDatabase& getBucketDatabase() = 0;
+
+ virtual const BucketDatabase& getBucketDatabase() const = 0;
+
+ virtual const char* getStorageNodeUpStates() const = 0;
+
+ virtual const DistributorConfiguration& getConfig() const = 0;
+
+ virtual ChainedMessageSender& getMessageSender() = 0;
+
+ virtual const BucketGcTimeCalculator::BucketIdHasher& getBucketIdHasher() const = 0;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/distributormessagesender.cpp b/storage/src/vespa/storage/distributor/distributormessagesender.cpp
new file mode 100644
index 00000000000..d7b970baadd
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/distributormessagesender.cpp
@@ -0,0 +1,33 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/distributormessagesender.h>
+#include <vespa/storageapi/messageapi/storagecommand.h>
+
+namespace storage {
+
+namespace distributor {
+
+uint64_t
+DistributorMessageSender::sendToNode(
+ const lib::NodeType& nodeType,
+ uint16_t node,
+ const std::shared_ptr<api::StorageCommand> & cmd,
+ bool useDocumentAPI)
+{
+ cmd->setSourceIndex(getDistributorIndex());
+ cmd->setAddress(api::StorageMessageAddress(
+ getClusterName(),
+ nodeType,
+ node,
+ (useDocumentAPI
+ ? api::StorageMessageAddress::DOCUMENT
+ : api::StorageMessageAddress::STORAGE)));
+ uint64_t msgId = cmd->getMsgId();
+ sendCommand(cmd);
+ return msgId;
+}
+
+
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/distributormessagesender.h b/storage/src/vespa/storage/distributor/distributormessagesender.h
new file mode 100644
index 00000000000..5e916f9b650
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/distributormessagesender.h
@@ -0,0 +1,37 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <stdint.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/storage/common/messagesender.h>
+
+namespace storage {
+
+namespace distributor {
+
+class PendingMessageTracker;
+
+class DistributorMessageSender : public MessageSender {
+public:
+ /**
+ Sends the storage command to the given node,
+ returns message id.
+ */
+ virtual uint64_t sendToNode(const lib::NodeType& nodeType,
+ uint16_t node,
+ const std::shared_ptr<api::StorageCommand>& cmd,
+ bool useDocumentAPI = false);
+
+ virtual int getDistributorIndex() const = 0;
+
+ virtual const std::string& getClusterName() const = 0;
+
+ virtual const PendingMessageTracker& getPendingMessageTracker() const = 0;
+};
+
+} // distributor
+
+} // storage
+
+
+
diff --git a/storage/src/vespa/storage/distributor/distributormetricsset.h b/storage/src/vespa/storage/distributor/distributormetricsset.h
new file mode 100644
index 00000000000..aa4cec9e67d
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/distributormetricsset.h
@@ -0,0 +1,145 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/metrics/metrics.h>
+#include <vespa/documentapi/loadtypes/loadtypeset.h>
+
+namespace storage {
+
+class PersistenceFailuresMetricSet : public metrics::MetricSet
+{
+public:
+ PersistenceFailuresMetricSet(metrics::MetricSet* owner)
+ : metrics::MetricSet("failures", "", "Detailed failure statistics", owner),
+ sum("total", "logdefault yamasdefault", "Sum of all failures", this),
+ notready("notready", "", "The number of operations discarded because distributor was not ready", this),
+ notconnected("notconnected", "", "The number of operations discarded because there were no available storage nodes to send to", this),
+ wrongdistributor("wrongdistributor", "", "The number of operations discarded because they were sent to the wrong distributor", this),
+ storagefailure("storagefailure", "", "The number of operations that failed in storage", this),
+ timeout("timeout", "", "The number of operations that failed because the operation timed out towards storage", this),
+ busy("busy", "", "The number of messages from storage that failed because the storage node was busy", this),
+ notfound("notfound", "", "The number of operations that failed because the document did not exist", this)
+ {
+ sum.addMetricToSum(notready);
+ sum.addMetricToSum(notconnected);
+ sum.addMetricToSum(wrongdistributor);
+ sum.addMetricToSum(storagefailure);
+ sum.addMetricToSum(timeout);
+ sum.addMetricToSum(busy);
+ sum.addMetricToSum(notfound);
+ }
+
+ metrics::SumMetric<metrics::LongCountMetric> sum;
+ metrics::LongCountMetric notready;
+ metrics::LongCountMetric notconnected;
+ metrics::LongCountMetric wrongdistributor;
+ metrics::LongCountMetric storagefailure;
+ metrics::LongCountMetric timeout;
+ metrics::LongCountMetric busy;
+ metrics::LongCountMetric notfound;
+
+ virtual Metric* clone(std::vector<Metric::LP>& ownerList,
+ CopyType copyType,
+ metrics::MetricSet* owner,
+ bool includeUnused) const
+ {
+ if (copyType == INACTIVE) {
+ return MetricSet::clone(ownerList, INACTIVE, owner, includeUnused);
+ }
+ return (PersistenceFailuresMetricSet*)
+ (new PersistenceFailuresMetricSet(owner))->assignValues(*this);
+ }
+ PersistenceFailuresMetricSet* operator&() { return this; }
+};
+
+class PersistenceOperationMetricSet : public metrics::MetricSet
+{
+public:
+ metrics::DoubleAverageMetric latency;
+ metrics::LongCountMetric ok;
+ PersistenceFailuresMetricSet failures;
+
+ PersistenceOperationMetricSet(const std::string& name,
+ metrics::MetricSet* owner = 0)
+ : metrics::MetricSet(name,
+ "",
+ vespalib::make_string("Statistics for the %s command",
+ name.c_str()),
+ owner,
+ "operationtype"),
+ latency("latency",
+ "yamasdefault",
+ vespalib::make_string("The average latency of %s operations",
+ name.c_str()),
+ this),
+ ok("ok",
+ "logdefault yamasdefault",
+ vespalib::make_string("The number of successful %s operations performed",
+ name.c_str()),
+ this),
+ failures(this)
+ {
+ }
+
+ virtual Metric* clone(std::vector<Metric::LP>& ownerList,
+ CopyType copyType,
+ metrics::MetricSet* owner,
+ bool includeUnused) const
+ {
+ if (copyType == INACTIVE) {
+ return MetricSet::clone(ownerList, INACTIVE, owner, includeUnused);
+ }
+ return (PersistenceOperationMetricSet*)
+ (new PersistenceOperationMetricSet(getName(), owner))
+ ->assignValues(*this);
+ }
+ PersistenceOperationMetricSet* operator&() { return this; }
+
+
+};
+
+class DistributorMetricSet : public metrics::MetricSet
+{
+public:
+ metrics::LoadMetric<PersistenceOperationMetricSet> puts;
+ metrics::LoadMetric<PersistenceOperationMetricSet> updates;
+ metrics::LoadMetric<PersistenceOperationMetricSet> update_puts;
+ metrics::LoadMetric<PersistenceOperationMetricSet> update_gets;
+ metrics::LoadMetric<PersistenceOperationMetricSet> removes;
+ metrics::LoadMetric<PersistenceOperationMetricSet> removelocations;
+ metrics::LoadMetric<PersistenceOperationMetricSet> gets;
+ metrics::LoadMetric<PersistenceOperationMetricSet> stats;
+ metrics::LoadMetric<PersistenceOperationMetricSet> multioperations;
+ metrics::DoubleAverageMetric recoveryModeTime;
+ metrics::LongValueMetric docsStored;
+ metrics::LongValueMetric bytesStored;
+
+ DistributorMetricSet(const metrics::LoadTypeSet& lt)
+ : metrics::MetricSet("distributor", "distributor", ""),
+ puts(lt, *&PersistenceOperationMetricSet("puts"), this),
+ updates(lt, *&PersistenceOperationMetricSet("updates"), this),
+ update_puts(lt, *&PersistenceOperationMetricSet("update_puts"), this),
+ update_gets(lt, *&PersistenceOperationMetricSet("update_gets"), this),
+ removes(lt, *&PersistenceOperationMetricSet("removes"), this),
+ removelocations(lt, *&PersistenceOperationMetricSet("removelocations"), this),
+ gets(lt, *&PersistenceOperationMetricSet("gets"), this),
+ stats(lt, *&PersistenceOperationMetricSet("stats"), this),
+ multioperations(lt, *&PersistenceOperationMetricSet("multioperations"), this),
+ recoveryModeTime("recoverymodeschedulingtime", "",
+ "Time spent scheduling operations in recovery mode "
+ "after receiving new cluster state", this),
+ docsStored("docsstored", "logdefault yamasdefault",
+ "Number of documents stored in all buckets controlled by "
+ "this distributor", this),
+ bytesStored("bytesstored", "logdefault yamasdefault",
+ "Number of bytes stored in all buckets controlled by "
+ "this distributor", this)
+ {
+ docsStored.logOnlyIfSet();
+ bytesStored.logOnlyIfSet();
+ }
+
+};
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/externaloperationhandler.cpp b/storage/src/vespa/storage/distributor/externaloperationhandler.cpp
new file mode 100644
index 00000000000..e3ea4ae5779
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/externaloperationhandler.cpp
@@ -0,0 +1,227 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/externaloperationhandler.h>
+
+#include <vespa/document/bucket/bucketidfactory.h>
+#include <vespa/document/base/documentid.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/config/config-stor-server.h>
+#include <vespa/storage/storageutil/log.h>
+#include <vespa/storage/distributor/operations/external/putoperation.h>
+#include <vespa/storage/distributor/operations/external/twophaseupdateoperation.h>
+#include <vespa/storage/distributor/operations/external/updateoperation.h>
+#include <vespa/storage/distributor/operations/external/removeoperation.h>
+#include <vespa/storage/distributor/operations/external/getoperation.h>
+#include <vespa/storage/distributor/operations/external/multioperationoperation.h>
+#include <vespa/storage/distributor/operations/external/statbucketoperation.h>
+#include <vespa/storage/distributor/operations/external/statbucketlistoperation.h>
+#include <vespa/storage/distributor/operations/external/removelocationoperation.h>
+#include <vespa/storage/distributor/operations/external/visitoroperation.h>
+#include <vespa/document/util/stringutil.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <vespa/storageapi/message/removelocation.h>
+#include <vespa/storageapi/message/batch.h>
+#include <vespa/storage/storageserver/storagemetricsset.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/stat.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/storage/distributor/distributor.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+
+LOG_SETUP(".distributor.manager");
+
+namespace storage {
+namespace distributor {
+
+ExternalOperationHandler::ExternalOperationHandler(
+ Distributor& owner,
+ const MaintenanceOperationGenerator& gen,
+ DistributorComponentRegister& compReg)
+ : DistributorComponent(owner, compReg, "Distributor manager"),
+ _visitorMetrics(getLoadTypes()->getMetricLoadTypes(),
+ *&VisitorMetricSet(NULL)),
+ _operationGenerator(gen)
+{
+}
+
+ExternalOperationHandler::~ExternalOperationHandler()
+{
+}
+
+bool
+ExternalOperationHandler::handleMessage(
+ const std::shared_ptr<api::StorageMessage>& msg,
+ Operation::SP& op)
+{
+ _op = Operation::SP();
+ bool retVal = msg->callHandler(*this, msg);
+ op = _op;
+ return retVal;
+}
+
+IMPL_MSG_COMMAND_H(ExternalOperationHandler, Put)
+{
+ if (!checkDistribution(*cmd, getBucketId(cmd->getDocumentId()))) {
+ LOG(debug,
+ "Distributor manager received put for %s, bucket %s with wrong "
+ "distribution",
+ cmd->getDocumentId().toString().c_str(),
+ getBucketId(cmd->getDocumentId()).toString().c_str());
+
+ getMetrics().puts[cmd->getLoadType()].failures.wrongdistributor++;
+ return true;
+ }
+
+ if (cmd->getTimestamp() == 0) {
+ cmd->setTimestamp(getUniqueTimestamp());
+ }
+
+ _op = Operation::SP(new PutOperation(*this,
+ cmd,
+ getMetrics().puts[cmd->getLoadType()]));
+
+ return true;
+}
+
+
+IMPL_MSG_COMMAND_H(ExternalOperationHandler, Update)
+{
+ if (!checkDistribution(*cmd, getBucketId(cmd->getDocumentId()))) {
+ LOG(debug, "Distributor manager received update for %s, bucket %s with wrong distribution", cmd->getDocumentId().toString().c_str(), getBucketId(cmd->getDocumentId()).toString().c_str());
+
+ getMetrics().updates[cmd->getLoadType()].failures.wrongdistributor++;
+ return true;
+ }
+
+ if (cmd->getTimestamp() == 0) {
+ cmd->setTimestamp(getUniqueTimestamp());
+ }
+ _op = Operation::SP(new TwoPhaseUpdateOperation(*this, cmd, getMetrics()));
+ return true;
+}
+
+
+IMPL_MSG_COMMAND_H(ExternalOperationHandler, Remove)
+{
+ if (!checkDistribution(*cmd, getBucketId(cmd->getDocumentId()))) {
+ LOG(debug,
+ "Distributor manager received remove for %s, bucket %s with "
+ "wrong distribution",
+ cmd->getDocumentId().toString().c_str(),
+ getBucketId(cmd->getDocumentId()).toString().c_str());
+
+ getMetrics().removes[cmd->getLoadType()].failures.wrongdistributor++;
+ return true;
+ }
+
+ if (cmd->getTimestamp() == 0) {
+ cmd->setTimestamp(getUniqueTimestamp());
+ }
+ _op = Operation::SP(new RemoveOperation(
+ *this,
+ cmd,
+ getMetrics().removes[cmd->getLoadType()]));
+ return true;
+}
+
+IMPL_MSG_COMMAND_H(ExternalOperationHandler, RemoveLocation)
+{
+ document::BucketId bid;
+ RemoveLocationOperation::getBucketId(*this, *cmd, bid);
+
+ if (!checkDistribution(*cmd, bid)) {
+ LOG(debug,
+ "Distributor manager received %s with wrong distribution",
+ cmd->toString().c_str());
+
+ getMetrics().removelocations[cmd->getLoadType()].
+ failures.wrongdistributor++;
+ return true;
+ }
+
+ _op = Operation::SP(new RemoveLocationOperation(
+ *this,
+ cmd,
+ getMetrics().removelocations[cmd->getLoadType()]));
+ return true;
+}
+
+IMPL_MSG_COMMAND_H(ExternalOperationHandler, Get)
+{
+ if (!checkDistribution(*cmd, getBucketId(cmd->getDocumentId()))) {
+ LOG(debug,
+ "Distributor manager received get for %s, "
+ "bucket %s with wrong distribution",
+ cmd->getDocumentId().toString().c_str(),
+ getBucketId(cmd->getDocumentId()).toString().c_str());
+
+ getMetrics().gets[cmd->getLoadType()].failures.wrongdistributor++;
+ return true;
+ }
+
+ _op = Operation::SP(new GetOperation(
+ *this,
+ cmd,
+ getMetrics().gets[cmd->getLoadType()]));
+ return true;
+}
+
+IMPL_MSG_COMMAND_H(ExternalOperationHandler, MultiOperation)
+{
+ if (!checkDistribution(*cmd, cmd->getBucketId())) {
+ LOG(debug,
+ "Distributor manager received multi-operation message, "
+ "bucket %s with wrong distribution",
+ cmd->getBucketId().toString().c_str());
+ return true;
+ }
+
+ _op = Operation::SP(new MultiOperationOperation(
+ *this,
+ cmd,
+ getMetrics().multioperations[cmd->getLoadType()]));
+ return true;
+}
+
+IMPL_MSG_COMMAND_H(ExternalOperationHandler, StatBucket)
+{
+ if (!checkDistribution(*cmd, cmd->getBucketId())) {
+ return true;
+ }
+
+ _op = Operation::SP(new StatBucketOperation(*this, cmd));
+ return true;
+}
+
+IMPL_MSG_COMMAND_H(ExternalOperationHandler, GetBucketList)
+{
+ if (!checkDistribution(*cmd, cmd->getBucketId())) {
+ return true;
+ }
+ _op = Operation::SP(new StatBucketListOperation(
+ getBucketDatabase(), _operationGenerator, getIndex(), cmd));
+ return true;
+}
+
+IMPL_MSG_COMMAND_H(ExternalOperationHandler, CreateVisitor)
+{
+ const DistributorConfiguration& config(getDistributor().getConfig());
+ VisitorOperation::Config visitorConfig(
+ framework::MilliSecTime(config.getMinTimeLeftToResend()),
+ config.getMinBucketsPerVisitor(),
+ config.getMaxVisitorsPerNodePerClientVisitor());
+ _op = Operation::SP(new VisitorOperation(
+ *this,
+ cmd,
+ visitorConfig,
+ &_visitorMetrics[cmd->getLoadType()]));
+ return true;
+}
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/externaloperationhandler.h b/storage/src/vespa/storage/distributor/externaloperationhandler.h
new file mode 100644
index 00000000000..f8c88c31604
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/externaloperationhandler.h
@@ -0,0 +1,56 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/document/bucket/bucketidfactory.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/distributorcomponent.h>
+#include <vespa/storage/distributor/visitormetricsset.h>
+#include <vespa/storageapi/messageapi/messagehandler.h>
+#include <vespa/storageframework/storageframework.h>
+
+namespace storage {
+
+class DistributorMetricSet;
+
+namespace distributor {
+
+class Distributor;
+class MaintenanceOperationGenerator;
+
+class ExternalOperationHandler : public DistributorComponent,
+ public api::MessageHandler
+{
+public:
+ DEF_MSG_COMMAND_H(Get);
+ DEF_MSG_COMMAND_H(Put);
+ DEF_MSG_COMMAND_H(Update);
+ DEF_MSG_COMMAND_H(Remove);
+ DEF_MSG_COMMAND_H(RemoveLocation);
+ DEF_MSG_COMMAND_H(MultiOperation);
+ DEF_MSG_COMMAND_H(StatBucket);
+ DEF_MSG_COMMAND_H(CreateVisitor);
+ DEF_MSG_COMMAND_H(GetBucketList);
+
+ ExternalOperationHandler(Distributor& owner,
+ const MaintenanceOperationGenerator&,
+ DistributorComponentRegister& compReg);
+
+ ~ExternalOperationHandler();
+
+ bool handleMessage(const std::shared_ptr<api::StorageMessage>& msg,
+ Operation::SP& operation);
+
+private:
+ metrics::LoadMetric<VisitorMetricSet> _visitorMetrics;
+ const MaintenanceOperationGenerator& _operationGenerator;
+ Operation::SP _op;
+
+ DistributorMetricSet& getMetrics() { return getDistributor().getMetrics(); }
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/idealstatemanager.cpp b/storage/src/vespa/storage/distributor/idealstatemanager.cpp
new file mode 100644
index 00000000000..0b8c7abac23
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/idealstatemanager.cpp
@@ -0,0 +1,277 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storage/distributor/operations/idealstate/idealstateoperation.h>
+#include <vespa/log/log.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+#include <vespa/storage/storageserver/storagemetricsset.h>
+#include <vespa/storageapi/message/internal.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storage/distributor/statecheckers.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storageapi/message/stat.h>
+#include <vespa/storage/distributor/distributor.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storage/distributor/idealstatemetricsset.h>
+
+LOG_SETUP(".distributor.operation.queue");
+
+using storage::lib::Node;
+using storage::lib::NodeType;
+
+namespace storage {
+namespace distributor {
+
+IdealStateManager::IdealStateManager(
+ Distributor& owner,
+ DistributorComponentRegister& compReg,
+ bool manageActiveBucketCopies)
+ : HtmlStatusReporter("idealstateman", "Ideal state manager"),
+ _metrics(new IdealStateMetricSet),
+ _distributorComponent(owner, compReg, "Ideal state manager")
+{
+ _distributorComponent.registerStatusPage(*this);
+ _distributorComponent.registerMetric(*_metrics);
+
+ if (manageActiveBucketCopies) {
+ LOG(debug, "Adding BucketStateStateChecker to state checkers");
+ _stateCheckers.push_back(
+ StateChecker::SP(new BucketStateStateChecker()));
+ }
+
+ _splitBucketStateChecker = new SplitBucketStateChecker();
+ _stateCheckers.push_back(StateChecker::SP(_splitBucketStateChecker));
+ _stateCheckers.push_back(StateChecker::SP(new SplitInconsistentStateChecker()));
+ _stateCheckers.push_back(StateChecker::SP(new SynchronizeAndMoveStateChecker()));
+ _stateCheckers.push_back(StateChecker::SP(new JoinBucketsStateChecker()));
+ _stateCheckers.push_back(StateChecker::SP(new DeleteExtraCopiesStateChecker()));
+ _stateCheckers.push_back(StateChecker::SP(new GarbageCollectionStateChecker()));
+}
+
+IdealStateManager::~IdealStateManager()
+{
+}
+
+void
+IdealStateManager::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "IdealStateManager";
+}
+
+bool
+IdealStateManager::iAmUp() const
+{
+ Node node(NodeType::DISTRIBUTOR, _distributorComponent.getIndex());
+ const lib::State &nodeState = _distributorComponent.getClusterState()
+ .getNodeState(node).getState();
+ const lib::State &clusterState = _distributorComponent.getClusterState().getClusterState();
+
+ return (nodeState == lib::State::UP && clusterState == lib::State::UP);
+}
+
+void
+IdealStateManager::fillParentAndChildBuckets(StateChecker::Context& c) const
+{
+ _distributorComponent.getBucketDatabase().getAll(c.bucketId, c.entries);
+ if (c.entries.empty()) {
+ LOG(spam,
+ "Did not find bucket %s in bucket database",
+ c.bucketId.toString().c_str());
+ }
+}
+void
+IdealStateManager::fillSiblingBucket(StateChecker::Context& c) const
+{
+ c.siblingEntry = _distributorComponent.getBucketDatabase().get(c.siblingBucket);
+}
+
+BucketDatabase::Entry*
+IdealStateManager::getEntryForPrimaryBucket(StateChecker::Context& c) const
+{
+ for (uint32_t j = 0; j < c.entries.size(); ++j) {
+ BucketDatabase::Entry& e = c.entries[j];
+ if (e.getBucketId() == c.bucketId) {
+ return &e;
+ }
+ }
+ return 0;
+}
+
+namespace {
+
+/*
+ * Since state checkers are in prioritized order, don't allow
+ * overwriting if already explicitly set.
+ */
+bool
+canOverwriteResult(const StateChecker::Result& existing,
+ const StateChecker::Result& candidate)
+{
+ return (!existing.getPriority().requiresMaintenance()
+ && candidate.getPriority().requiresMaintenance());
+}
+
+}
+
+StateChecker::Result
+IdealStateManager::runStateCheckers(StateChecker::Context& c) const
+{
+ auto highestPri = StateChecker::Result::noMaintenanceNeeded();
+ // We go through _all_ active state checkers so that statistics can be
+ // collected across all checkers, not just the ones that are highest pri.
+ for (uint32_t i = 0; i < _stateCheckers.size(); i++) {
+ if (!_distributorComponent.getDistributor().getConfig().stateCheckerIsActive(
+ _stateCheckers[i]->getName()))
+ {
+ LOG(spam, "Skipping state checker %s",
+ _stateCheckers[i]->getName());
+ continue;
+ }
+
+ auto result = _stateCheckers[i]->check(c);
+ if (canOverwriteResult(highestPri, result)) {
+ highestPri = std::move(result);
+ }
+ }
+ return highestPri;
+}
+
+StateChecker::Result
+IdealStateManager::generateHighestPriority(
+ const document::BucketId& bid,
+ NodeMaintenanceStatsTracker& statsTracker) const
+{
+ StateChecker::Context c(_distributorComponent, statsTracker, bid);
+ fillParentAndChildBuckets(c);
+ fillSiblingBucket(c);
+
+ BucketDatabase::Entry* e(getEntryForPrimaryBucket(c));
+ if (!e) {
+ return StateChecker::Result::noMaintenanceNeeded();
+ }
+ LOG(spam, "Checking bucket %s", e->toString().c_str());
+
+ c.entry = *e;
+ return runStateCheckers(c);
+}
+
+MaintenancePriorityAndType
+IdealStateManager::prioritize(
+ const document::BucketId& bucketId,
+ NodeMaintenanceStatsTracker& statsTracker) const
+{
+ StateChecker::Result generated(
+ generateHighestPriority(bucketId, statsTracker));
+ MaintenancePriority priority(generated.getPriority());
+ MaintenanceOperation::Type type(priority.requiresMaintenance()
+ ? generated.getType()
+ : MaintenanceOperation::OPERATION_COUNT);
+ return MaintenancePriorityAndType(priority, type);
+}
+
+IdealStateOperation::SP
+IdealStateManager::generateInterceptingSplit(const BucketDatabase::Entry& e,
+ api::StorageMessage::Priority pri)
+{
+ NodeMaintenanceStatsTracker statsTracker;
+ StateChecker::Context c(_distributorComponent, statsTracker, e.getBucketId());
+ if (e.valid()) {
+ c.entry = e;
+
+ IdealStateOperation::UP operation(
+ _splitBucketStateChecker->check(c).createOperation());
+ if (operation.get()) {
+ operation->setPriority(pri);
+ operation->setIdealStateManager(this);
+ }
+
+ return IdealStateOperation::SP(operation.release());
+ }
+
+ return IdealStateOperation::SP();
+}
+
+MaintenanceOperation::SP
+IdealStateManager::generate(const document::BucketId& bucketId) const
+{
+ NodeMaintenanceStatsTracker statsTracker;
+ IdealStateOperation::SP op(
+ generateHighestPriority(bucketId, statsTracker).createOperation());
+ if (op.get()) {
+ op->setIdealStateManager(
+ const_cast<IdealStateManager*>(this));
+ }
+ return op;
+}
+
+std::vector<MaintenanceOperation::SP>
+IdealStateManager::generateAll(const document::BucketId& bucketId,
+ NodeMaintenanceStatsTracker& statsTracker) const
+{
+ StateChecker::Context c(_distributorComponent, statsTracker, bucketId);
+ fillParentAndChildBuckets(c);
+ fillSiblingBucket(c);
+ BucketDatabase::Entry* e(getEntryForPrimaryBucket(c));
+ std::vector<MaintenanceOperation::SP> operations;
+ if (e) {
+ c.entry = *e;
+ } else {
+ return operations;
+ }
+
+ for (uint32_t i = 0; i < _stateCheckers.size(); i++) {
+ IdealStateOperation::UP op(
+ _stateCheckers[i]->check(c).createOperation());
+ if (op.get()) {
+ operations.push_back(IdealStateOperation::SP(op.release()));
+ }
+ }
+ return operations;
+}
+
+void
+IdealStateManager::getBucketStatus(
+ const BucketDatabase::Entry& entry,
+ NodeMaintenanceStatsTracker& statsTracker,
+ std::ostream& out) const
+{
+ LOG(debug, "Dumping bucket database valid at cluster state version %u",
+ _distributorComponent.getDistributor().getClusterState().getVersion());
+
+ std::vector<MaintenanceOperation::SP> operations(
+ generateAll(entry.getBucketId(), statsTracker));
+ if (operations.empty()) {
+ out << entry.getBucketId() << " : ";
+ } else {
+ out << "<b>" << entry.getBucketId() << ":</b> <i> : ";
+ }
+ for (uint32_t i = 0; i < operations.size(); ++i) {
+ const MaintenanceOperation& op(*operations[i]);
+ if (i > 0) {
+ out << ", ";
+ }
+ out << op.getName() << ": " << op.getDetailedReason();
+ }
+ if (!operations.empty()) {
+ out << "</i> ";
+ }
+ out << "[" << entry->toString() << "]<br>\n";
+}
+
+void
+IdealStateManager::getBucketStatus(std::ostream& out) const
+{
+ StatusBucketVisitor proc(*this, out);
+ _distributorComponent.getBucketDatabase().forEach(proc);
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/idealstatemanager.h b/storage/src/vespa/storage/distributor/idealstatemanager.h
new file mode 100644
index 00000000000..7dba4102466
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/idealstatemanager.h
@@ -0,0 +1,147 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <deque>
+#include <map>
+#include <set>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/distributorcomponent.h>
+#include <vespa/storage/distributor/statechecker.h>
+#include <vespa/storage/distributor/maintenance/maintenanceprioritygenerator.h>
+#include <vespa/storage/distributor/maintenance/maintenanceoperationgenerator.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vector>
+
+namespace storage {
+namespace distributor {
+
+class IdealStateMetricSet;
+class IdealStateOperation;
+class Distributor;
+class SplitBucketStateChecker;
+
+/**
+ @class IdealStateManager
+
+ This storage link is responsible for generating maintenance operations to
+ be performed on the storage nodes.
+
+ To generate operation objects, we have a set of StateCheckers. A
+ StateChecker takes a bucket and configuration information, and checks for a
+ certain property on the bucket. If that property is not according to the
+ configuration, it makes an Operation to correct the problem. The
+ StateCheckers are run in sequence for each bucket, and only one StateChecker
+ may generate Operations. Once one does so, the rest of the state checkers
+ aren't run.
+*/
+class IdealStateManager : public framework::HtmlStatusReporter,
+ public MaintenancePriorityGenerator,
+ public MaintenanceOperationGenerator
+{
+public:
+
+ IdealStateManager(Distributor& owner,
+ DistributorComponentRegister& compReg,
+ bool manageActiveBucketCopies);
+
+ ~IdealStateManager();
+
+ void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ // MaintenancePriorityGenerator interface
+ MaintenancePriorityAndType prioritize(
+ const document::BucketId& bucketId,
+ NodeMaintenanceStatsTracker& statsTracker) const override;
+
+ // MaintenanceOperationGenerator
+ MaintenanceOperation::SP generate(
+ const document::BucketId& bucketId) const override;
+
+ // MaintenanceOperationGenerator
+ std::vector<MaintenanceOperation::SP> generateAll(
+ const document::BucketId& bucketId,
+ NodeMaintenanceStatsTracker& statsTracker) const override;
+
+ /**
+ * If the given bucket is too large, generate a split operation for it,
+ * with higher priority than the given one.
+ */
+ IdealStateOperation::SP generateInterceptingSplit(
+ const BucketDatabase::Entry& e,
+ api::StorageMessage::Priority pri);
+
+ IdealStateMetricSet& getMetrics() { return *_metrics; }
+
+ void getBucketStatus(std::ostream& out) const;
+
+ // HtmlStatusReporter
+ void reportHtmlStatus(
+ std::ostream& out, const framework::HttpUrlPath&) const override {
+ getBucketStatus(out);
+ }
+
+ DistributorComponent& getDistributorComponent() {
+ return _distributorComponent; }
+ StorageComponent::LoadTypeSetSP getLoadTypes() {
+ return _distributorComponent.getLoadTypes(); }
+
+private:
+ void fillParentAndChildBuckets(StateChecker::Context& c) const;
+ void fillSiblingBucket(StateChecker::Context& c) const;
+ StateChecker::Result generateHighestPriority(
+ const document::BucketId& bucketId,
+ NodeMaintenanceStatsTracker& statsTracker) const;
+ StateChecker::Result runStateCheckers(StateChecker::Context& c) const;
+
+ BucketDatabase::Entry* getEntryForPrimaryBucket(StateChecker::Context& c) const;
+
+ friend class Operation_TestCase;
+ friend class RemoveBucketOperation_Test;
+ friend class MergeOperation_Test;
+ friend class CreateBucketOperation_Test;
+ friend class SplitOperation_Test;
+ friend class JoinOperation_Test;
+
+ std::shared_ptr<IdealStateMetricSet> _metrics;
+ document::BucketId _lastPrioritizedBucket;
+
+ // Prioritized of state checkers that generate operations
+ // for idealstatemanager.
+ std::vector<StateChecker::SP> _stateCheckers;
+ SplitBucketStateChecker* _splitBucketStateChecker;
+
+ DistributorComponent _distributorComponent;
+
+ std::vector<IdealStateOperation::SP> generateOperationsForBucket(
+ StateChecker::Context& c) const;
+
+ bool iAmUp() const;
+
+ class StatusBucketVisitor : public BucketDatabase::EntryProcessor {
+ // Stats tracker to use for all generateAll() calls to avoid having
+ // to create a new hash map for each single bucket processed.
+ NodeMaintenanceStatsTracker _statsTracker;
+ const IdealStateManager& _ism;
+ std::ostream& _out;
+ public:
+ StatusBucketVisitor(const IdealStateManager& ism, std::ostream& out)
+ : _ism(ism), _out(out) {}
+
+ bool process(const BucketDatabase::Entry& e) {
+ _ism.getBucketStatus(e, _statsTracker, _out);
+ return true;
+ }
+ };
+ friend class StatusBucketVisitor;
+
+ void getBucketStatus(const BucketDatabase::Entry& entry,
+ NodeMaintenanceStatsTracker& statsTracker,
+ std::ostream& out) const;
+
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/idealstatemetricsset.h b/storage/src/vespa/storage/distributor/idealstatemetricsset.h
new file mode 100644
index 00000000000..85d19e18d3e
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/idealstatemetricsset.h
@@ -0,0 +1,118 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/metrics/metrics.h>
+#include <vespa/storage/distributor/operations/idealstate/idealstateoperation.h>
+
+namespace storage {
+
+namespace distributor {
+
+class OperationMetricSet : public metrics::MetricSet
+{
+public:
+ metrics::LongValueMetric pending;
+ metrics::LongCountMetric ok;
+ metrics::LongCountMetric failed;
+
+ OperationMetricSet(const std::string& name, const std::string& tags, const std::string& description, metrics::MetricSet* owner)
+ : metrics::MetricSet(name, tags, description, owner),
+ pending("pending",
+ "logdefault yamasdefault",
+ "The number of operations pending", this),
+ ok("done_ok",
+ "logdefault yamasdefault",
+ "The number of operations successfully performed", this),
+ failed("done_failed",
+ "logdefault yamasdefault",
+ "The number of operations that failed", this)
+ {
+ }
+
+};
+
+class IdealStateMetricSet : public metrics::MetricSet
+{
+public:
+ std::vector<std::shared_ptr<OperationMetricSet> > operations;
+ metrics::LongValueMetric idealstate_diff;
+ metrics::LongValueMetric buckets_toofewcopies;
+ metrics::LongValueMetric buckets_toomanycopies;
+ metrics::LongValueMetric buckets;
+ metrics::LongValueMetric buckets_notrusted;
+ metrics::LongValueMetric buckets_rechecking;
+ metrics::LongAverageMetric startOperationsLatency;
+ metrics::DoubleAverageMetric nodesPerMerge;
+
+ void createOperationMetrics() {
+ typedef IdealStateOperation ISO;
+ operations.resize(ISO::OPERATION_COUNT);
+ operations[ISO::DELETE_BUCKET] = std::shared_ptr<OperationMetricSet>(
+ new OperationMetricSet("delete_bucket", "logdefault yamasdefault",
+ "Operations to delete excess buckets on storage nodes", this));
+ operations[ISO::MERGE_BUCKET] = std::shared_ptr<OperationMetricSet>(
+ new OperationMetricSet("merge_bucket", "logdefault yamasdefault",
+ "Operations to merge buckets that are out of sync", this));
+ operations[ISO::SPLIT_BUCKET] = std::shared_ptr<OperationMetricSet>(
+ new OperationMetricSet("split_bucket", "logdefault yamasdefault",
+ "Operations to split buckets that are larger than the configured size", this));
+ operations[ISO::JOIN_BUCKET] = std::shared_ptr<OperationMetricSet>(
+ new OperationMetricSet("join_bucket", "logdefault yamasdefault",
+ "Operations to join buckets that in sum are smaller than the configured size", this));
+ operations[ISO::SET_BUCKET_STATE] = std::shared_ptr<OperationMetricSet>(
+ new OperationMetricSet("set_bucket_state",
+ "logdefault yamasdefault",
+ "Operations to set active/ready state for bucket copies", this));
+ operations[ISO::GARBAGE_COLLECTION] = std::shared_ptr<OperationMetricSet>(
+ new OperationMetricSet("garbage_collection",
+ "logdefault yamasdefault",
+ "Operations to garbage collect data from buckets", this));
+ }
+
+ IdealStateMetricSet()
+ : metrics::MetricSet("idealstate", "idealstate",
+ "Statistics for ideal state generation"),
+ idealstate_diff("idealstate_diff", "logdefault yamasdefault",
+ "A number representing the current difference from the ideal "
+ "state. This is a number that decreases steadily as the system "
+ "is getting closer to the ideal state", this),
+ buckets_toofewcopies("buckets_toofewcopies", "logdefault yamasdefault",
+ "The number of buckets the distributor controls that have less "
+ "than the desired redundancy", this),
+ buckets_toomanycopies("buckets_toomanycopies", "logdefault yamasdefault",
+ "The number of buckets the distributor controls that have more "
+ "than the desired redundancy", this),
+ buckets("buckets", "logdefault yamasdefault",
+ "The number of buckets the distributor controls", this),
+ buckets_notrusted("buckets_notrusted", "logdefault yamasdefault",
+ "The number of buckets that have no trusted copies.", this),
+ buckets_rechecking("buckets_rechecking", "logdefault yamasdefault",
+ "The number of buckets that we are rechecking for "
+ "ideal state operations", this),
+ startOperationsLatency("start_operations_latency", "",
+ "Time used in startOperations()", this),
+ nodesPerMerge("nodes_per_merge", "",
+ "The number of nodes involved in a single merge operation.",
+ this)
+ {
+ createOperationMetrics();
+ }
+
+ void setPendingOperations(const std::vector<uint64_t>& newMetrics) {
+ for (uint32_t i = 0; i < IdealStateOperation::OPERATION_COUNT; i++) {
+ operations[i]->pending.set(newMetrics[i]);
+ }
+
+ idealstate_diff.set(
+ operations[IdealStateOperation::DELETE_BUCKET]->pending.getLast() +
+ operations[IdealStateOperation::MERGE_BUCKET]->pending.getLast() * 10 +
+ operations[IdealStateOperation::SPLIT_BUCKET]->pending.getLast() * 4 +
+ operations[IdealStateOperation::JOIN_BUCKET]->pending.getLast() * 2 +
+ operations[IdealStateOperation::SET_BUCKET_STATE]->pending.getLast());
+ }
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/latency_statistics_provider.cpp b/storage/src/vespa/storage/distributor/latency_statistics_provider.cpp
new file mode 100644
index 00000000000..ef0807da030
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/latency_statistics_provider.cpp
@@ -0,0 +1,29 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/latency_statistics_provider.h>
+
+namespace storage {
+namespace distributor {
+
+std::ostream&
+operator<<(std::ostream& os, const OperationStats& op)
+{
+ os << "OperationStats("
+ << "totalLatency=" << op.totalLatency.count()
+ << "ms, numRequests=" << op.numRequests
+ << ')';
+ return os;
+}
+
+std::ostream&
+operator<<(std::ostream& os, const NodeStats& stats)
+{
+ os << "NodeStats("
+ << "puts=" << stats.puts
+ << ')';
+ return os;
+}
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/latency_statistics_provider.h b/storage/src/vespa/storage/distributor/latency_statistics_provider.h
new file mode 100644
index 00000000000..32f217e5ce3
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/latency_statistics_provider.h
@@ -0,0 +1,58 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <chrono>
+#include <unordered_map>
+#include <iostream>
+#include <stdint.h>
+
+namespace storage {
+namespace distributor {
+
+struct OperationStats {
+ std::chrono::milliseconds totalLatency;
+ uint64_t numRequests;
+
+ OperationStats()
+ : totalLatency(0), numRequests(0)
+ {
+ }
+};
+
+struct NodeStats {
+ OperationStats puts;
+};
+
+std::ostream&
+operator<<(std::ostream&, const OperationStats&);
+
+std::ostream&
+operator<<(std::ostream&, const NodeStats&);
+
+struct NodeStatsSnapshot
+{
+ std::unordered_map<uint16_t, NodeStats> nodeToStats;
+};
+
+class LatencyStatisticsProvider
+{
+public:
+ virtual ~LatencyStatisticsProvider() {}
+
+ /**
+ * Get a snapshot representation of the latency statistics towards a set of
+ * nodes at the point of the call.
+ *
+ * Can be called at any time after registration from another thread context
+ * and the call must thus be thread safe and data race free.
+ */
+ NodeStatsSnapshot getLatencyStatistics() const {
+ return doGetLatencyStatistics();
+ }
+
+private:
+ virtual NodeStatsSnapshot doGetLatencyStatistics() const = 0;
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/maintenance/CMakeLists.txt b/storage/src/vespa/storage/distributor/maintenance/CMakeLists.txt
new file mode 100644
index 00000000000..6dc7e4ca5fd
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_distributormaintenance OBJECT
+ SOURCES
+ simplebucketprioritydatabase.cpp
+ simplemaintenancescanner.cpp
+ prioritizedbucket.cpp
+ maintenancescheduler.cpp
+ node_maintenance_stats_tracker.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/distributor/maintenance/bucketprioritydatabase.h b/storage/src/vespa/storage/distributor/maintenance/bucketprioritydatabase.h
new file mode 100644
index 00000000000..3a4a278543f
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/bucketprioritydatabase.h
@@ -0,0 +1,74 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/storage/distributor/maintenance/prioritizedbucket.h>
+#include <boost/iterator/iterator_facade.hpp>
+#include <vespa/vespalib/util/linkedptr.h>
+
+namespace storage {
+namespace distributor {
+
+class BucketPriorityDatabase
+{
+protected:
+ class ConstIteratorImpl
+ {
+ public:
+ virtual ~ConstIteratorImpl() { }
+ virtual void increment() = 0;
+
+ virtual bool equal(const ConstIteratorImpl& other) const = 0;
+
+ virtual PrioritizedBucket dereference() const = 0;
+ };
+
+ typedef vespalib::LinkedPtr<ConstIteratorImpl> ConstIteratorImplPtr;
+public:
+ class ConstIterator
+ : public boost::iterator_facade<
+ ConstIterator,
+ PrioritizedBucket const,
+ boost::forward_traversal_tag,
+ PrioritizedBucket
+ >
+ {
+ ConstIteratorImplPtr _impl;
+ public:
+ ConstIterator(const ConstIteratorImplPtr& impl)
+ : _impl(impl)
+ {}
+
+ virtual ~ConstIterator() {}
+ private:
+ friend class boost::iterator_core_access;
+
+ void increment() {
+ _impl->increment();
+ }
+
+ bool equal(const ConstIterator& other) const {
+ return _impl->equal(*other._impl);
+ }
+
+ PrioritizedBucket dereference() const {
+ return _impl->dereference();
+ }
+ };
+
+ typedef ConstIterator const_iterator;
+
+ virtual ~BucketPriorityDatabase() { }
+
+ virtual const_iterator begin() const = 0;
+
+ virtual const_iterator end() const = 0;
+
+ virtual void setPriority(const PrioritizedBucket&) = 0;
+};
+
+}
+}
+
+
+
diff --git a/storage/src/vespa/storage/distributor/maintenance/maintenanceoperation.h b/storage/src/vespa/storage/distributor/maintenance/maintenanceoperation.h
new file mode 100644
index 00000000000..a196915798e
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/maintenanceoperation.h
@@ -0,0 +1,28 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/operations/operation.h>
+
+namespace storage {
+namespace distributor {
+
+class MaintenanceOperation : public Operation
+{
+public:
+ typedef enum {
+ DELETE_BUCKET,
+ MERGE_BUCKET,
+ SPLIT_BUCKET,
+ JOIN_BUCKET,
+ SET_BUCKET_STATE,
+ GARBAGE_COLLECTION,
+ OPERATION_COUNT
+ } Type;
+
+ typedef std::shared_ptr<MaintenanceOperation> SP;
+
+ virtual const std::string& getDetailedReason() const = 0;
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/maintenance/maintenanceoperationgenerator.h b/storage/src/vespa/storage/distributor/maintenance/maintenanceoperationgenerator.h
new file mode 100644
index 00000000000..07897268995
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/maintenanceoperationgenerator.h
@@ -0,0 +1,46 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vector>
+#include <vespa/storage/distributor/maintenance/maintenanceoperation.h>
+#include <vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h>
+
+namespace storage {
+namespace distributor {
+
+class MaintenanceOperationGenerator
+{
+public:
+ virtual ~MaintenanceOperationGenerator() {}
+
+ /**
+ * Generate and return the highest prioritized maintenance operation for
+ * the given bucket. If the bucket does not need maintenance, a nullptr
+ * shared_ptr is returned.
+ */
+ virtual MaintenanceOperation::SP generate(
+ const document::BucketId&) const = 0;
+
+ /**
+ * Generate all possible maintenance operations for the given bucket and
+ * return these, ordered by priority in decreasing order. If the bucket
+ * does not need maintenance, the returned vector will be empty.
+ */
+ virtual std::vector<MaintenanceOperation::SP> generateAll(
+ const document::BucketId&,
+ NodeMaintenanceStatsTracker&) const = 0;
+
+ /**
+ * Convenience wrapper around generateAll() for when there's no need for
+ * an explicit stats tracker
+ */
+ std::vector<MaintenanceOperation::SP> generateAll(
+ const document::BucketId& bucketId) const
+ {
+ NodeMaintenanceStatsTracker dummyTracker;
+ return generateAll(bucketId, dummyTracker);
+ }
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/maintenance/maintenancepriority.h b/storage/src/vespa/storage/distributor/maintenance/maintenancepriority.h
new file mode 100644
index 00000000000..c5e5ee93afd
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/maintenancepriority.h
@@ -0,0 +1,59 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+namespace storage {
+namespace distributor {
+
+class MaintenancePriority
+{
+public:
+ enum Priority {
+ NO_MAINTENANCE_NEEDED,
+ VERY_LOW,
+ LOW,
+ MEDIUM,
+ HIGH,
+ VERY_HIGH,
+ PRIORITY_LIMIT
+ };
+
+ static const std::string toString(Priority pri) {
+ switch (pri) {
+ case NO_MAINTENANCE_NEEDED: return "NO_MAINTENANCE_NEEDED";
+ case VERY_LOW: return "VERY_LOW";
+ case LOW: return "LOW";
+ case MEDIUM: return "MEDIUM";
+ case HIGH: return "HIGH";
+ case VERY_HIGH: return "VERY_HIGH";
+ default: return "INVALID";
+ }
+ }
+
+ MaintenancePriority()
+ : _priority(NO_MAINTENANCE_NEEDED)
+ {}
+
+ explicit MaintenancePriority(Priority priority)
+ : _priority(priority)
+ {}
+
+ Priority getPriority() const {
+ return _priority;
+ }
+
+ bool requiresMaintenance() const {
+ return _priority != NO_MAINTENANCE_NEEDED;
+ }
+
+ static MaintenancePriority noMaintenanceNeeded() {
+ return MaintenancePriority(NO_MAINTENANCE_NEEDED);
+ }
+
+private:
+ Priority _priority;
+};
+
+}
+}
+
+
diff --git a/storage/src/vespa/storage/distributor/maintenance/maintenancepriorityandtype.h b/storage/src/vespa/storage/distributor/maintenance/maintenancepriorityandtype.h
new file mode 100644
index 00000000000..c5a46464623
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/maintenancepriorityandtype.h
@@ -0,0 +1,36 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/maintenance/maintenanceoperation.h>
+#include <vespa/storage/distributor/maintenance/maintenancepriority.h>
+
+namespace storage {
+namespace distributor {
+
+class MaintenancePriorityAndType
+{
+ MaintenancePriority _priority;
+ MaintenanceOperation::Type _type;
+public:
+ MaintenancePriorityAndType(MaintenancePriority pri,
+ MaintenanceOperation::Type type)
+ : _priority(pri),
+ _type(type)
+ {}
+
+ const MaintenancePriority& getPriority() const {
+ return _priority;
+ }
+
+ MaintenanceOperation::Type getType() const {
+ return _type;
+ }
+
+ bool requiresMaintenance() const {
+ return (_priority.getPriority()
+ != MaintenancePriority::NO_MAINTENANCE_NEEDED);
+ }
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/maintenance/maintenanceprioritygenerator.h b/storage/src/vespa/storage/distributor/maintenance/maintenanceprioritygenerator.h
new file mode 100644
index 00000000000..cf98ecab0ad
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/maintenanceprioritygenerator.h
@@ -0,0 +1,24 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vector>
+#include <vespa/storage/distributor/maintenance/prioritizedbucket.h>
+#include <vespa/storage/distributor/maintenance/maintenancepriorityandtype.h>
+#include <vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h>
+
+namespace storage {
+namespace distributor {
+
+class MaintenancePriorityGenerator
+{
+public:
+ virtual ~MaintenancePriorityGenerator() {}
+
+ virtual MaintenancePriorityAndType prioritize(
+ const document::BucketId&,
+ NodeMaintenanceStatsTracker&) const = 0;
+};
+
+}
+}
+
diff --git a/storage/src/vespa/storage/distributor/maintenance/maintenancescanner.h b/storage/src/vespa/storage/distributor/maintenance/maintenancescanner.h
new file mode 100644
index 00000000000..9dce66c1f32
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/maintenancescanner.h
@@ -0,0 +1,39 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+
+namespace storage {
+namespace distributor {
+
+class MaintenanceScanner
+{
+public:
+ virtual ~MaintenanceScanner() {}
+
+ class ScanResult {
+ bool _done;
+ BucketDatabase::Entry _entry;
+
+ public:
+ bool isDone() const { return _done; }
+ const BucketDatabase::Entry& getEntry() const { return _entry; }
+
+ static ScanResult createDone() { return ScanResult(true); }
+ static ScanResult createNotDone(BucketDatabase::Entry entry) {
+ return ScanResult(entry);
+ }
+
+ private:
+ ScanResult(bool done) : _done(done), _entry() {}
+ ScanResult(const BucketDatabase::Entry& e) : _done(false), _entry(e) {}
+ };
+
+ virtual ScanResult scanNext() = 0;
+
+ virtual void reset() = 0;
+};
+
+}
+}
+
diff --git a/storage/src/vespa/storage/distributor/maintenance/maintenancescheduler.cpp b/storage/src/vespa/storage/distributor/maintenance/maintenancescheduler.cpp
new file mode 100644
index 00000000000..5dd9567044d
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/maintenancescheduler.cpp
@@ -0,0 +1,105 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/maintenance/maintenancescheduler.h>
+#include <vespa/storage/distributor/maintenance/maintenanceoperationgenerator.h>
+#include <vespa/storage/distributor/operationstarter.h>
+#include <vespa/storage/distributor/operations/idealstate/idealstateoperation.h>
+
+namespace storage {
+namespace distributor {
+
+MaintenanceScheduler::MaintenanceScheduler(
+ MaintenanceOperationGenerator& operationGenerator,
+ BucketPriorityDatabase& priorityDb,
+ OperationStarter& operationStarter)
+ : _operationGenerator(operationGenerator),
+ _priorityDb(priorityDb),
+ _operationStarter(operationStarter)
+{
+}
+
+PrioritizedBucket
+MaintenanceScheduler::getMostImportantBucket()
+{
+ BucketPriorityDatabase::const_iterator mostImportant(_priorityDb.begin());
+ if (mostImportant == _priorityDb.end()) {
+ return PrioritizedBucket::INVALID;
+ }
+ return *mostImportant;
+}
+
+MaintenanceScheduler::WaitTimeMs
+MaintenanceScheduler::tick(SchedulingMode currentMode)
+{
+ PrioritizedBucket mostImportant(getMostImportantBucket());
+
+ if (!possibleToSchedule(mostImportant, currentMode)) {
+ return WaitTimeMs(1);
+ }
+ if (!startOperation(mostImportant)) {
+ return WaitTimeMs(1);
+ }
+ clearPriority(mostImportant);
+ return WaitTimeMs(0);
+}
+
+bool
+MaintenanceScheduler::possibleToSchedule(const PrioritizedBucket& bucket,
+ SchedulingMode currentMode) const
+{
+ if (currentMode == RECOVERY_SCHEDULING_MODE) {
+ return (bucket.valid()
+ && possibleToScheduleInEmergency(bucket));
+ } else {
+ return bucket.valid();
+ }
+}
+
+bool
+MaintenanceScheduler::possibleToScheduleInEmergency(
+ const PrioritizedBucket& bucket) const
+{
+ return bucket.moreImportantThan(MaintenancePriority::HIGH);
+}
+
+void
+MaintenanceScheduler::clearPriority(const PrioritizedBucket& bucket)
+{
+ _priorityDb.setPriority(PrioritizedBucket(bucket.getBucketId(),
+ MaintenancePriority::NO_MAINTENANCE_NEEDED));
+}
+
+OperationStarter::Priority
+MaintenanceScheduler::convertToOperationPriority(MaintenancePriority::Priority priority) const
+{
+ switch (priority) {
+ case MaintenancePriority::VERY_LOW:
+ return OperationStarter::Priority(200);
+ case MaintenancePriority::LOW:
+ return OperationStarter::Priority(150);
+ case MaintenancePriority::MEDIUM:
+ return OperationStarter::Priority(100);
+ case MaintenancePriority::HIGH:
+ return OperationStarter::Priority(50);
+ case MaintenancePriority::VERY_HIGH:
+ return OperationStarter::Priority(0);
+ default:
+ assert(false);
+ abort();
+ }
+}
+
+bool
+MaintenanceScheduler::startOperation(const PrioritizedBucket& bucket)
+{
+ Operation::SP operation(_operationGenerator.generate(bucket.getBucketId()));
+ if (!operation) {
+ return true;
+ }
+ OperationStarter::Priority operationPriority(
+ convertToOperationPriority(bucket.getPriority()));
+ return _operationStarter.start(operation, operationPriority);
+}
+
+}
+}
diff --git a/storage/src/vespa/storage/distributor/maintenance/maintenancescheduler.h b/storage/src/vespa/storage/distributor/maintenance/maintenancescheduler.h
new file mode 100644
index 00000000000..5aa3773e1c8
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/maintenancescheduler.h
@@ -0,0 +1,50 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/maintenance/prioritizedbucket.h>
+#include <vespa/storage/distributor/maintenance/simplemaintenancescanner.h>
+#include <vespa/storage/distributor/operationstarter.h>
+
+namespace storage {
+namespace distributor {
+
+class MaintenanceOperationGenerator;
+class BucketPriorityDatabase;
+
+class MaintenanceScheduler
+{
+public:
+ enum SchedulingMode {
+ RECOVERY_SCHEDULING_MODE,
+ NORMAL_SCHEDULING_MODE
+ };
+
+ typedef int WaitTimeMs;
+
+ MaintenanceScheduler(MaintenanceOperationGenerator& operationGenerator,
+ BucketPriorityDatabase& priorityDb,
+ OperationStarter& operationStarter);
+
+ WaitTimeMs tick(SchedulingMode currentMode);
+
+private:
+ MaintenanceScheduler(const MaintenanceScheduler&);
+ MaintenanceScheduler& operator=(const MaintenanceScheduler&);
+
+ //void scanNextBucket();
+ PrioritizedBucket getMostImportantBucket();
+ bool possibleToSchedule(const PrioritizedBucket& bucket, SchedulingMode currentMode) const;
+ bool possibleToScheduleInEmergency(const PrioritizedBucket& bucket) const;
+ void clearPriority(const PrioritizedBucket& bucket);
+ bool startOperation(const PrioritizedBucket& bucket);
+ OperationStarter::Priority convertToOperationPriority(
+ MaintenancePriority::Priority priority) const;
+
+ MaintenanceOperationGenerator& _operationGenerator;
+ BucketPriorityDatabase& _priorityDb;
+ OperationStarter& _operationStarter;
+};
+
+}
+}
+
diff --git a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp
new file mode 100644
index 00000000000..5cef9767714
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.cpp
@@ -0,0 +1,25 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+
+#include <vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h>
+
+namespace storage {
+namespace distributor {
+
+const NodeMaintenanceStats NodeMaintenanceStatsTracker::_emptyStats;
+
+std::ostream&
+operator<<(std::ostream& os, const NodeMaintenanceStats& stats)
+{
+ os << "NodeStats("
+ << "movingOut=" << stats.movingOut
+ << ",syncing=" << stats.syncing
+ << ",copyingIn=" << stats.copyingIn
+ << ",copyingOut=" << stats.copyingOut
+ << ")";
+ return os;
+}
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h
new file mode 100644
index 00000000000..5e1177a5ca6
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h
@@ -0,0 +1,61 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <unordered_map>
+#include <iostream>
+#include <stdint.h>
+
+namespace storage {
+namespace distributor {
+
+struct NodeMaintenanceStats
+{
+ uint64_t movingOut {0};
+ uint64_t syncing {0};
+ uint64_t copyingIn {0};
+ uint64_t copyingOut {0};
+
+ bool operator==(const NodeMaintenanceStats& other) const noexcept {
+ return (movingOut == other.movingOut
+ && syncing == other.syncing
+ && copyingIn == other.copyingIn
+ && copyingOut == other.copyingOut);
+ }
+};
+
+std::ostream& operator<<(std::ostream&, const NodeMaintenanceStats&);
+
+class NodeMaintenanceStatsTracker
+{
+ std::unordered_map<uint16_t, NodeMaintenanceStats> _stats;
+ static const NodeMaintenanceStats _emptyStats;
+public:
+ void incMovingOut(uint16_t node) {
+ ++_stats[node].movingOut;
+ }
+
+ void incSyncing(uint16_t node) {
+ ++_stats[node].syncing;
+ }
+
+ void incCopyingIn(uint16_t node) {
+ ++_stats[node].copyingIn;
+ }
+
+ void incCopyingOut(uint16_t node) {
+ ++_stats[node].copyingOut;
+ }
+
+ /**
+ * Returned statistics for a given node index, or all zero statistics
+ * if none have been recorded yet
+ */
+ const NodeMaintenanceStats& forNode(uint16_t node) const {
+ auto iter = _stats.find(node);
+ return (iter != _stats.end() ? iter->second : _emptyStats);
+ }
+};
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/maintenance/prioritizedbucket.cpp b/storage/src/vespa/storage/distributor/maintenance/prioritizedbucket.cpp
new file mode 100644
index 00000000000..9f6c14a171a
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/prioritizedbucket.cpp
@@ -0,0 +1,19 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <iostream>
+#include <vespa/storage/distributor/maintenance/prioritizedbucket.h>
+
+namespace storage {
+namespace distributor {
+
+const PrioritizedBucket PrioritizedBucket::INVALID = PrioritizedBucket();
+
+std::ostream&
+operator<<(std::ostream& os, const PrioritizedBucket& bucket)
+{
+ os << bucket.toString();
+ return os;
+}
+
+}
+}
diff --git a/storage/src/vespa/storage/distributor/maintenance/prioritizedbucket.h b/storage/src/vespa/storage/distributor/maintenance/prioritizedbucket.h
new file mode 100644
index 00000000000..d6f7df04824
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/prioritizedbucket.h
@@ -0,0 +1,76 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <iosfwd>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/storage/distributor/maintenance/maintenancepriority.h>
+
+namespace storage {
+
+namespace distributor {
+
+class PrioritizedBucket {
+public:
+ typedef MaintenancePriority::Priority Priority;
+
+ static const PrioritizedBucket INVALID;
+
+ PrioritizedBucket()
+ : _bucketId(),
+ _priority(MaintenancePriority::NO_MAINTENANCE_NEEDED)
+ {}
+
+ PrioritizedBucket(const document::BucketId& bid,
+ Priority pri)
+ : _bucketId(bid),
+ _priority(pri)
+ {
+ }
+
+ const document::BucketId& getBucketId() const {
+ return _bucketId;
+ }
+
+ Priority getPriority() const {
+ return _priority;
+ }
+
+ bool valid() const {
+ return _bucketId.getRawId() != 0;
+ }
+
+ std::string toString() const {
+ return vespalib::make_string("PrioritizedBucket(%s, pri %s)",
+ _bucketId.toString().c_str(),
+ MaintenancePriority::toString(_priority).c_str());
+ }
+
+ bool operator==(const PrioritizedBucket& other) const {
+ return _bucketId == other._bucketId && _priority == other._priority;
+ }
+
+ bool requiresMaintenance() const {
+ return _priority != MaintenancePriority::NO_MAINTENANCE_NEEDED;
+ }
+
+ bool moreImportantThan(const PrioritizedBucket& other) const {
+ return _priority > other._priority;
+ }
+
+ bool moreImportantThan(Priority otherPri) const {
+ return _priority > otherPri;
+ }
+
+private:
+ document::BucketId _bucketId;
+ Priority _priority;
+};
+
+std::ostream&
+operator<<(std::ostream& os, const PrioritizedBucket& bucket);
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/maintenance/simplebucketprioritydatabase.cpp b/storage/src/vespa/storage/distributor/maintenance/simplebucketprioritydatabase.cpp
new file mode 100644
index 00000000000..93369faa32e
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/simplebucketprioritydatabase.cpp
@@ -0,0 +1,143 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/maintenance/simplebucketprioritydatabase.h>
+#include <iostream>
+#include <sstream>
+
+namespace storage {
+namespace distributor {
+
+SimpleBucketPriorityDatabase::~SimpleBucketPriorityDatabase()
+{
+}
+
+void
+SimpleBucketPriorityDatabase::clearAllEntriesForBucket(
+ const document::BucketId& bucketId)
+{
+ for (PriorityMap::iterator priIter(_prioritizedBuckets.begin()),
+ priEnd(_prioritizedBuckets.end());
+ priIter != priEnd;
+ ++priIter)
+ {
+ priIter->second.erase(bucketId);
+ }
+}
+
+void
+SimpleBucketPriorityDatabase::setPriority(const PrioritizedBucket& bucket)
+{
+ clearAllEntriesForBucket(bucket.getBucketId());
+ if (bucket.requiresMaintenance()) {
+ _prioritizedBuckets[bucket.getPriority()].insert(bucket.getBucketId());
+ }
+}
+
+void
+SimpleBucketPriorityDatabase::SimpleConstIteratorImpl::initializeBucketIterToFirstAvailableEntry()
+{
+ _bucketIter = _priorityIter->second.begin();
+ if (currentPriorityAtEnd()) {
+ increment();
+ }
+}
+
+bool
+SimpleBucketPriorityDatabase::SimpleConstIteratorImpl::atEnd() const
+{
+ return _priorityIter == _priorityEnd;
+}
+
+void
+SimpleBucketPriorityDatabase::SimpleConstIteratorImpl::stepWithinCurrentPriority()
+{
+ ++_bucketIter;
+}
+
+bool
+SimpleBucketPriorityDatabase::SimpleConstIteratorImpl::currentPriorityAtEnd() const
+{
+ return _bucketIter == _priorityIter->second.end();
+}
+
+void
+SimpleBucketPriorityDatabase::SimpleConstIteratorImpl::stepToNextPriority()
+{
+ ++_priorityIter;
+ if (atEnd()) {
+ return;
+ }
+ _bucketIter = _priorityIter->second.begin();
+}
+
+void
+SimpleBucketPriorityDatabase::SimpleConstIteratorImpl::step()
+{
+ if (currentPriorityAtEnd()) {
+ stepToNextPriority();
+ } else {
+ stepWithinCurrentPriority();
+ }
+}
+
+void
+SimpleBucketPriorityDatabase::SimpleConstIteratorImpl::increment()
+{
+ while (!atEnd()) {
+ step();
+ if (!currentPriorityAtEnd()) {
+ break;
+ }
+ }
+}
+
+bool
+SimpleBucketPriorityDatabase::SimpleConstIteratorImpl::equal(const ConstIteratorImpl& otherBase) const
+{
+ const SimpleConstIteratorImpl& other(
+ static_cast<const SimpleConstIteratorImpl&>(otherBase));
+ if (_priorityIter != other._priorityIter) {
+ return false;
+ }
+ if (atEnd()) {
+ return true;
+ }
+ return _bucketIter == other._bucketIter;
+}
+
+PrioritizedBucket
+SimpleBucketPriorityDatabase::SimpleConstIteratorImpl::dereference() const
+{
+ return PrioritizedBucket(*_bucketIter, _priorityIter->first);
+}
+
+SimpleBucketPriorityDatabase::const_iterator
+SimpleBucketPriorityDatabase::begin() const
+{
+ return const_iterator(ConstIteratorImplPtr(new SimpleConstIteratorImpl(
+ _prioritizedBuckets.rbegin(),
+ _prioritizedBuckets.rend())));
+}
+
+SimpleBucketPriorityDatabase::const_iterator
+SimpleBucketPriorityDatabase::end() const
+{
+ return const_iterator(ConstIteratorImplPtr(new SimpleConstIteratorImpl(
+ _prioritizedBuckets.rend(),
+ _prioritizedBuckets.rend())));
+}
+
+std::string
+SimpleBucketPriorityDatabase::toString() const
+{
+ std::ostringstream ss;
+ const_iterator i(begin());
+ const_iterator e(end());
+ for (; i != e; ++i) {
+ ss << *i << '\n';
+ }
+ return ss.str();
+}
+
+}
+}
diff --git a/storage/src/vespa/storage/distributor/maintenance/simplebucketprioritydatabase.h b/storage/src/vespa/storage/distributor/maintenance/simplebucketprioritydatabase.h
new file mode 100644
index 00000000000..f3b64a7ce41
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/simplebucketprioritydatabase.h
@@ -0,0 +1,71 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <set>
+#include <map>
+#include <vespa/storage/distributor/maintenance/bucketprioritydatabase.h>
+
+namespace storage {
+namespace distributor {
+
+class SimpleBucketPriorityDatabase : public BucketPriorityDatabase
+{
+public:
+ virtual ~SimpleBucketPriorityDatabase();
+ typedef PrioritizedBucket::Priority Priority;
+
+ virtual void setPriority(const PrioritizedBucket&);
+
+ virtual const_iterator begin() const;
+
+ virtual const_iterator end() const;
+
+ std::string toString() const;
+
+private:
+ typedef std::set<document::BucketId> BucketSet;
+ typedef std::map<Priority, BucketSet> PriorityMap;
+
+ class SimpleConstIteratorImpl : public ConstIteratorImpl
+ {
+ PriorityMap::const_reverse_iterator _priorityIter;
+ PriorityMap::const_reverse_iterator _priorityEnd;
+ BucketSet::const_iterator _bucketIter;
+ public:
+ SimpleConstIteratorImpl(PriorityMap::const_reverse_iterator first,
+ PriorityMap::const_reverse_iterator end)
+ : _priorityIter(first),
+ _priorityEnd(end),
+ _bucketIter()
+ {
+ if (!atEnd()) {
+ initializeBucketIterToFirstAvailableEntry();
+ }
+ }
+ private:
+ SimpleConstIteratorImpl(const SimpleConstIteratorImpl&);
+ SimpleConstIteratorImpl& operator=(const SimpleConstIteratorImpl&);
+
+ void initializeBucketIterToFirstAvailableEntry();
+
+ bool atEnd() const;
+ void stepWithinCurrentPriority();
+ bool currentPriorityAtEnd() const;
+ void stepToNextPriority();
+ void step();
+
+ virtual void increment();
+
+ virtual bool equal(const ConstIteratorImpl& other) const;
+
+ virtual PrioritizedBucket dereference() const;
+ };
+
+ void clearAllEntriesForBucket(const document::BucketId& bucketId);
+
+ PriorityMap _prioritizedBuckets;
+};
+
+}
+}
+
diff --git a/storage/src/vespa/storage/distributor/maintenance/simplemaintenancescanner.cpp b/storage/src/vespa/storage/distributor/maintenance/simplemaintenancescanner.cpp
new file mode 100644
index 00000000000..de510755e97
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/simplemaintenancescanner.cpp
@@ -0,0 +1,56 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <sstream>
+#include <vespa/storage/distributor/maintenance/simplemaintenancescanner.h>
+
+namespace storage {
+namespace distributor {
+
+MaintenanceScanner::ScanResult
+SimpleMaintenanceScanner::scanNext()
+{
+ BucketDatabase::Entry entry(_bucketDb.getNext(_bucketCursor));
+ if (!entry.valid()) {
+ return ScanResult::createDone();
+ }
+ prioritizeBucket(entry.getBucketId());
+ _bucketCursor = entry.getBucketId();
+ return ScanResult::createNotDone(entry);
+}
+
+void
+SimpleMaintenanceScanner::reset()
+{
+ _bucketCursor = document::BucketId();
+ _pendingMaintenance = PendingMaintenanceStats();
+}
+
+void
+SimpleMaintenanceScanner::prioritizeBucket(const document::BucketId& id)
+{
+ MaintenancePriorityAndType pri(
+ _priorityGenerator.prioritize(
+ id, _pendingMaintenance.perNodeStats));
+ if (pri.requiresMaintenance()) {
+ _bucketPriorityDb.setPriority(PrioritizedBucket(id, pri.getPriority().getPriority()));
+ assert(pri.getType() != MaintenanceOperation::OPERATION_COUNT);
+ ++_pendingMaintenance.global.pending[pri.getType()];
+ }
+}
+
+std::ostream&
+operator<<(std::ostream& os,
+ const SimpleMaintenanceScanner::GlobalMaintenanceStats& stats)
+{
+ using MO = MaintenanceOperation;
+ os << "delete bucket: " << stats.pending[MO::DELETE_BUCKET]
+ << ", merge bucket: " << stats.pending[MO::MERGE_BUCKET]
+ << ", split bucket: " << stats.pending[MO::SPLIT_BUCKET]
+ << ", join bucket: " << stats.pending[MO::JOIN_BUCKET]
+ << ", set bucket state: " << stats.pending[MO::SET_BUCKET_STATE]
+ << ", garbage collection: " << stats.pending[MO::GARBAGE_COLLECTION];
+ return os;
+}
+
+}
+}
diff --git a/storage/src/vespa/storage/distributor/maintenance/simplemaintenancescanner.h b/storage/src/vespa/storage/distributor/maintenance/simplemaintenancescanner.h
new file mode 100644
index 00000000000..eaec4d5186b
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenance/simplemaintenancescanner.h
@@ -0,0 +1,67 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <string>
+#include <vector>
+#include <vespa/storage/distributor/maintenance/maintenancescanner.h>
+#include <vespa/storage/distributor/maintenance/bucketprioritydatabase.h>
+#include <vespa/storage/distributor/maintenance/maintenanceprioritygenerator.h>
+#include <vespa/storage/distributor/maintenance/node_maintenance_stats_tracker.h>
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+
+namespace storage {
+namespace distributor {
+
+class SimpleMaintenanceScanner : public MaintenanceScanner
+{
+public:
+ struct GlobalMaintenanceStats {
+ std::vector<uint64_t> pending;
+
+ GlobalMaintenanceStats()
+ : pending(MaintenanceOperation::OPERATION_COUNT)
+ {
+ }
+ };
+ struct PendingMaintenanceStats {
+ GlobalMaintenanceStats global;
+ NodeMaintenanceStatsTracker perNodeStats;
+ };
+private:
+ BucketPriorityDatabase& _bucketPriorityDb;
+ const MaintenancePriorityGenerator& _priorityGenerator;
+ const BucketDatabase& _bucketDb;
+ document::BucketId _bucketCursor;
+ PendingMaintenanceStats _pendingMaintenance;
+public:
+ SimpleMaintenanceScanner(BucketPriorityDatabase& bucketPriorityDb,
+ const MaintenancePriorityGenerator& priorityGenerator,
+ const BucketDatabase& bucketDb)
+ : _bucketPriorityDb(bucketPriorityDb),
+ _priorityGenerator(priorityGenerator),
+ _bucketDb(bucketDb),
+ _bucketCursor()
+ {}
+
+ ScanResult scanNext();
+
+ void reset();
+
+ // TODO: move out into own interface!
+ void prioritizeBucket(const document::BucketId& id);
+
+ const PendingMaintenanceStats& getPendingMaintenanceStats() const {
+ return _pendingMaintenance;
+ }
+private:
+ SimpleMaintenanceScanner(const SimpleMaintenanceScanner&);
+ SimpleMaintenanceScanner& operator=(const SimpleMaintenanceScanner&);
+};
+
+std::ostream&
+operator<<(std::ostream&,
+ const SimpleMaintenanceScanner::GlobalMaintenanceStats&);
+
+}
+}
+
diff --git a/storage/src/vespa/storage/distributor/maintenancebucket.h b/storage/src/vespa/storage/distributor/maintenancebucket.h
new file mode 100644
index 00000000000..d05ceb4b3f7
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/maintenancebucket.h
@@ -0,0 +1,59 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/storage/distributor/maintenance/maintenancepriority.h>
+
+namespace storage {
+
+namespace distributor {
+
+/**
+ * Simple container to communicate a bucket that needs to be
+ * checked for maintenanceoperations.
+ */
+class MaintenanceBucket {
+public:
+ typedef MaintenancePriority::Priority Priority;
+
+ MaintenanceBucket()
+ : node(0),
+ pri(MaintenancePriority::NO_MAINTENANCE_NEEDED)
+ {}
+
+ MaintenanceBucket(const document::BucketId& bid_,
+ uint16_t node_,
+ Priority pri_)
+ : bid(bid_),
+ node(node_),
+ pri(pri_)
+ {
+
+ }
+
+ // The bucket to be checked.
+ document::BucketId bid;
+
+ // The primary node of the bucket.
+ uint16_t node;
+
+ // The priority to check the bucket.
+ Priority pri;
+
+ bool requiresMaintenance() const {
+ return pri != MaintenancePriority::NO_MAINTENANCE_NEEDED;
+ }
+
+ std::string toString() const {
+ return vespalib::make_string("MaintenanceBucket(%s: Node %d, Pri %s)",
+ bid.toString().c_str(),
+ (int)node,
+ MaintenancePriority::toString(pri).c_str());
+ }
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/messageguard.h b/storage/src/vespa/storage/distributor/messageguard.h
new file mode 100644
index 00000000000..8d549965136
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/messageguard.h
@@ -0,0 +1,46 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/common/messagesender.h>
+#include <vespa/storage/distributor/pendingclusterstate.h>
+
+namespace storage {
+
+class MessageGuard {
+ std::vector<std::shared_ptr<api::StorageMessage> > messagesUp;
+ std::vector<std::shared_ptr<api::StorageMessage> > messagesDown;
+
+ vespalib::LockGuard _lock;
+ ChainedMessageSender& _messageSender;
+
+public:
+ MessageGuard(const vespalib::Lock &lock,
+ ChainedMessageSender& messageSender)
+ : _lock(lock),
+ _messageSender(messageSender) {}
+
+ void send(const std::shared_ptr<api::StorageMessage>& message) {
+ sendUp(message);
+ }
+
+ void sendUp(const std::shared_ptr<api::StorageMessage>& message) {
+ messagesUp.push_back(message);
+ }
+
+ void sendDown(const std::shared_ptr<api::StorageMessage>& message) {
+ messagesDown.push_back(message);
+ }
+
+ ~MessageGuard() {
+ _lock.unlock();
+ for (uint32_t i = 0; i < messagesUp.size(); i++) {
+ _messageSender.sendUp(messagesUp[i]);
+ }
+ for (uint32_t i = 0; i < messagesDown.size(); i++) {
+ _messageSender.sendDown(messagesDown[i]);
+ }
+ }
+};
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/messagetracker.cpp b/storage/src/vespa/storage/distributor/messagetracker.cpp
new file mode 100644
index 00000000000..15b428f3393
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/messagetracker.cpp
@@ -0,0 +1,53 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/messagetracker.h>
+#include <vespa/log/log.h>
+#include <vespa/vdslib/state/nodetype.h>
+
+LOG_SETUP(".messagetracker");
+
+namespace storage {
+
+namespace distributor {
+
+MessageTracker::MessageTracker(const std::string& clusterName)
+ : _clusterName(clusterName)
+{}
+
+void
+MessageTracker::flushQueue(MessageSender& sender)
+{
+ for (uint32_t i = 0; i < _commandQueue.size(); i++) {
+ _commandQueue[i]._msg->setAddress(
+ api::StorageMessageAddress(_clusterName, lib::NodeType::STORAGE, _commandQueue[i]._target));
+ _sentMessages[_commandQueue[i]._msg->getMsgId()] = _commandQueue[i]._target;
+ sender.sendCommand(_commandQueue[i]._msg);
+ }
+
+ _commandQueue.clear();
+}
+
+uint16_t
+MessageTracker::handleReply(api::BucketReply& reply)
+{
+ std::map<uint64_t, uint16_t>::iterator found = _sentMessages.find(reply.getMsgId());
+ if (found == _sentMessages.end()) {
+ LOG(warning, "Received reply %" PRIu64 " for callback which we have no recollection of", reply.getMsgId());
+ return (uint16_t)-1;
+ } else {
+ uint16_t node = found->second;
+ _sentMessages.erase(found);
+ return node;
+ }
+}
+
+bool
+MessageTracker::finished()
+{
+ return _sentMessages.empty();
+}
+
+
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/messagetracker.h b/storage/src/vespa/storage/distributor/messagetracker.h
new file mode 100644
index 00000000000..eb41a7ccd27
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/messagetracker.h
@@ -0,0 +1,54 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storageapi/messageapi/bucketcommand.h>
+#include <vespa/storageapi/messageapi/bucketreply.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/common/messagesender.h>
+
+namespace storage {
+
+namespace distributor {
+
+class MessageTracker {
+public:
+ class ToSend {
+ public:
+ ToSend(const std::shared_ptr<api::BucketCommand>& msg, uint16_t target) :
+ _msg(msg), _target(target) {};
+
+ std::shared_ptr<api::BucketCommand> _msg;
+ uint16_t _target;
+ };
+
+ MessageTracker(const std::string& clusterName);
+
+ void queueCommand(std::shared_ptr<api::BucketCommand> msg, uint16_t target) {
+ _commandQueue.push_back(ToSend(msg, target));
+ }
+
+ void flushQueue(MessageSender& sender);
+
+ /**
+ If the reply is for a message that is being tracked here, returns the node the message was sent to. If not, returns (uint16_t)-1
+ */
+ uint16_t handleReply(api::BucketReply& reply);
+
+ /**
+ Returns true if all messages sent have been received.
+ */
+ bool finished();
+
+protected:
+ std::vector<ToSend> _commandQueue;
+
+ // Keeps track of which node a message was sent to.
+ std::map<uint64_t, uint16_t> _sentMessages;
+ std::string _clusterName;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/min_replica_provider.h b/storage/src/vespa/storage/distributor/min_replica_provider.h
new file mode 100644
index 00000000000..64adea622ed
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/min_replica_provider.h
@@ -0,0 +1,26 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <iostream>
+#include <stdint.h>
+#include <unordered_map>
+
+namespace storage {
+namespace distributor {
+
+class MinReplicaProvider
+{
+public:
+ virtual ~MinReplicaProvider() {}
+
+ /**
+ * Get a snapshot of the minimum bucket replica for each of the nodes.
+ *
+ * Can be called at any time after registration from another thread context
+ * and the call must thus be thread safe and data race free.
+ */
+ virtual std::unordered_map<uint16_t, uint32_t> getMinReplica() const = 0;
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/nodeinfo.cpp b/storage/src/vespa/storage/distributor/nodeinfo.cpp
new file mode 100644
index 00000000000..52c16dbb08d
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/nodeinfo.cpp
@@ -0,0 +1,85 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/nodeinfo.h>
+
+namespace storage {
+
+namespace distributor {
+
+NodeInfo::NodeInfo(const framework::Clock& clock)
+ : _clock(clock) {}
+
+uint32_t
+NodeInfo::getPendingCount(uint16_t idx) const
+{
+ return getNode(idx)._pending;
+}
+
+bool
+NodeInfo::isBusy(uint16_t idx) const
+{
+ const SingleNodeInfo& info = getNode(idx);
+ if (info._busyTime.isSet()) {
+ if (_clock.getTimeInSeconds() > info._busyTime) {
+ info._busyTime = framework::SecondTime(0);
+ } else {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void
+NodeInfo::setBusy(uint16_t idx)
+{
+ getNode(idx)._busyTime = _clock.getTimeInSeconds()
+ + framework::SecondTime(60);
+}
+
+void
+NodeInfo::incPending(uint16_t idx)
+{
+ getNode(idx)._pending++;
+}
+
+void
+NodeInfo::decPending(uint16_t idx)
+{
+ SingleNodeInfo& info = getNode(idx);
+
+ if (info._pending > 0) {
+ getNode(idx)._pending--;
+ }
+}
+
+void
+NodeInfo::clearPending(uint16_t idx)
+{
+ SingleNodeInfo& info = getNode(idx);
+ info._pending = 0;
+}
+
+NodeInfo::SingleNodeInfo&
+NodeInfo::getNode(uint16_t idx)
+{
+ while ((int)_nodes.size() < idx + 1) {
+ _nodes.push_back(SingleNodeInfo());
+ }
+
+ return _nodes[idx];
+}
+
+const NodeInfo::SingleNodeInfo&
+NodeInfo::getNode(uint16_t idx) const
+{
+ while ((int)_nodes.size() < idx + 1) {
+ _nodes.push_back(SingleNodeInfo());
+ }
+
+ return _nodes[idx];
+}
+
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/nodeinfo.h b/storage/src/vespa/storage/distributor/nodeinfo.h
new file mode 100644
index 00000000000..59943ed494a
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/nodeinfo.h
@@ -0,0 +1,50 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::distributor::NodeInfo
+ * \ingroup distributor
+ *
+ * \brief Keeps track of node state for all storage nodes.
+ */
+#pragma once
+
+#include <vespa/storageframework/storageframework.h>
+#include <vector>
+
+namespace storage {
+namespace distributor {
+
+class NodeInfo {
+public:
+ NodeInfo(const framework::Clock& clock);
+
+ uint32_t getPendingCount(uint16_t idx) const;
+
+ bool isBusy(uint16_t idx) const;
+
+ void setBusy(uint16_t idx);
+
+ void incPending(uint16_t idx);
+
+ void decPending(uint16_t idx);
+
+ void clearPending(uint16_t idx);
+
+private:
+ struct SingleNodeInfo {
+ SingleNodeInfo()
+ : _pending(0), _busyTime(0) {};
+
+ uint32_t _pending;
+ mutable framework::SecondTime _busyTime;
+ };
+
+ mutable std::vector<SingleNodeInfo> _nodes;
+ const framework::Clock& _clock;
+
+ const SingleNodeInfo& getNode(uint16_t idx) const;
+ SingleNodeInfo& getNode(uint16_t idx);
+};
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/operationowner.cpp b/storage/src/vespa/storage/distributor/operationowner.cpp
new file mode 100644
index 00000000000..98b1944f9e5
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operationowner.cpp
@@ -0,0 +1,88 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/operationowner.h>
+#include <vespa/storage/distributor/operations/operation.h>
+#include <vespa/log/log.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/storageapi/messageapi/storagecommand.h>
+#include <vespa/storageapi/messageapi/storagereply.h>
+
+LOG_SETUP(".operationowner");
+
+namespace storage {
+
+namespace distributor {
+
+OperationOwner::~OperationOwner()
+{
+}
+
+void
+OperationOwner::Sender::sendCommand(const std::shared_ptr<api::StorageCommand> & msg)
+{
+ _owner.getSentMessageMap().insert(msg->getMsgId(), _cb);
+ _sender.sendCommand(msg);
+}
+
+void
+OperationOwner::Sender::sendReply(const std::shared_ptr<api::StorageReply> & msg)
+{
+ _sender.sendReply(msg);
+};
+
+bool
+OperationOwner::handleReply(const std::shared_ptr<api::StorageReply>& reply)
+{
+ std::shared_ptr<Operation> cb = _sentMessageMap.pop(reply->getMsgId());
+
+ if (cb.get() != 0) {
+ Sender sender(*this, _sender, cb);
+ cb->receive(sender, reply);
+ return true;
+ }
+
+ return false;
+}
+
+bool
+OperationOwner::start(const std::shared_ptr<Operation>& operation,
+ Priority priority)
+{
+ (void) priority;
+ LOG(spam, "Starting operation %s", operation->toString().c_str());
+ Sender sender(*this, _sender, operation);
+ operation->start(sender, _clock.getTimeInMillis());
+ return true;
+}
+
+std::string
+OperationOwner::toString() const
+{
+ return _sentMessageMap.toString();
+}
+
+void
+OperationOwner::onClose()
+{
+ while (true) {
+ std::shared_ptr<Operation> cb = _sentMessageMap.pop();
+
+ if (cb.get()) {
+ Sender sender(*this, _sender, std::shared_ptr<Operation>());
+ cb->onClose(sender);
+ } else {
+ break;
+ }
+ }
+}
+
+void
+OperationOwner::erase(api::StorageMessage::Id msgId)
+{
+ _sentMessageMap.pop(msgId);
+}
+
+
+} // distributor
+
+} // storage
diff --git a/storage/src/vespa/storage/distributor/operationowner.h b/storage/src/vespa/storage/distributor/operationowner.h
new file mode 100644
index 00000000000..e72d6738513
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operationowner.h
@@ -0,0 +1,110 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/sentmessagemap.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storage/distributor/distributormessagesender.h>
+#include <vespa/storage/distributor/operationstarter.h>
+
+namespace storage {
+
+namespace distributor {
+
+class Operation;
+
+/**
+ Storage link that keeps track of running operations.
+ */
+class OperationOwner : public OperationStarter {
+public:
+
+ class Sender : public DistributorMessageSender {
+ public:
+ Sender(OperationOwner& owner,
+ DistributorMessageSender& sender,
+ const std::shared_ptr<Operation>& cb)
+ : _owner(owner),
+ _sender(sender),
+ _cb(cb)
+ {}
+
+ /**
+ Sends a message.
+ */
+ void sendCommand(const std::shared_ptr<api::StorageCommand> &);
+
+ /**
+ Send a reply.
+ */
+ void sendReply(const std::shared_ptr<api::StorageReply> & msg);
+
+ OperationOwner& getOwner() {
+ return _owner;
+ }
+
+ virtual int getDistributorIndex() const {
+ return _sender.getDistributorIndex();
+ }
+
+ virtual const std::string& getClusterName() const {
+ return _sender.getClusterName();
+ }
+
+ virtual const PendingMessageTracker& getPendingMessageTracker() const {
+ return _sender.getPendingMessageTracker();
+ }
+
+ private:
+ OperationOwner& _owner;
+ DistributorMessageSender& _sender;
+ std::shared_ptr<Operation> _cb;
+ };
+
+ OperationOwner(DistributorMessageSender& sender,
+ const framework::Clock& clock)
+ : _sender(sender),
+ _clock(clock) {
+ }
+ ~OperationOwner();
+
+ /**
+ Handles replies from storage, mapping from a message id to an operation.
+
+ If the operation was found, returns it in result.first. If the operation was
+ done after the reply was processed (no more pending commands), returns true
+
+ */
+ bool handleReply(const std::shared_ptr<api::StorageReply>& reply);
+
+ SentMessageMap& getSentMessageMap() {
+ return _sentMessageMap;
+ };
+
+ virtual bool start(const std::shared_ptr<Operation>& operation,
+ Priority priority);
+
+ /**
+ If the given message exists, create a reply and pass it to the
+ appropriate callback.
+ */
+ void erase(api::StorageMessage::Id msgId);
+
+ void onClose();
+
+ uint32_t size() const {
+ return _sentMessageMap.size();
+ }
+
+ std::string toString() const;
+
+private:
+ SentMessageMap _sentMessageMap;
+ DistributorMessageSender& _sender;
+ const framework::Clock& _clock;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/operations/CMakeLists.txt b/storage/src/vespa/storage/distributor/operations/CMakeLists.txt
new file mode 100644
index 00000000000..04f371c97a1
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_distributoroperation OBJECT
+ SOURCES
+ operation.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/distributor/operations/external/CMakeLists.txt b/storage/src/vespa/storage/distributor/operations/external/CMakeLists.txt
new file mode 100644
index 00000000000..963eebc66c9
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/CMakeLists.txt
@@ -0,0 +1,18 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_distributoroperationexternal OBJECT
+ SOURCES
+ putoperation.cpp
+ removeoperation.cpp
+ updateoperation.cpp
+ twophaseupdateoperation.cpp
+ statbucketoperation.cpp
+ removelocationoperation.cpp
+ multioperationoperation.cpp
+ getoperation.cpp
+ multioperationoperation.cpp
+ visitoroperation.cpp
+ statbucketlistoperation.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/distributor/operations/external/getoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/getoperation.cpp
new file mode 100644
index 00000000000..883fd3809d6
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/getoperation.cpp
@@ -0,0 +1,289 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/operations/external/getoperation.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/distributorcomponent.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/vdslib/state/nodestate.h>
+
+LOG_SETUP(".distributor.callback.doc.get");
+
+namespace storage {
+namespace distributor {
+
+GetOperation::GroupId::GroupId(const document::BucketId& id,
+ uint32_t checksum,
+ int node)
+ : _id(id),
+ _checksum(checksum),
+ _node(node)
+{
+}
+
+bool
+GetOperation::GroupId::operator<(const GroupId& other) const
+{
+ if (_id.getRawId() != other._id.getRawId()) {
+ return (_id.getRawId() < other._id.getRawId());
+ }
+ if (_checksum != other._checksum) {
+ return (_checksum < other._checksum);
+ }
+ if (_node != other._node) {
+ return (_node < other._node);
+ }
+ return false;
+}
+
+bool
+GetOperation::GroupId::operator==(const GroupId& other) const
+{
+ return (_id == other._id
+ && _checksum == other._checksum
+ && _node == other._node);
+}
+
+GetOperation::GetOperation(DistributorComponent& manager,
+ const std::shared_ptr<api::GetCommand> & msg,
+ PersistenceOperationMetricSet& metric)
+ : Operation(),
+ _manager(manager),
+ _msg(msg),
+ _returnCode(api::ReturnCode::OK),
+ _doc((document::Document*)NULL),
+ _lastModified(0),
+ _metric(metric)
+{
+ assignTargetNodeGroups();
+}
+
+void
+GetOperation::onClose(DistributorMessageSender& sender)
+{
+ _returnCode = api::ReturnCode(api::ReturnCode::ABORTED,
+ "Process is shutting down");
+ sendReply(sender);
+}
+
+bool
+GetOperation::copyIsOnLocalNode(const BucketCopy& copy) const
+{
+ return (copy.getNode() == _manager.getIndex());
+}
+
+int
+GetOperation::findBestUnsentTarget(const GroupVector& candidates) const
+{
+ int best = -1;
+ for (uint32_t i = 0; i < candidates.size(); ++i) {
+ if (candidates[i].sent) {
+ continue;
+ }
+ if (copyIsOnLocalNode(candidates[i].copy)) {
+ return i; // Can't get better match than this.
+ }
+ if (best == -1) {
+ best = i;
+ }
+ }
+ return best;
+}
+
+bool
+GetOperation::sendForChecksum(DistributorMessageSender& sender,
+ const document::BucketId& id,
+ GroupVector& res)
+{
+ const int best = findBestUnsentTarget(res);
+
+ if (best != -1) {
+ std::shared_ptr<api::GetCommand> command(
+ std::make_shared<api::GetCommand>(
+ id,
+ _msg->getDocumentId(),
+ _msg->getFieldSet(),
+ _msg->getBeforeTimestamp()));
+ copyMessageSettings(*_msg, *command);
+
+ LOG(spam,
+ "Sending %s to node %d",
+ command->toString(true).c_str(),
+ res[best].copy.getNode());
+
+ res[best].sent = sender.sendToNode(lib::NodeType::STORAGE,
+ res[best].copy.getNode(),
+ command);
+ return true;
+ }
+
+ return false;
+}
+
+void
+GetOperation::onStart(DistributorMessageSender& sender)
+{
+ // Send one request for each unique group (BucketId/checksum)
+ bool sent = false;
+ for (std::map<GroupId, GroupVector>::iterator iter = _responses.begin();
+ iter != _responses.end(); ++iter)
+ {
+ sent |= sendForChecksum(sender, iter->first.getBucketId(), iter->second);
+ }
+
+ // If nothing was sent (no useful copies), just return NOT_FOUND
+ if (!sent) {
+ LOG(debug, "No useful bucket copies for get on document %s. Returning without document", _msg->getDocumentId().toString().c_str());
+ sendReply(sender);
+ }
+};
+
+void
+GetOperation::onReceive(DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply>& msg)
+{
+ api::GetReply* getreply = dynamic_cast<api::GetReply*>(msg.get());
+ assert(getreply != nullptr);
+
+ LOG(debug, "Received %s", msg->toString(true).c_str());
+
+ _msg->getTrace().getRoot().addChild(getreply->getTrace().getRoot());
+ bool allDone = true;
+ for (std::map<GroupId, GroupVector>::iterator iter = _responses.begin();
+ iter != _responses.end(); ++iter)
+ {
+ for (uint32_t i = 0; i < iter->second.size(); i++) {
+ if (iter->second[i].sent == getreply->getMsgId()) {
+ LOG(debug, "Get on %s returned %s",
+ _msg->getDocumentId().toString().c_str(),
+ getreply->getResult().toString().c_str());
+
+ iter->second[i].received = true;
+ iter->second[i].returnCode = getreply->getResult();
+
+ if (getreply->getResult().success()) {
+ if (getreply->getLastModifiedTimestamp() > _lastModified) {
+ _returnCode = getreply->getResult();
+ _lastModified = getreply->getLastModifiedTimestamp();
+ _doc = getreply->getDocument();
+ }
+ } else {
+ if (_lastModified == 0) {
+ _returnCode = getreply->getResult();
+ }
+
+ // Try to send to another node in this checksum group.
+ bool sent = sendForChecksum(sender,
+ iter->first.getBucketId(),
+ iter->second);
+ if (sent) {
+ allDone = false;
+ }
+ }
+ }
+
+ if (iter->second[i].sent && !iter->second[i].received) {
+ LOG(spam, "Have not received all replies yet, setting allDone = false");
+ allDone = false;
+ }
+ }
+ }
+
+ if (allDone) {
+ LOG(debug, "Get on %s done, returning reply %s",
+ _msg->getDocumentId().toString().c_str(),
+ _returnCode.toString().c_str());
+ sendReply(sender);
+ }
+}
+
+void
+GetOperation::sendReply(DistributorMessageSender& sender)
+{
+ if (_msg.get()) {
+ std::shared_ptr<api::GetReply> repl(
+ std::make_shared<api::GetReply>(*_msg, _doc, _lastModified));
+ repl->setResult(_returnCode);
+
+ if (_returnCode.success()) {
+ ++_metric.ok;
+ } else if (_returnCode.getResult() == api::ReturnCode::TIMEOUT) {
+ ++_metric.failures.timeout;
+ } else if (_returnCode.isBusy()) {
+ ++_metric.failures.busy;
+ } else if (_returnCode.isNodeDownOrNetwork()) {
+ ++_metric.failures.notconnected;
+ } else {
+ ++_metric.failures.storagefailure;
+ }
+
+ if (!_doc.get()) {
+ ++_metric.failures.notfound;
+ }
+
+ framework::MilliSecTime currTime(_manager.getClock().getTimeInMillis());
+ _metric.latency.addValue((currTime - _startTime).getTime());
+
+ sender.sendReply(repl);
+ _msg.reset();
+ }
+
+}
+
+void
+GetOperation::assignTargetNodeGroups()
+{
+ document::BucketIdFactory bucketIdFactory;
+ document::BucketId bid = bucketIdFactory.getBucketId(_msg->getDocumentId());
+
+ std::vector<BucketDatabase::Entry> entries;
+ _manager.getBucketDatabase().getParents(bid, entries);
+
+ for (uint32_t j = 0; j < entries.size(); ++j) {
+ const BucketDatabase::Entry& e = entries[j];
+
+ LOG(spam, "Entry for %s: %s", e.getBucketId().toString().c_str(),
+ e->toString().c_str());
+
+ bool haveTrusted = false;
+ for (uint32_t i = 0; i < e->getNodeCount(); i++) {
+ const BucketCopy& c = e->getNodeRef(i);
+
+ if (!c.trusted()) {
+ continue;
+ }
+
+ _responses[GroupId(e.getBucketId(), c.getChecksum(), -1)].push_back(c);
+ haveTrusted = true;
+ break;
+ }
+
+ if (haveTrusted) {
+ continue;
+ }
+
+ for (uint32_t i = 0; i < e->getNodeCount(); i++) {
+ const BucketCopy& copy = e->getNodeRef(i);
+
+ if (!copy.valid()) {
+ _responses[GroupId(e.getBucketId(), copy.getChecksum(), copy.getNode())].
+ push_back(copy);
+ } else if (!copy.empty()) {
+ _responses[GroupId(e.getBucketId(), copy.getChecksum(), -1)].
+ push_back(copy);
+ }
+ }
+ }
+}
+
+bool
+GetOperation::hasConsistentCopies() const
+{
+ return _responses.size() == 1;
+}
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/operations/external/getoperation.h b/storage/src/vespa/storage/distributor/operations/external/getoperation.h
new file mode 100644
index 00000000000..6af02abe144
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/getoperation.h
@@ -0,0 +1,111 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storageapi/defs.h>
+#include <vespa/storage/distributor/operations/operation.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+
+namespace document {
+class Document;
+}
+
+namespace storage {
+
+namespace api {
+class GetCommand;
+}
+
+class PersistenceOperationMetricSet;
+
+namespace distributor {
+
+class DistributorComponent;
+
+class GetOperation : public Operation
+{
+public:
+ GetOperation(DistributorComponent& manager,
+ const std::shared_ptr<api::GetCommand> & msg,
+ PersistenceOperationMetricSet& metric);
+
+ void onClose(DistributorMessageSender& sender);
+
+ void onStart(DistributorMessageSender& sender);
+
+ void onReceive(DistributorMessageSender& sender, const std::shared_ptr<api::StorageReply> & msg);
+
+ const char* getName() const { return "get"; }
+
+ std::string getStatus() const { return ""; }
+
+ bool hasConsistentCopies() const;
+
+private:
+ class GroupId {
+ public:
+ // Node should be set only if bucket is incomplete
+ GroupId(const document::BucketId& id, uint32_t checksum, int node);
+
+ bool operator<(const GroupId& other) const;
+
+ bool operator==(const GroupId& other) const;
+
+ const document::BucketId& getBucketId() const { return _id; }
+
+ int getNode() const { return _node; }
+
+ private:
+ document::BucketId _id;
+ uint32_t _checksum;
+ int _node;
+ };
+
+ class BucketChecksumGroup {
+ public:
+ BucketChecksumGroup(const BucketCopy& c) :
+ copy(c),
+ sent(0), received(false), returnCode(api::ReturnCode::OK) {};
+
+ BucketCopy copy;
+ api::StorageMessage::Id sent;
+ bool received;
+ api::ReturnCode returnCode;
+ };
+
+ typedef std::vector<BucketChecksumGroup> GroupVector;
+
+ // Organize the different copies by bucket/checksum pairs. We should
+ // try to request GETs from each bucket and each different checksum
+ // within that bucket.
+ std::map<GroupId, GroupVector> _responses;
+
+ DistributorComponent& _manager;
+
+ std::shared_ptr<api::GetCommand> _msg;
+
+ api::ReturnCode _returnCode;
+ std::shared_ptr<document::Document> _doc;
+
+ api::Timestamp _lastModified;
+
+ PersistenceOperationMetricSet& _metric;
+
+ void sendReply(DistributorMessageSender& sender);
+ bool sendForChecksum(DistributorMessageSender& sender, const document::BucketId& id, GroupVector& res);
+
+ void assignTargetNodeGroups();
+ bool copyIsOnLocalNode(const BucketCopy&) const;
+ /**
+ * Returns the vector index of the target to send to, or -1 if none
+ * could be found (i.e. all targets have already been sent to).
+ */
+ int findBestUnsentTarget(const GroupVector& candidates) const;
+};
+
+}
+
+
+}
+
+
diff --git a/storage/src/vespa/storage/distributor/operations/external/multioperationoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/multioperationoperation.cpp
new file mode 100644
index 00000000000..ab4bdfbd3dd
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/multioperationoperation.cpp
@@ -0,0 +1,246 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storage/distributor/operations/external/multioperationoperation.h>
+#include <vespa/storage/distributor/operations/external/putoperation.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/vdslib/container/writabledocumentlist.h>
+
+LOG_SETUP(".distributor.callback.doc.multioperation");
+
+
+using namespace storage::distributor;
+using namespace storage;
+
+MultiOperationOperation::MultiOperationOperation(
+ DistributorComponent& manager,
+ const std::shared_ptr<api::MultiOperationCommand> & msg,
+ PersistenceOperationMetricSet& metric)
+ : Operation(),
+ _reply(new api::MultiOperationReply(*msg)),
+ _trackerInstance(metric, _reply, manager),
+ _tracker(_trackerInstance),
+ _msg(msg),
+ _manager(manager),
+ _minUseBits(manager.getDistributor().getConfig().getMinimalBucketSplit())
+{
+};
+
+bool
+MultiOperationOperation::sendToBucket(
+ BucketDatabase::Entry& e,
+ std::shared_ptr<api::MultiOperationCommand> moCommand)
+{
+ std::vector<uint16_t> targetNodes;
+ std::vector<MessageTracker::ToSend> createBucketBatch;
+
+ if (PutOperation::checkCreateBucket(_manager.getDistribution(),
+ _manager.getClusterState(),
+ e,
+ targetNodes,
+ createBucketBatch,
+ *moCommand))
+ {
+ _manager.getBucketDatabase().update(e);
+ }
+
+ if (createBucketBatch.size()) {
+ _tracker.queueMessageBatch(createBucketBatch);
+ }
+
+ std::vector<MessageTracker::ToSend> messages;
+
+ for (uint32_t i = 0; i < targetNodes.size(); i++) {
+ std::shared_ptr<api::MultiOperationCommand> snd(
+ new api::MultiOperationCommand(*moCommand));
+ copyMessageSettings(*moCommand, *snd);
+ messages.push_back(MessageTracker::ToSend(snd, targetNodes[i]));
+ }
+
+ _tracker.queueMessageBatch(messages);
+
+ return true;
+}
+
+typedef std::vector<vdslib::DocumentList::Entry> EntryVector;
+
+uint32_t
+MultiOperationOperation::getMinimumUsedBits(const vdslib::DocumentList& opList) const
+{
+ uint32_t splitBit = 58;
+ uint64_t splitMask = 0;
+ document::BucketId refBucket;
+
+ for (uint32_t i=0; i< splitBit; ++i) {
+ splitMask = (splitMask << 1) | 1;
+ }
+
+ //iterate through operations to find which bucketId they belong to
+ for (vdslib::DocumentList::const_iterator operationIt = opList.begin();
+ operationIt != opList.end();
+ operationIt++)
+ {
+ document::DocumentId docId = operationIt->getDocumentId();
+ document::BucketId bucketId(
+ _manager.getBucketIdFactory().getBucketId(docId));
+
+ if (refBucket.getRawId() == 0) {
+ refBucket = bucketId;
+ } else {
+ while ((bucketId.getRawId() & splitMask) != (refBucket.getRawId() & splitMask)) {
+ --splitBit;
+ splitMask = splitMask >> 1;
+ }
+ }
+ }
+
+ return splitBit;
+}
+
+namespace {
+
+struct BucketOperationList {
+ BucketDatabase::Entry entry;
+ EntryVector operations;
+};
+
+}
+
+void
+MultiOperationOperation::onStart(DistributorMessageSender& sender)
+{
+ lib::ClusterState systemState = _manager.getClusterState();
+
+ // Don't do anything if all nodes are down.
+ bool up = false;
+ for (uint16_t i = 0; i < systemState.getNodeCount(lib::NodeType::STORAGE); i++) {
+ if (_manager.storageNodeIsUp(i)) {
+ up = true;
+ break;
+ }
+ }
+
+ if (!up) {
+ _tracker.fail(sender, api::ReturnCode(api::ReturnCode::NOT_CONNECTED, "Can't perform operations: No storage nodes available"));
+ return;
+ }
+
+ const vdslib::DocumentList& opList= _msg->getOperations();
+ LOG(debug, "Received MultiOperation message with %d operations", opList.size());
+ std::map<document::BucketId, BucketOperationList> bucketMap;
+
+ if ((_manager.getDistributor().getConfig().getSplitCount() != 0 && opList.size() > _manager.getDistributor().getConfig().getSplitCount() / 3) ||
+ (_manager.getDistributor().getConfig().getSplitSize() != 0 && opList.getBufferSize() > _manager.getDistributor().getConfig().getSplitSize() / 3)) {
+ _minUseBits = getMinimumUsedBits(opList);
+ }
+
+ //iterate through operations to find which bucketId they belong to
+ for (vdslib::DocumentList::const_iterator operationIt = opList.begin();
+ operationIt != opList.end();
+ operationIt++)
+ {
+ if (operationIt->valid()) {
+ document::DocumentId docId = operationIt->getDocumentId();
+ document::BucketId bucketId(
+ _manager.getBucketIdFactory().getBucketId(docId));
+
+ LOG(debug, "Operation with documentid %s mapped to bucketid %s", docId.toString().c_str(), bucketId.toString().c_str());
+
+ // OK, we have a bucket ID, must now know which buckets this belongs
+ // to
+ std::vector<BucketDatabase::Entry> entries;
+ _manager.getBucketDatabase().getParents(bucketId, entries);
+
+ if (entries.empty()) {
+ entries.push_back(_manager.createAppropriateBucket(bucketId));
+ }
+
+ for (uint32_t i = 0; i < entries.size(); ++i) {
+ bucketMap[entries[i].getBucketId()].entry = entries[i];
+ bucketMap[entries[i].getBucketId()].operations.push_back(*operationIt);
+
+ LOG(debug, "Operation with flags %d must go to bucket %s",
+ operationIt->getFlags(), entries[i].toString().c_str());
+ }
+ }
+ }
+
+ LOG(debug,
+ "MultiOperation has operations for %lu bucketIds",
+ (unsigned long)bucketMap.size());
+
+ uint64_t highestTimestamp = 0;
+
+ //iterate through the map of <bucket, vector<Entry>>
+ for (std::map<document::BucketId, BucketOperationList>::iterator bucketIt =
+ bucketMap.begin();
+ bucketIt != bucketMap.end();
+ bucketIt++)
+ {
+ LOG(debug, "Iterating through bucketMap, bucket %s", bucketIt->first.toString().c_str());
+ //get the size of the buffer large enough to hold the entries that
+ //must go to this bucketId
+ uint32_t blockSize = 4; //4 bytes initially for length
+
+ EntryVector& v = bucketIt->second.operations;
+ for (EntryVector::iterator entryIt = v.begin();
+ entryIt != v.end();
+ entryIt++) {
+ blockSize += entryIt->getSerializedSize();
+ }
+ assert(blockSize > 4);
+
+ //now create a MultiOperationCommand with the new DocumentList
+ std::shared_ptr<api::MultiOperationCommand>
+ command(new api::MultiOperationCommand(
+ _manager.getTypeRepo(),
+ bucketIt->first, blockSize));
+ copyMessageSettings(*_msg, *command);
+
+ LOG(debug, "Block size %d", blockSize);
+ vdslib::WritableDocumentList& block = command->getOperations();
+
+ //iterate through the entries, and add them to the new DocumentList
+ for (EntryVector::iterator entryIt = v.begin(); entryIt != v.end(); entryIt++)
+ {
+ uint64_t ts;
+ if(!_msg->keepTimeStamps()){
+ ts = _manager.getUniqueTimestamp();
+ }
+ else{
+ ts = entryIt->getTimestamp();
+ }
+
+ if (ts > highestTimestamp) {
+ highestTimestamp = ts;
+ }
+ block.addEntry(*entryIt, ts);
+
+ LOG(debug, "Entry size is %d", block.size());
+ }
+
+ sendToBucket(bucketIt->second.entry, command);
+ }
+
+ _tracker.flushQueue(sender);
+
+ _msg = std::shared_ptr<api::MultiOperationCommand>();
+ _reply->setHighestModificationTimestamp(highestTimestamp);
+};
+
+void
+MultiOperationOperation::onReceive(DistributorMessageSender& sender, const std::shared_ptr<api::StorageReply> & msg)
+{
+ _tracker.receiveReply(sender, static_cast<api::BucketInfoReply&>(*msg));
+}
+
+void
+MultiOperationOperation::onClose(DistributorMessageSender& sender)
+{
+ _tracker.fail(sender, api::ReturnCode(api::ReturnCode::ABORTED, "Process is shutting down"));
+}
diff --git a/storage/src/vespa/storage/distributor/operations/external/multioperationoperation.h b/storage/src/vespa/storage/distributor/operations/external/multioperationoperation.h
new file mode 100644
index 00000000000..95f31537dfb
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/multioperationoperation.h
@@ -0,0 +1,61 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/persistencemessagetracker.h>
+#include <vespa/storageapi/messageapi/returncode.h>
+#include <vespa/vdslib/container/writabledocumentlist.h>
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+
+namespace document {
+class Document;
+}
+
+namespace storage {
+
+namespace api {
+class CreateBucketReply;
+class MultiOperationCommand;
+}
+
+namespace distributor {
+
+class MultiOperationOperation : public Operation
+{
+public:
+ MultiOperationOperation(DistributorComponent& manager,
+ const std::shared_ptr<api::MultiOperationCommand> & msg,
+ PersistenceOperationMetricSet& metric);
+
+ void onStart(DistributorMessageSender& sender);
+
+ const char* getName() const { return "multioperation"; };
+
+ std::string getStatus() const { return ""; };
+
+ void onReceive(DistributorMessageSender& sender, const std::shared_ptr<api::StorageReply> &);
+
+ void onClose(DistributorMessageSender& sender);
+private:
+ std::shared_ptr<api::MultiOperationReply> _reply;
+
+ PersistenceMessageTrackerImpl _trackerInstance;
+ PersistenceMessageTracker& _tracker;
+
+ std::shared_ptr<api::MultiOperationCommand> _msg;
+
+ DistributorComponent& _manager;
+
+ uint32_t _minUseBits;
+
+ uint32_t getMinimumUsedBits(const vdslib::DocumentList& opList) const;
+
+ bool sendToBucket(BucketDatabase::Entry& e,
+ std::shared_ptr<api::MultiOperationCommand> moCommand);
+};
+
+}
+
+
+}
+
+
diff --git a/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp
new file mode 100644
index 00000000000..51198043775
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/putoperation.cpp
@@ -0,0 +1,375 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/operations/external/putoperation.h>
+
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/activecopy.h>
+#include <vespa/storage/distributor/distributorcomponent.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storage/distributor/operationtargetresolverimpl.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/vdslib/distribution/idealnodecalculatorimpl.h>
+
+LOG_SETUP(".distributor.callback.doc.put");
+
+
+using namespace storage::distributor;
+using namespace storage;
+
+PutOperation::PutOperation(DistributorComponent& manager,
+ const std::shared_ptr<api::PutCommand> & msg,
+ PersistenceOperationMetricSet& metric)
+ : Operation(),
+ _trackerInstance(metric,
+ std::shared_ptr<api::BucketInfoReply>(new api::PutReply(*msg)),
+ manager,
+ msg->getTimestamp()),
+ _tracker(_trackerInstance),
+ _msg(msg),
+ _manager(manager)
+{
+};
+
+namespace {
+
+bool hasNode(const std::vector<uint16_t>& vec, uint16_t value) {
+ for (uint32_t i = 0; i < vec.size(); i++) {
+ if (vec[i] == value) {
+ return true;
+ }
+ }
+
+ return false;
+};
+
+}
+
+void
+PutOperation::getTargetNodes(const std::vector<uint16_t>& idealNodes,
+ std::vector<uint16_t>& targetNodes,
+ std::vector<uint16_t>& createNodes,
+ const BucketInfo& bucketInfo,
+ uint32_t redundancy)
+{
+ // First insert all nodes that are trusted or already in the ideal state.
+ for (uint32_t i = 0; i < bucketInfo.getNodeCount(); i++) {
+ if (bucketInfo.getNodeRef(i).trusted() || hasNode(idealNodes,bucketInfo.getNodeRef(i).getNode())) {
+ LOG(spam, "Adding target node %u with %s since it's trusted or in ideal state",
+ i, bucketInfo.getNodeRef(i).toString().c_str());
+ targetNodes.push_back(bucketInfo.getNodeRef(i).getNode());
+ }
+ }
+
+ // Then insert all nodes that already exist if we need them.
+ for (uint32_t i = 0; targetNodes.size() < redundancy && i < bucketInfo.getNodeCount(); i++) {
+ if (!hasNode(targetNodes, bucketInfo.getNodeRef(i).getNode())) {
+ LOG(spam, "Adding target node %u with %s since it already exists",
+ i, bucketInfo.getNodeRef(i).toString().c_str());
+ targetNodes.push_back(bucketInfo.getNodeRef(i).getNode());
+ }
+ }
+
+ // Then add stuff from ideal state.
+ for (uint32_t i = 0; targetNodes.size() < redundancy && i < idealNodes.size(); i++) {
+ if (!hasNode(targetNodes, idealNodes[i])) {
+ targetNodes.push_back(idealNodes[i]);
+ LOG(spam, "Adding target+create node %u it's in ideal state",
+ idealNodes[i]);
+ createNodes.push_back(idealNodes[i]);
+ }
+ }
+
+ std::sort(targetNodes.begin(), targetNodes.end());
+ std::sort(createNodes.begin(), createNodes.end());
+}
+
+// FIXME: deprecated! remove as soon as multoperationoperation is merely
+// a haunting memory of the past since it's only used by that component!
+bool
+PutOperation::checkCreateBucket(const lib::Distribution& dist,
+ const lib::ClusterState& state,
+ BucketDatabase::Entry& entry,
+ std::vector<uint16_t>& targetNodes,
+ std::vector<MessageTracker::ToSend>& messagesToSend,
+ const api::StorageCommand& originalCommand)
+{
+ BucketInfo& info = entry.getBucketInfo();
+
+ std::vector<uint16_t> createNodes;
+ std::vector<uint16_t> idealNodes(
+ dist.getIdealStorageNodes(state, entry.getBucketId(), "ui"));
+
+ getTargetNodes(idealNodes,
+ targetNodes,
+ createNodes,
+ info,
+ dist.getRedundancy());
+
+ ActiveList active(ActiveCopy::calculate(idealNodes, dist, entry));
+ LOG(debug, "Active copies for bucket %s: %s",
+ entry.getBucketId().toString().c_str(), active.toString().c_str());
+ // Send create buckets for all nodes in ideal state where we don't
+ // currently have copies.
+ for (uint32_t i = 0; i < createNodes.size(); i++) {
+ std::shared_ptr<api::CreateBucketCommand> cbc(
+ new api::CreateBucketCommand(entry.getBucketId()));
+ if (active.contains(createNodes[i])) {
+ BucketCopy copy(*entry->getNode(createNodes[i]));
+ copy.setActive(true);
+ entry->updateNode(copy);
+ cbc->setActive(true);
+ }
+ LOG(debug, "Creating bucket on node %u: %s",
+ createNodes[i], cbc->toString().c_str());
+
+ copyMessageSettings(originalCommand, *cbc);
+ messagesToSend.push_back(MessageTracker::ToSend(cbc, createNodes[i]));
+ }
+
+ // All nodes that we are not feeding to now will no longer be trusted.
+ // TODO: Refactor?
+ bool mustWrite = false;
+ for (uint32_t i = 0; i < info.getNodeCount(); i++) {
+ bool found = false;
+ for (uint32_t j = 0; j < targetNodes.size(); j++) {
+ if (info.getNodeRef(i).getNode() == targetNodes[j]) {
+ LOG(spam,
+ "Found matching target node %u in %s",
+ targetNodes[i],
+ info.getNodeRef(i).toString().c_str());
+ found = true;
+ break;
+ }
+ }
+
+ if (!found && info.getNodeRef(i).trusted()) {
+ LOG(spam,
+ "Setting mustWrite=true since %s is trusted",
+ info.getNodeRef(i).toString().c_str());
+
+ info.clearTrusted(info.getNodeRef(i).getNode());
+ mustWrite = true;
+ }
+ }
+
+ return mustWrite;
+}
+
+void
+PutOperation::insertDatabaseEntryAndScheduleCreateBucket(
+ const OperationTargetList& copies,
+ bool setOneActive,
+ const api::StorageCommand& originalCommand,
+ std::vector<MessageTracker::ToSend>& messagesToSend)
+{
+ document::BucketId lastBucket;
+ bool multipleBuckets = false;
+ for (uint32_t i=0, n=copies.size(); i<n; ++i) {
+ if (!copies[i].isNewCopy()) continue;
+ if (lastBucket.getRawId() != 0 && copies[i].getBucketId() != lastBucket)
+ {
+ multipleBuckets = true;
+ }
+ lastBucket = copies[i].getBucketId();
+ // Fake that we have a non-empty bucket so it isn't deleted.
+ // Copy is inserted with timestamp 0 such that any actual bucket info
+ // subsequently arriving from the storage node will always overwrite it.
+ BucketCopy copy(BucketCopy::recentlyCreatedCopy(
+ 0, copies[i].getNode().getIndex()));
+ _manager.updateBucketDatabase(lastBucket, copy,
+ DatabaseUpdate::CREATE_IF_NONEXISTING);
+ }
+ ActiveList active;
+ if (setOneActive) {
+ assert(!multipleBuckets);
+ (void) multipleBuckets;
+ BucketDatabase::Entry entry(
+ _manager.getBucketDatabase().get(lastBucket));
+ std::vector<uint16_t> idealState(
+ _manager.getDistribution().getIdealStorageNodes(
+ _manager.getClusterState(), lastBucket, "ui"));
+ active = ActiveCopy::calculate(idealState, _manager.getDistribution(),
+ entry);
+ LOG(debug, "Active copies for bucket %s: %s",
+ entry.getBucketId().toString().c_str(), active.toString().c_str());
+ for (uint32_t i=0; i<active.size(); ++i) {
+ BucketCopy copy(*entry->getNode(active[i].nodeIndex));
+ copy.setActive(true);
+ entry->updateNode(copy);
+ }
+ _manager.getBucketDatabase().update(entry);
+ }
+ for (uint32_t i=0, n=copies.size(); i<n; ++i) {
+ if (!copies[i].isNewCopy()) continue;
+ std::shared_ptr<api::CreateBucketCommand> cbc(
+ new api::CreateBucketCommand(copies[i].getBucketId()));
+ if (setOneActive && active.contains(copies[i].getNode().getIndex())) {
+ cbc->setActive(true);
+ }
+ LOG(debug, "Creating bucket on node %u: %s",
+ copies[i].getNode().getIndex(), cbc->toString().c_str());
+
+ copyMessageSettings(originalCommand, *cbc);
+ messagesToSend.push_back(MessageTracker::ToSend(
+ cbc, copies[i].getNode().getIndex()));
+ }
+}
+
+void
+PutOperation::sendPutToBucketOnNode(
+ const document::BucketId& bucketId,
+ const uint16_t node,
+ std::vector<PersistenceMessageTracker::ToSend>& putBatch)
+{
+ std::shared_ptr<api::PutCommand> command(
+ new api::PutCommand(
+ bucketId,
+ _msg->getDocument(),
+ _msg->getTimestamp()));
+ LOG(debug,
+ "Sending %s to node %u",
+ command->toString().c_str(),
+ node);
+
+ copyMessageSettings(*_msg, *command);
+ command->setUpdateTimestamp(_msg->getUpdateTimestamp());
+ command->setCondition(_msg->getCondition());
+ putBatch.push_back(MessageTracker::ToSend(command, node));
+
+}
+
+void
+PutOperation::onStart(DistributorMessageSender& sender)
+{
+ document::BucketIdFactory bucketIdFactory;
+ document::BucketId bid = bucketIdFactory.getBucketId(_msg->getDocumentId());
+
+ LOG(debug,
+ "Received PUT %s for bucket %s",
+ _msg->getDocumentId().toString().c_str(),
+ bid.toString().c_str());
+
+ lib::ClusterState systemState = _manager.getClusterState();
+
+ // Don't do anything if all nodes are down.
+ bool up = false;
+ for (uint16_t i = 0; i < systemState.getNodeCount(lib::NodeType::STORAGE); i++) {
+ if (systemState.getNodeState(lib::Node(lib::NodeType::STORAGE, i))
+ .getState().oneOf(_manager.getDistributor().getStorageNodeUpStates()))
+ {
+ up = true;
+ }
+ }
+
+ if (up) {
+ std::vector<document::BucketId> bucketsToCheckForSplit;
+
+ lib::IdealNodeCalculatorImpl idealNodeCalculator;
+ idealNodeCalculator.setDistribution(_manager.getDistribution());
+ idealNodeCalculator.setClusterState(_manager.getClusterState());
+ OperationTargetResolverImpl targetResolver(
+ _manager.getBucketDatabase(),
+ idealNodeCalculator,
+ _manager.getDistributor().getConfig().getMinimalBucketSplit(),
+ _manager.getDistribution().getRedundancy());
+ OperationTargetList targets(targetResolver.getTargets(
+ OperationTargetResolver::PUT, bid));
+
+ for (size_t i = 0; i < targets.size(); ++i) {
+ if (_manager.getDistributor().getPendingMessageTracker().
+ hasPendingMessage(targets[i].getNode().getIndex(),
+ targets[i].getBucketId(),
+ api::MessageType::DELETEBUCKET_ID))
+ {
+ _tracker.fail(sender, api::ReturnCode(api::ReturnCode::BUCKET_DELETED,
+ "Bucket was being deleted while we got a PUT, failing "
+ "operation to be safe"));
+ return;
+ }
+ }
+
+ // Mark any entries we're not feeding to as not trusted.
+ std::vector<BucketDatabase::Entry> entries;
+ _manager.getBucketDatabase().getParents(bid, entries);
+
+ std::vector<PersistenceMessageTracker::ToSend> createBucketBatch;
+ if (targets.hasAnyNewCopies()) {
+ insertDatabaseEntryAndScheduleCreateBucket(
+ targets,
+ shouldImplicitlyActivateReplica(targets),
+ *_msg,
+ createBucketBatch);
+ }
+
+ if (!createBucketBatch.empty()) {
+ _tracker.queueMessageBatch(createBucketBatch);
+ }
+
+ std::vector<PersistenceMessageTracker::ToSend> putBatch;
+
+ // Now send PUTs
+ for (uint32_t i = 0; i < targets.size(); i++) {
+ const OperationTarget& target(targets[i]);
+ sendPutToBucketOnNode(target.getBucketId(), target.getNode().getIndex(),
+ putBatch);
+ }
+
+ if (putBatch.size()) {
+ _tracker.queueMessageBatch(putBatch);
+ } else {
+ const char* error = "Can't store document: No storage nodes available";
+ LOG(debug, "%s", error);
+ _tracker.fail(sender,
+ api::ReturnCode(api::ReturnCode::NOT_CONNECTED, error));
+ return;
+ }
+
+ // Check whether buckets are large enough to be split.
+ // TODO(vekterli): only check entries for sendToExisting?
+ for (uint32_t i = 0; i < entries.size(); ++i) {
+ _manager.getDistributor().checkBucketForSplit(
+ entries[i],
+ _msg->getPriority());
+ }
+
+ _tracker.flushQueue(sender);
+ } else {
+ const char* error = "Can't store document: No storage nodes available";
+ LOG(debug, "%s", error);
+ _tracker.fail(sender, api::ReturnCode(api::ReturnCode::NOT_CONNECTED, error));
+ }
+
+ _msg = std::shared_ptr<api::PutCommand>();
+}
+
+bool
+PutOperation::shouldImplicitlyActivateReplica(
+ const OperationTargetList& targets) const
+{
+ const auto& config(_manager.getDistributor().getConfig());
+ if (config.isBucketActivationDisabled()) {
+ return false;
+ }
+ return !targets.hasAnyExistingCopies();
+}
+
+void
+PutOperation::onReceive(DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply> & msg)
+{
+ LOG(debug, "Received %s", msg->toString(true).c_str());
+ _tracker.receiveReply(sender, static_cast<api::BucketInfoReply&>(*msg));
+}
+
+void
+PutOperation::onClose(DistributorMessageSender& sender)
+{
+ const char* error = "Process is shutting down";
+ LOG(debug, "%s", error);
+ _tracker.fail(sender, api::ReturnCode(api::ReturnCode::ABORTED, error));
+}
+
diff --git a/storage/src/vespa/storage/distributor/operations/external/putoperation.h b/storage/src/vespa/storage/distributor/operations/external/putoperation.h
new file mode 100644
index 00000000000..677b2f85d4e
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/putoperation.h
@@ -0,0 +1,83 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/storage/distributor/operations/operation.h>
+#include <vespa/storageapi/messageapi/returncode.h>
+#include <vespa/storage/distributor/persistencemessagetracker.h>
+#include <vespa/storage/distributor/operationtargetresolver.h>
+
+namespace document {
+ class Document;
+}
+namespace storage {
+namespace lib {
+ class Distribution;
+}
+namespace api {
+ class CreateBucketReply;
+ class PutCommand;
+}
+namespace distributor {
+
+class PutOperation : public Operation
+{
+public:
+ PutOperation(DistributorComponent& manager,
+ const std::shared_ptr<api::PutCommand> & msg,
+ PersistenceOperationMetricSet& metric);
+
+ void onStart(DistributorMessageSender& sender);
+
+ const char* getName() const { return "put"; };
+
+ std::string getStatus() const { return ""; };
+
+ void onReceive(DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply> &);
+
+ void onClose(DistributorMessageSender& sender);
+
+ /**
+ * Gets the ideal state of the given bucket, and adds all nodes from the
+ * ideal state to targetNodes. Also schedules create bucket messages for
+ * all buckets currently not in the nodes list, and sets nodes in the node
+ * list not in the ideal state to untrusted.
+ */
+ static bool checkCreateBucket(const lib::Distribution& distribution,
+ const lib::ClusterState& state,
+ BucketDatabase::Entry& e,
+ std::vector<uint16_t>& targetNodes,
+ std::vector<MessageTracker::ToSend>& messagesToSend,
+ const api::StorageCommand& originalCommand);
+
+ static void getTargetNodes(const std::vector<uint16_t>& idealNodes,
+ std::vector<uint16_t>& targetNodes,
+ std::vector<uint16_t>& createNodes,
+ const BucketInfo& bucketInfo,
+ uint32_t redundancy);
+private:
+ PersistenceMessageTrackerImpl _trackerInstance;
+ PersistenceMessageTracker& _tracker;
+
+ void insertDatabaseEntryAndScheduleCreateBucket(
+ const OperationTargetList& copies,
+ bool setOneActive,
+ const api::StorageCommand& originalCommand,
+ std::vector<MessageTracker::ToSend>& messagesToSend);
+
+ void sendPutToBucketOnNode(
+ const document::BucketId& bucketId,
+ const uint16_t node,
+ std::vector<PersistenceMessageTracker::ToSend>& putBatch);
+
+ bool shouldImplicitlyActivateReplica(
+ const OperationTargetList& targets) const;
+
+ std::shared_ptr<api::PutCommand> _msg;
+
+ DistributorComponent& _manager;
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/operations/external/removelocationoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/removelocationoperation.cpp
new file mode 100644
index 00000000000..5adaa2b8cc1
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/removelocationoperation.cpp
@@ -0,0 +1,118 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/document/bucket/bucketselector.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/document/repo/documenttyperepo.h>
+#include <vespa/document/select/parser.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storage/distributor/operations/external/removelocationoperation.h>
+#include <vespa/storageapi/message/removelocation.h>
+
+LOG_SETUP(".distributor.callback.doc.removelocation");
+
+
+using namespace storage::distributor;
+using namespace storage;
+
+RemoveLocationOperation::RemoveLocationOperation(
+ DistributorComponent& manager,
+ const std::shared_ptr<api::RemoveLocationCommand> & msg,
+ PersistenceOperationMetricSet& metric)
+ : Operation(),
+ _trackerInstance(metric,
+ std::shared_ptr<api::BucketInfoReply>(new api::RemoveLocationReply(*msg)),
+ manager,
+ 0),
+ _tracker(_trackerInstance),
+ _msg(msg),
+ _manager(manager)
+{
+}
+
+int
+RemoveLocationOperation::getBucketId(
+ DistributorComponent& manager,
+ const api::RemoveLocationCommand& cmd, document::BucketId& bid)
+{
+ document::DocumentTypeRepo::SP repo =
+ manager.getTypeRepo();
+ document::select::Parser parser(
+ *repo, manager.getBucketIdFactory());
+
+ document::BucketSelector bucketSel(manager.getBucketIdFactory());
+ std::unique_ptr<document::BucketSelector::BucketVector> exprResult
+ = bucketSel.select(*parser.parse(cmd.getDocumentSelection()));
+
+ if (!exprResult.get()) {
+ return 0;
+ } else if (exprResult->size() != 1) {
+ return exprResult->size();
+ } else {
+ bid = (*exprResult)[0];
+ return 1;
+ }
+}
+
+void
+RemoveLocationOperation::onStart(DistributorMessageSender& sender)
+{
+ document::BucketId bid;
+ int count = getBucketId(_manager, *_msg, bid);
+
+ if (count != 1) {
+ _tracker.fail(sender,
+ api::ReturnCode(api::ReturnCode::ILLEGAL_PARAMETERS,
+ "Document selection could not be mapped to a single location"));
+ }
+
+ std::vector<BucketDatabase::Entry> entries;
+ _manager.getBucketDatabase().getAll(bid, entries);
+
+ bool sent = false;
+ for (uint32_t j = 0; j < entries.size(); ++j) {
+ const BucketDatabase::Entry& e = entries[j];
+
+ std::vector<uint16_t> nodes = e->getNodes();
+
+ for (uint32_t i = 0; i < nodes.size(); i++) {
+ std::shared_ptr<api::RemoveLocationCommand> command(
+ new api::RemoveLocationCommand(
+ _msg->getDocumentSelection(),
+ e.getBucketId()));
+
+ copyMessageSettings(*_msg, *command);
+ _tracker.queueCommand(command, nodes[i]);
+ sent = true;
+ }
+ }
+
+ if (!sent) {
+ LOG(debug,
+ "Remove location %s failed since no available nodes found. "
+ "System state is %s",
+ _msg->toString().c_str(),
+ _manager.getClusterState().toString().c_str());
+
+ _tracker.fail(sender, api::ReturnCode(api::ReturnCode::OK));
+ } else {
+ _tracker.flushQueue(sender);
+ }
+};
+
+
+void
+RemoveLocationOperation::onReceive(
+ DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply> & msg)
+{
+ _tracker.receiveReply(sender, static_cast<api::BucketInfoReply&>(*msg));
+}
+
+void
+RemoveLocationOperation::onClose(DistributorMessageSender& sender)
+{
+ _tracker.fail(sender, api::ReturnCode(api::ReturnCode::ABORTED,
+ "Process is shutting down"));
+}
diff --git a/storage/src/vespa/storage/distributor/operations/external/removelocationoperation.h b/storage/src/vespa/storage/distributor/operations/external/removelocationoperation.h
new file mode 100644
index 00000000000..9c0c8e21e91
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/removelocationoperation.h
@@ -0,0 +1,48 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/operations/operation.h>
+#include <vespa/storage/distributor/persistencemessagetracker.h>
+
+namespace storage {
+
+namespace api {
+class RemoveLocationCommand;
+}
+
+namespace distributor {
+
+class RemoveLocationOperation : public Operation
+{
+public:
+ RemoveLocationOperation(DistributorComponent& manager,
+ const std::shared_ptr<api::RemoveLocationCommand> & msg,
+ PersistenceOperationMetricSet& metric);
+
+ void onStart(DistributorMessageSender& sender);
+
+ static int getBucketId(DistributorComponent& manager,
+ const api::RemoveLocationCommand& cmd,
+ document::BucketId& id);
+
+ const char* getName() const { return "removelocation"; };
+
+ std::string getStatus() const { return ""; };
+
+ void onReceive(DistributorMessageSender& sender, const std::shared_ptr<api::StorageReply> &);
+
+ void onClose(DistributorMessageSender& sender);
+
+private:
+ PersistenceMessageTrackerImpl _trackerInstance;
+ PersistenceMessageTracker& _tracker;
+
+ std::shared_ptr<api::RemoveLocationCommand> _msg;
+
+ DistributorComponent& _manager;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/operations/external/removeoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/removeoperation.cpp
new file mode 100644
index 00000000000..b6d575071a2
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/removeoperation.cpp
@@ -0,0 +1,103 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/operations/external/removeoperation.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+
+LOG_SETUP(".distributor.operation.external.remove");
+
+
+using namespace storage::distributor;
+using namespace storage;
+
+RemoveOperation::RemoveOperation(DistributorComponent& manager,
+ const std::shared_ptr<api::RemoveCommand> & msg,
+ PersistenceOperationMetricSet& metric)
+ : Operation(),
+ _trackerInstance(metric,
+ std::shared_ptr<api::BucketInfoReply>(new api::RemoveReply(*msg)),
+ manager, msg->getTimestamp()),
+ _tracker(_trackerInstance),
+ _msg(msg),
+ _manager(manager)
+{
+}
+
+void
+RemoveOperation::onStart(DistributorMessageSender& sender)
+{
+ LOG(spam,
+ "Started remove on document %s",
+ _msg->getDocumentId().toString().c_str());
+
+ document::BucketId bucketId(
+ _manager.getBucketIdFactory().getBucketId(
+ _msg->getDocumentId()));
+
+ std::vector<BucketDatabase::Entry> entries;
+ _manager.getBucketDatabase().getParents(bucketId, entries);
+
+ bool sent = false;
+
+ for (uint32_t j = 0; j < entries.size(); j++) {
+ const BucketDatabase::Entry& e = entries[j];
+ std::vector<MessageTracker::ToSend> messages;
+
+ for (uint32_t i = 0; i < e->getNodeCount(); i++) {
+ std::shared_ptr<api::RemoveCommand> command(new api::RemoveCommand(
+ e.getBucketId(),
+ _msg->getDocumentId(),
+ _msg->getTimestamp()));
+
+ copyMessageSettings(*_msg, *command);
+ command->getTrace().setLevel(_msg->getTrace().getLevel());
+ command->setCondition(_msg->getCondition());
+
+ messages.push_back(
+ MessageTracker::ToSend(command, e->getNodeRef(i).getNode()));
+ sent = true;
+ }
+
+ _tracker.queueMessageBatch(messages);
+ }
+
+ if (!sent) {
+ LOG(debug,
+ "Remove document %s failed since no available nodes found. "
+ "System state is %s",
+ _msg->getDocumentId().toString().c_str(),
+ _manager.getClusterState().toString().c_str());
+
+ _tracker.fail(sender, api::ReturnCode(api::ReturnCode::OK));
+ } else {
+ _tracker.flushQueue(sender);
+ }
+};
+
+
+void
+RemoveOperation::onReceive(DistributorMessageSender& sender, const std::shared_ptr<api::StorageReply> & msg)
+{
+ api::RemoveReply& reply(static_cast<api::RemoveReply&>(*msg));
+
+ if (_tracker.getReply().get()) {
+ api::RemoveReply& replyToSend =
+ static_cast<api::RemoveReply&>(*_tracker.getReply());
+
+
+ if (reply.getOldTimestamp() > replyToSend.getOldTimestamp()) {
+ replyToSend.setOldTimestamp(reply.getOldTimestamp());
+ }
+ }
+
+ _tracker.receiveReply(sender, reply);
+}
+
+void
+RemoveOperation::onClose(DistributorMessageSender& sender)
+{
+ _tracker.fail(sender, api::ReturnCode(api::ReturnCode::ABORTED, "Process is shutting down"));
+}
diff --git a/storage/src/vespa/storage/distributor/operations/external/removeoperation.h b/storage/src/vespa/storage/distributor/operations/external/removeoperation.h
new file mode 100644
index 00000000000..b85170a0920
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/removeoperation.h
@@ -0,0 +1,44 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/operations/operation.h>
+#include <vespa/storage/distributor/persistencemessagetracker.h>
+
+namespace storage {
+
+namespace api {
+class RemoveCommand;
+}
+
+namespace distributor {
+
+class RemoveOperation : public Operation
+{
+public:
+ RemoveOperation(DistributorComponent& manager,
+ const std::shared_ptr<api::RemoveCommand> & msg,
+ PersistenceOperationMetricSet& metric);
+
+ void onStart(DistributorMessageSender& sender);
+
+ const char* getName() const { return "remove"; };
+
+ std::string getStatus() const { return ""; };
+
+ void onReceive(DistributorMessageSender& sender, const std::shared_ptr<api::StorageReply> &);
+
+ void onClose(DistributorMessageSender& sender);
+
+private:
+ PersistenceMessageTrackerImpl _trackerInstance;
+ PersistenceMessageTracker& _tracker;
+
+ std::shared_ptr<api::RemoveCommand> _msg;
+
+ DistributorComponent& _manager;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/operations/external/statbucketlistoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/statbucketlistoperation.cpp
new file mode 100644
index 00000000000..c9f1d157557
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/statbucketlistoperation.cpp
@@ -0,0 +1,65 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <iostream>
+#include <vespa/storageapi/message/stat.h>
+#include <vespa/storage/distributor/operations/external/statbucketlistoperation.h>
+#include <vespa/storage/distributor/maintenance/maintenanceoperationgenerator.h>
+
+namespace storage {
+namespace distributor {
+
+StatBucketListOperation::StatBucketListOperation(
+ const BucketDatabase& bucketDb,
+ const MaintenanceOperationGenerator& generator,
+ uint16_t distributorIndex,
+ const std::shared_ptr<api::GetBucketListCommand>& cmd)
+ : _bucketDb(bucketDb),
+ _generator(generator),
+ _distributorIndex(distributorIndex),
+ _command(cmd)
+{
+}
+
+void
+StatBucketListOperation::getBucketStatus(const BucketDatabase::Entry& entry,
+ std::ostream& ost) const
+{
+ std::vector<MaintenanceOperation::SP> operations(
+ _generator.generateAll(entry.getBucketId()));
+
+ for (uint32_t i = 0; i < operations.size(); ++i) {
+ const MaintenanceOperation& op(*operations[i]);
+ if (i > 0) {
+ ost << ", ";
+ }
+ ost << op.getName() << ": " << op.getDetailedReason();
+ }
+ if (!operations.empty()) {
+ ost << ' ';
+ }
+ ost << "[" << entry->toString() << "]";
+}
+
+void
+StatBucketListOperation::onStart(DistributorMessageSender& sender)
+{
+ api::GetBucketListReply::SP reply(new api::GetBucketListReply(*_command));
+
+ std::vector<BucketDatabase::Entry> entries;
+ _bucketDb.getAll(_command->getBucketId(), entries);
+
+ for (uint32_t i = 0; i < entries.size(); i++) {
+ std::ostringstream ost;
+ ost << "[distributor:" << _distributorIndex << "] ";
+
+ getBucketStatus(entries[i], ost);
+
+ reply->getBuckets().push_back(api::GetBucketListReply::BucketInfo(
+ entries[i].getBucketId(),
+ ost.str()));
+ }
+ sender.sendReply(reply);
+}
+
+}
+}
diff --git a/storage/src/vespa/storage/distributor/operations/external/statbucketlistoperation.h b/storage/src/vespa/storage/distributor/operations/external/statbucketlistoperation.h
new file mode 100644
index 00000000000..a1b4eb110b7
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/statbucketlistoperation.h
@@ -0,0 +1,53 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/operations/operation.h>
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/vespalib/util/sync.h>
+
+namespace storage {
+
+namespace api {
+class GetBucketListCommand;
+}
+
+namespace distributor {
+
+class MaintenanceOperationGenerator;
+
+class StatBucketListOperation : public Operation
+{
+public:
+ StatBucketListOperation(
+ const BucketDatabase& bucketDb,
+ const MaintenanceOperationGenerator& generator,
+ uint16_t distributorIndex,
+ const std::shared_ptr<api::GetBucketListCommand>& cmd);
+ virtual ~StatBucketListOperation() {}
+
+ virtual const char* getName() const { return "statBucketList"; }
+ virtual std::string getStatus() const { return ""; }
+
+ virtual void onStart(DistributorMessageSender& sender);
+ virtual void onReceive(DistributorMessageSender&,
+ const std::shared_ptr<api::StorageReply>&)
+ {
+ // Never called.
+ assert(false);
+ }
+ void onClose(DistributorMessageSender&) {
+ }
+
+private:
+ void getBucketStatus(const BucketDatabase::Entry& entry,
+ std::ostream& os) const;
+
+ const BucketDatabase& _bucketDb;
+ const MaintenanceOperationGenerator& _generator;
+ uint16_t _distributorIndex;
+ std::shared_ptr<api::GetBucketListCommand> _command;
+};
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/operations/external/statbucketoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/statbucketoperation.cpp
new file mode 100644
index 00000000000..b3a7ec4805e
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/statbucketoperation.cpp
@@ -0,0 +1,107 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/operations/external/statbucketoperation.h>
+#include <vespa/log/log.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/distributorcomponent.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storageapi/message/stat.h>
+
+LOG_SETUP(".distributor.callback.statbucket");
+
+namespace storage {
+namespace distributor {
+
+StatBucketOperation::StatBucketOperation(
+ DistributorComponent& manager,
+ const std::shared_ptr<api::StatBucketCommand> & cmd)
+ : Operation(),
+ _manager(manager),
+ _command(cmd)
+{
+}
+
+void
+StatBucketOperation::onClose(DistributorMessageSender& sender)
+{
+ api::StatBucketReply* rep = (api::StatBucketReply*)_command->makeReply().release();
+ rep->setResult(api::ReturnCode(api::ReturnCode::ABORTED, "Process is shutting down"));
+ sender.sendReply(std::shared_ptr<api::StatBucketReply>(rep));
+}
+
+void
+StatBucketOperation::onStart(DistributorMessageSender& sender)
+{
+ std::vector<uint16_t> nodes;
+
+ BucketDatabase::Entry entry(
+ _manager.getBucketDatabase().get(_command->getBucketId()));
+
+ if (entry.valid()) {
+ nodes = entry->getNodes();
+ }
+
+ // If no entries exist, give empty reply
+ if (nodes.size() == 0) {
+ api::StatBucketReply::SP reply(new api::StatBucketReply(*_command, "Bucket was not stored on any nodes."));
+ reply->setResult(api::ReturnCode(api::ReturnCode::OK));
+ sender.sendReply(reply);
+ } else {
+ std::vector<std::shared_ptr<api::StorageCommand> > messages;
+ for (uint32_t i = 0; i < nodes.size(); i++) {
+ std::shared_ptr<api::StatBucketCommand> cmd(
+ new api::StatBucketCommand(
+ _command->getBucketId(),
+ _command->getDocumentSelection()));
+
+ messages.push_back(cmd);
+ _sent[cmd->getMsgId()] = nodes[i];
+ }
+
+ for (uint32_t i = 0; i < nodes.size(); i++) {
+ sender.sendToNode(
+ lib::NodeType::STORAGE,
+ nodes[i],
+ messages[i],
+ true);
+ }
+ }
+};
+
+void
+StatBucketOperation::onReceive(DistributorMessageSender& sender, const std::shared_ptr<api::StorageReply> & msg)
+{
+ assert(msg->getType() == api::MessageType::STATBUCKET_REPLY);
+ api::StatBucketReply& myreply(dynamic_cast<api::StatBucketReply&>(*msg));
+
+ std::map<uint64_t, uint16_t>::iterator found = _sent.find(msg->getMsgId());
+
+ if (found != _sent.end()) {
+ std::ostringstream ost;
+ if (myreply.getResult().getResult() == api::ReturnCode::OK) {
+ ost << "\tBucket information from node " << found->second << ":\n" << myreply.getResults() << "\n\n";
+ } else {
+ ost << "\tBucket information retrieval failed on node " << found->second << ": " << myreply.getResult() << "\n\n";
+ }
+ _results[found->second] = ost.str();
+
+ _sent.erase(found);
+ }
+
+ if (_sent.empty()) {
+ std::ostringstream ost;
+ for (std::map<uint16_t, std::string>::iterator iter = _results.begin();
+ iter != _results.end();
+ iter++) {
+ ost << iter->second;
+ }
+
+ api::StatBucketReply::SP reply(new api::StatBucketReply(*_command, ost.str()));
+ sender.sendReply(reply);
+ }
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/operations/external/statbucketoperation.h b/storage/src/vespa/storage/distributor/operations/external/statbucketoperation.h
new file mode 100644
index 00000000000..23cb629f89f
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/statbucketoperation.h
@@ -0,0 +1,49 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class StatCallback
+ * @ingroup distributor
+ *
+ * @brief Callback class handling StatBucket messages.
+ */
+#pragma once
+
+#include <vespa/storage/distributor/operations/operation.h>
+#include <vespa/vespalib/util/sync.h>
+
+namespace storage {
+
+namespace api {
+class StatBucketCommand;
+}
+
+namespace distributor {
+
+class DistributorComponent;
+
+class StatBucketOperation : public Operation
+{
+public:
+ StatBucketOperation(DistributorComponent& manager,
+ const std::shared_ptr<api::StatBucketCommand> & cmd);
+ virtual ~StatBucketOperation() {};
+
+ virtual const char* getName() const { return "statBucket"; }
+ virtual std::string getStatus() const { return ""; }
+
+ void onClose(DistributorMessageSender& sender);
+
+ virtual void onStart(DistributorMessageSender& sender);
+ virtual void onReceive(DistributorMessageSender& sender, const std::shared_ptr<api::StorageReply> & msg);
+
+private:
+ DistributorComponent& _manager;
+
+ std::shared_ptr<api::StatBucketCommand> _command;
+
+ std::map<uint64_t, uint16_t> _sent;
+ std::map<uint16_t, std::string> _results;
+};
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp
new file mode 100644
index 00000000000..72a6dede161
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.cpp
@@ -0,0 +1,556 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/operations/external/twophaseupdateoperation.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storage/distributor/operations/external/getoperation.h>
+#include <vespa/storage/distributor/operations/external/putoperation.h>
+#include <vespa/storage/distributor/operations/external/updateoperation.h>
+#include <vespa/storageapi/message/batch.h>
+
+LOG_SETUP(".distributor.callback.twophaseupdate");
+
+using namespace std::literals::string_literals;
+
+namespace storage {
+namespace distributor {
+
+TwoPhaseUpdateOperation::TwoPhaseUpdateOperation(
+ DistributorComponent& manager,
+ const std::shared_ptr<api::UpdateCommand>& msg,
+ DistributorMetricSet& metrics)
+ : Operation(),
+ _updateMetric(metrics.updates[msg->getLoadType()]),
+ _putMetric(metrics.update_puts[msg->getLoadType()]),
+ _getMetric(metrics.update_gets[msg->getLoadType()]),
+ _updateCmd(msg),
+ _updateReply(),
+ _manager(manager),
+ _sendState(SendState::NONE_SENT),
+ _mode(Mode::FAST_PATH),
+ _replySent(false)
+{
+ document::BucketIdFactory idFactory;
+ _updateDocBucketId = idFactory.getBucketId(_updateCmd->getDocumentId());
+}
+
+namespace {
+
+struct IntermediateMessageSender : DistributorMessageSender {
+ SentMessageMap& msgMap;
+ std::shared_ptr<Operation> callback;
+ DistributorMessageSender& forward;
+ std::shared_ptr<api::StorageReply> _reply;
+
+ IntermediateMessageSender(
+ SentMessageMap& mm,
+ const std::shared_ptr<Operation>& cb,
+ DistributorMessageSender & fwd)
+ : msgMap(mm), callback(cb), forward(fwd)
+ {
+ }
+
+ virtual void sendCommand(const std::shared_ptr<api::StorageCommand>& cmd) {
+ msgMap.insert(cmd->getMsgId(), callback);
+ forward.sendCommand(cmd);
+ };
+
+ virtual void sendReply(const std::shared_ptr<api::StorageReply>& reply) {
+ _reply = reply;
+ }
+
+ virtual int getDistributorIndex() const {
+ return forward.getDistributorIndex();
+ }
+
+ virtual const std::string& getClusterName() const {
+ return forward.getClusterName();
+ }
+
+ virtual const PendingMessageTracker& getPendingMessageTracker() const {
+ return forward.getPendingMessageTracker();
+ }
+};
+
+}
+
+const char*
+TwoPhaseUpdateOperation::stateToString(SendState state)
+{
+ switch (state) {
+ case SendState::NONE_SENT: return "NONE_SENT";
+ case SendState::UPDATES_SENT: return "UPDATES_SENT";
+ case SendState::GETS_SENT: return "GETS_SENT";
+ case SendState::PUTS_SENT: return "PUTS_SENT";
+ default:
+ assert(!"Unknown state");
+ return "";
+ }
+}
+
+void
+TwoPhaseUpdateOperation::transitionTo(SendState newState)
+{
+ assert(newState != SendState::NONE_SENT);
+ LOG(spam, "Transitioning operation %p state %s -> %s",
+ this, stateToString(_sendState), stateToString(newState));
+ _sendState = newState;
+}
+
+void
+TwoPhaseUpdateOperation::ensureUpdateReplyCreated()
+{
+ if (!_updateReply.get()) {
+ _updateReply = _updateCmd->makeReply();
+ }
+}
+
+void
+TwoPhaseUpdateOperation::sendReply(
+ DistributorMessageSender& sender,
+ std::shared_ptr<api::StorageReply>& reply)
+{
+ assert(!_replySent);
+ if (!_trace.isEmpty()) {
+ reply->getTrace().getRoot().addChild(_trace);
+ }
+ sender.sendReply(reply);
+ _replySent = true;
+}
+
+void
+TwoPhaseUpdateOperation::sendReplyWithResult(
+ DistributorMessageSender& sender,
+ const api::ReturnCode& result)
+{
+ ensureUpdateReplyCreated();
+ _updateReply->setResult(result);
+ sendReply(sender, _updateReply);
+}
+
+bool
+TwoPhaseUpdateOperation::isFastPathPossible() const
+{
+ // Fast path iff bucket exists AND is consistent (split and copies).
+ std::vector<BucketDatabase::Entry> entries;
+ _manager.getBucketDatabase().getParents(_updateDocBucketId, entries);
+
+ if (entries.size() != 1) {
+ return false;
+ }
+ return entries[0]->validAndConsistent();
+}
+
+void
+TwoPhaseUpdateOperation::startFastPathUpdate(DistributorMessageSender& sender)
+{
+ _mode = Mode::FAST_PATH;
+ std::shared_ptr<UpdateOperation> updateOperation(
+ new UpdateOperation(_manager, _updateCmd, _updateMetric));
+
+ IntermediateMessageSender intermediate(
+ _sentMessageMap, updateOperation, sender);
+ updateOperation->start(intermediate,
+ _manager.getClock().getTimeInMillis());
+ transitionTo(SendState::UPDATES_SENT);
+
+ if (intermediate._reply.get()) {
+ sendReply(sender, intermediate._reply);
+ }
+}
+
+void
+TwoPhaseUpdateOperation::startSafePathUpdate(DistributorMessageSender& sender)
+{
+ LOG(debug, "Update(%s) safe path: sending Get commands",
+ _updateCmd->getDocumentId().toString().c_str());
+
+ _mode = Mode::SLOW_PATH;
+ std::shared_ptr<api::GetCommand> get(
+ std::make_shared<api::GetCommand>(
+ document::BucketId(0),
+ _updateCmd->getDocumentId(),
+ "[all]"));
+ copyMessageSettings(*_updateCmd, *get);
+ std::shared_ptr<GetOperation> getOperation(
+ std::make_shared<GetOperation>(_manager, get, _getMetric));
+
+ IntermediateMessageSender intermediate(
+ _sentMessageMap, getOperation, sender);
+ getOperation->start(intermediate,
+ _manager.getClock().getTimeInMillis());
+ transitionTo(SendState::GETS_SENT);
+
+ if (intermediate._reply.get()) {
+ assert(intermediate._reply->getType() == api::MessageType::GET_REPLY);
+ handleSafePathReceivedGet(
+ sender, static_cast<api::GetReply&>(*intermediate._reply));
+ }
+}
+
+void
+TwoPhaseUpdateOperation::onStart(DistributorMessageSender& sender) {
+ if (isFastPathPossible()) {
+ startFastPathUpdate(sender);
+ } else {
+ startSafePathUpdate(sender);
+ }
+}
+
+/**
+ * Verify that we still own this bucket. We don't want to put this check
+ * in the regular PutOperation class since the common case is that such
+ * operations are executed after the distributor has synchronously verified
+ * the ownership in the current state already. It's only during two phase
+ * updates that the ownership may change between the initial check and
+ * actually executing a Put for the bucket.
+ */
+bool
+TwoPhaseUpdateOperation::lostBucketOwnershipBetweenPhases() const
+{
+ BucketOwnership bo(_manager.checkOwnershipInPendingAndCurrentState(
+ _updateDocBucketId));
+ return !bo.isOwned();
+}
+
+void
+TwoPhaseUpdateOperation::sendLostOwnershipTransientErrorReply(
+ DistributorMessageSender& sender)
+{
+ sendReplyWithResult(
+ sender,
+ api::ReturnCode(api::ReturnCode::BUCKET_NOT_FOUND,
+ "Distributor lost ownership of bucket between "
+ "executing the read and write phases of a two-"
+ "phase update operation"));
+}
+
+void
+TwoPhaseUpdateOperation::schedulePutsWithUpdatedDocument(
+ std::shared_ptr<document::Document> doc,
+ api::Timestamp putTimestamp,
+ DistributorMessageSender& sender)
+{
+ if (lostBucketOwnershipBetweenPhases()) {
+ sendLostOwnershipTransientErrorReply(sender);
+ return;
+ }
+ std::shared_ptr<api::PutCommand> put(
+ new api::PutCommand(document::BucketId(0), doc, putTimestamp));
+ copyMessageSettings(*_updateCmd, *put);
+ std::shared_ptr<PutOperation> putOperation(
+ new PutOperation(_manager, put, _putMetric));
+
+ IntermediateMessageSender intermediate(
+ _sentMessageMap, putOperation, sender);
+ putOperation->start(intermediate,
+ _manager.getClock().getTimeInMillis());
+ transitionTo(SendState::PUTS_SENT);
+
+ LOG(debug, "Update(%s): sending Put commands with doc %s",
+ _updateCmd->getDocumentId().toString().c_str(),
+ doc->toString(true).c_str());
+
+ if (intermediate._reply.get()) {
+ sendReplyWithResult(sender, intermediate._reply->getResult());
+ }
+}
+
+void
+TwoPhaseUpdateOperation::onReceive(
+ DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply>& msg)
+{
+ if (_mode == Mode::FAST_PATH) {
+ handleFastPathReceive(sender, msg);
+ } else {
+ handleSafePathReceive(sender, msg);
+ }
+}
+
+void
+TwoPhaseUpdateOperation::handleFastPathReceive(
+ DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply>& msg)
+{
+ if (msg->getType() == api::MessageType::GET_REPLY) {
+ assert(_sendState == SendState::GETS_SENT);
+ api::GetReply& getReply = static_cast<api::GetReply&> (*msg);
+ addTraceFromReply(getReply);
+
+ LOG(debug, "Update(%s) Get reply had result: %s",
+ _updateCmd->getDocumentId().toString().c_str(),
+ getReply.getResult().toString().c_str());
+
+ if (!getReply.getResult().success()) {
+ sendReplyWithResult(sender, getReply.getResult());
+ return;
+ }
+
+ if (!getReply.getDocument().get()) {
+ // Weird, document is no longer there ... Just fail.
+ sendReplyWithResult(sender, api::ReturnCode(
+ api::ReturnCode::INTERNAL_FAILURE, ""));
+ return;
+ }
+ schedulePutsWithUpdatedDocument(getReply.getDocument(),
+ _manager.getUniqueTimestamp(),
+ sender);
+ return;
+ }
+
+ std::shared_ptr<Operation> callback = _sentMessageMap.pop(msg->getMsgId());
+ assert(callback.get());
+ IntermediateMessageSender intermediate(_sentMessageMap, callback, sender);
+ callback->receive(intermediate, msg);
+
+ if (msg->getType() == api::MessageType::UPDATE_REPLY) {
+ if (intermediate._reply.get()) {
+ assert(_sendState == SendState::UPDATES_SENT);
+ addTraceFromReply(*intermediate._reply);
+ UpdateOperation& cb = static_cast<UpdateOperation&> (*callback);
+
+ std::pair<document::BucketId, uint16_t> bestNode =
+ cb.getNewestTimestampLocation();
+
+ if (!intermediate._reply->getResult().success() ||
+ bestNode.first == document::BucketId(0)) {
+ // Failed or was consistent
+ sendReply(sender, intermediate._reply);
+ } else {
+ LOG(debug, "Update(%s) fast path: was inconsistent!",
+ _updateCmd->getDocumentId().toString().c_str());
+
+ _updateReply = intermediate._reply;
+ std::shared_ptr<api::GetCommand> cmd(
+ new api::GetCommand(bestNode.first,
+ _updateCmd->getDocumentId(),
+ "[all]"));
+ copyMessageSettings(*_updateCmd, *cmd);
+
+ sender.sendToNode(
+ lib::NodeType::STORAGE,
+ bestNode.second,
+ cmd);
+ transitionTo(SendState::GETS_SENT);
+ }
+ }
+ } else {
+ if (intermediate._reply.get()) {
+ // PUTs are done.
+ addTraceFromReply(*intermediate._reply);
+ sendReplyWithResult(sender, intermediate._reply->getResult());
+ }
+ }
+}
+
+void
+TwoPhaseUpdateOperation::handleSafePathReceive(
+ DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply>& msg)
+{
+ std::shared_ptr<Operation> callback = _sentMessageMap.pop(msg->getMsgId());
+ assert(callback.get());
+
+ IntermediateMessageSender intermediate(_sentMessageMap, callback, sender);
+ callback->receive(intermediate, msg);
+
+ if (!intermediate._reply.get()) {
+ return; // Not enough replies received yet or we're draining callbacks.
+ }
+ addTraceFromReply(*intermediate._reply);
+ if (_sendState == SendState::GETS_SENT) {
+ assert(intermediate._reply->getType() == api::MessageType::GET_REPLY);
+ handleSafePathReceivedGet(
+ sender, static_cast<api::GetReply&>(*intermediate._reply));
+ } else if (_sendState == SendState::PUTS_SENT) {
+ assert(intermediate._reply->getType() == api::MessageType::PUT_REPLY);
+ handleSafePathReceivedPut(
+ sender, static_cast<api::PutReply&>(*intermediate._reply));
+ } else {
+ assert(!"Unknown state");
+ }
+}
+
+void
+TwoPhaseUpdateOperation::handleSafePathReceivedGet(
+ DistributorMessageSender& sender,
+ api::GetReply& reply)
+{
+ LOG(debug, "Update(%s): got Get reply with code %s",
+ _updateCmd->getDocumentId().toString().c_str(),
+ reply.getResult().toString().c_str());
+
+ if (!reply.getResult().success()) {
+ sendReplyWithResult(sender, reply.getResult());
+ return;
+ }
+ document::Document::SP docToUpdate;
+ api::Timestamp putTimestamp = _manager.getUniqueTimestamp();
+
+ if (reply.getDocument().get()) {
+ api::Timestamp receivedTimestamp = reply.getLastModifiedTimestamp();
+ if (!satisfiesUpdateTimestampConstraint(receivedTimestamp)) {
+ sendReplyWithResult(sender, api::ReturnCode(
+ api::ReturnCode::OK,
+ "No document with requested timestamp found"));
+ return;
+ }
+ if (!processAndMatchTasCondition(sender, *reply.getDocument())) {
+ return; // Reply already generated at this point.
+ }
+ docToUpdate = reply.getDocument();
+ setUpdatedForTimestamp(receivedTimestamp);
+ } else if (hasTasCondition()) {
+ replyWithTasFailure(sender, "Document did not exist");
+ return;
+ } else if (shouldCreateIfNonExistent()) {
+ LOG(debug,
+ "No existing documents found for %s, creating blank "
+ "document to update",
+ _updateCmd->getUpdate()->getId().toString().c_str());
+ docToUpdate = createBlankDocument();
+ setUpdatedForTimestamp(putTimestamp);
+ } else {
+ sendReplyWithResult(sender, reply.getResult());
+ return;
+ }
+ try {
+ applyUpdateToDocument(*docToUpdate);
+ schedulePutsWithUpdatedDocument(docToUpdate, putTimestamp, sender);
+ } catch (vespalib::Exception& e) {
+ sendReplyWithResult(sender, api::ReturnCode(
+ api::ReturnCode::INTERNAL_FAILURE, e.getMessage()));
+ }
+}
+
+bool
+TwoPhaseUpdateOperation::processAndMatchTasCondition(
+ DistributorMessageSender& sender,
+ const document::Document& candidateDoc)
+{
+ if (!hasTasCondition()) {
+ return true; // No condition; nothing to do here.
+ }
+
+ document::select::Parser parser(*_manager.getTypeRepo(),
+ _manager.getBucketIdFactory());
+ std::unique_ptr<document::select::Node> selection;
+ try {
+ selection = parser.parse(_updateCmd->getCondition().getSelection());
+ } catch (const document::select::ParsingFailedException & e) {
+ sendReplyWithResult(sender, api::ReturnCode(
+ api::ReturnCode::ILLEGAL_PARAMETERS,
+ "Failed to parse test and set condition: "s + e.getMessage()));
+ return false;
+ }
+
+ if (selection->contains(candidateDoc) != document::select::Result::True) {
+ replyWithTasFailure(sender, "Condition did not match document");
+ return false;
+ }
+ return true;
+}
+
+bool
+TwoPhaseUpdateOperation::hasTasCondition() const noexcept
+{
+ return _updateCmd->getCondition().isPresent();
+}
+
+void
+TwoPhaseUpdateOperation::replyWithTasFailure(
+ DistributorMessageSender& sender,
+ vespalib::stringref message)
+{
+ sendReplyWithResult(sender, api::ReturnCode(
+ api::ReturnCode::TEST_AND_SET_CONDITION_FAILED, message));
+}
+
+void
+TwoPhaseUpdateOperation::setUpdatedForTimestamp(api::Timestamp ts)
+{
+ ensureUpdateReplyCreated();
+ static_cast<api::UpdateReply&>(*_updateReply).setOldTimestamp(ts);
+}
+
+std::shared_ptr<document::Document>
+TwoPhaseUpdateOperation::createBlankDocument() const
+{
+ const document::DocumentUpdate& up(*_updateCmd->getUpdate());
+ return std::make_shared<document::Document>(up.getType(), up.getId());
+}
+
+void
+TwoPhaseUpdateOperation::handleSafePathReceivedPut(
+ DistributorMessageSender& sender,
+ const api::PutReply& reply)
+{
+ sendReplyWithResult(sender, reply.getResult());
+}
+
+void
+TwoPhaseUpdateOperation::applyUpdateToDocument(document::Document& doc) const
+{
+ _updateCmd->getUpdate()->applyTo(doc);
+}
+
+bool
+TwoPhaseUpdateOperation::shouldCreateIfNonExistent() const
+{
+ return _updateCmd->getUpdate()->getCreateIfNonExistent();
+}
+
+bool
+TwoPhaseUpdateOperation::satisfiesUpdateTimestampConstraint(
+ api::Timestamp ts) const
+{
+ return (_updateCmd->getOldTimestamp() == 0
+ || _updateCmd->getOldTimestamp() == ts);
+}
+
+void
+TwoPhaseUpdateOperation::addTraceFromReply(const api::StorageReply& reply)
+{
+ _trace.addChild(reply.getTrace().getRoot());
+}
+
+void
+TwoPhaseUpdateOperation::onClose(DistributorMessageSender& sender) {
+ while (true) {
+ std::shared_ptr<Operation> cb = _sentMessageMap.pop();
+
+ if (cb.get()) {
+ IntermediateMessageSender intermediate(
+ _sentMessageMap,
+ std::shared_ptr<Operation > (),
+ sender);
+ cb->onClose(intermediate);
+ // We will _only_ forward UpdateReply instances up, since those
+ // are created by UpdateOperation and are bound to the original
+ // UpdateCommand. Any other intermediate replies will be replies
+ // to synthetic commands created for gets/puts and should never be
+ // propagated to the outside world.
+ auto candidateReply = std::move(intermediate._reply);
+ if (candidateReply
+ && candidateReply->getType() == api::MessageType::UPDATE_REPLY)
+ {
+ assert(_mode == Mode::FAST_PATH);
+ sendReply(sender, candidateReply); // Sets _replySent
+ }
+ } else {
+ break;
+ }
+ }
+
+ if (!_replySent) {
+ sendReplyWithResult(sender, api::ReturnCode(api::ReturnCode::ABORTED));
+ }
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.h b/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.h
new file mode 100644
index 00000000000..13d501592d8
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/twophaseupdateoperation.h
@@ -0,0 +1,136 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <set>
+#include <vespa/storageapi/messageapi/returncode.h>
+#include <vespa/storage/distributor/persistencemessagetracker.h>
+#include <vespa/document/update/documentupdate.h>
+
+namespace document {
+class Document;
+}
+
+namespace storage {
+
+namespace api {
+class UpdateCommand;
+class BatchDocumentUpdateCommand;
+class CreateBucketReply;
+}
+
+namespace distributor {
+
+/*
+ * General functional outline:
+ *
+ * if bucket is consistent and all copies are in sync
+ * send updates directly to nodes
+ * else
+ * start safe (slow) path
+ *
+ * Slow path:
+ *
+ * send Get for document to update to inconsistent copies
+ * if get reply has document
+ * apply updates and send new put
+ * else if create-if-non-existing set on update
+ * create new blank document
+ * apply updates and send new put
+ * else
+ * reply with not found
+ *
+ * Note that the above case also implicitly handles the case in which a
+ * bucket does not exist.
+*/
+
+
+class TwoPhaseUpdateOperation : public Operation
+{
+public:
+ TwoPhaseUpdateOperation(DistributorComponent& manager,
+ const std::shared_ptr<api::UpdateCommand> & msg,
+ DistributorMetricSet& metrics);
+
+ void onStart(DistributorMessageSender& sender);
+
+ const char* getName() const { return "twophaseupdate"; }
+
+ std::string getStatus() const { return ""; }
+
+ void onReceive(DistributorMessageSender&,
+ const std::shared_ptr<api::StorageReply>&);
+
+ void onClose(DistributorMessageSender& sender);
+
+ bool canSendHeaderOnly() const;
+
+private:
+ enum class SendState {
+ NONE_SENT,
+ UPDATES_SENT,
+ GETS_SENT,
+ PUTS_SENT,
+ };
+
+ enum class Mode {
+ FAST_PATH,
+ SLOW_PATH
+ };
+
+ void transitionTo(SendState newState);
+ const char* stateToString(SendState);
+
+ void sendReply(DistributorMessageSender&,
+ std::shared_ptr<api::StorageReply>&);
+ void sendReplyWithResult(DistributorMessageSender&, const api::ReturnCode&);
+ void ensureUpdateReplyCreated();
+
+ bool isFastPathPossible() const;
+ void startFastPathUpdate(DistributorMessageSender&);
+ void startSafePathUpdate(DistributorMessageSender&);
+ bool lostBucketOwnershipBetweenPhases() const;
+ void sendLostOwnershipTransientErrorReply(DistributorMessageSender&);
+ void schedulePutsWithUpdatedDocument(
+ std::shared_ptr<document::Document>,
+ api::Timestamp,
+ DistributorMessageSender&);
+ void applyUpdateToDocument(document::Document&) const;
+ std::shared_ptr<document::Document> createBlankDocument() const;
+ void setUpdatedForTimestamp(api::Timestamp);
+ void handleFastPathReceive(DistributorMessageSender&,
+ const std::shared_ptr<api::StorageReply>&);
+ void handleSafePathReceive(DistributorMessageSender&,
+ const std::shared_ptr<api::StorageReply>&);
+ void handleSafePathReceivedGet(DistributorMessageSender&,
+ api::GetReply&);
+ void handleSafePathReceivedPut(DistributorMessageSender&,
+ const api::PutReply&);
+ bool shouldCreateIfNonExistent() const;
+ bool processAndMatchTasCondition(
+ DistributorMessageSender& sender,
+ const document::Document& candidateDoc);
+ bool satisfiesUpdateTimestampConstraint(api::Timestamp) const;
+ void addTraceFromReply(const api::StorageReply& reply);
+ bool hasTasCondition() const noexcept;
+ void replyWithTasFailure(DistributorMessageSender& sender,
+ vespalib::stringref message);
+
+ PersistenceOperationMetricSet& _updateMetric;
+ PersistenceOperationMetricSet& _putMetric;
+ PersistenceOperationMetricSet& _getMetric;
+ std::shared_ptr<api::UpdateCommand> _updateCmd;
+ std::shared_ptr<api::StorageReply> _updateReply;
+ DistributorComponent& _manager;
+ SentMessageMap _sentMessageMap;
+ SendState _sendState;
+ Mode _mode;
+ mbus::TraceNode _trace;
+ document::BucketId _updateDocBucketId;
+ bool _replySent;
+};
+
+}
+
+}
+
+
diff --git a/storage/src/vespa/storage/distributor/operations/external/updateoperation.cpp b/storage/src/vespa/storage/distributor/operations/external/updateoperation.cpp
new file mode 100644
index 00000000000..57a5c968d54
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/updateoperation.cpp
@@ -0,0 +1,170 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/operations/external/updateoperation.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/distributormetricsset.h>
+
+LOG_SETUP(".distributor.callback.doc.update");
+
+
+using namespace storage::distributor;
+using namespace storage;
+
+UpdateOperation::UpdateOperation(DistributorComponent& manager,
+ const std::shared_ptr<api::UpdateCommand> & msg,
+ PersistenceOperationMetricSet& metric)
+ : Operation(),
+ _trackerInstance(metric,
+ std::shared_ptr<api::BucketInfoReply>(new api::UpdateReply(*msg)),
+ manager,
+ msg->getTimestamp()),
+ _tracker(_trackerInstance),
+ _msg(msg),
+ _manager(manager)
+{
+}
+
+bool
+UpdateOperation::anyStorageNodesAvailable() const
+{
+ const auto& clusterState(_manager.getClusterState());
+ const auto storageNodeCount(
+ clusterState.getNodeCount(lib::NodeType::STORAGE));
+
+ for (uint16_t i = 0; i < storageNodeCount; ++i) {
+ const auto& ns(clusterState.getNodeState(
+ lib::Node(lib::NodeType::STORAGE, i)));
+ if (ns.getState() == lib::State::UP
+ || ns.getState() == lib::State::RETIRED)
+ {
+ return true;
+ }
+ }
+ return false;
+}
+
+void
+UpdateOperation::onStart(DistributorMessageSender& sender)
+{
+ LOG(debug, "Received UPDATE %s for bucket %" PRIx64,
+ _msg->getDocumentId().toString().c_str(),
+ _manager.getBucketIdFactory().getBucketId(
+ _msg->getDocumentId()).getRawId());
+
+ // Don't do anything if all nodes are down.
+ if (!anyStorageNodesAvailable()) {
+ _tracker.fail(sender,
+ api::ReturnCode(api::ReturnCode::NOT_CONNECTED,
+ "Can't store document: No storage nodes "
+ "available"));
+ return;
+ }
+
+ document::BucketId bucketId(
+ _manager.getBucketIdFactory().getBucketId(
+ _msg->getDocumentId()));
+
+ std::vector<BucketDatabase::Entry> entries;
+ _manager.getBucketDatabase().getParents(bucketId, entries);
+
+ if (entries.empty()) {
+ _tracker.fail(sender,
+ api::ReturnCode(api::ReturnCode::OK,
+ "No buckets found for given document update"));
+ return;
+ }
+
+ // FIXME(vekterli): this loop will happily update all replicas in the
+ // bucket sub-tree, but there is nothing here at all which will fail the
+ // update if we cannot satisfy a desired replication level (not even for
+ // n-of-m operations).
+ for (uint32_t j = 0; j < entries.size(); ++j) {
+ LOG(debug, "Found bucket %s", entries[j].toString().c_str());
+
+ const std::vector<uint16_t>& nodes = entries[j]->getNodes();
+
+ std::vector<MessageTracker::ToSend> messages;
+
+ for (uint32_t i = 0; i < nodes.size(); i++) {
+ std::shared_ptr<api::UpdateCommand> command(
+ new api::UpdateCommand(entries[j].getBucketId(),
+ _msg->getUpdate(),
+ _msg->getTimestamp()));
+ copyMessageSettings(*_msg, *command);
+ command->setOldTimestamp(_msg->getOldTimestamp());
+ command->setCondition(_msg->getCondition());
+ messages.push_back(MessageTracker::ToSend(command, nodes[i]));
+ }
+
+ _tracker.queueMessageBatch(messages);
+ }
+
+ _tracker.flushQueue(sender);
+ _msg = std::shared_ptr<api::UpdateCommand>();
+};
+
+void
+UpdateOperation::onReceive(DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply> & msg)
+{
+ api::UpdateReply& reply =
+ static_cast<api::UpdateReply&>(*msg);
+
+ if (msg->getType() == api::MessageType::UPDATE_REPLY) {
+ uint16_t node = _tracker.handleReply(reply);
+
+ if (node != (uint16_t)-1) {
+ if (reply.getResult().getResult() == api::ReturnCode::OK) {
+ _results.push_back(OldTimestamp(
+ reply.getBucketId(),
+ reply.getOldTimestamp(),
+ node));
+ }
+
+ if (_tracker.getReply().get()) {
+ api::UpdateReply& replyToSend =
+ static_cast<api::UpdateReply&>(*_tracker.getReply());
+
+ uint64_t oldTs = 0;
+ uint64_t goodNode = 0;
+
+ // Find the highest old timestamp.
+ for (uint32_t i = 0; i < _results.size(); i++) {
+ if (_results[i].oldTs > oldTs) {
+ oldTs = _results[i].oldTs;
+ goodNode = i;
+ }
+ }
+
+ replyToSend.setOldTimestamp(oldTs);
+
+ for (uint32_t i = 0; i < _results.size(); i++) {
+ if (_results[i].oldTs < oldTs) {
+ replyToSend.setNodeWithNewestTimestamp(
+ _results[goodNode].nodeId);
+ _newestTimestampLocation.first =
+ _results[goodNode].bucketId;
+ _newestTimestampLocation.second =
+ _results[goodNode].nodeId;
+ break;
+ }
+ }
+ }
+
+ _tracker.updateFromReply(sender, reply, node);
+ }
+ } else {
+ _tracker.receiveReply(sender, static_cast<api::BucketInfoReply&>(*msg));
+ }
+}
+
+
+void
+UpdateOperation::onClose(DistributorMessageSender& sender)
+{
+ _tracker.fail(sender, api::ReturnCode(api::ReturnCode::ABORTED, "Process is shutting down"));
+}
diff --git a/storage/src/vespa/storage/distributor/operations/external/updateoperation.h b/storage/src/vespa/storage/distributor/operations/external/updateoperation.h
new file mode 100644
index 00000000000..b990e953623
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/updateoperation.h
@@ -0,0 +1,68 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storageapi/messageapi/returncode.h>
+#include <vespa/storage/distributor/persistencemessagetracker.h>
+
+namespace document {
+class Document;
+}
+
+namespace storage {
+
+namespace api {
+class UpdateCommand;
+class CreateBucketReply;
+}
+
+namespace distributor {
+
+class UpdateOperation : public Operation
+{
+public:
+ UpdateOperation(DistributorComponent& manager,
+ const std::shared_ptr<api::UpdateCommand> & msg,
+ PersistenceOperationMetricSet& metric);
+
+ void onStart(DistributorMessageSender& sender);
+
+ const char* getName() const { return "update"; };
+
+ std::string getStatus() const { return ""; };
+
+ void onReceive(DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply> & msg);
+
+ void onClose(DistributorMessageSender& sender);
+
+ std::pair<document::BucketId, uint16_t> getNewestTimestampLocation() const {
+ return _newestTimestampLocation;
+ }
+
+private:
+ PersistenceMessageTrackerImpl _trackerInstance;
+ PersistenceMessageTracker& _tracker;
+ std::shared_ptr<api::UpdateCommand> _msg;
+
+ DistributorComponent& _manager;
+ std::pair<document::BucketId, uint16_t> _newestTimestampLocation;
+
+ bool anyStorageNodesAvailable() const;
+
+ class OldTimestamp {
+ public:
+ OldTimestamp(document::BucketId b, uint64_t o, uint16_t node) :
+ bucketId(b), oldTs(o), nodeId(node) {}
+
+ document::BucketId bucketId;
+ uint64_t oldTs;
+ uint16_t nodeId;
+ };
+
+ std::vector<OldTimestamp> _results;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/operations/external/visitoroperation.cpp b/storage/src/vespa/storage/distributor/operations/external/visitoroperation.cpp
new file mode 100644
index 00000000000..1f5d012e8d0
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/visitoroperation.cpp
@@ -0,0 +1,1008 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/storageserver/storagemetricsset.h>
+#include <vespa/storage/distributor/operations/external/visitoroperation.h>
+#include <vespa/document/base/exceptions.h>
+#include <vespa/document/select/orderingselector.h>
+#include <vespa/document/select/parser.h>
+#include <iomanip>
+#include <sstream>
+#include <math.h>
+#include <vespa/storage/distributor/distributor.h>
+#include <vespa/storage/distributor/bucketownership.h>
+#include <vespa/storage/distributor/operations/external/visitororder.h>
+
+namespace storage {
+
+namespace distributor {
+
+LOG_SETUP(".visitoroperation");
+
+void
+VisitorOperation::BucketInfo::print(vespalib::asciistream & out) const
+{
+ out << "BucketInfo("
+ << "done=" << done << ", "
+ << "activeNode=" << activeNode << ", "
+ << "failedCount=" << failedCount << ", "
+ << "triedNodes=";
+ for (uint32_t i = 0; i < triedNodes.size(); i++) {
+ out << triedNodes[i];
+ if (i != triedNodes.size()-1) {
+ out << " ";
+ }
+ }
+ out << ")";
+}
+
+vespalib::string
+VisitorOperation::BucketInfo::toString() const
+{
+ vespalib::asciistream ost;
+ print(ost);
+ return ost.str();
+}
+
+VisitorOperation::VisitorOperation(
+ DistributorComponent& owner,
+ const api::CreateVisitorCommand::SP& m,
+ const Config& config,
+ VisitorMetricSet* metric)
+ : Operation(),
+ _owner(owner),
+ _msg(m),
+ _sentReply(false),
+ _config(config),
+ _metrics(metric),
+ _trace(TRACE_SOFT_MEMORY_LIMIT)
+{
+ const std::vector<document::BucketId>& buckets = m->getBuckets();
+
+ if (buckets.size() > 0) {
+ _superBucket = SuperBucketInfo(buckets[0]);
+ }
+
+ if (buckets.size() > 1) {
+ _lastBucket = buckets[1];
+ }
+
+ _fromTime = m->getFromTime();
+ _toTime = m->getToTime();
+ if (_toTime == 0) {
+ _toTime = owner.getUniqueTimestamp();
+ }
+
+ _startVisitorTime = owner.getClock().getTimeInMillis();
+}
+
+VisitorOperation::~VisitorOperation()
+{
+}
+
+document::BucketId
+VisitorOperation::getLastBucketVisited()
+{
+ document::BucketId newLastBucket = _lastBucket;
+ bool foundNotDone = false;
+ bool foundDone = false;
+
+ LOG(spam, "getLastBucketVisited(): Sub bucket count: %zu",
+ _superBucket.subBucketsVisitOrder.size());
+ for (uint32_t i=0; i<_superBucket.subBucketsVisitOrder.size(); i++) {
+ auto found = _superBucket.subBuckets.find(_superBucket.subBucketsVisitOrder[i]);
+ assert(found != _superBucket.subBuckets.end());
+ LOG(spam, "%s => %s",
+ found->first.toString().c_str(),
+ found->second.toString().c_str());
+
+ if (found->second.done) {
+ foundDone = true;
+ } else if (!allowInconsistencies()) {
+ // Don't allow a non-complete bucket to be treated as successfully
+ // visited unless we're doing an inconsistent visit.
+ foundNotDone = true;
+ }
+ if (!foundNotDone) {
+ newLastBucket = found->first;
+ }
+ }
+
+ if (_superBucket.subBucketsCompletelyExpanded) {
+ LOG(spam, "Sub buckets were completely expanded");
+ if (_superBucket.subBucketsVisitOrder.empty()
+ || (foundDone && !foundNotDone))
+ {
+ newLastBucket = document::BucketId(INT_MAX);
+ }
+ }
+
+ LOG(spam, "Returning last bucket: %s", newLastBucket.toString().c_str());
+ return newLastBucket;
+}
+
+uint64_t
+VisitorOperation::timeLeft() const noexcept
+{
+ framework::MilliSecTime now = _owner.getClock().getTimeInMillis();
+ framework::MilliSecTime timeSpent = now - _startVisitorTime;
+
+ LOG(spam,
+ "Checking if visitor has timed out: now=%zu, start=%zu, "
+ "diff=%zu, timeout=%u",
+ now.getTime(),
+ _startVisitorTime.getTime(),
+ timeSpent.getTime(),
+ _msg->getTimeout());
+
+ if (timeSpent.getTime() >= _msg->getTimeout()) {
+ return 0;
+ } else {
+ return _msg->getTimeout() - timeSpent.getTime();
+ }
+}
+
+void
+VisitorOperation::markCompleted(const document::BucketId& bid,
+ const api::ReturnCode& code)
+{
+ VisitBucketMap::iterator found = _superBucket.subBuckets.find(bid);
+ assert(found != _superBucket.subBuckets.end());
+
+ BucketInfo& info = found->second;
+ assert(info.activeNode != -1);
+ info.activeNode = -1;
+ if (code.success()) {
+ info.done = true;
+ }
+}
+
+void
+VisitorOperation::markOperationAsFailedDueToNodeError(
+ const api::ReturnCode& result,
+ uint16_t fromFailingNodeIndex)
+{
+ _storageError = api::ReturnCode(
+ result.getResult(),
+ vespalib::make_string("[from content node %u] %s",
+ fromFailingNodeIndex,
+ result.getMessage().c_str()));
+}
+
+void
+VisitorOperation::onReceive(
+ DistributorMessageSender& sender,
+ const api::StorageReply::SP& r)
+{
+ api::CreateVisitorReply& reply = static_cast<api::CreateVisitorReply&>(*r);
+
+ _trace.add(reply.getTrace().getRoot());
+
+ SentMessagesMap::iterator iter = _sentMessages.find(reply.getMsgId());
+ assert(iter != _sentMessages.end());
+
+ api::CreateVisitorCommand& storageVisitor = *iter->second;
+
+ const uint16_t contentNodeIndex = storageVisitor.getAddress()->getIndex();
+ _activeNodes[contentNodeIndex]--;
+
+ api::ReturnCode result = reply.getResult();
+ if (result.success()) {
+ _visitorStatistics = _visitorStatistics + reply.getVisitorStatistics();
+ LOG(spam, "Client stats %s for visitor %s. New stats is %s",
+ reply.getVisitorStatistics().toString().c_str(),
+ _msg->getInstanceId().c_str(),
+ _visitorStatistics.toString().c_str());
+ } else if (result.isCriticalForVisitorDispatcher()) {
+ // If an error code is critical, we don't bother to do a "worst-of"
+ // comparison with the existing code since it's assumed either one is
+ // sufficiently bad to tell the client about it.
+ markOperationAsFailedDueToNodeError(result, contentNodeIndex);
+ }
+ // else: will lose code for non-critical events, degenerates to "not found".
+
+ for (uint32_t i = 0; i < storageVisitor.getBuckets().size(); i++) {
+ const document::BucketId& bid(storageVisitor.getBuckets()[i]);
+ markCompleted(bid, result);
+ }
+
+ _sentMessages.erase(iter);
+ startNewVisitors(sender);
+}
+
+namespace {
+
+class VisitorVerificationException
+{
+public:
+ VisitorVerificationException(api::ReturnCode::Result result,
+ vespalib::stringref message)
+ : _code(result, message)
+ {}
+
+ const api::ReturnCode& getReturnCode() const {
+ return _code;
+ }
+
+private:
+ api::ReturnCode _code;
+};
+
+}
+
+void
+VisitorOperation::verifyDistributorsAreAvailable()
+{
+ const lib::ClusterState& clusterState = _owner.getClusterState();
+ if (clusterState.getNodeCount(lib::NodeType::DISTRIBUTOR) == 0) {
+ vespalib::string err(vespalib::make_string(
+ "No distributors available when processing visitor '%s'",
+ _msg->getInstanceId().c_str()));
+ LOG(debug, "%s", err.c_str());
+ throw VisitorVerificationException(api::ReturnCode::NOT_READY, err);
+ }
+}
+
+void
+VisitorOperation::verifyVisitorDistributionBitCount(
+ const document::BucketId& bid)
+{
+ const lib::ClusterState& clusterState = _owner.getClusterState();
+ if (_msg->getDocumentSelection().length() == 0
+ && bid.getUsedBits() != clusterState.getDistributionBitCount())
+ {
+ LOG(debug,
+ "Got message with wrong distribution bits (%d != %d), bucketid %s, "
+ "sending back system state '%s'",
+ bid.getUsedBits(),
+ clusterState.getDistributionBitCount(),
+ bid.toString().c_str(),
+ clusterState.toString().c_str());
+ throw VisitorVerificationException(
+ api::ReturnCode::WRONG_DISTRIBUTION,
+ clusterState.toString());
+ }
+}
+
+void
+VisitorOperation::verifyDistributorIsNotDown(const lib::ClusterState& state)
+{
+ const lib::NodeState& ownState(
+ state.getNodeState(
+ lib::Node(lib::NodeType::DISTRIBUTOR, _owner.getIndex())));
+ if (!ownState.getState().oneOf("ui")) {
+ throw VisitorVerificationException(
+ api::ReturnCode::ABORTED, "Distributor is shutting down");
+ }
+}
+
+void
+VisitorOperation::verifyDistributorOwnsBucket(const document::BucketId& bid)
+{
+ BucketOwnership bo(_owner.checkOwnershipInPendingAndCurrentState(bid));
+ if (!bo.isOwned()) {
+ verifyDistributorIsNotDown(bo.getNonOwnedState());
+ std::string systemStateStr = bo.getNonOwnedState().toString();
+ LOG(debug,
+ "Bucket %s is not owned by distributor %d, "
+ "sending back system state '%s'",
+ bid.toString().c_str(),
+ _owner.getIndex(),
+ bo.getNonOwnedState().toString().c_str());
+ throw VisitorVerificationException(
+ api::ReturnCode::WRONG_DISTRIBUTION,
+ bo.getNonOwnedState().toString());
+ }
+}
+
+void
+VisitorOperation::verifyOperationContainsBuckets()
+{
+ size_t bucketCount = _msg->getBuckets().size();
+ if (bucketCount == 0) {
+ vespalib::string errorMsg = vespalib::make_string(
+ "No buckets in CreateVisitorCommand for visitor '%s'",
+ _msg->getInstanceId().c_str());
+ throw VisitorVerificationException(api::ReturnCode::ILLEGAL_PARAMETERS, errorMsg);
+ }
+}
+
+void
+VisitorOperation::verifyOperationHasSuperbucketAndProgress()
+{
+ size_t bucketCount = _msg->getBuckets().size();
+ if (bucketCount != 2) {
+ vespalib::string errorMsg = vespalib::make_string(
+ "CreateVisitorCommand does not contain 2 buckets for visitor '%s'",
+ _msg->getInstanceId().c_str());
+ throw VisitorVerificationException(api::ReturnCode::ILLEGAL_PARAMETERS, errorMsg);
+ }
+}
+
+void
+VisitorOperation::verifyOperationSentToCorrectDistributor()
+{
+ verifyDistributorsAreAvailable();
+ verifyVisitorDistributionBitCount(_superBucket.bid);
+ verifyDistributorOwnsBucket(_superBucket.bid);
+}
+
+bool
+VisitorOperation::verifyCreateVisitorCommand(DistributorMessageSender& sender)
+{
+ try {
+ verifyOperationContainsBuckets();
+ verifyOperationHasSuperbucketAndProgress();
+ verifyOperationSentToCorrectDistributor();
+ return true;
+ } catch (const VisitorVerificationException& e) {
+ LOG(debug,
+ "Visitor verification failed; replying with %s",
+ e.getReturnCode().toString().c_str());
+ sendReply(e.getReturnCode(), sender);
+ return false;
+ }
+}
+
+namespace {
+
+bool
+isSplitPastOrderBits(const document::BucketId& bucket,
+ const document::OrderingSpecification& ordering) {
+ int32_t bitsUsed = bucket.getUsedBits();
+ int32_t orderBitCount = ordering.getWidthBits() -
+ ordering.getDivisionBits();
+ return (bitsUsed > 32 + orderBitCount);
+}
+
+bool
+isInconsistentlySplit(const document::BucketId& ain,
+ const document::BucketId& bin) {
+ int minUsed = std::min(ain.getUsedBits(), bin.getUsedBits());
+
+ document::BucketId a = document::BucketId(minUsed,
+ ain.getRawId()).stripUnused();
+ document::BucketId b = document::BucketId(minUsed,
+ bin.getRawId()).stripUnused();
+
+ return (a == b);
+}
+
+bool
+isInconsistentlySplit(const document::BucketId& bucket,
+ const std::vector<document::BucketId>& buckets)
+{
+ if (buckets.size()) {
+ for (uint32_t i=0; i<buckets.size(); i++) {
+ if (isInconsistentlySplit(bucket, buckets[i])) {
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+} // End anonymous namespace
+
+bool
+VisitorOperation::isSpecialBucketForOrderDoc(const document::BucketId& bucketId) const
+{
+ if (isSplitPastOrderBits(bucketId, *_ordering)) {
+ LOG(spam, "Split past orderbits: Found in db: %s", bucketId.toString().c_str());
+ } else if (isInconsistentlySplit(bucketId, _superBucket.subBucketsVisitOrder)) {
+ LOG(spam, "Inconsistent: Found in db: %s", bucketId.toString().c_str());
+ } else {
+ return false;
+ }
+ return true;
+}
+
+std::vector<document::BucketId>::const_iterator
+VisitorOperation::addSpecialBucketsForOrderDoc(
+ std::vector<document::BucketId>::const_iterator iter,
+ std::vector<document::BucketId>::const_iterator end)
+{
+ if (_ordering->getWidthBits() == 0) {
+ return iter;
+ }
+ for (; iter != end; ++iter) {
+ if (isSpecialBucketForOrderDoc(*iter)) {
+ _superBucket.subBucketsVisitOrder.push_back(*iter);
+ _superBucket.subBuckets[*iter] = BucketInfo();
+ } else {
+ break;
+ }
+ }
+ return iter;
+}
+
+bool
+VisitorOperation::pickBucketsToVisit(const std::vector<BucketDatabase::Entry>& buckets)
+{
+ uint32_t maxBuckets = _msg->getMaxBucketsPerVisitor();
+
+ std::vector<document::BucketId> bucketVisitOrder;
+
+ for (uint32_t i = 0; i < buckets.size(); ++i) {
+ bucketVisitOrder.push_back(buckets[i].getBucketId());
+ }
+
+ VisitorOrder bucketLessThan(*_ordering);
+ std::sort(bucketVisitOrder.begin(), bucketVisitOrder.end(), bucketLessThan);
+
+ std::vector<document::BucketId>::const_iterator iter(bucketVisitOrder.begin());
+ std::vector<document::BucketId>::const_iterator end(bucketVisitOrder.end());
+ for (; iter != end; ++iter) {
+ if (bucketLessThan(*iter, _lastBucket) ||
+ *iter == _lastBucket)
+ {
+ LOG(spam,
+ "Skipping bucket %s because it is lower than or equal to progress bucket %s",
+ iter->toString().c_str(),
+ _lastBucket.toString().c_str());
+ continue;
+ }
+ LOG(spam, "Iterating: Found in db: %s", iter->toString().c_str());
+ _superBucket.subBucketsVisitOrder.push_back(*iter);
+ _superBucket.subBuckets[*iter] = BucketInfo();
+ if (_superBucket.subBuckets.size() >= maxBuckets) {
+ ++iter;
+ break;
+ }
+ }
+
+ iter = addSpecialBucketsForOrderDoc(iter, end);
+
+ bool doneExpand(iter == bucketVisitOrder.end());
+ return doneExpand;
+}
+
+bool
+VisitorOperation::expandBucketAll()
+{
+ std::vector<BucketDatabase::Entry> entries;
+ _owner.getBucketDatabase().getAll(_superBucket.bid, entries);
+ return pickBucketsToVisit(entries);
+}
+
+bool
+VisitorOperation::expandBucketContaining()
+{
+ std::vector<BucketDatabase::Entry> entries;
+ _owner.getBucketDatabase().getParents(_superBucket.bid, entries);
+ return pickBucketsToVisit(entries);
+}
+
+namespace {
+
+struct NextEntryFinder : public BucketDatabase::EntryProcessor {
+ bool _first;
+ document::BucketId _last;
+ std::unique_ptr<document::BucketId> _next;
+
+ NextEntryFinder(const document::BucketId& id)
+ : _first(true), _last(id), _next() {}
+
+ bool process(const BucketDatabase::Entry& e) {
+ document::BucketId bucket(e.getBucketId());
+
+ if (_first && bucket == _last) {
+ _first = false;
+ return true;
+ } else {
+ _next.reset(new document::BucketId(bucket));
+ return false;
+ }
+ }
+};
+
+
+std::unique_ptr<document::BucketId>
+getBucketIdAndLast(
+ BucketDatabase& database,
+ const document::BucketId& super,
+ const document::BucketId& last)
+{
+ if (!super.contains(last)) {
+ NextEntryFinder proc(super);
+ database.forEach(proc, super);
+ return std::move(proc._next);
+ } else {
+ NextEntryFinder proc(last);
+ database.forEach(proc, last);
+ return std::move(proc._next);
+ }
+}
+
+}
+
+bool
+VisitorOperation::expandBucketContained()
+{
+ uint32_t maxBuckets = _msg->getMaxBucketsPerVisitor();
+
+ std::unique_ptr<document::BucketId> bid = getBucketIdAndLast(
+ _owner.getBucketDatabase(),
+ _superBucket.bid,
+ _lastBucket);
+
+ while (bid.get() && _superBucket.subBuckets.size() < maxBuckets) {
+ if (!_superBucket.bid.contains(*bid)) {
+ LOG(spam,
+ "Iterating: Found bucket %s is not contained in bucket %s",
+ bid->toString().c_str(),
+ _superBucket.bid.toString().c_str());
+ break;
+ }
+
+ LOG(spam, "Iterating: Found in db: %s", bid->toString().c_str());
+ _superBucket.subBucketsVisitOrder.push_back(*bid);
+ _superBucket.subBuckets[*bid] = BucketInfo();
+
+ bid = getBucketIdAndLast(_owner.getBucketDatabase(),
+ _superBucket.bid,
+ *bid);
+ }
+
+ bool doneExpand = (!bid.get() || !_superBucket.bid.contains(*bid));
+ return doneExpand;
+}
+
+void
+VisitorOperation::expandBucket()
+{
+ bool doneExpandBuckets = false;
+ if (_ordering->getWidthBits() > 0) { // Orderdoc
+ doneExpandBuckets = expandBucketAll();
+ } else {
+ bool doneExpandContainingBuckets = true;
+ if (!_superBucket.bid.contains(_lastBucket)) {
+ LOG(spam, "Bucket %s does not contain progress bucket %s",
+ _superBucket.bid.toString().c_str(),
+ _lastBucket.toString().c_str());
+ doneExpandContainingBuckets = expandBucketContaining();
+ } else {
+ LOG(spam, "Bucket %s contains progress bucket %s",
+ _superBucket.bid.toString().c_str(),
+ _lastBucket.toString().c_str());
+ }
+
+ if (doneExpandContainingBuckets) {
+ LOG(spam, "Done expanding containing buckets");
+ doneExpandBuckets = expandBucketContained();
+ }
+ }
+
+ if (doneExpandBuckets) {
+ _superBucket.subBucketsCompletelyExpanded = true;
+ LOG(spam,
+ "Sub buckets completely expanded for super bucket %s",
+ _superBucket.bid.toString().c_str());
+ } else {
+ LOG(spam,
+ "Sub buckets NOT completely expanded for super bucket %s",
+ _superBucket.bid.toString().c_str());
+ }
+}
+
+namespace {
+
+bool
+alreadyTried(const std::vector<uint16_t>& triedNodes,
+ uint16_t node)
+{
+ for (uint32_t j = 0; j < triedNodes.size(); j++) {
+ if (triedNodes[j] == node) {
+ return true;
+ }
+ }
+ return false;
+}
+
+int
+findNodeWithMostDocuments(const std::vector<BucketCopy>& potentialNodes)
+{
+ int indexWithMostDocs = -1;
+ for (uint32_t i = 0; i < potentialNodes.size(); i++) {
+ if (indexWithMostDocs == -1 ||
+ potentialNodes[i].getDocumentCount() >
+ potentialNodes[indexWithMostDocs].getDocumentCount())
+ {
+ indexWithMostDocs = i;
+ }
+ }
+ return potentialNodes[indexWithMostDocs].getNode();
+}
+
+}
+
+int
+VisitorOperation::pickTargetNode(
+ const BucketDatabase::Entry& entry,
+ const std::vector<uint16_t>& triedNodes)
+{
+ std::vector<BucketCopy> potentialNodes;
+
+ // Figure out if there are any trusted nodes. If there are,
+ // only those should be considered for visiting.
+ bool foundTrusted = entry->hasTrusted();
+ for (uint32_t i = 0; i < entry->getNodeCount(); i++) {
+ const BucketCopy& copy(entry->getNodeRef(i));
+ if (foundTrusted && !copy.trusted()) {
+ continue;
+ }
+ if (!alreadyTried(triedNodes, copy.getNode())) {
+ potentialNodes.push_back(copy);
+ }
+ }
+
+ if (potentialNodes.empty()) {
+ return -1;
+ }
+
+ if (!entry->validAndConsistent()) {
+ return findNodeWithMostDocuments(potentialNodes);
+ }
+
+ assert(!potentialNodes.empty());
+ return potentialNodes.front().getNode();
+}
+
+bool
+VisitorOperation::documentSelectionMayHaveOrdering() const
+{
+ // FIXME: this is hairy and depends on opportunistic ordering
+ // parsing working fine even when no ordering is present.
+ return strcasestr(_msg->getDocumentSelection().c_str(), "order") != NULL;
+}
+
+void
+VisitorOperation::attemptToParseOrderingSelector()
+{
+ std::unique_ptr<document::select::Node> docSelection;
+ document::DocumentTypeRepo::SP repo(_owner.getTypeRepo());
+ document::select::Parser parser(
+ *repo, _owner.getBucketIdFactory());
+ docSelection = parser.parse(_msg->getDocumentSelection());
+
+ document::OrderingSelector selector;
+ _ordering = selector.select(*docSelection, _msg->getVisitorOrdering());
+}
+
+bool
+VisitorOperation::parseDocumentSelection(DistributorMessageSender& sender)
+{
+ try{
+ if (documentSelectionMayHaveOrdering()) {
+ attemptToParseOrderingSelector();
+ }
+
+ if (!_ordering.get()) {
+ _ordering.reset(new document::OrderingSpecification());
+ }
+ } catch (document::DocumentTypeNotFoundException& e) {
+ std::ostringstream ost;
+ ost << "Failed to parse document select string '"
+ << _msg->getDocumentSelection() << "': " << e.getMessage();
+ LOG(warning, "CreateVisitor(%s): %s",
+ _msg->getInstanceId().c_str(), ost.str().c_str());
+
+ sendReply(api::ReturnCode(api::ReturnCode::ILLEGAL_PARAMETERS, ost.str()), sender);
+ return false;
+ } catch (document::select::ParsingFailedException& e) {
+ std::ostringstream ost;
+ ost << "Failed to parse document select string '"
+ << _msg->getDocumentSelection() << "': " << e.getMessage();
+ LOG(warning, "CreateVisitor(%s): %s",
+ _msg->getInstanceId().c_str(), ost.str().c_str());
+
+ sendReply(api::ReturnCode(api::ReturnCode::ILLEGAL_PARAMETERS, ost.str()), sender);
+ return false;
+ }
+
+ return true;
+}
+
+void
+VisitorOperation::onStart(DistributorMessageSender& sender)
+{
+ if (!verifyCreateVisitorCommand(sender)) {
+ return;
+ }
+
+ if (!parseDocumentSelection(sender)) {
+ return;
+ }
+
+ expandBucket();
+
+ startNewVisitors(sender);
+}
+
+bool
+VisitorOperation::shouldAbortDueToTimeout() const noexcept
+{
+ return timeLeft() == 0;
+}
+
+void
+VisitorOperation::markOperationAsFailed(const api::ReturnCode& result)
+{
+ // Error codes are ordered so that increasing numbers approximate
+ // increasing severity. In particular, transient errors < fatal errors.
+ // In case of same error code, don't overwrite initial error.
+ if (_storageError.getResult() < result.getResult()) {
+ _storageError = result;
+ }
+}
+
+bool
+VisitorOperation::maySendNewStorageVisitors() const noexcept
+{
+ // If we've already failed, don't bother sending any more visitors.
+ // We rather want to get all currently pending visitors done so
+ // we can send a timely reply back to the visiting client.
+ return _storageError.success();
+}
+
+void
+VisitorOperation::startNewVisitors(DistributorMessageSender& sender)
+{
+ LOG(spam,
+ "Starting new visitors: Superbucket: %s, last subbucket: %s",
+ _superBucket.bid.toString().c_str(),
+ _lastBucket.toString().c_str());
+
+ initializeActiveNodes();
+
+ NodeToBucketsMap nodeToBucketsMap;
+ if (!assignBucketsToNodes(nodeToBucketsMap)
+ && !allowInconsistencies()
+ && _storageError.success())
+ {
+ // We do not allow "not found" to override any other errors.
+ // Furthermore, we do not fail with not found if we're visiting with
+ // inconsistencies allowed.
+ markOperationAsFailed(
+ api::ReturnCode(api::ReturnCode::BUCKET_NOT_FOUND));
+ }
+ if (shouldAbortDueToTimeout()) {
+ markOperationAsFailed(
+ api::ReturnCode(api::ReturnCode::ABORTED,
+ vespalib::make_string(
+ "Timeout of %u ms is running out",
+ _msg->getTimeout())));
+ }
+
+ if (maySendNewStorageVisitors()) {
+ sendStorageVisitors(nodeToBucketsMap, sender);
+ }
+
+ if (_sentMessages.empty()) {
+ sendReply(_storageError, sender);
+ }
+}
+
+void
+VisitorOperation::initializeActiveNodes()
+{
+ const lib::ClusterState& clusterState(_owner.getClusterState());
+
+ uint32_t storageNodeCount = clusterState.getNodeCount(lib::NodeType::STORAGE);
+ if (storageNodeCount > _activeNodes.size()) {
+ _activeNodes.resize(storageNodeCount);
+ }
+}
+
+bool
+VisitorOperation::shouldSkipBucket(const BucketInfo& bucketInfo) const
+{
+ return (bucketInfo.done ||
+ bucketInfo.activeNode != -1 ||
+ bucketInfo.failedCount > 0);
+}
+
+bool
+VisitorOperation::bucketIsValidAndConsistent(const BucketDatabase::Entry& entry) const
+{
+ if (!entry.valid()) {
+ LOG(debug,
+ "Bucket %s does not exist anymore",
+ entry.toString().c_str());
+ return false;
+ }
+ assert(entry->getNodeCount() != 0);
+
+ if (!allowInconsistencies() && !entry->hasTrusted()) {
+ LOG(spam,
+ "Failing visitor because %s is currently inconsistent. "
+ "Bucket contents: %s",
+ entry.getBucketId().toString().c_str(),
+ entry->toString().c_str());
+ return false;
+ }
+
+ return true;
+}
+
+bool
+VisitorOperation::allowInconsistencies() const noexcept
+{
+ return _msg->visitInconsistentBuckets();
+}
+
+bool
+VisitorOperation::assignBucketsToNodes(NodeToBucketsMap& nodeToBucketsMap)
+{
+ for (const auto& subBucket : _superBucket.subBucketsVisitOrder) {
+ auto subIter(_superBucket.subBuckets.find(subBucket));
+ assert(subIter != _superBucket.subBuckets.end());
+
+ BucketInfo& bucketInfo(subIter->second);
+ if (shouldSkipBucket(bucketInfo)) {
+ LOG(spam,
+ "Skipping subbucket %s because it is done/active/failed: %s",
+ subBucket.toString().c_str(),
+ bucketInfo.toString().c_str());
+ continue;
+ }
+
+ BucketDatabase::Entry entry(_owner.getBucketDatabase().get(subBucket));
+ if (!bucketIsValidAndConsistent(entry)) {
+ return false;
+ }
+
+ int node = pickTargetNode(entry, bucketInfo.triedNodes);
+ if (node == -1) {
+ return false;
+ }
+ LOG(spam, "Visiting %s on node %d", subBucket.toString().c_str(), node);
+ bucketInfo.activeNode = node;
+ bucketInfo.triedNodes.push_back(node);
+ nodeToBucketsMap[node].push_back(subBucket);
+ }
+ return true;
+}
+
+int
+VisitorOperation::getNumVisitorsToSendForNode(uint16_t node,
+ uint32_t totalBucketsOnNode) const
+{
+ int visitorCountAvailable(
+ std::max(1, static_cast<int>(_config.maxVisitorsPerNodePerVisitor -
+ _activeNodes[node])));
+
+ int visitorCountMinBucketsPerVisitor(
+ std::max(1, static_cast<int>(totalBucketsOnNode / _config.minBucketsPerVisitor)));
+
+ int visitorCount(
+ std::min(visitorCountAvailable, visitorCountMinBucketsPerVisitor));
+ LOG(spam,
+ "Will send %d visitors to node %d (available=%d, "
+ "buckets restricted=%d)",
+ visitorCount,
+ node,
+ visitorCountAvailable,
+ visitorCountMinBucketsPerVisitor);
+
+ return visitorCount;
+}
+
+bool
+VisitorOperation::sendStorageVisitors(const NodeToBucketsMap& nodeToBucketsMap,
+ DistributorMessageSender& sender)
+{
+ bool visitorsSent = false;
+ for (NodeToBucketsMap::const_iterator iter = nodeToBucketsMap.begin();
+ iter != nodeToBucketsMap.end();
+ ++iter) {
+ if (iter->second.size() > 0) {
+ int visitorCount(getNumVisitorsToSendForNode(iter->first, iter->second.size()));
+
+ std::vector<std::vector<document::BucketId> > bucketsVector(visitorCount);
+ for (unsigned int i = 0; i < iter->second.size(); i++) {
+ bucketsVector[i % visitorCount].push_back(iter->second[i]);
+ }
+ for (int i = 0; i < visitorCount; i++) {
+ LOG(spam,
+ "Send visitor to node %d with %u buckets",
+ iter->first,
+ (unsigned int)bucketsVector[i].size());
+
+ sendStorageVisitor(iter->first,
+ bucketsVector[i],
+ _msg->getMaximumPendingReplyCount(),
+ sender);
+
+ visitorsSent = true;
+ }
+ } else {
+ LOG(spam, "Do not send visitor to node %d, no buckets", iter->first);
+ }
+ }
+ return visitorsSent;
+}
+
+uint32_t
+VisitorOperation::computeVisitorQueueTimeoutMs() const noexcept
+{
+ return timeLeft() / 2;
+}
+
+void
+VisitorOperation::sendStorageVisitor(uint16_t node,
+ const std::vector<document::BucketId>& buckets,
+ uint32_t pending,
+ DistributorMessageSender& sender)
+{
+ api::CreateVisitorCommand::SP cmd(new api::CreateVisitorCommand(*_msg));
+ cmd->getBuckets() = buckets;
+
+ // TODO: Send this through distributor - do after moving visitor stuff from docapi to storageprotocol
+ cmd->setControlDestination(_msg->getControlDestination());
+ cmd->setToTime(_toTime);
+
+ vespalib::asciistream os;
+ os << _msg->getInstanceId() << '-'
+ << _owner.getIndex() << '-' << cmd->getMsgId();
+
+ vespalib::string storageInstanceId(os.str());
+ cmd->setInstanceId(storageInstanceId);
+ cmd->setAddress(api::StorageMessageAddress(_owner.getClusterName(),
+ lib::NodeType::STORAGE, node));
+ cmd->setMaximumPendingReplyCount(pending);
+ cmd->setQueueTimeout(computeVisitorQueueTimeoutMs());
+
+ _sentMessages[cmd->getMsgId()] = cmd;
+
+ cmd->setTimeout(timeLeft());
+
+ LOG(spam, "Priority is %d", cmd->getPriority());
+ LOG(debug, "Sending CreateVisitor command %zu for storage visitor '%s' to %s",
+ cmd->getMsgId(),
+ storageInstanceId.c_str(),
+ cmd->getAddress()->toString().c_str());
+
+ _activeNodes[node]++;
+ sender.sendCommand(cmd);
+}
+
+void
+VisitorOperation::sendReply(const api::ReturnCode& code, DistributorMessageSender& sender)
+{
+ if (!_sentReply) {
+ // Send create visitor reply
+ api::CreateVisitorReply::SP reply(new api::CreateVisitorReply(*_msg));
+ _trace.moveTraceTo(reply->getTrace().getRoot());
+ reply->setLastBucket(getLastBucketVisited());
+ reply->setResult(code);
+
+ reply->setVisitorStatistics(_visitorStatistics);
+ LOG(debug,
+ "Sending CreateVisitor reply %zu with return code '%s' for visitor "
+ "'%s', msg id '%zu' back to client",
+ reply->getMsgId(),
+ code.toString().c_str(),
+ _msg->getInstanceId().c_str(), _msg->getMsgId());
+
+ sender.sendReply(reply);
+
+ if (_metrics) {
+ framework::MilliSecTime timeNow(_owner.getClock().getTimeInMillis());
+ _metrics->latency.addValue((timeNow - _startVisitorTime).getTime());
+ }
+ _sentReply = true;
+ }
+}
+
+void
+VisitorOperation::onClose(DistributorMessageSender& sender)
+{
+ sendReply(api::ReturnCode(api::ReturnCode::ABORTED, "Process is shutting down"),
+ sender);
+}
+
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/operations/external/visitoroperation.h b/storage/src/vespa/storage/distributor/operations/external/visitoroperation.h
new file mode 100644
index 00000000000..6de20b81f25
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/visitoroperation.h
@@ -0,0 +1,193 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storageapi/defs.h>
+#include <vespa/storage/distributor/operations/operation.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <vespa/storage/distributor/visitormetricsset.h>
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/storage/visiting/memory_bounded_trace.h>
+
+namespace document {
+class Document;
+}
+
+namespace storage {
+
+namespace distributor {
+
+class DistributorComponent;
+
+class VisitorOperation : public Operation
+{
+public:
+ struct Config {
+ Config(const framework::MilliSecTime& storageNetworkLatency_,
+ uint32_t minBucketsPerVisitor_,
+ uint32_t maxVisitorsPerNodePerVisitor_)
+ : storageNetworkLatency(storageNetworkLatency_),
+ minBucketsPerVisitor(minBucketsPerVisitor_),
+ maxVisitorsPerNodePerVisitor(maxVisitorsPerNodePerVisitor_) {}
+
+ framework::MilliSecTime storageNetworkLatency;
+ uint32_t minBucketsPerVisitor;
+ uint32_t maxVisitorsPerNodePerVisitor;
+ };
+
+ VisitorOperation(DistributorComponent& manager,
+ const std::shared_ptr<api::CreateVisitorCommand> & msg,
+ const Config& config,
+ VisitorMetricSet* metrics = NULL);
+
+ ~VisitorOperation();
+
+ void onClose(DistributorMessageSender& sender);
+
+ void onStart(DistributorMessageSender& sender);
+
+ void onReceive(DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply> & msg);
+
+ const char* getName() const { return "visit"; }
+
+ std::string getStatus() const { return ""; }
+
+private:
+ struct BucketInfo {
+ bool done;
+ int activeNode;
+ uint16_t failedCount;
+ std::vector<uint16_t> triedNodes;
+
+ BucketInfo()
+ : done(false), activeNode(-1), failedCount(0), triedNodes()
+ {
+ }
+
+ void print(vespalib::asciistream & out) const;
+ vespalib::string toString() const;
+ };
+
+ typedef std::map<document::BucketId, BucketInfo> VisitBucketMap;
+
+ struct SuperBucketInfo {
+ document::BucketId bid;
+ bool subBucketsCompletelyExpanded;
+ VisitBucketMap subBuckets;
+ std::vector<document::BucketId> subBucketsVisitOrder;
+
+ SuperBucketInfo(const document::BucketId& b = document::BucketId(0))
+ : bid(b),
+ subBucketsCompletelyExpanded(false)
+ {
+ }
+
+ };
+
+ typedef std::map<uint16_t, std::vector<document::BucketId> > NodeToBucketsMap;
+ typedef std::map<uint64_t, api::CreateVisitorCommand::SP> SentMessagesMap;
+
+ void sendReply(const api::ReturnCode& code, DistributorMessageSender& sender);
+ void verifyDistributorsAreAvailable();
+ void verifyVisitorDistributionBitCount(const document::BucketId&);
+ void verifyDistributorIsNotDown(const lib::ClusterState&);
+ void verifyDistributorOwnsBucket(const document::BucketId&);
+ void verifyOperationContainsBuckets();
+ void verifyOperationHasSuperbucketAndProgress();
+ void verifyOperationSentToCorrectDistributor();
+ bool verifyCreateVisitorCommand(DistributorMessageSender& sender);
+ bool pickBucketsToVisit(const std::vector<BucketDatabase::Entry>& buckets);
+ bool expandBucketAll();
+ bool expandBucketContaining();
+ bool expandBucketContained();
+ void expandBucket();
+ int pickTargetNode(
+ const BucketDatabase::Entry& entry,
+ const std::vector<uint16_t>& triedNodes);
+ void attemptToParseOrderingSelector();
+ bool documentSelectionMayHaveOrdering() const;
+ bool parseDocumentSelection(DistributorMessageSender& sender);
+ bool maySendNewStorageVisitors() const noexcept;
+ void startNewVisitors(DistributorMessageSender& sender);
+ void initializeActiveNodes();
+ bool shouldSkipBucket(const BucketInfo& bucketInfo) const;
+ bool bucketIsValidAndConsistent(const BucketDatabase::Entry& entry) const;
+ bool allowInconsistencies() const noexcept;
+ bool shouldAbortDueToTimeout() const noexcept;
+ bool assignBucketsToNodes(NodeToBucketsMap& nodeToBucketsMap);
+ int getNumVisitorsToSendForNode(uint16_t node,
+ uint32_t totalBucketsOnNode) const;
+ uint32_t computeVisitorQueueTimeoutMs() const noexcept;
+ bool sendStorageVisitors(const NodeToBucketsMap& nodeToBucketsMap,
+ DistributorMessageSender& sender);
+ void sendStorageVisitor(uint16_t node,
+ const std::vector<document::BucketId>& buckets,
+ uint32_t pending,
+ DistributorMessageSender& sender);
+ void markCompleted(const document::BucketId& bid,
+ const api::ReturnCode& code);
+ /**
+ * Operation failed and we can pin the blame on a specific node. Updates
+ * internal error code and augments error message with the index of the
+ * failing node.
+ */
+ void markOperationAsFailedDueToNodeError(
+ const api::ReturnCode& result,
+ uint16_t fromFailingNodeIndex);
+ /**
+ * Operation failed but cannot blame a specific node in the failing context.
+ * Only overwrites current error code if `result` has a higher numeric
+ * code value, which avoids overwriting more critical errors.
+ */
+ void markOperationAsFailed(const api::ReturnCode& result);
+ bool isSpecialBucketForOrderDoc(const document::BucketId& bucketId) const;
+ std::vector<document::BucketId>::const_iterator addSpecialBucketsForOrderDoc(
+ std::vector<document::BucketId>::const_iterator iter,
+ std::vector<document::BucketId>::const_iterator end);
+ /**
+ * Compute time remaining of visitor in milliseconds, relative to timeout
+ * time point. In case of the current time having passed the timeout
+ * point, function returns 0.
+ */
+ uint64_t timeLeft() const noexcept;
+
+ std::unique_ptr<document::OrderingSpecification> _ordering;
+
+ DistributorComponent& _owner;
+ SentMessagesMap _sentMessages;
+
+ api::CreateVisitorCommand::SP _msg;
+ api::ReturnCode _storageError;
+ bool _sentReply;
+
+ SuperBucketInfo _superBucket;
+ document::BucketId _lastBucket;
+
+ api::Timestamp _fromTime;
+ api::Timestamp _toTime;
+
+ std::vector<uint32_t> _activeNodes;
+ uint32_t _bucketCount;
+
+ framework::MilliSecTime _startVisitorTime;
+ vdslib::VisitorStatistics _visitorStatistics;
+
+ Config _config;
+ VisitorMetricSet* _metrics;
+ MemoryBoundedTrace _trace;
+
+ static constexpr size_t TRACE_SOFT_MEMORY_LIMIT = 65536;
+
+ bool done();
+ bool hasNoPendingMessages();
+ document::BucketId getLastBucketVisited();
+ mbus::TraceNode trace;
+};
+
+}
+
+}
+
+
diff --git a/storage/src/vespa/storage/distributor/operations/external/visitororder.h b/storage/src/vespa/storage/distributor/operations/external/visitororder.h
new file mode 100644
index 00000000000..02001afcdb1
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/external/visitororder.h
@@ -0,0 +1,83 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+namespace storage {
+
+namespace distributor {
+
+struct VisitorOrder {
+ const document::OrderingSpecification& _ordering;
+
+ VisitorOrder(const document::OrderingSpecification& ordering)
+ : _ordering(ordering) {}
+
+ document::BucketId::Type getOrder(const document::BucketId& bid) {
+ int32_t orderBitCount = _ordering.getWidthBits() -
+ _ordering.getDivisionBits();
+ document::BucketId::Type order = bid.withoutCountBits();
+ order >>= 32;
+ order <<= 64 - orderBitCount;
+ order = document::BucketId::reverse(order);
+ return order;
+ }
+
+ document::BucketId::Type padOnesRight(const document::BucketId::Type& id,
+ int32_t count) {
+ document::BucketId::Type res = id;
+ document::BucketId::Type one = 1;
+ for (int32_t i=0; i<count; i++) {
+ res |= (one << i);
+ }
+ return res;
+ }
+
+ bool operator()(const document::BucketId& a, const document::BucketId& b) {
+ if (a == document::BucketId(INT_MAX) ||
+ b == document::BucketId(0, 0)) {
+ return false; // All before max, non before null
+ }
+ if (a == document::BucketId(0, 0) ||
+ b == document::BucketId(INT_MAX)) {
+ return true; // All after null, non after max
+ }
+ int32_t orderBitCount = _ordering.getWidthBits() -
+ _ordering.getDivisionBits();
+ int32_t aOrderBitsUsed = std::max((int32_t)a.getUsedBits() - 32, 0);
+ int32_t bOrderBitsUsed = std::max((int32_t)b.getUsedBits() - 32, 0);
+ if (orderBitCount <= 0 ||
+ aOrderBitsUsed == 0 ||
+ bOrderBitsUsed == 0) {
+ return (a.toKey() < b.toKey()); // Reversed bucket id order
+ }
+
+ document::BucketId::Type aOrder = getOrder(a);
+ document::BucketId::Type bOrder = getOrder(b);
+
+ document::BucketId::Type sOrder = _ordering.getOrderingStart();
+ sOrder <<= 64 - _ordering.getWidthBits();
+ sOrder >>= 64 - orderBitCount;
+
+ if (_ordering.getOrder() == document::OrderingSpecification::ASCENDING) {
+ aOrder = padOnesRight(aOrder, orderBitCount - aOrderBitsUsed);
+ bOrder = padOnesRight(bOrder, orderBitCount - bOrderBitsUsed);
+ }
+
+ aOrder -= sOrder;
+ bOrder -= sOrder;
+
+ if (_ordering.getOrder() == document::OrderingSpecification::DESCENDING) {
+ aOrder = -aOrder;
+ bOrder = -bOrder;
+ }
+
+ if (aOrder == bOrder) {
+ return (a.toKey() < b.toKey()); // Reversed bucket id order
+ }
+ return (aOrder < bOrder);
+ }
+};
+
+}
+
+}
+
+
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/CMakeLists.txt b/storage/src/vespa/storage/distributor/operations/idealstate/CMakeLists.txt
new file mode 100644
index 00000000000..5073d222f94
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_distributoroperationidealstate OBJECT
+ SOURCES
+ idealstateoperation.cpp
+ joinoperation.cpp
+ mergeoperation.cpp
+ splitoperation.cpp
+ removebucketoperation.cpp
+ setbucketstateoperation.cpp
+ garbagecollectionoperation.cpp
+ mergelimiter.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/garbagecollectionoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/garbagecollectionoperation.cpp
new file mode 100644
index 00000000000..2deab309ea4
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/garbagecollectionoperation.cpp
@@ -0,0 +1,80 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/operations/idealstate/garbagecollectionoperation.h>
+#include <vespa/storageapi/messageapi/storagereply.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storage/distributor/distributor.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/storageapi/message/removelocation.h>
+
+#include <vespa/log/log.h>
+
+LOG_SETUP(".distributor.operation.idealstate.remove");
+
+using namespace storage::distributor;
+
+void
+GarbageCollectionOperation::onStart(DistributorMessageSender& sender)
+{
+ BucketDatabase::Entry entry = _manager->getDistributorComponent().getBucketDatabase().get(getBucketId());
+ std::vector<uint16_t> nodes = entry->getNodes();
+
+ for (uint32_t i = 0; i < nodes.size(); i++) {
+ std::shared_ptr<api::RemoveLocationCommand> command(
+ new api::RemoveLocationCommand(
+ _manager->getDistributorComponent().getDistributor().getConfig().getGarbageCollectionSelection(),
+ getBucketId()));
+
+ command->setPriority(_priority);
+ _tracker.queueCommand(command, nodes[i]);
+ }
+
+ _tracker.flushQueue(sender);
+
+ if (_tracker.finished()) {
+ done();
+ }
+}
+
+void
+GarbageCollectionOperation::onReceive(DistributorMessageSender&,
+ const std::shared_ptr<api::StorageReply>& reply)
+{
+ api::RemoveLocationReply* rep =
+ dynamic_cast<api::RemoveLocationReply*>(reply.get());
+
+ uint16_t node = _tracker.handleReply(*rep);
+
+ if (!rep->getResult().failed()) {
+ _manager->getDistributorComponent().updateBucketDatabase(
+ getBucketId(),
+ BucketCopy(_manager->getDistributorComponent().getUniqueTimestamp(),
+ node,
+ rep->getBucketInfo()));
+ } else {
+ _ok = false;
+ }
+
+ if (_tracker.finished()) {
+ if (_ok) {
+ BucketDatabase::Entry dbentry = _manager->getDistributorComponent().getBucketDatabase().get(getBucketId());
+ if (dbentry.valid()) {
+ dbentry->setLastGarbageCollectionTime(
+ _manager->getDistributorComponent().getClock().getTimeInSeconds().getTime());
+ _manager->getDistributorComponent().getBucketDatabase().update(dbentry);
+ }
+ }
+
+ done();
+ }
+}
+
+bool
+GarbageCollectionOperation::shouldBlockThisOperation(uint32_t, uint8_t) const
+{
+ return true;
+}
+
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/garbagecollectionoperation.h b/storage/src/vespa/storage/distributor/operations/idealstate/garbagecollectionoperation.h
new file mode 100644
index 00000000000..33ccf3c2abe
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/garbagecollectionoperation.h
@@ -0,0 +1,43 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/operations/idealstate/idealstateoperation.h>
+#include <vespa/storage/distributor/messagetracker.h>
+
+namespace storage
+{
+
+namespace distributor
+{
+
+class PendingMessageTracker;
+
+class GarbageCollectionOperation : public IdealStateOperation
+{
+public:
+ GarbageCollectionOperation(
+ const std::string& clusterName,
+ const BucketAndNodes& nodes)
+ : IdealStateOperation(nodes), _tracker(clusterName) {};
+
+ /**
+ Sends messages, calls done() if we are done (sent nothing).
+ */
+ void onStart(DistributorMessageSender& sender);
+
+ void onReceive(DistributorMessageSender& sender, const std::shared_ptr<api::StorageReply> &);
+
+ const char* getName() const { return "garbagecollection"; };
+
+ Type getType() const { return GARBAGE_COLLECTION; }
+
+ bool shouldBlockThisOperation(uint32_t, uint8_t) const;
+
+protected:
+ MessageTracker _tracker;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/idealstateoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/idealstateoperation.cpp
new file mode 100644
index 00000000000..257e9e53d4f
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/idealstateoperation.cpp
@@ -0,0 +1,252 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/operations/idealstate/idealstateoperation.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/log/log.h>
+#include <vespa/storageapi/messageapi/maintenancecommand.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+#include <vespa/storage/distributor/idealstatemetricsset.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+
+LOG_SETUP(".distributor.operation");
+
+using namespace storage;
+using namespace storage::distributor;
+
+const uint32_t IdealStateOperation::MAINTENANCE_MESSAGE_TYPES[] =
+{
+ api::MessageType::CREATEBUCKET_ID,
+ api::MessageType::MERGEBUCKET_ID,
+ api::MessageType::DELETEBUCKET_ID,
+ api::MessageType::SPLITBUCKET_ID,
+ api::MessageType::JOINBUCKETS_ID,
+ api::MessageType::SETBUCKETSTATE_ID,
+ 0
+};
+
+IdealStateOperation::IdealStateOperation(const BucketAndNodes& bucketAndNodes)
+ : _manager(NULL),
+ _bucketAndNodes(bucketAndNodes),
+ _ok(true),
+ _priority(255)
+{
+}
+
+IdealStateOperation::~IdealStateOperation()
+{
+}
+
+BucketAndNodes::BucketAndNodes(const document::BucketId& id, uint16_t node)
+ : _id(id)
+{
+ _nodes.push_back(node);
+}
+
+BucketAndNodes::BucketAndNodes(const document::BucketId& id,
+ const std::vector<uint16_t>& nodes)
+ : _id(id),
+ _nodes(nodes)
+{
+ assert(!nodes.empty());
+ std::sort(_nodes.begin(), _nodes.end());
+}
+
+std::string
+BucketAndNodes::toString() const
+{
+ std::ostringstream ost;
+
+ ost << "[";
+
+ for (uint32_t i = 0; i < _nodes.size(); i++) {
+ if (i != 0) {
+ ost << ",";
+ }
+ ost << _nodes[i];
+ }
+
+ ost << "] ";
+ ost << _id;
+ return ost.str();
+}
+
+void
+IdealStateOperation::done()
+{
+ if (_manager != NULL) {
+ if (ok()) {
+ _manager->getMetrics().operations[getType()]->ok.inc(1);
+ } else {
+ _manager->getMetrics().operations[getType()]->failed.inc(1);
+ }
+ }
+}
+
+uint32_t
+IdealStateOperation::memorySize() const
+{
+ return sizeof(*this) + _detailedReason.size();
+}
+
+void
+IdealStateOperation::setCommandMeta(api::MaintenanceCommand& cmd) const
+{
+ cmd.setPriority(_priority);
+ cmd.setReason(_detailedReason);
+ cmd.setLoadType(
+ (*_manager->getLoadTypes())["maintenance"]);
+}
+
+std::string
+IdealStateOperation::toXML(framework::Clock& clock) const
+{
+ std::ostringstream ost;
+
+ ost << "<operation bucketid=\"" << getBucketId()
+ << "\" reason=\"" << _detailedReason << "\" operations=\"";
+
+ ost << getName() << "[";
+ for (uint32_t j = 0; j < getNodes().size(); j++) {
+ if (j != 0) {
+ ost << ",";
+ }
+ ost << getNodes()[j];
+ }
+ ost << "]";
+
+ if (getStartTime().isSet()) {
+ uint64_t timeSpent(
+ (clock.getTimeInMillis() - getStartTime()).getTime());
+ ost << "\" runtime_secs=\"" << timeSpent << "\"";
+ } else {
+ ost << "\"";
+ }
+
+ ost << "/>";
+ return ost.str();
+}
+
+namespace {
+
+class IdealStateOpChecker : public PendingMessageTracker::Checker
+{
+public:
+ bool blocked;
+ const IdealStateOperation& op;
+
+ IdealStateOpChecker(const IdealStateOperation& o)
+ : blocked(false), op(o)
+ {
+ }
+
+ bool check(uint32_t messageType, uint16_t node, uint8_t priority) override
+ {
+ (void) node;
+ if (op.shouldBlockThisOperation(messageType, priority)) {
+ blocked = true;
+ return false;
+ }
+
+ return true;
+ }
+};
+
+class RequestBucketInfoChecker : public PendingMessageTracker::Checker
+{
+public:
+ bool blocked;
+
+ RequestBucketInfoChecker()
+ : blocked(false)
+ {
+ }
+
+ bool check(uint32_t messageType, uint16_t node, uint8_t priority) override
+ {
+ (void) node;
+ (void) priority;
+ // Always block for RequestBucketInfo pending to a node involved
+ // in the ideal state operation.
+ if (messageType == api::MessageType::REQUESTBUCKETINFO_ID) {
+ blocked = true;
+ return false;
+ }
+ return true;
+ }
+};
+
+}
+
+bool
+IdealStateOperation::checkBlock(const document::BucketId& bId,
+ const PendingMessageTracker& tracker) const
+{
+ IdealStateOpChecker ichk(*this);
+ RequestBucketInfoChecker rchk;
+ const std::vector<uint16_t>& nodes(getNodes());
+ for (size_t i = 0; i < nodes.size(); ++i) {
+ tracker.checkPendingMessages(nodes[i], bId, ichk);
+ if (ichk.blocked) {
+ return true;
+ }
+ // Check messages sent to null-bucket (i.e. any bucket) for the node.
+ tracker.checkPendingMessages(nodes[i], document::BucketId(), rchk);
+ if (rchk.blocked) {
+ return true;
+ }
+ }
+ return false;
+}
+
+bool
+IdealStateOperation::checkBlockForAllNodes(
+ const document::BucketId& bid,
+ const PendingMessageTracker& tracker) const
+{
+ IdealStateOpChecker ichk(*this);
+ // Check messages sent to _any node_ for _this_ particular bucket.
+ tracker.checkPendingMessages(bid, ichk);
+ if (ichk.blocked) {
+ return true;
+ }
+ RequestBucketInfoChecker rchk;
+ // Check messages sent to null-bucket (i.e. _any bucket_) for the node.
+ const std::vector<uint16_t>& nodes(getNodes());
+ for (size_t i = 0; i < nodes.size(); ++i) {
+ tracker.checkPendingMessages(nodes[i], document::BucketId(), rchk);
+ if (rchk.blocked) {
+ return true;
+ }
+ }
+ return false;
+}
+
+
+bool
+IdealStateOperation::isBlocked(const PendingMessageTracker& tracker) const
+{
+ return checkBlock(getBucketId(), tracker);
+}
+
+std::string
+IdealStateOperation::toString() const
+{
+ std::ostringstream ost;
+ ost << getName() << " to " << _bucketAndNodes.toString()
+ << " (pri " << (int)_priority << ")";
+
+ return ost.str();
+}
+
+bool
+IdealStateOperation::shouldBlockThisOperation(uint32_t messageType,
+ uint8_t) const
+{
+ for (uint32_t i = 0; MAINTENANCE_MESSAGE_TYPES[i] != 0; ++i) {
+ if (messageType == MAINTENANCE_MESSAGE_TYPES[i]) {
+ return true;
+ }
+ }
+
+ return false;
+}
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/idealstateoperation.h b/storage/src/vespa/storage/distributor/operations/idealstate/idealstateoperation.h
new file mode 100644
index 00000000000..29e9aa7422c
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/idealstateoperation.h
@@ -0,0 +1,255 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/maintenance/maintenanceoperation.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/storageapi/messageapi/storagereply.h>
+#include <vespa/storageapi/messageapi/maintenancecommand.h>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storageframework/generic/memory/memorymanagerinterface.h>
+
+namespace storage
+{
+
+namespace distributor
+{
+
+class PendingMessageTracker;
+class IdealStateManager;
+
+/**
+ @class BucketAndNodes
+
+ Represents a target for an ideal state operation, consisting of a set of storage nodes
+ and a bucket id.
+
+ BucketAndNodes has a default sort order of nodes first (having already sorted the nodes
+ in numerical order), then BucketId, so that it can be used for scheduling by the
+ @link StateChecker class.
+*/
+class BucketAndNodes
+{
+public:
+ /**
+ Constructor for operations having only one node.
+
+ @param id Target bucket
+ @param node Target node
+ */
+ BucketAndNodes(const document::BucketId& id, uint16_t node);
+
+ /**
+ Constructor for operations with multiple target nodes.
+
+ @param id Target bucket
+ @param nodes Target nodes
+ */
+ BucketAndNodes(const document::BucketId& id,
+ const std::vector<uint16_t>& nodes);
+
+ /**
+ Changes the target bucket.
+
+ @param id The new target bucket
+ */
+ void setBucketId(const document::BucketId& id) { _id = id; }
+
+ /**
+ Returns the target bucket.
+
+ @return Returns the target bucket.
+ */
+ const document::BucketId& getBucketId() const { return _id; }
+
+ /**
+ Returns the target nodes
+
+ @return the target nodes
+ */
+ std::vector<uint16_t>& getNodes() { return _nodes; }
+
+ /**
+ Returns the target nodes
+
+ @return the target nodes
+ */
+ const std::vector<uint16_t>& getNodes() const { return _nodes; }
+
+ /**
+ Returns a string representation of this object.
+
+ @return String representation
+ */
+ std::string toString() const;
+
+private:
+ document::BucketId _id;
+ std::vector<uint16_t> _nodes;
+};
+
+/**
+ @class Operation
+
+ Superclass for ideal state operations started by the IdealStateManager.
+ Each operation has a target (BucketAndNodes), and a pointer back to the
+ IdealStateManager.
+
+ An operation is started by the start() method (from @link Callback), and
+ may send messages there. Once replies are received, the receive() method
+ (also from @link Callback) is called. When the operation is done, it should
+ call done(), where this class will call back to the IdealStateManager
+ with operationFinished(), so that the IdealStateManager can update its
+ active state, possibly reschedule other operations in the same OperationList
+ as this one, or recheck the bucket.
+*/
+class IdealStateOperation : public MaintenanceOperation
+{
+public:
+ static const uint32_t MAINTENANCE_MESSAGE_TYPES[];
+
+ typedef std::shared_ptr<IdealStateOperation> SP;
+ typedef vespalib::LinkedPtr<IdealStateOperation> LP;
+ typedef std::unique_ptr<IdealStateOperation> UP;
+ typedef std::vector<SP> Vector;
+ typedef std::map<document::BucketId, SP> Map;
+
+ IdealStateOperation(const BucketAndNodes& bucketAndNodes);
+
+ virtual ~IdealStateOperation();
+
+ void onClose(DistributorMessageSender&) {}
+
+ /**
+ Returns true if the operation was performed successfully.
+
+ @return Returns the status of the operation.
+ */
+ virtual bool ok() { return _ok; }
+
+ /**
+ Returns the target nodes of the operation.
+
+ @return The target nodes
+ */
+ std::vector<uint16_t>& getNodes() { return _bucketAndNodes.getNodes(); }
+
+ /**
+ Returns the target nodes of the operation.
+
+ @return The target nodes
+ */
+ const std::vector<uint16_t>& getNodes() const { return _bucketAndNodes.getNodes(); }
+
+ /**
+ Returns the target bucket of the operation.
+
+ @return The target bucket.
+ */
+ const document::BucketId& getBucketId() const { return _bucketAndNodes.getBucketId(); }
+
+ /**
+ Returns the target of the operation.
+
+ @return The target bucket and nodes
+ */
+ const BucketAndNodes& getBucketAndNodes() const { return _bucketAndNodes; }
+
+ /**
+ Called by the operation when it is finished. Must be called, otherwise the active
+ state won't be updated correctly.
+ */
+ virtual void done();
+
+ /**
+ Called by IdealStateManager to allow the operation to call back its
+ OperationFinished() method when done.
+
+ @param manager The ideal state manager.
+ */
+ void setIdealStateManager(IdealStateManager* manager) {
+ _manager = manager;
+ };
+
+ /**
+ Returns the type of operation this is.
+ */
+ virtual Type getType() const = 0;
+
+ /**
+ Set the priority we should send messages from this operation with.
+ */
+ void setPriority(api::StorageMessage::Priority priority)
+ { _priority = priority; }
+
+ /**
+ * Returns true if we are blocked to start this operation given
+ * the pending messages.
+ */
+ virtual bool isBlocked(const PendingMessageTracker& pendingMessages) const;
+
+ /**
+ Returns the priority we should send messages with.
+ */
+ api::StorageMessage::Priority getPriority() { return _priority; }
+
+ void setDetailedReason(const std::string& detailedReason) {
+ _detailedReason = detailedReason;
+ }
+ void setDetailedReason(std::string&& detailedReason) {
+ _detailedReason = std::move(detailedReason);
+ }
+
+ const std::string& getDetailedReason() const {
+ return _detailedReason;
+ }
+
+ uint32_t memorySize() const;
+
+ /**
+ * Sets the various metadata for the given command that
+ * is common for all ideal state operations.
+ */
+ void setCommandMeta(api::MaintenanceCommand& cmd) const;
+
+ std::string toXML(framework::Clock& clock) const;
+
+ std::string toString() const;
+
+ /**
+ * Should return true if the given message type should block this operation.
+ */
+ virtual bool shouldBlockThisOperation(uint32_t messageType, uint8_t priority) const;
+
+protected:
+ friend class IdealStateManagerTest;
+ friend class IdealStateManager;
+
+ IdealStateManager* _manager;
+ BucketAndNodes _bucketAndNodes;
+ std::string _detailedReason;
+
+ bool _ok;
+ api::StorageMessage::Priority _priority;
+ framework::MemoryToken::UP _memoryToken;
+
+ /**
+ * Checks if the given bucket is blocked by any pending messages to any
+ * node _explicitly part of this ideal state operation_. If there are
+ * operations to other nodes for this bucket, these will not be part of
+ * the set of messages checked.
+ */
+ bool checkBlock(const document::BucketId& bId,
+ const PendingMessageTracker& tracker) const;
+
+ bool checkBlockForAllNodes(const document::BucketId& bId,
+ const PendingMessageTracker& tracker) const;
+
+};
+
+}
+
+
+}
+
+
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/joinoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/joinoperation.cpp
new file mode 100644
index 00000000000..bc7322b579a
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/joinoperation.cpp
@@ -0,0 +1,158 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/operations/idealstate/joinoperation.h>
+#include <vespa/storageapi/messageapi/storagereply.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+#include <vespa/log/log.h>
+
+LOG_SETUP(".distributor.operation.idealstate.join");
+
+using namespace storage::distributor;
+
+void
+JoinOperation::onStart(DistributorMessageSender& sender)
+{
+ _ok = false;
+
+ if (_bucketsToJoin.size() == 1) {
+ LOG(debug, "Starting join operation for %s -> %s",
+ _bucketsToJoin[0].toString().c_str(), getBucketId().toString().c_str());
+ } else {
+ LOG(debug, "Starting join operation for (%s,%s) -> %s",
+ _bucketsToJoin[0].toString().c_str(),
+ _bucketsToJoin[1].toString().c_str(),
+ getBucketId().toString().c_str());
+ }
+
+ std::sort(_bucketsToJoin.begin(), _bucketsToJoin.end());
+
+ auto nodeToBuckets = resolveSourceBucketsPerTargetNode();
+ fillMissingSourceBucketsForInconsistentJoins(nodeToBuckets);
+
+ _ok = enqueueJoinMessagePerTargetNode(nodeToBuckets);
+
+ if (!_ok) {
+ LOGBP(debug, "Unable to join bucket %s, since no copies are available (some in maintenance?)", getBucketId().toString().c_str());
+ done();
+ } else {
+ _tracker.flushQueue(sender);
+ }
+}
+
+JoinOperation::NodeToBuckets
+JoinOperation::resolveSourceBucketsPerTargetNode() const
+{
+ NodeToBuckets nodeToBuckets;
+ const auto& db(_manager->getDistributorComponent().getBucketDatabase());
+ for (const auto& bucket : _bucketsToJoin) {
+ BucketDatabase::Entry entry(db.get(bucket));
+
+ for (uint32_t j = 0; j < entry->getNodeCount(); j++) {
+ nodeToBuckets[entry->getNodeRef(j).getNode()].push_back(bucket);
+ }
+ }
+ return nodeToBuckets;
+}
+
+void
+JoinOperation::fillMissingSourceBucketsForInconsistentJoins(
+ NodeToBuckets& nodeToBuckets) const
+{
+ for (auto& node : nodeToBuckets) {
+ if (node.second.size() == 1) {
+ document::BucketId source = node.second.front();
+ node.second.push_back(source);
+ }
+ }
+}
+
+bool
+JoinOperation::enqueueJoinMessagePerTargetNode(
+ const NodeToBuckets& nodeToBuckets)
+{
+ if (nodeToBuckets.empty()) {
+ return false;
+ }
+ for (const auto& node : nodeToBuckets) {
+ std::shared_ptr<api::JoinBucketsCommand> msg(
+ new api::JoinBucketsCommand(getBucketId()));
+ msg->getSourceBuckets() = node.second;
+ msg->setTimeout(INT_MAX);
+ setCommandMeta(*msg);
+ _tracker.queueCommand(msg, node.first);
+ }
+ return true;
+}
+
+void
+JoinOperation::onReceive(DistributorMessageSender&, const api::StorageReply::SP& msg)
+{
+ api::JoinBucketsReply& rep = static_cast<api::JoinBucketsReply&>(*msg);
+ uint16_t node = _tracker.handleReply(rep);
+ if (node == 0xffff) {
+ LOG(debug, "Ignored reply since node was max uint16_t for unknown "
+ "reasons");
+ return;
+ }
+
+ if (rep.getResult().success()) {
+ const std::vector<document::BucketId>& sourceBuckets(
+ rep.getSourceBuckets());
+ for (uint32_t i = 0; i < sourceBuckets.size(); i++) {
+ _manager->getDistributorComponent().removeNodeFromDB(sourceBuckets[i], node);
+ }
+
+ // Add new buckets.
+ if (!rep.getBucketInfo().valid()) {
+ LOG(debug, "Invalid bucketinfo for bucket %s returned in join",
+ getBucketId().toString().c_str());
+ } else {
+ _manager->getDistributorComponent().updateBucketDatabase(
+ getBucketId(),
+ BucketCopy(_manager->getDistributorComponent().getUniqueTimestamp(),
+ node,
+ rep.getBucketInfo()),
+ DatabaseUpdate::CREATE_IF_NONEXISTING);
+
+ LOG(spam, "Adding joined bucket %s", getBucketId().toString().c_str());
+ }
+ } else if (rep.getResult().getResult() == api::ReturnCode::BUCKET_NOT_FOUND
+ && _manager->getDistributorComponent().getBucketDatabase().get(getBucketId())->getNode(node) != 0)
+ {
+ _manager->getDistributorComponent().recheckBucketInfo(node, getBucketId());
+ LOGBP(warning, "Join failed to find %s: %s",
+ getBucketId().toString().c_str(),
+ rep.getResult().toString().c_str());
+ } else if (rep.getResult().isBusy()) {
+ LOG(debug, "Join failed for %s, node was busy. Will retry later",
+ getBucketId().toString().c_str());
+ } else if (rep.getResult().isCriticalForMaintenance()) {
+ LOGBP(warning, "Join failed for %s: %s with error '%s'",
+ getBucketId().toString().c_str(), msg->toString().c_str(),
+ msg->getResult().toString().c_str());
+ } else {
+ LOG(debug, "Join failed for %s with non-critical failure: %s",
+ getBucketId().toString().c_str(),
+ rep.getResult().toString().c_str());
+ }
+ _ok = rep.getResult().success();
+
+ LOG(debug, "Bucket %s join finished", getBucketId().toString().c_str());
+ if (_tracker.finished()) {
+ done();
+ }
+}
+
+bool
+JoinOperation::isBlocked(const PendingMessageTracker& tracker) const
+{
+ return (checkBlock(getBucketId(), tracker) ||
+ checkBlock(_bucketsToJoin[0], tracker) ||
+ (_bucketsToJoin.size() > 1 && checkBlock(_bucketsToJoin[1], tracker)));
+}
+
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/joinoperation.h b/storage/src/vespa/storage/distributor/operations/idealstate/joinoperation.h
new file mode 100644
index 00000000000..36b274629d5
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/joinoperation.h
@@ -0,0 +1,64 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storage/distributor/operations/idealstate/idealstateoperation.h>
+#include <vespa/storage/distributor/messagetracker.h>
+
+namespace storage {
+namespace distributor {
+
+class JoinOperation : public IdealStateOperation
+{
+public:
+ /**
+ * Creates a new join operation.
+ *
+ * @param clusterName The name of this storage cluster.
+ * @param bucketAndNodes The bucket to join into, along with the nodes this operation uses.
+ * @param bucketsToJoin The buckets to join together. The size of this array should always be either one or two.
+ */
+ JoinOperation(const std::string& clusterName,
+ const BucketAndNodes& nodes,
+ const std::vector<document::BucketId>& bucketsToJoin)
+ : IdealStateOperation(nodes),
+ _tracker(clusterName),
+ _bucketsToJoin(bucketsToJoin)
+ {};
+
+ ~JoinOperation() {}
+
+ void onStart(DistributorMessageSender& sender) override;
+
+ void onReceive(DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply>&) override;
+
+ const char* getName() const override {
+ return "join";
+ }
+
+ Type getType() const override {
+ return JOIN_BUCKET;
+ }
+
+ bool isBlocked(const PendingMessageTracker& pendingMessages) const override;
+
+protected:
+ using NodeToBuckets = std::map<uint16_t, std::vector<document::BucketId>>;
+ NodeToBuckets resolveSourceBucketsPerTargetNode() const;
+
+ void fillMissingSourceBucketsForInconsistentJoins(
+ NodeToBuckets& nodeToBuckets) const;
+
+ /**
+ * Returns true if any messages were enqueued, false otherwise.
+ */
+ bool enqueueJoinMessagePerTargetNode(const NodeToBuckets& nodeToBuckets);
+
+ MessageTracker _tracker;
+ std::vector<document::BucketId> _bucketsToJoin;
+};
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.cpp
new file mode 100644
index 00000000000..18a2f8c8118
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.cpp
@@ -0,0 +1,178 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/storage/distributor/operations/idealstate/mergelimiter.h>
+#include <vespa/vdslib/container/smallvector.h>
+
+#include <vespa/log/log.h>
+
+LOG_SETUP(".distributor.operations.merge.limiter");
+
+namespace storage {
+namespace distributor {
+
+MergeLimiter::MergeLimiter(uint16_t maxNodes)
+ : _maxNodes(maxNodes)
+{
+ LOG(spam, "Limiter initialized with %u nodes.", uint32_t(maxNodes));
+}
+
+namespace {
+ class EqualCopies {
+ uint32_t _checksum;
+ lib::SmallVector<MergeMetaData> _copies;
+ uint32_t _trustedCopies;
+
+ public:
+ EqualCopies()
+ : _checksum(0),
+ _trustedCopies(0)
+ {
+ }
+
+ bool hasTrusted() const { return (_trustedCopies > 0); }
+ uint32_t trustedCount() const { return _trustedCopies; }
+ uint32_t size() const { return _copies.size(); }
+ bool operator==(const MergeMetaData& mmd) const {
+ return (_checksum == mmd.checksum());
+ }
+ void add(const MergeMetaData& mmd) {
+ if (_copies.empty()) _checksum = mmd.checksum();
+ if (mmd.trusted()) ++_trustedCopies;
+ _copies.push_back(mmd);
+ }
+ MergeMetaData extractNext() {
+ MergeMetaData data = _copies.back();
+ _copies.pop_back();
+ return data;
+ }
+ };
+
+ vespalib::asciistream& operator<<(vespalib::asciistream& out,
+ const EqualCopies& e)
+ {
+ return out << "EqualCopies(" << e.size() << ")";
+ }
+
+ class Statistics {
+ lib::SmallVector<EqualCopies> _groups;
+ uint32_t _trustedCopies;
+
+ public:
+ Statistics() : _trustedCopies(0) {}
+ Statistics(const MergeLimiter::NodeArray& a)
+ : _trustedCopies(0)
+ {
+ for (uint32_t i=0, n=a.size(); i<n; ++i) {
+ add(a[i]);
+ if (a[i].trusted()) {
+ ++_trustedCopies;
+ }
+ }
+ }
+
+ EqualCopies& getMajority() {
+ EqualCopies* candidate = 0;
+ uint32_t size = 0;
+ for (uint32_t i=0, n=_groups.size(); i<n; ++i) {
+ if (_groups[i].size() > size) {
+ candidate = &_groups[i];
+ size = candidate->size();
+ }
+ }
+ assert(candidate != 0);
+ return *candidate;
+ }
+
+ bool hasTrusted() const { return (_trustedCopies > 0); }
+ uint32_t trustedCount() const { return _trustedCopies; }
+
+ Statistics extractGroupsWithTrustedCopies() {
+ lib::SmallVector<EqualCopies> _remaining;
+ Statistics trusted;
+ for (uint32_t i=0, n=_groups.size(); i<n; ++i) {
+ if (_groups[i].hasTrusted()) {
+ trusted._groups.push_back(_groups[i]);
+ trusted._trustedCopies += _groups[i].trustedCount();
+ } else {
+ _remaining.push_back(_groups[i]);
+ _trustedCopies -= _groups[i].trustedCount();
+ }
+ }
+ swap(_remaining, _groups);
+ return trusted;
+ }
+ bool extractNext(MergeMetaData& data, uint32_t& last) {
+ if (_groups.empty()) return false;
+ if (++last >= _groups.size()) { last = 0; }
+ data = _groups[last].extractNext();
+ if (_groups[last].size() == 0) {
+ removeGroup(last);
+ --last;
+ }
+ return true;
+ }
+ void removeGroup(uint32_t groupIndex) {
+ lib::SmallVector<EqualCopies> remaining;
+ for (uint32_t i=0, n=_groups.size(); i<n; ++i) {
+ if (i != groupIndex) remaining.push_back(_groups[i]);
+ }
+ remaining.swap(_groups);
+ }
+
+ private:
+ void add(const MergeMetaData& mmd) {
+ for (uint32_t i=0; i<_groups.size(); ++i) {
+ if (_groups[i] == mmd) {
+ _groups[i].add(mmd);
+ return;
+ }
+ }
+ _groups.push_back(EqualCopies());
+ _groups.back().add(mmd);
+ }
+ };
+
+ // Add up to max nodes, where different variants exist, prefer having
+ // some of each.
+ void addNodes(uint32_t max, Statistics& stats,
+ MergeLimiter::NodeArray& result)
+ {
+ uint32_t last = -1;
+ for (uint32_t i=0; i<max; ++i) {
+ MergeMetaData data;
+ if (!stats.extractNext(data, last)) return;
+ result.push_back(data);
+ }
+ }
+
+ struct SourceOnlyOrder {
+ bool operator()(const MergeMetaData& m1, const MergeMetaData& m2) {
+ if (m1._sourceOnly == m2._sourceOnly) return false;
+ return m2._sourceOnly;
+ }
+ };
+}
+
+void
+MergeLimiter::limitMergeToMaxNodes(NodeArray& nodes)
+{
+ // If not above max anyhow, we need not do anything
+ if (nodes.size() <= _maxNodes) return;
+ // Gather some statistics to base decision on what we are going to do on
+ Statistics stats(nodes);
+ NodeArray result;
+ // If we have trusted copies, these should be complete. Pick one of them
+ // and merge with as many untrusted copies as possible
+ if (stats.hasTrusted()) {
+ Statistics trusted(stats.extractGroupsWithTrustedCopies());
+ addNodes(_maxNodes - 1, stats, result);
+ addNodes(_maxNodes - result.size(), trusted, result);
+ } else {
+ addNodes(_maxNodes, stats, result);
+ }
+ std::stable_sort(result.begin(), result.end(), SourceOnlyOrder());
+ result.swap(nodes);
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.h b/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.h
new file mode 100644
index 00000000000..acde58e6061
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergelimiter.h
@@ -0,0 +1,23 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/storage/distributor/operations/idealstate/mergemetadata.h>
+#include <vector>
+
+namespace storage {
+namespace distributor {
+
+class MergeLimiter {
+ uint16_t _maxNodes;
+
+public:
+ typedef std::vector<MergeMetaData> NodeArray;
+
+ MergeLimiter(uint16_t maxNodes);
+
+ void limitMergeToMaxNodes(NodeArray&);
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergemetadata.h b/storage/src/vespa/storage/distributor/operations/idealstate/mergemetadata.h
new file mode 100644
index 00000000000..3edc9a3f268
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergemetadata.h
@@ -0,0 +1,38 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/storage/distributor/bucketdb/bucketcopy.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+
+namespace storage {
+namespace distributor {
+
+struct MergeMetaData {
+ uint16_t _nodeIndex;
+ bool _sourceOnly;
+ const BucketCopy* _copy;
+
+ MergeMetaData() : _nodeIndex(0), _sourceOnly(false), _copy(0) {}
+ MergeMetaData(uint16_t nodeIndex, const BucketCopy& copy)
+ : _nodeIndex(nodeIndex), _sourceOnly(false), _copy(&copy) {}
+
+ bool trusted() const {
+ assert(_copy != 0);
+ return _copy->trusted();
+ }
+ uint32_t checksum() const {
+ assert(_copy != 0);
+ return _copy->getChecksum();
+ }
+};
+
+inline vespalib::asciistream& operator<<(vespalib::asciistream& out,
+ const MergeMetaData& e)
+{
+ return out << "MergeMetaData(" << e._nodeIndex << ")";
+}
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
new file mode 100644
index 00000000000..e448192abfc
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.cpp
@@ -0,0 +1,393 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/operations/idealstate/mergeoperation.h>
+#include <vespa/storageapi/messageapi/storagereply.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storage/distributor/operations/idealstate/removebucketoperation.h>
+
+#include <vespa/log/log.h>
+
+LOG_SETUP(".distributor.operation.idealstate.merge");
+
+namespace storage {
+namespace distributor {
+
+MergeOperation::~MergeOperation()
+{
+}
+
+std::string
+MergeOperation::getStatus() const
+{
+ return
+ Operation::getStatus() +
+ vespalib::make_string(" . Sent MergeBucketCommand at %s",
+ _sentMessageTime.toString().c_str());
+}
+
+void
+MergeOperation::addIdealNodes(
+ const lib::Distribution& distribution,
+ const lib::ClusterState& state,
+ const document::BucketId& bucketId,
+ const std::vector<MergeMetaData>& nodes,
+ std::vector<MergeMetaData>& result)
+{
+ std::vector<uint16_t> idealNodes(
+ distribution.getIdealStorageNodes(state, bucketId, "ui"));
+
+ // Add all ideal nodes first
+ for (uint32_t i = 0; i < idealNodes.size(); i++) {
+ const MergeMetaData* entry = 0;
+ for (uint32_t j = 0; j < nodes.size(); j++) {
+ if (idealNodes[i] == nodes[j]._nodeIndex) {
+ entry = &nodes[j];
+ break;
+ }
+ }
+
+ if (entry != 0) {
+ result.push_back(*entry);
+ result.back()._sourceOnly = false;
+ }
+ }
+}
+
+uint16_t
+MergeOperation::countTrusted(const std::vector<MergeMetaData>& nodes)
+{
+ uint16_t trusted = 0;
+ for (const auto& n : nodes) {
+ if (n.trusted()) {
+ ++trusted;
+ }
+ }
+ return trusted;
+}
+
+void
+MergeOperation::addTrustedNodesNotAlreadyAdded(
+ uint16_t redundancy,
+ const std::vector<MergeMetaData>& nodes,
+ std::vector<MergeMetaData>& result)
+{
+ uint16_t alreadyTrusted = countTrusted(result);
+ for (uint32_t i = 0; i < nodes.size(); i++) {
+ if (!nodes[i].trusted()) {
+ continue;
+ }
+
+ bool found = false;
+ for (uint32_t j = 0; j < result.size(); j++) {
+ if (result[j]._nodeIndex == nodes[i]._nodeIndex) {
+ found = true;
+ }
+ }
+
+ if (!found) {
+ result.push_back(nodes[i]);
+ result.back()._sourceOnly = (alreadyTrusted >= redundancy);
+ ++alreadyTrusted;
+ }
+ }
+}
+
+void
+MergeOperation::addCopiesNotAlreadyAdded(
+ uint16_t redundancy,
+ const std::vector<MergeMetaData>& nodes,
+ std::vector<MergeMetaData>& result)
+{
+ for (uint32_t i = 0; i < nodes.size(); i++) {
+ bool found = false;
+ for (uint32_t j = 0; j < result.size(); j++) {
+ if (result[j]._nodeIndex == nodes[i]._nodeIndex) {
+ found = true;
+ }
+ }
+
+ if (!found) {
+ result.push_back(nodes[i]);
+ result.back()._sourceOnly = (result.size() > redundancy);
+ }
+ }
+}
+
+void
+MergeOperation::generateSortedNodeList(
+ const lib::Distribution& distribution,
+ const lib::ClusterState& state,
+ const document::BucketId& bucketId,
+ MergeLimiter& limiter,
+ std::vector<MergeMetaData>& nodes)
+{
+ std::vector<MergeMetaData> result;
+ const uint16_t redundancy = distribution.getRedundancy();
+ addIdealNodes(distribution, state, bucketId, nodes, result);
+ addTrustedNodesNotAlreadyAdded(redundancy, nodes, result);
+ addCopiesNotAlreadyAdded(redundancy, nodes, result);
+ limiter.limitMergeToMaxNodes(result);
+ result.swap(nodes);
+}
+
+namespace {
+
+struct NodeIndexComparator
+{
+ bool operator()(const storage::api::MergeBucketCommand::Node& a,
+ const storage::api::MergeBucketCommand::Node& b) const
+ {
+ return a.index < b.index;
+ }
+};
+
+}
+
+void
+MergeOperation::onStart(DistributorMessageSender& sender)
+{
+ BucketDatabase::Entry entry = _manager->getDistributorComponent().getBucketDatabase().get(getBucketId());
+ if (!entry.valid()) {
+ LOGBP(debug,
+ "Unable to merge nonexisting bucket %s",
+ getBucketId().toString().c_str());
+ _ok = false;
+ done();
+ return;
+ }
+
+ const lib::ClusterState& clusterState(_manager->getDistributorComponent().getClusterState());
+ std::vector<vespalib::LinkedPtr<BucketCopy> > newCopies;
+ std::vector<MergeMetaData> nodes;
+
+ for (uint32_t i = 0; i < getNodes().size(); ++i) {
+ const BucketCopy* copy = entry->getNode(getNodes()[i]);
+ if (copy == 0) { // New copies?
+ newCopies.push_back(vespalib::LinkedPtr<BucketCopy>(
+ new BucketCopy(0, getNodes()[i], api::BucketInfo())));
+ copy = newCopies.back().get();
+ }
+ nodes.push_back(MergeMetaData(getNodes()[i], *copy));
+ }
+ _infoBefore = entry.getBucketInfo();
+
+ generateSortedNodeList(_manager->getDistributorComponent().getDistribution(),
+ clusterState,
+ getBucketId(),
+ _limiter,
+ nodes);
+ for (uint32_t i=0; i<nodes.size(); ++i) {
+ _mnodes.push_back(api::MergeBucketCommand::Node(
+ nodes[i]._nodeIndex, nodes[i]._sourceOnly));
+ }
+
+ if (_mnodes.size() > 1) {
+ std::shared_ptr<api::MergeBucketCommand> msg(
+ new api::MergeBucketCommand(
+ getBucketId(),
+ _mnodes,
+ _manager->getDistributorComponent().getUniqueTimestamp(),
+ clusterState.getVersion()));
+
+ // Due to merge forwarding/chaining semantics, we must always send
+ // the merge command to the lowest indexed storage node involved in
+ // the merge in order to avoid deadlocks.
+ std::sort(_mnodes.begin(), _mnodes.end(), NodeIndexComparator());
+
+ LOG(debug, "Sending %s to storage node %u", msg->toString().c_str(),
+ _mnodes[0].index);
+
+ // Set timeout to one hour to prevent hung nodes that manage to keep
+ // connections open from stalling merges in the cluster indefinitely.
+ msg->setTimeout(60 * 60 * 1000);
+ setCommandMeta(*msg);
+
+ sender.sendToNode(
+ lib::NodeType::STORAGE,
+ _mnodes[0].index,
+ msg);
+
+ _sentMessageTime = _manager->getDistributorComponent().getClock().getTimeInSeconds();
+ } else {
+ LOGBP(debug,
+ "Unable to merge bucket %s, since only one copy is available. "
+ "System state %s",
+ getBucketId().toString().c_str(),
+ clusterState.toString().c_str());
+ _ok = false;
+ done();
+ }
+}
+
+bool
+MergeOperation::sourceOnlyCopyChangedDuringMerge(
+ const BucketDatabase::Entry& currentState) const
+{
+ assert(currentState.valid());
+ for (size_t i = 0; i < _mnodes.size(); ++i) {
+ const BucketCopy* copyBefore(
+ _infoBefore.getNode(_mnodes[i].index));
+ if (!copyBefore) {
+ continue;
+ }
+ const BucketCopy* copyAfter(
+ currentState->getNode(_mnodes[i].index));
+ if (!copyAfter) {
+ LOG(debug, "Copy of %s on node %u removed during merge. Was %s",
+ getBucketId().toString().c_str(),
+ _mnodes[i].index,
+ copyBefore->toString().c_str());
+ continue;
+ }
+ if (_mnodes[i].sourceOnly
+ && !copyBefore->consistentWith(*copyAfter))
+ {
+ LOG(debug, "Source-only copy of %s on node %u changed from "
+ "%s to %s during the course of the merge. Failing it.",
+ getBucketId().toString().c_str(),
+ _mnodes[i].index,
+ copyBefore->toString().c_str(),
+ copyAfter->toString().c_str());
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void
+MergeOperation::deleteSourceOnlyNodes(
+ const BucketDatabase::Entry& currentState,
+ DistributorMessageSender& sender)
+{
+ assert(currentState.valid());
+ std::vector<uint16_t> sourceOnlyNodes;
+ for (uint32_t i = 0; i < _mnodes.size(); ++i) {
+ const uint16_t nodeIndex = _mnodes[i].index;
+ const BucketCopy* copy = currentState->getNode(nodeIndex);
+ if (!copy) {
+ continue; // No point in deleting what's not even there now.
+ }
+ if (copy->active()) {
+ LOG(spam,
+ "Not deleting copy on node %u for %s as it is marked active",
+ nodeIndex,
+ getBucketId().toString().c_str());
+ continue;
+ }
+ if (_mnodes[i].sourceOnly) {
+ sourceOnlyNodes.push_back(nodeIndex);
+ }
+ }
+
+ LOG(debug, "Attempting to delete %zu source only copies for %s",
+ sourceOnlyNodes.size(),
+ getBucketId().toString().c_str());
+
+ if (!sourceOnlyNodes.empty()) {
+ _removeOperation.reset(
+ new RemoveBucketOperation(
+ _manager->getDistributorComponent().getClusterName(),
+ BucketAndNodes(getBucketId(), sourceOnlyNodes)));
+ // Must not send removes to source only copies if something has caused
+ // pending load to the copy after the merge was sent!
+ if (_removeOperation->isBlocked(sender.getPendingMessageTracker())) {
+ LOG(debug,
+ "Source only removal for %s was blocked by a pending "
+ "operation",
+ getBucketId().toString().c_str());
+ _ok = false;
+ done();
+ return;
+ }
+ _removeOperation->setIdealStateManager(_manager);
+
+ if (_removeOperation->onStartInternal(sender)) {
+ _ok = _removeOperation->ok();
+ done();
+ }
+ }
+}
+
+void
+MergeOperation::onReceive(DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply> & msg)
+{
+ if (_removeOperation.get()) {
+ if (_removeOperation->onReceiveInternal(msg)) {
+ _ok = _removeOperation->ok();
+ done();
+ }
+
+ return;
+ }
+
+ api::MergeBucketReply& reply(dynamic_cast<api::MergeBucketReply&>(*msg));
+ LOG(debug,
+ "Merge operation for bucket %s finished",
+ getBucketId().toString().c_str());
+
+ api::ReturnCode result = reply.getResult();
+ _ok = result.success();
+ if (_ok) {
+ BucketDatabase::Entry entry(
+ _manager->getDistributorComponent().getBucketDatabase().get(getBucketId()));
+ if (!entry.valid()) {
+ LOG(debug, "Bucket %s no longer exists after merge",
+ getBucketId().toString().c_str());
+ done(); // Nothing more we can do.
+ return;
+ }
+ if (sourceOnlyCopyChangedDuringMerge(entry)) {
+ _ok = false;
+ done();
+ return;
+ }
+ deleteSourceOnlyNodes(entry, sender);
+ return;
+ } else if (result.isBusy()) {
+ } else if (result.isCriticalForMaintenance()) {
+ LOGBP(warning,
+ "Merging failed for %s: %s with error '%s'",
+ getBucketId().toString().c_str(),
+ msg->toString().c_str(),
+ msg->getResult().toString().c_str());
+ } else {
+ LOG(debug, "Merge failed for %s with non-critical failure: %s",
+ getBucketId().toString().c_str(), result.toString().c_str());
+ }
+ done();
+}
+
+namespace {
+
+static const uint32_t WRITE_FEED_MESSAGE_TYPES[] =
+{
+ api::MessageType::PUT_ID,
+ api::MessageType::REMOVE_ID,
+ api::MessageType::UPDATE_ID,
+ api::MessageType::REMOVELOCATION_ID,
+ api::MessageType::MULTIOPERATION_ID,
+ api::MessageType::BATCHPUTREMOVE_ID,
+ api::MessageType::BATCHDOCUMENTUPDATE_ID,
+ 0
+};
+
+}
+
+bool
+MergeOperation::shouldBlockThisOperation(uint32_t messageType, uint8_t pri) const
+{
+ for (uint32_t i = 0; WRITE_FEED_MESSAGE_TYPES[i] != 0; ++i) {
+ if (messageType == WRITE_FEED_MESSAGE_TYPES[i]) {
+ return true;
+ }
+ }
+
+ return IdealStateOperation::shouldBlockThisOperation(messageType, pri);
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
new file mode 100644
index 00000000000..0dfe7756b79
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/mergeoperation.h
@@ -0,0 +1,84 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/storage/distributor/operations/idealstate/idealstateoperation.h>
+#include <vespa/storage/distributor/operations/idealstate/mergelimiter.h>
+#include <vespa/storage/distributor/operations/idealstate/mergemetadata.h>
+#include <vespa/storage/distributor/operations/idealstate/removebucketoperation.h>
+#include <vespa/storageapi/message/bucket.h>
+
+namespace storage {
+namespace lib {
+ class Distribution;
+}
+namespace distributor {
+
+class MergeOperation : public IdealStateOperation
+{
+protected:
+ bool sourceOnlyCopyChangedDuringMerge(const BucketDatabase::Entry&) const;
+
+ framework::SecondTime _sentMessageTime;
+ std::vector<api::MergeBucketCommand::Node> _mnodes;
+ std::unique_ptr<RemoveBucketOperation> _removeOperation;
+ BucketInfo _infoBefore;
+ MergeLimiter _limiter;
+
+public:
+ static const int LOAD = 10;
+
+ MergeOperation(const BucketAndNodes& nodes,
+ uint16_t maxNodes = 16)
+ : IdealStateOperation(nodes),
+ _sentMessageTime(0),
+ _limiter(maxNodes)
+ {}
+
+ ~MergeOperation();
+
+ void onStart(DistributorMessageSender& sender);
+
+ void onReceive(DistributorMessageSender& sender,
+ const api::StorageReply::SP&);
+
+ const char* getName() const { return "merge"; };
+
+ std::string getStatus() const;
+
+ Type getType() const { return MERGE_BUCKET; }
+
+ /** Generates ordered list of nodes that should be included in the merge */
+ static void generateSortedNodeList(
+ const lib::Distribution&, const lib::ClusterState&,
+ const document::BucketId&, MergeLimiter&,
+ std::vector<MergeMetaData>&);
+
+ bool shouldBlockThisOperation(uint32_t messageType, uint8_t pri) const;
+private:
+ static void addIdealNodes(
+ const lib::Distribution&,
+ const lib::ClusterState&,
+ const document::BucketId&,
+ const std::vector<MergeMetaData>& nodes,
+ std::vector<MergeMetaData>& result);
+
+ static void addTrustedNodesNotAlreadyAdded(
+ uint16_t redundancy,
+ const std::vector<MergeMetaData>& nodes,
+ std::vector<MergeMetaData>& result);
+
+ static void addCopiesNotAlreadyAdded(
+ uint16_t redundancy,
+ const std::vector<MergeMetaData>& nodes,
+ std::vector<MergeMetaData>& result);
+
+ static uint16_t countTrusted(const std::vector<MergeMetaData>& nodes);
+
+ void deleteSourceOnlyNodes(const BucketDatabase::Entry& currentState,
+ DistributorMessageSender& sender);
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/removebucketoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/removebucketoperation.cpp
new file mode 100644
index 00000000000..0db8cda367a
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/removebucketoperation.cpp
@@ -0,0 +1,124 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/operations/idealstate/removebucketoperation.h>
+#include <vespa/storageapi/messageapi/storagereply.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storage/distributor/distributor.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+#include <vespa/vdslib/state/clusterstate.h>
+
+#include <vespa/log/log.h>
+
+LOG_SETUP(".distributor.operation.idealstate.remove");
+
+using namespace storage::distributor;
+
+bool
+RemoveBucketOperation::onStartInternal(DistributorMessageSender& sender)
+{
+ std::vector<std::pair<uint16_t, std::shared_ptr<api::DeleteBucketCommand> > > msgs;
+
+ BucketDatabase::Entry entry = _manager->getDistributorComponent().getBucketDatabase().get(getBucketId());
+
+ for (uint32_t i = 0; i < getNodes().size(); ++i) {
+ uint16_t node = getNodes()[i];
+ const BucketCopy* copy(entry->getNode(node));
+ if (!copy) {
+ LOG(debug, "Node %u was removed between scheduling remove "
+ "operation and starting it; not sending DeleteBucket to it",
+ node);
+ continue;
+ }
+ LOG(debug, "Sending DeleteBucket for %s to node %u",
+ getBucketId().toString().c_str(),
+ node);
+ std::shared_ptr<api::DeleteBucketCommand> msg(
+ new api::DeleteBucketCommand(getBucketId()));
+ setCommandMeta(*msg);
+ msg->setBucketInfo(copy->getBucketInfo());
+ msgs.push_back(std::make_pair(node, msg));
+ }
+
+ _ok = true;
+ if (!getNodes().empty()) {
+ _manager->getDistributorComponent().removeNodesFromDB(getBucketId(), getNodes());
+ for (uint32_t i = 0; i < msgs.size(); ++i) {
+ _tracker.queueCommand(msgs[i].second, msgs[i].first);
+ }
+ _tracker.flushQueue(sender);
+ }
+
+ return _tracker.finished();
+}
+
+
+void
+RemoveBucketOperation::onStart(DistributorMessageSender& sender)
+{
+ if (onStartInternal(sender)) {
+ done();
+ }
+}
+
+bool
+RemoveBucketOperation::onReceiveInternal(const std::shared_ptr<api::StorageReply> &msg)
+{
+ api::DeleteBucketReply* rep =
+ dynamic_cast<api::DeleteBucketReply*>(msg.get());
+
+ uint16_t node = _tracker.handleReply(*rep);
+
+ LOG(debug, "Got DeleteBucket reply for %s from node %u",
+ getBucketId().toString().c_str(),
+ node);
+
+ if (rep->getResult().failed()) {
+ if (rep->getResult().getResult() == api::ReturnCode::REJECTED
+ && rep->getBucketInfo().valid())
+ {
+ LOG(debug, "Got DeleteBucket rejection reply from storage for "
+ "%s on node %u: %s. Reinserting node into bucket db with %s",
+ getBucketId().toString().c_str(),
+ node,
+ rep->getResult().getMessage().c_str(),
+ rep->getBucketInfo().toString().c_str());
+
+ _manager->getDistributorComponent().updateBucketDatabase(
+ getBucketId(),
+ BucketCopy(_manager->getDistributorComponent().getUniqueTimestamp(),
+ node,
+ rep->getBucketInfo()),
+ DatabaseUpdate::CREATE_IF_NONEXISTING);
+ } else {
+ LOG(info,
+ "Remove operation on bucket %s failed. This distributor "
+ "has already removed the bucket from the bucket database, "
+ "so it is not possible to retry this operation. Failure code: %s",
+ getBucketId().toString().c_str(),
+ rep->getResult().toString().c_str());
+ }
+
+ _ok = false;
+ }
+
+ return _tracker.finished();
+}
+
+
+void
+RemoveBucketOperation::onReceive(DistributorMessageSender&,
+ const std::shared_ptr<api::StorageReply> &msg)
+{
+ if (onReceiveInternal(msg)) {
+ done();
+ }
+}
+
+bool
+RemoveBucketOperation::shouldBlockThisOperation(uint32_t, uint8_t) const
+{
+ return true;
+}
+
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/removebucketoperation.h b/storage/src/vespa/storage/distributor/operations/idealstate/removebucketoperation.h
new file mode 100644
index 00000000000..25eea35267b
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/removebucketoperation.h
@@ -0,0 +1,50 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/operations/idealstate/idealstateoperation.h>
+#include <vespa/storage/distributor/messagetracker.h>
+
+namespace storage
+{
+
+namespace distributor
+{
+
+class PendingMessageTracker;
+
+class RemoveBucketOperation : public IdealStateOperation
+{
+public:
+ RemoveBucketOperation(
+ const std::string& clusterName,
+ const BucketAndNodes& nodes)
+ : IdealStateOperation(nodes), _tracker(clusterName) {};
+
+ /**
+ Sends messages, returns true if we are done (sent nothing).
+ */
+ bool onStartInternal(DistributorMessageSender& sender);
+
+ /**
+ Sends messages, calls done() if we are done (sent nothing).
+ */
+ void onStart(DistributorMessageSender& sender);
+
+ bool onReceiveInternal(const std::shared_ptr<api::StorageReply> &);
+
+ void onReceive(DistributorMessageSender& sender, const std::shared_ptr<api::StorageReply> &);
+
+ const char* getName() const { return "remove"; };
+
+ Type getType() const { return DELETE_BUCKET; }
+
+ bool shouldBlockThisOperation(uint32_t, uint8_t) const;
+
+protected:
+ MessageTracker _tracker;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/setbucketstateoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/setbucketstateoperation.cpp
new file mode 100644
index 00000000000..9dc0947b219
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/setbucketstateoperation.cpp
@@ -0,0 +1,125 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storage/distributor/operations/idealstate/setbucketstateoperation.h>
+#include <vespa/storageapi/message/bucket.h>
+
+#include <vespa/log/log.h>
+
+LOG_SETUP(".distributor.operation.idealstate.setactive");
+
+namespace storage {
+
+namespace distributor {
+
+void
+SetBucketStateOperation::enqueueSetBucketStateCommand(uint16_t node, bool active) {
+ std::shared_ptr<api::SetBucketStateCommand> msg(
+ new api::SetBucketStateCommand(
+ getBucketId(),
+ active
+ ? api::SetBucketStateCommand::ACTIVE
+ : api::SetBucketStateCommand::INACTIVE));
+ LOG(debug, "Enqueuing %s for %s to node %u",
+ active ? "Activate" : "Deactivate",
+ getBucketId().toString().c_str(),
+ node);
+ setCommandMeta(*msg);
+ _tracker.queueCommand(msg, node);
+}
+
+bool
+SetBucketStateOperation::shouldBeActive(uint16_t node) const
+{
+ for (uint32_t i=0, n=_wantedActiveNodes.size(); i<n; ++i) {
+ if (_wantedActiveNodes[i] == node) return true;
+ }
+ return false;
+}
+
+void
+SetBucketStateOperation::activateNode(DistributorMessageSender& sender) {
+ for (uint32_t i=0; i<_wantedActiveNodes.size(); ++i) {
+ enqueueSetBucketStateCommand(_wantedActiveNodes[i], true);
+ }
+ _tracker.flushQueue(sender);
+ _ok = true;
+}
+
+
+void
+SetBucketStateOperation::deactivateNodes(DistributorMessageSender& sender) {
+ const std::vector<uint16_t>& nodes(getNodes());
+ for (size_t i = 0; i < nodes.size(); ++i) {
+ if (!shouldBeActive(nodes[i])) {
+ enqueueSetBucketStateCommand(nodes[i], false);
+ }
+ }
+ _tracker.flushQueue(sender);
+}
+
+void
+SetBucketStateOperation::onStart(DistributorMessageSender& sender)
+{
+ activateNode(sender);
+}
+
+void
+SetBucketStateOperation::onReceive(DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply>& reply)
+{
+ api::SetBucketStateReply& rep(
+ static_cast<api::SetBucketStateReply&>(*reply));
+
+ const uint16_t node = _tracker.handleReply(rep);
+ LOG(debug, "Got %s from node %u", reply->toString(true).c_str(), node);
+
+ bool deactivate = false;
+ if (reply->getResult().success()) {
+ BucketDatabase::Entry entry =
+ _manager->getDistributorComponent().getBucketDatabase().get(rep.getBucketId());
+
+ if (entry.valid()) {
+ const BucketCopy* copy = entry->getNode(node);
+
+ if (copy) {
+ api::BucketInfo bInfo = copy->getBucketInfo();
+
+ if (shouldBeActive(node)) {
+ bInfo.setActive(true);
+ deactivate = true;
+ } else {
+ bInfo.setActive(false);
+ }
+
+ entry->updateNode(
+ BucketCopy(_manager->getDistributorComponent().getUniqueTimestamp(),
+ node,
+ bInfo).setTrusted(copy->trusted()));
+
+ _manager->getDistributorComponent().getBucketDatabase().update(entry);
+ }
+ } else {
+ LOG(debug, "%s did not exist when receiving %s",
+ rep.getBucketId().toString().c_str(),
+ rep.toString(true).c_str());
+ }
+ } else {
+ LOG(debug, "Failed setting state for %s on node %u: %s",
+ rep.getBucketId().toString().c_str(),
+ node,
+ reply->getResult().toString().c_str());
+ _ok = false;
+ }
+ if (deactivate) {
+ deactivateNodes(sender);
+ }
+
+ if (_tracker.finished()) {
+ done();
+ }
+}
+
+} // namespace distributor
+
+} // namespace storage
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/setbucketstateoperation.h b/storage/src/vespa/storage/distributor/operations/idealstate/setbucketstateoperation.h
new file mode 100644
index 00000000000..32a5fcaf381
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/setbucketstateoperation.h
@@ -0,0 +1,47 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/operations/idealstate/idealstateoperation.h>
+#include <vespa/storage/distributor/messagetracker.h>
+
+namespace storage
+{
+
+namespace distributor
+{
+
+class SetBucketStateOperation : public IdealStateOperation
+{
+public:
+ SetBucketStateOperation(const std::string& clusterName,
+ const BucketAndNodes& nodes,
+ const std::vector<uint16_t>& wantedActiveNodes)
+ : IdealStateOperation(nodes),
+ _tracker(clusterName),
+ _wantedActiveNodes(wantedActiveNodes)
+ {
+ }
+
+ void onStart(DistributorMessageSender&);
+
+ void onReceive(DistributorMessageSender&, const std::shared_ptr<api::StorageReply>&);
+
+ const char* getName() const { return "setbucketstate"; }
+
+ virtual Type getType() const { return SET_BUCKET_STATE; }
+
+protected:
+ MessageTracker _tracker;
+ std::vector<uint16_t> _wantedActiveNodes;
+
+private:
+ void enqueueSetBucketStateCommand(uint16_t node, bool active);
+ void activateNode(DistributorMessageSender& sender);
+ void deactivateNodes(DistributorMessageSender& sender);
+ bool shouldBeActive(uint16_t node) const;
+};
+
+} // namespace distributor
+
+} // namespace storage
+
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/splitoperation.cpp b/storage/src/vespa/storage/distributor/operations/idealstate/splitoperation.cpp
new file mode 100644
index 00000000000..423039f2b0c
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/splitoperation.cpp
@@ -0,0 +1,179 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/operations/idealstate/splitoperation.h>
+#include <vespa/storageapi/messageapi/storagereply.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/storage/distributor/idealstatemanager.h>
+#include <vespa/storage/common/bucketoperationlogger.h>
+#include <algorithm>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+
+#include <vespa/log/log.h>
+
+LOG_SETUP(".distributor.operation.idealstate.split");
+
+using namespace storage::distributor;
+
+void
+SplitOperation::onStart(DistributorMessageSender& sender)
+{
+ _ok = false;
+
+ BucketDatabase::Entry entry = _manager->getDistributorComponent()
+ .getBucketDatabase().get(getBucketId());
+
+ for (uint32_t i = 0; i < entry->getNodeCount(); i++) {
+ std::shared_ptr<api::SplitBucketCommand> msg(
+ new api::SplitBucketCommand(getBucketId()));
+ msg->setMaxSplitBits(_maxBits);
+ msg->setMinDocCount(_splitCount);
+ msg->setMinByteSize(_splitSize);
+ msg->setTimeout(INT_MAX);
+ setCommandMeta(*msg);
+ _tracker.queueCommand(msg, entry->getNodeRef(i).getNode());
+ _ok = true;
+ }
+
+ if (!_ok) {
+ LOGBP(debug, "Unable to split bucket %s, since no copies are available (some in maintenance?)", getBucketId().toString().c_str());
+ done();
+ } else {
+ _tracker.flushQueue(sender);
+ }
+}
+
+void
+SplitOperation::onReceive(DistributorMessageSender&, const api::StorageReply::SP& msg)
+{
+ api::SplitBucketReply& rep = static_cast<api::SplitBucketReply&>(*msg);
+
+ uint16_t node = _tracker.handleReply(rep);
+
+ if (node == 0xffff) {
+ LOG(debug, "Ignored reply since node was max uint16_t for unknown "
+ "reasons");
+ return;
+ }
+
+ std::ostringstream ost;
+
+ if (rep.getResult().success()) {
+ BucketDatabase::Entry entry =
+ _manager->getDistributorComponent().getBucketDatabase().get(rep.getBucketId());
+
+ if (entry.valid()) {
+ entry->removeNode(node);
+
+ if (entry->getNodeCount() == 0) {
+ LOG(spam, "Removing split bucket %s",
+ getBucketId().toString().c_str());
+ _manager->getDistributorComponent().getBucketDatabase().remove(rep.getBucketId());
+ } else {
+ _manager->getDistributorComponent().getBucketDatabase().update(entry);
+ }
+
+ ost << getBucketId() << " => ";
+ }
+
+ // Add new buckets.
+ for (uint32_t i = 0; i < rep.getSplitInfo().size(); i++) {
+ const api::SplitBucketReply::Entry& sinfo = rep.getSplitInfo()[i];
+
+ if (!sinfo.second.valid()) {
+ LOG(error, "Received invalid bucket %s from node %d as reply "
+ "to split bucket",
+ sinfo.first.toString().c_str(), node);
+ }
+
+ ost << sinfo.first << ",";
+
+ BucketCopy copy(
+ BucketCopy(_manager->getDistributorComponent().getUniqueTimestamp(),
+ node,
+ sinfo.second));
+
+ // Must reset trusted since otherwise trustedness of inconsistent
+ // copies would be arbitrarily determined by which copy managed
+ // to finish its split first.
+ _manager->getDistributorComponent().updateBucketDatabase(
+ sinfo.first, copy,
+ (DatabaseUpdate::CREATE_IF_NONEXISTING
+ | DatabaseUpdate::RESET_TRUSTED));
+
+ LOG_BUCKET_OPERATION_NO_LOCK(
+ sinfo.first, vespalib::make_vespa_string(
+ "Split from bucket %s: %s",
+ getBucketId().toString().c_str(),
+ copy.toString().c_str()));
+ }
+ } else if (
+ rep.getResult().getResult() == api::ReturnCode::BUCKET_NOT_FOUND
+ && _manager->getDistributorComponent().getBucketDatabase().get(rep.getBucketId())->getNode(node) != 0)
+ {
+ _manager->getDistributorComponent().recheckBucketInfo(node, getBucketId());
+ LOGBP(debug, "Split failed for %s: bucket not found. Storage and "
+ "distributor bucket databases might be out of sync: %s",
+ getBucketId().toString().c_str(),
+ rep.getResult().getMessage().c_str());
+ _ok = false;
+ } else if (rep.getResult().isBusy()) {
+ LOG(debug, "Split failed for %s, node was busy. Will retry later",
+ getBucketId().toString().c_str());
+ _ok = false;
+ } else if (rep.getResult().isCriticalForMaintenance()) {
+ LOGBP(warning, "Split failed for %s: %s with error '%s'",
+ getBucketId().toString().c_str(), msg->toString().c_str(),
+ msg->getResult().toString().c_str());
+ _ok = false;
+ } else {
+ LOG(debug, "Split failed for %s with non-critical failure: %s",
+ getBucketId().toString().c_str(),
+ rep.getResult().toString().c_str());
+ }
+#ifdef ENABLE_BUCKET_OPERATION_LOGGING
+ if (_ok) {
+ LOG_BUCKET_OPERATION_NO_LOCK(
+ getBucketId(), vespalib::make_vespa_string(
+ "Split OK on node %d: %s. Finished: %s",
+ node, ost.str().c_str(),
+ _tracker.finished() ? "yes" : "no"));
+ } else {
+ LOG_BUCKET_OPERATION_NO_LOCK(
+ getBucketId(), vespalib::make_vespa_string(
+ "Split FAILED on node %d: %s. Finished: %s",
+ node, rep.getResult().toString().c_str(),
+ _tracker.finished() ? "yes" : "no"));
+ }
+#endif
+
+ if (_tracker.finished()) {
+ LOG(debug, "Split done on node %d: %s completed operation",
+ node, ost.str().c_str());
+ done();
+ } else {
+ LOG(debug, "Split done on node %d: %s still pending on other nodes",
+ node, ost.str().c_str());
+ }
+}
+
+bool
+SplitOperation::isBlocked(const PendingMessageTracker& tracker) const
+{
+ return checkBlockForAllNodes(getBucketId(), tracker);
+}
+
+bool
+SplitOperation::shouldBlockThisOperation(uint32_t msgType,
+ uint8_t pri) const
+{
+ if (msgType == api::MessageType::SPLITBUCKET_ID && _priority >= pri) {
+ return true;
+ }
+ if (msgType == api::MessageType::JOINBUCKETS_ID) {
+ return true;
+ }
+
+ return false;
+}
diff --git a/storage/src/vespa/storage/distributor/operations/idealstate/splitoperation.h b/storage/src/vespa/storage/distributor/operations/idealstate/splitoperation.h
new file mode 100644
index 00000000000..e9e44f81930
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/idealstate/splitoperation.h
@@ -0,0 +1,55 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/operations/idealstate/idealstateoperation.h>
+#include <vespa/storage/distributor/messagetracker.h>
+
+namespace storage
+{
+
+namespace distributor
+{
+
+class SplitOperation : public IdealStateOperation
+{
+public:
+ SplitOperation(const std::string& clusterName,
+ const BucketAndNodes& nodes,
+ uint32_t maxBits,
+ uint32_t splitCount,
+ uint32_t splitSize)
+ : IdealStateOperation(nodes),
+ _tracker(clusterName),
+ _maxBits(maxBits),
+ _splitCount(splitCount),
+ _splitSize(splitSize)
+ {}
+
+ void onStart(DistributorMessageSender& sender) override;
+
+ void onReceive(DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply> &) override;
+
+ const char* getName() const override { return "split"; };
+
+ Type getType() const override { return SPLIT_BUCKET; }
+
+ uint32_t getMaxSplitBits() const { return _maxBits; }
+
+ bool isBlocked(const PendingMessageTracker&) const override;
+
+ bool shouldBlockThisOperation(uint32_t, uint8_t) const override;
+
+protected:
+ MessageTracker _tracker;
+
+ uint32_t _maxBits;
+ uint32_t _splitCount;
+ uint32_t _splitSize;
+ std::vector<document::BucketId> _inconsistentBuckets;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/operations/operation.cpp b/storage/src/vespa/storage/distributor/operations/operation.cpp
new file mode 100644
index 00000000000..7a1929115d3
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/operation.cpp
@@ -0,0 +1,55 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+/* $Id$ */
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/common/distributorcomponent.h>
+#include <vespa/storage/distributor/operations/operation.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/storageapi/messageapi/storagecommand.h>
+#include <vespa/storageapi/messageapi/storagereply.h>
+
+#include <vespa/log/log.h>
+
+LOG_SETUP(".distributor.callback");
+
+namespace storage {
+
+namespace distributor {
+
+Operation::Operation()
+ : _startTime(0)
+{
+}
+
+Operation::~Operation()
+{
+}
+
+std::string
+Operation::getStatus() const
+{
+ return vespalib::make_string("%s (started %s)",
+ getName(), _startTime.toString().c_str());
+}
+
+void
+Operation::start(DistributorMessageSender& sender,
+ framework::MilliSecTime startTime)
+{
+ _startTime = startTime;
+ onStart(sender);
+}
+
+void
+Operation::copyMessageSettings(const api::StorageCommand& source, api::StorageCommand& target)
+{
+ target.getTrace().setLevel(source.getTrace().getLevel());
+ target.setTimeout(source.getTimeout());
+ target.setPriority(source.getPriority());
+ target.setLoadType(source.getLoadType());
+}
+
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/operations/operation.h b/storage/src/vespa/storage/distributor/operations/operation.h
new file mode 100644
index 00000000000..50917334cfc
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operations/operation.h
@@ -0,0 +1,98 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storageapi/messageapi/returncode.h>
+#include <vespa/vdslib/state/nodetype.h>
+#include <vespa/storage/distributor/distributormessagesender.h>
+#include <vespa/storageframework/storageframework.h>
+
+namespace storage
+{
+
+namespace api {
+class StorageMessage;
+class StorageReply;
+}
+
+class StorageComponent;
+
+namespace distributor {
+
+class PendingMessageTracker;
+
+class Operation
+{
+public:
+ typedef std::shared_ptr<Operation> SP;
+
+ Operation();
+
+ virtual ~Operation();
+
+ /**
+ Tell the callback that storage is shutting down. Reply to any pending
+ stuff.
+ */
+ virtual void onClose(DistributorMessageSender&) = 0;
+
+ /**
+ When a reply has been received, the storagelink will call receive()
+ on the owner of the message that was replied to.
+ */
+ virtual void receive(DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply> & msg)
+ {
+ onReceive(sender, msg);
+ }
+
+ virtual const char* getName() const = 0;
+
+ virtual std::string getStatus() const;
+
+ virtual std::string toString() const {
+ return std::string(getName());
+ }
+
+ /**
+ Starts the callback, sending any messages etc. Sets _startTime to current time
+ */
+ virtual void start(DistributorMessageSender& sender, framework::MilliSecTime startTime);
+
+ /**
+ * Returns true if we are blocked to start this operation given
+ * the pending messages.
+ */
+ virtual bool isBlocked(const PendingMessageTracker&) const {
+ return false;
+ }
+
+ /**
+ Returns the timestamp on which the first message was sent from this callback.
+ */
+ framework::MilliSecTime getStartTime() const { return _startTime; }
+
+ /**
+ Transfers message settings such as priority, timeout, etc. from one message to another.
+ */
+ static void copyMessageSettings(const api::StorageCommand& source,
+ api::StorageCommand& target);
+
+private:
+ /**
+ Implementation of start for the callback
+ */
+ virtual void onStart(DistributorMessageSender& sender) = 0;
+
+ virtual void onReceive(DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply> & msg) = 0;
+
+protected:
+ framework::MilliSecTime _startTime;
+};
+
+}
+
+}
+
+
+
diff --git a/storage/src/vespa/storage/distributor/operationstarter.h b/storage/src/vespa/storage/distributor/operationstarter.h
new file mode 100644
index 00000000000..bc556128247
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operationstarter.h
@@ -0,0 +1,23 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+
+namespace storage {
+namespace distributor {
+
+class Operation;
+
+class OperationStarter
+{
+public:
+ typedef uint8_t Priority;
+
+ virtual ~OperationStarter() {}
+
+ virtual bool start(const std::shared_ptr<Operation>& operation,
+ Priority priority) = 0;
+};
+
+}
+}
+
diff --git a/storage/src/vespa/storage/distributor/operationtargetresolver.cpp b/storage/src/vespa/storage/distributor/operationtargetresolver.cpp
new file mode 100644
index 00000000000..05255eefcb6
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operationtargetresolver.cpp
@@ -0,0 +1,237 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <map>
+#include <queue>
+#include <vespa/storage/distributor/operationtargetresolver.h>
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/vespalib/stllike/hash_set.h>
+
+namespace storage {
+namespace distributor {
+
+using document::BucketId;
+
+namespace {
+
+struct BucketState
+{
+ BucketId bid;
+ BucketCopy copy;
+
+ BucketState(const BucketId& id, const BucketCopy& cpy)
+ : bid(id), copy(cpy)
+ {
+ }
+};
+
+enum Priority
+{
+ NONE_FOUND,
+ NOT_TRUSTED_OR_IDEAL,
+ IDEAL_STATE_NOT_TRUSTED,
+ TRUSTED,
+};
+
+class NodePriority
+{
+public:
+ NodePriority(const BucketState* state,
+ Priority priority,
+ uint16_t node)
+ : _state(state),
+ _priority(priority),
+ _node(node)
+ {
+ }
+
+ bool operator<(const NodePriority& other) const {
+ return _priority < other._priority;
+ }
+
+ const BucketState* getState() const { return _state; }
+ Priority getPriority() const { return _priority; }
+ uint16_t getNode() const { return _node; }
+
+ bool isValid() const { return _state != 0; }
+
+ void reset(const BucketState* state, Priority priority) {
+ _state = state;
+ _priority = priority;
+ }
+
+ bool worseThan(Priority otherPri) const {
+ return _priority < otherPri;
+ }
+ bool equalTo(Priority otherPri) const {
+ return _priority == otherPri;
+ }
+ /**
+ * Returns true iff the current best bucket has fewer used bits
+ * than the parameter bucket. Requires current best bucket to be
+ * set already.
+ */
+ bool lessSplitThan(const BucketId& bid) const {
+ return _state->bid.getUsedBits() < bid.getUsedBits();
+ }
+private:
+ const BucketState* _state;
+ Priority _priority;
+ uint16_t _node;
+};
+
+}
+
+document::BucketId
+OperationTargetResolver::bestBucketToCreate(
+ const document::BucketId& target) const
+{
+ return _manager.getAppropriateBucket(target);
+}
+
+document::BucketId
+OperationTargetResolver::getHighestSplitBucketAcrossNodes(
+ const document::BucketId& target,
+ const std::vector<BucketDatabase::Entry>& entries) const
+{
+ document::BucketId highest;
+ if (entries.empty()) {
+ highest = bestBucketToCreate(target);
+ }
+ for (uint32_t i = 0; i < entries.size(); ++i) {
+ const BucketDatabase::Entry& entry(entries[i]);
+ if (entry.getBucketId().getUsedBits() > highest.getUsedBits()) {
+ highest = entry.getBucketId();
+ }
+ }
+ return highest;
+}
+
+namespace {
+
+NodePriority
+findBestExistingCopyOnNode(uint16_t node,
+ const std::vector<BucketState>& states,
+ const std::vector<uint16_t>& idealNodes)
+{
+ NodePriority best(NULL, NONE_FOUND, node);
+
+ for (size_t j = 0; j < states.size(); ++j) {
+ if (states[j].copy.trusted()) {
+ // We always prefer to send to trusted copies.
+ if (best.worseThan(TRUSTED)
+ || best.lessSplitThan(states[j].bid))
+ {
+ best.reset(&states[j], TRUSTED);
+ }
+ } else if (std::find(idealNodes.begin(), idealNodes.end(), node)
+ != idealNodes.end())
+ {
+ // Node is in ideal state for the highest split bucket.
+ if (best.worseThan(IDEAL_STATE_NOT_TRUSTED)
+ || (best.equalTo(IDEAL_STATE_NOT_TRUSTED)
+ && best.lessSplitThan(states[j].bid)))
+ {
+ best.reset(&states[j], IDEAL_STATE_NOT_TRUSTED);
+ }
+ } else {
+ // Not trusted or in ideal state for highest split; just add so
+ // we have a "best effort" bucket on this node at all.
+ if (best.worseThan(NOT_TRUSTED_OR_IDEAL)
+ || (best.equalTo(NOT_TRUSTED_OR_IDEAL)
+ && best.lessSplitThan(states[j].bid)))
+ {
+ best.reset(&states[j], NOT_TRUSTED_OR_IDEAL);
+ }
+ }
+ }
+ return best;
+}
+
+}
+
+void
+OperationTargetResolver::getTargets(const BucketId& bid,
+ std::vector<BucketDatabase::Entry>& entries,
+ std::vector<OperationTarget>& sendToExisting,
+ std::vector<OperationTarget>& createNew)
+{
+ entries.clear();
+ _manager.getBucketDatabase().getParents(bid, entries);
+
+ /*
+ * 1. Find all buckets on all nodes and the highest split bucket.
+ * 2. Add buckets on nodes where the bucket is either trusted (may or may
+ * not be the highest split on the node) OR if the node is in the
+ * ideal state for the highest split bucket.
+ * 3. If redundancy is still not satisfied, create new buckets according to
+ * ideal state.
+ */
+
+ typedef std::map<uint16_t, std::vector<BucketState> > NodeBucketMap;
+ NodeBucketMap bucketsPerNode;
+ for (size_t i = 0; i < entries.size(); ++i) {
+ const BucketDatabase::Entry& entry(entries[i]);
+ const BucketInfo& info(entry.getBucketInfo());
+
+ for (uint32_t j = 0; j < info.getNodeCount(); ++j) {
+ const BucketCopy& copy(info.getNodeRef(j));
+ bucketsPerNode[copy.getNode()].push_back(
+ BucketState(entry.getBucketId(), copy));
+ }
+ }
+
+ document::BucketId highestSplitIdAcrossNodes(
+ getHighestSplitBucketAcrossNodes(bid, entries));
+
+ std::vector<uint16_t> idealNodes(
+ _manager.getDistribution().getIdealStorageNodes(
+ _manager.getClusterState(), highestSplitIdAcrossNodes, "ui"));
+
+ std::priority_queue<NodePriority> candidates;
+
+ // Create prioritized list of node+bucket pairs.
+ for (NodeBucketMap::iterator it(bucketsPerNode.begin()),
+ e(bucketsPerNode.end()); it != e; ++it)
+ {
+ const uint16_t node(it->first);
+ const std::vector<BucketState>& states(it->second);
+
+ NodePriority best(findBestExistingCopyOnNode(node, states, idealNodes));
+ if (best.isValid()) {
+ candidates.push(best);
+ }
+ }
+ vespalib::hash_set<uint16_t> existingNodes(candidates.size() * 2);
+ while (!candidates.empty()
+ && sendToExisting.size() < idealNodes.size())
+ {
+ const NodePriority& np(candidates.top());
+ sendToExisting.push_back(OperationTarget(np.getState()->bid, np.getNode()));
+ existingNodes.insert(np.getNode());
+ candidates.pop();
+ }
+
+ // If the wanted redundancy has not been satisfied by the existing copies,
+ // we have to create additional ones. For this, we create the highest split
+ // bucket on any ideal nodes that don't already have a copy.
+ for (size_t i = 0; i < idealNodes.size(); ++i) {
+ if (sendToExisting.size() + createNew.size() >= idealNodes.size()) {
+ break;
+ }
+ const uint16_t ideal = idealNodes[i];
+ if (existingNodes.find(ideal) != existingNodes.end()) {
+ continue;
+ }
+ createNew.push_back(OperationTarget(highestSplitIdAcrossNodes, ideal));
+ }
+ assert(sendToExisting.size() + createNew.size() == idealNodes.size());
+ // Sort based on bucket and nodes to make operation ordering consistent
+ std::sort(sendToExisting.begin(), sendToExisting.end());
+ std::sort(createNew.begin(), createNew.end());
+}
+
+}
+}
+
diff --git a/storage/src/vespa/storage/distributor/operationtargetresolver.h b/storage/src/vespa/storage/distributor/operationtargetresolver.h
new file mode 100644
index 00000000000..87757ec3227
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operationtargetresolver.h
@@ -0,0 +1,75 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \brief Interface to deduct what bucket copies to send load to.
+ *
+ * - Must handle inconsistent split buckets.
+ */
+#pragma once
+
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/vdslib/container/smallvector.h>
+#include <vespa/vdslib/state/node.h>
+#include <vespa/vespalib/util/printable.h>
+
+namespace storage {
+namespace distributor {
+
+class OperationTarget : public vespalib::AsciiPrintable
+{
+ document::BucketId _bucket;
+ lib::Node _node;
+ bool _newCopy;
+
+public:
+ OperationTarget() : _newCopy(true) {}
+ OperationTarget(const document::BucketId& id, const lib::Node& node, bool newCopy)
+ : _bucket(id), _node(node), _newCopy(newCopy) {}
+
+ const document::BucketId& getBucketId() const { return _bucket; }
+ const lib::Node& getNode() const { return _node; }
+ bool isNewCopy() const { return _newCopy; }
+
+ bool operator==(const OperationTarget& o) const {
+ return (_bucket == o._bucket && _node == o._node && _newCopy == o._newCopy);
+ }
+ bool operator!=(const OperationTarget& o) const {
+ return !(operator==(o));
+ }
+
+ void print(vespalib::asciistream& out, const PrintProperties&) const {
+ out << "OperationTarget(" << _bucket << ", " << _node
+ << (_newCopy ? ", new copy" : ", existing copy") << ")";
+ }
+};
+
+class OperationTargetList : public lib::SmallVector<OperationTarget> {
+public:
+ bool hasAnyNewCopies() const {
+ for (size_t i=0; i<size(); ++i) {
+ if (operator[](i).isNewCopy()) return true;
+ }
+ return false;
+ }
+ bool hasAnyExistingCopies() const {
+ for (size_t i=0; i<size(); ++i) {
+ if (!operator[](i).isNewCopy()) return true;
+ }
+ return false;
+ }
+};
+
+class OperationTargetResolver {
+public:
+ virtual ~OperationTargetResolver() {}
+
+ // Sadly all operations but put currently implement this by themselves.
+ enum OperationType {
+ PUT
+ };
+
+ virtual OperationTargetList getTargets(OperationType type,
+ const document::BucketId& id) = 0;
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/operationtargetresolverimpl.cpp b/storage/src/vespa/storage/distributor/operationtargetresolverimpl.cpp
new file mode 100644
index 00000000000..0ea85c47f75
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operationtargetresolverimpl.cpp
@@ -0,0 +1,193 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/storage/distributor/operationtargetresolverimpl.h>
+
+namespace storage {
+namespace distributor {
+
+BucketInstance::BucketInstance(
+ const document::BucketId& id, const api::BucketInfo& info,
+ lib::Node node, uint16_t idealLocationPriority, bool trusted, bool exist)
+ : _bucket(id), _info(info), _node(node),
+ _idealLocationPriority(idealLocationPriority), _trusted(trusted), _exist(exist)
+{
+}
+
+void
+BucketInstance::print(vespalib::asciistream& out, const PrintProperties&) const
+{
+ std::string infoString(_info.toString());
+ infoString = infoString.substr(10, infoString.size() - 10);
+
+ std::ostringstream ost;
+ ost << std::hex << _bucket.getId();
+ out << "(" << ost.str() << ", "
+ << infoString << ", node " << _node.getIndex()
+ << ", ideal " << _idealLocationPriority
+ << (_trusted ? ", trusted" : "")
+ << (_exist ? "" : ", new copy")
+ << ")";
+}
+
+bool
+BucketInstanceList::contains(lib::Node node) const {
+ for (uint32_t i=0; i<_instances.size(); ++i) {
+ if (_instances[i]._node == node) return true;
+ }
+ return false;
+}
+
+void
+BucketInstanceList::add(BucketDatabase::Entry& e,
+ const lib::IdealNodeList& idealState)
+{
+ for (uint32_t i = 0; i < e.getBucketInfo().getNodeCount(); ++i) {
+ const BucketCopy& copy(e.getBucketInfo().getNodeRef(i));
+ lib::Node node(lib::NodeType::STORAGE, copy.getNode());
+ _instances.push_back(BucketInstance(
+ e.getBucketId(), copy.getBucketInfo(), node,
+ idealState.indexOf(node), copy.trusted()));
+ }
+}
+
+void
+BucketInstanceList::populate(const document::BucketId& specificId, BucketDatabase& db,
+ const lib::IdealNodeCalculator& idealNodeCalc)
+{
+ std::vector<BucketDatabase::Entry> entries;
+ db.getParents(specificId, entries);
+ for (uint32_t i=0; i<entries.size(); ++i) {
+ lib::IdealNodeList idealNodes(idealNodeCalc.getIdealStorageNodes(
+ entries[i].getBucketId(),
+ lib::IdealNodeCalculator::UpInitMaintenance));
+ add(entries[i], idealNodes);
+ }
+}
+
+void
+BucketInstanceList::removeNodeDuplicates()
+{
+ // Normally few entries in list. Probably heaper to just go through entries
+ // to detect whether it pre exist rather than creating a set or similar.
+ BucketInstanceList other;
+ for (uint32_t i=0; i<_instances.size(); ++i) {
+ BucketInstance& instance(_instances[i]);
+ if (!other.contains(instance._node)) {
+ other.add(instance);
+ }
+ }
+ _instances.swap(other._instances);
+}
+
+void
+BucketInstanceList::limitToRedundancyCopies(uint16_t redundancy)
+{
+ while (_instances.size() > redundancy) _instances.pop_back();
+}
+
+document::BucketId
+BucketInstanceList::leastSpecificLeafBucketInSubtree(
+ const document::BucketId& candidateId,
+ const document::BucketId& mostSpecificId,
+ const BucketDatabase& db) const
+{
+ assert(candidateId.contains(mostSpecificId));
+ document::BucketId treeNode = candidateId;
+ // treeNode may reach at most 58 bits since buckets at 58 bits by definition
+ // cannot have any children.
+ while (db.childCount(treeNode) != 0) {
+ treeNode = document::BucketId(treeNode.getUsedBits() + 1,
+ mostSpecificId.getRawId());
+ }
+ assert(treeNode.contains(mostSpecificId));
+ return treeNode;
+}
+
+void
+BucketInstanceList::extendToEnoughCopies(
+ const BucketDatabase& db,
+ const document::BucketId& targetIfNonPreExisting,
+ const document::BucketId& mostSpecificId,
+ const lib::IdealNodeCalculator& idealNodeCalc)
+{
+ document::BucketId newTarget(_instances.empty() ? targetIfNonPreExisting
+ : _instances[0]._bucket);
+ newTarget = leastSpecificLeafBucketInSubtree(newTarget, mostSpecificId, db);
+
+ lib::IdealNodeList idealNodes(idealNodeCalc.getIdealStorageNodes(
+ newTarget, lib::IdealNodeCalculator::UpInit));
+ for (uint32_t i=0; i<idealNodes.size(); ++i) {
+ if (!contains(idealNodes[i])) {
+ _instances.push_back(BucketInstance(
+ newTarget, api::BucketInfo(), idealNodes[i],
+ i, false, false));
+ }
+ }
+}
+
+OperationTargetList
+BucketInstanceList::createTargets()
+{
+ OperationTargetList result;
+ for (uint32_t i=0; i<_instances.size(); ++i) {
+ BucketInstance& bi(_instances[i]);
+ result.push_back(OperationTarget(bi._bucket, bi._node, !bi._exist));
+ }
+ return result;
+}
+
+namespace {
+
+/**
+ * - Trusted copies should be preferred over non-trusted copies for the same bucket.
+ * - Buckets in ideal locations should be preferred over non-ideal locations for the
+ * same bucket across several nodes.
+ * - Buckets with data should be preferred over buckets without data.
+ *
+ * - Right after split/join, bucket is often not in ideal location, but should be
+ * preferred instead of source anyhow.
+ */
+struct InstanceOrder {
+ bool operator()(const BucketInstance& a, const BucketInstance& b) {
+ if (a._bucket == b._bucket) {
+ // Trusted only makes sense within same bucket
+ // Prefer trusted buckets over non-trusted ones.
+ if (a._trusted != b._trusted) return a._trusted;
+ if (a._idealLocationPriority != b._idealLocationPriority) {
+ return a._idealLocationPriority < b._idealLocationPriority;
+ }
+ } else {
+ if ((a._info.getMetaCount() == 0) ^ (b._info.getMetaCount() == 0)) {
+ return (a._info.getMetaCount() == 0);
+ }
+ return (a._bucket.getUsedBits() > b._bucket.getUsedBits());
+ }
+ return false;
+ }
+};
+
+} // anonymous
+
+BucketInstanceList
+OperationTargetResolverImpl::getAllInstances(OperationType type,
+ const document::BucketId& id)
+{
+ BucketInstanceList instances;
+ if (type == PUT) {
+ instances.populate(id, _bucketDatabase, _idealNodeCalculator);
+ instances.sort(InstanceOrder());
+ instances.removeNodeDuplicates();
+ instances.extendToEnoughCopies(
+ _bucketDatabase,
+ _bucketDatabase.getAppropriateBucket(_minUsedBucketBits, id),
+ id,
+ _idealNodeCalculator);
+ } else {
+ throw vespalib::IllegalArgumentException(
+ "Unsupported operation type given", VESPA_STRLOC);
+ }
+ return instances;
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/operationtargetresolverimpl.h b/storage/src/vespa/storage/distributor/operationtargetresolverimpl.h
new file mode 100644
index 00000000000..08d79c501e4
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/operationtargetresolverimpl.h
@@ -0,0 +1,115 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/storage/distributor/operationtargetresolver.h>
+#include <vespa/vdslib/distribution/idealnodecalculator.h>
+
+namespace storage {
+namespace distributor {
+
+struct BucketInstance : public vespalib::AsciiPrintable {
+ document::BucketId _bucket;
+ api::BucketInfo _info;
+ lib::Node _node;
+ uint16_t _idealLocationPriority;
+ bool _trusted;
+ bool _exist;
+
+ BucketInstance() : _idealLocationPriority(0xffff),
+ _trusted(false), _exist(false) {}
+ BucketInstance(const document::BucketId& id, const api::BucketInfo& info,
+ lib::Node node, uint16_t idealLocationPriority, bool trusted,
+ bool exist = true);
+
+ void print(vespalib::asciistream& out, const PrintProperties&) const;
+};
+
+class BucketInstanceList : public vespalib::AsciiPrintable {
+ lib::SmallVector<BucketInstance> _instances;
+
+ /**
+ * Resolve and return the least specific bucket in the subtree of (and
+ * including) candidateId that is a leaf node in the tree. I.e. a bucket
+ * whose insertion will not cause an inconsistency with other leaf buckets
+ * in the tree at the minimum possible depth at or below candidateId.
+ *
+ * Preconditions:
+ * candidateId.contains(mostSpecificId)
+ * Postconditions:
+ * <return value>.contains(mostSpecificId)
+ */
+ document::BucketId leastSpecificLeafBucketInSubtree(
+ const document::BucketId& candidateId,
+ const document::BucketId& mostSpecificId,
+ const BucketDatabase& db) const;
+
+public:
+ void add(const BucketInstance& instance) { _instances.push_back(instance); }
+ bool contains(lib::Node node) const;
+ void removeNodeDuplicates();
+ void limitToRedundancyCopies(uint16_t redundancy);
+ /**
+ * Preconditions:
+ * targetIfNonPreExisting.contains(mostSpecificId)
+ * Postconditions:
+ * _instances.size() >= configured redundancy level, unless insufficient
+ * number of nodes are available
+ */
+ void extendToEnoughCopies(const BucketDatabase& db,
+ const document::BucketId& targetIfNonPreExisting,
+ const document::BucketId& mostSpecificId,
+ const lib::IdealNodeCalculator& idealNodeCalc);
+
+ void populate(const document::BucketId&, BucketDatabase&,
+ const lib::IdealNodeCalculator&);
+ void add(BucketDatabase::Entry& e, const lib::IdealNodeList& idealState);
+
+ template <typename Order>
+ void sort(const Order& order) {
+ std::sort(_instances.begin(), _instances.end(), order);
+ }
+
+ OperationTargetList createTargets();
+
+ void print(vespalib::asciistream& out, const PrintProperties& p) const {
+ _instances.print(out, p);
+ }
+};
+
+class OperationTargetResolverImpl : public OperationTargetResolver {
+ BucketDatabase& _bucketDatabase;
+ const lib::IdealNodeCalculator& _idealNodeCalculator;
+ uint32_t _minUsedBucketBits;
+ uint16_t _redundancy;
+
+public:
+ OperationTargetResolverImpl(BucketDatabase& bucketDatabase,
+ const lib::IdealNodeCalculator& idealNodeCalc,
+ uint32_t minUsedBucketBits,
+ uint16_t redundancy)
+ : _bucketDatabase(bucketDatabase),
+ _idealNodeCalculator(idealNodeCalc),
+ _minUsedBucketBits(minUsedBucketBits),
+ _redundancy(redundancy)
+ {
+ }
+
+ BucketInstanceList getAllInstances(OperationType type,
+ const document::BucketId& id);
+ BucketInstanceList getInstances(OperationType type, const document::BucketId& id)
+ {
+ BucketInstanceList result(getAllInstances(type, id));
+ result.limitToRedundancyCopies(_redundancy);
+ return result;
+ }
+
+ virtual OperationTargetList getTargets(OperationType type,
+ const document::BucketId& id)
+ { return getInstances(type, id).createTargets(); }
+};
+
+} // distributor
+} // storage
+
diff --git a/storage/src/vespa/storage/distributor/pendingclusterstate.cpp b/storage/src/vespa/storage/distributor/pendingclusterstate.cpp
new file mode 100644
index 00000000000..e064295e540
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/pendingclusterstate.cpp
@@ -0,0 +1,600 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/pendingclusterstate.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/distributor/bucketdbupdater.h>
+#include <vespa/storage/distributor/distributormessagesender.h>
+#include <vespa/storage/storageutil/utils.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+#include <vespa/storage/common/bucketoperationlogger.h>
+
+LOG_SETUP(".pendingclusterstate");
+
+namespace storage {
+
+namespace distributor {
+
+using lib::Node;
+using lib::NodeType;
+using lib::NodeState;
+
+PendingClusterState::PendingClusterState(
+ const framework::Clock& clock,
+ const ClusterInformation::CSP& clusterInfo,
+ DistributorMessageSender& sender,
+ const std::shared_ptr<api::SetSystemStateCommand>& newStateCmd,
+ const std::unordered_set<uint16_t>& outdatedNodes,
+ api::Timestamp creationTimestamp)
+ : _cmd(newStateCmd),
+ _requestedNodes(
+ newStateCmd->getSystemState().getNodeCount(lib::NodeType::STORAGE)),
+ _outdatedNodes(
+ newStateCmd->getSystemState().getNodeCount(lib::NodeType::STORAGE)),
+ _iter(0),
+ _prevClusterState(clusterInfo->getClusterState()),
+ _newClusterState(newStateCmd->getSystemState()),
+ _clock(clock),
+ _clusterInfo(clusterInfo),
+ _creationTimestamp(creationTimestamp),
+ _sender(sender)
+{
+ logConstructionInformation();
+ if (distributorChanged(_prevClusterState, _newClusterState)) {
+ markAllAvailableNodesAsRequiringRequest();
+ } else {
+ updateSetOfNodesThatAreOutdated();
+ addAdditionalNodesToOutdatedSet(outdatedNodes);
+ }
+ if (shouldRequestBucketInfo()) {
+ requestNodes();
+ }
+}
+
+PendingClusterState::PendingClusterState(
+ const framework::Clock& clock,
+ const ClusterInformation::CSP& clusterInfo,
+ DistributorMessageSender& sender,
+ api::Timestamp creationTimestamp)
+ : _requestedNodes(clusterInfo->getStorageNodeCount()),
+ _outdatedNodes(clusterInfo->getStorageNodeCount()),
+ _iter(0),
+ _prevClusterState(clusterInfo->getClusterState()),
+ _newClusterState(clusterInfo->getClusterState()),
+ _clock(clock),
+ _clusterInfo(clusterInfo),
+ _creationTimestamp(creationTimestamp),
+ _sender(sender)
+{
+ logConstructionInformation();
+ markAllAvailableNodesAsRequiringRequest();
+ if (shouldRequestBucketInfo()) {
+ requestNodes();
+ }
+}
+
+void
+PendingClusterState::logConstructionInformation() const
+{
+ LOG(debug,
+ "New PendingClusterState constructed with previous cluster "
+ "state '%s', new cluster state '%s', distribution config "
+ "hash: '%s'",
+ _prevClusterState.toString().c_str(),
+ _newClusterState.toString().c_str(),
+ _clusterInfo->getDistribution().getNodeGraph().getDistributionConfigHash().c_str());
+}
+
+bool
+PendingClusterState::storageNodeUpInNewState(uint16_t node) const
+{
+ return _newClusterState.getNodeState(Node(NodeType::STORAGE, node))
+ .getState().oneOf(_clusterInfo->getStorageUpStates());
+}
+
+void
+PendingClusterState::markAllAvailableNodesAsRequiringRequest()
+{
+ const uint16_t nodeCount(newStateStorageNodeCount());
+ for (uint16_t i = 0; i < nodeCount; ++i) {
+ if (storageNodeUpInNewState(i)) {
+ _outdatedNodes.insert(i);
+ }
+ }
+}
+
+void
+PendingClusterState::addAdditionalNodesToOutdatedSet(
+ const std::unordered_set<uint16_t>& nodes)
+{
+ const uint16_t nodeCount(newStateStorageNodeCount());
+ for (uint16_t node : nodes) {
+ if (node < nodeCount) {
+ _outdatedNodes.insert(node);
+ }
+ }
+}
+
+std::unordered_set<uint16_t>
+PendingClusterState::getOutdatedNodeSet() const
+{
+ return _outdatedNodes;
+}
+
+uint16_t
+PendingClusterState::newStateStorageNodeCount() const
+{
+ return _newClusterState.getNodeCount(lib::NodeType::STORAGE);
+}
+
+bool
+PendingClusterState::shouldRequestBucketInfo() const
+{
+ if (clusterIsDown()) {
+ LOG(debug, "Received system state where the cluster is down");
+ return false;
+ }
+ if (iAmDown()) {
+ LOG(debug, "Received system state where our node is down");
+ return false;
+ }
+ return true;
+}
+
+bool
+PendingClusterState::clusterIsDown() const
+{
+ return _newClusterState.getClusterState() == lib::State::DOWN;
+}
+
+bool
+PendingClusterState::iAmDown() const
+{
+ const lib::NodeState& myState(
+ _newClusterState.getNodeState(Node(NodeType::DISTRIBUTOR,
+ _sender.getDistributorIndex())));
+ return myState.getState() == lib::State::DOWN;
+}
+
+bool
+PendingClusterState::storageNodeMayHaveLostData(uint16_t index)
+{
+ Node node(NodeType::STORAGE, index);
+ NodeState newState = _newClusterState.getNodeState(node);
+ NodeState oldState = _prevClusterState.getNodeState(node);
+
+ return (newState.getStartTimestamp() > oldState.getStartTimestamp());
+}
+
+void
+PendingClusterState::updateSetOfNodesThatAreOutdated()
+{
+ const uint16_t nodeCount(newStateStorageNodeCount());
+ for (uint16_t index = 0; index < nodeCount; ++index) {
+ if (storageNodeMayHaveLostData(index) || storageNodeChanged(index)) {
+ _outdatedNodes.insert(index);
+ }
+ }
+}
+
+bool
+PendingClusterState::storageNodeChanged(uint16_t index) {
+ Node node(NodeType::STORAGE, index);
+ NodeState newState = _newClusterState.getNodeState(node);
+ NodeState oldNodeState = _prevClusterState.getNodeState(node);
+
+ // similarTo() also covers disk states.
+ if (!(oldNodeState.similarTo(newState))) {
+ LOG(debug,
+ "State for storage node %d has changed from '%s' to '%s', "
+ "updating bucket information",
+ index,
+ oldNodeState.toString().c_str(),
+ newState.toString().c_str());
+ return true;
+ }
+
+ return false;
+}
+
+void
+PendingClusterState::requestNodes()
+{
+ LOG(debug,
+ "New system state: Old state was %s, new state is %s",
+ _prevClusterState.toString().c_str(),
+ _newClusterState.toString().c_str());
+
+ requestBucketInfoFromStorageNodesWithChangedState();
+}
+
+void
+PendingClusterState::requestBucketInfoFromStorageNodesWithChangedState()
+{
+ for (uint16_t idx : _outdatedNodes) {
+ if (storageNodeUpInNewState(idx)) {
+ requestNode(idx);
+ }
+ }
+}
+
+bool
+PendingClusterState::distributorChanged(
+ const lib::ClusterState& oldState,
+ const lib::ClusterState& newState)
+{
+ if (newState.getDistributionBitCount() !=
+ oldState.getDistributionBitCount())
+ {
+ return true;
+ }
+
+ Node myNode(NodeType::DISTRIBUTOR, _sender.getDistributorIndex());
+ if (oldState.getNodeState(myNode).getState() ==
+ lib::State::DOWN)
+ {
+ return true;
+ }
+
+ uint16_t oldCount = oldState.getNodeCount(NodeType::DISTRIBUTOR);
+ uint16_t newCount = newState.getNodeCount(NodeType::DISTRIBUTOR);
+
+ uint16_t maxCount = std::max(oldCount, newCount);
+
+ for (uint16_t i = 0; i < maxCount; ++i) {
+ Node node(NodeType::DISTRIBUTOR, i);
+
+ const lib::State& old(oldState.getNodeState(node).getState());
+ const lib::State& nw(newState.getNodeState(node).getState());
+
+ if (nodeWasUpButNowIsDown(old, nw)) {
+ return (nodeInSameGroupAsSelf(i)
+ || nodeNeedsOwnershipTransferFromGroupDown(i, newState));
+ }
+ }
+
+ return false;
+}
+
+bool
+PendingClusterState::nodeWasUpButNowIsDown(const lib::State& old,
+ const lib::State& nw) const
+{
+ return (old.oneOf("uimr") && !nw.oneOf("uimr"));
+}
+
+bool
+PendingClusterState::nodeInSameGroupAsSelf(uint16_t index) const
+{
+ if (_clusterInfo->nodeInSameGroupAsSelf(index)) {
+ LOG(debug,
+ "Distributor %d state changed, need to request data from all "
+ "storage nodes",
+ index);
+ return true;
+ } else {
+ LOG(debug,
+ "Distributor %d state changed but unrelated to my group.",
+ index);
+ return false;
+ }
+}
+
+bool
+PendingClusterState::nodeNeedsOwnershipTransferFromGroupDown(
+ uint16_t nodeIndex,
+ const lib::ClusterState& state) const
+{
+ const lib::Distribution& dist(_clusterInfo->getDistribution());
+ if (!dist.distributorAutoOwnershipTransferOnWholeGroupDown()) {
+ return false; // Not doing anything for downed groups.
+ }
+ const lib::Group* group(dist.getNodeGraph().getGroupForNode(nodeIndex));
+ // If there is no group information associated with the node (because the
+ // group has changed or the node has been removed from config), we must
+ // also invoke ownership transfer of buckets.
+ if (group == nullptr
+ || lib::Distribution::allDistributorsDown(*group, state))
+ {
+ LOG(debug,
+ "Distributor %u state changed and is in a "
+ "group that now has no distributors remaining",
+ nodeIndex);
+ return true;
+ }
+ return false;
+}
+
+void
+PendingClusterState::requestNode(uint16_t node)
+{
+ vespalib::string distributionHash(_clusterInfo->getDistributionHash());
+ LOG(debug,
+ "Requesting bucket info for node %d with cluster state '%s' "
+ "and distribution hash '%s'",
+ node,
+ _newClusterState.toString().c_str(),
+ distributionHash.c_str());
+
+ std::shared_ptr<api::RequestBucketInfoCommand> cmd(
+ new api::RequestBucketInfoCommand(
+ _sender.getDistributorIndex(),
+ _newClusterState,
+ distributionHash));
+
+ cmd->setPriority(api::StorageMessage::HIGH);
+ cmd->setTimeout(INT_MAX);
+
+ _sentMessages[cmd->getMsgId()] = node;
+
+ _sender.sendToNode(NodeType::STORAGE, node, cmd);
+}
+
+bool
+PendingClusterState::onRequestBucketInfoReply(
+ const std::shared_ptr<api::RequestBucketInfoReply>& reply)
+{
+ auto iter = _sentMessages.find(reply->getMsgId());
+
+ if (iter == _sentMessages.end()) {
+ return false;
+ }
+ const uint16_t node = iter->second;
+
+ if (!reply->getResult().success()) {
+ framework::MilliSecTime resendTime(_clock);
+ resendTime += framework::MilliSecTime(100);
+ _delayedRequests.push_back(std::make_pair(resendTime, node));
+ _sentMessages.erase(iter);
+ return true;
+ }
+
+ setNodeReplied(node);
+
+ for (uint32_t i = 0; i < reply->getBucketInfo().size(); ++i) {
+ addNodeInfo(reply->getBucketInfo()[i]._bucketId,
+ BucketCopy(_creationTimestamp,
+ node,
+ reply->getBucketInfo()[i]._info));
+ }
+
+ _sentMessages.erase(iter);
+
+ return true;
+}
+
+void
+PendingClusterState::resendDelayedMessages() {
+ if (_delayedRequests.empty()) return; // Don't fetch time if not needed
+ framework::MilliSecTime currentTime(_clock);
+ while (!_delayedRequests.empty()
+ && currentTime >= _delayedRequests.front().first)
+ {
+ requestNode(_delayedRequests.front().second);
+ _delayedRequests.pop_front();
+ }
+}
+
+void
+PendingClusterState::addNodeInfo(
+ const document::BucketId& id,
+ const BucketCopy& copy)
+{
+ _entries.push_back(Entry(id, copy));
+}
+
+PendingClusterState::Range
+PendingClusterState::skipAllForSameBucket()
+{
+ Range r(_iter, _iter);
+
+ for (document::BucketId& bid = _entries[_iter].bucketId;
+ _iter < _entries.size() && _entries[_iter].bucketId == bid;
+ ++_iter)
+ {
+ }
+
+ r.second = _iter;
+ return r;
+}
+
+void
+PendingClusterState::insertInfo(
+ BucketDatabase::Entry& info,
+ const Range& range)
+{
+ std::vector<BucketCopy> copiesToAddOrUpdate(
+ getCopiesThatAreNewOrAltered(info, range));
+
+ std::vector<uint16_t> order(
+ _clusterInfo->getIdealStorageNodesForState(
+ _newClusterState,
+ _entries[range.first].bucketId));
+ info->addNodes(copiesToAddOrUpdate, order);
+
+ LOG_BUCKET_OPERATION_NO_LOCK(
+ _entries[range.first].bucketId,
+ vespalib::make_vespa_string("insertInfo: %s",
+ info.toString().c_str()));
+}
+
+std::vector<BucketCopy>
+PendingClusterState::getCopiesThatAreNewOrAltered(
+ BucketDatabase::Entry& info,
+ const Range& range)
+{
+ std::vector<BucketCopy> copiesToAdd;
+ for (uint32_t i = range.first; i < range.second; ++i) {
+ const BucketCopy& candidate(_entries[i].copy);
+ const BucketCopy* cp = info->getNode(candidate.getNode());
+
+ if (!cp || !(cp->getBucketInfo() == candidate.getBucketInfo())) {
+ copiesToAdd.push_back(candidate);
+ }
+ }
+ return copiesToAdd;
+}
+
+std::string
+PendingClusterState::requestNodesToString()
+{
+ std::ostringstream ost;
+ for (uint32_t i = 0; i < _requestedNodes.size(); ++i) {
+ if (_requestedNodes[i]) {
+ if (ost.str().length() > 0) {
+ ost << ",";
+ }
+ ost << i;
+ }
+ }
+ return ost.str();
+}
+
+bool
+PendingClusterState::removeCopiesFromNodesThatWereRequested(
+ BucketDatabase::Entry& e,
+ const document::BucketId& bucketId)
+{
+ bool updated = false;
+ for (uint32_t i = 0; i < e->getNodeCount(); ++i) {
+ auto& info(e->getNodeRef(i));
+ const uint16_t entryNode(info.getNode());
+ // Don't remove an entry if it's been updated in the time after the
+ // bucket info requests were sent, as this would erase newer state.
+ if (nodeIsOutdated(entryNode)
+ && (info.getTimestamp() < _creationTimestamp)
+ && e->removeNode(entryNode))
+ {
+ LOG(spam,
+ "Removed bucket %s from node %d",
+ bucketId.toString().c_str(),
+ entryNode);
+ updated = true;
+ }
+ }
+ return updated;
+}
+
+bool
+PendingClusterState::databaseIteratorHasPassedBucketInfoIterator(
+ const document::BucketId& bucketId) const
+{
+ return (_iter < _entries.size()
+ && _entries[_iter].bucketId.toKey() < bucketId.toKey());
+}
+
+bool
+PendingClusterState::bucketInfoIteratorPointsToBucket(
+ const document::BucketId& bucketId) const
+{
+ return _iter < _entries.size() && _entries[_iter].bucketId == bucketId;
+}
+
+bool
+PendingClusterState::process(BucketDatabase::Entry& e)
+{
+ document::BucketId bucketId(e.getBucketId());
+
+ LOG(spam,
+ "Before merging info from nodes [%s], bucket %s had info %s",
+ requestNodesToString().c_str(),
+ bucketId.toString().c_str(),
+ e.getBucketInfo().toString().c_str());
+
+ while (databaseIteratorHasPassedBucketInfoIterator(bucketId)) {
+ LOG(spam, "Found new bucket %s, adding",
+ _entries[_iter].bucketId.toString().c_str());
+
+ _missingEntries.push_back(skipAllForSameBucket());
+ }
+
+ bool updated(removeCopiesFromNodesThatWereRequested(e, bucketId));
+
+ if (bucketInfoIteratorPointsToBucket(bucketId)) {
+ LOG(spam, "Updating bucket %s",
+ _entries[_iter].bucketId.toString().c_str());
+
+ insertInfo(e, skipAllForSameBucket());
+ updated = true;
+ }
+
+ // Remove bucket if we've previously removed all nodes from it
+ if (e->getNodeCount() == 0 && updated) {
+ _removedBuckets.push_back(bucketId);
+ }
+
+ LOG(spam,
+ "After merging info from nodes [%s], bucket %s had info %s",
+ requestNodesToString().c_str(),
+ bucketId.toString().c_str(),
+ e.getBucketInfo().toString().c_str());
+
+ return true;
+}
+
+void
+PendingClusterState::addToBucketDB(BucketDatabase& db,
+ const Range& range)
+{
+ LOG(spam, "Adding new bucket %s with %d copies",
+ _entries[range.first].bucketId.toString().c_str(),
+ range.second - range.first);
+
+ BucketDatabase::Entry e(_entries[range.first].bucketId, BucketInfo());
+ insertInfo(e, range);
+ if (e->getLastGarbageCollectionTime() == 0) {
+ e->setLastGarbageCollectionTime(
+ framework::MicroSecTime(_creationTimestamp)
+ .getSeconds().getTime());
+ }
+ db.update(e);
+}
+
+void
+PendingClusterState::mergeInto(BucketDatabase& db)
+{
+ std::sort(_entries.begin(), _entries.end());
+
+ db.forEach(*this);
+
+ for (uint32_t i = 0; i < _removedBuckets.size(); ++i) {
+ db.remove(_removedBuckets[i]);
+ }
+ _removedBuckets.clear();
+
+ // All of the remaining were not already in the bucket database.
+ while (_iter < _entries.size()) {
+ _missingEntries.push_back(skipAllForSameBucket());
+ }
+
+ for (uint32_t i = 0; i < _missingEntries.size(); ++i) {
+ addToBucketDB(db, _missingEntries[i]);
+ }
+}
+
+void
+PendingClusterState::printXml(vespalib::XmlOutputStream& xos) const
+{
+ using namespace vespalib::xml;
+ xos << XmlTag("systemstate_pending")
+ << XmlAttribute("state", _newClusterState);
+ for (std::map<uint64_t, uint16_t>::const_iterator iter
+ = _sentMessages.begin(); iter != _sentMessages.end(); ++iter)
+ {
+ xos << XmlTag("pending")
+ << XmlAttribute("node", iter->second)
+ << XmlEndTag();
+ }
+ xos << XmlEndTag();
+}
+
+PendingClusterState::Summary
+PendingClusterState::getSummary() const
+{
+ return Summary(_prevClusterState.toString(),
+ _newClusterState.toString(),
+ (_clock.getTimeInMicros().getTime() - _creationTimestamp));
+}
+
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/pendingclusterstate.h b/storage/src/vespa/storage/distributor/pendingclusterstate.h
new file mode 100644
index 00000000000..c31747cf600
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/pendingclusterstate.h
@@ -0,0 +1,287 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/vespalib/util/xmlserializable.h>
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/storage/distributor/clusterinformation.h>
+#include <unordered_set>
+
+namespace storage {
+
+namespace distributor {
+
+class DistributorMessageSender;
+
+/**
+ * Class used by BucketDBUpdater to track request bucket info
+ * messages sent to the storage nodes.
+ */
+class PendingClusterState : public vespalib::XmlSerializable,
+ public BucketDatabase::MutableEntryProcessor {
+public:
+ struct Entry {
+ Entry(const document::BucketId& bid,
+ const BucketCopy& copy_)
+ : bucketId(bid),
+ copy(copy_) {};
+
+ document::BucketId bucketId;
+ BucketCopy copy;
+
+ bool operator<(const Entry& other) const {
+ return bucketId.toKey() < other.bucketId.toKey();
+ }
+ };
+
+ struct Summary {
+ Summary(const std::string& prevClusterState,
+ const std::string& newClusterState,
+ uint32_t processingTime)
+ : _prevClusterState(prevClusterState),
+ _newClusterState(newClusterState),
+ _processingTime(processingTime) {};
+
+ std::string _prevClusterState;
+ std::string _newClusterState;
+ uint32_t _processingTime;
+ };
+
+ typedef std::vector<Entry> EntryList;
+
+ static std::unique_ptr<PendingClusterState> createForClusterStateChange(
+ const framework::Clock& clock,
+ const ClusterInformation::CSP& clusterInfo,
+ DistributorMessageSender& sender,
+ const std::shared_ptr<api::SetSystemStateCommand>& newStateCmd,
+ const std::unordered_set<uint16_t>& outdatedNodes,
+ api::Timestamp creationTimestamp)
+ {
+ return std::unique_ptr<PendingClusterState>(
+ new PendingClusterState(clock, clusterInfo, sender, newStateCmd,
+ outdatedNodes,
+ creationTimestamp));
+ }
+
+ /**
+ * Distribution changes always need to ask all storage nodes, so no
+ * need to do an union of existing outdated nodes; implicit complete set.
+ */
+ static std::unique_ptr<PendingClusterState> createForDistributionChange(
+ const framework::Clock& clock,
+ const ClusterInformation::CSP& clusterInfo,
+ DistributorMessageSender& sender,
+ api::Timestamp creationTimestamp)
+ {
+ return std::unique_ptr<PendingClusterState>(
+ new PendingClusterState(clock, clusterInfo, sender,
+ creationTimestamp));
+ }
+
+ /**
+ * Adds the info from the reply to our list of information.
+ * Returns true if the reply was accepted by this object, false if not.
+ */
+ bool onRequestBucketInfoReply(
+ const std::shared_ptr<api::RequestBucketInfoReply>& reply);
+
+ /**
+ * Tags the given node as having replied to the
+ * request bucket info command.
+ */
+ void setNodeReplied(uint16_t nodeIdx) {
+ _requestedNodes[nodeIdx] = true;
+ }
+
+ /**
+ * Adds info from a node to our list of information.
+ */
+ void addNodeInfo(const document::BucketId& id,
+ const BucketCopy& copy);
+
+ /** Called to resend delayed resends due to failures. */
+ void resendDelayedMessages();
+
+ /**
+ * Returns true if all the nodes we requested have replied to
+ * the request bucket info commands.
+ */
+ bool done() {
+ return _sentMessages.empty() && _delayedRequests.empty();
+ }
+
+ std::shared_ptr<api::SetSystemStateCommand> getCommand() {
+ return _cmd;
+ }
+
+ const lib::ClusterState& getNewClusterState() const {
+ return _newClusterState;
+ }
+ const lib::ClusterState& getPrevClusterState() const {
+ return _prevClusterState;
+ }
+ const lib::Distribution& getDistribution() const {
+ return _clusterInfo->getDistribution();
+ }
+
+ /**
+ * Returns the union set of the outdated node set provided at construction
+ * time and the set of nodes that the pending cluster state figured out
+ * were outdated based on the cluster state diff. If the pending cluster
+ * state was constructed for a distribution config change, this set will
+ * be equal to the set of all available storage nodes.
+ */
+ std::unordered_set<uint16_t> getOutdatedNodeSet() const;
+
+ /**
+ * Merges all the results with the given bucket database.
+ */
+ void mergeInto(BucketDatabase& db);
+
+ bool process(BucketDatabase::Entry& e);
+
+ const EntryList& results() const { return _entries; }
+
+ /**
+ * Returns true if this pending state was due to a distribution bit
+ * change rather than an actual state change.
+ */
+ bool distributionChange() const { return _distributionChange; }
+
+ virtual void printXml(vespalib::XmlOutputStream&) const;
+
+ Summary getSummary() const;
+
+private:
+ /**
+ * Creates a pending cluster state that represents
+ * a set system state command from the fleet controller.
+ */
+ PendingClusterState(
+ const framework::Clock&,
+ const ClusterInformation::CSP& clusterInfo,
+ DistributorMessageSender& sender,
+ const std::shared_ptr<api::SetSystemStateCommand>& newStateCmd,
+ const std::unordered_set<uint16_t>& outdatedNodes,
+ api::Timestamp creationTimestamp);
+
+ /**
+ * Creates a pending cluster state that represents a distribution
+ * change.
+ */
+ PendingClusterState(
+ const framework::Clock&,
+ const ClusterInformation::CSP& clusterInfo,
+ DistributorMessageSender& sender,
+ api::Timestamp creationTimestamp);
+
+ void logConstructionInformation() const;
+
+ void requestNode(uint16_t node);
+
+ bool distributorChanged(const lib::ClusterState& oldState,
+ const lib::ClusterState& newState);
+
+ bool storageNodeMayHaveLostData(uint16_t index);
+ bool storageNodeChanged(uint16_t index);
+
+ void markAllAvailableNodesAsRequiringRequest();
+
+ void addAdditionalNodesToOutdatedSet(
+ const std::unordered_set<uint16_t>& nodes);
+
+ void updateSetOfNodesThatAreOutdated();
+
+ void requestNodes();
+
+ void requestBucketInfoFromStorageNodesWithChangedState();
+
+ /**
+ * Number of nodes with node type 'storage' in _newClusterState.
+ */
+ uint16_t newStateStorageNodeCount() const;
+
+ bool shouldRequestBucketInfo() const;
+ bool clusterIsDown() const;
+ bool iAmDown() const;
+
+ bool nodeInSameGroupAsSelf(uint16_t index) const;
+ bool nodeNeedsOwnershipTransferFromGroupDown(
+ uint16_t nodeIndex,
+ const lib::ClusterState& state) const;
+ bool nodeWasUpButNowIsDown(const lib::State& old,
+ const lib::State& nw) const;
+
+ typedef std::pair<uint32_t, uint32_t> Range;
+
+ /**
+ * Skips through all entries for the same bucket and returns
+ * the range in the entry list for which they were found.
+ * The range is [from, to>
+ */
+ Range skipAllForSameBucket();
+
+ void insertInfo(BucketDatabase::Entry& info, const Range& range);
+ void addToBucketDB(BucketDatabase& db, const Range& range);
+
+ std::vector<BucketCopy> getCopiesThatAreNewOrAltered(
+ BucketDatabase::Entry& info,
+ const Range& range);
+
+ std::string requestNodesToString();
+
+ bool removeCopiesFromNodesThatWereRequested(
+ BucketDatabase::Entry& e,
+ const document::BucketId& bucketId);
+
+ bool databaseIteratorHasPassedBucketInfoIterator(
+ const document::BucketId& bucketId) const;
+ bool bucketInfoIteratorPointsToBucket(
+ const document::BucketId& bucketId) const;
+
+ bool nodeIsOutdated(uint16_t node) const {
+ return (_outdatedNodes.find(node) != _outdatedNodes.end());
+ }
+
+ bool storageNodeUpInNewState(uint16_t node) const;
+
+ std::shared_ptr<api::SetSystemStateCommand> _cmd;
+
+ std::map<uint64_t, uint16_t> _sentMessages;
+ std::vector<bool> _requestedNodes;
+ std::vector<document::BucketId> _removedBuckets;
+ std::deque<std::pair<framework::MilliSecTime, uint16_t> > _delayedRequests;
+
+ // Set for all nodes that may have changed state since that previous
+ // active cluster state, or that were marked as outdated when the pending
+ // cluster state was constructed.
+ // May be a superset of _requestedNodes, as some nodes that are outdated
+ // may be down and thus cannot get a request.
+ std::unordered_set<uint16_t> _outdatedNodes;
+
+ EntryList _entries;
+ uint32_t _iter;
+
+ std::vector<Range> _missingEntries;
+
+ bool _distributionChange;
+
+ lib::ClusterState _prevClusterState;
+ lib::ClusterState _newClusterState;
+
+ const framework::Clock& _clock;
+ ClusterInformation::CSP _clusterInfo;
+ api::Timestamp _creationTimestamp;
+
+ DistributorMessageSender& _sender;
+};
+
+}
+
+}
+
+
diff --git a/storage/src/vespa/storage/distributor/pendingmessagetracker.cpp b/storage/src/vespa/storage/distributor/pendingmessagetracker.cpp
new file mode 100644
index 00000000000..a34adb2336c
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/pendingmessagetracker.cpp
@@ -0,0 +1,349 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+#include <map>
+#include <vespa/log/log.h>
+
+#include <algorithm>
+
+LOG_SETUP(".pendingmessages");
+
+namespace storage {
+
+namespace distributor {
+
+PendingMessageTracker::PendingMessageTracker(framework::ComponentRegister& cr)
+ : framework::HtmlStatusReporter("pendingmessages",
+ "Pending messages to storage nodes"),
+ _component(cr, "pendingmessagetracker"),
+ _nodeIndexToStats(),
+ _nodeInfo(_component.getClock()),
+ _statisticsForwarder(*this),
+ _lock()
+{
+ _component.registerStatusPage(*this);
+}
+
+PendingMessageTracker::~PendingMessageTracker()
+{
+}
+
+PendingMessageTracker::MessageEntry::MessageEntry(
+ TimePoint timeStamp_,
+ uint32_t msgType_,
+ uint32_t priority_,
+ uint64_t msgId_,
+ document::BucketId bucketId_,
+ uint16_t nodeIdx_,
+ const vespalib::string & msgText_)
+ : timeStamp(timeStamp_),
+ msgType(msgType_),
+ priority(priority_),
+ msgId(msgId_),
+ bucketId(bucketId_),
+ nodeIdx(nodeIdx_),
+ msgText(msgText_)
+{
+}
+
+PendingMessageTracker::TimePoint
+PendingMessageTracker::currentTime() const
+{
+ return TimePoint(_component.getClock().getTimeInMillis().getTime());
+}
+
+namespace {
+
+template <typename Pair>
+struct PairAsRange
+{
+ Pair _pair;
+ explicit PairAsRange(Pair pair) : _pair(std::move(pair)) {}
+
+ auto begin() -> decltype(_pair.first) { return _pair.first; }
+ auto end() -> decltype(_pair.second) { return _pair.second; }
+ auto begin() const -> decltype(_pair.first) { return _pair.first; }
+ auto end() const -> decltype(_pair.second) { return _pair.second; }
+};
+
+template <typename Pair>
+PairAsRange<Pair>
+pairAsRange(Pair pair)
+{
+ return PairAsRange<Pair>(std::move(pair));
+}
+
+}
+
+std::vector<uint64_t>
+PendingMessageTracker::clearMessagesForNode(uint16_t node)
+{
+ vespalib::LockGuard guard(_lock);
+ MessagesByNodeAndBucket& idx(boost::multi_index::get<1>(_messages));
+ auto range = pairAsRange(idx.equal_range(boost::make_tuple(node)));
+
+ std::vector<uint64_t> erasedIds;
+ for (auto& entry : range) {
+ erasedIds.push_back(entry.msgId);
+ }
+ idx.erase(std::begin(range), std::end(range));
+
+ _nodeInfo.clearPending(node);
+ return erasedIds;
+}
+
+void
+PendingMessageTracker::insert(
+ const std::shared_ptr<api::StorageMessage>& msg)
+{
+ vespalib::LockGuard guard(_lock);
+ if (msg->getAddress()) {
+ _messages.insert(
+ MessageEntry(currentTime(),
+ msg->getType().getId(),
+ msg->getPriority(),
+ msg->getMsgId(),
+ msg->getBucketId(),
+ msg->getAddress()->getIndex(),
+ msg->getSummary()));
+
+ _nodeInfo.incPending(msg->getAddress()->getIndex());
+
+ LOG(debug, "Sending message %s with id %zu to %s",
+ msg->toString().c_str(),
+ msg->getMsgId(),
+ msg->getAddress()->toString().c_str());
+ }
+}
+
+document::BucketId
+PendingMessageTracker::reply(const api::StorageReply& r)
+{
+ vespalib::LockGuard guard(_lock);
+ document::BucketId bucketId;
+
+ LOG(debug, "Got reply: %s", r.toString().c_str());
+ uint64_t msgId = r.getMsgId();
+
+ MessagesByMsgId& msgs = boost::multi_index::get<0>(_messages);
+ MessagesByMsgId::iterator iter = msgs.find(msgId);
+
+ if (iter != msgs.end()) {
+ bucketId = iter->bucketId;
+ _nodeInfo.decPending(r.getAddress()->getIndex());
+ updateNodeStatsOnReply(*iter);
+ api::ReturnCode::Result code = r.getResult().getResult();
+ if (code == api::ReturnCode::BUSY || code == api::ReturnCode::TIMEOUT) {
+ _nodeInfo.setBusy(r.getAddress()->getIndex());
+ }
+ LOG(debug, "Erased message with id %zu", msgId);
+ msgs.erase(msgId);
+ }
+
+ return bucketId;
+}
+
+void
+PendingMessageTracker::updateNodeStatsOnReply(const MessageEntry& entry)
+{
+ NodeStats& stats(_nodeIndexToStats[entry.nodeIdx]);
+ switch (entry.msgType) {
+ case api::MessageType::PUT_ID:
+ updateOperationStats(stats.puts, entry);
+ break;
+ default:
+ return; // Message was for type not tracked by stats.
+ }
+}
+
+void
+PendingMessageTracker::updateOperationStats(OperationStats& opStats,
+ const MessageEntry& entry) const
+{
+ // Time might go backwards due to clock adjustments (here assuming clock
+ // implementation in storage framework is non-monotonic), so avoid negative
+ // latencies by clamping to delta of 0.
+ auto now = std::max(currentTime(), entry.timeStamp);
+ opStats.totalLatency += (now - entry.timeStamp);
+ ++opStats.numRequests;
+}
+
+
+NodeStatsSnapshot
+PendingMessageTracker::getLatencyStatistics() const
+{
+ vespalib::LockGuard guard(_lock);
+ NodeStatsSnapshot snapshot;
+ // Conveniently, snapshot data structure is exactly the same as our own.
+ snapshot.nodeToStats = _nodeIndexToStats;
+ return snapshot;
+}
+
+NodeStatsSnapshot
+PendingMessageTracker::ForwardingLatencyStatisticsProvider
+::doGetLatencyStatistics() const
+{
+ return _messageTracker.getLatencyStatistics();
+}
+
+namespace {
+
+template <typename Range>
+void
+runCheckerOnRange(PendingMessageTracker::Checker& checker, const Range& range)
+{
+ for (auto& e : range) {
+ if (!checker.check(e.msgType, e.nodeIdx, e.priority)) {
+ break;
+ }
+ }
+}
+
+}
+
+void
+PendingMessageTracker::checkPendingMessages(uint16_t node,
+ const document::BucketId& bid,
+ Checker& checker) const
+{
+ vespalib::LockGuard guard(_lock);
+ const MessagesByNodeAndBucket& msgs(boost::multi_index::get<1>(_messages));
+
+ auto range = pairAsRange(msgs.equal_range(boost::make_tuple(node, bid)));
+ runCheckerOnRange(checker, range);
+}
+
+void
+PendingMessageTracker::checkPendingMessages(const document::BucketId& bid,
+ Checker& checker) const
+{
+ vespalib::LockGuard guard(_lock);
+ const MessagesByBucketAndType& msgs(boost::multi_index::get<2>(_messages));
+
+ auto range = pairAsRange(msgs.equal_range(boost::make_tuple(bid)));
+ runCheckerOnRange(checker, range);
+}
+
+bool
+PendingMessageTracker::hasPendingMessage(uint16_t node,
+ const document::BucketId& bid,
+ uint32_t messageType) const
+{
+ vespalib::LockGuard guard(_lock);
+ const MessagesByNodeAndBucket& msgs(boost::multi_index::get<1>(_messages));
+
+ auto range = msgs.equal_range(boost::make_tuple(node, bid, messageType));
+ return (range.first != range.second);
+}
+
+void
+PendingMessageTracker::getStatusStartPage(std::ostream& out) const
+{
+ out << "View:\n"
+ "<ul>\n"
+ "<li><a href=\"?order=bucket\">Group by bucket</a></li>"
+ "<li><a href=\"?order=node\">Group by node</a></li>\n";
+}
+
+void
+PendingMessageTracker::getStatusPerBucket(std::ostream& out) const
+{
+ vespalib::LockGuard guard(_lock);
+ const MessagesByNodeAndBucket& msgs = boost::multi_index::get<1>(_messages);
+ using BucketMap = std::map<document::BucketId,
+ std::vector<vespalib::string>>;
+ BucketMap perBucketMsgs;
+ for (auto& msg : msgs) {
+ vespalib::asciistream ss;
+ ss << "<li><i>Node "
+ << msg.nodeIdx << "</i>: "
+ << "<b>"
+ << framework::MilliSecTime(msg.timeStamp.count()).toString()
+ << "</b> "
+ << msg.msgText << "</li>\n";
+
+ perBucketMsgs[msg.bucketId].emplace_back(ss.str());
+ }
+
+ document::BucketId lastBucketId;
+ for (auto& bucket : perBucketMsgs) {
+ if (lastBucketId.getRawId() != 0) {
+ out << "</ul>\n";
+ }
+ out << "<b>" << bucket.first << "</b>\n";
+ out << "<ul>\n";
+ lastBucketId = bucket.first;
+ for (auto& msgDesc : bucket.second) {
+ out << msgDesc;
+ }
+ }
+
+ if (lastBucketId.getRawId() != 0) {
+ out << "</ul>\n";
+ }
+}
+
+void
+PendingMessageTracker::getStatusPerNode(std::ostream& out) const
+{
+ vespalib::LockGuard guard(_lock);
+ const MessagesByNodeAndBucket& msgs = boost::multi_index::get<1>(_messages);
+ int lastNode = -1;
+ for (MessagesByNodeAndBucket::const_iterator iter =
+ msgs.begin(); iter != msgs.end(); iter++) {
+ if (iter->nodeIdx != lastNode) {
+ if (lastNode != -1) {
+ out << "</ul>\n";
+ }
+
+ out << "<b>Node " << iter->nodeIdx
+ << " (pending count: "
+ << _nodeInfo.getPendingCount(iter->nodeIdx)
+ << ")</b>\n<ul>\n";
+ lastNode = iter->nodeIdx;
+ }
+
+ out << "<li><b>"
+ << framework::MilliSecTime(iter->timeStamp.count()).toString()
+ << "</b> "
+ << iter->msgText << "</li>\n";
+ }
+
+ if (lastNode != -1) {
+ out << "</ul>\n";
+ }
+}
+
+void
+PendingMessageTracker::reportHtmlStatus(
+ std::ostream& out, const framework::HttpUrlPath& path) const
+{
+ if (!path.hasAttribute("order")) {
+ getStatusStartPage(out);
+ } else if (path.getAttribute("order") == "bucket") {
+ getStatusPerBucket(out);
+ } else if (path.getAttribute("order") == "node") {
+ getStatusPerNode(out);
+ }
+}
+
+void
+PendingMessageTracker::print(std::ostream& /*out*/,
+ bool /*verbose*/,
+ const std::string& /*indent*/) const
+{
+
+}
+
+NodeStats
+PendingMessageTracker::getNodeStats(uint16_t node) const
+{
+ vespalib::LockGuard guard(_lock);
+ auto nodeIter = _nodeIndexToStats.find(node);
+ return (nodeIter != _nodeIndexToStats.end() ? nodeIter->second
+ : NodeStats());
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/pendingmessagetracker.h b/storage/src/vespa/storage/distributor/pendingmessagetracker.h
new file mode 100644
index 00000000000..bcae5cc9790
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/pendingmessagetracker.h
@@ -0,0 +1,255 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/storageapi/messageapi/returncode.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storageframework/storageframework.h>
+#include <boost/multi_index_container.hpp>
+#include <boost/multi_index/identity.hpp>
+#include <boost/multi_index/member.hpp>
+#include <boost/multi_index/mem_fun.hpp>
+#include <boost/multi_index/ordered_index.hpp>
+#include <boost/multi_index/sequenced_index.hpp>
+#include <boost/multi_index/composite_key.hpp>
+#include <vespa/storage/distributor/nodeinfo.h>
+#include <vespa/storage/distributor/latency_statistics_provider.h>
+#include <vespa/vespalib/stllike/hash_set.h>
+#include <vespa/vespalib/util/sync.h>
+
+#include <set>
+#include <unordered_map>
+
+namespace storage {
+namespace distributor {
+
+class PendingMessageTracker : public framework::HtmlStatusReporter
+{
+ class ForwardingLatencyStatisticsProvider
+ : public LatencyStatisticsProvider
+ {
+ PendingMessageTracker& _messageTracker;
+ public:
+ ForwardingLatencyStatisticsProvider(
+ PendingMessageTracker& messageTracker)
+ : _messageTracker(messageTracker)
+ {
+ }
+
+ NodeStatsSnapshot doGetLatencyStatistics() const override;
+ };
+
+public:
+ class Checker {
+ public:
+ virtual ~Checker() {}
+
+ virtual bool check(uint32_t messageType,
+ uint16_t node,
+ uint8_t priority) = 0;
+ };
+
+ /**
+ * Time point represented as the millisecond interval from the framework
+ * clock's epoch to a given point in time. Note that it'd be more
+ * semantically correct to use std::chrono::time_point, but it is bound
+ * to specific chrono clock types, their epochs and duration resolution.
+ */
+ using TimePoint = std::chrono::milliseconds;
+
+ PendingMessageTracker(framework::ComponentRegister&);
+ ~PendingMessageTracker();
+
+ void insert(const std::shared_ptr<api::StorageMessage>&);
+
+ document::BucketId reply(const api::StorageReply& reply);
+
+ void reportHtmlStatus(std::ostream&, const framework::HttpUrlPath&) const;
+
+ void print(std::ostream& out,
+ bool verbose,
+ const std::string& indent) const;
+
+ /**
+ * Goes through each pending message for the given node+bucket pair,
+ * passing it to the given type checker.
+ * Breaks when the checker returns false.
+ */
+ void checkPendingMessages(uint16_t node,
+ const document::BucketId& bid,
+ Checker& checker) const;
+
+ /**
+ * Goes through each pending message (across all nodes) for the given bucket
+ * and invokes the given checker with the node, message type and priority.
+ * Breaks when the checker returns false.
+ */
+ void checkPendingMessages(const document::BucketId& bid,
+ Checker& checker) const;
+
+ /**
+ * Utility function for checking if there's a message of type
+ * messageType pending to bucket bid on the given node.
+ */
+ bool hasPendingMessage(uint16_t node,
+ const document::BucketId& bid,
+ uint32_t messageType) const;
+
+ /**
+ * Returns a vector containing the number of pending messages to each storage node.
+ * The vector might be smaller than a given node index. In that case, that storage
+ * node has never had any pending messages.
+ */
+ const NodeInfo& getNodeInfo() const { return _nodeInfo; }
+ NodeInfo& getNodeInfo() { return _nodeInfo; }
+
+ /**
+ * Get the statistics for all completed operations towards a specific
+ * storage node. "Completed" here means both successful and failed
+ * operations. Statistics are monotonically increasing within the scope of
+ * the process' lifetime and are never reset. This models how the Linux
+ * kernel reports its internal stats and means the caller must maintan
+ * value snapshots to extract meaningful time series information.
+ *
+ * If stats are requested for a node that has not had any operations
+ * complete towards it, the returned stats will be all zero.
+ *
+ * Method is thread safe and data race free.
+ *
+ * It is assumed that NodeStats is sufficiently small that returning it
+ * by value does not incur a measurable performance impact. This also
+ * prevents any data race issues in case returned stats are e.g.
+ * concurrently read by another thread such the metric snapshotting thread.
+ */
+ NodeStats getNodeStats(uint16_t node) const;
+
+ /**
+ * Clears all pending messages for the given node, and returns
+ * the messages erased.
+ */
+ std::vector<uint64_t> clearMessagesForNode(uint16_t node);
+
+ /**
+ * Must not be called when _lock is already held or there will be a
+ * deadlock.
+ */
+ NodeStatsSnapshot getLatencyStatistics() const;
+
+ LatencyStatisticsProvider& getLatencyStatisticsProvider() {
+ return _statisticsForwarder;
+ }
+
+private:
+ struct MessageEntry {
+ TimePoint timeStamp;
+ uint32_t msgType;
+ uint32_t priority;
+ uint64_t msgId;
+ document::BucketId bucketId;
+ uint16_t nodeIdx;
+ vespalib::string msgText;
+
+ MessageEntry(TimePoint timeStamp,
+ uint32_t msgType,
+ uint32_t priority,
+ uint64_t msgId,
+ document::BucketId bucketId,
+ uint16_t nodeIdx,
+ const vespalib::string & msgText);
+ };
+
+ struct MessageIdKey
+ : boost::multi_index::member<MessageEntry, uint64_t, &MessageEntry::msgId>
+ {
+ };
+
+ /**
+ * Each entry has a separate composite keyed index on node+bucket id+type.
+ * This makes it efficient to find all messages for a node, for a bucket
+ * on that node and specific message types to an exact bucket on the node.
+ */
+ struct CompositeNodeBucketKey
+ : boost::multi_index::composite_key<
+ MessageEntry,
+ boost::multi_index::member<MessageEntry, uint16_t,
+ &MessageEntry::nodeIdx>,
+ boost::multi_index::member<MessageEntry, document::BucketId,
+ &MessageEntry::bucketId>,
+ boost::multi_index::member<MessageEntry, uint32_t,
+ &MessageEntry::msgType>
+ >
+ {
+ };
+
+ struct CompositeBucketMsgNodeKey
+ : boost::multi_index::composite_key<
+ MessageEntry,
+ boost::multi_index::member<MessageEntry, document::BucketId,
+ &MessageEntry::bucketId>,
+ boost::multi_index::member<MessageEntry, uint32_t,
+ &MessageEntry::msgType>,
+ boost::multi_index::member<MessageEntry, uint16_t,
+ &MessageEntry::nodeIdx>
+ >
+ {
+ };
+
+ typedef boost::multi_index::multi_index_container <
+ MessageEntry,
+ boost::multi_index::indexed_by<
+ boost::multi_index::ordered_unique<MessageIdKey>,
+ boost::multi_index::ordered_non_unique<CompositeNodeBucketKey>,
+ boost::multi_index::ordered_non_unique<CompositeBucketMsgNodeKey>
+ >
+ > Messages;
+
+ typedef Messages::nth_index<0>::type MessagesByMsgId;
+ typedef Messages::nth_index<1>::type MessagesByNodeAndBucket;
+ typedef Messages::nth_index<2>::type MessagesByBucketAndType;
+
+ Messages _messages;
+ framework::Component _component;
+ std::unordered_map<uint16_t, NodeStats> _nodeIndexToStats;
+ NodeInfo _nodeInfo;
+ ForwardingLatencyStatisticsProvider _statisticsForwarder;
+
+ // Since distributor is currently single-threaded, this will only
+ // contend when status page is being accessed. It is, however, required
+ // to be present for that exact purpose.
+ vespalib::Lock _lock;
+
+ /**
+ * Increment latency and operation count stats for the node the message
+ * was sent towards based on the registered send time and the current time.
+ *
+ * In the event that system time has moved backwards across sending a
+ * command and reciving its reply, the latency will not be recorded but
+ * the total number of messages will increase.
+ *
+ * _lock MUST be held upon invocation.
+ */
+ void updateNodeStatsOnReply(const MessageEntry& entry);
+
+
+ /**
+ * Modifies opStats in-place with added latency based on delta from send
+ * time to current time and incremented operation count.
+ */
+ void updateOperationStats(OperationStats& opStats,
+ const MessageEntry& entry) const;
+
+
+ void getStatusStartPage(std::ostream& out) const;
+
+ void getStatusPerNode(std::ostream& out) const;
+
+ void getStatusPerBucket(std::ostream& out) const;
+
+ TimePoint currentTime() const;
+};
+
+}
+}
+
diff --git a/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp b/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp
new file mode 100644
index 00000000000..34193bbb3c1
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/persistencemessagetracker.cpp
@@ -0,0 +1,367 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/persistencemessagetracker.h>
+#include <vespa/log/log.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storage/common/vectorprinter.h>
+#include <vespa/storage/common/bucketoperationlogger.h>
+
+LOG_SETUP(".persistencemessagetracker");
+
+namespace storage {
+
+namespace distributor {
+
+PersistenceMessageTrackerImpl::PersistenceMessageTrackerImpl(
+ PersistenceOperationMetricSet& metric,
+ std::shared_ptr<api::BucketInfoReply> reply,
+ DistributorComponent& link,
+ api::Timestamp revertTimestamp)
+ : MessageTracker(link.getClusterName()),
+ _metric(metric),
+ _reply(reply),
+ _manager(link),
+ _success(true),
+ _revertTimestamp(revertTimestamp),
+ _priority(reply->getPriority())
+{
+ _creationTime.SetNow();
+}
+
+void
+PersistenceMessageTrackerImpl::updateDB()
+{
+ for (BucketInfoMap::iterator iter = _bucketInfo.begin();
+ iter != _bucketInfo.end();
+ iter++)
+ {
+ _manager.updateBucketDatabase(iter->first, iter->second);
+ }
+
+ for (BucketInfoMap::iterator iter = _remapBucketInfo.begin();
+ iter != _remapBucketInfo.end();
+ iter++)
+ {
+ _manager.updateBucketDatabase(iter->first, iter->second,
+ DatabaseUpdate::CREATE_IF_NONEXISTING);
+ }
+}
+
+void
+PersistenceMessageTrackerImpl::updateMetrics()
+{
+ const api::ReturnCode& result(_reply->getResult());
+ if (result.success()) {
+ ++_metric.ok;
+ } else if (result.getResult() == api::ReturnCode::TIMEOUT) {
+ ++_metric.failures.timeout;
+ } else if (result.isBusy()) {
+ ++_metric.failures.busy;
+ } else if (result.isNodeDownOrNetwork()) {
+ ++_metric.failures.notconnected;
+ } else {
+ ++_metric.failures.storagefailure;
+ }
+ _metric.latency.addValue(_creationTime.MilliSecsToNow());
+}
+
+void
+PersistenceMessageTrackerImpl::fail(MessageSender& sender, const api::ReturnCode& result) {
+ if (_reply.get()) {
+ _reply->setResult(result);
+ updateMetrics();
+ sender.sendReply(_reply);
+ _reply.reset();
+ }
+}
+
+uint16_t
+PersistenceMessageTrackerImpl::receiveReply(
+ MessageSender& sender,
+ api::BucketInfoReply& reply)
+{
+ uint16_t node = handleReply(reply);
+
+ if (node != (uint16_t)-1) {
+ updateFromReply(sender, reply, node);
+ }
+
+ return node;
+}
+
+void
+PersistenceMessageTrackerImpl::revert(
+ MessageSender& sender,
+ const std::vector<std::pair<document::BucketId, uint16_t> > revertNodes)
+{
+ if (_revertTimestamp != 0) {
+ // Since we're reverting, all received bucket info is voided.
+ _bucketInfo.clear();
+
+ std::vector<api::Timestamp> reverts;
+ reverts.push_back(_revertTimestamp);
+
+ for (uint32_t i = 0; i < revertNodes.size(); i++) {
+ std::shared_ptr<api::RevertCommand> toRevert(
+ new api::RevertCommand(revertNodes[i].first, reverts));
+ toRevert->setPriority(_priority);
+ queueCommand(toRevert, revertNodes[i].second);
+ }
+
+ flushQueue(sender);
+ }
+}
+
+void
+PersistenceMessageTrackerImpl::queueMessageBatch(const std::vector<MessageTracker::ToSend>& messages) {
+ _messageBatches.push_back(MessageBatch());
+ for (uint32_t i = 0; i < messages.size(); i++) {
+ if (_reply.get()) {
+ messages[i]._msg->getTrace().setLevel(_reply->getTrace().getLevel());
+ }
+
+ _messageBatches.back().push_back(messages[i]._msg->getMsgId());
+ queueCommand(messages[i]._msg, messages[i]._target);
+ }
+}
+
+bool
+PersistenceMessageTrackerImpl::canSendReplyEarly() const
+{
+ if (!_reply.get() || !_reply->getResult().success()) {
+ LOG(spam, "Can't return early because we have already replied or failed");
+ return false;
+ }
+
+ const lib::Distribution& distribution = _manager.getDistribution();
+
+ if (distribution.getInitialRedundancy() == 0) {
+ LOG(spam, "Not returning early because initial redundancy wasn't set");
+ return false;
+ }
+
+ for (uint32_t i = 0; i < _messageBatches.size(); i++) {
+ uint32_t messagesDone = 0;
+
+ for (uint32_t j = 0; j < _messageBatches[i].size(); j++) {
+ if (_sentMessages.find(_messageBatches[i][j]) == _sentMessages.end()) {
+ messagesDone++;
+ } else if (distribution.ensurePrimaryPersisted() && j == 0) {
+ // Primary must always be written.
+ LOG(debug, "Not returning early because primary node wasn't done");
+ return false;
+ }
+ }
+
+ if (messagesDone < distribution.getInitialRedundancy()) {
+ LOG(spam, "Not returning early because only %d messages out of %d are done",
+ messagesDone, distribution.getInitialRedundancy());
+ return false;
+ }
+ }
+
+ return true;
+}
+
+void
+PersistenceMessageTrackerImpl::checkCopiesDeleted()
+{
+ if (!_reply.get()) {
+ return;
+ }
+
+ // Don't check the buckets that have been remapped here, as we will
+ // create them.
+ for (BucketInfoMap::const_iterator iter = _bucketInfo.begin();
+ iter != _bucketInfo.end();
+ iter++)
+ {
+ BucketDatabase::Entry dbentry =
+ _manager.getBucketDatabase().get(iter->first);
+
+ if (!dbentry.valid()) {
+ continue;
+ }
+
+ std::vector<uint16_t> missing;
+ std::vector<uint16_t> total;
+
+ for (uint32_t i = 0; i < iter->second.size(); ++i) {
+ if (dbentry->getNode(iter->second[i].getNode()) == NULL) {
+ missing.push_back(iter->second[i].getNode());
+ }
+
+ total.push_back(iter->second[i].getNode());
+ }
+
+ if (!missing.empty()) {
+ std::ostringstream msg;
+ msg << iter->first << " was deleted from nodes ["
+ << commaSeparated(missing)
+ << "] after message was sent but before it was done. Sent to ["
+ << commaSeparated(total)
+ << "]";
+
+ LOG(debug, "%s", msg.str().c_str());
+ _reply->setResult(api::ReturnCode(api::ReturnCode::BUCKET_DELETED,
+ msg.str()));
+ break;
+ }
+ }
+}
+
+void
+PersistenceMessageTrackerImpl::addBucketInfoFromReply(
+ uint16_t node,
+ const api::BucketInfoReply& reply)
+{
+ const document::BucketId& bucket(reply.getBucketId());
+ const api::BucketInfo& bucketInfo(reply.getBucketInfo());
+
+ if (reply.hasBeenRemapped()) {
+ LOG(debug, "Bucket %s: Received remapped bucket info %s from node %d",
+ bucket.toString().c_str(),
+ bucketInfo.toString().c_str(),
+ node);
+ _remapBucketInfo[bucket].push_back(
+ BucketCopy(_manager.getUniqueTimestamp(),
+ node,
+ bucketInfo));
+ } else {
+ LOG(debug, "Bucket %s: Received bucket info %s from node %d",
+ bucket.toString().c_str(),
+ bucketInfo.toString().c_str(),
+ node);
+ _bucketInfo[bucket].push_back(
+ BucketCopy(_manager.getUniqueTimestamp(),
+ node,
+ bucketInfo));
+ }
+}
+
+void
+PersistenceMessageTrackerImpl::logSuccessfulReply(uint16_t node,
+ const api::BucketInfoReply& reply) const
+{
+ LOG(spam, "Bucket %s: Received successful reply %s",
+ reply.getBucketId().toString().c_str(),
+ reply.toString().c_str());
+
+ if (!reply.getBucketInfo().valid()) {
+ LOG(error,
+ "Reply %s from node %d contained invalid bucket "
+ "information %s. This is a bug! Please report "
+ "this to the Vespa team",
+ reply.toString().c_str(),
+ node,
+ reply.getBucketInfo().toString().c_str());
+ }
+}
+
+bool
+PersistenceMessageTrackerImpl::shouldRevert() const
+{
+ return _manager.getDistributorConfig().enableRevert
+ && _revertNodes.size() && !_success && _reply.get();
+}
+
+void
+PersistenceMessageTrackerImpl::sendReply(MessageSender& sender)
+{
+ updateMetrics();
+ _trace.setStrict(false);
+ _reply->getTrace().getRoot().addChild(_trace);
+
+ sender.sendReply(_reply);
+ _reply = std::shared_ptr<api::BucketInfoReply>();
+}
+
+void
+PersistenceMessageTrackerImpl::updateFailureResult(const api::BucketInfoReply& reply)
+{
+ LOG(debug, "Bucket %s: Received failed reply %s with result %s",
+ reply.getBucketId().toString().c_str(),
+ reply.toString().c_str(),
+ reply.getResult().toString().c_str());
+ if (reply.getResult().getResult() >
+ _reply->getResult().getResult())
+ {
+ _reply->setResult(reply.getResult());
+ }
+
+ _success = false;
+}
+
+void
+PersistenceMessageTrackerImpl::handleCreateBucketReply(
+ api::BucketInfoReply& reply,
+ uint16_t node)
+{
+ LOG(spam, "Received CreateBucket reply for %s from node %u",
+ reply.getBucketId().toString().c_str(), node);
+ if (!reply.getResult().success()
+ && reply.getResult().getResult() != api::ReturnCode::EXISTS)
+ {
+ LOG(spam, "Create bucket reply failed, so deleting it from bucket db");
+ _manager.removeNodeFromDB(reply.getBucketId(), node);
+ LOG_BUCKET_OPERATION_NO_LOCK(
+ reply.getBucketId(),
+ vespalib::make_vespa_string(
+ "Deleted bucket on node %u due to failing create bucket %s",
+ node, reply.getResult().toString().c_str()));
+ }
+}
+
+void
+PersistenceMessageTrackerImpl::handlePersistenceReply(
+ api::BucketInfoReply& reply,
+ uint16_t node)
+{
+ if (reply.getBucketInfo().valid()) {
+ addBucketInfoFromReply(node, reply);
+ }
+ if (reply.getResult().success()) {
+ logSuccessfulReply(node, reply);
+ _revertNodes.push_back(std::pair<document::BucketId, uint16_t>(
+ reply.getBucketId(), node));
+ } else if (!hasSentReply()) {
+ updateFailureResult(reply);
+ }
+}
+
+void
+PersistenceMessageTrackerImpl::updateFromReply(
+ MessageSender& sender,
+ api::BucketInfoReply& reply,
+ uint16_t node)
+{
+ _trace.addChild(reply.getTrace().getRoot());
+
+ if (reply.getType() == api::MessageType::CREATEBUCKET_REPLY) {
+ handleCreateBucketReply(reply, node);
+ } else {
+ handlePersistenceReply(reply, node);
+ }
+
+ if (finished()) {
+ bool doRevert(shouldRevert());
+
+ checkCopiesDeleted();
+ updateDB();
+
+ if (!hasSentReply()) {
+ sendReply(sender);
+ }
+ if (doRevert) {
+ revert(sender, _revertNodes);
+ }
+ } else if (canSendReplyEarly()) {
+ LOG(debug, "Sending reply early because initial redundancy has been reached");
+ checkCopiesDeleted();
+ sendReply(sender);
+ }
+}
+
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/persistencemessagetracker.h b/storage/src/vespa/storage/distributor/persistencemessagetracker.h
new file mode 100644
index 00000000000..3f667c146eb
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/persistencemessagetracker.h
@@ -0,0 +1,119 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/distributormetricsset.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/distributor/distributorcomponent.h>
+#include <vespa/storage/distributor/messagetracker.h>
+#include <vespa/storageapi/messageapi/bucketinfocommand.h>
+#include <vespa/storageapi/messageapi/bucketinforeply.h>
+
+
+namespace storage {
+
+namespace distributor {
+
+struct PersistenceMessageTracker {
+ virtual ~PersistenceMessageTracker() { }
+ typedef MessageTracker::ToSend ToSend;
+
+ virtual void fail(MessageSender&, const api::ReturnCode&) = 0;
+ virtual void queueMessageBatch(const std::vector<ToSend>&) = 0;
+ virtual uint16_t receiveReply(MessageSender&, api::BucketInfoReply&) = 0;
+ virtual std::shared_ptr<api::BucketInfoReply>& getReply() = 0;
+ virtual void updateFromReply(MessageSender&, api::BucketInfoReply&,
+ uint16_t node) = 0;
+
+ virtual void queueCommand(api::BucketCommand::SP, uint16_t target) = 0;
+ virtual void flushQueue(MessageSender&) = 0;
+ virtual uint16_t handleReply(api::BucketReply& reply) = 0;
+};
+
+class PersistenceMessageTrackerImpl : public PersistenceMessageTracker,
+ public MessageTracker
+{
+private:
+ typedef std::map<document::BucketId, std::vector<BucketCopy> > BucketInfoMap;
+ BucketInfoMap _remapBucketInfo;
+ BucketInfoMap _bucketInfo;
+
+public:
+ PersistenceMessageTrackerImpl(PersistenceOperationMetricSet& metric,
+ std::shared_ptr<api::BucketInfoReply> reply,
+ DistributorComponent&,
+ api::Timestamp revertTimestamp = 0);
+
+ void updateDB();
+
+ void updateMetrics();
+
+ bool success() const { return _success; }
+
+ void fail(MessageSender& sender, const api::ReturnCode& result);
+
+ /**
+ Returns the node the reply was from.
+ */
+ uint16_t receiveReply(MessageSender& sender, api::BucketInfoReply& reply);
+
+ void updateFromReply(MessageSender& sender, api::BucketInfoReply& reply, uint16_t node);
+
+ std::shared_ptr<api::BucketInfoReply>& getReply() { return _reply; }
+
+ typedef std::pair<document::BucketId, uint16_t> BucketNodePair;
+
+ void revert(MessageSender& sender, const std::vector<BucketNodePair> revertNodes);
+
+ /**
+ Sends a set of messages that are permissible for early return.
+ If early return is enabled, each message batch must be "finished", that is,
+ have at most (messages.size() - initial redundancy) messages left in the
+ queue and have it's first message be done.
+ */
+ void queueMessageBatch(const std::vector<MessageTracker::ToSend>& messages);
+
+private:
+ typedef std::vector<uint64_t> MessageBatch;
+ std::vector<MessageBatch> _messageBatches;
+
+ PersistenceOperationMetricSet& _metric;
+ std::shared_ptr<api::BucketInfoReply> _reply;
+ DistributorComponent& _manager;
+ FastOS_Time _creationTime;
+ bool _success;
+ api::Timestamp _revertTimestamp;
+ std::vector<std::pair<document::BucketId, uint16_t> > _revertNodes;
+ mbus::TraceNode _trace;
+ uint8_t _priority;
+
+ bool canSendReplyEarly() const;
+ void addBucketInfoFromReply(uint16_t node,
+ const api::BucketInfoReply& reply);
+ void logSuccessfulReply(uint16_t node,
+ const api::BucketInfoReply& reply) const;
+ bool hasSentReply() const {
+ return _reply.get() == 0;
+ }
+ bool shouldRevert() const;
+ void sendReply(MessageSender& sender);
+ void checkCopiesDeleted();
+ void updateFailureResult(const api::BucketInfoReply& reply);
+ void handleCreateBucketReply(
+ api::BucketInfoReply& reply,
+ uint16_t node);
+ void handlePersistenceReply(
+ api::BucketInfoReply& reply,
+ uint16_t node);
+
+ virtual void queueCommand(std::shared_ptr<api::BucketCommand> msg,
+ uint16_t target)
+ { MessageTracker::queueCommand(msg, target); }
+ virtual void flushQueue(MessageSender& s) { MessageTracker::flushQueue(s); }
+ virtual uint16_t handleReply(api::BucketReply& r)
+ { return MessageTracker::handleReply(r); }
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/sentmessagemap.cpp b/storage/src/vespa/storage/distributor/sentmessagemap.cpp
new file mode 100644
index 00000000000..0848de69981
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/sentmessagemap.cpp
@@ -0,0 +1,96 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/sentmessagemap.h>
+
+#include <vespa/log/log.h>
+#include <vespa/storage/distributor/operations/operation.h>
+
+LOG_SETUP(".distributor.callback.map");
+
+namespace storage {
+
+namespace distributor {
+
+SentMessageMap::SentMessageMap()
+ : _map()
+{
+}
+
+SentMessageMap::~SentMessageMap()
+{
+}
+
+
+std::shared_ptr<Operation>
+SentMessageMap::pop()
+{
+ std::map<api::StorageMessage::Id, std::shared_ptr<Operation> >::iterator found = _map.begin();
+
+ if (found != _map.end()) {
+ std::shared_ptr<Operation> retVal = found->second;
+ _map.erase(found);
+ return retVal;
+ } else {
+ return std::shared_ptr<Operation>();
+ }
+}
+
+std::shared_ptr<Operation>
+SentMessageMap::pop(api::StorageMessage::Id id)
+{
+ std::map<api::StorageMessage::Id, std::shared_ptr<Operation> >::iterator found = _map.find(id);
+
+ if (found != _map.end()) {
+ LOG(spam, "Found Id %" PRIu64 " in callback map: %p", id,
+ found->second.get());
+
+ std::shared_ptr<Operation> retVal = found->second;
+ _map.erase(found);
+ return retVal;
+ } else {
+ LOG(spam, "Did not find Id %" PRIu64 " in callback map", id);
+
+ return std::shared_ptr<Operation>();
+ }
+}
+
+void
+SentMessageMap::insert(api::StorageMessage::Id id, const std::shared_ptr<Operation> & callback)
+{
+ LOG(spam, "Inserting callback %p for message %" PRIu64 "",
+ callback.get(), id);
+
+ _map[id] = callback;
+}
+
+std::string
+SentMessageMap::toString() const
+{
+ std::ostringstream ost;
+ std::set<std::string> messages;
+
+ for (Map::const_iterator iter = _map.begin();
+ iter != _map.end();
+ ++iter)
+ {
+ messages.insert(iter->second.get()->toString());
+ }
+ for (std::set<std::string>::const_iterator
+ it(messages.begin()), e(messages.end());
+ it != e; ++it)
+ {
+ ost << *it << "\n";
+ }
+
+ return ost.str();
+}
+
+void
+SentMessageMap::clear()
+{
+ _map.clear();
+}
+
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/sentmessagemap.h b/storage/src/vespa/storage/distributor/sentmessagemap.h
new file mode 100644
index 00000000000..02765a0ad43
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/sentmessagemap.h
@@ -0,0 +1,45 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/vespalib/util/sync.h>
+#include <map>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+
+namespace storage
+{
+
+namespace distributor {
+
+class Operation;
+
+class SentMessageMap
+{
+public:
+ SentMessageMap();
+
+ ~SentMessageMap();
+
+ std::shared_ptr<Operation> pop(api::StorageMessage::Id id);
+
+ std::shared_ptr<Operation> pop();
+
+ void insert(api::StorageMessage::Id id, const std::shared_ptr<Operation> & msg);
+
+ void clear();
+
+ uint32_t size() const { return _map.size(); }
+
+ uint32_t empty() const { return _map.empty(); }
+
+ std::string toString() const;
+
+private:
+ typedef std::map<api::StorageMessage::Id, std::shared_ptr<Operation> > Map;
+
+ Map _map;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/simpleclusterinformation.h b/storage/src/vespa/storage/distributor/simpleclusterinformation.h
new file mode 100644
index 00000000000..8d68417ea2d
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/simpleclusterinformation.h
@@ -0,0 +1,49 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/clusterinformation.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/vdslib/state/clusterstate.h>
+
+namespace storage {
+namespace distributor {
+
+class SimpleClusterInformation : public ClusterInformation
+{
+public:
+ SimpleClusterInformation(uint16_t myIndex,
+ const lib::Distribution& distribution,
+ const lib::ClusterState& clusterState,
+ const char* storageUpStates)
+ : _myIndex(myIndex),
+ _distribution(distribution.serialize()),
+ _clusterState(clusterState),
+ _storageUpStates(storageUpStates)
+ {}
+
+ uint16_t getDistributorIndex() const {
+ return _myIndex;
+ }
+
+ const lib::Distribution& getDistribution() const {
+ return _distribution;
+ }
+
+ const lib::ClusterState& getClusterState() const {
+ return _clusterState;
+ }
+
+ const char* getStorageUpStates() const {
+ return _storageUpStates;
+ }
+
+private:
+ uint16_t _myIndex;
+ lib::Distribution _distribution;
+ lib::ClusterState _clusterState;
+ const char* _storageUpStates;
+};
+
+}
+}
+
diff --git a/storage/src/vespa/storage/distributor/statechecker.cpp b/storage/src/vespa/storage/distributor/statechecker.cpp
new file mode 100644
index 00000000000..b9f8058cf13
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/statechecker.cpp
@@ -0,0 +1,112 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/statechecker.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/distributor/distributorcomponent.h>
+
+#include <algorithm>
+
+LOG_SETUP(".distributor.statechecker");
+
+namespace storage {
+
+namespace distributor {
+
+namespace {
+
+class StoredResultImpl
+ : public StateChecker::ResultImpl,
+ boost::noncopyable
+{
+ mutable IdealStateOperation::UP _operation;
+ MaintenancePriority _priority;
+public:
+ StoredResultImpl()
+ : _operation(),
+ _priority(MaintenancePriority::NO_MAINTENANCE_NEEDED)
+ {}
+
+ StoredResultImpl(IdealStateOperation::UP operation,
+ MaintenancePriority priority)
+ : _operation(std::move(operation)),
+ _priority(priority)
+ {}
+
+ IdealStateOperation::UP createOperation() {
+ return std::move(_operation);
+ }
+
+ MaintenancePriority getPriority() const {
+ return _priority;
+ }
+
+ MaintenanceOperation::Type getType() const {
+ assert(_operation.get());
+ return _operation->getType();
+ }
+};
+
+}
+
+StateChecker::Result
+StateChecker::Result::noMaintenanceNeeded()
+{
+ return Result(vespalib::LinkedPtr<ResultImpl>());
+}
+
+StateChecker::Result
+StateChecker::Result::createStoredResult(
+ IdealStateOperation::UP operation,
+ MaintenancePriority::Priority priority)
+{
+ return Result(vespalib::LinkedPtr<ResultImpl>(new StoredResultImpl(std::move(operation), MaintenancePriority(priority))));
+}
+
+StateChecker::Context::Context(const DistributorComponent& c,
+ NodeMaintenanceStatsTracker& statsTracker,
+ const document::BucketId& bid)
+ : bucketId(bid),
+ siblingBucket(c.getSibling(bid)),
+ systemState(c.getClusterState()),
+ distributorConfig(c.getDistributor().getConfig()),
+ distribution(c.getDistribution()),
+ gcTimeCalculator(c.getDistributor().getBucketIdHasher(),
+ std::chrono::seconds(distributorConfig
+ .getGarbageCollectionInterval())),
+ component(c),
+ db(c.getBucketDatabase()),
+ stats(statsTracker)
+{
+ idealState =
+ distribution.getIdealStorageNodes(systemState, bucketId);
+ unorderedIdealState.insert(idealState.begin(), idealState.end());
+}
+
+std::string
+StateChecker::Context::toString() const
+{
+ std::ostringstream ss;
+ ss << "entries: {";
+
+ for (uint32_t i = 0; i < entries.size(); ++i) {
+ if (i != 0) {
+ ss << ", ";
+ }
+ ss << entries[i].getBucketId() << ": [" << entries[i]->toString() << "]";
+ }
+
+ ss << "}, state: " << systemState;
+ return ss.str();
+}
+
+StateChecker::StateChecker()
+{
+}
+
+StateChecker::~StateChecker()
+{
+}
+
+}
+
+}
diff --git a/storage/src/vespa/storage/distributor/statechecker.h b/storage/src/vespa/storage/distributor/statechecker.h
new file mode 100644
index 00000000000..2ae43a4338c
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/statechecker.h
@@ -0,0 +1,165 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <boost/noncopyable.hpp>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/common/storagecomponent.h>
+#include <vespa/storage/distributor/operations/idealstate/idealstateoperation.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/storage/distributor/bucketgctimecalculator.h>
+#include <vespa/storage/distributor/maintenancebucket.h>
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/vespalib/util/linkedptr.h>
+
+#include <unordered_set>
+#include <map>
+#include <set>
+
+namespace storage {
+
+namespace distributor {
+
+class DistributorComponent;
+class DistributorConfiguration;
+class NodeMaintenanceStatsTracker;
+
+/**
+ * This class is used by IdealStateManager to generate ideal state operations.
+ * Every time IdealStateManager wants to verify that a bucket is in its ideal
+ * state, it calls a list of StateCheckers' generateOperations() methods.
+ * This generates a list of operations to run.
+ *
+ * Each statechecker also keeps a queue of operations that have been previously
+ * generated. IdealStateManager adds to this queue, and also calls
+ * startOperations() to fetch operations to perform.
+ *
+ * The statechecker can also be used to generate metrics on what needs to be
+ * done to reach the ideal state - using the generateMetrics() method.
+ */
+class StateChecker {
+public:
+ typedef std::shared_ptr<StateChecker> SP;
+
+ /**
+ * Context object used when generating operations and metrics for a
+ * bucket.
+ */
+ struct Context
+ {
+ Context(const DistributorComponent&,
+ NodeMaintenanceStatsTracker&,
+ const document::BucketId& bid);
+
+ // Per bucket
+ document::BucketId bucketId;
+ document::BucketId siblingBucket;
+
+ BucketDatabase::Entry entry;
+ BucketDatabase::Entry siblingEntry;
+ std::vector<BucketDatabase::Entry> entries;
+
+ // Common
+ const lib::ClusterState& systemState;
+ const DistributorConfiguration& distributorConfig;
+ const lib::Distribution& distribution;
+
+ BucketGcTimeCalculator gcTimeCalculator;
+
+ // Separate ideal state into ordered sequence and unordered set, as we
+ // need to both know the actual order (activation prioritization etc) as
+ // well as have the ability to quickly check if a node is in an ideal
+ // location.
+ std::vector<uint16_t> idealState;
+ std::unordered_set<uint16_t> unorderedIdealState;
+
+ const DistributorComponent& component;
+ const BucketDatabase& db;
+ NodeMaintenanceStatsTracker& stats;
+
+ const BucketDatabase::Entry& getSiblingEntry() const {
+ return siblingEntry;
+ }
+
+ std::string toString() const;
+ };
+
+ class ResultImpl
+ {
+ public:
+ virtual ~ResultImpl() {}
+
+ virtual IdealStateOperation::UP createOperation() = 0;
+
+ virtual MaintenancePriority getPriority() const = 0;
+
+ virtual MaintenanceOperation::Type getType() const = 0;
+ };
+
+ class Result
+ {
+ vespalib::LinkedPtr<ResultImpl> _impl;
+ public:
+ IdealStateOperation::UP createOperation() {
+ return (_impl.get()
+ ? _impl->createOperation()
+ : IdealStateOperation::UP());
+ }
+
+ MaintenancePriority getPriority() const {
+ return (_impl.get()
+ ? _impl->getPriority()
+ : MaintenancePriority());
+ }
+
+ MaintenanceOperation::Type getType() const {
+ return (_impl.get()
+ ? _impl->getType()
+ : MaintenanceOperation::OPERATION_COUNT);
+
+ }
+
+ static Result noMaintenanceNeeded();
+ static Result createStoredResult(
+ IdealStateOperation::UP operation,
+ MaintenancePriority::Priority priority);
+ private:
+ Result(const vespalib::LinkedPtr<ResultImpl> impl)
+ : _impl(impl)
+ {}
+ };
+
+ /**
+ * Constructor.
+ */
+ StateChecker();
+
+ virtual ~StateChecker();
+
+ /**
+ * Calculates if operations need to be scheduled to rectify any issues
+ * this state checker is checking for.
+ *
+ * @return Returns an operation to perform for the given bucket.
+ */
+ virtual Result check(Context& c) = 0;
+
+ /**
+ * Used by status pages to generate human-readable information
+ * about the ideal state.
+
+ * @return Returns a string containing information about the
+ * problems this state checker is intended to solve.
+ */
+ virtual std::string getStatusText() const = 0;
+
+ /**
+ * Returns the name of this state checker.
+ */
+ virtual const char* getName() const = 0;
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/statecheckers.cpp b/storage/src/vespa/storage/distributor/statecheckers.cpp
new file mode 100644
index 00000000000..e93806e2181
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/statecheckers.cpp
@@ -0,0 +1,1147 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/statecheckers.h>
+
+#include <vespa/log/log.h>
+#include <vespa/storage/distributor/activecopy.h>
+#include <vespa/storage/distributor/bucketdb/bucketdatabase.h>
+#include <vespa/storage/distributor/operations/idealstate/splitoperation.h>
+#include <vespa/storage/distributor/operations/idealstate/joinoperation.h>
+#include <vespa/storage/distributor/operations/idealstate/removebucketoperation.h>
+#include <vespa/storage/distributor/operations/idealstate/setbucketstateoperation.h>
+#include <vespa/storage/distributor/operations/idealstate/mergeoperation.h>
+#include <vespa/storage/distributor/operations/idealstate/garbagecollectionoperation.h>
+#include <vespa/storage/common/bucketoperationlogger.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+
+LOG_SETUP(".distributor.operation.checkers");
+
+namespace storage {
+namespace distributor {
+
+bool
+SplitBucketStateChecker::validForSplit(StateChecker::Context& c)
+{
+ // Can't split if we have no nodes.
+ if (c.entry->getNodeCount() == 0) {
+ LOG(spam,
+ "Can't split bucket %s, since it has no copies",
+ c.bucketId.toString().c_str());
+ return false;
+ }
+
+ // Can't split anymore if we already used 58 bits.
+ if (c.bucketId.getUsedBits() >= 58) {
+ return false;
+ }
+
+ return true;
+}
+
+double
+SplitBucketStateChecker::getBucketSizeRelativeToMax(StateChecker::Context& c)
+{
+ const BucketInfo& info(c.entry.getBucketInfo());
+ const uint32_t highestDocumentCount(info.getHighestDocumentCount());
+ const uint32_t highestTotalDocumentSize(info.getHighestTotalDocumentSize());
+ const uint32_t highestMetaCount(info.getHighestMetaCount());
+ const uint32_t highestUsedFileSize(info.getHighestUsedFileSize());
+
+ if (highestDocumentCount < 2) {
+ return 0;
+ }
+
+ double byteSplitRatio = 0;
+ if (c.distributorConfig.getSplitSize() > 0) {
+ byteSplitRatio = static_cast<double>(highestTotalDocumentSize)
+ / c.distributorConfig.getSplitSize();
+ }
+
+ double docSplitRatio = 0;
+ if (c.distributorConfig.getSplitCount() > 0) {
+ docSplitRatio = static_cast<double>(highestDocumentCount)
+ / c.distributorConfig.getSplitCount();
+ }
+
+ double fileSizeRatio = 0;
+ if (c.distributorConfig.getSplitSize() > 0) {
+ fileSizeRatio = static_cast<double>(highestUsedFileSize)
+ / (2 * c.distributorConfig.getSplitSize());
+ }
+
+ double metaSplitRatio = 0;
+ if (c.distributorConfig.getSplitCount() > 0) {
+ metaSplitRatio = static_cast<double>(highestMetaCount)
+ / (2 * c.distributorConfig.getSplitCount());
+ }
+
+ return std::max(std::max(byteSplitRatio, docSplitRatio),
+ std::max(fileSizeRatio, metaSplitRatio));
+}
+
+StateChecker::Result
+SplitBucketStateChecker::generateMinimumBucketSplitOperation(
+ StateChecker::Context& c)
+{
+ IdealStateOperation::UP so(new SplitOperation(
+ c.component.getClusterName(),
+ BucketAndNodes(c.bucketId, c.entry->getNodes()),
+ c.distributorConfig.getMinimalBucketSplit(),
+ 0,
+ 0));
+
+ so->setPriority(c.distributorConfig.getMaintenancePriorities()
+ .splitDistributionBits);
+ so->setDetailedReason(
+ "[Splitting bucket because the current system size requires "
+ "a higher minimum split bit]");
+ return Result::createStoredResult(std::move(so), MaintenancePriority::MEDIUM);
+}
+
+StateChecker::Result
+SplitBucketStateChecker::generateMaxSizeExceededSplitOperation(
+ StateChecker::Context& c)
+{
+ IdealStateOperation::UP so(new SplitOperation(
+ c.component.getClusterName(),
+ BucketAndNodes(c.bucketId, c.entry->getNodes()),
+ 58,
+ c.distributorConfig.getSplitCount(),
+ c.distributorConfig.getSplitSize()));
+
+ so->setPriority(c.distributorConfig.getMaintenancePriorities()
+ .splitLargeBucket);
+
+ const BucketInfo& info(c.entry.getBucketInfo());
+ vespalib::asciistream ost;
+ ost << "[Splitting bucket because its maximum size ("
+ << info.getHighestTotalDocumentSize()
+ << " b, "
+ << info.getHighestDocumentCount()
+ << " docs, "
+ << info.getHighestMetaCount()
+ << " meta, "
+ << info.getHighestUsedFileSize()
+ << " b total"
+ << ") is higher than the configured limit of ("
+ << c.distributorConfig.getSplitSize()
+ << ", " << c.distributorConfig.getSplitCount() << ")]";
+
+ so->setDetailedReason(ost.str());
+ return Result::createStoredResult(std::move(so), MaintenancePriority::HIGH);
+
+}
+
+StateChecker::Result
+SplitBucketStateChecker::check(StateChecker::Context& c) {
+ if (!validForSplit(c)) {
+ return StateChecker::Result::noMaintenanceNeeded();
+ }
+
+ double splitRatio(getBucketSizeRelativeToMax(c));
+ if (splitRatio > 1.0) {
+ return generateMaxSizeExceededSplitOperation(c);
+ }
+
+ // Always split it if it has less used bits than the minimum.
+ if (c.bucketId.getUsedBits() < c.distributorConfig.getMinimalBucketSplit()) {
+ return generateMinimumBucketSplitOperation(c);
+ }
+ return Result::noMaintenanceNeeded();
+}
+
+bool
+JoinBucketsStateChecker::isFirstSibling(const document::BucketId& bucketId) const
+{
+ return (bucketId.getId() & (1ULL << (bucketId.getUsedBits() - 1))) == 0;
+}
+
+namespace {
+
+bool
+equalNodeSet(const std::vector<uint16_t>& idealState,
+ const BucketDatabase::Entry& dbEntry)
+{
+ if (idealState.size() != dbEntry->getNodeCount()) {
+ return false;
+ }
+ // Note: no assumptions are made on the ordering of the elements in
+ // either vector.
+ for (uint16_t node : idealState) {
+ if (!dbEntry->getNode(node)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool
+bucketAndSiblingReplicaLocationsEqualIdealState(
+ const StateChecker::Context& context)
+{
+ if (!equalNodeSet(context.idealState, context.entry)) {
+ return false;
+ }
+ std::vector<uint16_t> siblingIdealState(
+ context.distribution.getIdealStorageNodes(
+ context.systemState, context.siblingBucket));
+ if (!equalNodeSet(siblingIdealState, context.siblingEntry)) {
+ return false;
+ }
+ return true;
+}
+
+bool
+inconsistentJoinIsEnabled(const StateChecker::Context& context)
+{
+ return context.distributorConfig.getEnableInconsistentJoin();
+}
+
+bool
+inconsistentJoinIsAllowed(const StateChecker::Context& context)
+{
+ return (inconsistentJoinIsEnabled(context)
+ && bucketAndSiblingReplicaLocationsEqualIdealState(context));
+}
+
+} // anon ns
+
+bool
+JoinBucketsStateChecker::siblingsAreInSync(const Context& context) const
+{
+ const auto& entry(context.entry);
+ const auto& siblingEntry(context.siblingEntry);
+
+ if (entry->getNodeCount() != siblingEntry->getNodeCount()) {
+ LOG(spam,
+ "Not joining bucket %s because sibling bucket %s had different "
+ "node count",
+ context.bucketId.toString().c_str(),
+ context.siblingBucket.toString().c_str());
+ return false;
+ }
+
+ bool siblingsCoLocated = true;
+ for (uint32_t i = 0; i < entry->getNodeCount(); ++i) {
+ if (entry->getNodeRef(i).getNode()
+ != siblingEntry->getNodeRef(i).getNode())
+ {
+ siblingsCoLocated = false;
+ break;
+ }
+ }
+
+ if (!siblingsCoLocated && !inconsistentJoinIsAllowed(context)) {
+ LOG(spam,
+ "Not joining bucket %s because sibling bucket %s "
+ "does not have the same node set, or inconsistent joins cannot be "
+ "performed either due to config or because replicas were not in "
+ "their ideal location",
+ context.bucketId.toString().c_str(),
+ context.siblingBucket.toString().c_str());
+ return false;
+ }
+
+ if (!entry->validAndConsistent() || !siblingEntry->validAndConsistent()) {
+ LOG(spam,
+ "Not joining bucket %s because it or %s is out of sync "
+ "and syncing it may cause it to become too large",
+ context.bucketId.toString().c_str(),
+ context.siblingBucket.toString().c_str());
+ return false;
+ }
+
+ return true;
+}
+
+bool
+JoinBucketsStateChecker::singleBucketJoinIsConsistent(const Context& c) const
+{
+ document::BucketId joinTarget(c.bucketId.getUsedBits() - 1,
+ c.bucketId.getRawId());
+ // If there are 2 children under the potential join target bucket, joining
+ // would cause the bucket tree to become inconsistent. The reason for this
+ // being that "moving" a bucket one bit up in the tree (and into
+ // joinedBucket) would create a new parent bucket for the bucket(s)
+ // already present in the other child tree, thus causing it to become
+ // inconsistent. After all, we desire a bucket tree with only leaves
+ // being actually present.
+ return (c.db.childCount(joinTarget) == 1);
+}
+
+bool
+JoinBucketsStateChecker::singleBucketJoinIsEnabled(const Context& c) const
+{
+ return c.distributorConfig.getEnableJoinForSiblingLessBuckets();
+}
+
+namespace {
+
+// We don't want to invoke joins on buckets that have more replicas than
+// required. This is in particular because joins cause ideal states to change
+// for the target buckets and trigger merges. Since the removal of the non-
+// ideal replicas is done by the DeleteBuckets state-checker, it will become
+// preempted by potential follow-up joins unless we explicitly avoid these.
+bool
+contextBucketHasTooManyReplicas(const StateChecker::Context& c)
+{
+ return (c.entry->getNodeCount() > c.distribution.getRedundancy());
+}
+
+bool
+bucketAtDistributionBitLimit(const document::BucketId& bucket,
+ const StateChecker::Context& c)
+{
+ return (bucket.getUsedBits() <= std::max(
+ uint32_t(c.systemState.getDistributionBitCount()),
+ c.distributorConfig.getMinimalBucketSplit()));
+}
+
+}
+
+bool
+JoinBucketsStateChecker::shouldJoin(const Context& c) const
+{
+ if (c.entry->getNodeCount() == 0) {
+ LOG(spam, "Not joining bucket %s because it has no nodes",
+ c.bucketId.toString().c_str());
+ return false;
+ }
+
+ if (contextBucketHasTooManyReplicas(c)) {
+ LOG(spam, "Not joining %s because it has too high replication level",
+ c.bucketId.toString().c_str());
+ return false;
+ }
+
+ if (c.distributorConfig.getJoinSize() == 0 && c.distributorConfig.getJoinCount() == 0) {
+ LOG(spam, "Not joining bucket %s because join is disabled",
+ c.bucketId.toString().c_str());
+ return false;
+ }
+
+ if (bucketAtDistributionBitLimit(c.bucketId, c)) {
+ LOG(spam,
+ "Not joining bucket %s because it is below the min split "
+ "count (config: %u, cluster state: %u, bucket has: %u)",
+ c.bucketId.toString().c_str(),
+ c.distributorConfig.getMinimalBucketSplit(),
+ c.systemState.getDistributionBitCount(),
+ c.bucketId.getUsedBits());
+ return false;
+ }
+
+ if (c.entry->hasRecentlyCreatedEmptyCopy()) {
+ return false;
+ }
+
+ if (c.getSiblingEntry().valid()) {
+ if (!isFirstSibling(c.bucketId)) {
+ LOG(spam,
+ "Not joining bucket %s because it is the second sibling of "
+ "%s and not the first",
+ c.bucketId.toString().c_str(),
+ c.siblingBucket.toString().c_str());
+ return false;
+ }
+ if (!siblingsAreInSync(c)) {
+ return false;
+ }
+ return smallEnoughToJoin(c);
+ }
+
+ if (!singleBucketJoinIsEnabled(c)) {
+ return false;
+ }
+
+ if (!smallEnoughToJoin(c)) {
+ return false;
+ }
+
+ // No sibling and bucket has more bits than the minimum number of split
+ // bits. If joining the bucket with itself into a bucket with 1 less
+ // bit does _not_ introduce any inconsistencies in the bucket tree, do
+ // so in order to gradually compact away sparse buckets.
+ return singleBucketJoinIsConsistent(c);
+}
+
+/**
+ * Compute sum(for each sibling(max(for each replica(used file size)))).
+ * If sibling does not exist, treats its highest used file size as 0.
+ */
+uint64_t
+JoinBucketsStateChecker::getTotalUsedFileSize(const Context& c) const
+{
+ return (c.entry.getBucketInfo().getHighestUsedFileSize()
+ + c.getSiblingEntry().getBucketInfo().getHighestUsedFileSize());
+}
+
+/**
+ * Compute sum(for each sibling(max(for each replica(meta count)))).
+ * If sibling does not exist, treats its highest meta count as 0.
+ */
+uint64_t
+JoinBucketsStateChecker::getTotalMetaCount(const Context& c) const
+{
+ return (c.entry.getBucketInfo().getHighestMetaCount()
+ + c.getSiblingEntry().getBucketInfo().getHighestMetaCount());
+}
+
+bool
+JoinBucketsStateChecker::smallEnoughToJoin(const Context& c) const
+{
+ if (c.distributorConfig.getJoinSize() != 0) {
+ if (getTotalUsedFileSize(c) >= c.distributorConfig.getJoinSize()) {
+ return false;
+ }
+ }
+ if (c.distributorConfig.getJoinCount() != 0) {
+ if (getTotalMetaCount(c) >= c.distributorConfig.getJoinCount()) {
+ return false;
+ }
+ }
+ return true;
+}
+
+namespace {
+
+bool
+legalBucketSplitLevel(const document::BucketId& bucket,
+ const StateChecker::Context& c)
+{
+ return bucket.getUsedBits() >= c.distributorConfig.getMinimalBucketSplit();
+}
+
+bool
+bucketHasMultipleChildren(const document::BucketId& bucket,
+ const StateChecker::Context& c)
+{
+ return c.db.childCount(bucket) > 1;
+}
+
+}
+
+document::BucketId
+JoinBucketsStateChecker::computeJoinBucket(const Context& c) const
+{
+ // Always decrease by at least 1 bit, as we could not get here unless this
+ // were a valid outcome.
+ unsigned int level = c.bucketId.getUsedBits() - 1;
+ document::BucketId target(level, c.bucketId.getRawId());
+
+ // Push bucket up the tree as long as it gets no siblings. This means
+ // joins involving 2 source buckets will currently only be decreased by 1
+ // bit (mirroring the legacy behavior), but sparse (single) buckets may
+ // be decreased by multiple bits. We may want to optimize joins for cases
+ // with 2 source buckets in the future.
+ while (true) {
+ document::BucketId candidate(level, c.bucketId.getRawId());
+ if (bucketHasMultipleChildren(candidate, c)
+ || !legalBucketSplitLevel(candidate, c))
+ {
+ break;
+ }
+ --level;
+ target = candidate;
+ }
+ return target;
+}
+
+StateChecker::Result
+JoinBucketsStateChecker::check(StateChecker::Context& c)
+{
+ // At this point in time, bucket is consistently split as the state checker
+ // would otherwise be pre-empted by the inconsistent state checker.
+ if (!shouldJoin(c)) {
+ return Result::noMaintenanceNeeded();
+ }
+
+ document::BucketId joinedBucket(computeJoinBucket(c));
+ assert(joinedBucket.getUsedBits() < c.bucketId.getUsedBits());
+
+ std::vector<document::BucketId> sourceBuckets;
+ if (c.getSiblingEntry().valid()) {
+ sourceBuckets.push_back(c.siblingBucket);
+ } else {
+ sourceBuckets.push_back(c.bucketId);
+ }
+ sourceBuckets.push_back(c.bucketId);
+ IdealStateOperation::UP op(new JoinOperation(
+ c.component.getClusterName(),
+ BucketAndNodes(joinedBucket, c.entry->getNodes()),
+ sourceBuckets));
+ op->setPriority(c.distributorConfig.getMaintenancePriorities()
+ .joinBuckets);
+ vespalib::asciistream ost;
+ ost << "[Joining buckets "
+ << sourceBuckets[1].toString()
+ << " and " << sourceBuckets[0].toString()
+ << " because their size ("
+ << getTotalUsedFileSize(c)
+ << " bytes, "
+ << getTotalMetaCount(c)
+ << " docs) is less than the configured limit of ("
+ << c.distributorConfig.getJoinSize()
+ << ", "
+ << c.distributorConfig.getJoinCount()
+ << ")";
+
+ op->setDetailedReason(ost.str());
+
+ return Result::createStoredResult(std::move(op), MaintenancePriority::VERY_LOW);
+}
+
+bool
+SplitInconsistentStateChecker::isLeastSplitBucket(
+ const document::BucketId& bucket,
+ const std::vector<BucketDatabase::Entry>& entries) const
+{
+ // Figure out if any other buckets are less split than the current one.
+ for (uint32_t i = 0; i < entries.size(); ++i) {
+ const BucketDatabase::Entry& e = entries[i];
+
+ assert(e.valid());
+
+ if (e.getBucketId().getUsedBits() < bucket.getUsedBits()) {
+ return false;
+ }
+ }
+
+ return true;
+}
+
+uint32_t
+SplitInconsistentStateChecker::getHighestUsedBits(
+ const std::vector<BucketDatabase::Entry>& entries) const
+{
+ uint32_t highestUsedBits = 0;
+ for (uint32_t i = 0; i < entries.size(); ++i) {
+ highestUsedBits = std::max(entries[i].getBucketId().getUsedBits(),
+ highestUsedBits);
+ }
+ return highestUsedBits;
+}
+
+vespalib::string
+SplitInconsistentStateChecker::getReason(
+ const document::BucketId& bucketId,
+ const std::vector<BucketDatabase::Entry>& entries) const
+{
+ vespalib::asciistream reason;
+ reason << "[Bucket is inconsistently split (list includes "
+ << vespalib::hex << "0x" << bucketId.getId();
+
+ for (uint32_t i = 0, found = 0; i < entries.size() && found < 3; i++) {
+ if (!(entries[i].getBucketId() == bucketId)) {
+ reason << ", 0x" << vespalib::hex << entries[i].getBucketId().getId();
+ ++found;
+ }
+ }
+
+ if (entries.size() > 4) {
+ reason << " and " << vespalib::dec << entries.size() - 4 << " others";
+ }
+
+ reason << ") Splitting it to improve the problem (max used bits "
+ << vespalib::dec
+ << getHighestUsedBits(entries)
+ << ")]";
+
+ return reason.str();
+}
+
+namespace {
+
+bool
+isInconsistentlySplit(const StateChecker::Context& c)
+{
+ return (c.entries.size() > 1);
+}
+
+}
+
+StateChecker::Result
+SplitInconsistentStateChecker::check(StateChecker::Context& c)
+{
+ if (!isInconsistentlySplit(c)) {
+ return Result::noMaintenanceNeeded();
+ }
+
+ if (!isLeastSplitBucket(c.bucketId, c.entries)) {
+ return Result::noMaintenanceNeeded();
+ }
+
+ IdealStateOperation::UP op(new SplitOperation(
+ c.component.getClusterName(),
+ BucketAndNodes(c.bucketId, c.entry->getNodes()),
+ getHighestUsedBits(c.entries),
+ 0,
+ 0));
+
+ op->setPriority(c.distributorConfig.getMaintenancePriorities()
+ .splitInconsistentBucket);
+ op->setDetailedReason(getReason(c.bucketId, c.entries));
+ return Result::createStoredResult(std::move(op), MaintenancePriority::HIGH);
+}
+
+namespace {
+bool containsMaintenanceNode(const std::vector<uint16_t>& ideal,
+ const StateChecker::Context& c)
+{
+ for (uint32_t i = 0; i < ideal.size(); i++) {
+ if (c.systemState.getNodeState(lib::Node(lib::NodeType::STORAGE,
+ ideal[i])).getState()
+ == lib::State::MAINTENANCE)
+ {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool
+consistentApartFromEmptyBucketsInNonIdealLocationAndInvalidEntries(
+ const std::vector<uint16_t>& idealNodes,
+ const BucketInfo& entry)
+{
+ api::BucketInfo info;
+ for (uint32_t i=0, n=entry.getNodeCount(); i<n; ++i) {
+ const BucketCopy& copy(entry.getNodeRef(i));
+ bool onIdealNode = false;
+ for (uint32_t j = 0; j < idealNodes.size(); ++j) {
+ if (copy.getNode() == idealNodes[j]) {
+ onIdealNode = true;
+ break;
+ }
+ }
+ // Ignore empty buckets on non-ideal nodes
+ if (!onIdealNode && copy.empty()) {
+ continue;
+ }
+ // Ignore invalid entries.
+ if (!copy.valid()) {
+ continue;
+ }
+ if (info.valid()) {
+ if (info.getChecksum() != copy.getChecksum()) {
+ return false;
+ }
+ } else {
+ info = copy.getBucketInfo();
+ }
+ }
+ return true;
+}
+
+class MergeNodes
+{
+public:
+ MergeNodes()
+ : _reason(), _nodes(), _problemFlags(0), _priority(255)
+ {}
+
+ MergeNodes(const BucketDatabase::Entry& entry)
+ : _reason(), _nodes(), _problemFlags(0), _priority(255)
+ {
+ for (uint16_t i = 0; i < entry->getNodeCount(); i++) {
+ addNode(entry->getNodeRef(i).getNode());
+ }
+ }
+
+ void operator+=(const MergeNodes& other) {
+ _reason << other._reason.str();
+ _problemFlags |= other._problemFlags;
+ _nodes.insert(_nodes.end(), other._nodes.begin(), other._nodes.end());
+ updatePriority(other._priority);
+ }
+
+ bool shouldMerge() const {
+ return _problemFlags != 0;
+ }
+
+ void markMoveToIdealLocation(uint16_t node, uint8_t msgPriority) {
+ _reason << "[Moving bucket to ideal node " << node << "]";
+ addProblem(NON_IDEAL_LOCATION);
+ addNode(node);
+ updatePriority(msgPriority);
+ }
+
+ void markOutOfSync(const StateChecker::Context& c, uint8_t msgPriority) {
+ _reason << "[Synchronizing buckets with different checksums "
+ << c.entry->toString()
+ << "]";
+ addProblem(OUT_OF_SYNC);
+ updatePriority(msgPriority);
+ }
+
+ void markMissingReplica(uint16_t node, uint8_t msgPriority) {
+ _reason << "[Adding missing node " << node << "]";
+ addProblem(MISSING_REPLICA);
+ addNode(node);
+ updatePriority(msgPriority);
+ }
+
+ bool needsMoveOnly() const {
+ return _problemFlags == NON_IDEAL_LOCATION;
+ }
+
+ void addNode(uint16_t node) {
+ _nodes.push_back(node);
+ }
+
+ const std::vector<uint16_t>& nodes() const noexcept { return _nodes; }
+ uint8_t priority() const noexcept { return _priority; }
+ std::string reason() const { return _reason.str(); }
+
+private:
+ void updatePriority(uint8_t pri) {
+ _priority = std::min(pri, _priority);
+ }
+
+ void addProblem(uint8_t newProblem) {
+ _problemFlags |= newProblem;
+ }
+
+ enum Problem {
+ OUT_OF_SYNC = 1,
+ MISSING_REPLICA = 2,
+ NON_IDEAL_LOCATION = 4
+ };
+ vespalib::asciistream _reason;
+ std::vector<uint16_t> _nodes;
+ uint8_t _problemFlags;
+ uint8_t _priority;
+};
+
+bool
+presentInIdealState(const StateChecker::Context& c, uint16_t node)
+{
+ return c.unorderedIdealState.find(node) != c.unorderedIdealState.end();
+}
+
+void
+addStatisticsForNonIdealNodes(const StateChecker::Context& c,
+ bool missingReplica)
+{
+ // Common case is that ideal state == actual state with no missing replicas.
+ // If so, do nothing.
+ if (!missingReplica && (c.idealState.size() == c.entry->getNodeCount())) {
+ return;
+ }
+ for (uint32_t j = 0; j < c.entry->getNodeCount(); ++j) {
+ const uint16_t node(c.entry->getNodeRef(j).getNode());
+ if (!presentInIdealState(c, node)) {
+ c.stats.incMovingOut(node);
+ } else if (missingReplica) {
+ // Copy is in ideal location and we're missing a replica. Thus
+ // we treat all ideal copies as sources to copy from.
+ c.stats.incCopyingOut(node);
+ }
+ }
+}
+
+MergeNodes
+checkForNodesMissingFromIdealState(StateChecker::Context& c)
+{
+ MergeNodes ret;
+
+ // Check if we need to add copies to get to ideal state.
+ if (!c.entry->emptyAndConsistent()) {
+ bool hasMissingReplica = false;
+ for (uint32_t i = 0; i < c.idealState.size(); i++) {
+ bool found = false;
+ for (uint32_t j = 0; j < c.entry->getNodeCount(); j++) {
+ if (c.entry->getNodeRef(j).getNode() == c.idealState[i]) {
+ found = true;
+ break;
+ }
+ }
+
+ if (!found) {
+ const DistributorConfiguration::MaintenancePriorities& mp(
+ c.distributorConfig.getMaintenancePriorities());
+ if (c.idealState.size() > c.entry->getNodeCount()) {
+ ret.markMissingReplica(c.idealState[i],
+ mp.mergeTooFewCopies);
+ } else {
+ ret.markMoveToIdealLocation(c.idealState[i],
+ mp.mergeMoveToIdealNode);
+ }
+ c.stats.incCopyingIn(c.idealState[i]);
+ hasMissingReplica = true;
+ }
+ }
+ addStatisticsForNonIdealNodes(c, hasMissingReplica);
+ }
+ return ret;
+}
+
+void
+addStatisticsForOutOfSyncCopies(StateChecker::Context& c)
+{
+ const uint32_t n = c.entry->getNodeCount();
+ for (uint32_t i = 0; i < n; ++i) {
+ const BucketCopy& cp(c.entry->getNodeRef(i));
+ c.stats.incSyncing(cp.getNode());
+ }
+}
+
+MergeNodes
+checkIfBucketsAreOutOfSyncAndNeedMerging(StateChecker::Context& c)
+{
+ MergeNodes ret;
+ if (!consistentApartFromEmptyBucketsInNonIdealLocationAndInvalidEntries(
+ c.idealState,
+ c.entry.getBucketInfo()))
+ {
+ auto pri(c.distributorConfig.getMaintenancePriorities()
+ .mergeOutOfSyncCopies);
+ ret.markOutOfSync(c, pri);
+ addStatisticsForOutOfSyncCopies(c);
+ }
+ return ret;
+}
+
+bool
+allCopiesAreInvalid(const StateChecker::Context& c)
+{
+ const uint32_t n = c.entry->getNodeCount();
+ for (uint32_t i = 0; i < n; ++i) {
+ const BucketCopy& cp(c.entry->getNodeRef(i));
+ if (cp.valid()) {
+ return false;
+ }
+ }
+ return true;
+}
+
+}
+
+StateChecker::Result
+SynchronizeAndMoveStateChecker::check(StateChecker::Context& c)
+{
+ if (isInconsistentlySplit(c)) {
+ return Result::noMaintenanceNeeded();
+ }
+ if (containsMaintenanceNode(c.idealState, c)) {
+ return Result::noMaintenanceNeeded();
+ }
+ if (allCopiesAreInvalid(c)) {
+ return Result::noMaintenanceNeeded();
+ }
+
+ assert(c.entry->getNodeCount() > 0);
+
+ MergeNodes result(c.entry);
+ result += checkForNodesMissingFromIdealState(c);
+ result += checkIfBucketsAreOutOfSyncAndNeedMerging(c);
+
+ if (result.shouldMerge()) {
+ IdealStateOperation::UP op(
+ new MergeOperation(BucketAndNodes(c.bucketId, result.nodes()),
+ c.distributorConfig.getMaxNodesPerMerge()));
+ op->setPriority(result.priority());
+ op->setDetailedReason(result.reason());
+ MaintenancePriority::Priority schedPri(
+ result.needsMoveOnly() ? MaintenancePriority::VERY_LOW
+ : MaintenancePriority::MEDIUM);
+
+ return Result::createStoredResult(std::move(op), schedPri);
+ } else {
+ LOG(spam, "Bucket %s: No need for merge, as bucket is in consistent state "
+ "(or inconsistent buckets are empty) %s",
+ c.bucketId.toString().c_str(),
+ c.entry->toString().c_str());
+ return Result::noMaintenanceNeeded();
+ }
+}
+
+bool
+DeleteExtraCopiesStateChecker::bucketHasNoData(const StateChecker::Context& c)
+{
+ return (c.entry->getHighestMetaCount() == 0
+ && !c.entry->hasRecentlyCreatedEmptyCopy());
+}
+
+bool
+DeleteExtraCopiesStateChecker::copyIsInIdealState(const BucketCopy& cp,
+ const StateChecker::Context& c) const
+{
+ return hasItem(c.idealState, cp.getNode());
+}
+
+bool
+DeleteExtraCopiesStateChecker::enoughCopiesKept(uint32_t keptIdealCopies,
+ uint32_t keptNonIdealCopies,
+ const StateChecker::Context& c) const
+{
+ return ((keptIdealCopies + keptNonIdealCopies) >= c.distribution.getRedundancy());
+}
+
+void
+DeleteExtraCopiesStateChecker::addToRemoveSet(
+ const BucketCopy& copyToRemove,
+ const char* reasonForRemoval,
+ std::vector<uint16_t>& removedCopies,
+ vespalib::asciistream& reasons)
+{
+ reasons << "[Removing " << reasonForRemoval
+ << " from node " << copyToRemove.getNode()
+ << ']';
+ removedCopies.push_back(copyToRemove.getNode());
+}
+
+uint32_t
+DeleteExtraCopiesStateChecker::numberOfIdealCopiesPresent(
+ const StateChecker::Context& c) const
+{
+ const uint32_t cnt = c.entry->getNodeCount();
+ uint32_t idealCopies = 0;
+ for (uint32_t i = 0; i < cnt; ++i) {
+ const BucketCopy& cp(c.entry->getNodeRef(i));
+ if (copyIsInIdealState(cp, c)) {
+ ++idealCopies;
+ }
+ }
+ return idealCopies;
+}
+
+/**
+ * Delete copies that are not in ideal state and either:
+ * - in sync with all other copies AND redundant, or
+ * - empty
+ *
+ * Assumes that no other method has removed copies before this.
+ */
+void
+DeleteExtraCopiesStateChecker::removeRedundantEmptyOrConsistentCopies(
+ StateChecker::Context& c,
+ std::vector<uint16_t>& removedCopies,
+ vespalib::asciistream& reasons)
+{
+ assert(removedCopies.empty());
+ const bool copiesAreConsistent = c.entry->validAndConsistent();
+ const uint32_t cnt = c.entry->getNodeCount();
+ // Always keep all ideal copies
+ uint32_t keptIdealCopies = numberOfIdealCopiesPresent(c);
+ uint32_t keptNonIdealCopies = 0;
+
+ for (uint32_t i = 0; i < cnt; ++i) {
+ const BucketCopy& cp(c.entry->getNodeRef(i));
+ if (copyIsInIdealState(cp, c)) {
+ continue;
+ }
+ // Caller already checked for recently created/invalid copies, so
+ // any empty copies not in ideal state are pending for a bending,
+ // no matter if bucket is consistent or not.
+ if (cp.empty()) {
+ addToRemoveSet(cp, "empty copy", removedCopies, reasons);
+ } else if (copiesAreConsistent
+ && enoughCopiesKept(keptIdealCopies, keptNonIdealCopies, c)
+ && !cp.active())
+ {
+ addToRemoveSet(cp, "redundant in-sync copy",
+ removedCopies, reasons);
+ } else {
+ ++keptNonIdealCopies;
+ }
+ }
+}
+
+StateChecker::Result
+DeleteExtraCopiesStateChecker::check(StateChecker::Context& c)
+{
+ if (c.entry->hasInvalidCopy()) {
+ // Don't delete anything here.
+ return Result::noMaintenanceNeeded();
+ }
+ // Maintain symmetry with merge; don't try to mess with nodes that have an
+ // ideal copy on a node set in maintenance mode.
+ if (containsMaintenanceNode(c.idealState, c)) {
+ return Result::noMaintenanceNeeded();
+ }
+
+ vespalib::asciistream reasons;
+ std::vector<uint16_t> removedCopies;
+
+ if (bucketHasNoData(c)) {
+ reasons << "[Removing all copies since bucket is empty:"
+ << c.entry->toString() << "]";
+
+ for (uint32_t j = 0, cnt = c.entry->getNodeCount(); j < cnt; ++j) {
+ removedCopies.push_back(c.entry->getNodeRef(j).getNode());
+ }
+ } else if (c.entry->getNodeCount() <= c.distribution.getRedundancy()) {
+ return Result::noMaintenanceNeeded();
+ } else if (c.entry->hasRecentlyCreatedEmptyCopy()) {
+ return Result::noMaintenanceNeeded();
+ } else {
+ removeRedundantEmptyOrConsistentCopies(c, removedCopies, reasons);
+ }
+
+ if (!removedCopies.empty()) {
+ IdealStateOperation::UP ro(new RemoveBucketOperation(
+ c.component.getClusterName(),
+ BucketAndNodes(c.bucketId, removedCopies)));
+
+ ro->setPriority(c.distributorConfig.getMaintenancePriorities()
+ .deleteBucketCopy);
+ ro->setDetailedReason(reasons.str());
+ return Result::createStoredResult(std::move(ro), MaintenancePriority::HIGH);
+ }
+
+ return Result::noMaintenanceNeeded();
+}
+
+bool
+BucketStateStateChecker::shouldSkipActivationDueToMaintenance(
+ const ActiveList& activeNodes,
+ const StateChecker::Context& c) const
+{
+ for (uint32_t i = 0; i < activeNodes.size(); ++i) {
+ const BucketCopy* cp(c.entry->getNode(activeNodes[i].nodeIndex));
+ if (!cp || cp->active()) {
+ continue;
+ }
+ if (!cp->ready()) {
+ // If copy is not ready, we don't want to activate it if a node
+ // is set in maintenance. Doing so would imply that we want proton
+ // to start background indexing.
+ return containsMaintenanceNode(c.idealState, c);
+ }
+ }
+ return false;
+}
+
+/**
+ * The copy we want to set active is, in prioritized order:
+ * 1. The first ideal state copy that is trusted and ready
+ * 2. The first non-ideal state copy that is ready
+ * 3. The first ideal state copy that is trusted
+ * 4. The first available copy that is trusted
+ * 5. The first ideal state copy
+ * 6. Any existing active copy (i.e. do not alter active state)
+ * 7. Any valid copy if no copies are active
+ */
+StateChecker::Result
+BucketStateStateChecker::check(StateChecker::Context& c)
+{
+ if (c.distributorConfig.isBucketActivationDisabled()) {
+ return Result::noMaintenanceNeeded();
+ }
+
+ if (isInconsistentlySplit(c)) {
+ return Result::noMaintenanceNeeded();
+ }
+
+ ActiveList activeNodes(
+ ActiveCopy::calculate(c.idealState, c.distribution, c.entry));
+ if (activeNodes.empty()) {
+ return Result::noMaintenanceNeeded();
+ }
+ if (shouldSkipActivationDueToMaintenance(activeNodes, c)) {
+ return Result::noMaintenanceNeeded();
+ }
+
+ vespalib::asciistream reason;
+ std::vector<uint16_t> operationNodes;
+ for (uint32_t i=0; i<activeNodes.size(); ++i) {
+ const BucketCopy* cp = c.entry->getNode(activeNodes[i].nodeIndex);
+ if (cp == 0 || cp->active()) {
+ continue;
+ }
+ operationNodes.push_back(activeNodes[i].nodeIndex);
+ reason << "[Setting node " << activeNodes[i].nodeIndex << " as active: "
+ << activeNodes[i].reason << "]";
+ }
+
+ // Deactivate all copies that are currently marked as active.
+ for (uint32_t i = 0; i < c.entry->getNodeCount(); ++i) {
+ const BucketCopy& cp = c.entry->getNodeRef(i);
+ if (!cp.active()) {
+ continue;
+ }
+ bool shouldBeActive = false;
+ for (uint32_t j=0; j<activeNodes.size(); ++j) {
+ if (activeNodes[j].nodeIndex == cp.getNode()) {
+ shouldBeActive = true;
+ }
+ }
+ if (!shouldBeActive) {
+ reason << "[Setting node " << cp.getNode() << " as inactive]";
+ operationNodes.push_back(cp.getNode());
+ }
+ }
+
+ if (operationNodes.size() == 0) {
+ return Result::noMaintenanceNeeded();
+ }
+
+ std::vector<uint16_t> activeNodeIndexes;
+ for (uint32_t i=0; i<activeNodes.size(); ++i) {
+ activeNodeIndexes.push_back(activeNodes[i].nodeIndex);
+ }
+ auto op = std::make_unique<SetBucketStateOperation>(
+ c.component.getClusterName(),
+ BucketAndNodes(c.bucketId, operationNodes),
+ activeNodeIndexes);
+
+ // If activeNodes > 1, we're dealing with a active-per-leaf group case and
+ // we currently always send high pri activations.
+ // Otherwise, only > 1 operationNodes if we have copies to deactivate.
+ if (activeNodes.size() > 1 || operationNodes.size() == 1) {
+ op->setPriority(c.distributorConfig.getMaintenancePriorities()
+ .activateNoExistingActive);
+ } else {
+ op->setPriority(c.distributorConfig.getMaintenancePriorities()
+ .activateWithExistingActive);
+ }
+ op->setDetailedReason(reason.str());
+ return Result::createStoredResult(std::move(op), MaintenancePriority::VERY_HIGH);
+}
+
+bool
+GarbageCollectionStateChecker::needsGarbageCollection(const Context& c) const
+{
+ if (c.entry->getNodeCount() == 0 || c.distributorConfig.getGarbageCollectionInterval() == 0) {
+ return false;
+ }
+ if (containsMaintenanceNode(c.idealState, c)) {
+ return false;
+ }
+ std::chrono::seconds lastRunAt(c.entry->getLastGarbageCollectionTime());
+ std::chrono::seconds currentTime(
+ c.component.getClock().getTimeInSeconds().getTime());
+
+ return c.gcTimeCalculator.shouldGc(c.bucketId, currentTime, lastRunAt);
+}
+
+StateChecker::Result
+GarbageCollectionStateChecker::check(Context& c)
+{
+ if (needsGarbageCollection(c)) {
+ IdealStateOperation::UP op(
+ new GarbageCollectionOperation(
+ c.component.getClusterName(),
+ BucketAndNodes(c.bucketId, c.entry->getNodes())));
+
+ vespalib::asciistream reason;
+ reason << "[Needs garbage collection: Last check at "
+ << c.entry->getLastGarbageCollectionTime()
+ << ", current time "
+ << c.component.getClock().getTimeInSeconds().getTime()
+ << ", configured interval "
+ << c.distributorConfig.getGarbageCollectionInterval() << "]";
+
+ op->setPriority(c.distributorConfig.getMaintenancePriorities()
+ .garbageCollection);
+ op->setDetailedReason(reason.c_str());
+ return Result::createStoredResult(std::move(op), MaintenancePriority::MEDIUM);
+ } else {
+ return Result::noMaintenanceNeeded();
+ }
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/statecheckers.h b/storage/src/vespa/storage/distributor/statecheckers.h
new file mode 100644
index 00000000000..8edf04f3bf7
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/statecheckers.h
@@ -0,0 +1,139 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/idealstatemanager.h>
+
+namespace storage {
+
+namespace distributor {
+
+class SynchronizeAndMoveStateChecker : public StateChecker
+{
+public:
+ std::string getStatusText() const { return "Synchronization and moving"; }
+
+ Result check(Context& c);
+
+ const char* getName() const { return "SynchronizeAndMove"; }
+};
+
+class DeleteExtraCopiesStateChecker : public StateChecker
+{
+public:
+ std::string getStatusText() const { return "Delete extra copies"; }
+
+ Result check(Context& c);
+
+ const char* getName() const { return "DeleteExtraCopies"; }
+
+private:
+ bool bucketHasNoData(const StateChecker::Context& c);
+ void removeRedundantEmptyOrConsistentCopies(
+ StateChecker::Context& c,
+ std::vector<uint16_t>& removedCopies,
+ vespalib::asciistream& reasons);
+ bool copyIsInIdealState(const BucketCopy& cp,
+ const StateChecker::Context& c) const;
+ bool enoughCopiesKept(uint32_t keptIdealCopies,
+ uint32_t keptNonIdealCopies,
+ const StateChecker::Context& c) const;
+ uint32_t numberOfIdealCopiesPresent(const StateChecker::Context& c) const;
+ void addToRemoveSet(const BucketCopy& copyToRemove,
+ const char* reasonForRemoval,
+ std::vector<uint16_t>& removedCopies,
+ vespalib::asciistream& reasons);
+
+};
+
+class JoinBucketsStateChecker : public StateChecker
+{
+public:
+ std::string getStatusText() const { return "Join buckets"; }
+
+ Result check(Context& c);
+
+ const char* getName() const { return "JoinBuckets"; }
+private:
+ uint64_t getTotalUsedFileSize(const Context& c) const;
+ uint64_t getTotalMetaCount(const Context& c) const;
+ bool isFirstSibling(const document::BucketId& bucketId) const;
+ bool siblingsAreInSync(const Context& c) const;
+ bool shouldJoin(const Context& c) const;
+ bool smallEnoughToJoin(const Context& c) const;
+ bool singleBucketJoinIsEnabled(const Context&) const;
+ bool singleBucketJoinIsConsistent(const Context& c) const;
+ document::BucketId computeJoinBucket(const Context& c) const;
+};
+
+class SplitBucketStateChecker : public StateChecker
+{
+public:
+ std::string getStatusText() const { return "Split buckets"; }
+
+ Result check(Context& c);
+
+ const char* getName() const { return "SplitBucket"; }
+
+private:
+
+ Result generateMinimumBucketSplitOperation(Context& c);
+ Result generateMaxSizeExceededSplitOperation(Context& c);
+
+ bool validForSplit(StateChecker::Context& c);
+ double getBucketSizeRelativeToMax(Context& c);
+};
+
+class SplitInconsistentStateChecker : public StateChecker
+{
+public:
+ std::string getStatusText() const
+ { return "Fix inconsistently split buckets"; }
+
+ Result check(Context& c);
+
+ const char* getName() const { return "SplitInconsistentBuckets"; }
+
+private:
+ typedef std::pair<document::BucketId, uint16_t> BucketAndNode;
+ bool isLeastSplitBucket(
+ const document::BucketId& bucket,
+ const std::vector<BucketDatabase::Entry>& entries) const;
+ uint32_t getHighestUsedBits(
+ const std::vector<BucketDatabase::Entry>& entries) const;
+ vespalib::string getReason(
+ const document::BucketId& bucketId,
+ const std::vector<BucketDatabase::Entry>& entries) const;
+ bool isLeastSplit(Context& c, std::vector<BucketAndNode>& others);
+};
+
+class ActiveList;
+
+class BucketStateStateChecker : public StateChecker
+{
+ bool shouldSkipActivationDueToMaintenance(
+ const ActiveList& activeList,
+ const StateChecker::Context& c) const;
+public:
+ std::string getStatusText() const { return "Set bucket copy state"; }
+
+ Result check(Context& c);
+
+ const char* getName() const { return "SetBucketState"; }
+};
+
+class GarbageCollectionStateChecker : public StateChecker
+{
+public:
+ std::string getStatusText() const { return "Garbage collection"; }
+
+ bool needsGarbageCollection(const Context& c) const;
+
+ Result check(Context& c);
+
+ const char* getName() const { return "GarbageCollection"; }
+};
+
+}
+
+}
+
diff --git a/storage/src/vespa/storage/distributor/statusdelegator.h b/storage/src/vespa/storage/distributor/statusdelegator.h
new file mode 100644
index 00000000000..bafb8ce5dff
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/statusdelegator.h
@@ -0,0 +1,18 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+namespace storage {
+namespace distributor {
+
+class DelegatedStatusRequest;
+
+class StatusDelegator
+{
+public:
+ virtual ~StatusDelegator() {}
+
+ virtual bool handleStatusRequest(const DelegatedStatusRequest& request) const = 0;
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/statusreporterdelegate.cpp b/storage/src/vespa/storage/distributor/statusreporterdelegate.cpp
new file mode 100644
index 00000000000..ecd880f2876
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/statusreporterdelegate.cpp
@@ -0,0 +1,42 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/statusreporterdelegate.h>
+
+namespace storage {
+namespace distributor {
+
+StatusReporterDelegate::StatusReporterDelegate(
+ framework::ComponentRegister& compReg,
+ const StatusDelegator& delegator,
+ const framework::StatusReporter& target)
+ : framework::StatusReporter(target.getId(), target.getName()),
+ _delegator(delegator),
+ _target(target),
+ _component(compReg, std::string(target.getId()) + "_status")
+{
+}
+
+vespalib::string
+StatusReporterDelegate::getReportContentType(const framework::HttpUrlPath& path) const
+{
+ // Implementation must be data race free.
+ return _target.getReportContentType(path);
+}
+
+bool
+StatusReporterDelegate::reportStatus(std::ostream& out,
+ const framework::HttpUrlPath& path) const
+{
+ return _delegator.handleStatusRequest(
+ DelegatedStatusRequest(_target, path, out));
+}
+
+void
+StatusReporterDelegate::registerStatusPage()
+{
+ _component.registerStatusPage(*this);
+}
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/statusreporterdelegate.h b/storage/src/vespa/storage/distributor/statusreporterdelegate.h
new file mode 100644
index 00000000000..7f66554b091
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/statusreporterdelegate.h
@@ -0,0 +1,30 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storage/distributor/delegatedstatusrequest.h>
+#include <vespa/storage/distributor/statusdelegator.h>
+
+namespace storage {
+namespace distributor {
+
+class StatusReporterDelegate
+ : public framework::StatusReporter
+{
+ const StatusDelegator& _delegator;
+ const framework::StatusReporter& _target;
+ framework::Component _component;
+public:
+ StatusReporterDelegate(framework::ComponentRegister& compReg,
+ const StatusDelegator& delegator,
+ const framework::StatusReporter& target);
+
+ void registerStatusPage();
+
+ vespalib::string getReportContentType(
+ const framework::HttpUrlPath&) const;
+ bool reportStatus(std::ostream&, const framework::HttpUrlPath&) const;
+};
+
+} // distributor
+} // storage
diff --git a/storage/src/vespa/storage/distributor/throttlingoperationstarter.cpp b/storage/src/vespa/storage/distributor/throttlingoperationstarter.cpp
new file mode 100644
index 00000000000..b7fcd6ffc9d
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/throttlingoperationstarter.cpp
@@ -0,0 +1,45 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/distributor/throttlingoperationstarter.h>
+
+namespace storage {
+namespace distributor {
+
+ThrottlingOperationStarter::ThrottlingOperation::~ThrottlingOperation()
+{
+ _operationStarter.signalOperationFinished(*this);
+}
+
+bool
+ThrottlingOperationStarter::canStart(uint32_t currentOperationCount,
+ Priority priority) const
+{
+ uint32_t variablePending(_maxPending - _minPending);
+ uint32_t maxPendingForPri(_minPending + variablePending*((255.0 - priority) / 255.0));
+
+ return currentOperationCount < maxPendingForPri;
+}
+
+bool
+ThrottlingOperationStarter::start(const std::shared_ptr<Operation>& operation,
+ Priority priority)
+{
+ if (!canStart(_pendingCount, priority)) {
+ return false;
+ }
+ Operation::SP wrappedOp(new ThrottlingOperation(operation, *this));
+ ++_pendingCount;
+ return _starterImpl.start(wrappedOp, priority);
+}
+
+void
+ThrottlingOperationStarter::signalOperationFinished(const Operation& op)
+{
+ (void) op;
+ assert(_pendingCount > 0);
+ --_pendingCount;
+}
+
+}
+}
diff --git a/storage/src/vespa/storage/distributor/throttlingoperationstarter.h b/storage/src/vespa/storage/distributor/throttlingoperationstarter.h
new file mode 100644
index 00000000000..711534d64dd
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/throttlingoperationstarter.h
@@ -0,0 +1,100 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/distributor/operationstarter.h>
+#include <vespa/storage/distributor/operations/operation.h>
+
+namespace storage {
+namespace distributor {
+
+class ThrottlingOperationStarter : public OperationStarter
+{
+ class ThrottlingOperation : public Operation
+ {
+ public:
+ ThrottlingOperation(const Operation::SP& operation,
+ ThrottlingOperationStarter& operationStarter)
+ : _operation(operation),
+ _operationStarter(operationStarter)
+ {}
+
+ ~ThrottlingOperation();
+ private:
+ Operation::SP _operation;
+ ThrottlingOperationStarter& _operationStarter;
+
+ ThrottlingOperation(const ThrottlingOperation&);
+ ThrottlingOperation& operator=(const ThrottlingOperation&);
+
+ virtual void onClose(DistributorMessageSender& sender) {
+ _operation->onClose(sender);
+ }
+ virtual const char* getName() const {
+ return _operation->getName();
+ }
+ virtual std::string getStatus() const {
+ return _operation->getStatus();
+ }
+ virtual std::string toString() const {
+ return _operation->toString();
+ }
+ virtual void start(DistributorMessageSender& sender,
+ framework::MilliSecTime startTime)
+ {
+ _operation->start(sender, startTime);
+ }
+ virtual void receive(DistributorMessageSender& sender,
+ const std::shared_ptr<api::StorageReply> & msg)
+ {
+ _operation->receive(sender, msg);
+ }
+ framework::MilliSecTime getStartTime() const {
+ return _operation->getStartTime();
+ }
+ virtual void onStart(DistributorMessageSender&) {
+ // Should never be called directly on the throttled operation
+ // instance, but rather on its wrapped implementation.
+ assert(false);
+ }
+ virtual void onReceive(DistributorMessageSender&,
+ const std::shared_ptr<api::StorageReply>&)
+ {
+ assert(false);
+ }
+ };
+
+ OperationStarter& _starterImpl;
+public:
+ ThrottlingOperationStarter(OperationStarter& starterImpl)
+ : _starterImpl(starterImpl),
+ _minPending(0),
+ _maxPending(UINT32_MAX),
+ _pendingCount(0)
+ {}
+
+ virtual bool start(const std::shared_ptr<Operation>& operation,
+ Priority priority);
+
+ bool canStart(uint32_t currentOperationCount,
+ Priority priority) const;
+
+ void setMaxPendingRange(uint32_t minPending, uint32_t maxPending) {
+ _minPending = minPending;
+ _maxPending = maxPending;
+ }
+
+private:
+ ThrottlingOperationStarter(const ThrottlingOperationStarter&);
+ ThrottlingOperationStarter& operator=(const ThrottlingOperationStarter&);
+
+ friend class ThrottlingOperation;
+ void signalOperationFinished(const Operation& op);
+
+ uint32_t _minPending;
+ uint32_t _maxPending;
+ uint32_t _pendingCount;
+};
+
+}
+}
+
diff --git a/storage/src/vespa/storage/distributor/visitormetricsset.h b/storage/src/vespa/storage/distributor/visitormetricsset.h
new file mode 100644
index 00000000000..122682b9e04
--- /dev/null
+++ b/storage/src/vespa/storage/distributor/visitormetricsset.h
@@ -0,0 +1,36 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/metrics/metrics.h>
+#include <vespa/documentapi/loadtypes/loadtypeset.h>
+
+namespace storage {
+
+struct VisitorMetricSet : public metrics::MetricSet {
+ metrics::DoubleAverageMetric latency;
+ metrics::LongCountMetric failed;
+
+ VisitorMetricSet(metrics::MetricSet* owner = 0)
+ : metrics::MetricSet("visitor", "visitor", "", owner),
+ latency("latency", "", "Latency of visitor (in ms)", this),
+ failed("failed", "", "Number of visitors that failed or were aborted by the user", this)
+ {
+ }
+
+ virtual Metric* clone(std::vector<Metric::LP>& ownerList,
+ CopyType copyType,
+ metrics::MetricSet* owner,
+ bool includeUnused) const
+ {
+ if (copyType == INACTIVE) {
+ return MetricSet::clone(ownerList, INACTIVE, owner, includeUnused);
+ }
+ return (VisitorMetricSet*)
+ (new VisitorMetricSet(owner))->assignValues(*this);
+ }
+
+ VisitorMetricSet* operator&() { return this; }
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/frameworkimpl/component/CMakeLists.txt b/storage/src/vespa/storage/frameworkimpl/component/CMakeLists.txt
new file mode 100644
index 00000000000..0490ddbc8d0
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/component/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_component OBJECT
+ SOURCES
+ storagecomponentregisterimpl.cpp
+ servicelayercomponentregisterimpl.cpp
+ distributorcomponentregisterimpl.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/frameworkimpl/component/distributorcomponentregisterimpl.cpp b/storage/src/vespa/storage/frameworkimpl/component/distributorcomponentregisterimpl.cpp
new file mode 100644
index 00000000000..d412a5f842a
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/component/distributorcomponentregisterimpl.cpp
@@ -0,0 +1,105 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/frameworkimpl/component/distributorcomponentregisterimpl.h>
+
+#include <vespa/log/log.h>
+#include <vespa/vdslib/distribution/idealnodecalculatorimpl.h>
+
+LOG_SETUP(".storage.component.register.distributor");
+
+namespace storage {
+
+DistributorComponentRegisterImpl::DistributorComponentRegisterImpl()
+ : _timeCalculator(0),
+ _bucketDatabase(),
+ _idealNodeCalculator(new lib::IdealNodeCalculatorImpl)
+{
+ _idealNodeCalculator->setClusterState(_clusterState);
+}
+
+void
+DistributorComponentRegisterImpl::handleNewState()
+{
+ _clusterState = *getNodeStateUpdater().getSystemState();
+}
+
+void
+DistributorComponentRegisterImpl::registerDistributorComponent(
+ DistributorManagedComponent& smc)
+{
+ vespalib::LockGuard lock(_componentLock);
+ _components.push_back(&smc);
+ if (_timeCalculator != 0) {
+ smc.setTimeCalculator(*_timeCalculator);
+ }
+ smc.setBucketDatabase(_bucketDatabase);
+ smc.setDistributorConfig(_distributorConfig);
+ smc.setVisitorConfig(_visitorConfig);
+ smc.setIdealNodeCalculator(*_idealNodeCalculator);
+}
+
+void
+DistributorComponentRegisterImpl::setIdealNodeCalculator(
+ std::unique_ptr<lib::IdealNodeCalculatorConfigurable> calc)
+{
+ _idealNodeCalculator = std::move(calc);
+ for (uint32_t i=0; i<_components.size(); ++i) {
+ _components[i]->setIdealNodeCalculator(*_idealNodeCalculator);
+ }
+}
+
+void
+DistributorComponentRegisterImpl::setTimeCalculator(UniqueTimeCalculator& utc)
+{
+ vespalib::LockGuard lock(_componentLock);
+ if (_timeCalculator != 0) {
+ throw vespalib::IllegalStateException(
+ "Time calculator already set. Cannot be updated live",
+ VESPA_STRLOC);
+ }
+ _timeCalculator = &utc;
+ for (uint32_t i=0; i<_components.size(); ++i) {
+ _components[i]->setTimeCalculator(*_timeCalculator);
+ }
+}
+
+void
+DistributorComponentRegisterImpl::setDistributorConfig(
+ const DistributorConfig& c)
+{
+ vespalib::LockGuard lock(_componentLock);
+ _distributorConfig = c;
+ for (uint32_t i=0; i<_components.size(); ++i) {
+ _components[i]->setDistributorConfig(c);
+ }
+}
+
+void
+DistributorComponentRegisterImpl::setVisitorConfig(const VisitorConfig& c)
+{
+ vespalib::LockGuard lock(_componentLock);
+ _visitorConfig = c;
+ for (uint32_t i=0; i<_components.size(); ++i) {
+ _components[i]->setVisitorConfig(c);
+ }
+}
+
+void
+DistributorComponentRegisterImpl::setDistribution(lib::Distribution::SP d)
+{
+ StorageComponentRegisterImpl::setDistribution(d);
+ _idealNodeCalculator->setDistribution(*d);
+}
+
+void
+DistributorComponentRegisterImpl::setNodeStateUpdater(NodeStateUpdater& updater)
+{
+ StorageComponentRegisterImpl::setNodeStateUpdater(updater);
+ if (updater.getSystemState().get() != 0) {
+ _clusterState = *updater.getSystemState();
+ }
+ updater.addStateListener(*this);
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/frameworkimpl/component/distributorcomponentregisterimpl.h b/storage/src/vespa/storage/frameworkimpl/component/distributorcomponentregisterimpl.h
new file mode 100644
index 00000000000..d1a37c5ba5e
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/component/distributorcomponentregisterimpl.h
@@ -0,0 +1,59 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::framework::DistributorComponentRegisterImpl
+ * \ingroup component
+ *
+ * \brief Subclass of component register impl that handles storage components.
+ */
+#pragma once
+
+#include <vespa/storage/distributor/bucketdb/mapbucketdatabase.h>
+#include <vespa/storage/common/distributorcomponent.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h>
+
+namespace storage {
+namespace lib {
+ class IdealNodeCalculatorConfigurable;
+}
+
+class DistributorComponentRegisterImpl
+ : public virtual DistributorComponentRegister,
+ public virtual StorageComponentRegisterImpl,
+ private StateListener
+{
+ vespalib::Lock _componentLock;
+ std::vector<DistributorManagedComponent*> _components;
+
+ UniqueTimeCalculator* _timeCalculator;
+ distributor::MapBucketDatabase _bucketDatabase;
+ DistributorConfig _distributorConfig;
+ VisitorConfig _visitorConfig;
+ lib::ClusterState _clusterState;
+ std::unique_ptr<lib::IdealNodeCalculatorConfigurable> _idealNodeCalculator;
+
+public:
+ typedef std::unique_ptr<DistributorComponentRegisterImpl> UP;
+
+ DistributorComponentRegisterImpl();
+
+ distributor::BucketDatabase& getBucketDatabase() { return _bucketDatabase; }
+
+ virtual void registerDistributorComponent(DistributorManagedComponent&);
+
+ void setTimeCalculator(UniqueTimeCalculator& calc);
+ void setDistributorConfig(const DistributorConfig&);
+ void setVisitorConfig(const VisitorConfig&);
+ void setDistribution(lib::Distribution::SP);
+ void setIdealNodeCalculator(
+ std::unique_ptr<lib::IdealNodeCalculatorConfigurable>);
+
+private:
+ virtual void handleNewState();
+
+ virtual void setNodeStateUpdater(NodeStateUpdater& updater);
+};
+
+} // storage
+
+
diff --git a/storage/src/vespa/storage/frameworkimpl/component/servicelayercomponentregisterimpl.cpp b/storage/src/vespa/storage/frameworkimpl/component/servicelayercomponentregisterimpl.cpp
new file mode 100644
index 00000000000..5d8cc5c6c27
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/component/servicelayercomponentregisterimpl.cpp
@@ -0,0 +1,43 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/frameworkimpl/component/servicelayercomponentregisterimpl.h>
+
+#include <vespa/log/log.h>
+
+LOG_SETUP(".storage.component.register.servicelayer");
+
+namespace storage {
+
+ServiceLayerComponentRegisterImpl::ServiceLayerComponentRegisterImpl()
+ : _diskCount(0),
+ _bucketDatabase()
+{
+}
+
+void
+ServiceLayerComponentRegisterImpl::registerServiceLayerComponent(
+ ServiceLayerManagedComponent& smc)
+{
+ vespalib::LockGuard lock(_componentLock);
+ _components.push_back(&smc);
+ smc.setDiskCount(_diskCount);
+ smc.setBucketDatabase(_bucketDatabase);
+ smc.setMinUsedBitsTracker(_minUsedBitsTracker);
+}
+
+void
+ServiceLayerComponentRegisterImpl::setDiskCount(uint16_t count)
+{
+ vespalib::LockGuard lock(_componentLock);
+ if (_diskCount != 0) {
+ throw vespalib::IllegalStateException(
+ "Disk count already set. Cannot be updated live", VESPA_STRLOC);
+ }
+ _diskCount = count;
+ for (uint32_t i=0; i<_components.size(); ++i) {
+ _components[i]->setDiskCount(count);
+ }
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/frameworkimpl/component/servicelayercomponentregisterimpl.h b/storage/src/vespa/storage/frameworkimpl/component/servicelayercomponentregisterimpl.h
new file mode 100644
index 00000000000..fe73e5a1563
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/component/servicelayercomponentregisterimpl.h
@@ -0,0 +1,45 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::framework::StorageComponentRegisterImpl
+ * \ingroup component
+ *
+ * \brief Subclass of component register impl that handles storage components.
+ */
+#pragma once
+
+#include <vespa/storage/bucketdb/minimumusedbitstracker.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/common/servicelayercomponent.h>
+#include <vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h>
+
+namespace storage {
+
+class ServiceLayerComponentRegisterImpl
+ : public virtual ServiceLayerComponentRegister,
+ public virtual StorageComponentRegisterImpl
+{
+ vespalib::Lock _componentLock;
+ std::vector<ServiceLayerManagedComponent*> _components;
+ uint16_t _diskCount;
+ StorBucketDatabase _bucketDatabase;
+ MinimumUsedBitsTracker _minUsedBitsTracker;
+
+public:
+ typedef std::unique_ptr<ServiceLayerComponentRegisterImpl> UP;
+
+ ServiceLayerComponentRegisterImpl();
+
+ uint16_t getDiskCount() const { return _diskCount; }
+ StorBucketDatabase& getBucketDatabase() { return _bucketDatabase; }
+ MinimumUsedBitsTracker& getMinUsedBitsTracker() {
+ return _minUsedBitsTracker;
+ }
+
+ virtual void registerServiceLayerComponent(ServiceLayerManagedComponent&);
+
+ void setDiskCount(uint16_t count);
+};
+
+} // storage
+
+
diff --git a/storage/src/vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.cpp b/storage/src/vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.cpp
new file mode 100644
index 00000000000..376fefa3c76
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.cpp
@@ -0,0 +1,123 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h>
+
+#include <vespa/log/log.h>
+
+LOG_SETUP(".storage.component.register");
+
+namespace storage {
+
+StorageComponentRegisterImpl::StorageComponentRegisterImpl()
+ : _nodeType(0),
+ _index(0xffff),
+ _loadTypes(new documentapi::LoadTypeSet),
+ _nodeStateUpdater(0)
+{
+
+}
+
+void
+StorageComponentRegisterImpl::registerStorageComponent(
+ StorageComponent& smc)
+{
+ vespalib::LockGuard lock(_componentLock);
+ _components.push_back(&smc);
+ assert(_nodeType != 0);
+ smc.setNodeInfo(_clusterName, *_nodeType, _index);
+ if (_nodeStateUpdater != 0) {
+ smc.setNodeStateUpdater(*_nodeStateUpdater);
+ }
+ smc.setDocumentTypeRepo(_docTypeRepo);
+ smc.setLoadTypes(_loadTypes);
+ smc.setPriorityConfig(_priorityConfig);
+ smc.setBucketIdFactory(_bucketIdFactory);
+ smc.setDistribution(_distribution);
+}
+
+void
+StorageComponentRegisterImpl::setNodeInfo(vespalib::stringref clusterName,
+ const lib::NodeType& nodeType,
+ uint16_t index)
+{
+ vespalib::LockGuard lock(_componentLock);
+ if (_nodeType != 0) {
+ LOG(warning, "Node info already set. May be valid in tests, but is a "
+ "bug in production. Node info should not be updated live");
+ }
+ _clusterName = clusterName;
+ _nodeType = &nodeType;
+ _index = index;
+}
+
+void
+StorageComponentRegisterImpl::setNodeStateUpdater(NodeStateUpdater& updater)
+{
+ vespalib::LockGuard lock(_componentLock);
+ if (_nodeStateUpdater != 0) {
+ throw vespalib::IllegalStateException(
+ "Node state updater already set. Should never be altered live.",
+ VESPA_STRLOC);
+ }
+ _nodeStateUpdater = &updater;
+ for (uint32_t i=0; i<_components.size(); ++i) {
+ _components[i]->setNodeStateUpdater(updater);
+ }
+}
+
+void
+StorageComponentRegisterImpl::setDocumentTypeRepo(
+ document::DocumentTypeRepo::SP repo)
+{
+ vespalib::LockGuard lock(_componentLock);
+ _docTypeRepo = repo;
+ for (uint32_t i=0; i<_components.size(); ++i) {
+ _components[i]->setDocumentTypeRepo(repo);
+ }
+}
+
+void
+StorageComponentRegisterImpl::setLoadTypes(
+ documentapi::LoadTypeSet::SP loadTypes)
+{
+ vespalib::LockGuard lock(_componentLock);
+ _loadTypes = loadTypes;
+ for (uint32_t i=0; i<_components.size(); ++i) {
+ _components[i]->setLoadTypes(loadTypes);
+ }
+}
+
+void
+StorageComponentRegisterImpl::setPriorityConfig(const PriorityConfig& config)
+{
+ vespalib::LockGuard lock(_componentLock);
+ _priorityConfig = config;
+ for (uint32_t i=0; i<_components.size(); ++i) {
+ _components[i]->setPriorityConfig(config);
+ }
+}
+
+void
+StorageComponentRegisterImpl::setBucketIdFactory(
+ const document::BucketIdFactory& factory)
+{
+ vespalib::LockGuard lock(_componentLock);
+ _bucketIdFactory = factory;
+ for (uint32_t i=0; i<_components.size(); ++i) {
+ _components[i]->setBucketIdFactory(factory);
+ }
+}
+
+void
+StorageComponentRegisterImpl::setDistribution(
+ lib::Distribution::SP distribution)
+{
+ vespalib::LockGuard lock(_componentLock);
+ _distribution = distribution;
+ for (uint32_t i=0; i<_components.size(); ++i) {
+ _components[i]->setDistribution(distribution);
+ }
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h b/storage/src/vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h
new file mode 100644
index 00000000000..416391fbd53
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h
@@ -0,0 +1,72 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::framework::StorageComponentRegisterImpl
+ * \ingroup component
+ *
+ * \brief Subclass of component register impl that handles storage components.
+ */
+#pragma once
+
+#include <vespa/document/bucket/bucketidfactory.h>
+#include <vespa/document/repo/documenttyperepo.h>
+#include <vespa/documentapi/loadtypes/loadtypeset.h>
+#include <vespa/storage/common/storagecomponent.h>
+#include <vespa/storage/config/config-stor-prioritymapping.h>
+#include <vespa/storageframework/defaultimplementation/component/componentregisterimpl.h>
+#include <vespa/vdslib/distribution/distribution.h>
+
+namespace storage {
+
+class StorageComponentRegisterImpl
+ : public virtual StorageComponentRegister,
+ public virtual framework::defaultimplementation::ComponentRegisterImpl
+{
+ typedef framework::defaultimplementation::ComponentRegisterImpl CompRegImpl;
+ typedef StorageComponent::PriorityConfig PriorityConfig;
+ //CompRegImpl _compReg;
+ vespalib::Lock _componentLock;
+ std::vector<StorageComponent*> _components;
+ vespalib::string _clusterName;
+ const lib::NodeType* _nodeType;
+ uint16_t _index;
+ document::DocumentTypeRepo::SP _docTypeRepo;
+ documentapi::LoadTypeSet::SP _loadTypes;
+ PriorityConfig _priorityConfig;
+ document::BucketIdFactory _bucketIdFactory;
+ lib::Distribution::SP _distribution;
+ NodeStateUpdater* _nodeStateUpdater;
+
+public:
+ typedef std::unique_ptr<StorageComponentRegisterImpl> UP;
+
+ StorageComponentRegisterImpl();
+
+ const vespalib::string& getClusterName() const { return _clusterName; }
+ const lib::NodeType& getNodeType() const
+ { assert(_nodeType != 0); return *_nodeType; }
+ uint16_t getIndex() const { return _index; }
+ document::DocumentTypeRepo::SP getTypeRepo() { return _docTypeRepo; }
+ documentapi::LoadTypeSet::SP getLoadTypes() { return _loadTypes; }
+ const document::BucketIdFactory& getBucketIdFactory()
+ { return _bucketIdFactory; }
+ lib::Distribution::SP getDistribution() { return _distribution; }
+ NodeStateUpdater& getNodeStateUpdater()
+ { assert(_nodeStateUpdater != 0); return *_nodeStateUpdater; }
+
+ virtual void registerStorageComponent(StorageComponent&);
+
+ void setNodeInfo(vespalib::stringref clusterName,
+ const lib::NodeType& nodeType,
+ uint16_t index);
+ virtual void setNodeStateUpdater(NodeStateUpdater& updater);
+ virtual void setDocumentTypeRepo(document::DocumentTypeRepo::SP);
+ virtual void setLoadTypes(documentapi::LoadTypeSet::SP);
+ virtual void setPriorityConfig(const PriorityConfig&);
+ virtual void setBucketIdFactory(const document::BucketIdFactory&);
+ virtual void setDistribution(lib::Distribution::SP);
+
+};
+
+} // storage
+
+
diff --git a/storage/src/vespa/storage/frameworkimpl/memory/CMakeLists.txt b/storage/src/vespa/storage/frameworkimpl/memory/CMakeLists.txt
new file mode 100644
index 00000000000..5c700cf243e
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/memory/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_memory OBJECT
+ SOURCES
+ memorystatusviewer.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/frameworkimpl/memory/memorysnapshotlist.h b/storage/src/vespa/storage/frameworkimpl/memory/memorysnapshotlist.h
new file mode 100644
index 00000000000..2b5db2ac5a3
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/memory/memorysnapshotlist.h
@@ -0,0 +1,20 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::MemorySnapshotList
+ *
+ * \brief Holds a historic list of MemoryStates.
+ *
+ */
+
+#pragma once
+
+#include <storage/frameworkimpl/memory/memorystate.h>
+
+namespace storage {
+
+class MemorySnapshotList : public vespalib::Printable {
+ std::map<uint64_t time, MemoryState> _snapshots;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/frameworkimpl/memory/memorystatusviewer.cpp b/storage/src/vespa/storage/frameworkimpl/memory/memorystatusviewer.cpp
new file mode 100644
index 00000000000..16977f4fec5
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/memory/memorystatusviewer.cpp
@@ -0,0 +1,661 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/frameworkimpl/memory/memorystatusviewer.h>
+#include <vespa/storage/storageutil/graph.h>
+#include <vespa/storage/storageutil/palette.h>
+#include <vespa/storage/storageutil/piechart.h>
+#include <vespa/metrics/metricmanager.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+
+LOG_SETUP(".memory.status.viewer");
+
+using storage::framework::defaultimplementation::MemoryState;
+
+namespace storage {
+
+MemoryStatusViewer::Entry::Entry(
+ const std::string& name, framework::Clock& clock,
+ framework::SecondTime maxAge)
+ : _name(name),
+ _maxAge(maxAge),
+ _timeTaken(clock.getTimeInSeconds()),
+ _data(),
+ _maxMemory(0)
+{
+}
+
+MemoryStatusViewer::MemoryStatusViewer(
+ framework::defaultimplementation::MemoryManager& mm,
+ const metrics::MetricManager& metricMan,
+ StorageComponentRegister& compReg)
+ : framework::HtmlStatusReporter("memorymanager", "Memory Manager"),
+ _component(compReg, "memorystatusviewer"),
+ _manager(mm),
+ _metricManager(metricMan),
+ _workerMonitor(),
+ _states(),
+ _memoryHistory(),
+ _memoryHistorySize(24 * 31),
+ _memoryHistoryPeriod(60),
+ _allowedSlackPeriod(6),
+ _lastHistoryUpdate(_component.getClock().getTimeInSeconds()),
+ _processedTime(0)
+{
+ addEntry("Current", 0);
+ addEntry("Last hour", 60 * 60);
+ addEntry("Last day", 24 * 60 * 60);
+ addEntry("Last month", 4 * 7 * 24 * 60 * 60);
+ addEntry("Last ever", std::numeric_limits<uint32_t>::max());
+
+ framework::MilliSecTime maxProcessingTime(60 * 1000);
+ _thread = _component.startThread(*this, maxProcessingTime,
+ framework::MilliSecTime(1000));
+ _component.registerStatusPage(*this);
+}
+
+MemoryStatusViewer::~MemoryStatusViewer()
+{
+ if (_thread.get() != 0) {
+ _thread->interrupt();
+ {
+ vespalib::MonitorGuard monitor(_workerMonitor);
+ monitor.signal();
+ }
+ _thread->join();
+ }
+}
+
+namespace {
+ struct Group {
+ std::set<const framework::MemoryAllocationType*> types;
+ api::StorageMessage::Priority minPri;
+ api::StorageMessage::Priority maxPri;
+ MemoryState::Entry entry;
+
+ Group(const framework::MemoryAllocationType& type,
+ api::StorageMessage::Priority pri,
+ const MemoryState::Entry& e)
+ : types(),
+ minPri(pri),
+ maxPri(pri),
+ entry(e)
+ {
+ types.insert(&type);
+ }
+ };
+
+ struct GroupSizeOrder {
+ bool operator()(const Group& g1, const Group& g2) {
+ return (g1.entry._currentUsedSize > g2.entry._currentUsedSize);
+ }
+ };
+ struct GroupAllocsOrder {
+ bool operator()(const Group& g1, const Group& g2) {
+ return (g1.entry._totalUserCount > g2.entry._totalUserCount);
+ }
+ };
+ struct GroupMinAllocsOrder {
+ bool operator()(const Group& g1, const Group& g2) {
+ return (g1.entry._minimumCount > g2.entry._minimumCount);
+ }
+ };
+ struct GroupDeniedAllocsOrder {
+ bool operator()(const Group& g1, const Group& g2) {
+ return (g1.entry._deniedCount > g2.entry._deniedCount);
+ }
+ };
+
+ std::vector<Group> collapsePriorities(MemoryStatusViewer::Entry& entry) {
+ std::vector<Group> groups;
+ const MemoryState::SnapShot& ss(entry._data);
+ for (MemoryState::AllocationMap::const_iterator it
+ = ss.getAllocations().begin();
+ it != ss.getAllocations().end(); ++it)
+ {
+ std::unique_ptr<Group> group;
+ for (MemoryState::PriorityMap::const_iterator it2
+ = it->second.begin(); it2 != it->second.end(); ++it2)
+ {
+ if (group.get() == 0) {
+ group.reset(new Group(
+ *it->first, it2->first, it2->second));
+ } else {
+ group->entry += it2->second;
+ group->minPri = std::min(group->minPri, it2->first);
+ group->maxPri = std::max(group->maxPri, it2->first);
+ }
+ }
+ if (group.get() != 0) {
+ groups.push_back(*group);
+ }
+ }
+ return groups;
+ }
+
+ std::vector<Group> groupLoad(uint32_t groupCount, uint64_t minSize,
+ uint32_t minEntries,
+ MemoryStatusViewer::Entry& entry)
+ {
+ assert(groupCount > 1);
+ std::vector<Group> groups(collapsePriorities(entry));
+ if (groups.size() == 0) return groups;
+ std::sort(groups.begin(), groups.end(), GroupSizeOrder());
+ assert(groups.front().entry._currentUsedSize
+ >= groups.back().entry._currentUsedSize);
+ while (groups.size() > minEntries
+ && (groups.size() > groupCount
+ || groups[groups.size() - 2].entry._currentUsedSize
+ < minSize))
+ {
+ Group& nextButLast(groups[groups.size() - 2]);
+ Group& last(groups.back());
+ if (last.entry._currentUsedSize > 0) {
+ nextButLast.entry += last.entry;
+ nextButLast.minPri = std::min(nextButLast.minPri, last.minPri);
+ nextButLast.maxPri = std::max(nextButLast.maxPri, last.maxPri);
+ nextButLast.types.insert(*last.types.begin());
+ }
+ groups.pop_back();
+ }
+ return groups;
+ }
+
+ std::vector<Group> groupAllocs(uint32_t groupCount, uint64_t minSize,
+ uint32_t minEntries,
+ MemoryStatusViewer::Entry& entry)
+ {
+ assert(groupCount > 1);
+ std::vector<Group> groups(collapsePriorities(entry));
+ if (groups.size() == 0) return groups;
+ std::sort(groups.begin(), groups.end(), GroupAllocsOrder());
+ assert(groups.front().entry._totalUserCount
+ >= groups.back().entry._totalUserCount);
+ while (groups.size() > minEntries
+ && (groups.size() > groupCount
+ || groups[groups.size() - 2].entry._totalUserCount
+ < minSize))
+ {
+ Group& nextButLast(groups[groups.size() - 2]);
+ Group& last(groups.back());
+ nextButLast.entry += last.entry;
+ nextButLast.minPri = std::min(nextButLast.minPri, last.minPri);
+ nextButLast.maxPri = std::max(nextButLast.maxPri, last.maxPri);
+ nextButLast.types.insert(*last.types.begin());
+ groups.pop_back();
+ }
+ return groups;
+ }
+
+ std::vector<Group> groupMinAllocs(uint32_t groupCount, uint64_t minSize,
+ uint32_t minEntries,
+ MemoryStatusViewer::Entry& entry)
+ {
+ assert(groupCount > 1);
+ std::vector<Group> groups(collapsePriorities(entry));
+ if (groups.size() == 0) return groups;
+ std::sort(groups.begin(), groups.end(), GroupMinAllocsOrder());
+ assert(groups.front().entry._minimumCount
+ >= groups.back().entry._minimumCount);
+ while (groups.size() > minEntries
+ && (groups.size() > groupCount
+ || groups[groups.size() - 2].entry._minimumCount
+ < minSize))
+ {
+ Group& nextButLast(groups[groups.size() - 2]);
+ Group& last(groups.back());
+ nextButLast.entry += last.entry;
+ nextButLast.minPri = std::min(nextButLast.minPri, last.minPri);
+ nextButLast.maxPri = std::max(nextButLast.maxPri, last.maxPri);
+ nextButLast.types.insert(*last.types.begin());
+ groups.pop_back();
+ }
+ return groups;
+ }
+
+ std::vector<Group> groupDeniedAllocs(uint32_t groupCount, uint64_t minSize,
+ uint32_t minEntries,
+ MemoryStatusViewer::Entry& entry)
+ {
+ assert(groupCount > 1);
+ std::vector<Group> groups(collapsePriorities(entry));
+ if (groups.size() == 0) return groups;
+ std::sort(groups.begin(), groups.end(), GroupDeniedAllocsOrder());
+ assert(groups.front().entry._deniedCount
+ >= groups.back().entry._deniedCount);
+ while (groups.size() > minEntries
+ && (groups.size() > groupCount
+ || groups[groups.size() - 2].entry._deniedCount
+ < minSize))
+ {
+ Group& nextButLast(groups[groups.size() - 2]);
+ Group& last(groups.back());
+ nextButLast.entry += last.entry;
+ nextButLast.minPri = std::min(nextButLast.minPri, last.minPri);
+ nextButLast.maxPri = std::max(nextButLast.maxPri, last.maxPri);
+ nextButLast.types.insert(*last.types.begin());
+ groups.pop_back();
+ }
+ return groups;
+ }
+}
+
+void
+MemoryStatusViewer::printSnapshot(
+ std::ostream& out, Entry& entry,
+ std::map<const framework::MemoryAllocationType*,
+ uint32_t>& colors) const
+{
+ out << "<h4>" << entry._name << " - Taken at "
+ << entry._timeTaken.toString() << "</h4>\n"
+ << "<table><tr><td>\n"
+ << "<b>Memory usage";
+ if (entry._name != "Current") {
+ out << ", maxed at " << framework::SecondTime(entry._timeTaken);
+ }
+ out << " with "
+ << (entry._data.getUsedSizeIgnoringCache() / (1024 * 1024))
+ << " MB.</b><br>\n";
+ std::string piename = entry._name;
+ std::replace(piename.begin(), piename.end(), ' ', '_');
+ uint64_t freeSize = entry._maxMemory - entry._data.getUsedSize();
+ // Memory usage pie
+ uint64_t minSize = freeSize / 20;
+ std::vector<Group> groups(groupLoad(20, minSize, 5, entry));
+ PieChart chart(piename, PieChart::SCHEME_CUSTOM);
+ chart.printLabels(false);
+ for (uint32_t i=0; i<groups.size(); ++i) {
+ std::string name = "Other";
+ if (groups[i].types.size() == 1) {
+ name = (*groups[i].types.begin())->getName();
+ }
+ uint32_t mbytes = groups[i].entry._currentUsedSize / (1024 * 1024);
+ std::ostringstream ost;
+ ost << name << ", pri " << static_cast<uint16_t>(groups[i].minPri);
+ if (groups[i].minPri != groups[i].maxPri) {
+ ost << " - " << static_cast<uint16_t>(groups[i].maxPri);
+ }
+ ost << " (" << mbytes << " MB)";
+ name = ost.str();
+ if (groups[i].entry._currentUsedSize > 0) {
+ chart.add(groups[i].entry._currentUsedSize, name,
+ colors[*groups[i].types.begin()]);
+ }
+ }
+ {
+ std::ostringstream ost;
+ ost << "Free (" << (freeSize / (1024 * 1024)) << " MB)";
+ chart.add(freeSize, ost.str(), colors[0]);
+ }
+ chart.printCanvas(out, 750, 300);
+ out << "\n\n";
+ chart.printScript(out, "");
+ out << "\n\n";
+ // Total allocations pie
+ out << "</td><td>\n";
+ PieChart allocChart(piename + "Alloc", PieChart::SCHEME_CUSTOM);
+ allocChart.printLabels(false);
+ groups = groupAllocs(20, 100, 5, entry);
+ uint64_t totalAllocs = 0;
+ for (uint32_t i=0; i<groups.size(); ++i) {
+ std::string name = "Other";
+ if (groups[i].types.size() == 1) {
+ name = (*groups[i].types.begin())->getName();
+ }
+ uint32_t allocs = groups[i].entry._totalUserCount;
+ totalAllocs += allocs;
+ std::ostringstream ost;
+ ost << name << ", pri " << static_cast<uint16_t>(groups[i].minPri);
+ if (groups[i].minPri != groups[i].maxPri) {
+ ost << " - " << static_cast<uint16_t>(groups[i].maxPri);
+ }
+ ost << " (" << allocs << " allocations)";
+ name = ost.str();
+ if (groups[i].entry._totalUserCount > 0) {
+ allocChart.add(groups[i].entry._totalUserCount, name,
+ colors[*groups[i].types.begin()]);
+ }
+ }
+ out << "<b>Allocations, totalling " << totalAllocs << "</b><br>\n";
+ allocChart.printCanvas(out, 750, 300);
+ out << "\n\n";
+ allocChart.printScript(out, "");
+ out << "\n\n";
+ out << "</td></tr><tr><td>\n";
+ PieChart minChart(piename + "Min", PieChart::SCHEME_CUSTOM);
+ minChart.printLabels(false);
+ groups = groupMinAllocs(20, 100, 5, entry);
+ uint64_t totalMinAllocs = 0;
+ for (uint32_t i=0; i<groups.size(); ++i) {
+ std::string name = "Other";
+ if (groups[i].types.size() == 1) {
+ name = (*groups[i].types.begin())->getName();
+ }
+ uint32_t allocs = groups[i].entry._minimumCount;
+ totalMinAllocs += allocs;
+ std::ostringstream ost;
+ ost << name << ", pri " << static_cast<uint16_t>(groups[i].minPri);
+ if (groups[i].minPri != groups[i].maxPri) {
+ ost << " - " << static_cast<uint16_t>(groups[i].maxPri);
+ }
+ ost << " (" << allocs << " min allocations)";
+ name = ost.str();
+ if (groups[i].entry._minimumCount > 0) {
+ minChart.add(groups[i].entry._minimumCount, name,
+ colors[*groups[i].types.begin()]);
+ }
+ }
+ out << "<b>Minimum allocations, totalling " << totalMinAllocs
+ << "</b><br>\n";
+ if (totalMinAllocs > 0) {
+ minChart.printCanvas(out, 750, 300);
+ out << "\n\n";
+ minChart.printScript(out, "");
+ out << "\n\n";
+ }
+ out << "</td><td>\n";
+ PieChart deniedChart(piename + "Denied", PieChart::SCHEME_CUSTOM);
+ deniedChart.printLabels(false);
+ groups = groupDeniedAllocs(20, 100, 5, entry);
+ uint64_t totalDeniedAllocs = 0;
+ for (uint32_t i=0; i<groups.size(); ++i) {
+ std::string name = "Other";
+ if (groups[i].types.size() == 1) {
+ name = (*groups[i].types.begin())->getName();
+ }
+ uint32_t allocs = groups[i].entry._deniedCount;
+ totalDeniedAllocs += allocs;
+ std::ostringstream ost;
+ ost << name << ", pri " << static_cast<uint16_t>(groups[i].minPri);
+ if (groups[i].minPri != groups[i].maxPri) {
+ ost << " - " << static_cast<uint16_t>(groups[i].maxPri);
+ }
+ ost << " (" << allocs << " denied allocations)";
+ name = ost.str();
+ if (groups[i].entry._deniedCount > 0) {
+ deniedChart.add(groups[i].entry._deniedCount, name,
+ colors[*groups[i].types.begin()]);
+ }
+ }
+ out << "<b>Denied allocations, totalling " << totalDeniedAllocs
+ << "</b><br>\n";
+ if (totalDeniedAllocs > 0) {
+ deniedChart.printCanvas(out, 750, 300);
+ out << "\n\n";
+ deniedChart.printScript(out, "");
+ out << "\n\n";
+ }
+ out << "</td></tr></table>\n";
+}
+
+void
+MemoryStatusViewer::reportHtmlHeaderAdditions(
+ std::ostream& out, const framework::HttpUrlPath&) const
+{
+ (void) out;
+ // FIXME this function used to emit Yahoo-internal links to graph plotting
+ // JS files. Obviously, this won't work for external users. Either way, the
+ // memory manager/status reporter is deprecated.
+}
+
+namespace {
+ std::map<const framework::MemoryAllocationType*, uint32_t> assignColors(
+ const std::vector<const framework::MemoryAllocationType*>& types)
+ {
+ Palette palette(types.size() + 1);
+ std::map<const framework::MemoryAllocationType*, uint32_t> colors;
+ uint32_t nextCol = 0;
+ colors[0] = palette[nextCol++];
+ for (std::vector<const framework::MemoryAllocationType*>
+ ::const_iterator it = types.begin(); it != types.end(); ++it)
+ {
+ colors[*it] = palette[nextCol++];
+ }
+ return colors;
+ }
+}
+
+void
+MemoryStatusViewer::reportHtmlStatus(std::ostream& out,
+ const framework::HttpUrlPath& path) const
+{
+ vespalib::MonitorGuard monitor(_workerMonitor);
+
+ if (path.getAttribute("page") == "reset") {
+ }
+ if (path.getAttribute("interval") == "current") {
+ Entry& e(*_states[0]);
+ out << "<pre>" << e._name << ": ";
+ if (e.containsData()) {
+ e._data.print(out, true, " ");
+ } else {
+ out << "na";
+ }
+ out << "\n</pre>";
+ return;
+ }
+ const_cast<MemoryStatusViewer*>(this)->grabMemoryUsage();
+ framework::SecondTime currentTime(_component.getClock().getTimeInSeconds());
+ std::vector<const framework::MemoryAllocationType*> allocTypes(
+ _manager.getAllocationTypes());
+ std::map<const framework::MemoryAllocationType*, uint32_t> colors(
+ assignColors(allocTypes));
+ // Print memory usage graph
+ {
+ uint32_t mb = 1024 * 1024;
+ Graph memoryHistory("memhistory", Graph::SCHEME_CUSTOM);
+ std::vector<Graph::Point> total;
+ std::vector<Graph::Point> used;
+ std::vector<Graph::Point> usedWoCache;
+ uint32_t xval = 0;
+ for (std::deque<MemoryTimeEntry>::const_iterator it
+ = _memoryHistory.begin(); it != _memoryHistory.end();
+ ++it, ++xval)
+ {
+ used.push_back(Graph::Point(xval, it->used));
+ usedWoCache.push_back(Graph::Point(xval, it->usedWithoutCache));
+ }
+ used.push_back(Graph::Point(
+ xval, _states[0]->_data.getUsedSize() / mb));
+ usedWoCache.push_back(Graph::Point(
+ xval, _states[0]->_data.getUsedSizeIgnoringCache() / mb));
+ uint32_t totalSize = _states[0]->_maxMemory / mb;
+ total.push_back(Graph::Point(0, totalSize));
+ total.push_back(Graph::Point(xval, totalSize));
+ memoryHistory.add(total, "Total memory", Graph::GREEN);
+ memoryHistory.add(used, "Used memory", Graph::YELLOW);
+ memoryHistory.add(usedWoCache, "Used memory excluding freeable cache",
+ Graph::RED);
+ out << "<p>Memory available for lowest priority (255): "
+ << _manager.getMemorySizeFreeForPriority(255) << " byte(s).</p>\n";
+ out << "<h3>Historic memory usage</h3>\n";
+ uint32_t yAxisUnit = ((totalSize / 4) / 256) * 256;
+ if (yAxisUnit == 0) yAxisUnit = (totalSize / 4);
+ if (yAxisUnit == 0) yAxisUnit = 1;
+ uint32_t size = yAxisUnit;
+ memoryHistory.addYAxisLabel(0, "0 B");
+ while (size <= totalSize) {
+ std::ostringstream label;
+ if (size % 1024 == 0) {
+ label << (size / 1024) << " GB";
+ } else {
+ label << size << " MB";
+ }
+ memoryHistory.addYAxisLabel(size, label.str());
+ size += yAxisUnit;
+ }
+ uint32_t xAxisUnit = ((_memoryHistory.size() / 4) / 24) * 24;
+ if (xAxisUnit == 0) xAxisUnit = _memoryHistoryPeriod.getTime();
+ uint32_t startTime = ((currentTime.getTime()
+ / _memoryHistoryPeriod.getTime())
+ / 24) * 24;
+ uint32_t stopTime = (currentTime.getTime()
+ / _memoryHistoryPeriod.getTime())
+ - _memoryHistory.size() + 1;
+ memoryHistory.addXAxisLabel(xval, currentTime.toString());
+ bool addedMiddlePoints = false;
+ while (startTime >= stopTime) {
+ if (currentTime.getTime() / _memoryHistoryPeriod.getTime()
+ - startTime > 48)
+ {
+ memoryHistory.addXAxisLabel(
+ (startTime - stopTime),
+ framework::SecondTime(
+ startTime * _memoryHistoryPeriod.getTime())
+ .toString());
+ addedMiddlePoints = true;
+ }
+ startTime -= xAxisUnit;
+ }
+ if (!addedMiddlePoints && _memoryHistory.size() > 2) {
+ memoryHistory.addXAxisLabel(
+ 1,
+ framework::SecondTime(
+ stopTime * _memoryHistoryPeriod.getTime())
+ .toString());
+ }
+ memoryHistory.setBorders(50, 0, 0, 30);
+ memoryHistory.setLegendPos(80, 20);
+ memoryHistory.printCanvas(out, 1000, 250);
+ memoryHistory.printScript(out, "");
+ }
+ uint32_t maxUsedWithoutCache = 0;
+ for (uint32_t i=0; i<_states.size(); ++i) {
+ Entry& e(*_states[i]);
+ if (!e.containsData()
+ || e._data.getUsedSizeIgnoringCache() == maxUsedWithoutCache)
+ {
+ continue;
+ }
+ printSnapshot(out, e, colors);
+ maxUsedWithoutCache = e._data.getUsedSizeIgnoringCache();
+ }
+ out << "<h3>Raw output of stored data</h3>\n"
+ << "<pre>\n";
+ monitor.unlock();
+ printDebugOutput(out);
+ out << "</pre>\n";
+ out << "<h2>Memory used for metrics. (Not tracked in memory manager)</h2>\n"
+ << "<pre>\n"
+ << _metricManager.getMemoryConsumption(_metricManager.getMetricLock())->toString()
+ << "\n</pre>\n";
+}
+
+void
+MemoryStatusViewer::run(framework::ThreadHandle& thread)
+{
+ while (!thread.interrupted()) {
+ vespalib::MonitorGuard monitor(_workerMonitor);
+ framework::SecondTime currentTime(
+ _component.getClock().getTimeInSeconds());
+ if (_lastHistoryUpdate + _memoryHistoryPeriod <= currentTime
+ || _states[0]->_timeTaken + _memoryHistoryPeriod <= currentTime)
+ {
+ grabMemoryUsage();
+ _processedTime = currentTime;
+ LOG(spam, "Done processing time %" PRIu64, currentTime.getTime());
+ thread.registerTick(framework::PROCESS_CYCLE);
+ } else {
+ monitor.wait(thread.getWaitTime());
+ thread.registerTick(framework::WAIT_CYCLE);
+ }
+ }
+}
+
+// You should have worker monitor when calling this function
+void
+MemoryStatusViewer::grabMemoryUsage()
+{
+ framework::SecondTime currentTime(_component.getClock().getTimeInSeconds());
+ MemoryState state(_component.getClock(), 0);
+ _manager.getState(state, true);
+
+ if (_lastHistoryUpdate + _memoryHistoryPeriod <= currentTime) {
+ LOG(spam, "Adding another %" PRIu64 " sec entry to memory history.",
+ _memoryHistoryPeriod.getTime());
+ // Add history once an hour
+ uint32_t mb = 1024 * 1024;
+ _memoryHistory.push_back(MemoryTimeEntry(
+ state.getMaxSnapshot().getUsedSize() / mb,
+ state.getMaxSnapshot().getUsedSizeIgnoringCache() / mb));
+ if (_memoryHistory.size() > _memoryHistorySize) {
+ if (_memoryHistoryPeriod != framework::SecondTime(60 * 60)) {
+ uint32_t periodDiff = 60 * 60 / _memoryHistoryPeriod.getTime();
+ std::deque<MemoryTimeEntry> newHistory;
+ uint32_t count = 0;
+ MemoryTimeEntry entry(0, 0);
+ for (std::deque<MemoryTimeEntry>::const_iterator it
+ = _memoryHistory.begin();
+ it != _memoryHistory.end(); ++it)
+ {
+ entry.keepMax(*it);
+ if (++count == periodDiff) {
+ newHistory.push_back(entry);
+ entry = MemoryTimeEntry(0, 0);
+ count = 0;
+ }
+ }
+ if (entry.used != 0) {
+ newHistory.push_back(entry);
+ }
+ _memoryHistory.swap(newHistory);
+ _memoryHistoryPeriod = framework::SecondTime(60 * 60);
+ }
+ }
+ _lastHistoryUpdate += _memoryHistoryPeriod;
+ if (_lastHistoryUpdate + _allowedSlackPeriod < currentTime) {
+ LOGBP(warning, "Memory history is supposed to be tracked every %"
+ PRIu64 " seconds, but %" PRIu64" seconds have passed "
+ "since last update. Memory history graph will be "
+ "incorrect.",
+ _memoryHistoryPeriod.getTime(),
+ (currentTime - _lastHistoryUpdate + _memoryHistoryPeriod)
+ .getTime());
+ _lastHistoryUpdate = currentTime;
+ }
+ }
+ LOG(spam, "Overwriting current with snapshot using %" PRIu64 " bytes.",
+ state.getCurrentSnapshot().getUsedSize());
+ _states[0]->assign(state.getCurrentSnapshot(),
+ state.getTotalSize(), currentTime);
+ for (uint32_t i=1, n=_states.size(); i<n; ++i) {
+ if (currentTime - _states[i]->_timeTaken >= _states[i]->_maxAge
+ || state.getMaxSnapshot().getUsedSize()
+ > _states[i]->_data.getUsedSize())
+ {
+ LOG(spam, "Updating period %s usage. Old usage was %" PRIu64 ". "
+ "Last set at %" PRIu64,
+ _states[i]->_name.c_str(), _states[i]->_data.getUsedSize(),
+ _states[i]->_timeTaken.getTime());
+ _states[i]->assign(state.getMaxSnapshot(),
+ state.getTotalSize(), currentTime);
+ }
+ }
+}
+
+void
+MemoryStatusViewer::notifyThread() const
+{
+ vespalib::MonitorGuard monitor(_workerMonitor);
+ monitor.broadcast();
+}
+
+void
+MemoryStatusViewer::printDebugOutput(std::ostream& out) const
+{
+ vespalib::MonitorGuard monitor(_workerMonitor);
+ for (uint32_t i=0; i<_states.size(); ++i) {
+ Entry& e(*_states[i]);
+ out << e._name << ": ";
+ if (e.containsData()) {
+ out << e._timeTaken.toString() << " Max memory " << e._maxMemory << " ";
+ e._data.print(out, true, " ");
+ } else {
+ out << "na";
+ }
+ out << "\n\n";
+ }
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/frameworkimpl/memory/memorystatusviewer.h b/storage/src/vespa/storage/frameworkimpl/memory/memorystatusviewer.h
new file mode 100644
index 00000000000..120eea1a457
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/memory/memorystatusviewer.h
@@ -0,0 +1,139 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::MemoryStatusViewer
+ *
+ * \brief Generates status to access through status pages.
+ *
+ * Keeps a history of the largest memory inprints seen historically. This is
+ * done be defining periods, where a period is always a multiplum of the length
+ * of the period shorter than it. The last entry will store the biggest memory
+ * imprint ever seen, and the earlier entries will show biggest for their time
+ * period.
+ *
+ * To avoid having all periods cleared once the biggest period resets, the
+ * periods keep data for each of the periods one size below it. Thus, a year
+ * keeps data for 12 months, a month for 30 days, and so on.
+ *
+ * The memory state objects are divided in 3 parts. Current memory data, max
+ * memory data since since reset and counts for how often various events have
+ * happened.
+ *
+ * The counts will have their total count values stored in the current entry.
+ * When the next period is updated getting a copy of these counts, we can see
+ * how many counts have happened recently, by taking the current entry and
+ * subtract those accounted for earlier.
+ *
+ * The current memory data will not be interesting for anything than to show the
+ * actual now values in the current entry.
+ *
+ * The max since reset values will be the values used for the various periods.
+ * When a period is updated with new data for a subpart of their period, the
+ * max seen data is reset in the period in front, such that a lower maximum
+ * can be found.
+ */
+
+#pragma once
+
+#include <deque>
+#include <vespa/vespalib/util/document_runnable.h>
+#include <vespa/storage/common/storagecomponent.h>
+#include <vespa/storageframework/defaultimplementation/memory/memorystate.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vector>
+#include <vespa/vespalib/util/sync.h>
+
+namespace metrics {
+ class MetricManager;
+}
+
+namespace storage {
+
+class StorageServerInterface;
+
+class MemoryStatusViewer : public framework::HtmlStatusReporter,
+ private framework::Runnable
+{
+public:
+ typedef framework::defaultimplementation::MemoryState::SnapShot SnapShot;
+ struct Entry {
+ typedef std::shared_ptr<Entry> SP;
+
+ std::string _name;
+ framework::SecondTime _maxAge;
+ framework::SecondTime _timeTaken;
+ SnapShot _data;
+ uint64_t _maxMemory;
+
+ Entry(const std::string& name, framework::Clock&,
+ framework::SecondTime maxAge);
+ bool containsData() const { return (_maxMemory != 0); }
+
+ void assign(const SnapShot& snapshot, uint64_t maxMemory,
+ framework::SecondTime time)
+ {
+ _data = snapshot;
+ _maxMemory = maxMemory;
+ _timeTaken = time;
+ }
+ };
+
+ struct MemoryTimeEntry {
+ uint64_t used;
+ uint64_t usedWithoutCache;
+
+ MemoryTimeEntry(uint64_t u, uint64_t wo)
+ : used(u), usedWithoutCache(wo) {}
+
+ void keepMax(const MemoryTimeEntry& e) {
+ used = (used > e.used ? used : e.used);
+ usedWithoutCache = (usedWithoutCache > e.usedWithoutCache
+ ? usedWithoutCache : e.usedWithoutCache);
+ }
+ };
+
+private:
+ framework::Component _component;
+ framework::defaultimplementation::MemoryManager& _manager;
+ const metrics::MetricManager& _metricManager;
+ vespalib::Monitor _workerMonitor;
+
+ std::vector<Entry::SP> _states;
+ std::deque<MemoryTimeEntry> _memoryHistory;
+ uint32_t _memoryHistorySize;
+ framework::SecondTime _memoryHistoryPeriod;
+ framework::SecondTime _allowedSlackPeriod;
+ framework::SecondTime _lastHistoryUpdate;
+ framework::Thread::UP _thread;
+ framework::SecondTime _processedTime;
+
+ void addEntry(const std::string& name, uint32_t maxAge) {
+ _states.push_back(Entry::SP(new Entry(name, _component.getClock(),
+ framework::SecondTime(maxAge))));
+ }
+ void run(framework::ThreadHandle&);
+ void grabMemoryUsage();
+ void printSnapshot(std::ostream& out, Entry& entry,
+ std::map<const framework::MemoryAllocationType*,
+ uint32_t>& colors) const;
+
+public:
+ MemoryStatusViewer(
+ framework::defaultimplementation::MemoryManager&,
+ const metrics::MetricManager&,
+ StorageComponentRegister&);
+ ~MemoryStatusViewer();
+
+ virtual void reportHtmlHeaderAdditions(std::ostream&,
+ const framework::HttpUrlPath&) const;
+ virtual void reportHtmlStatus(std::ostream&,
+ const framework::HttpUrlPath&) const;
+
+ /** Useful for testing. */
+ framework::SecondTime getProcessedTime() const { return _processedTime; }
+ void notifyThread() const;
+ void printDebugOutput(std::ostream&) const;
+
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/frameworkimpl/status/CMakeLists.txt b/storage/src/vespa/storage/frameworkimpl/status/CMakeLists.txt
new file mode 100644
index 00000000000..17a89a343b8
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/status/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_status OBJECT
+ SOURCES
+ statuswebserver.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/frameworkimpl/status/statuswebserver.cpp b/storage/src/vespa/storage/frameworkimpl/status/statuswebserver.cpp
new file mode 100644
index 00000000000..ddfb0bd8a95
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/status/statuswebserver.cpp
@@ -0,0 +1,357 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/frameworkimpl/status/statuswebserver.h>
+
+#include <vespa/document/util/stringutil.h>
+#include <vespa/log/log.h>
+#include <map>
+#include <sstream>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storage/common/vtag.h>
+#include <vespa/fastlib/net/url.h>
+#include <vespa/vespalib/util/host_name.h>
+#include <vespa/vespalib/util/stringfmt.h>
+
+LOG_SETUP(".status");
+
+namespace storage {
+
+StatusWebServer::StatusWebServer(
+ framework::ComponentRegister& componentRegister,
+ framework::StatusReporterMap& reporterMap,
+ const config::ConfigUri & configUri)
+ : _reporterMap(reporterMap),
+ _workerMonitor(),
+ _port(0),
+ _httpServer(),
+ _configFetcher(configUri.getContext()),
+ _queuedRequests(),
+ _component(componentRegister, "Status"),
+ _thread()
+{
+ _configFetcher.subscribe<vespa::config::content::core::StorStatusConfig>(configUri.getConfigId(), this);
+ _configFetcher.start();
+ framework::MilliSecTime maxProcessingTime(60 * 60 * 1000);
+ framework::MilliSecTime maxWaitTime(10 * 1000);
+ _thread = _component.startThread(*this, maxProcessingTime, maxWaitTime);
+
+}
+
+StatusWebServer::~StatusWebServer()
+{
+ // Avoid getting config during shutdown
+ _configFetcher.close();
+
+ if (_httpServer.get() != 0) {
+ LOG(debug, "Shutting down status web server on port %u",
+ _httpServer->getListenPort());
+ }
+ // Delete http server to ensure that no more incoming requests reach us.
+ _httpServer.reset(0);
+
+ // Stop internal thread such that we don't process anymore web requests
+ if (_thread.get() != 0) {
+ _thread->interruptAndJoin(&_workerMonitor);
+ }
+}
+
+void StatusWebServer::configure(std::unique_ptr<vespa::config::content::core::StorStatusConfig> config)
+{
+ int newPort = config->httpport;
+ // If server is already running, ignore config updates that doesn't
+ // alter port, or suggests random port.
+ if (_httpServer.get() != 0) {
+ if (newPort == 0 || newPort == _port) return;
+ }
+ // Try to create new server before destroying old.
+ LOG(info, "Starting status web server on port %u.", newPort);
+ std::unique_ptr<WebServer> server;
+ // Negative port number means don't run the web server
+ if (newPort >= 0) {
+ server.reset(new WebServer(*this, newPort));
+ server->SetKeepAlive(false);
+
+ bool started = false;
+ switch (server->Start()) {
+ case FASTLIB_SUCCESS:
+ started = true;
+ break;
+ case FASTLIB_HTTPSERVER_BADLISTEN:
+ LOG(warning, "Listen failed on port %u", newPort);
+ break;
+ case FASTLIB_HTTPSERVER_NEWTHREADFAILED:
+ LOG(warning, "Failed starting thread for status server on "
+ "port %u", newPort);
+ break;
+ case FASTLIB_HTTPSERVER_ALREADYSTARTED:
+ LOG(warning, "Failed starting status server on port %u "
+ "(already started?)", newPort);
+ break;
+ default:
+ LOG(warning, "Failed starting status server on port %u "
+ "(unknown reason)", newPort);
+ break;
+ }
+ if (!started) {
+ std::ostringstream ost;
+ ost << "Failed to start status HTTP server using port " << newPort
+ << ".";
+ if (_httpServer.get() != 0) {
+ ost << " Status server still running on port " << _port
+ << " instead of suggested port " << newPort;
+ }
+ throw vespalib::IllegalStateException(ost.str(), VESPA_STRLOC);
+ }
+ // Now that we know config update went well, update internal state
+ _port = server->getListenPort();
+ LOG(config, "Status pages now available on port %u", _port);
+ if (_httpServer.get() != 0) {
+ LOG(debug, "Shutting down old status server.");
+ _httpServer.reset(0);
+ LOG(debug, "Done shutting down old status server.");
+ }
+ } else if (_httpServer.get() != 0) {
+ LOG(info, "No longer running status server as negative port was given "
+ "in config, indicating not to run a server.");
+ }
+ _httpServer = std::move(server);
+}
+
+StatusWebServer::WebServer::WebServer(StatusWebServer& status, uint16_t port)
+ : Fast_HTTPServer(port, NULL, 100, false, 128*1024, 10),
+ _status(status),
+ _serverSpec(vespalib::make_string("%s:%d", vespalib::HostName::get().c_str(), port))
+{
+}
+
+
+namespace {
+ /** Utility class for printing HTTP errors. */
+ struct HttpErrorWriter {
+ std::ostream& _out;
+
+ HttpErrorWriter(std::ostream& out, vespalib::stringref error)
+ : _out(out)
+ {
+ _out << "HTTP/1.1 " << error << "\r\n"
+ "Connection: Close\r\n"
+ "Content-type: text/html\r\n\r\n"
+ "<html><head><title>" << error << "</title></head>\r\n"
+ "<body><h1>" << error << "</h1>\r\n"
+ "<p>";
+ }
+
+ template<typename T>
+ HttpErrorWriter& operator<<(const T& t) {
+ _out << t;
+ return *this;
+ }
+
+ ~HttpErrorWriter() {
+ _out << "</p></body>\r\n"
+ "</html>\r\n";
+ }
+ };
+}
+
+void
+StatusWebServer::WebServer::onGetRequest(const string & tmpurl, const string &serverSpec,
+ Fast_HTTPConnection& conn)
+{
+ Fast_URL urlCodec;
+ int bufLength = tmpurl.length() * 2 + 10;
+ char * encodedUrl = new char[bufLength];
+ strcpy(encodedUrl, tmpurl.c_str());
+ char decodedUrl[bufLength];
+ urlCodec.DecodeQueryString(encodedUrl);
+ urlCodec.decode(encodedUrl, decodedUrl, bufLength);
+ delete [] encodedUrl;
+
+ string url = decodedUrl;
+
+ LOG(debug, "Status got get request '%s'", url.c_str());
+ framework::HttpUrlPath urlpath(url.c_str(),
+ StatusWebServer::getServerSpec(serverSpec, getServerSpec()));
+ std::string link(urlpath.getPath());
+ if (link.size() > 0 && link[0] == '/') link = link.substr(1);
+ // Only allow crucial components not locking to answer directly.
+ // (We want deadlockdetector status page to be available during a
+ // deadlock
+ if (link == "" || link == "deadlockdetector") {
+ std::ostringstream ost;
+ _status.handlePage(urlpath, ost);
+ conn.Output(ost.str().c_str());
+ } else {
+ // Route other status requests that can possibly deadlock to a
+ // worker thread.
+ vespalib::MonitorGuard monitor(_status._workerMonitor);
+ _status._queuedRequests.push_back(
+ HttpRequest::SP(new HttpRequest(url.c_str(), urlpath.getServerSpec())));
+ HttpRequest* req = _status._queuedRequests.back().get();
+ framework::SecondTime timeout(urlpath.get("timeout", 30u));
+ framework::SecondTime timeoutTime(
+ _status._component.getClock().getTimeInSeconds() + timeout);
+ monitor.signal();
+ while (true) {
+ monitor.wait(100);
+ bool done = false;
+ if (req->_result.get()) {
+ conn.Output(req->_result->c_str());
+ LOG(debug,
+ "Finished status request for '%s'",
+ req->_url.c_str());
+ done = true;
+ } else {
+ if (_status._component.getClock().getTimeInSeconds()
+ > timeoutTime)
+ {
+ std::ostringstream ost;
+ {
+ HttpErrorWriter writer(
+ ost, "500 Internal Server Error");
+ writer << "Request " << url.c_str() << " timed out "
+ << "after " << timeout << " seconds.";
+ }
+ LOG(debug,
+ "HTTP status request failed: %s. %zu requests queued",
+ ost.str().c_str(),
+ _status._queuedRequests.size() - 1);
+ conn.Output(ost.str().c_str());
+ done = true;
+ }
+ }
+ if (done) {
+ for (std::list<HttpRequest::SP>::iterator it
+ = _status._queuedRequests.begin();
+ it != _status._queuedRequests.end(); ++it)
+ {
+ if (it->get() == req) {
+ _status._queuedRequests.erase(it);
+ break;
+ }
+ }
+ break;
+ }
+ }
+ }
+}
+
+namespace {
+ class IndexPageReporter : public framework::HtmlStatusReporter {
+ std::ostringstream ost;
+ virtual void reportHtmlStatus(std::ostream& out,
+ const framework::HttpUrlPath&) const
+ {
+ out << ost.str();
+ }
+
+ public:
+ IndexPageReporter() : framework::HtmlStatusReporter("", "Index page") {}
+
+ template<typename T>
+ IndexPageReporter& operator<<(const T& t) { ost << t; return *this; }
+ };
+}
+
+void
+StatusWebServer::handlePage(const framework::HttpUrlPath& urlpath,
+ std::ostream& out)
+{
+ vespalib::string link(urlpath.getPath());
+ if (link.size() > 0 && link[0] == '/') link = link.substr(1);
+
+ size_t slashPos = link.find('/');
+ if (slashPos != std::string::npos) link = link.substr(0, slashPos);
+
+ const framework::StatusReporter* reporter = 0;
+ if (link.size() > 0) {
+ reporter = _reporterMap.getStatusReporter(link);
+ }
+ bool pageExisted = false;
+ if (reporter != 0) {
+ try{
+ pageExisted = reporter->reportHttpHeader(out, urlpath);
+ if (pageExisted) {
+ pageExisted = reporter->reportStatus(out, urlpath);
+ }
+ } catch (std::exception& e) {
+ HttpErrorWriter writer(out, "500 Internal Server Error");
+ writer << "<pre>" << e.what() << "</pre>";
+ pageExisted = true;
+ } catch (...) {
+ HttpErrorWriter writer(out, "500 Internal Server Error");
+ writer << "Unknown exception";
+ pageExisted = true;
+ }
+ if (pageExisted) {
+ LOG(spam, "Status finished request");
+ return;
+ }
+ }
+ if (!pageExisted && link.size() > 0) {
+ HttpErrorWriter writer(out, "404 Not found");
+ } else {
+ IndexPageReporter indexRep;
+ indexRep << "<p><b>Binary version of Vespa:</b> "
+ << Vtag::currentVersion.toString()
+ << "</p>\n";
+ {
+ std::vector<const framework::StatusReporter*> reporters(
+ _reporterMap.getStatusReporters());
+ for (uint32_t i=0; i<reporters.size(); ++i) {
+ indexRep << "<a href=\"" << reporters[i]->getId() << "\">"
+ << reporters[i]->getName() << "</a><br>\n";
+ }
+ }
+ indexRep.reportHttpHeader(out, urlpath);
+ indexRep.reportStatus(out, urlpath);
+ }
+ LOG(spam, "Status finished request");
+}
+
+vespalib::string
+StatusWebServer::getServerSpec(const vespalib::string &specFromRequest,
+ const vespalib::string &specFromServer)
+{
+ if (specFromRequest.empty()) {
+ // This is a fallback in case the request spec is not given (HTTP 1.0 header)
+ return specFromServer;
+ }
+ return specFromRequest;
+}
+
+void
+StatusWebServer::run(framework::ThreadHandle& thread)
+{
+ while (!thread.interrupted()) {
+ HttpRequest::SP request;
+ {
+ vespalib::MonitorGuard monitor(_workerMonitor);
+ for (std::list<HttpRequest::SP>::iterator it
+ = _queuedRequests.begin(); it != _queuedRequests.end();
+ ++it)
+ {
+ if ((*it)->_result.get() == 0) {
+ request = *it;
+ break;
+ }
+ }
+ if (!request.get()) {
+ monitor.wait(10 * 1000);
+ thread.registerTick(framework::WAIT_CYCLE);
+ continue;
+ }
+ }
+ framework::HttpUrlPath urlpath(request->_url, request->_serverSpec);
+ std::ostringstream ost;
+ handlePage(urlpath, ost);
+ // If the same request is still in front of the queue
+ // (it hasn't timed out), add the result to it.
+ vespalib::MonitorGuard monitor(_workerMonitor);
+ request->_result.reset(new vespalib::string(ost.str()));
+ monitor.signal();
+ thread.registerTick(framework::PROCESS_CYCLE);
+ }
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/frameworkimpl/status/statuswebserver.h b/storage/src/vespa/storage/frameworkimpl/status/statuswebserver.h
new file mode 100644
index 00000000000..009adb0c2d0
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/status/statuswebserver.h
@@ -0,0 +1,84 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::Status
+ * @ingroup storageserver
+ *
+ * @brief Storage link handling status.
+ *
+ * @version $Id: status.h 126730 2011-09-30 14:02:22Z humbe $
+ */
+
+#pragma once
+
+#include <boost/noncopyable.hpp>
+#include <vespa/vespalib/stllike/string.h>
+#include <vespa/fastlib/net/httpserver.h>
+#include <list>
+#include <vespa/storage/config/config-stor-status.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/config/config.h>
+#include <vespa/config/helper/configfetcher.h>
+
+namespace storage {
+
+class StatusWebServer : private boost::noncopyable,
+ private config::IFetcherCallback<vespa::config::content::core::StorStatusConfig>,
+ private framework::Runnable
+{
+ class WebServer : public Fast_HTTPServer {
+ StatusWebServer& _status;
+ vespalib::string _serverSpec;
+
+ public:
+ WebServer(StatusWebServer&, uint16_t port);
+
+ virtual void onGetRequest(const string & url,
+ const string & serverSpec,
+ Fast_HTTPConnection& conn);
+ const vespalib::string &getServerSpec() const {
+ return _serverSpec;
+ }
+ };
+ struct HttpRequest {
+ typedef std::shared_ptr<HttpRequest> SP;
+
+ vespalib::string _url;
+ vespalib::string _serverSpec;
+ std::unique_ptr<vespalib::string> _result;
+
+ HttpRequest(vespalib::stringref url, vespalib::stringref serverSpec)
+ : _url(url),
+ _serverSpec(serverSpec),
+ _result()
+ {}
+ };
+
+ framework::StatusReporterMap& _reporterMap;
+ vespalib::Monitor _workerMonitor;
+ uint16_t _port;
+ std::unique_ptr<WebServer> _httpServer;
+ config::ConfigFetcher _configFetcher;
+ std::list<HttpRequest::SP> _queuedRequests;
+ framework::Component _component;
+ framework::Thread::UP _thread;
+
+public:
+ StatusWebServer(framework::ComponentRegister&,
+ framework::StatusReporterMap&,
+ const config::ConfigUri & configUri);
+ virtual ~StatusWebServer();
+
+ void handlePage(const framework::HttpUrlPath&, std::ostream& out);
+
+ static vespalib::string getServerSpec(const vespalib::string &requestSpec,
+ const vespalib::string &serverSpec);
+
+private:
+ virtual void configure(std::unique_ptr<vespa::config::content::core::StorStatusConfig> config);
+ void getPage(const char* url, Fast_HTTPConnection& conn);
+ virtual void run(framework::ThreadHandle&);
+
+};
+
+}
+
diff --git a/storage/src/vespa/storage/frameworkimpl/thread/CMakeLists.txt b/storage/src/vespa/storage/frameworkimpl/thread/CMakeLists.txt
new file mode 100644
index 00000000000..08a6d9860e8
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/thread/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_thread OBJECT
+ SOURCES
+ deadlockdetector.cpp
+ appkiller.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/frameworkimpl/thread/appkiller.cpp b/storage/src/vespa/storage/frameworkimpl/thread/appkiller.cpp
new file mode 100644
index 00000000000..f124a2849c3
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/thread/appkiller.cpp
@@ -0,0 +1,18 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/storage/frameworkimpl/thread/appkiller.h>
+
+#include <vespa/log/log.h>
+
+LOG_SETUP(".deadlock.killer");
+
+namespace storage {
+
+void RealAppKiller::kill() {
+ LOG(info, "Aborting the server to dump core, as we're "
+ "most likely deadlocked and want a core file "
+ "to view the stack traces.");
+ abort();
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/frameworkimpl/thread/appkiller.h b/storage/src/vespa/storage/frameworkimpl/thread/appkiller.h
new file mode 100644
index 00000000000..b785c98e505
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/thread/appkiller.h
@@ -0,0 +1,29 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::AppKiller
+ * @ingroup thread
+ * @brief A class for killing a storage process
+ *
+ * The app killer is a utility used by the deadlock detector to kill the
+ * process. This is separated into this utility such that the deadlock
+ * detector itself can use a fake killer to test the functionality.
+ */
+
+#pragma once
+
+#include <memory>
+
+namespace storage {
+
+struct AppKiller {
+ typedef std::unique_ptr<AppKiller> UP;
+ virtual ~AppKiller() {}
+ virtual void kill() = 0;
+};
+
+struct RealAppKiller : public AppKiller {
+ virtual void kill();
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/frameworkimpl/thread/deadlockdetector.cpp b/storage/src/vespa/storage/frameworkimpl/thread/deadlockdetector.cpp
new file mode 100644
index 00000000000..ee10584ce69
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/thread/deadlockdetector.cpp
@@ -0,0 +1,341 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/frameworkimpl/thread/deadlockdetector.h>
+
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/vdslib/state/nodetype.h>
+//#include <vespalib/util/htmlutils.h>
+#include <vespa/storage/bucketmover/htmltable.h>
+
+LOG_SETUP(".deadlock.detector");
+
+namespace storage {
+
+DeadLockDetector::DeadLockDetector(StorageComponentRegister& compReg,
+ AppKiller::UP killer)
+ : framework::HtmlStatusReporter("deadlockdetector", "Dead lock detector"),
+ _killer(std::move(killer)),
+ _states(),
+ _waiter(),
+ _enableWarning(true),
+ _enableShutdown(false),
+ _processSlackMs(30 * 1000),
+ _waitSlackMs(5 * 1000),
+ _reportedBucketDBLocksAtState(OK)
+{
+ DistributorComponentRegister* dComp(
+ dynamic_cast<DistributorComponentRegister*>(&compReg));
+ if (dComp) {
+ _dComponent.reset(new DistributorComponent(*dComp, "deadlockdetector"));
+ _component = _dComponent.get();
+ } else {
+ ServiceLayerComponentRegister* slComp(
+ dynamic_cast<ServiceLayerComponentRegister*>(&compReg));
+ assert(slComp != 0);
+ _slComponent.reset(new ServiceLayerComponent(
+ *slComp, "deadlockdetector"));
+ _component = _slComponent.get();
+ }
+ _component->registerStatusPage(*this);
+ _thread = _component->startThread(*this);
+}
+
+DeadLockDetector::~DeadLockDetector()
+{
+ if (_thread.get() != 0) {
+ _thread->interruptAndJoin(&_waiter);
+ }
+}
+
+void
+DeadLockDetector::enableWarning(bool enable)
+{
+ if (enable == _enableWarning) return;
+ LOG(debug, "%s dead lock detection warnings",
+ enable ? "Enabling" : "Disabling");
+ _enableWarning = enable;
+}
+
+void
+DeadLockDetector::enableShutdown(bool enable)
+{
+ if (enable == _enableShutdown) return;
+ LOG(debug, "%s dead lock detection",
+ enable ? "Enabling" : "Disabling");
+ _enableShutdown = enable;
+}
+
+namespace {
+ struct VisitorWrapper : public framework::ThreadVisitor {
+ std::map<vespalib::string, DeadLockDetector::State>& _states;
+ DeadLockDetector::ThreadVisitor& _visitor;
+
+ VisitorWrapper(std::map<vespalib::string, DeadLockDetector::State>& s,
+ DeadLockDetector::ThreadVisitor& visitor)
+ : _states(s),
+ _visitor(visitor)
+ {
+ }
+
+ virtual void visitThread(const vespalib::string& id,
+ const framework::ThreadProperties& p,
+ const framework::ThreadTickData& td)
+ {
+ if (_states.find(id) == _states.end()) {
+ _states[id] = DeadLockDetector::OK;
+ }
+ _visitor.visitThread(id, p, td, _states[id]);
+ }
+ };
+}
+
+void
+DeadLockDetector::visitThreads(ThreadVisitor& visitor) const
+{
+ VisitorWrapper wrapper(_states, visitor);
+ _component->getThreadPool().visitThreads(wrapper);
+}
+
+bool
+DeadLockDetector::isAboveFailThreshold(
+ const framework::MilliSecTime& time,
+ const framework::ThreadProperties& tp,
+ const framework::ThreadTickData& tick) const
+{
+ if (tp.getMaxCycleTime() == 0) {
+ return false;
+ }
+ uint64_t slack(tick._lastTickType == framework::WAIT_CYCLE
+ ? getWaitSlack().getTime() : getProcessSlack().getTime());
+ return (tick._lastTickMs + tp.getMaxCycleTime() + slack < time.getTime());
+}
+
+bool
+DeadLockDetector::isAboveWarnThreshold(
+ const framework::MilliSecTime& time,
+ const framework::ThreadProperties& tp,
+ const framework::ThreadTickData& tick) const
+{
+ if (tp.getMaxCycleTime() == 0) return false;
+ uint64_t slack(tick._lastTickType == framework::WAIT_CYCLE
+ ? getWaitSlack().getTime() : getProcessSlack().getTime());
+ return (tick._lastTickMs + tp.getMaxCycleTime() + slack / 4 < time.getTime());
+}
+
+vespalib::string
+DeadLockDetector::getBucketLockInfo() const
+{
+ vespalib::asciistream ost;
+ if (_dComponent.get()) {
+ if (_dComponent->getBucketDatabase().size() > 0) {
+ //_dComponent->getBucketDatabase().showLockClients(ost);
+ ost << "No bucket lock information available for distributor\n";
+ }
+ } else {
+ if (_slComponent->getBucketDatabase().size() > 0) {
+ _slComponent->getBucketDatabase().showLockClients(ost);
+ }
+ }
+ return ost.str();
+}
+
+namespace {
+ struct ThreadChecker : public DeadLockDetector::ThreadVisitor
+ {
+ DeadLockDetector& _detector;
+ framework::MilliSecTime _currentTime;
+
+ ThreadChecker(DeadLockDetector& d, const framework::MilliSecTime& time)
+ : _detector(d), _currentTime(time) {}
+
+ virtual void visitThread(const vespalib::string& id,
+ const framework::ThreadProperties& tp,
+ const framework::ThreadTickData& tick,
+ DeadLockDetector::State& state)
+ {
+ // In case we just got a new tick, ignore the thread
+ if (tick._lastTickMs > _currentTime.getTime()) return;
+ // If thread is already in halted state, ignore it.
+ if (state == DeadLockDetector::HALTED) return;
+
+ if (_detector.isAboveFailThreshold(_currentTime, tp, tick)) {
+ state = DeadLockDetector::HALTED;
+ _detector.handleDeadlock(_currentTime, id, tp, tick, false);
+ } else if (_detector.isAboveWarnThreshold(_currentTime, tp, tick)) {
+ state = DeadLockDetector::WARNED;
+ _detector.handleDeadlock(_currentTime, id, tp, tick, true);
+ } else if (state != DeadLockDetector::OK) {
+ vespalib::asciistream ost;
+ ost << "Thread " << id << " has registered tick again.\n";
+ LOGBT(info, "%s", ost.str().c_str());
+ state = DeadLockDetector::OK;
+ }
+ }
+ };
+}
+
+void
+DeadLockDetector::handleDeadlock(const framework::MilliSecTime& currentTime,
+ const vespalib::string& id,
+ const framework::ThreadProperties&,
+ const framework::ThreadTickData& tick,
+ bool warnOnly)
+{
+ vespalib::asciistream error;
+ error << "Thread " << id << " has gone "
+ << (currentTime.getTime() - tick._lastTickMs)
+ << " milliseconds without registering a tick.";
+ if (!warnOnly) {
+ if (_enableShutdown && !warnOnly) {
+ error << " Restarting process due to deadlock.";
+ } else {
+ error << " Would have restarted process due to "
+ << "deadlock if shutdown had been enabled.";
+ }
+ } else {
+ error << " Global slack not expended yet. Warning for now.";
+ }
+ if (warnOnly) {
+ if (_enableWarning) {
+ LOGBT(warning, "deadlockw-" + id, "%s",
+ error.str().c_str());
+ if (_reportedBucketDBLocksAtState != WARNED) {
+ _reportedBucketDBLocksAtState = WARNED;
+ LOG(info, "Locks in bucket database at deadlock time:"
+ "\n%s",
+ getBucketLockInfo().c_str());
+ }
+ }
+ return;
+ } else {
+ if (_enableShutdown || _enableWarning) {
+ LOGBT(error, "deadlock-" + id, "%s",
+ error.str().c_str());
+ }
+ }
+ if (!_enableShutdown) return;
+ if (_reportedBucketDBLocksAtState != HALTED) {
+ _reportedBucketDBLocksAtState = HALTED;
+ LOG(info, "Locks in bucket database at deadlock time:"
+ "\n%s", getBucketLockInfo().c_str());
+ }
+ if (_enableShutdown) {
+ _killer->kill();
+ }
+}
+
+void
+DeadLockDetector::run(framework::ThreadHandle& thread)
+{
+ vespalib::MonitorGuard sync(_waiter);
+ while (!thread.interrupted()) {
+ framework::MilliSecTime time(_component->getClock().getTimeInMillis());
+ ThreadChecker checker(*this, time);
+ visitThreads(checker);
+ sync.wait(1000);
+ thread.registerTick(framework::WAIT_CYCLE);
+ }
+}
+
+namespace {
+ struct ThreadTable {
+ HtmlTable _table;
+ LongColumn _msSinceLastTick;
+ LongColumn _maxProcTickTime;
+ LongColumn _maxWaitTickTime;
+ LongColumn _maxProcTickTimeSeen;
+ LongColumn _maxWaitTickTimeSeen;
+
+ ThreadTable()
+ : _table("Thread name"),
+ _msSinceLastTick("Milliseconds since last tick", " ms", &_table),
+ _maxProcTickTime("Max milliseconds before wait tick", " ms", &_table),
+ _maxWaitTickTime("Max milliseconds before wait tick", " ms", &_table),
+ _maxProcTickTimeSeen("Max processing tick time observed", " ms", &_table),
+ _maxWaitTickTimeSeen("Max wait tick time observed", " ms", &_table)
+ {
+ _maxProcTickTime._alignment = Column::LEFT;
+ _maxProcTickTimeSeen._alignment = Column::LEFT;
+ _maxWaitTickTimeSeen._alignment = Column::LEFT;
+ }
+ };
+ struct ThreadStatusWriter : public DeadLockDetector::ThreadVisitor {
+ ThreadTable& _table;
+ framework::MilliSecTime _time;
+ framework::MilliSecTime _processSlack;
+ framework::MilliSecTime _waitSlack;
+
+ ThreadStatusWriter(ThreadTable& table,
+ const framework::MilliSecTime& time,
+ framework::MilliSecTime processSlack,
+ framework::MilliSecTime waitSlack)
+ : _table(table), _time(time),
+ _processSlack(processSlack), _waitSlack(waitSlack) {}
+
+ template<typename T>
+ vespalib::string toS(const T& val) {
+ vespalib::asciistream ost;
+ ost << val;
+ return ost.str();
+ }
+
+ void visitThread(const vespalib::string& id,
+ const framework::ThreadProperties& tp,
+ const framework::ThreadTickData& tick,
+ DeadLockDetector::State& /*state*/)
+ {
+ _table._table.addRow(id);
+ uint32_t i = _table._table.getRowCount() - 1;
+ _table._msSinceLastTick[i] = _time.getTime() - tick._lastTickMs;
+ _table._maxProcTickTime[i] = tp.getMaxProcessTime();
+ _table._maxWaitTickTime[i] = tp.getWaitTime();
+ _table._maxProcTickTimeSeen[i] = tick._maxProcessingTimeSeenMs;
+ _table._maxWaitTickTimeSeen[i] = tick._maxWaitTimeSeenMs;
+ }
+ };
+}
+
+void
+DeadLockDetector::reportHtmlStatus(std::ostream& os,
+ const framework::HttpUrlPath&) const
+{
+ vespalib::asciistream out;
+ out << "<h2>Overview of latest thread ticks</h2>\n";
+ ThreadTable threads;
+ vespalib::MonitorGuard monitor(_waiter);
+ framework::MilliSecTime time(_component->getClock().getTimeInMillis());
+ ThreadStatusWriter writer(threads, time, getProcessSlack(), getWaitSlack());
+ visitThreads(writer);
+ std::ostringstream ost;
+ threads._table.print(ost);
+ out << ost.str();
+ out << "<p>\n"
+ << "Note that there is a global slack period of " << getProcessSlack()
+ << " ms for processing ticks and " << getWaitSlack()
+ << " ms for wait ticks. Actual shutdown or warning logs will not"
+ << " appear before this slack time is expendede on top of the per"
+ << " thread value.\n"
+ << "</p>\n";
+ if (_enableShutdown) {
+ out << "<p>The deadlock detector is enabled and will kill the process "
+ << "if a deadlock is detected</p>\n";
+ } else {
+ out << "<p>The deadlock detector is disabled and will only monitor "
+ << "tick times.</p>\n";
+ }
+ out << "<h2>Current locks in the bucket database</h2>\n"
+ << "<p>In case of a software bug causing a deadlock in the code, bucket"
+ << " database locks are a likely reason. Thus, we list current locks "
+ << "here in hopes that it will simplify debugging.</p>\n"
+ << "<p>Bucket database</p>\n"
+ << "<pre>\n"
+ << getBucketLockInfo()
+ << "</pre>\n";
+ os << out.str();
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/frameworkimpl/thread/deadlockdetector.h b/storage/src/vespa/storage/frameworkimpl/thread/deadlockdetector.h
new file mode 100644
index 00000000000..61ce0b26757
--- /dev/null
+++ b/storage/src/vespa/storage/frameworkimpl/thread/deadlockdetector.h
@@ -0,0 +1,100 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::DeadLockDetector
+ * @ingroup common
+ *
+ * Threads register in the deadlock detector and calls registerTick
+ * periodically. If they do not tick often enough, the deadlock detector
+ * will shut down the node.
+ *
+ * @brief A class for detecting whether storage has entered a deadlock.
+ */
+
+#pragma once
+
+#include <vespa/storage/common/distributorcomponent.h>
+#include <vespa/storage/common/servicelayercomponent.h>
+#include <vespa/storage/frameworkimpl/thread/appkiller.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/vespalib/util/sync.h>
+#include <map>
+#include <atomic>
+
+
+namespace storage {
+
+struct DeadLockDetector : private framework::Runnable,
+ private framework::HtmlStatusReporter
+{
+ enum State { OK, WARNED, HALTED };
+
+ DeadLockDetector(StorageComponentRegister&,
+ AppKiller::UP killer = AppKiller::UP(new RealAppKiller));
+ ~DeadLockDetector();
+
+ void enableWarning(bool enable);
+ void enableShutdown(bool enable);
+ // There are no data read/write dependencies on neither _processSlackMs
+ // nor _waitSlackMs so relaxed ops suffice.
+ void setProcessSlack(framework::MilliSecTime slack) {
+ _processSlackMs.store(slack.getTime(), std::memory_order_relaxed);
+ }
+ framework::MilliSecTime getProcessSlack() const {
+ return framework::MilliSecTime(
+ _processSlackMs.load(std::memory_order_relaxed));
+ }
+ void setWaitSlack(framework::MilliSecTime slack) {
+ _waitSlackMs.store(slack.getTime(), std::memory_order_relaxed);
+ }
+ framework::MilliSecTime getWaitSlack() const {
+ return framework::MilliSecTime(
+ _waitSlackMs.load(std::memory_order_relaxed));
+ }
+
+ // These utility functions are public as internal anonymous classes are
+ // using them. Can also be useful for whitebox testing.
+ struct ThreadVisitor {
+ virtual ~ThreadVisitor() {}
+ virtual void visitThread(const vespalib::string& id,
+ const framework::ThreadProperties&,
+ const framework::ThreadTickData&,
+ State& state) = 0;
+ };
+ void visitThreads(ThreadVisitor&) const;
+
+ bool isAboveFailThreshold(const framework::MilliSecTime& time,
+ const framework::ThreadProperties& tp,
+ const framework::ThreadTickData& tick) const;
+ bool isAboveWarnThreshold(const framework::MilliSecTime& time,
+ const framework::ThreadProperties& tp,
+ const framework::ThreadTickData& tick) const;
+ void handleDeadlock(const framework::MilliSecTime& currentTime,
+ const vespalib::string& id,
+ const framework::ThreadProperties& tp,
+ const framework::ThreadTickData& tick,
+ bool warnOnly);
+
+private:
+ AppKiller::UP _killer;
+ mutable std::map<vespalib::string, State> _states;
+ vespalib::Monitor _waiter;
+ bool _enableWarning;
+ bool _enableShutdown;
+ std::atomic<uint64_t> _processSlackMs;
+ std::atomic<uint64_t> _waitSlackMs;
+ State _reportedBucketDBLocksAtState;
+ DistributorComponent::UP _dComponent;
+ ServiceLayerComponent::UP _slComponent;
+ StorageComponent* _component;
+ framework::Thread::UP _thread;
+
+ virtual void run(framework::ThreadHandle&);
+
+ // Status implementation
+ virtual void reportHtmlStatus(std::ostream& out,
+ const framework::HttpUrlPath&) const;
+ vespalib::string getBucketLockInfo() const;
+};
+
+}
+
diff --git a/storage/src/vespa/storage/persistence/.gitignore b/storage/src/vespa/storage/persistence/.gitignore
new file mode 100644
index 00000000000..333f254ba10
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/.gitignore
@@ -0,0 +1,8 @@
+*.So
+*.lo
+.*.swp
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
diff --git a/storage/src/vespa/storage/persistence/CMakeLists.txt b/storage/src/vespa/storage/persistence/CMakeLists.txt
new file mode 100644
index 00000000000..4037bdce550
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_spersistence OBJECT
+ SOURCES
+ persistenceutil.cpp
+ splitbitdetector.cpp
+ persistencethread.cpp
+ processallhandler.cpp
+ diskmoveoperationhandler.cpp
+ types.cpp
+ mergehandler.cpp
+ bucketprocessor.cpp
+ providershutdownwrapper.cpp
+ bucketownershipnotifier.cpp
+ fieldvisitor.cpp
+ testandsethelper.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/persistence/bucketownershipnotifier.cpp b/storage/src/vespa/storage/persistence/bucketownershipnotifier.cpp
new file mode 100644
index 00000000000..002f0be2d3d
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/bucketownershipnotifier.cpp
@@ -0,0 +1,165 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/common/bucketoperationlogger.h>
+#include <vespa/storage/persistence/bucketownershipnotifier.h>
+#include <vespa/vespalib/util/backtrace.h>
+
+LOG_SETUP(".persistence.bucketownershipnotifier");
+
+namespace storage {
+
+uint16_t
+BucketOwnershipNotifier::getOwnerDistributorForBucket(
+ const document::BucketId& bucket) const
+{
+ try {
+ return (_component.getDistribution()->getIdealDistributorNode(
+ *_component.getStateUpdater().getSystemState(), bucket));
+ // If we get exceptions there aren't any distributors, so they'll have
+ // to explicitly fetch all bucket info eventually anyway.
+ } catch (lib::TooFewBucketBitsInUseException& e) {
+ LOGBP(debug, "Too few bucket bits used for %s to be assigned "
+ "to a distributor. Not notifying any distributor of "
+ "bucket change.",
+ bucket.toString().c_str());
+ } catch (lib::NoDistributorsAvailableException& e) {
+ LOGBP(debug, "No distributors available. Not notifying any "
+ "distributor of bucket change.");
+ } catch (const std::exception& e) {
+ LOG(error,
+ "Got unknown exception while resolving distributor: %s",
+ e.what());
+ }
+ return FAILED_TO_RESOLVE;
+}
+
+bool
+BucketOwnershipNotifier::distributorOwns(uint16_t distributor,
+ const document::BucketId& bucket) const
+{
+ return (distributor == getOwnerDistributorForBucket(bucket));
+}
+
+void
+BucketOwnershipNotifier::sendNotifyBucketToDistributor(
+ uint16_t distributorIndex,
+ const document::BucketId& bucket,
+ const api::BucketInfo& infoToSend)
+{
+ if (!infoToSend.valid()) {
+ LOG(error,
+ "Trying to send invalid bucket info to distributor %u: %s. %s",
+ distributorIndex,
+ infoToSend.toString().c_str(),
+ vespalib::getStackTrace(0).c_str());
+ return;
+ }
+ api::NotifyBucketChangeCommand::SP notifyCmd(
+ new api::NotifyBucketChangeCommand(bucket, infoToSend));
+
+ notifyCmd->setAddress(api::StorageMessageAddress(
+ _component.getClusterName(),
+ lib::NodeType::DISTRIBUTOR,
+ distributorIndex));
+ notifyCmd->setSourceIndex(_component.getIndex());
+ LOG(debug,
+ "Sending notify to distributor %u: %s",
+ distributorIndex,
+ notifyCmd->toString().c_str());
+ _sender.sendCommand(notifyCmd);
+}
+
+void
+BucketOwnershipNotifier::logNotification(const document::BucketId& bucket,
+ uint16_t sourceIndex,
+ uint16_t currentOwnerIndex,
+ const api::BucketInfo& newInfo)
+{
+ LOG(debug,
+ "%s now owned by distributor %u, but reply for operation is scheduled "
+ "to go to distributor %u. Sending NotifyBucketChange with %s to ensure "
+ "new owner knows bucket exists",
+ bucket.toString().c_str(),
+ currentOwnerIndex,
+ sourceIndex,
+ newInfo.toString().c_str());
+ LOG_BUCKET_OPERATION_NO_LOCK(
+ bucket,
+ vespalib::make_vespa_string(
+ "Sending notify to distributor %u "
+ "(ownership changed away from %u)",
+ currentOwnerIndex, sourceIndex));
+}
+
+void
+BucketOwnershipNotifier::notifyIfOwnershipChanged(
+ const document::BucketId& bucket,
+ uint16_t sourceIndex,
+ const api::BucketInfo& infoToSend)
+{
+ uint16_t distributor(getOwnerDistributorForBucket(bucket));
+
+ if (distributor == sourceIndex || distributor == FAILED_TO_RESOLVE) {
+ return;
+ }
+ if (sourceIndex == FAILED_TO_RESOLVE) {
+ LOG(debug,
+ "Got an invalid source index of %u; impossible to know if "
+ "bucket ownership has changed. %s",
+ sourceIndex,
+ vespalib::getStackTrace(0).c_str());
+ return;
+ }
+ logNotification(bucket, sourceIndex, distributor, infoToSend);
+ sendNotifyBucketToDistributor(distributor, bucket, infoToSend);
+}
+
+void
+BucketOwnershipNotifier::sendNotifyBucketToCurrentOwner(
+ const document::BucketId& bucket,
+ const api::BucketInfo& infoToSend)
+{
+ uint16_t distributor(getOwnerDistributorForBucket(bucket));
+ if (distributor == FAILED_TO_RESOLVE) {
+ return;
+ }
+ sendNotifyBucketToDistributor(distributor, bucket, infoToSend);
+}
+
+NotificationGuard::~NotificationGuard()
+{
+ for (uint32_t i = 0; i < _bucketsToCheck.size(); ++i) {
+ const BucketToCheck& b(_bucketsToCheck[i]);
+ if (b.alwaysSend) {
+ _notifier.sendNotifyBucketToCurrentOwner(b.bucket, b.info);
+ } else {
+ _notifier.notifyIfOwnershipChanged(b.bucket, b.sourceIndex, b.info);
+ }
+ }
+}
+
+void
+NotificationGuard::notifyIfOwnershipChanged(const document::BucketId& bucket,
+ uint16_t sourceIndex,
+ const api::BucketInfo& infoToSend)
+{
+ _bucketsToCheck.push_back(BucketToCheck(bucket, sourceIndex, infoToSend));
+}
+
+void
+NotificationGuard::notifyAlways(const document::BucketId& bucket,
+ const api::BucketInfo& infoToSend)
+{
+ BucketToCheck bc(bucket, 0xffff, infoToSend);
+ bc.alwaysSend = true;
+ _bucketsToCheck.push_back(bc);
+}
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/bucketownershipnotifier.h b/storage/src/vespa/storage/persistence/bucketownershipnotifier.h
new file mode 100644
index 00000000000..3bf3aab6c37
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/bucketownershipnotifier.h
@@ -0,0 +1,94 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vector>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/storageapi/buckets/bucketinfo.h>
+#include <vespa/storage/common/messagesender.h>
+#include <vespa/storage/common/servicelayercomponent.h>
+
+namespace storage {
+
+class BucketOwnershipNotifier
+{
+ ServiceLayerComponent& _component;
+ MessageSender& _sender;
+public:
+ BucketOwnershipNotifier(ServiceLayerComponent& component,
+ MessageSender& sender)
+ : _component(component),
+ _sender(sender)
+ {}
+
+ bool distributorOwns(uint16_t distributor,
+ const document::BucketId& bucket) const;
+
+ void notifyIfOwnershipChanged(const document::BucketId& bucket,
+ uint16_t sourceIndex,
+ const api::BucketInfo& infoToSend);
+
+ void sendNotifyBucketToCurrentOwner(const document::BucketId& bucket,
+ const api::BucketInfo& infoToSend);
+private:
+ enum IndexMeta {
+ FAILED_TO_RESOLVE = 0xffff
+ };
+
+ void sendNotifyBucketToDistributor(uint16_t distributorIndex,
+ const document::BucketId& bucket,
+ const api::BucketInfo& infoToSend);
+
+ // Returns either index or FAILED_TO_RESOLVE
+ uint16_t getOwnerDistributorForBucket(const document::BucketId& bucket) const;
+
+ void logNotification(const document::BucketId& bucket,
+ uint16_t sourceIndex,
+ uint16_t currentOwnerIndex,
+ const api::BucketInfo& newInfo);
+};
+
+/**
+ * Convenience class for sending notifications at the end of a scope, primarily
+ * to avoid issues with sending while holding a bucket lock.
+ */
+class NotificationGuard
+{
+ struct BucketToCheck
+ {
+ BucketToCheck(const document::BucketId& _bucket,
+ uint16_t _sourceIndex,
+ const api::BucketInfo& _info)
+ : bucket(_bucket),
+ info(_info),
+ sourceIndex(_sourceIndex),
+ alwaysSend(false)
+ {}
+
+ document::BucketId bucket;
+ api::BucketInfo info;
+ uint16_t sourceIndex;
+ bool alwaysSend;
+ };
+ BucketOwnershipNotifier& _notifier;
+ std::vector<BucketToCheck> _bucketsToCheck;
+
+ NotificationGuard(const NotificationGuard&);
+ NotificationGuard& operator=(const NotificationGuard&);
+public:
+ NotificationGuard(BucketOwnershipNotifier& notifier)
+ : _notifier(notifier),
+ _bucketsToCheck()
+ {}
+
+ ~NotificationGuard();
+
+ void notifyIfOwnershipChanged(const document::BucketId& bucket,
+ uint16_t sourceIndex,
+ const api::BucketInfo& infoToSend);
+
+ void notifyAlways(const document::BucketId& bucket,
+ const api::BucketInfo& infoToSend);
+};
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/bucketprocessor.cpp b/storage/src/vespa/storage/persistence/bucketprocessor.cpp
new file mode 100644
index 00000000000..972071f7b5a
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/bucketprocessor.cpp
@@ -0,0 +1,83 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/persistence/bucketprocessor.h>
+#include <vespa/document/fieldset/fieldsets.h>
+#include <stdexcept>
+
+namespace storage {
+
+namespace {
+
+class IteratorGuard
+{
+private:
+ spi::PersistenceProvider& _spi;
+ spi::IteratorId _iteratorId;
+ spi::Context& _context;
+
+public:
+ IteratorGuard(spi::PersistenceProvider& spi,
+ spi::IteratorId iteratorId,
+ spi::Context& context)
+ : _spi(spi),
+ _iteratorId(iteratorId),
+ _context(context)
+ {}
+ ~IteratorGuard()
+ {
+ assert(_iteratorId != 0);
+ _spi.destroyIterator(_iteratorId, _context);
+ }
+ spi::IteratorId getIteratorId() const { return _iteratorId; }
+ spi::PersistenceProvider& getPersistenceProvider() const { return _spi; }
+};
+
+}
+
+void
+BucketProcessor::iterateAll(spi::PersistenceProvider& provider,
+ const spi::Bucket& bucket,
+ const std::string& documentSelection,
+ EntryProcessor& processor,
+ spi::IncludedVersions versions,
+ spi::Context& context)
+{
+ spi::Selection sel
+ = spi::Selection(spi::DocumentSelection(documentSelection));
+ spi::CreateIteratorResult createIterResult(provider.createIterator(
+ bucket,
+ document::HeaderFields(),
+ sel,
+ versions,
+ context));
+
+ if (createIterResult.getErrorCode() != spi::Result::NONE) {
+ std::ostringstream ss;
+ ss << "Failed to create iterator: "
+ << createIterResult.getErrorMessage();
+ throw std::runtime_error(ss.str());
+ }
+
+ spi::IteratorId iteratorId(createIterResult.getIteratorId());
+ IteratorGuard iteratorGuard(provider, iteratorId, context);
+
+ while (true) {
+ spi::IterateResult result(
+ provider.iterate(iteratorId, UINT64_MAX, context));
+ if (result.getErrorCode() != spi::Result::NONE) {
+ std::ostringstream ss;
+ ss << "Failed: " << result.getErrorMessage();
+ throw std::runtime_error(ss.str());
+ }
+
+ for (size_t i = 0; i < result.getEntries().size(); ++i) {
+ processor.process(*result.getEntries()[i]);
+ }
+
+ if (result.isCompleted()) {
+ break;
+ }
+ }
+}
+
+}
diff --git a/storage/src/vespa/storage/persistence/bucketprocessor.h b/storage/src/vespa/storage/persistence/bucketprocessor.h
new file mode 100644
index 00000000000..bfcb115aaa3
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/bucketprocessor.h
@@ -0,0 +1,31 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * Class that simplifies operations where we want to iterate through all
+ * the documents in a bucket (possibly with a document selection) and do
+ * something with each entry.
+ */
+#pragma once
+
+#include <vespa/persistence/spi/persistenceprovider.h>
+
+namespace storage {
+
+class BucketProcessor
+{
+public:
+ class EntryProcessor {
+ public:
+ virtual ~EntryProcessor() {};
+ virtual void process(spi::DocEntry&) = 0;
+ };
+
+ static void iterateAll(spi::PersistenceProvider&,
+ const spi::Bucket&,
+ const std::string& documentSelection,
+ EntryProcessor&,
+ spi::IncludedVersions,
+ spi::Context&);
+};
+
+}
+
diff --git a/storage/src/vespa/storage/persistence/diskmoveoperationhandler.cpp b/storage/src/vespa/storage/persistence/diskmoveoperationhandler.cpp
new file mode 100644
index 00000000000..aaadb8a7f7d
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/diskmoveoperationhandler.cpp
@@ -0,0 +1,95 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/persistence/diskmoveoperationhandler.h>
+#include <vespa/log/log.h>
+
+LOG_SETUP(".persistence.diskmoveoperationhandler");
+
+namespace storage {
+
+DiskMoveOperationHandler::DiskMoveOperationHandler(PersistenceUtil& env,
+ spi::PersistenceProvider& provider)
+ : _env(env),
+ _provider(provider)
+{
+}
+
+MessageTracker::UP
+DiskMoveOperationHandler::handleBucketDiskMove(BucketDiskMoveCommand& cmd,
+ spi::Context& context)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.movedBuckets,
+ _env._component.getClock()));
+
+ document::BucketId bucket(cmd.getBucketId());
+ uint32_t targetDisk(cmd.getDstDisk());
+ uint32_t deviceIndex(_env._partition);
+
+ if (cmd.getSrcDisk() != deviceIndex) {
+ tracker->fail(api::ReturnCode::INTERNAL_FAILURE,
+ "Tried to move bucket from source disk where it was not located");
+ return tracker;
+ }
+ if (targetDisk == deviceIndex) {
+ tracker->fail(api::ReturnCode::INTERNAL_FAILURE,
+ "Tried to move bucket from and to the same disk");
+ return tracker;
+ }
+ if (!_env._fileStorHandler.enabled(targetDisk)) {
+ tracker->fail(api::ReturnCode::ABORTED, "Target disk is not available");
+ return tracker;
+ }
+
+ LOG(debug, "Moving bucket %s from disk %u to disk %u.",
+ bucket.toString().c_str(),
+ deviceIndex, targetDisk);
+
+ spi::Bucket from(bucket, spi::PartitionId(deviceIndex));
+ spi::Bucket to(bucket, spi::PartitionId(targetDisk));
+
+ spi::Result result(
+ _provider.move(from, spi::PartitionId(targetDisk), context));
+ if (result.hasError()) {
+ tracker->fail(api::ReturnCode::INTERNAL_FAILURE,
+ result.getErrorMessage());
+ return tracker;
+ }
+
+ api::BucketInfo bInfo = _env.getBucketInfo(to, targetDisk);
+ uint32_t sourceFileSize = bInfo.getUsedFileSize();
+
+ {
+ // Grab bucket lock in bucket database, and update it
+ // If entry doesn't exist, that means it has just been deleted by
+ // delete bucket command. If so, it'll be deleted when delete bucket
+ // is executed. moving queue should move delete command to correct disk
+ StorBucketDatabase::WrappedEntry entry(
+ _env.getBucketDatabase().get(
+ bucket, "FileStorThread::onBucketDiskMove",
+ StorBucketDatabase::LOCK_IF_NONEXISTING_AND_NOT_CREATING));
+
+ // Move queued operations in bucket to new thread. Hold bucket lock
+ // while doing it, so filestor manager can't put in other operations
+ // first, such that operations change order.
+ _env._fileStorHandler.remapQueueAfterDiskMove(bucket, deviceIndex, targetDisk);
+
+ if (entry.exist()) {
+ entry->setBucketInfo(bInfo);
+ entry->disk = targetDisk;
+ entry.write();
+ }
+ }
+
+ // Answer message, setting extra info such as filesize
+ tracker->setReply(std::shared_ptr<BucketDiskMoveReply>(
+ new BucketDiskMoveReply(
+ cmd,
+ bInfo,
+ sourceFileSize,
+ sourceFileSize)));
+
+ return tracker;
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/diskmoveoperationhandler.h b/storage/src/vespa/storage/persistence/diskmoveoperationhandler.h
new file mode 100644
index 00000000000..d2564a6002b
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/diskmoveoperationhandler.h
@@ -0,0 +1,24 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storage/persistence/persistenceutil.h>
+
+namespace storage {
+
+class DiskMoveOperationHandler : public Types {
+
+public:
+ DiskMoveOperationHandler(PersistenceUtil&,
+ spi::PersistenceProvider& provider);
+
+ MessageTracker::UP handleBucketDiskMove(BucketDiskMoveCommand&,
+ spi::Context&);
+
+private:
+ PersistenceUtil& _env;
+ spi::PersistenceProvider& _provider;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/diskthread.h b/storage/src/vespa/storage/persistence/diskthread.h
new file mode 100644
index 00000000000..5f0f367f010
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/diskthread.h
@@ -0,0 +1,77 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class DiskThread
+ * @ingroup persistence
+ *
+ * @brief Implements the public API of the disk threads.
+ *
+ * The disk threads have a tiny interface as they pull messages of the disk
+ * queue themselves. Thus it is easy to provide multiple implementations of it.
+ * The diskthread implements the common functionality needed above, currently
+ * for the filestor manager.
+ */
+#pragma once
+
+#include <boost/utility.hpp>
+#include <vespa/vespalib/util/printable.h>
+#include <vespa/vespalib/util/document_runnable.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/config-stor-filestor.h>
+
+namespace storage {
+namespace framework {
+ class Thread;
+}
+
+class Directory;
+class SlotFileOptions;
+struct FileStorThreadMetrics;
+
+class DiskThread : public framework::Runnable,
+ private boost::noncopyable
+{
+public:
+ typedef std::shared_ptr<DiskThread> SP;
+
+ virtual ~DiskThread() {}
+
+ /**
+ * Query filestorthread for its operation count.
+ *
+ * Count is increased for each operation (and rolls around). If you query
+ * filestorthread, and count has not changed over a period of time that is
+ * longer than a single operation should use, and shorter than
+ * filestorthread can manage 2^32 operations, you can detect if the thread
+ * is stuck.
+ *
+ * (No locking is used for this. We assume instance don't manage to get
+ * partially updated to look exactly like the last retrieved entry)
+ */
+ struct OperationCount : public vespalib::Printable {
+ uint32_t count;
+ bool pending;
+
+ OperationCount() : count(0), pending(false) {}
+
+ void inc() { ++count; pending = true; }
+ void done() { pending = false; }
+
+ bool operator==(const OperationCount& c) const
+ { return (count == c.count && pending == c.pending); }
+
+ void print(std::ostream& out, bool, const std::string&) const
+ {
+ out << "OperationCount(" << count << (pending ? ", pending" : "")
+ << ")";
+ }
+ };
+
+ /** Waits for current operation to be finished. */
+ virtual void flush() = 0;
+
+ virtual framework::Thread& getThread() = 0;
+
+};
+
+}
+
diff --git a/storage/src/vespa/storage/persistence/fieldvisitor.cpp b/storage/src/vespa/storage/persistence/fieldvisitor.cpp
new file mode 100644
index 00000000000..dc8b55f41e2
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/fieldvisitor.cpp
@@ -0,0 +1,27 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// @author Vegard Sjonfjell
+#include <vespa/storage/persistence/fieldvisitor.h>
+
+namespace storage {
+
+void FieldVisitor::visitFieldValueNode(const document::select::FieldValueNode & node) {
+ _fields.insert(_docType.getField(node.getFieldName()));
+}
+
+void FieldVisitor::visitComparison(const document::select::Compare & node) {
+ visitBothBranches(node);
+}
+
+void FieldVisitor::visitAndBranch(const document::select::And & node) {
+ visitBothBranches(node);
+}
+
+void FieldVisitor::visitOrBranch(const document::select::Or & node) {
+ visitBothBranches(node);
+}
+
+void FieldVisitor::visitNotBranch(const document::select::Not & node) {
+ node.getChild().visit(*this);
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/fieldvisitor.h b/storage/src/vespa/storage/persistence/fieldvisitor.h
new file mode 100644
index 00000000000..3455b1bb032
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/fieldvisitor.h
@@ -0,0 +1,60 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// @author Vegard Sjonfjell
+
+#pragma once
+
+#include <vespa/document/select/node.h>
+#include <vespa/document/select/valuenode.h>
+#include <vespa/document/select/visitor.h>
+#include <vespa/document/select/branch.h>
+#include <vespa/document/select/compare.h>
+#include <vespa/document/fieldset/fieldsets.h>
+#include <vespa/document/datatype/documenttype.h>
+
+namespace storage {
+
+class FieldVisitor : public document::select::Visitor {
+private:
+ document::DocumentType _docType;
+ document::FieldCollection _fields;
+
+public:
+ FieldVisitor(const document::DocumentType & docType)
+ : _docType(docType),
+ _fields(_docType)
+ {}
+
+ const document::FieldSet & getFieldSet() {
+ return _fields;
+ }
+
+ void visitFieldValueNode(const document::select::FieldValueNode &) override;
+ void visitComparison(const document::select::Compare &) override;
+ void visitAndBranch(const document::select::And &) override;
+ void visitOrBranch(const document::select::Or &) override;
+ void visitNotBranch(const document::select::Not &) override;
+
+ // Ignored node types
+ void visitConstant(const document::select::Constant &) override {}
+ void visitInvalidConstant(const document::select::InvalidConstant &) override {}
+ void visitDocumentType(const document::select::DocType &) override {}
+ void visitArithmeticValueNode(const document::select::ArithmeticValueNode &) override {}
+ void visitFunctionValueNode(const document::select::FunctionValueNode &) override {}
+ void visitIdValueNode(const document::select::IdValueNode &) override {}
+ void visitSearchColumnValueNode(const document::select::SearchColumnValueNode &) override {}
+ void visitFloatValueNode(const document::select::FloatValueNode &) override {}
+ void visitVariableValueNode(const document::select::VariableValueNode &) override {}
+ void visitIntegerValueNode(const document::select::IntegerValueNode &) override {}
+ void visitCurrentTimeValueNode(const document::select::CurrentTimeValueNode &) override {}
+ void visitStringValueNode(const document::select::StringValueNode &) override {}
+ void visitNullValueNode(const document::select::NullValueNode &) override {}
+ void visitInvalidValueNode(const document::select::InvalidValueNode &) override {}
+
+ template <typename BinaryNode>
+ void visitBothBranches(const BinaryNode & node) {
+ node.getLeft().visit(*this);
+ node.getRight().visit(*this);
+ }
+};
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/filestorage/.gitignore b/storage/src/vespa/storage/persistence/filestorage/.gitignore
new file mode 100644
index 00000000000..95c0e70c2c6
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/.gitignore
@@ -0,0 +1,11 @@
+*.So
+*.lo
+*.o
+.*.swp
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
+config-stor-filestor.cpp
+config-stor-filestor.h
diff --git a/storage/src/vespa/storage/persistence/filestorage/CMakeLists.txt b/storage/src/vespa/storage/persistence/filestorage/CMakeLists.txt
new file mode 100644
index 00000000000..13d3afd9df0
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/CMakeLists.txt
@@ -0,0 +1,12 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_filestorpersistence OBJECT
+ SOURCES
+ filestormanager.cpp
+ filestorhandler.cpp
+ filestorhandlerimpl.cpp
+ mergestatus.cpp
+ modifiedbucketchecker.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/persistence/filestorage/debugverifications.h b/storage/src/vespa/storage/persistence/filestorage/debugverifications.h
new file mode 100644
index 00000000000..3973e5de0bf
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/debugverifications.h
@@ -0,0 +1,36 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::DebugVerifications
+ * @ingroup filestorage
+ *
+ * @brief Class containing some parameters for debug verifications.
+ *
+ * The persistence layer has config for what to verify as a bitmask.
+ * This class is a simple helper class defining an enum such that we
+ * can refer to the various parts with names instead of numbers.
+ */
+
+#pragma once
+
+namespace storage {
+
+struct DebugVerifications
+{
+ enum Types {
+ SLOTFILE_INTEGRITY_AFTER_PUT = 0x001,
+ SLOTFILE_INTEGRITY_AFTER_UPDATE = 0x002,
+ SLOTFILE_INTEGRITY_AFTER_COMPACT = 0x004,
+ SLOTFILE_INTEGRITY_AFTER_MERGE = 0x008,
+ SLOTFILE_INTEGRITY_AFTER_REMOVE = 0x010,
+ SLOTFILE_INTEGRITY_AFTER_REVERT = 0x020,
+ SLOTFILE_INTEGRITY_AFTER_MULTIOP = 0x040,
+ SLOTFILE_INTEGRITY_AFTER_REMOVEALL = 0x080,
+ SLOTFILE_INTEGRITY_AFTER_JOIN = 0x100,
+ SLOTFILE_INTEGRITY_AFTER_SPLIT = 0x200,
+ SLOTFILE_INTEGRITY_AFTER_REMOVELOCATION = 0x400,
+ FILESTORTHREAD_DISK_MATCHES_BUCKETDB = 0x800
+ };
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/filestorage/filestorhandler.cpp b/storage/src/vespa/storage/persistence/filestorage/filestorhandler.cpp
new file mode 100644
index 00000000000..486051b2e95
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/filestorhandler.cpp
@@ -0,0 +1,208 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/persistence/filestorage/filestorhandler.h>
+#include <vespa/storage/persistence/filestorage/filestorhandlerimpl.h>
+
+namespace storage {
+
+FileStorHandler::FileStorHandler(MessageSender& sender,
+ FileStorMetrics& metrics,
+ const spi::PartitionStateList& partitions,
+ ServiceLayerComponentRegister& compReg,
+ uint8_t maxPriorityToBlock,
+ uint8_t minPriorityToBeBlocking)
+ : _impl(new FileStorHandlerImpl(
+ sender, metrics, partitions, compReg,
+ maxPriorityToBlock, minPriorityToBeBlocking))
+{
+}
+
+FileStorHandler::~FileStorHandler()
+{
+ delete _impl;
+}
+
+void
+FileStorHandler::flush(bool flushMerges)
+{
+ _impl->flush(flushMerges);
+}
+
+void
+FileStorHandler::setDiskState(uint16_t disk, DiskState state)
+{
+ _impl->setDiskState(disk, state);
+}
+
+FileStorHandler::DiskState
+FileStorHandler::getDiskState(uint16_t disk)
+{
+ return _impl->getDiskState(disk);
+}
+
+void
+FileStorHandler::close()
+{
+ _impl->close();
+}
+
+ResumeGuard
+FileStorHandler::pause()
+{
+ return _impl->pause();
+}
+
+bool
+FileStorHandler::schedule(const api::StorageMessage::SP& msg, uint16_t thread)
+{
+ return _impl->schedule(msg, thread);
+}
+
+void
+FileStorHandler::pause(uint16_t disk, uint8_t priority) const {
+ return _impl->pause(disk, priority);
+}
+
+FileStorHandler::LockedMessage
+FileStorHandler::getNextMessage(uint16_t thread, uint8_t lowestPriority)
+{
+ return _impl->getNextMessage(thread, lowestPriority);
+}
+
+FileStorHandler::LockedMessage &
+FileStorHandler::getNextMessage(uint16_t thread,
+ LockedMessage& lck,
+ uint8_t lowestPriority)
+{
+ return _impl->getNextMessage(thread, lck, lowestPriority);
+}
+
+FileStorHandler::BucketLockInterface::SP
+FileStorHandler::lock(const document::BucketId& bucket, uint16_t disk)
+{
+ return _impl->lock(bucket, disk);
+}
+
+void
+FileStorHandler::remapQueueAfterDiskMove(
+ const document::BucketId& bucket,
+ uint16_t sourceDisk, uint16_t targetDisk)
+{
+ RemapInfo target(bucket, targetDisk);
+
+ _impl->remapQueue(RemapInfo(bucket, sourceDisk), target,
+ FileStorHandlerImpl::MOVE);
+}
+
+void
+FileStorHandler::remapQueueAfterJoin(
+ const RemapInfo& source,
+ RemapInfo& target)
+{
+ _impl->remapQueue(source, target, FileStorHandlerImpl::JOIN);
+}
+
+void
+FileStorHandler::remapQueueAfterSplit(
+ const RemapInfo& source,
+ RemapInfo& target1,
+ RemapInfo& target2)
+{
+ _impl->remapQueue(source, target1, target2, FileStorHandlerImpl::SPLIT);
+}
+
+void
+FileStorHandler::failOperations(const document::BucketId& bid,
+ uint16_t fromDisk, const api::ReturnCode& err)
+{
+ _impl->failOperations(bid, fromDisk, err);
+}
+
+void
+FileStorHandler::sendCommand(const api::StorageCommand::SP& msg)
+{
+ _impl->sendCommand(msg);
+}
+
+void
+FileStorHandler::sendReply(const api::StorageReply::SP& msg)
+{
+ _impl->sendReply(msg);
+}
+
+void
+FileStorHandler::getStatus(std::ostream& out,
+ const framework::HttpUrlPath& path) const
+{
+ _impl->getStatus(out, path);
+}
+
+uint32_t
+FileStorHandler::getQueueSize() const
+{
+ return _impl->getQueueSize();
+}
+
+uint32_t
+FileStorHandler::getQueueSize(uint16_t disk) const
+{
+ return _impl->getQueueSize(disk);
+}
+
+void
+FileStorHandler::addMergeStatus(const document::BucketId& bucket,
+ MergeStatus::SP ms)
+{
+ return _impl->addMergeStatus(bucket, ms);
+}
+
+MergeStatus&
+FileStorHandler::editMergeStatus(const document::BucketId& bucket)
+{
+ return _impl->editMergeStatus(bucket);
+}
+
+bool
+FileStorHandler::isMerging(const document::BucketId& bucket) const
+{
+ return _impl->isMerging(bucket);
+}
+
+uint32_t
+FileStorHandler::getNumActiveMerges() const
+{
+ return _impl->getNumActiveMerges();
+}
+
+void
+FileStorHandler::clearMergeStatus(const document::BucketId& bucket,
+ const api::ReturnCode& code)
+{
+ return _impl->clearMergeStatus(bucket, &code);
+}
+
+void
+FileStorHandler::clearMergeStatus(const document::BucketId& bucket)
+{
+ return _impl->clearMergeStatus(bucket, 0);
+}
+
+void
+FileStorHandler::abortQueuedOperations(const AbortBucketOperationsCommand& cmd)
+{
+ _impl->abortQueuedOperations(cmd);
+}
+
+void
+FileStorHandler::setGetNextMessageTimeout(uint32_t timeout)
+{
+ _impl->setGetNextMessageTimeout(timeout);
+}
+
+std::string
+FileStorHandler::dumpQueue(uint16_t disk) const
+{
+ return _impl->dumpQueue(disk);
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/filestorage/filestorhandler.h b/storage/src/vespa/storage/persistence/filestorage/filestorhandler.h
new file mode 100644
index 00000000000..378103def1c
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/filestorhandler.h
@@ -0,0 +1,277 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+/**
+ * \class storage::FileStorHandler
+ * \ingroup storage
+ *
+ * \brief Common resource for filestor threads
+ *
+ * Takes care of the interface between file stor threads and the file stor
+ * manager to avoid circular dependencies, and confine the implementation that
+ * needs to worry about locking between these components.
+ */
+
+#pragma once
+
+#include <vespa/document/bucket/bucketid.h>
+#include <ostream>
+#include <vespa/storage/persistence/filestorage/mergestatus.h>
+#include <vespa/storage/storageutil/resumeguard.h>
+#include <vespa/storage/common/messagesender.h>
+
+namespace storage {
+namespace api {
+ class ReturnCode;
+ class StorageMessage;
+ class StorageCommand;
+ class StorageReply;
+}
+namespace spi {
+ class PartitionStateList;
+}
+
+class FileStorHandlerImpl;
+class FileStorMetrics;
+class MessageSender;
+class MountPointList;
+class ServiceLayerComponentRegister;
+class AbortBucketOperationsCommand;
+
+class FileStorHandler : public MessageSender {
+public:
+ struct RemapInfo {
+ document::BucketId bid;
+ uint16_t diskIndex;
+ bool foundInQueue;
+
+ RemapInfo(const document::BucketId& bucketId, uint16_t diskIdx)
+ : bid(bucketId),
+ diskIndex(diskIdx),
+ foundInQueue(false)
+ {}
+ };
+
+ class BucketLockInterface {
+ public:
+ typedef std::shared_ptr<BucketLockInterface> SP;
+
+ virtual const document::BucketId& getBucketId() const = 0;
+
+ virtual ~BucketLockInterface() {};
+ };
+
+ typedef std::pair<BucketLockInterface::SP,
+ api::StorageMessage::SP> LockedMessage;
+
+ enum DiskState {
+ AVAILABLE,
+ DISABLED,
+ CLOSED
+ };
+
+ FileStorHandler(MessageSender&,
+ FileStorMetrics&,
+ const spi::PartitionStateList&,
+ ServiceLayerComponentRegister&,
+ uint8_t maxPriorityToBlock,
+ uint8_t minPriorityToBeBlocking);
+ ~FileStorHandler();
+
+ // Commands used by file stor manager
+
+ /**
+ * Waits for the filestor queues to be empty. Providing no new load is
+ * added while flushing, queues should be empty upon return.
+ *
+ * @param killPendingMerges If true, clear out all pending merges and reply
+ * to them with failure.
+ */
+ void flush(bool killPendingMerges);
+
+ void setDiskState(uint16_t disk, DiskState state);
+ DiskState getDiskState(uint16_t disk);
+
+ /** Check whether a given disk is enabled or not. */
+ bool enabled(uint16_t disk) { return (getDiskState(disk) == AVAILABLE); }
+ bool closed(uint16_t disk) { return (getDiskState(disk) == CLOSED); }
+ /**
+ * Disable the given disk. Operations towards threads using this disk will
+ * start to fail. Typically called when disk errors are detected.
+ */
+ void disable(uint16_t disk) { setDiskState(disk, DISABLED); }
+ /** Closes all disk threads. */
+ void close();
+
+ /**
+ * Makes sure no operations are active, then stops any new operations
+ * from being performed, until the ResumeGuard is destroyed.
+ */
+ ResumeGuard pause();
+
+ /**
+ * Schedule a storage message to be processed by the given disk
+ * @return True if we maanged to schedule operation. False if not
+ */
+ bool schedule(const std::shared_ptr<api::StorageMessage>&,
+ uint16_t disk);
+
+ // Commands used by file stor threads
+
+ /**
+ * When called, checks if any running operations have "preempting"
+ * priority. If so, and the given priority is less than that, this call
+ * will hang until the other operation is done.
+ */
+ void pause(uint16_t disk, uint8_t priority) const;
+
+ /**
+ * Used by file stor threads to get their next message to process.
+ *
+ * @param disk The disk to get messages for
+ * @param lowestPriority The lowest priority of operation we should return
+ */
+ LockedMessage getNextMessage(uint16_t disk, uint8_t lowestPriority);
+
+ /**
+ * Returns the next message for the same bucket.
+ */
+ LockedMessage & getNextMessage(uint16_t disk,
+ LockedMessage& lock,
+ uint8_t lowestPriority);
+
+ /**
+ * Lock a bucket. By default, each file stor thread has the locks of all
+ * buckets in their area of responsibility. If they need to access buckets
+ * outside of their area, they can call this to make sure the thread
+ * responsible for it doesn't interfere during the operation.
+ * This function will block until bucket is locked, and an operation on it
+ * is not pending. (Handler tracks current operation by remembering bucket
+ * of last message taken for each thread)
+ * NB: As current operation can be a split or join operation, make sure that
+ * you always wait for current to finish, if is a super or sub bucket of
+ * the bucket we're locking.
+ *
+ *
+ */
+ BucketLockInterface::SP lock(const document::BucketId&, uint16_t disk);
+
+ /**
+ * Called by FileStorThread::onBucketDiskMove() after moving file, in case
+ * we need to move operations from one disk queue to another.
+ *
+ * get/put/remove/update/revert/stat/multiop - Move to correct queue
+ * merge messages - Move to correct queue. Move any filestor thread state.
+ * join/split/getiter/repair/deletebucket - Move to correct queue
+ * requeststatus - Ignore
+ * readbucketinfo/bucketdiskmove/internalbucketjoin - Fail and log errors
+ */
+ void remapQueueAfterDiskMove(const document::BucketId& bucket,
+ uint16_t sourceDisk, uint16_t targetDisk);
+
+ /**
+ * Called by FileStorThread::onJoin() after joining a bucket into another,
+ * in case we need to move operations from one disk queue to another, and
+ * to remap operations to contain correct bucket target.
+ * Merge operations towards removed bucket probably needs to be aborted,
+ * so we remove any merge state stored in the filestor thread.
+ *
+ * get/put/remove/update/revert/multiop - Move to correct queue
+ * stat - Fail with bucket not found
+ * merge messages - Fail with bucket not found. Erase merge state in thread.
+ * join - Ignore
+ * split/getiter/repair/bucketdiskmove - Fail with bucket not found
+ * requeststatus/deletebucket - Ignore
+ * readbucketinfo/internalbucketjoin - Fail and log errors
+ */
+ void remapQueueAfterJoin(const RemapInfo& source, RemapInfo& target);
+
+ /**
+ * Called by FileStorThread::onSplit() after splitting a bucket,
+ * in case we need to move operations from one disk queue to another, and
+ * to remap operations to contain correct bucket target.
+ * Merge operations towards removed bucket probably needs to be aborted,
+ * so we remove any merge state stored in the filestor thread.
+ * Split targets that wasn't created sets bucket raw id to 0 to indicate
+ * that they were not added.
+ *
+ * get/put/remove/update/revert - Move to correct queue
+ * revert - In some way revert on both or correct copy
+ * multiop/stat - Fail with bucket not found
+ * merge messages - Fail with bucket not found. Erase merge state in thread.
+ * join - Ignore
+ * split/getiter/repair/bucketdiskmove - Fail with bucket not found
+ * requeststatus/deletebucket - Ignore
+ * readbucketinfo/internalbucketjoin - Fail and log errors
+ */
+ void remapQueueAfterSplit(const RemapInfo& source,
+ RemapInfo& target1,
+ RemapInfo& target2);
+
+ struct DeactivateCallback {
+ virtual ~DeactivateCallback() {}
+ virtual void handleDeactivate() = 0;
+ };
+
+ /**
+ * Fail all operations towards a single bucket currently queued to the
+ * given thread with the given error code.
+ */
+ void failOperations(const document::BucketId&, uint16_t fromDisk,
+ const api::ReturnCode&);
+
+ /**
+ * Add a new merge state to the registry.
+ */
+ void addMergeStatus(const document::BucketId&, MergeStatus::SP);
+
+ /**
+ * Returns the reference to the current merge status for the given bucket.
+ * This allows unlocked access to an internal variable, so users should
+ * first check that noone else is using it by calling isMerging() first.
+ *
+ * @param bucket The bucket to start merging.
+ */
+ MergeStatus& editMergeStatus(const document::BucketId& bucket);
+
+ /**
+ * Returns true if the bucket is currently being merged on this node.
+ *
+ * @param bucket The bucket to check merge status for
+ * @return Returns true if the bucket is being merged.
+ */
+ bool isMerging(const document::BucketId& bucket) const;
+
+ /**
+ * @return Returns the number of active merges on the node.
+ */
+ uint32_t getNumActiveMerges() const;
+
+ /** Removes the merge status for the given bucket. */
+ void clearMergeStatus(const document::BucketId&);
+ void clearMergeStatus(const document::BucketId&, const api::ReturnCode&);
+
+ void abortQueuedOperations(const AbortBucketOperationsCommand& cmd);
+
+ /** Send the given command back out of the persistence layer. */
+ void sendCommand(const api::StorageCommand::SP&);
+ /** Send the given reply back out of the persistence layer. */
+ void sendReply(const api::StorageReply::SP&);
+
+ /** Writes status page. */
+ void getStatus(std::ostream& out, const framework::HttpUrlPath& path) const;
+
+ /** Utility function to fetch total size of queue. */
+ uint32_t getQueueSize() const;
+ uint32_t getQueueSize(uint16_t disk) const;
+
+ // Commands used by testing
+ void setGetNextMessageTimeout(uint32_t timeout);
+
+ std::string dumpQueue(uint16_t disk) const;
+
+private:
+ FileStorHandlerImpl* _impl;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/filestorage/filestorhandlerimpl.cpp b/storage/src/vespa/storage/persistence/filestorage/filestorhandlerimpl.cpp
new file mode 100644
index 00000000000..36d0c477b39
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/filestorhandlerimpl.cpp
@@ -0,0 +1,1388 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/persistence/filestorage/filestorhandlerimpl.h>
+
+#include <vespa/log/log.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/removelocation.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storage/common/messagesender.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/common/statusmessages.h>
+#include <vespa/storage/common/bucketoperationlogger.h>
+#include <vespa/storage/common/messagebucketid.h>
+#include <vespa/storage/persistence/filestorage/filestormetrics.h>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/vespalib/util/random.h>
+#include <vespa/storageapi/message/stat.h>
+#include <vespa/storageapi/message/batch.h>
+
+LOG_SETUP(".persistence.filestor.handler.impl");
+
+namespace storage {
+
+FileStorHandlerImpl::FileStorHandlerImpl(
+ MessageSender& sender,
+ FileStorMetrics& metrics,
+ const spi::PartitionStateList& partitions,
+ ServiceLayerComponentRegister& compReg,
+ uint8_t maxPriorityToBlock,
+ uint8_t minPriorityToBeBlocking)
+ : _partitions(partitions),
+ _component(compReg, "filestorhandlerimpl"),
+ _diskInfo(_component.getDiskCount()),
+ _messageSender(sender),
+ _bucketIdFactory(_component.getBucketIdFactory()),
+ _maxPriorityToBlock(maxPriorityToBlock),
+ _minPriorityToBeBlocking(minPriorityToBeBlocking),
+ _getNextMessageTimeout(100),
+ _paused(false)
+{
+ for (uint32_t i=0; i<_diskInfo.size(); ++i) {
+ _diskInfo[i].metrics = metrics.disks[i].get();
+ assert(_diskInfo[i].metrics != 0);
+ }
+
+ if (_diskInfo.size() == 0) {
+ throw vespalib::IllegalArgumentException(
+ "No disks configured", VESPA_STRLOC);
+ }
+ // Add update hook, so we will get callbacks each 5 seconds to update
+ // metrics.
+ _component.registerMetricUpdateHook(*this, framework::SecondTime(5));
+}
+
+FileStorHandlerImpl::~FileStorHandlerImpl()
+{
+}
+
+void
+FileStorHandlerImpl::addMergeStatus(const document::BucketId& bucket,
+ MergeStatus::SP status)
+{
+ vespalib::LockGuard mlock(_mergeStatesLock);
+ if (_mergeStates.find(bucket) != _mergeStates.end()) {;
+ LOG(warning, "A merge status already existed for %s. Overwriting it.",
+ bucket.toString().c_str());
+ }
+ _mergeStates[bucket] = status;
+}
+
+MergeStatus&
+FileStorHandlerImpl::editMergeStatus(const document::BucketId& bucket)
+{
+ vespalib::LockGuard mlock(_mergeStatesLock);
+ MergeStatus::SP status = _mergeStates[bucket];
+ if (status.get() == 0) {
+ throw vespalib::IllegalStateException(
+ "No merge state exist for " + bucket.toString(), VESPA_STRLOC);
+ }
+ return *status;
+}
+
+bool
+FileStorHandlerImpl::isMerging(const document::BucketId& bucket) const
+{
+ vespalib::LockGuard mlock(_mergeStatesLock);
+ return (_mergeStates.find(bucket) != _mergeStates.end());
+}
+
+uint32_t
+FileStorHandlerImpl::getNumActiveMerges() const
+{
+ vespalib::LockGuard mlock(_mergeStatesLock);
+ return _mergeStates.size();
+}
+
+void
+FileStorHandlerImpl::clearMergeStatus(const document::BucketId& bucket,
+ const api::ReturnCode* code)
+{
+ vespalib::LockGuard mlock(_mergeStatesLock);
+ auto it = _mergeStates.find(bucket);
+ if (it == _mergeStates.end()) {
+ if (code != 0) {
+ LOG(debug, "Merge state not present at the time of clear. "
+ "Could not fail merge of bucket %s with code %s.",
+ bucket.toString().c_str(), code->toString().c_str());
+ } else {
+ LOG(debug, "No merge state to clear for bucket %s.",
+ bucket.toString().c_str());
+ }
+ return;
+ }
+ if (code != 0) {
+ MergeStatus::SP statusPtr(it->second);
+ assert(statusPtr.get());
+ MergeStatus& status(*statusPtr);
+ if (status.reply.get()) {
+ status.reply->setResult(*code);
+ LOG(debug, "Aborting merge. Replying merge of %s with code %s.",
+ bucket.toString().c_str(), code->toString().c_str());
+ _messageSender.sendReply(status.reply);
+ }
+ if (status.pendingGetDiff.get()) {
+ status.pendingGetDiff->setResult(*code);
+ LOG(debug, "Aborting merge. Replying getdiff of %s with code %s.",
+ bucket.toString().c_str(), code->toString().c_str());
+ _messageSender.sendReply(status.pendingGetDiff);
+ }
+ if (status.pendingApplyDiff.get()) {
+ status.pendingApplyDiff->setResult(*code);
+ LOG(debug, "Aborting merge. Replying applydiff of %s with code %s.",
+ bucket.toString().c_str(), code->toString().c_str());
+ _messageSender.sendReply(status.pendingApplyDiff);
+ }
+ }
+ _mergeStates.erase(bucket);
+}
+
+void
+FileStorHandlerImpl::flush(bool killPendingMerges)
+{
+ for (uint32_t i=0; i<_diskInfo.size(); ++i) {
+ LOG(debug, "Wait until queues and bucket locks released for disk '%d'", i);
+ Disk& t(_diskInfo[i]);
+ vespalib::MonitorGuard lockGuard(t.lock);
+ while (t.getQueueSize() != 0 || !t.lockedBuckets.empty()) {
+ LOG(debug, "Still %d in queue and %ld locked buckets for disk '%d'", t.getQueueSize(), t.lockedBuckets.size(), i);
+ lockGuard.wait(100);
+ }
+ LOG(debug, "All queues and bucket locks released for disk '%d'", i);
+ }
+
+ if (killPendingMerges) {
+ api::ReturnCode code(api::ReturnCode::ABORTED,
+ "Storage node is shutting down");
+ for (std::map<document::BucketId, MergeStatus::SP>::iterator it
+ = _mergeStates.begin(); it != _mergeStates.end(); ++it)
+ {
+ MergeStatus& s(*it->second);
+ if (s.pendingGetDiff.get() != 0) {
+ s.pendingGetDiff->setResult(code);
+ _messageSender.sendReply(s.pendingGetDiff);
+ }
+ if (s.pendingApplyDiff.get() != 0) {
+ s.pendingApplyDiff->setResult(code);
+ _messageSender.sendReply(s.pendingApplyDiff);
+ }
+ if (s.reply.get() != 0) {
+ s.reply->setResult(code);
+ _messageSender.sendReply(s.reply);
+ }
+ }
+ _mergeStates.clear();
+ }
+}
+
+void
+FileStorHandlerImpl::reply(api::StorageMessage& msg,
+ DiskState state) const
+{
+ if (!msg.getType().isReply()) {
+ std::shared_ptr<api::StorageReply> rep(
+ static_cast<api::StorageCommand&>(msg).makeReply().release());
+ if (state == FileStorHandler::DISABLED) {
+ rep->setResult(api::ReturnCode(
+ api::ReturnCode::DISK_FAILURE, "Disk disabled"));
+ } else {
+ rep->setResult(api::ReturnCode(
+ api::ReturnCode::ABORTED, "Shutting down storage node."));
+ }
+ _messageSender.sendReply(rep);
+ }
+}
+
+void
+FileStorHandlerImpl::setDiskState(uint16_t disk, DiskState state)
+{
+ Disk& t(_diskInfo[disk]);
+ vespalib::MonitorGuard lockGuard(t.lock);
+
+ // Mark disk closed
+ t.setState(state);
+ if (state != FileStorHandler::AVAILABLE) {
+ while (t.queue.begin() != t.queue.end()) {
+ reply(*t.queue.begin()->_command, state);
+ t.queue.erase(t.queue.begin());
+ }
+ }
+ lockGuard.broadcast();
+}
+
+FileStorHandler::DiskState
+FileStorHandlerImpl::getDiskState(uint16_t disk) const
+{
+ return _diskInfo[disk].getState();
+}
+
+void
+FileStorHandlerImpl::close()
+{
+ for (uint32_t i=0; i<_diskInfo.size(); ++i) {
+ if (getDiskState(i) == FileStorHandler::AVAILABLE) {
+ LOG(debug, "AVAILABLE -> CLOSED disk[%d]", i);
+ setDiskState(i, FileStorHandler::CLOSED);
+ }
+ LOG(debug, "Closing disk[%d]", i);
+ Disk& t(_diskInfo[i]);
+ vespalib::MonitorGuard lockGuard(t.lock);
+ lockGuard.broadcast();
+ LOG(debug, "Closed disk[%d]", i);
+ }
+}
+
+uint32_t
+FileStorHandlerImpl::getQueueSize() const
+{
+ uint32_t count = 0;
+ for (uint32_t i=0; i<_diskInfo.size(); ++i) {
+ const Disk& t(_diskInfo[i]);
+ vespalib::MonitorGuard lockGuard(t.lock);
+ count += t.getQueueSize();
+ }
+ return count;
+}
+
+bool
+FileStorHandlerImpl::schedule(const std::shared_ptr<api::StorageMessage>& msg,
+ uint16_t disk)
+{
+ assert(disk < _diskInfo.size());
+ Disk& t(_diskInfo[disk]);
+ vespalib::MonitorGuard lockGuard(t.lock);
+
+ if (t.getState() == FileStorHandler::AVAILABLE) {
+ MBUS_TRACE(msg->getTrace(), 5, vespalib::make_string(
+ "FileStorHandler: Operation added to disk %d's queue with "
+ "priority %u", disk, msg->getPriority()));
+
+ t.queue.push_back(MessageEntry(msg,
+ getStorageMessageBucketId(*msg)));
+
+ LOG(spam, "Queued operation %s with priority %u.",
+ msg->getType().toString().c_str(),
+ msg->getPriority());
+
+ lockGuard.broadcast();
+ } else {
+ return false;
+ }
+ return true;
+}
+
+void
+FileStorHandlerImpl::pause(uint16_t disk, uint8_t priority) const {
+ if (priority < _maxPriorityToBlock) {
+ return;
+ }
+
+ assert(disk < _diskInfo.size());
+ const Disk& t(_diskInfo[disk]);
+ vespalib::MonitorGuard lockGuard(t.lock);
+
+ bool paused = true;
+ while (paused) {
+ paused = false;
+ for (auto& lockedBucket : t.lockedBuckets) {
+ if (lockedBucket.second.priority <= _minPriorityToBeBlocking) {
+ paused = true;
+ lockGuard.wait();
+ break;
+ }
+ }
+ }
+}
+
+bool
+FileStorHandlerImpl::messageMayBeAborted(const api::StorageMessage& msg) const
+{
+ if (msg.getType().isReply()) {
+ return false;
+ }
+ // Create/DeleteBucket have already updated the bucket database before
+ // being scheduled and must be allowed through to avoid getting out of
+ // sync between the service layer and the provider.
+ switch (msg.getType().getId()) {
+ case api::MessageType::PUT_ID:
+ case api::MessageType::REMOVE_ID:
+ case api::MessageType::REVERT_ID:
+ case api::MessageType::MERGEBUCKET_ID:
+ case api::MessageType::GETBUCKETDIFF_ID:
+ case api::MessageType::APPLYBUCKETDIFF_ID:
+ case api::MessageType::SPLITBUCKET_ID:
+ case api::MessageType::JOINBUCKETS_ID:
+ case api::MessageType::MULTIOPERATION_ID:
+ case api::MessageType::UPDATE_ID:
+ case api::MessageType::REMOVELOCATION_ID:
+ case api::MessageType::BATCHPUTREMOVE_ID:
+ case api::MessageType::BATCHDOCUMENTUPDATE_ID:
+ case api::MessageType::SETBUCKETSTATE_ID:
+ return true;
+ default:
+ return false;
+ }
+}
+
+void
+FileStorHandlerImpl::abortQueuedCommandsForBuckets(
+ Disk& disk,
+ const AbortBucketOperationsCommand& cmd)
+{
+ Disk& t(disk);
+ vespalib::MonitorGuard diskLock(t.lock);
+ typedef PriorityQueue::iterator iter_t;
+ api::ReturnCode abortedCode(api::ReturnCode::ABORTED,
+ "Sending distributor no longer owns "
+ "bucket operation was bound to");
+ for (iter_t it(t.queue.begin()), e(t.queue.end()); it != e;) {
+ api::StorageMessage& msg(*it->_command);
+ if (messageMayBeAborted(msg) && cmd.shouldAbort(it->_bucketId)) {
+ LOG(debug,
+ "Aborting operation %s as it is bound for bucket %s",
+ msg.toString().c_str(),
+ it->_bucketId.toString().c_str());
+ std::shared_ptr<api::StorageReply> msgReply(
+ static_cast<api::StorageCommand&>(msg).makeReply().release());
+ msgReply->setResult(abortedCode);
+ _messageSender.sendReply(msgReply);
+
+ it = t.queue.erase(it);
+ } else {
+ ++it;
+ }
+ }
+}
+
+bool
+FileStorHandlerImpl::diskHasActiveOperationForAbortedBucket(
+ const Disk& disk,
+ const AbortBucketOperationsCommand& cmd) const
+{
+ for (auto& lockedBucket : disk.lockedBuckets) {
+ if (cmd.shouldAbort(lockedBucket.first)) {
+ LOG(spam,
+ "Disk had active operation for aborted bucket %s, "
+ "waiting for it to complete...",
+ lockedBucket.first.toString().c_str());
+ return true;
+ }
+ }
+ return false;
+}
+
+void
+FileStorHandlerImpl::waitUntilNoActiveOperationsForAbortedBuckets(
+ Disk& disk,
+ const AbortBucketOperationsCommand& cmd)
+{
+ vespalib::MonitorGuard guard(disk.lock);
+ while (diskHasActiveOperationForAbortedBucket(disk, cmd)) {
+ guard.wait();
+ }
+ guard.broadcast();
+}
+
+void
+FileStorHandlerImpl::abortQueuedOperations(
+ const AbortBucketOperationsCommand& cmd)
+{
+ // Do queue clearing and active operation waiting in two passes
+ // to allow disk threads to drain running operations in parallel.
+ for (uint32_t i = 0; i < _diskInfo.size(); ++i) {
+ abortQueuedCommandsForBuckets(_diskInfo[i], cmd);
+ }
+ for (uint32_t i = 0; i < _diskInfo.size(); ++i) {
+ waitUntilNoActiveOperationsForAbortedBuckets(_diskInfo[i], cmd);
+ }
+}
+
+bool
+FileStorHandlerImpl::hasBlockingOperations(const Disk& t) const
+{
+ for (auto& lockedBucket : t.lockedBuckets) {
+ if (lockedBucket.second.priority <= _minPriorityToBeBlocking) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+void
+FileStorHandlerImpl::updateMetrics(const MetricLockGuard &)
+{
+ for (uint32_t i=0; i<_diskInfo.size(); ++i) {
+ const Disk& t(_diskInfo[i]);
+ vespalib::MonitorGuard lockGuard(t.lock);
+ t.metrics->pendingMerges.addValue(_mergeStates.size());
+ t.metrics->queueSize.addValue(t.getQueueSize());
+ }
+}
+
+FileStorHandler::LockedMessage &
+FileStorHandlerImpl::getNextMessage(uint16_t disk,
+ FileStorHandler::LockedMessage& lck,
+ uint8_t maxPriority)
+{
+ document::BucketId id(lck.first->getBucketId());
+
+ LOG(spam,
+ "Disk %d retrieving message for buffered bucket %s",
+ disk,
+ id.toString().c_str());
+
+ assert(disk < _diskInfo.size());
+ Disk& t(_diskInfo[disk]);
+
+ if (getDiskState(disk) == FileStorHandler::CLOSED) {
+ lck.second.reset();
+ return lck;
+ }
+
+ vespalib::MonitorGuard lockGuard(t.lock);
+ BucketIdx& idx = boost::multi_index::get<2>(t.queue);
+ std::pair<BucketIdx::iterator, BucketIdx::iterator> range = idx.equal_range(id);
+
+ // No more for this bucket.
+ if (range.first == range.second) {
+ lck.second.reset();
+ return lck;
+ }
+
+ std::shared_ptr<api::StorageMessage> msg(range.first->_command);
+ mbus::Trace& trace = msg->getTrace();
+
+ // Priority is too low, not buffering any more.
+ if (msg->getPriority() > maxPriority || msg->getPriority() >= _maxPriorityToBlock) {
+ lck.second.reset();
+ return lck;
+ }
+
+ MBUS_TRACE(trace, 9,
+ "FileStorHandler: Message identified by disk thread looking for "
+ "more requests to active bucket.");
+
+ uint64_t waitTime(
+ const_cast<metrics::MetricTimer&>(range.first->_timer).stop(
+ t.metrics->averageQueueWaitingTime[msg->getLoadType()]));
+
+ LOG(debug, "Message %s waited %" PRIu64 " ms in storage queue (bucket %s), "
+ "timeout %d",
+ msg->toString().c_str(), waitTime, id.toString().c_str(),
+ static_cast<api::StorageCommand&>(*msg).getTimeout());
+
+ if (msg->getType().isReply() ||
+ waitTime < static_cast<api::StorageCommand&>(*msg).getTimeout())
+ {
+ idx.erase(range.first);
+ lck.second.swap(msg);
+ lockGuard.broadcast();
+ lockGuard.unlock();
+ return lck;
+ } else {
+ std::shared_ptr<api::StorageReply> msgReply(
+ static_cast<api::StorageCommand&>(*msg)
+ .makeReply().release());
+ msgReply->setResult(api::ReturnCode(
+ api::ReturnCode::TIMEOUT,
+ "Message waited too long in storage queue"));
+
+ idx.erase(range.first);
+ lockGuard.broadcast();
+ lockGuard.unlock();
+ _messageSender.sendReply(msgReply);
+
+ lck.second.reset();
+ return lck;
+ }
+}
+
+bool
+FileStorHandlerImpl::tryHandlePause(uint16_t disk) const
+{
+ if (isPaused()) {
+ // Wait a single time to see if filestor gets unpaused.
+ if (getDiskState(disk) != FileStorHandler::CLOSED) {
+ vespalib::MonitorGuard g(_pauseMonitor);
+ g.wait(100);
+ }
+ return !isPaused();
+ }
+ return true;
+}
+
+bool
+FileStorHandlerImpl::diskIsClosed(uint16_t disk) const
+{
+ return (getDiskState(disk) == FileStorHandler::CLOSED);
+}
+
+bool
+FileStorHandlerImpl::operationHasHighEnoughPriorityToBeRun(
+ const api::StorageMessage& msg,
+ uint8_t maxPriority) const
+{
+ // NOTE: priority integral value 0 is considered highest pri.
+ return (msg.getPriority() <= maxPriority);
+}
+
+bool
+FileStorHandlerImpl::operationBlockedByHigherPriorityThread(
+ const api::StorageMessage& msg,
+ const Disk& disk) const
+{
+ return ((msg.getPriority() >= _maxPriorityToBlock)
+ && hasBlockingOperations(disk));
+}
+
+bool
+FileStorHandlerImpl::messageTimedOutInQueue(const api::StorageMessage& msg,
+ uint64_t waitTime) const
+{
+ if (msg.getType().isReply()) {
+ return false; // Replies must always be processed and cannot time out.
+ }
+ return (waitTime >= static_cast<const api::StorageCommand&>(
+ msg).getTimeout());
+}
+
+std::unique_ptr<FileStorHandler::BucketLockInterface>
+FileStorHandlerImpl::takeDiskBucketLockOwnership(
+ Disk& disk,
+ const document::BucketId& id,
+ const api::StorageMessage& msg)
+{
+ return std::unique_ptr<FileStorHandler::BucketLockInterface>(
+ new BucketLock(disk, id, msg.getPriority(), msg.getSummary()));
+}
+
+std::unique_ptr<api::StorageReply>
+FileStorHandlerImpl::makeQueueTimeoutReply(api::StorageMessage& msg) const
+{
+ assert(!msg.getType().isReply());
+ std::unique_ptr<api::StorageReply> msgReply(
+ static_cast<api::StorageCommand&>(msg).makeReply().release());
+ msgReply->setResult(api::ReturnCode(
+ api::ReturnCode::TIMEOUT,
+ "Message waited too long in storage queue"));
+ return msgReply;
+}
+
+bool
+FileStorHandlerImpl::bucketIsLockedOnDisk(const document::BucketId& id,
+ const Disk& t) const
+{
+ return (id.getRawId() != 0 && t.isLocked(id));
+}
+
+FileStorHandler::LockedMessage
+FileStorHandlerImpl::getNextMessage(uint16_t disk, uint8_t maxPriority)
+{
+ if (!tryHandlePause(disk)) {
+ return {}; // Still paused, return to allow tick.
+ }
+
+ assert(disk < _diskInfo.size());
+ Disk& t(_diskInfo[disk]);
+
+ vespalib::MonitorGuard lockGuard(t.lock);
+ // Try to grab a message+lock, immediately retrying once after a wait
+ // if none can be found and then exiting if the same is the case on the
+ // second attempt. This is key to allowing the run loop to register
+ // ticks at regular intervals while not busy-waiting.
+ for (int attempt = 0; attempt < 2; ++attempt) {
+ PriorityIdx& idx(boost::multi_index::get<1>(t.queue));
+ PriorityIdx::iterator iter(idx.begin()), end(idx.end());
+
+ if (diskIsClosed(disk)) {
+ return {};
+ }
+ while (iter != end) {
+ document::BucketId id(iter->_bucketId);
+ if (bucketIsLockedOnDisk(id, t)) {
+ ++iter; // Try next in queue, if any.
+ continue;
+ }
+
+ std::shared_ptr<api::StorageMessage> msg(iter->_command);
+ mbus::Trace& trace(msg->getTrace());
+
+ if (!operationHasHighEnoughPriorityToBeRun(*msg, maxPriority)
+ || operationBlockedByHigherPriorityThread(*msg, t)
+ || isPaused())
+ {
+ break;
+ }
+
+ const uint64_t waitTime(
+ const_cast<metrics::MetricTimer&>(iter->_timer).stop(
+ t.metrics->averageQueueWaitingTime[
+ msg->getLoadType()]));
+
+ MBUS_TRACE(trace, 9, "FileStorHandler: Message identified by "
+ "disk thread.");
+ LOG(debug,
+ "Message %s waited %" PRIu64 " ms in storage queue, timeout %d",
+ msg->toString().c_str(), waitTime,
+ static_cast<api::StorageCommand&>(*msg).getTimeout());
+
+ idx.erase(iter); // iter not used after this point.
+
+ if (!messageTimedOutInQueue(*msg, waitTime)) {
+ std::unique_ptr<FileStorHandler::BucketLockInterface> locker(
+ takeDiskBucketLockOwnership(t, id, *msg));
+ MBUS_TRACE(trace, 9, "FileStorHandler: Got lock on bucket");
+ lockGuard.broadcast(); // XXX: needed here?
+ return {std::move(locker), msg};
+ } else {
+ std::shared_ptr<api::StorageReply> msgReply(
+ makeQueueTimeoutReply(*msg));
+ lockGuard.broadcast(); // XXX: needed here?
+ lockGuard.unlock();
+ _messageSender.sendReply(msgReply);
+ return {};
+ }
+ }
+ if (attempt == 0) {
+ lockGuard.wait(_getNextMessageTimeout);
+ }
+ }
+ return {}; // No message fetched.
+}
+
+std::shared_ptr<FileStorHandler::BucketLockInterface>
+FileStorHandlerImpl::lock(const document::BucketId& bucket, uint16_t disk)
+{
+ assert(disk < _diskInfo.size());
+
+ Disk& t(_diskInfo[disk]);
+ LOG(spam,
+ "Acquiring filestor lock for %s on disk %d",
+ bucket.toString().c_str(),
+ disk);
+
+ vespalib::MonitorGuard lockGuard(t.lock);
+
+ while (bucket.getRawId() != 0 && t.isLocked(bucket)) {
+ LOG(spam,
+ "Contending for filestor lock for %s",
+ bucket.toString().c_str());
+ lockGuard.wait(100);
+ }
+
+ std::shared_ptr<FileStorHandler::BucketLockInterface> locker(
+ new BucketLock(t, bucket, 255, "External lock"));
+
+ lockGuard.broadcast();
+ return locker;
+}
+
+namespace {
+ struct MultiLockGuard {
+ std::map<uint16_t, vespalib::Monitor*> monitors;
+ std::vector<std::shared_ptr<vespalib::MonitorGuard> > guards;
+
+ MultiLockGuard() {}
+
+ void addLock(vespalib::Monitor& monitor, uint16_t index) {
+ monitors[index] = &monitor;
+ }
+ void lock() {
+ for (std::map<uint16_t, vespalib::Monitor*>::iterator it
+ = monitors.begin(); it != monitors.end(); ++it)
+ {
+ guards.push_back(std::shared_ptr<vespalib::MonitorGuard>(
+ new vespalib::MonitorGuard(*it->second)));
+ }
+ }
+ };
+}
+
+namespace {
+ document::DocumentId getDocId(const api::StorageMessage& msg) {
+ switch (msg.getType().getId()) {
+ case api::MessageType::GET_ID:
+ return static_cast<const api::GetCommand&>(msg).getDocumentId();
+ break;
+ case api::MessageType::PUT_ID:
+ return static_cast<const api::PutCommand&>(msg).getDocumentId();
+ break;
+ case api::MessageType::UPDATE_ID:
+ return static_cast<const api::UpdateCommand&>(msg)
+ .getDocumentId();
+ break;
+ case api::MessageType::REMOVE_ID:
+ return static_cast<const api::RemoveCommand&>(msg)
+ .getDocumentId();
+ break;
+ default:
+ assert(false);
+ abort();
+ }
+ }
+ uint32_t findCommonBits(document::BucketId a, document::BucketId b) {
+ if (a.getUsedBits() > b.getUsedBits()) {
+ a.setUsedBits(b.getUsedBits());
+ } else {
+ b.setUsedBits(a.getUsedBits());
+ }
+ for (uint32_t i=a.getUsedBits() - 1; i>0; --i) {
+ if (a == b) return i + 1;
+ a.setUsedBits(i);
+ b.setUsedBits(i);
+ }
+ return (a == b ? 1 : 0);
+ }
+}
+
+int
+FileStorHandlerImpl::calculateTargetBasedOnDocId(
+ const api::StorageMessage& msg,
+ std::vector<RemapInfo*>& targets)
+{
+ document::DocumentId id(getDocId(msg));
+ document::BucketId bucket(_bucketIdFactory.getBucketId(id));
+
+ for (uint32_t i = 0; i < targets.size(); i++) {
+ if (targets[i]->bid.getRawId() != 0 && targets[i]->bid.contains(bucket)) {
+ return i;
+ }
+ }
+
+ return -1;
+}
+
+document::BucketId
+FileStorHandlerImpl::remapMessage(
+ api::StorageMessage& msg,
+ const document::BucketId& source,
+ Operation op,
+ std::vector<RemapInfo*>& targets,
+ uint16_t& targetDisk, api::ReturnCode& returnCode)
+{
+ document::BucketId newBucketId = source;
+
+ switch (msg.getType().getId()) {
+ case api::MessageType::GET_ID:
+ case api::MessageType::PUT_ID:
+ case api::MessageType::UPDATE_ID:
+ case api::MessageType::REMOVE_ID:
+ // Move to correct queue
+ {
+ api::BucketCommand& cmd(
+ static_cast<api::BucketCommand&>(msg));
+
+ if (cmd.getBucketId() == source) {
+ if (op == SPLIT) {
+ int idx = calculateTargetBasedOnDocId(msg, targets);
+
+ if (idx > -1) {
+ cmd.remapBucketId(targets[idx]->bid);
+ targets[idx]->foundInQueue = true;
+ targetDisk = targets[idx]->diskIndex;
+#if defined(ENABLE_BUCKET_OPERATION_LOGGING)
+ {
+ vespalib::string desc = vespalib::make_vespa_string(
+ "Remapping %s from %s to %s, targetDisk = %u",
+ cmd.toString().c_str(), source.toString().c_str(),
+ targets[idx]->bid.toString().c_str(), targetDisk);
+ LOG_BUCKET_OPERATION_NO_LOCK(source, desc);
+ LOG_BUCKET_OPERATION_NO_LOCK(targets[idx]->bid, desc);
+ }
+#endif
+ newBucketId = targets[idx]->bid;
+ } else {
+ document::DocumentId did(getDocId(msg));
+ document::BucketId bucket = _bucketIdFactory.getBucketId(did);
+ uint32_t commonBits(
+ findCommonBits(targets[0]->bid, bucket));
+ if (commonBits < source.getUsedBits()) {
+ std::ostringstream ost;
+ ost << bucket << " belongs in neither "
+ << targets[0]->bid << " nor " << targets[1]->bid
+ << ". Cannot remap it after split. It "
+ << "did not belong in the original "
+ << "bucket " << source;
+ LOG(error, "Error remapping %s after split %s",
+ cmd.getType().toString().c_str(),
+ ost.str().c_str());
+ returnCode = api::ReturnCode(
+ api::ReturnCode::REJECTED, ost.str());
+ } else {
+ std::ostringstream ost;
+ assert(targets.size() == 2);
+ ost << "Bucket " << source << " was split and "
+ << "neither bucket " << targets[0]->bid << " nor "
+ << targets[1]->bid << " fit for this operation. "
+ << "Failing operation so distributor can create "
+ << "bucket on correct node.";
+ LOG(debug, "%s", ost.str().c_str());
+ returnCode = api::ReturnCode(
+ api::ReturnCode::BUCKET_DELETED,
+ ost.str());
+ }
+ }
+ } else {
+ LOG(debug, "Remapping %s operation to bucket %s",
+ cmd.toString().c_str(), targets[0]->bid.toString().c_str());
+ cmd.remapBucketId(targets[0]->bid);
+ newBucketId = targets[0]->bid;
+ targetDisk = targets[0]->diskIndex;
+#ifdef ENABLE_BUCKET_OPERATION_LOGGING
+ {
+ vespalib::string desc = vespalib::make_vespa_string(
+ "Remapping %s from %s to %s, targetDisk = %u",
+ cmd.toString().c_str(), source.toString().c_str(),
+ targets[0]->bid.toString().c_str(), targetDisk);
+ LOG_BUCKET_OPERATION_NO_LOCK(source, desc);
+ LOG_BUCKET_OPERATION_NO_LOCK(targets[0]->bid, desc);
+ }
+#endif
+ }
+ } else {
+ LOG(debug, "Did not remap %s with bucket %s from bucket %s",
+ cmd.toString().c_str(), cmd.getBucketId().toString().c_str(),
+ source.toString().c_str());
+ assert(false);
+ }
+ break;
+ }
+ case api::MessageType::MERGEBUCKET_ID:
+ case api::MessageType::GETBUCKETDIFF_ID:
+ case api::MessageType::GETBUCKETDIFF_REPLY_ID:
+ case api::MessageType::APPLYBUCKETDIFF_ID:
+ case api::MessageType::APPLYBUCKETDIFF_REPLY_ID:
+ // Move to correct queue including filestor thread state
+ // if op == MOVE. If op != MOVE, fail with bucket not found
+ // and clear filestor thread state
+ {
+ api::BucketCommand& cmd(
+ static_cast<api::BucketCommand&>(msg));
+ if (cmd.getBucketId() == source) {
+ if (op != MOVE) {
+ std::ostringstream ost;
+ ost << "Bucket " << (op == SPLIT ? "split" : "joined")
+ << ". Cannot remap merge, so aborting it";
+ api::ReturnCode code(api::ReturnCode::BUCKET_DELETED,
+ ost.str());
+ clearMergeStatus(cmd.getBucketId(), &code);
+ }
+ }
+ // Follow onto next to move queue or fail
+ }
+ case api::MessageType::SPLITBUCKET_ID:
+ // Move to correct queue if op == MOVE
+ // Fail with bucket not found if op is JOIN
+ // Ok if op is SPLIT, as we have already done as requested.
+ {
+ api::BucketCommand& cmd(
+ static_cast<api::BucketCommand&>(msg));
+ if (cmd.getBucketId() == source) {
+ if (op == MOVE) {
+ targetDisk = targets[0]->diskIndex;
+ } else if (op == SPLIT) {
+ returnCode = api::ReturnCode(
+ api::ReturnCode::BUCKET_DELETED,
+ "Bucket split while operation enqueued");
+ } else {
+ returnCode = api::ReturnCode(
+ api::ReturnCode::BUCKET_DELETED,
+ "Bucket was just joined");
+ }
+ }
+ break;
+ }
+ case api::MessageType::STAT_ID:
+ case api::MessageType::MULTIOPERATION_ID:
+ case api::MessageType::BATCHPUTREMOVE_ID:
+ case api::MessageType::REVERT_ID:
+ case api::MessageType::REMOVELOCATION_ID:
+ case api::MessageType::SETBUCKETSTATE_ID:
+ {
+ // Move to correct queue if op == MOVE
+ // Fail with bucket not found if op != MOVE
+ api::BucketCommand& cmd(
+ static_cast<api::BucketCommand&>(msg));
+ if (cmd.getBucketId() == source) {
+ if (op == MOVE) {
+ targetDisk = targets[0]->diskIndex;
+ } else {
+ returnCode = api::ReturnCode(
+ api::ReturnCode::BUCKET_DELETED,
+ op == SPLIT ? "Bucket was just split"
+ : "Bucket was just joined");
+ }
+ }
+ break;
+ }
+ case api::MessageType::CREATEBUCKET_ID:
+ case api::MessageType::DELETEBUCKET_ID:
+ case api::MessageType::JOINBUCKETS_ID:
+ // Move to correct queue if op == MOVE. Otherwise ignore.
+ {
+ api::BucketCommand& cmd(
+ static_cast<api::BucketCommand&>(msg));
+ if (cmd.getBucketId() == source) {
+ if (op == MOVE) {
+ targetDisk = targets[0]->diskIndex;
+ }
+ }
+ break;
+ }
+ case api::MessageType::INTERNAL_ID:
+ {
+ const api::InternalCommand& icmd(
+ static_cast<const api::InternalCommand&>(msg));
+ document::BucketId bucket;
+ switch(icmd.getType()) {
+ case RequestStatusPage::ID:
+ // Ignore
+ break;
+ case CreateIteratorCommand::ID:
+ bucket = static_cast<CreateIteratorCommand&>(msg).getBucketId();
+ // Move to correct queue if op == MOVE
+ // Fail with bucket not found if op != MOVE
+ if (bucket == source) {
+ if (op == MOVE) {
+ targetDisk = targets[0]->diskIndex;
+ } else {
+ returnCode = api::ReturnCode(
+ api::ReturnCode::BUCKET_DELETED,
+ op == SPLIT
+ ? "Bucket was just split"
+ : "Bucket was just joined");
+ }
+ }
+ break;
+ case GetIterCommand::ID:
+ bucket = static_cast<GetIterCommand&>(msg).getBucketId();
+ case RepairBucketCommand::ID:
+ if (bucket.getRawId() == 0) {
+ bucket = static_cast<RepairBucketCommand&>(msg)
+ .getBucketId();
+ }
+ // Move to correct queue if op == MOVE
+ // Fail with bucket not found if op != MOVE
+ if (bucket == source) {
+ if (op == MOVE) {
+ targetDisk = targets[0]->diskIndex;
+ } else {
+ returnCode = api::ReturnCode(
+ api::ReturnCode::BUCKET_DELETED,
+ op == SPLIT
+ ? "Bucket was just split"
+ : "Bucket was just joined");
+ }
+ }
+ break;
+ case BucketDiskMoveCommand::ID:
+ // Fail bucket not found if op != MOVE
+ // Fail and log error if op == MOVE
+ {
+ api::BucketCommand& cmd(
+ static_cast<api::BucketCommand&>(msg));
+ if (cmd.getBucketId() == source) {
+ if (op == MOVE) {
+ returnCode = api::ReturnCode(
+ api::ReturnCode::INTERNAL_FAILURE,
+ "Multiple bucket disk move "
+ "commands pending at the same time "
+ "towards bucket "
+ + source.toString());
+ } else {
+ returnCode = api::ReturnCode(
+ api::ReturnCode::BUCKET_DELETED,
+ op == SPLIT
+ ? "Bucket was just split"
+ : "Bucket was just joined");
+ }
+ }
+ break;
+ }
+ case ReadBucketInfo::ID:
+ case RecheckBucketInfoCommand::ID:
+ {
+ LOG(debug, "While remapping load for bucket %s for reason %u, "
+ "we abort read bucket info request for this bucket.",
+ source.toString().c_str(), op);
+ break;
+ }
+ case InternalBucketJoinCommand::ID:
+ default:
+ // Fail and log error
+ {
+ LOG(error, "Attempted (and failed) to remap %s which should "
+ "not be processed at this time",
+ msg.toString(true).c_str());
+ returnCode = api::ReturnCode(
+ api::ReturnCode::INTERNAL_FAILURE,
+ "No such message should be processed at "
+ "this time.");
+ break;
+ }
+ }
+ break;
+ }
+ default:
+ {
+ returnCode = api::ReturnCode(
+ api::ReturnCode::INTERNAL_FAILURE,
+ "Unknown message type in persistence layer");
+ LOG(error,
+ "Unknown message type in persistence layer: %s",
+ msg.toString().c_str());
+ }
+ } // End of switch
+
+ return newBucketId;
+}
+
+void
+FileStorHandlerImpl::remapQueueNoLock(
+ Disk& from,
+ const RemapInfo& source,
+ std::vector<RemapInfo*>& targets,
+ Operation op)
+{
+ BucketIdx& idx(boost::multi_index::get<2>(from.queue));
+ std::pair<BucketIdx::iterator, BucketIdx::iterator> range(
+ idx.equal_range(source.bid));
+
+ std::vector<MessageEntry> entriesFound;
+
+ // Find all the messages for the given bucket.
+ for (BucketIdx::iterator i = range.first; i != range.second; ++i) {
+ assert(i->_bucketId == source.bid);
+
+ entriesFound.push_back(*i);
+ }
+
+ // Remove them
+ idx.erase(range.first, range.second);
+
+ // Reinsert all that can be remapped.
+ for (uint32_t i = 0; i < entriesFound.size(); ++i) {
+ // If set to something other than source.diskIndex, move this message
+ // to that queue.
+ MessageEntry& entry = entriesFound[i];
+ uint16_t targetDisk = source.diskIndex;
+
+ // If not OK, reply to this message with the following message
+ api::ReturnCode returnCode(api::ReturnCode::OK);
+ api::StorageMessage& msg(*entry._command);
+ assert(entry._bucketId == source.bid);
+
+ document::BucketId bid = remapMessage(msg,
+ source.bid,
+ op,
+ targets,
+ targetDisk,
+ returnCode);
+
+ if (returnCode.getResult() != api::ReturnCode::OK) {
+ // Fail message if errorcode set
+ if (!msg.getType().isReply()) {
+ std::shared_ptr<api::StorageReply> rep(
+ static_cast<api::StorageCommand&>(msg)
+ .makeReply().release());
+ LOG(spam, "Sending reply %s because remapping failed: %s",
+ msg.toString().c_str(),
+ returnCode.toString().c_str());
+
+ rep->setResult(returnCode);
+ _messageSender.sendReply(rep);
+ }
+ } else {
+ entry._bucketId = bid;
+ // Move to correct disk queue if needed
+ _diskInfo[targetDisk].queue.push_back(entry);
+ }
+ }
+
+}
+
+void
+FileStorHandlerImpl::remapQueue(
+ const RemapInfo& source,
+ RemapInfo& target,
+ Operation op) {
+ // Use a helper class to lock to solve issue that some buckets might be
+ // the same bucket. Will fix order if we accept wrong order later.
+ MultiLockGuard guard;
+
+ Disk& from(_diskInfo[source.diskIndex]);
+ guard.addLock(from.lock, source.diskIndex);
+
+ Disk& to1(_diskInfo[target.diskIndex]);
+ if (target.bid.getRawId() != 0) {
+ guard.addLock(to1.lock, target.diskIndex);
+ }
+
+ std::vector<RemapInfo*> targets;
+ targets.push_back(&target);
+
+ guard.lock();
+
+ remapQueueNoLock(from, source, targets, op);
+}
+
+void
+FileStorHandlerImpl::remapQueue(
+ const RemapInfo& source,
+ RemapInfo& target1,
+ RemapInfo& target2,
+ Operation op)
+{
+ // Use a helper class to lock to solve issue that some buckets might be
+ // the same bucket. Will fix order if we accept wrong order later.
+ MultiLockGuard guard;
+
+ Disk& from(_diskInfo[source.diskIndex]);
+ guard.addLock(from.lock, source.diskIndex);
+
+ Disk& to1(_diskInfo[target1.diskIndex]);
+ if (target1.bid.getRawId() != 0) {
+ guard.addLock(to1.lock, target1.diskIndex);
+ }
+
+ Disk& to2(_diskInfo[target2.diskIndex]);
+ if (target2.bid.getRawId() != 0) {
+ guard.addLock(to2.lock, target2.diskIndex);
+ }
+
+ guard.lock();
+
+ std::vector<RemapInfo*> targets;
+ targets.push_back(&target1);
+ targets.push_back(&target2);
+
+ remapQueueNoLock(from, source, targets, op);
+}
+
+void
+FileStorHandlerImpl::failOperations(
+ const document::BucketId& bucket, uint16_t fromDisk,
+ const api::ReturnCode& err)
+{
+ Disk& from(_diskInfo[fromDisk]);
+ vespalib::MonitorGuard lockGuard(from.lock);
+
+ BucketIdx& idx(boost::multi_index::get<2>(from.queue));
+ std::pair<BucketIdx::iterator, BucketIdx::iterator> range(
+ idx.equal_range(bucket));
+
+ for (auto iter = range.first; iter != range.second;) {
+ // We want to post delete bucket to list before calling this
+ // function in order to release bucket database lock. Thus we
+ // cannot delete the delete bucket operation itself
+ if (iter->_command->getType() != api::MessageType::DELETEBUCKET) {
+ if (!iter->_command->getType().isReply()) {
+ std::shared_ptr<api::StorageReply> msgReply(
+ static_cast<api::StorageCommand&>(*iter->_command)
+ .makeReply().release());
+ msgReply->setResult(err);
+ _messageSender.sendReply(msgReply);
+ }
+ iter = idx.erase(iter);
+ } else {
+ ++iter;
+ }
+ }
+}
+
+void
+FileStorHandlerImpl::sendCommand(
+ const std::shared_ptr<api::StorageCommand>& msg)
+{
+ _messageSender.sendCommand(msg);
+}
+
+void
+FileStorHandlerImpl::sendReply(const std::shared_ptr<api::StorageReply>& msg)
+{
+ _messageSender.sendReply(msg);
+}
+
+FileStorHandlerImpl::Disk::Disk()
+ : lock(),
+ queue(),
+ lockedBuckets(100),
+ metrics(0),
+ state(FileStorHandler::AVAILABLE)
+{
+}
+
+bool
+FileStorHandlerImpl::Disk::isLocked(
+ const document::BucketId& bucket) const noexcept
+{
+ return (lockedBuckets.find(bucket) != lockedBuckets.end());
+}
+
+uint32_t
+FileStorHandlerImpl::Disk::getQueueSize() const noexcept
+{
+ return queue.size();
+}
+
+uint32_t
+FileStorHandlerImpl::getQueueSize(uint16_t disk) const
+{
+ const Disk& t(_diskInfo[disk]);
+ vespalib::MonitorGuard lockGuard(t.lock);
+ return t.getQueueSize();
+}
+
+FileStorHandlerImpl::BucketLock::BucketLock(
+ Disk& disk,
+ const document::BucketId& id,
+ uint8_t priority,
+ const vespalib::stringref & statusString)
+ : _disk(disk),
+ _id(id)
+{
+ if (_id.getRawId() != 0) {
+ // Lock the bucket and wait until it is not the current operation for
+ // the disk itself.
+ _disk.lockedBuckets.insert(
+ std::make_pair(_id, Disk::LockEntry(priority, statusString)));
+ LOG(debug,
+ "Locked bucket %s with priority %u",
+ id.toString().c_str(),
+ priority);
+
+ LOG_BUCKET_OPERATION_SET_LOCK_STATE(
+ _id, "acquired filestor lock", false,
+ debug::BucketOperationLogger::State::BUCKET_LOCKED);
+ }
+}
+
+
+FileStorHandlerImpl::BucketLock::~BucketLock()
+{
+ if (_id.getRawId() != 0) {
+ vespalib::MonitorGuard lockGuard(_disk.lock);
+ _disk.lockedBuckets.erase(_id);
+ LOG(debug, "Unlocked bucket %s", _id.toString().c_str());
+ LOG_BUCKET_OPERATION_SET_LOCK_STATE(
+ _id, "released filestor lock", true,
+ debug::BucketOperationLogger::State::BUCKET_UNLOCKED);
+ lockGuard.broadcast();
+ }
+}
+
+std::string
+FileStorHandlerImpl::dumpQueue(uint16_t disk) const
+{
+ std::ostringstream ost;
+
+ const Disk& t(_diskInfo[disk]);
+ vespalib::MonitorGuard lockGuard(t.lock);
+
+ const PriorityIdx& idx = boost::multi_index::get<1>(t.queue);
+ for (PriorityIdx::const_iterator it = idx.begin();
+ it != idx.end();
+ it++)
+ {
+ ost << it->_bucketId << ": " << it->_command->toString() << " (priority: "
+ << (int)it->_command->getPriority() << ")\n";
+ }
+
+ return ost.str();
+}
+
+void
+FileStorHandlerImpl::getStatus(std::ostream& out,
+ const framework::HttpUrlPath& path) const
+{
+ bool verbose = path.hasAttribute("verbose");
+ out << "<h1>Filestor handler</h1>\n";
+ for (uint32_t i=0; i<_diskInfo.size(); ++i) {
+ out << "<h2>Disk " << i << "</h2>\n";
+ const Disk& t(_diskInfo[i]);
+ vespalib::MonitorGuard lockGuard(t.lock);
+ out << "Queue size: " << t.getQueueSize() << "<br>\n";
+ out << "Disk state: ";
+ switch (t.getState()) {
+ case FileStorHandler::AVAILABLE: out << "AVAILABLE"; break;
+ case FileStorHandler::DISABLED: out << "DISABLED"; break;
+ case FileStorHandler::CLOSED: out << "CLOSED"; break;
+ }
+ out << "<h4>Active operations</h4>\n";
+ for (const auto& lockedBucket : t.lockedBuckets) {
+ out << lockedBucket.second.statusString
+ << " (" << lockedBucket.first
+ << ") Running for "
+ << (_component.getClock().getTimeInSeconds().getTime()
+ - lockedBucket.second.timestamp)
+ << " secs<br/>\n";
+ }
+ if (!verbose) continue;
+ out << "<h4>Input queue</h4>\n";
+
+ out << "<ul>\n";
+ const PriorityIdx& idx = boost::multi_index::get<1>(t.queue);
+ for (PriorityIdx::const_iterator it = idx.begin();
+ it != idx.end();
+ it++)
+ {
+ out << "<li>" << it->_command->toString() << " (priority: "
+ << (int)it->_command->getPriority() << ")</li>\n";
+ }
+ out << "</ul>\n";
+ }
+
+ out << "<tr><td>Active merge operations</td><td>" << _mergeStates.size()
+ << "</td></tr>\n";
+
+ // Print merge states
+ if (verbose) {
+ out << "<h4>Active merges</h4>\n";
+ if (_mergeStates.size() == 0) {
+ out << "None\n";
+ }
+ for (std::map<document::BucketId, MergeStatus::SP>::const_iterator it
+ = _mergeStates.begin(); it != _mergeStates.end(); ++it)
+ {
+ out << "<b>" << it->first.toString() << "</b><br>\n";
+ // << "<p>" << it->second << "</p>\n"; // Gets very spammy with
+ // the complete state here..
+ }
+ }
+}
+
+void
+FileStorHandlerImpl::waitUntilNoLocks()
+{
+ for (uint32_t i=0; i<_diskInfo.size(); ++i) {
+ const Disk& t(_diskInfo[i]);
+ vespalib::MonitorGuard lockGuard(t.lock);
+ while (!t.lockedBuckets.empty()) {
+ lockGuard.wait();
+ }
+ }
+}
+
+ResumeGuard
+FileStorHandlerImpl::pause()
+{
+ _paused.store(true, std::memory_order_relaxed);
+ waitUntilNoLocks();
+ return ResumeGuard(*this);
+}
+
+void
+FileStorHandlerImpl::resume()
+{
+ vespalib::MonitorGuard g(_pauseMonitor);
+ _paused.store(false, std::memory_order_relaxed);
+ g.broadcast();
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/filestorage/filestorhandlerimpl.h b/storage/src/vespa/storage/persistence/filestorage/filestorhandlerimpl.h
new file mode 100644
index 00000000000..5abf033be2b
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/filestorhandlerimpl.h
@@ -0,0 +1,362 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::FileStorHandlerImpl
+ * \ingroup storage
+ *
+ * \brief Common resource for filestor threads.
+ *
+ * This class implements all locking related stuff between filestor threads.
+ * It keeps the various filestor thread queues, and implement thread safe
+ * functions for inserting, removing and moving stuff in the queues. In addition
+ * it makes it possible to lock buckets, by keeping track of current operation
+ * for various threads, and not allowing them to get another operation of a
+ * locked bucket until unlocked.
+ */
+
+#pragma once
+
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/metrics/metrics.h>
+#include <vespa/storage/common/servicelayercomponent.h>
+#include <vespa/storage/persistence/filestorage/filestorhandler.h>
+#include <vespa/storage/persistence/filestorage/mergestatus.h>
+#include <vespa/storageframework/storageframework.h>
+#include <boost/multi_index_container.hpp>
+#include <boost/multi_index/identity.hpp>
+#include <boost/multi_index/member.hpp>
+#include <boost/multi_index/mem_fun.hpp>
+#include <boost/multi_index/ordered_index.hpp>
+#include <boost/multi_index/sequenced_index.hpp>
+#include <vespa/storage/common/messagesender.h>
+#include <vespa/vespalib/stllike/hash_map.h>
+#include <atomic>
+
+namespace storage {
+
+class FileStorDiskMetrics;
+class StorBucketDatabase;
+class AbortBucketOperationsCommand;
+
+class FileStorHandlerImpl : private framework::MetricUpdateHook,
+ private ResumeGuard::Callback,
+ public MessageSender {
+public:
+ typedef FileStorHandler::DiskState DiskState;
+ typedef FileStorHandler::RemapInfo RemapInfo;
+
+ struct MessageEntry {
+ std::shared_ptr<api::StorageMessage> _command;
+ metrics::MetricTimer _timer;
+ document::BucketId _bucketId;
+ uint8_t _priority;
+
+ MessageEntry(const std::shared_ptr<api::StorageMessage>& cmd,
+ const document::BucketId& bId)
+ : _command(cmd),
+ _bucketId(bId),
+ _priority(cmd->getPriority())
+ {}
+
+ MessageEntry(const MessageEntry& entry)
+ : _command(entry._command),
+ _timer(entry._timer),
+ _bucketId(entry._bucketId),
+ _priority(entry._priority)
+ {
+ }
+
+ bool operator<(const MessageEntry& entry) const {
+ return (_priority < entry._priority);
+ }
+ };
+
+ typedef boost::multi_index::ordered_non_unique<
+ boost::multi_index::identity<MessageEntry> > PriorityOrder;
+
+ typedef boost::multi_index::ordered_non_unique<
+ boost::multi_index::member<MessageEntry,
+ document::BucketId,
+ &MessageEntry::_bucketId> > BucketOrder;
+
+ typedef boost::multi_index::multi_index_container<
+ MessageEntry,
+ boost::multi_index::indexed_by<
+ boost::multi_index::sequenced<>,
+ PriorityOrder,
+ BucketOrder
+ >
+ > PriorityQueue;
+
+ typedef boost::multi_index::nth_index<PriorityQueue, 1>::type PriorityIdx;
+ typedef boost::multi_index::nth_index<PriorityQueue, 2>::type BucketIdx;
+
+ struct Disk {
+ vespalib::Monitor lock;
+ PriorityQueue queue;
+
+ struct LockEntry {
+ uint32_t timestamp;
+ uint8_t priority;
+ vespalib::string statusString;
+
+ LockEntry()
+ : timestamp(0), priority(0), statusString()
+ {
+ }
+
+ LockEntry(uint8_t priority_,
+ vespalib::stringref status)
+ : timestamp(time(NULL)),
+ priority(priority_),
+ statusString(status)
+ {
+ }
+ };
+
+ typedef vespalib::hash_map<
+ document::BucketId,
+ LockEntry,
+ document::BucketId::hash
+ > LockedBuckets;
+ LockedBuckets lockedBuckets;
+ FileStorDiskMetrics* metrics;
+
+ /**
+ * No assumption on memory ordering around disk state reads should
+ * be made by callers.
+ */
+ DiskState getState() const noexcept {
+ return state.load(std::memory_order_relaxed);
+ }
+ /**
+ * No assumption on memory ordering around disk state writes should
+ * be made by callers.
+ */
+ void setState(DiskState s) noexcept {
+ state.store(s, std::memory_order_relaxed);
+ }
+
+ Disk();
+
+ bool isLocked(const document::BucketId&) const noexcept;
+ uint32_t getQueueSize() const noexcept;
+ private:
+ std::atomic<DiskState> state;
+ };
+
+ class BucketLock : public FileStorHandler::BucketLockInterface {
+ public:
+ BucketLock(Disk& disk,
+ const document::BucketId& id,
+ uint8_t priority,
+ const vespalib::stringref & statusString);
+ ~BucketLock();
+
+ const document::BucketId& getBucketId() const {
+ return _id;
+ }
+
+ private:
+ Disk& _disk;
+ document::BucketId _id;
+ };
+
+ FileStorHandlerImpl(MessageSender&,
+ FileStorMetrics&,
+ const spi::PartitionStateList&,
+ ServiceLayerComponentRegister&,
+ uint8_t maxPriorityToBlock,
+ uint8_t minPriorityToBeBlocking);
+
+ ~FileStorHandlerImpl();
+ void setGetNextMessageTimeout(uint32_t timeout)
+ { _getNextMessageTimeout = timeout; }
+
+ void flush(bool killPendingMerges);
+ void setDiskState(uint16_t disk, DiskState state);
+ DiskState getDiskState(uint16_t disk) const;
+ void close();
+ bool schedule(const std::shared_ptr<api::StorageMessage>&,
+ uint16_t disk);
+
+ void pause(uint16_t disk, uint8_t priority) const;
+
+ FileStorHandler::LockedMessage getNextMessage(uint16_t disk,
+ uint8_t lowestPriority);
+
+ FileStorHandler::LockedMessage & getNextMessage(uint16_t disk,
+ FileStorHandler::LockedMessage& lock,
+ uint8_t lowestPriority);
+
+ enum Operation { MOVE, SPLIT, JOIN };
+ void remapQueue(
+ const RemapInfo& source,
+ RemapInfo& target,
+ Operation op);
+
+ void remapQueue(const RemapInfo& source,
+ RemapInfo& target1,
+ RemapInfo& target2,
+ Operation op);
+
+ void failOperations(const document::BucketId&, uint16_t fromDisk,
+ const api::ReturnCode&);
+ void sendCommand(const std::shared_ptr<api::StorageCommand>&);
+ void sendReply(const std::shared_ptr<api::StorageReply>&);
+
+ void getStatus(std::ostream& out, const framework::HttpUrlPath& path) const;
+
+ uint32_t getQueueSize() const;
+ uint32_t getQueueSize(uint16_t disk) const;
+
+ std::shared_ptr<FileStorHandler::BucketLockInterface>
+ lock(const document::BucketId&, uint16_t disk);
+
+ void addMergeStatus(const document::BucketId&, MergeStatus::SP);
+ MergeStatus& editMergeStatus(const document::BucketId&);
+
+ bool isMerging(const document::BucketId&) const;
+
+ uint32_t getNumActiveMerges() const;
+
+ void clearMergeStatus(const document::BucketId&, const api::ReturnCode*);
+
+ std::string dumpQueue(uint16_t disk) const;
+
+ ResumeGuard pause();
+
+ void resume();
+
+ void abortQueuedOperations(const AbortBucketOperationsCommand& cmd);
+
+private:
+ const spi::PartitionStateList& _partitions;
+ ServiceLayerComponent _component;
+ std::vector<Disk> _diskInfo;
+ MessageSender& _messageSender;
+ const document::BucketIdFactory& _bucketIdFactory;
+
+ vespalib::Lock _mergeStatesLock;
+
+ std::map<document::BucketId, MergeStatus::SP> _mergeStates;
+
+ uint8_t _maxPriorityToBlock;
+ uint8_t _minPriorityToBeBlocking;
+ uint32_t _getNextMessageTimeout;
+
+ vespalib::Monitor _pauseMonitor;
+ std::atomic<bool> _paused;
+
+ void reply(api::StorageMessage&, DiskState state) const;
+ static document::BucketId getBucketId(const api::StorageMessage&);
+
+ // Returns the index in the targets array we are sending to, or -1 if none of them match.
+ int calculateTargetBasedOnDocId(
+ const api::StorageMessage& msg,
+ std::vector<RemapInfo*>& targets);
+
+ /**
+ * If FileStor layer is explicitly paused, try to wait a single time, then
+ * recheck pause status. Returns true if filestor isn't paused at the time
+ * of the first check or after the wait, false if it's still paused.
+ */
+ bool tryHandlePause(uint16_t disk) const;
+
+ /**
+ * Checks whether the entire filestor layer is paused.
+ * Since there should be no data or synchronization dependencies on
+ * _paused, use relaxed atomics.
+ */
+ bool isPaused() const { return _paused.load(std::memory_order_relaxed); }
+
+ /**
+ * Return whether a disk has been shut down by the system (IO failure is
+ * the most likely candidate here) and should not serve any more requests.
+ */
+ bool diskIsClosed(uint16_t disk) const;
+
+ /**
+ * Return whether msg has sufficiently high priority that a thread with
+ * a configured priority threshold of maxPriority can even run in.
+ * Often, operations such as streaming searches will have dedicated threads
+ * that refuse lower priority operations such as Puts etc.
+ */
+ bool operationHasHighEnoughPriorityToBeRun(
+ const api::StorageMessage& msg,
+ uint8_t maxPriority) const;
+
+ /**
+ * Return whether an already running high priority operation pre-empts
+ * (blocks) the operation in msg from even starting in the current thread.
+ */
+ bool operationBlockedByHigherPriorityThread(
+ const api::StorageMessage& msg,
+ const Disk& disk) const;
+
+ /**
+ * Return whether msg has timed out based on waitTime and the message's
+ * specified timeout.
+ */
+ bool messageTimedOutInQueue(const api::StorageMessage& msg,
+ uint64_t waitTime) const;
+
+ /**
+ * Assume ownership of lock for a given bucket on a given disk.
+ * Disk lock MUST have been taken prior to calling this function.
+ */
+ std::unique_ptr<FileStorHandler::BucketLockInterface>
+ takeDiskBucketLockOwnership(Disk& disk,
+ const document::BucketId& id,
+ const api::StorageMessage& msg);
+
+ /**
+ * Creates and returns a reply with api::TIMEOUT return code for msg.
+ * Swaps (invalidates) context from msg into reply.
+ */
+ std::unique_ptr<api::StorageReply>
+ makeQueueTimeoutReply(api::StorageMessage& msg) const;
+
+ bool bucketIsLockedOnDisk(const document::BucketId&,
+ const Disk&) const;
+
+ bool messageMayBeAborted(const api::StorageMessage& msg) const;
+
+ bool hasBlockingOperations(const Disk& t) const;
+
+ void abortQueuedCommandsForBuckets(
+ Disk& disk,
+ const AbortBucketOperationsCommand& cmd);
+
+ bool diskHasActiveOperationForAbortedBucket(
+ const Disk& disk,
+ const AbortBucketOperationsCommand& cmd) const;
+
+ void waitUntilNoActiveOperationsForAbortedBuckets(
+ Disk& disk,
+ const AbortBucketOperationsCommand& cmd);
+
+ // Update hook
+ void updateMetrics(const MetricLockGuard &) override;
+
+ document::BucketId remapMessage(api::StorageMessage& msg,
+ const document::BucketId& source,
+ Operation op,
+ std::vector<RemapInfo*>& targets,
+ uint16_t& targetDisk,
+ api::ReturnCode& returnCode);
+
+ void remapQueueNoLock(
+ Disk& from,
+ const RemapInfo& source,
+ std::vector<RemapInfo*>& targets,
+ Operation op);
+
+ /**
+ * Waits until the queue has no pending operations (i.e. no locks are
+ * being held.
+ */
+ void waitUntilNoLocks();
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/filestorage/filestormanager.cpp b/storage/src/vespa/storage/persistence/filestorage/filestormanager.cpp
new file mode 100644
index 00000000000..17024c6d1c0
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/filestormanager.cpp
@@ -0,0 +1,1081 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+
+#include <set>
+#include <sys/types.h>
+#include <signal.h>
+#include <unistd.h>
+#include <vespa/document/bucket/bucketidfactory.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <vespa/storageapi/message/internal.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/removelocation.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storage/config/config-stor-server.h>
+#include <vespa/storage/persistence/persistencethread.h>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/storage/storageserver/statemanager.h>
+#include <vespa/storage/storageserver/storagemetricsset.h>
+#include <vespa/storage/storageutil/log.h>
+#include <vespa/storage/common/messagebucketid.h>
+#include <vespa/storage/persistence/bucketownershipnotifier.h>
+#include <vespa/vdslib/state/random.h>
+#include <vespa/vdslib/state/nodestate.h>
+#include <vespa/storageapi/message/stat.h>
+#include <vespa/storageapi/message/batch.h>
+#include <vespa/vespalib/io/fileutil.h>
+
+LOG_SETUP(".persistence.filestor.manager");
+
+using std::shared_ptr;
+
+namespace storage {
+
+FileStorManager::
+FileStorManager(const config::ConfigUri & configUri,
+ const spi::PartitionStateList& partitions,
+ spi::PersistenceProvider& provider,
+ ServiceLayerComponentRegister& compReg)
+ : StorageLinkQueued("File store manager", compReg),
+ framework::HtmlStatusReporter("filestorman", "File store manager"),
+ _compReg(compReg),
+ _component(compReg, "filestormanager"),
+ _partitions(partitions),
+ _providerCore(provider),
+ _providerShutdown(_providerCore, _component),
+ _nodeUpInLastNodeStateSeenByProvider(false),
+ _providerMetric(new spi::MetricPersistenceProvider(_providerShutdown)),
+ _provider(_providerMetric.get()),
+ _bucketIdFactory(_component.getBucketIdFactory()),
+ _configUri(configUri),
+ _disks(),
+ _bucketOwnershipNotifier(new BucketOwnershipNotifier(_component, *this)),
+ _configFetcher(_configUri.getContext()),
+ _threadLockCheckInterval(60),
+ _failDiskOnError(false),
+ _metrics(new FileStorMetrics(_component.getLoadTypes()->getMetricLoadTypes())),
+ _threadMonitor(),
+ _closed(false)
+{
+ _metrics->registerMetric(*_providerMetric),
+ _configFetcher.subscribe(_configUri.getConfigId(), this);
+ _configFetcher.start();
+ _component.registerMetric(*_metrics);
+ _component.registerStatusPage(*this);
+ _component.getStateUpdater().addStateListener(*this);
+}
+
+FileStorManager::~FileStorManager()
+{
+ closeNextLink();
+ LOG(debug, "Deleting link %s. Giving filestor threads stop signal.",
+ toString().c_str());
+
+ for (uint32_t i = 0; i < _disks.size(); ++i) {
+ for (uint32_t j = 0; j < _disks[i].size(); ++j) {
+ if (_disks[i][j].get() != 0) {
+ _disks[i][j]->getThread().interrupt();
+ }
+ }
+ }
+ for (uint32_t i = 0; i < _disks.size(); ++i) {
+ for (uint32_t j = 0; j < _disks[i].size(); ++j) {
+ if (_disks[i][j].get() != 0) {
+ _disks[i][j]->getThread().join();
+ }
+ }
+ }
+ LOG(debug, "Closing all filestor queues, answering queued messages. "
+ "New messages will be refused.");
+ _filestorHandler->close();
+ LOG(debug, "Deleting filestor threads. Waiting for their current operation "
+ "to finish. Stop their threads and delete objects.");
+ _disks.clear();
+}
+
+void
+FileStorManager::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "FileStorManager";
+}
+
+/**
+ * If live configuration, assuming storageserver makes sure no messages are
+ * incoming during reconfiguration
+ */
+void
+FileStorManager::configure(std::unique_ptr<vespa::config::content::StorFilestorConfig> config)
+{
+ // If true, this is not the first configure.
+ bool liveUpdate = (_disks.size() != 0);
+
+ _threadLockCheckInterval = config->diskOperationTimeout;
+ _failDiskOnError = (config->failDiskAfterErrorCount > 0);
+
+ if (!liveUpdate) {
+ _config = std::move(config);
+
+ _disks.resize(_component.getDiskCount());
+
+ _metrics->initDiskMetrics(
+ _disks.size(),
+ _component.getLoadTypes()->getMetricLoadTypes(),
+ (_config->threads.size() > 0) ? (_config->threads.size()) : 6);
+
+ _filestorHandler.reset(new FileStorHandler(
+ *this, *_metrics, _partitions, _compReg,
+ _config->maxPriorityToBlock, _config->minPriorityToBeBlocking));
+ for (uint32_t i=0; i<_component.getDiskCount(); ++i) {
+ if (_partitions[i].isUp()) {
+ if (_config->threads.size() == 0) {
+ LOG(spam, "Setting up disk %u", i);
+ for (uint32_t j = 0; j < 4; j++) {
+ _disks[i].push_back(DiskThread::SP(
+ new PersistenceThread(
+ _compReg, _configUri, *_provider,
+ *_filestorHandler,
+ *_metrics->disks[i]->threads[j],
+ i, 255, false)));
+
+ }
+ for (uint32_t j = 4; j < 6; j++) {
+ _disks[i].push_back(DiskThread::SP(
+ new PersistenceThread(
+ _compReg, _configUri, *_provider,
+ *_filestorHandler,
+ *_metrics->disks[i]->threads[j],
+ i, 100)));
+ }
+ }
+
+ for (uint16_t j = 0; j < _config->threads.size(); j++) {
+ LOG(spam, "Setting up disk %u, thread %u with priority %d",
+ i, j, _config->threads[j].lowestpri);
+ _disks[i].push_back(DiskThread::SP(
+ new PersistenceThread(
+ _compReg, _configUri, *_provider,
+ *_filestorHandler,
+ *_metrics->disks[i]->threads[j],
+ i, _config->threads[j].lowestpri,
+ false)));
+
+ }
+ } else {
+ _filestorHandler->disable(i);
+ }
+ }
+ }
+}
+
+void
+FileStorManager::replyDroppedOperation(api::StorageMessage& msg,
+ const document::BucketId& bucket,
+ api::ReturnCode::Result returnCode,
+ vespalib::stringref reason)
+{
+ std::ostringstream error;
+ error << "Dropping " << msg.getType() << " to bucket "
+ << bucket.toString() << ". Reason: " << reason;
+ LOGBT(debug, bucket.toString(), "%s", error.str().c_str());
+ if (!msg.getType().isReply()) {
+ std::shared_ptr<api::StorageReply> reply(
+ static_cast<api::StorageCommand&>(msg).makeReply().release());
+ reply->setResult(api::ReturnCode(returnCode, error.str()));
+ sendUp(reply);
+ }
+}
+
+void
+FileStorManager::replyWithBucketNotFound(api::StorageMessage& msg,
+ const document::BucketId& bucket)
+{
+ replyDroppedOperation(msg,
+ bucket,
+ api::ReturnCode::BUCKET_NOT_FOUND,
+ "bucket does not exist");
+}
+
+StorBucketDatabase::WrappedEntry
+FileStorManager::mapOperationToDisk(api::StorageMessage& msg,
+ const document::BucketId& bucket)
+{
+ StorBucketDatabase::WrappedEntry entry(_component.getBucketDatabase().get(
+ bucket, "FileStorManager::mapOperationToDisk"));
+ if (!entry.exist()) {
+ replyWithBucketNotFound(msg, bucket);
+ }
+ return entry;
+}
+
+StorBucketDatabase::WrappedEntry
+FileStorManager::mapOperationToBucketAndDisk(api::BucketCommand& cmd,
+ const document::DocumentId* docId)
+{
+ StorBucketDatabase::WrappedEntry entry(_component.getBucketDatabase().get(
+ cmd.getBucketId(), "FileStorManager::mapOperationToBucketAndDisk"));
+ if (!entry.exist()) {
+ document::BucketId specific(cmd.getBucketId());
+ if (docId) {
+ specific = _bucketIdFactory.getBucketId(*docId);
+ }
+ typedef std::map<document::BucketId,
+ StorBucketDatabase::WrappedEntry> BucketMap;
+ std::shared_ptr<api::StorageReply> reply;
+ {
+ BucketMap results(
+ _component.getBucketDatabase().getContained(
+ specific, "FileStorManager::mapOperationToBucketAndDisk-2"));
+ if (results.size() == 1) {
+ LOG(debug,
+ "Remapping %s operation to specific %s versus "
+ "non-existing %s to %s.",
+ cmd.toString().c_str(), specific.toString().c_str(),
+ cmd.getBucketId().toString().c_str(),
+ results.begin()->first.toString().c_str());
+ cmd.remapBucketId(results.begin()->first);
+ return results.begin()->second;
+ }
+ std::ostringstream error;
+ error << "Dropping " << cmd.getType() << " to bucket "
+ << cmd.getBucketId().toString() << " since bucket doesnt exist. ";
+ if (results.size() > 1) {
+ error << "Bucket was inconsistent with " << results.size()
+ << " entries so no automatic remapping done:";
+ BucketMap::const_iterator it = results.begin();
+ for (uint32_t i=0; i <= 4 && it != results.end(); ++it, ++i) {
+ error << " " << it->first;
+ }
+ if (it != results.end()) {
+ error << " ...";
+ }
+ } else {
+ error << "No other bucket exists that can contain this data either.";
+ }
+ LOGBT(debug, cmd.getBucketId().toString(), "%s", error.str().c_str());
+
+ reply.reset(static_cast<api::StorageCommand&>(cmd).makeReply().release());
+ reply->setResult(
+ api::ReturnCode(
+ api::ReturnCode::BUCKET_NOT_FOUND, error.str()));
+ }
+ sendUp(reply);
+ }
+ return entry;
+}
+
+bool
+FileStorManager::handlePersistenceMessage(
+ const shared_ptr<api::StorageMessage>& msg, uint16_t disk)
+{
+ api::ReturnCode errorCode(api::ReturnCode::OK);
+ do {
+ LOG(spam, "Received %s. Attempting to queue it to disk %u.",
+ msg->getType().getName().c_str(), disk);
+
+ LOG_BUCKET_OPERATION_NO_LOCK(
+ getStorageMessageBucketId(*msg),
+ vespalib::make_vespa_string("Attempting to queue %s to disk %u",
+ msg->toString().c_str(), disk));
+
+
+ if (_filestorHandler->schedule(msg, disk)) {
+ LOG(spam, "Received persistence message %s. Queued it to disk %u",
+ msg->getType().getName().c_str(), disk);
+ return true;
+ }
+ switch (_filestorHandler->getDiskState(disk)) {
+ case FileStorHandler::DISABLED:
+ errorCode = api::ReturnCode(api::ReturnCode::DISK_FAILURE,
+ "Disk disabled");
+ break;
+ case FileStorHandler::CLOSED:
+ errorCode = api::ReturnCode(api::ReturnCode::ABORTED,
+ "Shutting down storage node.");
+ break;
+ case FileStorHandler::AVAILABLE:
+ assert(false);
+ }
+ } while(0);
+ // If we get here, we failed to schedule message. errorCode says why
+ // We need to reply to message (while not having bucket lock)
+ if (!msg->getType().isReply()) {
+ std::shared_ptr<api::StorageReply> reply(
+ static_cast<api::StorageCommand&>(*msg).makeReply().release());
+ reply->setResult(errorCode);
+ LOG(spam, "Received persistence message %s. Returning reply: %s",
+ msg->getType().getName().c_str(), errorCode.toString().c_str());
+ dispatchUp(reply);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onPut(const shared_ptr<api::PutCommand>& cmd)
+{
+ if (cmd->getTimestamp() == 0) {
+ shared_ptr<api::StorageReply> reply(cmd->makeReply().release());
+ std::string msg("Put command received without timestamp set. "
+ "Distributor need to set timestamp to ensure equal "
+ "timestamps between storage nodes. Rejecting.");
+ reply->setResult(api::ReturnCode(api::ReturnCode::REJECTED, msg));
+ sendUp(reply);
+ return true;
+ }
+ StorBucketDatabase::WrappedEntry entry(mapOperationToBucketAndDisk(
+ *cmd, &cmd->getDocumentId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onUpdate(const shared_ptr<api::UpdateCommand>& cmd)
+{
+ if (cmd->getTimestamp() == 0) {
+ shared_ptr<api::StorageReply> reply(cmd->makeReply().release());
+ std::string msg("Update command received without timestamp set. "
+ "Distributor need to set timestamp to ensure equal "
+ "timestamps between storage nodes. Rejecting.");
+ reply->setResult(api::ReturnCode(api::ReturnCode::REJECTED, msg));
+ sendUp(reply);
+ return true;
+ }
+ StorBucketDatabase::WrappedEntry entry(mapOperationToBucketAndDisk(
+ *cmd, &cmd->getDocumentId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onGet(const shared_ptr<api::GetCommand>& cmd)
+{
+ StorBucketDatabase::WrappedEntry entry(mapOperationToBucketAndDisk(
+ *cmd, &cmd->getDocumentId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onRemove(const shared_ptr<api::RemoveCommand>& cmd)
+{
+ if (cmd->getTimestamp() == 0) {
+ shared_ptr<api::StorageReply> reply(cmd->makeReply().release());
+ std::string msg("Remove command received without timestamp set. "
+ "Distributor need to set timestamp to ensure equal "
+ "timestamps between storage nodes. Rejecting.");
+ reply->setResult(api::ReturnCode(api::ReturnCode::REJECTED, msg));
+ sendUp(reply);
+ return true;
+ }
+ StorBucketDatabase::WrappedEntry entry(mapOperationToBucketAndDisk(
+ *cmd, &cmd->getDocumentId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onRevert(const shared_ptr<api::RevertCommand>& cmd)
+{
+ StorBucketDatabase::WrappedEntry entry(mapOperationToBucketAndDisk(
+ *cmd, 0));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onMultiOperation(
+ const std::shared_ptr<api::MultiOperationCommand>& cmd)
+{
+ StorBucketDatabase::WrappedEntry entry(mapOperationToBucketAndDisk(
+ *cmd, 0));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onBatchPutRemove(
+ const std::shared_ptr<api::BatchPutRemoveCommand>& cmd)
+{
+ StorBucketDatabase::WrappedEntry entry(mapOperationToBucketAndDisk(
+ *cmd, 0));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onRemoveLocation(
+ const std::shared_ptr<api::RemoveLocationCommand>& cmd)
+{
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *cmd, cmd->getBucketId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onStatBucket(
+ const std::shared_ptr<api::StatBucketCommand>& cmd)
+{
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *cmd, cmd->getBucketId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onCreateBucket(
+ const std::shared_ptr<api::CreateBucketCommand>& cmd)
+{
+ api::ReturnCode code(api::ReturnCode::OK);
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _component.getBucketDatabase().get(
+ cmd->getBucketId(), "FileStorManager::onCreateBucket",
+ StorBucketDatabase::CREATE_IF_NONEXISTING));
+ if (entry.preExisted()) {
+ LOG(debug,
+ "Got create bucket request for %s which already exists: %s",
+ cmd->getBucketId().toString().c_str(),
+ entry->getBucketInfo().toString().c_str());
+ code = api::ReturnCode(api::ReturnCode::EXISTS,
+ "Bucket already exist");
+ } else {
+ entry->disk = _component.getIdealPartition(cmd->getBucketId());
+ if (_partitions[entry->disk].isUp()) {
+ // Newly created buckets are ready but not active, unless
+ // explicitly marked as such by the distributor.
+ entry->setBucketInfo(api::BucketInfo(
+ 0, 0, 0, 0, 0, true, cmd->getActive()));
+ cmd->setPriority(0);
+ handlePersistenceMessage(cmd, entry->disk);
+ entry.write();
+ LOG(debug, "Created bucket %s on disk %d (node index is %d)",
+ cmd->getBucketId().toString().c_str(),
+ entry->disk, _component.getIndex());
+ return true;
+ } else {
+ entry.remove();
+ code = api::ReturnCode(
+ api::ReturnCode::IO_FAILURE,
+ vespalib::make_string(
+ "Trying to create bucket %s on disabled disk %d",
+ cmd->getBucketId().toString().c_str(),
+ entry->disk));
+ }
+ }
+ }
+ std::shared_ptr<api::CreateBucketReply> reply(
+ (api::CreateBucketReply*)cmd->makeReply().release());
+ reply->setBucketInfo(api::BucketInfo(0, 0, 0, 0, 0, true, cmd->getActive()));
+ reply->setResult(code);
+ sendUp(reply);
+ return true;
+}
+
+bool
+FileStorManager::onDeleteBucket(const shared_ptr<api::DeleteBucketCommand>& cmd)
+{
+ uint16_t disk;
+ {
+ StorBucketDatabase::WrappedEntry entry(
+ _component.getBucketDatabase().get(
+ cmd->getBucketId(), "FileStorManager::onDeleteBucket"));
+ if (!entry.exist()) {
+ LOG(debug, "%s was already deleted",
+ cmd->getBucketId().toString().c_str());
+ std::shared_ptr<api::StorageReply> reply(cmd->makeReply().release());
+ sendUp(reply);
+ return true;
+ }
+
+ // If bucket info in command is invalid, it means it was sent by a
+ // distributor with an older protocol implementation of
+ // DeleteBucketCommand, so we should always allow it to go through
+ if (cmd->getBucketInfo().valid()
+ && (cmd->getBucketInfo().getChecksum()
+ != entry->getBucketInfo().getChecksum()))
+ {
+ vespalib::asciistream ost;
+ ost << "DeleteBucketCommand("
+ << cmd->getBucketId().toString()
+ << ") did not have up to date bucketinfo. "
+ << "Distributor thought we had "
+ << cmd->getBucketInfo().toString()
+ << ", but storage bucket database contains "
+ << entry->getBucketInfo().toString();
+
+ LOG(debug, "Rejecting bucket delete: %s", ost.str().c_str());
+ std::shared_ptr<api::StorageReply> reply(cmd->makeReply().release());
+ static_cast<api::DeleteBucketReply&>(*reply).setBucketInfo(
+ entry->getBucketInfo());
+ reply->setResult(api::ReturnCode(api::ReturnCode::REJECTED,
+ ost.str()));
+ entry.unlock();
+ sendUp(reply);
+ return true;
+ }
+
+ // Forcing max pri on delete bucket for now, so we can't get into
+ // a race condition with a create bucket/put coming in after with
+ // higher priority.
+ cmd->setPriority(0);
+ LOG(debug, "Deleting %s", cmd->getBucketId().toString().c_str());
+ handlePersistenceMessage(cmd, entry->disk);
+ disk = entry->disk;
+ entry.remove();
+ }
+ _filestorHandler->failOperations(
+ cmd->getBucketId(),
+ disk,
+ api::ReturnCode(api::ReturnCode::BUCKET_DELETED,
+ vespalib::make_string(
+ "Bucket %s about to be deleted anyway",
+ cmd->getBucketId().toString().c_str())));
+ return true;
+}
+
+
+
+StorBucketDatabase::WrappedEntry
+FileStorManager::ensureConsistentBucket(
+ const document::BucketId& bucket,
+ api::StorageMessage& msg,
+ const char* callerId)
+{
+ StorBucketDatabase::WrappedEntry entry(_component.getBucketDatabase().get(
+ bucket, callerId, StorBucketDatabase::CREATE_IF_NONEXISTING));
+ assert(entry.exist());
+ if (!_component.getBucketDatabase().isConsistent(entry)) {
+ if (!entry.preExisted()) {
+ // Don't create empty bucket if merge isn't allowed to continue.
+ entry.remove();
+ }
+ replyDroppedOperation(msg,
+ bucket,
+ api::ReturnCode::ABORTED,
+ "bucket is inconsistently split");
+ return StorBucketDatabase::WrappedEntry();
+ }
+
+ return entry;
+}
+
+bool
+FileStorManager::onMergeBucket(const shared_ptr<api::MergeBucketCommand>& cmd)
+{
+ StorBucketDatabase::WrappedEntry entry(
+ ensureConsistentBucket(cmd->getBucketId(),
+ *cmd,
+ "FileStorManager::onMergeBucket"));
+ if (!entry.exist()) {
+ return true;
+ }
+
+ if (!entry.preExisted()) {
+ entry->disk = _component.getIdealPartition(cmd->getBucketId());
+ if (_partitions[entry->disk].isUp()) {
+ entry->info = api::BucketInfo(0, 0, 0, 0, 0, true, false);
+ LOG(debug, "Created bucket %s on disk %d (node index is %d) due "
+ "to merge being received.",
+ cmd->getBucketId().toString().c_str(),
+ entry->disk, _component.getIndex());
+ // Call before writing bucket entry as we need to have bucket
+ // lock while calling
+ handlePersistenceMessage(cmd, entry->disk);
+ entry.write();
+ } else {
+ entry.remove();
+ api::ReturnCode code(
+ api::ReturnCode::IO_FAILURE,
+ vespalib::make_string(
+ "Trying to perform merge %s whose bucket belongs on target disk %d, which is down. Cluster state version of command is %d, our system state version is %d",
+ cmd->toString().c_str(),
+ entry->disk,
+ cmd->getClusterStateVersion(),
+ _component.getStateUpdater().getSystemState()->getVersion()));
+ LOGBT(debug, cmd->getBucketId().toString(),
+ "%s", code.getMessage().c_str());
+ api::MergeBucketReply::SP reply(new api::MergeBucketReply(*cmd));
+ reply->setResult(code);
+ sendUp(reply);
+ return true;
+ }
+ } else {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onGetBucketDiff(
+ const shared_ptr<api::GetBucketDiffCommand>& cmd)
+{
+ StorBucketDatabase::WrappedEntry entry(
+ ensureConsistentBucket(cmd->getBucketId(),
+ *cmd,
+ "FileStorManager::onGetBucketDiff"));
+ if (!entry.exist()) {
+ return true;
+ }
+ if (!entry.preExisted()) {
+ entry->disk = _component.getIdealPartition(cmd->getBucketId());
+ if (_partitions[entry->disk].isUp()) {
+ LOG(debug, "Created bucket %s on disk %d (node index is %d) due "
+ "to get bucket diff being received.",
+ cmd->getBucketId().toString().c_str(),
+ entry->disk, _component.getIndex());
+ entry->info.setTotalDocumentSize(0);
+ entry->info.setUsedFileSize(0);
+ entry->info.setReady(true);
+ // Call before writing bucket entry as we need to have bucket
+ // lock while calling
+ handlePersistenceMessage(cmd, entry->disk);
+ entry.write();
+ } else {
+ entry.remove();
+ api::ReturnCode code(api::ReturnCode::IO_FAILURE,
+ vespalib::make_string(
+ "Trying to merge non-existing bucket %s, which "
+ "can't be created because target disk %d is down",
+ cmd->getBucketId().toString().c_str(),
+ entry->disk));
+ LOGBT(warning, cmd->getBucketId().toString(),
+ "%s", code.getMessage().c_str());
+ api::GetBucketDiffReply::SP reply(
+ new api::GetBucketDiffReply(*cmd));
+ reply->setResult(code);
+ sendUp(reply);
+ return true;
+ }
+ } else {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::validateApplyDiffCommandBucket(
+ api::StorageMessage& msg,
+ const StorBucketDatabase::WrappedEntry& entry)
+{
+ if (!entry.exist()) {
+ return false;
+ }
+ if (!_component.getBucketDatabase().isConsistent(entry)) {
+ replyDroppedOperation(msg,
+ entry.getBucketId(),
+ api::ReturnCode::ABORTED,
+ "bucket became inconsistent during merging");
+ return false;
+ }
+ return true;
+}
+
+bool
+FileStorManager::validateDiffReplyBucket(
+ const StorBucketDatabase::WrappedEntry& entry,
+ const document::BucketId& bucket)
+{
+ if (!entry.exist()) {
+ _filestorHandler->clearMergeStatus(bucket,
+ api::ReturnCode(api::ReturnCode::BUCKET_NOT_FOUND,
+ "Bucket removed during merge"));
+ return false;
+ }
+ if (!_component.getBucketDatabase().isConsistent(entry)) {
+ _filestorHandler->clearMergeStatus(bucket,
+ api::ReturnCode(api::ReturnCode::ABORTED,
+ "Bucket became inconsistent during merging"));
+ return false;
+ }
+ return true;
+}
+
+bool
+FileStorManager::onGetBucketDiffReply(
+ const shared_ptr<api::GetBucketDiffReply>& reply)
+{
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *reply, reply->getBucketId()));
+ if (validateDiffReplyBucket(entry, reply->getBucketId())) {
+ handlePersistenceMessage(reply, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onApplyBucketDiff(
+ const shared_ptr<api::ApplyBucketDiffCommand>& cmd)
+{
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *cmd, cmd->getBucketId()));
+ if (validateApplyDiffCommandBucket(*cmd, entry)) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onApplyBucketDiffReply(
+ const shared_ptr<api::ApplyBucketDiffReply>& reply)
+{
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *reply, reply->getBucketId()));
+ if (validateDiffReplyBucket(entry, reply->getBucketId())) {
+ handlePersistenceMessage(reply, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onJoinBuckets(
+ const std::shared_ptr<api::JoinBucketsCommand>& cmd)
+{
+ StorBucketDatabase::WrappedEntry entry(_component.getBucketDatabase().get(
+ cmd->getBucketId(), "FileStorManager::onJoinBuckets"));
+ uint16_t disk;
+ if (entry.exist()) {
+ disk = entry->disk;
+ } else {
+ disk = _component.getPreferredAvailablePartition(cmd->getBucketId());
+ }
+ return handlePersistenceMessage(cmd, disk);
+}
+
+bool
+FileStorManager::onSplitBucket(
+ const std::shared_ptr<api::SplitBucketCommand>& cmd)
+{
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *cmd, cmd->getBucketId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onSetBucketState(
+ const std::shared_ptr<api::SetBucketStateCommand>& cmd)
+{
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *cmd, cmd->getBucketId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+}
+
+bool
+FileStorManager::onInternal(const shared_ptr<api::InternalCommand>& msg)
+{
+ switch (msg->getType()) {
+ case GetIterCommand::ID:
+ {
+ shared_ptr<GetIterCommand> cmd(
+ std::static_pointer_cast<GetIterCommand>(msg));
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *cmd, cmd->getBucketId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+ }
+ case CreateIteratorCommand::ID:
+ {
+ shared_ptr<CreateIteratorCommand> cmd(
+ std::static_pointer_cast<CreateIteratorCommand>(msg));
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *cmd, cmd->getBucketId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+ }
+ case DestroyIteratorCommand::ID:
+ {
+ spi::Context context(msg->getLoadType(), msg->getPriority(),
+ msg->getTrace().getLevel());
+ shared_ptr<DestroyIteratorCommand> cmd(
+ std::static_pointer_cast<DestroyIteratorCommand>(msg));
+ _provider->destroyIterator(cmd->getIteratorId(), context);
+ msg->getTrace().getRoot().addChild(context.getTrace().getRoot());
+ return true;
+ }
+ case ReadBucketList::ID:
+ {
+ shared_ptr<ReadBucketList> cmd(
+ std::static_pointer_cast<ReadBucketList>(msg));
+
+ handlePersistenceMessage(cmd, cmd->getPartition());
+ return true;
+ }
+ case ReadBucketInfo::ID:
+ {
+ shared_ptr<ReadBucketInfo> cmd(
+ std::static_pointer_cast<ReadBucketInfo>(msg));
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *cmd, cmd->getBucketId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+ }
+ case InternalBucketJoinCommand::ID:
+ {
+ shared_ptr<InternalBucketJoinCommand> cmd(
+ std::static_pointer_cast<InternalBucketJoinCommand>(msg));
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *cmd, cmd->getBucketId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+ }
+ case RepairBucketCommand::ID:
+ {
+ shared_ptr<RepairBucketCommand> cmd(
+ std::static_pointer_cast<RepairBucketCommand>(msg));
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *cmd, cmd->getBucketId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+ }
+ case BucketDiskMoveCommand::ID:
+ {
+ shared_ptr<BucketDiskMoveCommand> cmd(
+ std::static_pointer_cast<BucketDiskMoveCommand>(msg));
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *cmd, cmd->getBucketId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+ }
+ case RecheckBucketInfoCommand::ID:
+ {
+ shared_ptr<RecheckBucketInfoCommand> cmd(
+ std::static_pointer_cast<RecheckBucketInfoCommand>(msg));
+ StorBucketDatabase::WrappedEntry entry(mapOperationToDisk(
+ *cmd, cmd->getBucketId()));
+ if (entry.exist()) {
+ handlePersistenceMessage(cmd, entry->disk);
+ }
+ return true;
+ }
+ case AbortBucketOperationsCommand::ID:
+ {
+ shared_ptr<AbortBucketOperationsCommand> cmd(
+ std::static_pointer_cast<AbortBucketOperationsCommand>(msg));
+ handleAbortBucketOperations(cmd);
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+void
+FileStorManager::handleAbortBucketOperations(
+ const shared_ptr<AbortBucketOperationsCommand>& cmd)
+{
+ _filestorHandler->abortQueuedOperations(*cmd);
+ sendReply(api::StorageReply::SP(cmd->makeReply().release()));
+}
+
+bool
+FileStorManager::onInternalReply(const shared_ptr<api::InternalReply>& r)
+{
+ switch(r->getType()) {
+ case GetIterReply::ID:
+ {
+ sendUp(r);
+ return true;
+ }
+ default:
+ return false;
+ }
+}
+
+void
+FileStorManager::sendCommand(const std::shared_ptr<api::StorageCommand>& cmd)
+{
+ sendUp(cmd);
+}
+
+void
+FileStorManager::sendReply(const std::shared_ptr<api::StorageReply>& reply)
+{
+ LOG(spam, "Sending reply %s", reply->toString().c_str());
+
+ if (reply->getType() == api::MessageType::INTERNAL_REPLY) {
+ std::shared_ptr<api::InternalReply> rep(
+ std::dynamic_pointer_cast<api::InternalReply>(reply));
+ assert(rep.get());
+ if (onInternalReply(rep)) return;
+ }
+
+ // Currently we need to dispatch due to replies sent by remapQueue
+ // function in handlerimpl, as filestorthread keeps bucket db lock
+ // while running this function
+ dispatchUp(reply);
+}
+
+void
+FileStorManager::sendUp(const std::shared_ptr<api::StorageMessage>& msg)
+{
+ StorageLinkQueued::sendUp(msg);
+}
+
+void FileStorManager::onClose()
+{
+ LOG(debug, "Start closing");
+ // Avoid getting config during shutdown
+ _configFetcher.close();
+ LOG(debug, "Closed _configFetcher.");
+ _filestorHandler->close();
+ LOG(debug, "Closed _filestorHandler.");
+ _closed = true;
+ StorageLinkQueued::onClose();
+ LOG(debug, "Done closing");
+}
+
+void FileStorManager::onFlush(bool downwards)
+{
+ // Don't delete merges first time around, since threads might be
+ // processing them
+ LOG(debug, "Start Flushing");
+ _filestorHandler->flush(!downwards);
+ LOG(debug, "Flushed _filestorHandler->flush(!downwards);");
+ for (uint32_t i = 0; i < _disks.size(); ++i) {
+ for (uint32_t j = 0; j < _disks[i].size(); ++j) {
+ if (_disks[i][j].get() != NULL) {
+ _disks[i][j]->flush();
+ LOG(debug, "flushed disk[%d][%d]", i, j);
+ }
+ }
+ }
+ uint32_t queueSize = _filestorHandler->getQueueSize();
+ std::ostringstream ost;
+ if (queueSize > 0) {
+ ost << "Queue size " << queueSize;
+ }
+ std::string result = ost.str();
+ if (result.size() > 0) {
+ LOG(error, "Operations in persistence layer after flush. This is ok "
+ "during load, but should not happen when flush is called "
+ "during shutdown as load then is supposed to have been "
+ "stopped: %s",
+ result.c_str());
+ }
+ StorageLinkQueued::onFlush(downwards);
+ LOG(debug, "Done Flushing");
+}
+
+void
+FileStorManager::reportHtmlStatus(std::ostream& out,
+ const framework::HttpUrlPath& path) const
+{
+ bool showStatus = !path.hasAttribute("thread");
+ bool verbose = path.hasAttribute("verbose");
+
+ // Print menu
+ out << "<font size=\"-1\">[ <a href=\"/\">Back to top</a>"
+ << " | <a href=\"?" << (verbose ? "verbose" : "")
+ << "\">Main filestor manager status page</a>"
+ << " | <a href=\"?" << (verbose ? "notverbose" : "verbose");
+ if (!showStatus) {
+ out << "&thread=" << path.get("thread", std::string(""));
+ }
+ out << "\">" << (verbose ? "Less verbose" : "More verbose") << "</a>\n"
+ << " ]</font><br><br>\n";
+
+ if (_disks.size()) {
+ out << "<p>Using " << _disks[0].size() << " threads per disk</p>\n";
+ }
+
+ _filestorHandler->getStatus(out, path);
+}
+
+bool
+FileStorManager::isMerging(const document::BucketId& bucket) const
+{
+ return _filestorHandler->isMerging(bucket);
+}
+
+namespace {
+ struct Deactivator {
+ StorBucketDatabase::Decision operator()(
+ document::BucketId::Type, StorBucketDatabase::Entry& data)
+ {
+ data.info.setActive(false);
+ return StorBucketDatabase::UPDATE;
+ }
+ };
+}
+
+void
+FileStorManager::updateState()
+{
+ lib::ClusterState::CSP state(_component.getStateUpdater().getSystemState());
+ spi::ClusterState spiState(
+ *state, _component.getIndex(), *_component.getDistribution());
+ lib::Node node(_component.getNodeType(), _component.getIndex());
+ bool nodeUp = state->getNodeState(node).getState().oneOf("uir");
+
+ LOG(debug, "FileStorManager received cluster state '%s'",
+ state->toString().c_str());
+ // If edge where we go down
+ if (_nodeUpInLastNodeStateSeenByProvider && !nodeUp) {
+ LOG(debug,
+ "Received cluster state where this node is down; "
+ "de-activating all buckets in database");
+ Deactivator deactivator;
+ _component.getBucketDatabase().all(
+ deactivator, "FileStorManager::updateState");
+ }
+ _provider->setClusterState(spiState);
+ _nodeUpInLastNodeStateSeenByProvider = nodeUp;
+}
+
+void
+FileStorManager::storageDistributionChanged()
+{
+ updateState();
+}
+
+void
+FileStorManager::handleNewState()
+{
+ //TODO: Don't update if it isn't necessary (distributor-only change)
+ updateState();
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/filestorage/filestormanager.h b/storage/src/vespa/storage/persistence/filestorage/filestormanager.h
new file mode 100644
index 00000000000..c14a36ec428
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/filestormanager.h
@@ -0,0 +1,202 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::FileStorManager
+ * @ingroup filestorage
+ *
+ * @version $Id$
+ */
+
+#pragma once
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/vespalib/util/document_runnable.h>
+#include <vespa/vespalib/util/sync.h>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/persistence/spi/persistenceprovider.h>
+#include <vespa/persistence/spi/metricpersistenceprovider.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/common/messagesender.h>
+#include <vespa/storage/common/servicelayercomponent.h>
+#include <vespa/storage/common/statusmessages.h>
+#include <vespa/storage/common/storagelinkqueued.h>
+#include <vespa/config-stor-filestor.h>
+#include <vespa/storage/persistence/diskthread.h>
+#include <vespa/storage/persistence/filestorage/filestorhandler.h>
+#include <vespa/storage/persistence/filestorage/filestormetrics.h>
+#include <vespa/storage/persistence/providershutdownwrapper.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storage/common/nodestateupdater.h>
+
+namespace storage {
+namespace api {
+ class ReturnCode;
+ class StorageReply;
+}
+
+class BucketMergeTest;
+class DiskInfo;
+class FileStorManagerTest;
+class ReadBucketList;
+class ModifiedBucketCheckerThread;
+class BucketOwnershipNotifier;
+class AbortBucketOperationsCommand;
+
+class FileStorManager : public StorageLinkQueued,
+ public framework::HtmlStatusReporter,
+ public StateListener,
+ private config::IFetcherCallback<vespa::config::content::StorFilestorConfig>,
+ private MessageSender
+{
+ ServiceLayerComponentRegister& _compReg;
+ ServiceLayerComponent _component;
+ const spi::PartitionStateList& _partitions;
+ spi::PersistenceProvider& _providerCore;
+ ProviderShutdownWrapper _providerShutdown;
+ bool _nodeUpInLastNodeStateSeenByProvider;
+ spi::MetricPersistenceProvider::UP _providerMetric;
+ spi::PersistenceProvider* _provider;
+
+ const document::BucketIdFactory& _bucketIdFactory;
+ config::ConfigUri _configUri;
+
+ typedef std::vector<DiskThread::SP> DiskThreads;
+ std::vector<DiskThreads> _disks;
+ std::unique_ptr<BucketOwnershipNotifier> _bucketOwnershipNotifier;
+
+ std::unique_ptr<vespa::config::content::StorFilestorConfig> _config;
+ config::ConfigFetcher _configFetcher;
+ uint32_t _threadLockCheckInterval; // In seconds
+ bool _failDiskOnError;
+ int _killSignal;
+ std::shared_ptr<FileStorMetrics> _metrics;
+ std::unique_ptr<FileStorHandler> _filestorHandler;
+ lib::ClusterState _lastState;
+
+ struct ReplyHolder {
+ int refCount;
+ std::unique_ptr<api::StorageReply> reply;
+
+ ReplyHolder(int rc, std::unique_ptr<api::StorageReply> r)
+ : refCount(rc), reply(std::move(r)) {};
+ };
+
+ std::map<api::StorageMessage::Id,
+ std::shared_ptr<ReplyHolder> > _splitMessages;
+ vespalib::Lock _splitLock;
+ mutable vespalib::Monitor _threadMonitor; // Notify to stop sleeping
+ bool _closed;
+
+ FileStorManager(const FileStorManager &);
+ FileStorManager& operator=(const FileStorManager &);
+
+ std::vector<DiskThreads> getThreads() { return _disks; }
+
+ friend class BucketMergeTest;
+ friend class FileStorManagerTest;
+ friend class MessageTest;
+
+public:
+ explicit FileStorManager(const config::ConfigUri &,
+ const spi::PartitionStateList&,
+ spi::PersistenceProvider&,
+ ServiceLayerComponentRegister&);
+ ~FileStorManager();
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ // Return true if we are currently merging the given bucket.
+ bool isMerging(const document::BucketId& bucket) const;
+
+ FileStorHandler& getFileStorHandler() {
+ return *_filestorHandler;
+ };
+
+ spi::PersistenceProvider& getPersistenceProvider() {
+ return *_provider;
+ }
+
+ void handleNewState();
+
+private:
+ void configure(std::unique_ptr<vespa::config::content::StorFilestorConfig> config);
+
+ void replyWithBucketNotFound(api::StorageMessage&,
+ const document::BucketId&);
+
+ void replyDroppedOperation(api::StorageMessage& msg,
+ const document::BucketId& bucket,
+ api::ReturnCode::Result returnCode,
+ vespalib::stringref reason);
+
+ StorBucketDatabase::WrappedEntry ensureConsistentBucket(
+ const document::BucketId& bucket,
+ api::StorageMessage& msg,
+ const char* callerId);
+
+ bool validateApplyDiffCommandBucket(api::StorageMessage& msg,
+ const StorBucketDatabase::WrappedEntry&);
+ bool validateDiffReplyBucket(const StorBucketDatabase::WrappedEntry&,
+ const document::BucketId&);
+
+ StorBucketDatabase::WrappedEntry mapOperationToDisk(
+ api::StorageMessage&, const document::BucketId&);
+ StorBucketDatabase::WrappedEntry mapOperationToBucketAndDisk(
+ api::BucketCommand&, const document::DocumentId*);
+ bool handlePersistenceMessage(const std::shared_ptr<api::StorageMessage>&,
+ uint16_t disk);
+
+ // Document operations
+ bool onPut(const std::shared_ptr<api::PutCommand>&);
+ bool onUpdate(const std::shared_ptr<api::UpdateCommand>&);
+ bool onGet(const std::shared_ptr<api::GetCommand>&);
+ bool onRemove(const std::shared_ptr<api::RemoveCommand>&);
+ bool onRevert(const std::shared_ptr<api::RevertCommand>&);
+ bool onMultiOperation(const std::shared_ptr<api::MultiOperationCommand>&);
+ bool onBatchPutRemove(const std::shared_ptr<api::BatchPutRemoveCommand>&);
+ bool onStatBucket(const std::shared_ptr<api::StatBucketCommand>&);
+
+ // Bucket operations
+ bool onRemoveLocation(const std::shared_ptr<api::RemoveLocationCommand>&);
+ bool onCreateBucket(const std::shared_ptr<api::CreateBucketCommand>&);
+ bool onDeleteBucket(const std::shared_ptr<api::DeleteBucketCommand>&);
+ bool onMergeBucket(const std::shared_ptr<api::MergeBucketCommand>&);
+ bool onGetBucketDiff(const std::shared_ptr<api::GetBucketDiffCommand>&);
+ bool onGetBucketDiffReply(
+ const std::shared_ptr<api::GetBucketDiffReply>&);
+ bool onApplyBucketDiff(
+ const std::shared_ptr<api::ApplyBucketDiffCommand>&);
+ bool onApplyBucketDiffReply(
+ const std::shared_ptr<api::ApplyBucketDiffReply>&);
+ bool onJoinBuckets(const std::shared_ptr<api::JoinBucketsCommand>&);
+ bool onSplitBucket(const std::shared_ptr<api::SplitBucketCommand>&);
+ bool onSetBucketState(const std::shared_ptr<api::SetBucketStateCommand>&);
+ bool onNotifyBucketChangeReply(
+ const std::shared_ptr<api::NotifyBucketChangeReply>&)
+ { return true; }
+
+ // Other
+ bool onInternal(const std::shared_ptr<api::InternalCommand>&);
+ bool onInternalReply(const std::shared_ptr<api::InternalReply>&);
+
+ void handleAbortBucketOperations(
+ const std::shared_ptr<AbortBucketOperationsCommand>&);
+
+ void sendCommand(const std::shared_ptr<api::StorageCommand>&);
+ void sendReply(const std::shared_ptr<api::StorageReply>&);
+
+ void sendUp(const std::shared_ptr<api::StorageMessage>&);
+
+ void onClose();
+ void onFlush(bool downwards);
+
+ virtual void reportHtmlStatus(std::ostream&,
+ const framework::HttpUrlPath&) const;
+
+ virtual void storageDistributionChanged();
+
+ void updateState();
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/filestorage/filestormetrics.h b/storage/src/vespa/storage/persistence/filestorage/filestormetrics.h
new file mode 100644
index 00000000000..f9c8f25a5dc
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/filestormetrics.h
@@ -0,0 +1,363 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::FileStorMetrics
+ * @ingroup filestorage
+ *
+ * @brief Metrics for the file store threads.
+ *
+ * @version $Id$
+ */
+
+#pragma once
+
+#include <vespa/metrics/metrics.h>
+#include <vespa/documentapi/loadtypes/loadtypeset.h>
+
+namespace storage {
+
+struct FileStorThreadMetrics : public metrics::MetricSet
+{
+ typedef std::shared_ptr<FileStorThreadMetrics> SP;
+
+ struct Op : public metrics::MetricSet {
+ std::string _name;
+ metrics::LongCountMetric count;
+ metrics::LongAverageMetric latency;
+ metrics::LongCountMetric failed;
+
+ Op(const std::string& id, const std::string name,
+ metrics::MetricSet* owner = 0)
+ : MetricSet(id,
+ id,
+ name + " load in filestor thread",
+ owner,
+ "operationtype"),
+ _name(name),
+ count("count",
+ "yamasdefault",
+ "Number of requests processed.",
+ this),
+ latency("latency",
+ "yamasdefault",
+ "Latency of successful requests.",
+ this),
+ failed("failed",
+ "yamasdefault",
+ "Number of failed requests.",
+ this)
+ {
+ }
+
+ virtual Metric* clone(std::vector<Metric::LP>& ownerList,
+ CopyType copyType,
+ metrics::MetricSet* owner,
+ bool includeUnused) const
+ {
+ if (copyType == INACTIVE) {
+ return MetricSet::clone(
+ ownerList, INACTIVE, owner, includeUnused);
+ }
+ return (Op*) (new Op(getName(), _name, owner))->assignValues(*this);
+ }
+ Op* operator&() { return this; }
+ };
+ struct OpWithNotFound : public Op {
+ metrics::LongCountMetric notFound;
+
+ OpWithNotFound(const std::string& id, const std::string name,
+ metrics::MetricSet* owner = 0)
+ : Op(id, name, owner),
+ notFound("not_found", "", "Number of requests that could not be "
+ "completed due to source document not found.", this)
+ {
+ }
+
+ virtual Metric* clone(std::vector<Metric::LP>& ownerList,
+ CopyType copyType,
+ metrics::MetricSet* owner,
+ bool includeUnused) const
+ {
+ if (copyType == INACTIVE) {
+ return MetricSet::clone(
+ ownerList, INACTIVE, owner, includeUnused);
+ }
+ return (OpWithNotFound*)
+ (new OpWithNotFound(getName(), _name, owner))
+ ->assignValues(*this);
+ }
+ OpWithNotFound* operator&() { return this; }
+ };
+
+ struct Update : public OpWithNotFound {
+ metrics::LongAverageMetric latencyRead;
+
+ Update(metrics::MetricSet* owner = 0)
+ : OpWithNotFound("update", "Update", owner),
+ latencyRead("latency_read", "", "Latency of the source read in "
+ "the request.", this)
+ {
+ }
+
+ virtual Metric* clone(std::vector<Metric::LP>& ownerList,
+ CopyType copyType,
+ metrics::MetricSet* owner,
+ bool includeUnused) const
+ {
+ if (copyType == INACTIVE) {
+ return MetricSet::clone(
+ ownerList, INACTIVE, owner, includeUnused);
+ }
+ return (Update*) (new Update(owner))->assignValues(*this);
+ }
+ Update* operator&() { return this; }
+ };
+
+ struct Visitor : public Op {
+ metrics::LongAverageMetric documentsPerIterate;
+
+ Visitor(metrics::MetricSet* owner = 0)
+ : Op("visit", "Visit", owner),
+ documentsPerIterate("docs", "", "Number of entries read per iterate call",
+ this)
+ {
+ }
+
+ virtual Metric* clone(std::vector<Metric::LP>& ownerList,
+ CopyType copyType,
+ metrics::MetricSet* owner,
+ bool includeUnused) const
+ {
+ if (copyType == INACTIVE) {
+ return MetricSet::clone(
+ ownerList, INACTIVE, owner, includeUnused);
+ }
+ return (Visitor*) (new Visitor(owner))->assignValues(*this);
+ }
+ Visitor* operator&() { return this; }
+ };
+
+ metrics::LongCountMetric operations;
+ metrics::LongCountMetric failedOperations;
+ metrics::LoadMetric<Op> put;
+ metrics::LoadMetric<OpWithNotFound> get;
+ metrics::LoadMetric<OpWithNotFound> remove;
+ metrics::LoadMetric<Op> removeLocation;
+ metrics::LoadMetric<Op> statBucket;
+ metrics::LoadMetric<Update> update;
+ metrics::LoadMetric<OpWithNotFound> revert;
+ Op createIterator;
+ metrics::LoadMetric<Visitor> visit;
+ metrics::LoadMetric<Op> multiOp;
+ Op createBuckets;
+ Op deleteBuckets;
+ Op repairs;
+ metrics::LongCountMetric repairFixed;
+ Op recheckBucketInfo;
+ Op splitBuckets;
+ Op joinBuckets;
+ Op setBucketStates;
+ Op movedBuckets;
+ Op readBucketList;
+ Op readBucketInfo;
+ Op internalJoin;
+ Op mergeBuckets;
+ Op getBucketDiff;
+ Op applyBucketDiff;
+
+ metrics::LongCountMetric bytesMerged;
+ metrics::LongCountMetric getBucketDiffReply;
+ metrics::LongCountMetric applyBucketDiffReply;
+ metrics::LongAverageMetric mergeLatencyTotal;
+ metrics::LongAverageMetric mergeMetadataReadLatency;
+ metrics::LongAverageMetric mergeDataReadLatency;
+ metrics::LongAverageMetric mergeDataWriteLatency;
+ metrics::DoubleAverageMetric mergeAverageDataReceivedNeeded;
+ metrics::LongAverageMetric batchingSize;
+
+ FileStorThreadMetrics(const std::string& name, const std::string& desc,
+ const metrics::LoadTypeSet& lt)
+ : metrics::MetricSet(name, "filestor partofsum thread", desc, NULL, "thread"),
+ operations("operations", "",
+ "Number of operations processed.", this),
+ failedOperations("failedoperations", "",
+ "Number of operations throwing exceptions.", this),
+
+ put(lt, *&Op("put", "Put"), this),
+ get(lt, *&OpWithNotFound("get", "Get"), this),
+ remove(lt, *&OpWithNotFound("remove", "Remove"), this),
+ removeLocation(lt, *&Op("remove_location", "Remove location"), this),
+ statBucket(lt, *&Op("stat_bucket", "Stat bucket"), this),
+ update(lt, *&Update(), this),
+ revert(lt, *&OpWithNotFound("revert", "Revert"), this),
+ createIterator("createiterator", "", this),
+ visit(lt, *&Visitor(), this),
+ multiOp(lt, *&Op("multioperations",
+ "The number of multioperations that have been created"), this),
+ createBuckets("createbuckets",
+ "Number of buckets that has been created.", this),
+ deleteBuckets("deletebuckets",
+ "Number of buckets that has been deleted.", this),
+ repairs("bucketverified", "Number of times buckets have been checked.", this),
+ repairFixed("bucketfixed", "",
+ "Number of times bucket has been fixed because of "
+ "corruption", this),
+ recheckBucketInfo("recheckbucketinfo",
+ "Number of times bucket info has been explicitly "
+ "rechecked due to buckets being marked modified by "
+ "the persistence provider",
+ this),
+ splitBuckets("splitbuckets",
+ "Number of times buckets have been split.", this),
+ joinBuckets("joinbuckets",
+ "Number of times buckets have been joined.", this),
+ setBucketStates("setbucketstates",
+ "Number of times buckets have been activated or deactivated.", this),
+ movedBuckets("movedbuckets",
+ "Number of buckets moved between disks", this),
+ readBucketList("readbucketlist",
+ "Number of read bucket list requests", this),
+ readBucketInfo("readbucketinfo",
+ "Number of read bucket info requests", this),
+ internalJoin("internaljoin",
+ "Number of joins to join buckets on multiple disks during "
+ "storage initialization.", this),
+ mergeBuckets("mergebuckets",
+ "Number of times buckets have been merged.", this),
+ getBucketDiff("getbucketdiff",
+ "Number of getbucketdiff commands that have been processed.", this),
+ applyBucketDiff("applybucketdiff",
+ "Number of applybucketdiff commands that have been processed.", this),
+ bytesMerged("bytesmerged", "",
+ "Total number of bytes merged into this node.", this),
+ getBucketDiffReply("getbucketdiffreply", "",
+ "Number of getbucketdiff replies that have been processed.", this),
+ applyBucketDiffReply("applybucketdiffreply", "",
+ "Number of applybucketdiff replies that have been processed.", this),
+ mergeLatencyTotal("mergelatencytotal", "",
+ "Latency of total merge operation, from master node receives "
+ "it, until merge is complete and master node replies.", this),
+ mergeMetadataReadLatency("mergemetadatareadlatency", "",
+ "Latency of time used in a merge step to check metadata of "
+ "current node to see what data it has.", this),
+ mergeDataReadLatency("mergedatareadlatency", "",
+ "Latency of time used in a merge step to read data other "
+ "nodes need.", this),
+ mergeDataWriteLatency("mergedatawritelatency", "",
+ "Latency of time used in a merge step to write data needed to "
+ "current node.", this),
+ mergeAverageDataReceivedNeeded("mergeavgdatareceivedneeded",
+ "",
+ "Amount of data transferred from previous node "
+ "in chain that "
+ "we needed to apply locally.", this),
+ batchingSize("batchingsize",
+ "",
+ "Number of operations batched per bucket (only counts "
+ "batches of size > 1)", this)
+ {
+ }
+
+};
+
+class FileStorDiskMetrics : public metrics::MetricSet
+{
+public:
+ typedef std::shared_ptr<FileStorDiskMetrics> SP;
+
+ std::vector<FileStorThreadMetrics::SP> threads;
+ metrics::SumMetric<MetricSet> sum;
+ metrics::LongAverageMetric queueSize;
+ metrics::LoadMetric<metrics::LongAverageMetric> averageQueueWaitingTime;
+ metrics::LongAverageMetric pendingMerges;
+ metrics::DoubleAverageMetric waitingForLockHitRate;
+ metrics::LongAverageMetric lockWaitTime;
+
+ FileStorDiskMetrics(const std::string& name,
+ const std::string& description,
+ const metrics::LoadTypeSet& loadTypes,
+ metrics::MetricSet* owner)
+ : MetricSet(name, "partofsum disk", description, owner, "disk"),
+ sum("allthreads", "sum", "", this),
+ queueSize("queuesize", "", "Size of input message queue.", this),
+ averageQueueWaitingTime(loadTypes, metrics::LongAverageMetric(
+ "averagequeuewait", "",
+ "Average time an operation spends in input queue."), this),
+ pendingMerges("pendingmerge", "",
+ "Number of buckets currently being merged.", this),
+ waitingForLockHitRate("waitingforlockrate", "",
+ "Amount of times a filestor thread has needed to wait for "
+ "lock to take next message in queue.", this),
+ lockWaitTime("lockwaittime", "",
+ "Amount of time waiting used waiting for lock.", this)
+ {
+ pendingMerges.unsetOnZeroValue();
+ waitingForLockHitRate.unsetOnZeroValue();
+ }
+
+ void initDiskMetrics(const metrics::LoadTypeSet& loadTypes,
+ uint32_t threadsPerDisk)
+ {
+ threads.clear();
+ threads.resize(threadsPerDisk);
+ for (uint32_t i=0; i<threadsPerDisk; ++i) {
+ std::ostringstream desc;
+ std::ostringstream name;
+ name << "thread" << i;
+ desc << "Thread " << i << '/' << threadsPerDisk;
+ threads[i]
+ = std::shared_ptr<FileStorThreadMetrics>(
+ new FileStorThreadMetrics(name.str(), desc.str(),
+ loadTypes));
+ registerMetric(*threads[i]);
+ sum.addMetricToSum(*threads[i]);
+ }
+ }
+};
+
+struct FileStorMetrics : public metrics::MetricSet
+{
+ std::vector<FileStorDiskMetrics::SP> disks;
+ metrics::SumMetric<MetricSet> sum;
+ metrics::LongCountMetric directoryEvents;
+ metrics::LongCountMetric partitionEvents;
+ metrics::LongCountMetric diskEvents;
+
+ FileStorMetrics(const metrics::LoadTypeSet&)
+ : metrics::MetricSet("filestor", "filestor", ""),
+ sum("alldisks", "sum", "", this),
+ directoryEvents("directoryevents", "",
+ "Number of directory events received.", this),
+ partitionEvents("partitionevents", "",
+ "Number of partition events received.", this),
+ diskEvents("diskevents", "",
+ "Number of disk events received.", this)
+ {
+ }
+
+ void initDiskMetrics(uint16_t numDisks,
+ const metrics::LoadTypeSet& loadTypes,
+ uint32_t threadsPerDisk)
+ {
+ if (!disks.empty()) {
+ throw vespalib::IllegalStateException(
+ "Can't initialize disks twice", VESPA_STRLOC);
+ }
+ disks.clear();
+ disks.resize(numDisks);
+ for (uint32_t i=0; i<numDisks; ++i) {
+ // Currently FileStorHandlerImpl expects metrics to exist for
+ // disks that are not in use too.
+ std::ostringstream desc;
+ std::ostringstream name;
+ name << "disk_" << i;
+ desc << "Disk " << i;
+ disks[i] = FileStorDiskMetrics::SP(new FileStorDiskMetrics(
+ name.str(), desc.str(), loadTypes, this));
+ sum.addMetricToSum(*disks[i]);
+ disks[i]->initDiskMetrics(loadTypes, threadsPerDisk);
+ }
+ }
+};
+
+}
+
diff --git a/storage/src/vespa/storage/persistence/filestorage/mergestatus.cpp b/storage/src/vespa/storage/persistence/filestorage/mergestatus.cpp
new file mode 100644
index 00000000000..f0465620253
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/mergestatus.cpp
@@ -0,0 +1,109 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/persistence/filestorage/mergestatus.h>
+#include <vespa/log/log.h>
+
+LOG_SETUP(".mergestatus");
+
+namespace storage {
+
+MergeStatus::MergeStatus(framework::Clock& clock, const metrics::LoadType& lt,
+ api::StorageMessage::Priority priority,
+ uint32_t traceLevel)
+ : reply(), nodeList(), maxTimestamp(0), diff(), pendingId(0),
+ pendingGetDiff(), pendingApplyDiff(), timeout(0), startTime(clock),
+ context(lt, priority, traceLevel)
+{
+}
+
+bool
+MergeStatus::removeFromDiff(
+ const std::vector<api::ApplyBucketDiffCommand::Entry>& part,
+ uint16_t hasMask)
+{
+ std::deque<api::GetBucketDiffCommand::Entry>::iterator it(diff.begin());
+ std::vector<api::ApplyBucketDiffCommand::Entry>::const_iterator it2(
+ part.begin());
+ bool altered = false;
+ // We expect part array to be sorted in the same order as in the diff,
+ // and that all entries in the part should exist in the source list.
+ while (it != diff.end() && it2 != part.end()) {
+ if (it->_timestamp != it2->_entry._timestamp) {
+ ++it;
+ } else {
+ break;
+ }
+ }
+
+ // Iterate and match entries in diff.
+ while (it != diff.end() && it2 != part.end()) {
+ if (it->_timestamp != it2->_entry._timestamp) {
+ ++it;
+ } else {
+ // It is legal for an apply bucket diff to not fill all entries, so
+ // only remove it if it was actually transferred to all copies this
+ // time around, or if no copies have that doc anymore. (Can happen
+ // due to reverting or corruption)
+ if (it2->_entry._hasMask == hasMask
+ || it2->_entry._hasMask == 0)
+ {
+ if (it2->_entry._hasMask == 0) {
+ LOG(debug, "Merge entry %s no longer exists on any nodes",
+ it2->toString().c_str());
+ }
+ // Timestamp equal. Should really be the same entry. If not
+ // though, there is nothing we can do but accept it.
+ if (!(*it == it2->_entry)) {
+ LOG(warning, "Merge retrieved entry %s for entry %s but "
+ "these do not match.",
+ it2->toString().c_str(), it->toString().c_str());
+ }
+ it = diff.erase(it);
+ altered = true;
+ } else if (it2->_entry._hasMask != it->_hasMask) {
+ // Hasmasks have changed, meaning bucket contents changed on
+ // one or more of the nodes during merging.
+ altered = true;
+ it->_hasMask = it2->_entry._hasMask;
+ }
+ ++it2;
+ }
+ }
+ if (it2 != part.end()) {
+ uint32_t counter = 0;
+ while (it2 != part.end()) {
+ ++it2;
+ ++counter;
+ }
+ LOG(warning, "Apply bucket diff contained %u entries not existing in "
+ "the request.", counter);
+ }
+
+ return altered;
+}
+
+void
+MergeStatus::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ if (reply.get()) {
+ (void) verbose;
+ out << "MergeStatus(" << "nodes";
+ for (uint32_t i=0; i<nodeList.size(); ++i) {
+ out << " " << nodeList[i];
+ }
+ out << ", maxtime " << maxTimestamp << ":";
+ for (std::deque<api::GetBucketDiffCommand::Entry>::const_iterator it
+ = diff.begin(); it != diff.end(); ++it)
+ {
+ out << "\n" << indent << it->toString(true);
+ }
+ out << ")";
+ } else if (pendingGetDiff.get() != 0) {
+ out << "MergeStatus(Middle node awaiting GetBucketDiffReply)\n";
+ } else if (pendingApplyDiff.get() != 0) {
+ out << "MergeStatus(Middle node awaiting ApplyBucketDiffReply)\n";
+ }
+}
+
+};
diff --git a/storage/src/vespa/storage/persistence/filestorage/mergestatus.h b/storage/src/vespa/storage/persistence/filestorage/mergestatus.h
new file mode 100644
index 00000000000..9967fc24c7d
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/mergestatus.h
@@ -0,0 +1,50 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/persistence/spi/context.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/storageapi/messageapi/storagereply.h>
+#include <vespa/storageapi/message/bucket.h>
+
+#include <vector>
+#include <deque>
+#include <memory>
+
+namespace storage {
+
+class MergeStatus : public document::Printable {
+public:
+ using SP = std::shared_ptr<MergeStatus>;
+
+ std::shared_ptr<api::StorageReply> reply;
+ std::vector<api::MergeBucketCommand::Node> nodeList;
+ framework::MicroSecTime maxTimestamp;
+ std::deque<api::GetBucketDiffCommand::Entry> diff;
+ api::StorageMessage::Id pendingId;
+ std::shared_ptr<api::GetBucketDiffReply> pendingGetDiff;
+ std::shared_ptr<api::ApplyBucketDiffReply> pendingApplyDiff;
+ uint32_t timeout;
+ framework::MilliSecTimer startTime;
+ spi::Context context;
+
+ MergeStatus(framework::Clock&, const metrics::LoadType&,
+ api::StorageMessage::Priority, uint32_t traceLevel);
+
+ /**
+ * @return true if any entries were removed from the internal diff
+ * or the two diffs had entries with mismatching hasmasks, which
+ * indicates that bucket contents have changed during the merge.
+ */
+ bool removeFromDiff(
+ const std::vector<api::ApplyBucketDiffCommand::Entry>& part,
+ uint16_t hasMask);
+ void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ bool isFirstNode() const { return (reply.get() != 0); }
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/filestorage/modifiedbucketchecker.cpp b/storage/src/vespa/storage/persistence/filestorage/modifiedbucketchecker.cpp
new file mode 100644
index 00000000000..76744dc2a5b
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/modifiedbucketchecker.cpp
@@ -0,0 +1,206 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/persistence/filestorage/modifiedbucketchecker.h>
+
+#include <vespa/log/log.h>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+
+LOG_SETUP(".persistence.filestor.modifiedbucketchecker");
+
+namespace storage {
+
+ModifiedBucketChecker::ModifiedBucketChecker(
+ ServiceLayerComponentRegister& compReg,
+ spi::PersistenceProvider& provider,
+ const config::ConfigUri& configUri)
+ : StorageLink("Modified bucket checker"),
+ _provider(provider),
+ _component(),
+ _thread(),
+ _configFetcher(configUri.getContext()),
+ _monitor(),
+ _stateLock(),
+ _pendingRequests(0),
+ _maxPendingChunkSize(100),
+ _singleThreadMode(false)
+{
+ _configFetcher.subscribe<vespa::config::content::core::StorServerConfig>(configUri.getConfigId(), this);
+ _configFetcher.start();
+
+ std::ostringstream threadName;
+ threadName << "Modified bucket checker " << static_cast<void*>(this);
+ _component.reset(new ServiceLayerComponent(compReg, threadName.str()));
+}
+
+ModifiedBucketChecker::~ModifiedBucketChecker()
+{
+ assert(!_thread.get());
+}
+
+void
+ModifiedBucketChecker::configure(
+ std::unique_ptr<vespa::config::content::core::StorServerConfig> newConfig)
+{
+ vespalib::LockGuard lock(_stateLock);
+ if (newConfig->bucketRecheckingChunkSize < 1) {
+ throw config::InvalidConfigException(
+ "Cannot have bucket rechecking chunk size of less than 1");
+ }
+ _maxPendingChunkSize = newConfig->bucketRecheckingChunkSize;
+}
+
+
+void
+ModifiedBucketChecker::onOpen()
+{
+ framework::MilliSecTime maxProcessingTime(60 * 1000);
+ framework::MilliSecTime waitTime(1000);
+ if (!_singleThreadMode) {
+ _thread = _component->startThread(*this, maxProcessingTime, waitTime);
+ }
+}
+
+void
+ModifiedBucketChecker::onClose()
+{
+ if (_singleThreadMode) {
+ return;
+ }
+ assert(_thread.get() != 0);
+ LOG(debug, "Interrupting modified bucket checker thread");
+ _thread->interrupt();
+ {
+ vespalib::MonitorGuard guard(_monitor);
+ guard.signal();
+ }
+ LOG(debug, "Joining modified bucket checker thread");
+ _thread->join();
+ LOG(debug, "Modified bucket checker thread joined");
+ _thread.reset(0);
+}
+
+void
+ModifiedBucketChecker::run(framework::ThreadHandle& thread)
+{
+ LOG(debug,
+ "Started modified bucket checker thread with pid %d",
+ getpid());
+
+ while (!thread.interrupted()) {
+ thread.registerTick();
+
+ bool ok = tick();
+
+ vespalib::MonitorGuard guard(_monitor);
+ if (ok) {
+ guard.wait(50);
+ } else {
+ guard.wait(100);
+ }
+ }
+}
+
+bool
+ModifiedBucketChecker::onInternalReply(
+ const std::shared_ptr<api::InternalReply>& r)
+{
+ if (r->getType() == RecheckBucketInfoReply::ID) {
+ vespalib::LockGuard guard(_stateLock);
+ assert(_pendingRequests > 0);
+ --_pendingRequests;
+ if (_pendingRequests == 0 && moreChunksRemaining()) {
+ vespalib::MonitorGuard mg(_monitor);
+ // Safe: monitor never taken alongside lock anywhere else.
+ mg.signal(); // Immediately signal start of new chunk
+ }
+ return true;
+ }
+ return false;
+}
+
+bool
+ModifiedBucketChecker::requestModifiedBucketsFromProvider()
+{
+ spi::BucketIdListResult result(_provider.getModifiedBuckets());
+ if (result.hasError()) {
+ LOG(debug, "getModifiedBuckets() failed: %s",
+ result.toString().c_str());
+ return false;
+ }
+ {
+ vespalib::LockGuard guard(_stateLock);
+ assert(_rechecksNotStarted.empty());
+ _rechecksNotStarted.swap(result.getList());
+ // We pick chunks from the end of the list, so reverse it to get
+ // the same send order as order received.
+ std::reverse(_rechecksNotStarted.begin(), _rechecksNotStarted.end());
+ }
+ return true;
+}
+
+void
+ModifiedBucketChecker::nextRecheckChunk(
+ std::vector<RecheckBucketInfoCommand::SP>& commandsToSend)
+{
+ assert(_pendingRequests == 0);
+ assert(commandsToSend.empty());
+ size_t n = std::min(_maxPendingChunkSize, _rechecksNotStarted.size());
+
+ for (size_t i = 0; i < n; ++i) {
+ document::BucketId bid(_rechecksNotStarted.back());
+ commandsToSend.emplace_back(new RecheckBucketInfoCommand(bid));
+ _rechecksNotStarted.pop_back();
+ }
+ _pendingRequests = n;
+ LOG(spam, "Prepared new recheck chunk with %zu commands", n);
+}
+
+void
+ModifiedBucketChecker::dispatchAllToPersistenceQueues(
+ const std::vector<RecheckBucketInfoCommand::SP>& commandsToSend)
+{
+ for (auto& cmd : commandsToSend) {
+ // We assume sendDown doesn't throw, but that it may send a reply
+ // up synchronously, so we cannot hold lock around it. We also make
+ // the assumption that recheck commands are only discared if their
+ // bucket no longer exists, so it's safe to not retry them.
+ sendDown(cmd);
+ }
+}
+
+bool
+ModifiedBucketChecker::tick()
+{
+ // Do two phases of locking, as we want tick() to both fetch modified
+ // buckets and send the first chunk for these in a single call. However,
+ // we want getModifiedBuckets() to called outside the lock.
+ bool shouldRequestFromProvider = false;
+ {
+ vespalib::LockGuard guard(_stateLock);
+ if (!currentChunkFinished()) {
+ return true;
+ }
+ shouldRequestFromProvider = !moreChunksRemaining();
+ }
+ if (shouldRequestFromProvider) {
+ if (!requestModifiedBucketsFromProvider()) {
+ return false;
+ }
+ }
+
+ std::vector<RecheckBucketInfoCommand::SP> commandsToSend;
+ {
+ vespalib::LockGuard guard(_stateLock);
+ if (moreChunksRemaining()) {
+ nextRecheckChunk(commandsToSend);
+ }
+ }
+ // Sending must be done outside the lock.
+ if (!commandsToSend.empty()) {
+ dispatchAllToPersistenceQueues(commandsToSend);
+ }
+ return true;
+}
+
+} // ns storage
+
diff --git a/storage/src/vespa/storage/persistence/filestorage/modifiedbucketchecker.h b/storage/src/vespa/storage/persistence/filestorage/modifiedbucketchecker.h
new file mode 100644
index 00000000000..f7448fc25b8
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/modifiedbucketchecker.h
@@ -0,0 +1,71 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <memory>
+#include <vespa/storage/common/storagecomponent.h>
+#include <vespa/storage/common/servicelayercomponent.h>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storage/config/config-stor-server.h>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/storage/persistence/types.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/vespalib/util/sync.h>
+#include <boost/noncopyable.hpp>
+
+namespace storage {
+
+namespace spi { class PersistenceProvider; }
+
+class ModifiedBucketChecker
+ : public StorageLink,
+ public framework::Runnable,
+ public Types,
+ private config::IFetcherCallback<
+ vespa::config::content::core::StorServerConfig>
+{
+public:
+ typedef std::unique_ptr<ModifiedBucketChecker> UP;
+
+ ModifiedBucketChecker(ServiceLayerComponentRegister& compReg,
+ spi::PersistenceProvider& provide,
+ const config::ConfigUri& configUri);
+ ~ModifiedBucketChecker();
+
+ void configure(std::unique_ptr<vespa::config::content::core::StorServerConfig>);
+
+ void run(framework::ThreadHandle& thread);
+ bool tick();
+ void onOpen();
+ void onClose();
+
+ void setUnitTestingSingleThreadedMode() {
+ _singleThreadMode = true;
+ }
+
+private:
+ bool onInternalReply(const std::shared_ptr<api::InternalReply>&);
+ bool currentChunkFinished() const {
+ return _pendingRequests == 0;
+ }
+ bool moreChunksRemaining() const {
+ return !_rechecksNotStarted.empty();
+ }
+ bool requestModifiedBucketsFromProvider();
+ void nextRecheckChunk(std::vector<RecheckBucketInfoCommand::SP>&);
+ void dispatchAllToPersistenceQueues(
+ const std::vector<RecheckBucketInfoCommand::SP>&);
+
+ spi::PersistenceProvider& _provider;
+ ServiceLayerComponent::UP _component;
+ framework::Thread::UP _thread;
+ config::ConfigFetcher _configFetcher;
+ vespalib::Monitor _monitor;
+ vespalib::Lock _stateLock;
+ document::BucketId::List _rechecksNotStarted;
+ size_t _pendingRequests;
+ size_t _maxPendingChunkSize;
+ bool _singleThreadMode; // For unit testing only
+};
+
+} // ns storage
+
diff --git a/storage/src/vespa/storage/persistence/filestorage/pausehandler.h b/storage/src/vespa/storage/persistence/filestorage/pausehandler.h
new file mode 100644
index 00000000000..a149de76a16
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/filestorage/pausehandler.h
@@ -0,0 +1,34 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class PauseHandler
+ * @ingroup persistence
+ *
+ * @brief Object that can be used to possibly pause running operation
+ */
+#pragma once
+
+#include <vespa/storage/persistence/filestorage/filestorhandler.h>
+
+namespace storage {
+
+class PauseHandler {
+ FileStorHandler* _handler;
+ uint16_t _disk;
+ uint8_t _priority;
+
+public:
+ PauseHandler() : _handler(0), _disk(0), _priority(0) {}
+ PauseHandler(FileStorHandler& handler, uint16_t disk)
+ : _handler(&handler),
+ _disk(disk),
+ _priority(0)
+ {
+ }
+
+ void setPriority(uint8_t priority) { _priority = priority; }
+
+ void pause() const { if (_handler != 0) _handler->pause(_disk, _priority); }
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/mergehandler.cpp b/storage/src/vespa/storage/persistence/mergehandler.cpp
new file mode 100644
index 00000000000..bd5b4febf18
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/mergehandler.cpp
@@ -0,0 +1,1598 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/persistence/mergehandler.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+
+#include <vespa/log/log.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/document/fieldset/fieldsets.h>
+
+LOG_SETUP(".persistence.mergehandler");
+
+namespace storage {
+
+MergeHandler::MergeHandler(spi::PersistenceProvider& spi,
+ PersistenceUtil& env)
+ : _spi(spi),
+ _env(env),
+ _maxChunkSize(env._config.bucketMergeChunkSize)
+{
+}
+
+MergeHandler::MergeHandler(spi::PersistenceProvider& spi,
+ PersistenceUtil& env,
+ uint32_t maxChunkSize)
+ : _spi(spi),
+ _env(env),
+ _maxChunkSize(maxChunkSize)
+{
+}
+
+namespace {
+
+int getDeleteFlag() {
+ // Referred into old slotfile code before. Where should this number come from?
+ return 2;
+}
+
+/**
+ * Throws std::runtime_error if result has an error.
+ */
+void
+checkResult(const spi::Result& result,
+ const spi::Bucket& bucket,
+ const document::DocumentId& docId,
+ const char* op)
+{
+ if (result.hasError()) {
+ std::ostringstream ss;
+ ss << "Failed " << op
+ << " for " << docId
+ << " in " << bucket
+ << ": " << result;
+ throw std::runtime_error(ss.str());
+ }
+}
+
+void
+checkResult(const spi::Result& result,
+ const spi::Bucket& bucket,
+ const char* op)
+{
+ if (result.hasError()) {
+ std::ostringstream ss;
+ ss << "Failed " << op << " in " << bucket << ": " << result;
+ throw std::runtime_error(ss.str());
+ }
+}
+
+
+class IteratorGuard
+{
+ spi::PersistenceProvider& _spi;
+ spi::IteratorId _iteratorId;
+ spi::Context& _context;
+public:
+ IteratorGuard(spi::PersistenceProvider& spi,
+ spi::IteratorId iteratorId,
+ spi::Context& context)
+ : _spi(spi),
+ _iteratorId(iteratorId),
+ _context(context)
+ {}
+ ~IteratorGuard()
+ {
+ assert(_iteratorId != 0);
+ _spi.destroyIterator(_iteratorId, _context);
+ }
+};
+
+class FlushGuard
+{
+ spi::PersistenceProvider& _spi;
+ spi::Bucket _bucket;
+ spi::Context& _context;
+ bool _hasFlushed;
+public:
+ FlushGuard(spi::PersistenceProvider& spi,
+ const spi::Bucket& bucket,
+ spi::Context& context)
+ : _spi(spi),
+ _bucket(bucket),
+ _context(context),
+ _hasFlushed(false)
+ {}
+ ~FlushGuard()
+ {
+ if (!_hasFlushed) {
+ LOG(debug, "Auto-flushing %s", _bucket.toString().c_str());
+ spi::Result result =_spi.flush(_bucket, _context);
+ if (result.hasError()) {
+ LOG(debug, "Flush %s failed: %s",
+ _bucket.toString().c_str(),
+ result.toString().c_str());
+ }
+ }
+ }
+ void flush() {
+ LOG(debug, "Flushing %s", _bucket.toString().c_str());
+ _hasFlushed = true;
+ checkResult(_spi.flush(_bucket, _context), _bucket, "flush");
+ }
+};
+
+struct IndirectDocEntryTimestampPredicate
+{
+ bool operator()(const spi::DocEntry::LP& e1,
+ const spi::DocEntry::LP& e2) const
+ {
+ return e1->getTimestamp() < e2->getTimestamp();
+ }
+
+ bool operator()(const spi::DocEntry::LP& e,
+ const spi::Timestamp timestamp) const
+ {
+ return e->getTimestamp() < timestamp;
+ }
+};
+
+struct DiffEntryTimestampPredicate
+{
+ bool operator()(const api::ApplyBucketDiffCommand::Entry& e,
+ const api::Timestamp timestamp) const
+ {
+ return e._entry._timestamp < timestamp;
+ }
+};
+
+} // anonymous namespace
+
+void
+MergeHandler::populateMetaData(
+ const spi::Bucket& bucket,
+ Timestamp maxTimestamp,
+ std::vector<spi::DocEntry::LP>& entries,
+ spi::Context& context)
+{
+ spi::DocumentSelection docSel("");
+
+ spi::Selection sel(docSel);
+ sel.setToTimestamp(spi::Timestamp(maxTimestamp.getTime()));
+ spi::CreateIteratorResult createIterResult(_spi.createIterator(
+ bucket,
+ document::NoFields(),
+ sel,
+ spi::ALL_VERSIONS,
+ context));
+
+ if (createIterResult.getErrorCode() != spi::Result::NONE) {
+ std::ostringstream ss;
+ ss << "Failed to create iterator for "
+ << bucket
+ << ": "
+ << createIterResult.getErrorMessage();
+ throw std::runtime_error(ss.str());
+ }
+ spi::IteratorId iteratorId(createIterResult.getIteratorId());
+ IteratorGuard iteratorGuard(_spi, iteratorId, context);
+
+ while (true) {
+ spi::IterateResult result(
+ _spi.iterate(iteratorId, UINT64_MAX, context));
+ if (result.getErrorCode() != spi::Result::NONE) {
+ std::ostringstream ss;
+ ss << "Failed to iterate for "
+ << bucket
+ << ": "
+ << result.getErrorMessage();
+ throw std::runtime_error(ss.str());
+ }
+ for (size_t i = 0; i < result.getEntries().size(); ++i) {
+ entries.push_back(result.getEntries()[i]);
+ }
+ if (result.isCompleted()) {
+ break;
+ }
+ }
+ std::sort(entries.begin(), entries.end(),
+ IndirectDocEntryTimestampPredicate());
+}
+
+bool
+MergeHandler::buildBucketInfoList(
+ const spi::Bucket& bucket,
+ const documentapi::LoadType& /*loadType*/,
+ Timestamp maxTimestamp,
+ uint8_t myNodeIndex,
+ std::vector<api::GetBucketDiffCommand::Entry>& output,
+ spi::Context& context)
+{
+ assert(output.size() == 0);
+ assert(myNodeIndex < 16);
+ uint32_t oldSize = output.size();
+ typedef api::BucketInfo DbBucketInfo;
+
+ // Always verify that bucket database is correct in merge, such that
+ // any out of sync data get fixed. Such errors must of course also be
+ // fixed, but by making merge fix it, distributors will stop and spin
+ // on merge, never getting their problems fixed.
+ {
+ StorBucketDatabase& db(_env.getBucketDatabase());
+ StorBucketDatabase::WrappedEntry entry(
+ db.get(bucket.getBucketId(), "MergeHandler::buildBucketInfoList"));
+ if (entry.exist()) {
+ spi::BucketInfoResult infoResult(_spi.getBucketInfo(bucket));
+
+ if (infoResult.getErrorCode() != spi::Result::NONE) {
+ std::ostringstream ss;
+ ss << "Failed to get bucket info for "
+ << bucket << ": "
+ << infoResult.getErrorMessage();
+ LOG(warning, "%s", ss.str().c_str());
+ throw std::runtime_error(ss.str());
+ }
+ DbBucketInfo dbInfo(entry->getBucketInfo());
+ const spi::BucketInfo& tmpInfo(infoResult.getBucketInfo());
+ DbBucketInfo providerInfo(tmpInfo.getChecksum(),
+ tmpInfo.getDocumentCount(),
+ tmpInfo.getDocumentSize(),
+ tmpInfo.getEntryCount(),
+ tmpInfo.getUsedSize(),
+ tmpInfo.isReady(),
+ tmpInfo.isActive(),
+ dbInfo.getLastModified());
+
+ if (!dbInfo.equalDocumentInfo(providerInfo)) {
+ if (dbInfo.valid()) {
+ LOG(warning, "Prior to merging %s we found that storage "
+ "bucket database was out of sync with content "
+ "of file. Actual file content is %s while "
+ "bucket database content was %s. Updating"
+ " bucket database to get in sync.",
+ bucket.toString().c_str(),
+ providerInfo.toString().c_str(),
+ dbInfo.toString().c_str());
+ DUMP_LOGGED_BUCKET_OPERATIONS(bucket.getBucketId());
+ }
+
+ entry->setBucketInfo(providerInfo);
+ entry.write();
+ }
+ } else {
+ return false;
+ }
+ }
+
+ std::vector<spi::DocEntry::LP> entries;
+ populateMetaData(bucket, maxTimestamp, entries, context);
+
+ for (size_t i = 0; i < entries.size(); ++i) {
+ api::GetBucketDiffCommand::Entry diff;
+ const spi::DocEntry& entry(*entries[i]);
+ diff._gid = GlobalId();
+ // We do not know doc sizes at this point, so just set to 0
+ diff._headerSize = 0;
+ diff._bodySize = 0;
+ diff._timestamp = entry.getTimestamp();
+ diff._flags = IN_USE
+ | (entry.isRemove() ? DELETED : 0);
+ diff._hasMask = 1 << myNodeIndex;
+ output.push_back(diff);
+
+ LOG(spam, "bucket info list of %s: Adding entry %s to diff",
+ bucket.toString().c_str(), diff.toString(true).c_str());
+ }
+ LOG(spam, "Built bucket info list of %s. Got %u entries.",
+ bucket.toString().c_str(), (uint32_t) (output.size() - oldSize));
+ return true;
+}
+
+namespace {
+
+ /**
+ * Find out whether we need to read data locally yet.
+ */
+ bool applyDiffNeedLocalData(
+ const std::vector<api::ApplyBucketDiffCommand::Entry>& diff,
+ uint8_t nodeIndex,
+ bool forwards)
+ {
+ if (!forwards && nodeIndex == 0) return false;
+ uint32_t result = 1 << nodeIndex;
+ uint32_t mask = 3 << (forwards ? nodeIndex : nodeIndex-1);
+ for (std::vector<api::ApplyBucketDiffCommand::Entry>::const_iterator it
+ = diff.begin(); it != diff.end(); ++it)
+ {
+ if (it->filled()) continue;
+ if ((it->_entry._hasMask & mask) == result) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Returns true if a diff from an ApplyBucketDiff message has all data
+ * needed by this local node.
+ */
+ bool applyDiffHasLocallyNeededData(
+ const std::vector<api::ApplyBucketDiffCommand::Entry>& diff,
+ uint8_t nodeIndex)
+ {
+ uint32_t nodeMask = 1 << nodeIndex;
+ bool foundEntries = false;
+ for (std::vector<api::ApplyBucketDiffCommand::Entry>::const_iterator it
+ = diff.begin(); it != diff.end(); ++it)
+ {
+ // Ignore entries we don't need locally
+ if ((it->_entry._hasMask & nodeMask) != 0) continue;
+ foundEntries = true;
+ if (it->filled()) return true;
+ }
+ if (foundEntries) {
+ LOG(spam, "Merge(): Found entries needed, but they don't contain "
+ "data");
+ }
+ return false;
+ }
+
+ int
+ countUnfilledEntries(
+ const std::vector<api::ApplyBucketDiffCommand::Entry>& diff)
+ {
+ int count = 0;
+
+ for (uint32_t i=0, n=diff.size(); i<n; ++i) {
+ if (!diff[i].filled()) count++;
+ }
+
+ return count;
+ };
+
+ /**
+ * Get the smallest value that is dividable by blocksize, but is not
+ * smaller than value.
+ */
+ template<typename T>
+ T align(T value, uint32_t blocksize) {
+ value += blocksize - 1;
+ value -= value % blocksize;
+ return value;
+ }
+
+ api::StorageMessageAddress createAddress(const std::string& clusterName,
+ uint16_t node)
+ {
+ return api::StorageMessageAddress(
+ clusterName, lib::NodeType::STORAGE, node);
+ }
+
+ void assertContainedInBucket(const document::DocumentId& docId,
+ const document::BucketId& bucket,
+ const document::BucketIdFactory& idFactory)
+ {
+ document::BucketId docBucket(idFactory.getBucketId(docId));
+ if (!bucket.contains(docBucket)) {
+ LOG(error,
+ "Broken bucket invariant discovered while fetching data from "
+ "local persistence layer during merging; document %s does not "
+ "belong in %s. Aborting to prevent broken document data from "
+ "spreading to other nodes in the cluster.",
+ docId.toString().c_str(),
+ bucket.toString().c_str());
+ assert(!"Document not contained in bucket");
+ }
+ }
+
+} // End of anonymous namespace
+
+void
+MergeHandler::fetchLocalData(
+ const spi::Bucket& bucket,
+ const documentapi::LoadType& /*loadType*/,
+ std::vector<api::ApplyBucketDiffCommand::Entry>& diff,
+ uint8_t nodeIndex,
+ spi::Context& context)
+{
+ uint32_t nodeMask = 1 << nodeIndex;
+ // Preload documents in memory
+ std::vector<spi::Timestamp> slots;
+ uint32_t alreadyFilled = 0;
+ for (uint32_t i=0, n=diff.size(); i<n; ++i) {
+ api::ApplyBucketDiffCommand::Entry& e(diff[i]);
+ if ((e._entry._hasMask & nodeMask) != 0 && !e.filled()) {
+ slots.push_back(spi::Timestamp(e._entry._timestamp));
+ }
+ if (e.filled()) {
+ alreadyFilled += e._headerBlob.size() + e._bodyBlob.size();
+ }
+ }
+ uint32_t remainingSize = _maxChunkSize - std::min(_maxChunkSize,
+ alreadyFilled);
+ LOG(debug, "Diff of %s has already filled %u of max %u bytes, "
+ "remaining size to fill is %u",
+ bucket.toString().c_str(), alreadyFilled, _maxChunkSize, remainingSize);
+ if (remainingSize == 0) {
+ LOG(debug,
+ "Diff already at max chunk size, not fetching any local data");
+ return;
+ }
+
+ spi::DocumentSelection docSel("");
+
+ spi::Selection sel(docSel);
+ sel.setTimestampSubset(slots);
+ spi::CreateIteratorResult createIterResult(
+ _spi.createIterator(bucket,
+ document::AllFields(),
+ sel,
+ spi::NEWEST_DOCUMENT_OR_REMOVE,
+ context));
+
+ if (createIterResult.getErrorCode() != spi::Result::NONE) {
+ std::ostringstream ss;
+ ss << "Failed to create iterator for "
+ << bucket.toString()
+ << ": "
+ << createIterResult.getErrorMessage();
+ throw std::runtime_error(ss.str());
+ }
+ spi::IteratorId iteratorId(createIterResult.getIteratorId());
+ IteratorGuard iteratorGuard(_spi, iteratorId, context);
+
+ // Fetch all entries
+ std::vector<spi::DocEntry::LP> entries;
+ entries.reserve(slots.size());
+ bool fetchedAllLocalData = false;
+ bool chunkLimitReached = false;
+ while (true) {
+ spi::IterateResult result(
+ _spi.iterate(iteratorId, remainingSize, context));
+ if (result.getErrorCode() != spi::Result::NONE) {
+ std::ostringstream ss;
+ ss << "Failed to iterate for "
+ << bucket.toString()
+ << ": "
+ << result.getErrorMessage();
+ throw std::runtime_error(ss.str());
+ }
+ for (size_t i = 0; i < result.getEntries().size(); ++i) {
+ if (result.getEntries()[i]->getSize() <= remainingSize
+ || (entries.empty() && alreadyFilled == 0))
+ {
+ entries.push_back(result.getEntries()[i]);
+ remainingSize -= result.getEntries()[i]->getSize();
+ LOG(spam, "Added %s, remainingSize is %u",
+ entries.back()->toString().c_str(),
+ remainingSize);
+ } else {
+ LOG(spam, "Adding %s would exceed chunk size limit of %u; "
+ "not filling up any more diffs for current round",
+ result.getEntries()[i]->toString().c_str(), _maxChunkSize);
+ chunkLimitReached = true;
+ break;
+ }
+ }
+ if (result.isCompleted() && !chunkLimitReached) {
+ fetchedAllLocalData = true;
+ break;
+ } else if (chunkLimitReached) {
+ break;
+ }
+ }
+
+ document::BucketIdFactory idFactory;
+
+ for (size_t i=0; i<entries.size(); ++i) {
+ const spi::DocEntry& docEntry(*entries[i]);
+ LOG(spam, "fetchLocalData: processing %s",
+ docEntry.toString().c_str());
+
+ std::vector<api::ApplyBucketDiffCommand::Entry>::iterator iter(
+ std::lower_bound(diff.begin(),
+ diff.end(),
+ api::Timestamp(docEntry.getTimestamp()),
+ DiffEntryTimestampPredicate()));
+ assert(iter != diff.end());
+ assert(iter->_entry._timestamp == docEntry.getTimestamp());
+ api::ApplyBucketDiffCommand::Entry& e(*iter);
+
+ if (!docEntry.isRemove()) {
+ const Document* doc = docEntry.getDocument();
+ assert(doc != 0);
+ assertContainedInBucket(doc->getId(), bucket, idFactory);
+ e._docName = doc->getId().toString();
+ {
+ vespalib::nbostream stream;
+ doc->serializeHeader(stream);
+ e._headerBlob.resize(stream.size());
+ memcpy(&e._headerBlob[0], stream.peek(), stream.size());
+ }
+ {
+ vespalib::nbostream stream;
+ doc->serializeBody(stream);
+ e._bodyBlob.resize(stream.size());
+ memcpy(&e._bodyBlob[0], stream.peek(), stream.size());
+ }
+ } else {
+ const DocumentId* docId = docEntry.getDocumentId();
+ assert(docId != 0);
+ assertContainedInBucket(*docId, bucket, idFactory);
+ if (e._entry._flags & DELETED) {
+ e._docName = docId->toString();
+ } else {
+ LOG(debug, "Diff contains non-remove entry %s, but local entry "
+ "was remove entry %s. Node will be removed from hasmask",
+ e.toString().c_str(), docEntry.toString().c_str());
+ }
+ }
+ e._repo = _env._repo.get();
+ }
+
+ for (size_t i=0; i<diff.size(); ++i) {
+ api::ApplyBucketDiffCommand::Entry& e(diff[i]);
+ if ((e._entry._hasMask & nodeMask) == 0 || e.filled()) {
+ continue;
+ }
+ if (fetchedAllLocalData) {
+ e._entry._hasMask &= ~nodeMask;
+ LOG(debug, "During merge, slot %" PRIu64 " no longer exists. "
+ "Removing it from hasmask of current node.",
+ e._entry._timestamp);
+ }
+ }
+
+ LOG(spam, "Fetched %" PRIu64 " entries locally to fill out diff for %s. "
+ "Still %d unfilled entries",
+ entries.size(), bucket.toString().c_str(), countUnfilledEntries(diff));
+}
+
+document::Document::UP
+MergeHandler::deserializeDiffDocument(
+ const api::ApplyBucketDiffCommand::Entry& e,
+ const document::DocumentTypeRepo& repo) const
+{
+ Document::UP doc(new Document);
+ using document::ByteBuffer;
+ ByteBuffer hbuf(&e._headerBlob[0], e._headerBlob.size());
+ if (e._bodyBlob.size() > 0) {
+ ByteBuffer bbuf(&e._bodyBlob[0], e._bodyBlob.size());
+ doc->deserialize(repo, hbuf, bbuf);
+ } else {
+ doc->deserialize(repo, hbuf);
+ }
+ return doc;
+}
+
+void
+MergeHandler::applyDiffEntry(const spi::Bucket& bucket,
+ const api::ApplyBucketDiffCommand::Entry& e,
+ spi::Context& context,
+ const document::DocumentTypeRepo& repo)
+{
+ spi::Timestamp timestamp(e._entry._timestamp);
+ if (!(e._entry._flags & (DELETED | DELETED_IN_PLACE))) {
+ // Regular put entry
+ Document::SP doc(deserializeDiffDocument(e, repo));
+ checkResult(_spi.put(bucket, timestamp, doc, context),
+ bucket,
+ doc->getId(),
+ "put");
+ } else {
+ DocumentId docId(e._docName);
+ checkResult(_spi.remove(bucket, timestamp, docId, context),
+ bucket,
+ docId,
+ "remove");
+ }
+}
+
+/**
+ * Apply the diffs needed locally.
+ */
+api::BucketInfo
+MergeHandler::applyDiffLocally(
+ const spi::Bucket& bucket,
+ const documentapi::LoadType& /*loadType*/,
+ std::vector<api::ApplyBucketDiffCommand::Entry>& diff,
+ uint8_t nodeIndex,
+ spi::Context& context)
+{
+ // Sort the data to apply by which file they should be added to
+ LOG(spam, "Merge(%s): Applying data locally. Diff has %zu entries",
+ bucket.toString().c_str(),
+ diff.size());
+ uint32_t nodeMask = 1 << nodeIndex;
+ uint32_t byteCount = 0;
+ uint32_t addedCount = 0;
+ uint32_t notNeededByteCount = 0;
+
+ std::vector<spi::DocEntry::LP> entries;
+ populateMetaData(bucket, MAX_TIMESTAMP, entries, context);
+
+ FlushGuard flushGuard(_spi, bucket, context);
+
+ document::DocumentTypeRepo::SP repo(_env._component.getTypeRepo());
+ assert(repo.get() != nullptr);
+
+ uint32_t existingCount = entries.size();
+ uint32_t i = 0, j = 0;
+ while (i < diff.size() && j < existingCount) {
+ api::ApplyBucketDiffCommand::Entry& e(diff[i]);
+ const spi::DocEntry& existing(*entries[j]);
+
+ if (spi::Timestamp(e._entry._timestamp) > existing.getTimestamp()) {
+ ++j;
+ LOG(spam, "ApplyBucketDiff(%s): slot %s not in diff and "
+ "already present in persistence", bucket.toString().c_str(),
+ existing.toString().c_str());
+ continue;
+ }
+ if ((e._entry._hasMask & nodeMask) != 0) {
+ ++i;
+ if (!e.filled()) continue;
+ notNeededByteCount += e._headerBlob.size() + e._bodyBlob.size();
+ continue;
+ }
+ if (!e.filled()) {
+ ++i;
+ LOG(debug, "Failed to apply unretrieved entry %s to diff "
+ "locally on %s. Entry was probably compacted away.",
+ e.toString().c_str(), bucket.toString().c_str());
+ continue;
+ }
+
+ e._entry._hasMask |= nodeMask;
+ if (spi::Timestamp(e._entry._timestamp) < existing.getTimestamp()) {
+ ++i;
+ LOG(spam, "ApplyBucketDiff(%s): Adding slot %s",
+ bucket.toString().c_str(), e.toString().c_str());
+ applyDiffEntry(bucket, e, context, *repo);
+ } else {
+ assert(spi::Timestamp(e._entry._timestamp)
+ == existing.getTimestamp());
+ // Diffing for existing timestamp; should either both be put
+ // dupes (which is a common case) or the new entry should be an
+ // unrevertable remove.
+ ++i;
+ ++j;
+ if ((e._entry._flags & DELETED) && !existing.isRemove()) {
+ LOG(debug, "Slot in diff is remove for existing "
+ "timestamp in %s. Diff slot: %s. Existing slot: %s",
+ bucket.toString().c_str(), e.toString().c_str(),
+ existing.toString().c_str());
+ applyDiffEntry(bucket, e, context, *repo);
+ } else {
+ // Duplicate put, just ignore it.
+ LOG(debug, "During diff apply, attempting to add slot "
+ "whose timestamp already exists in %s, but assuming "
+ "these are for the same entry--ignoring it. "
+ "Diff slot: %s. Existing slot: %s",
+ bucket.toString().c_str(), e.toString().c_str(),
+ existing.toString().c_str());
+ }
+ continue;
+ }
+ byteCount += e._headerBlob.size() + e._bodyBlob.size();
+ }
+ // Handle remaining entries in diff
+ for (; i < diff.size(); ++i) {
+ api::ApplyBucketDiffCommand::Entry& e(diff[i]);
+ if ((e._entry._hasMask & nodeMask) != 0) {
+ if (!e.filled()) continue;
+ notNeededByteCount += e._headerBlob.size() + e._bodyBlob.size();
+ continue;
+ }
+ if (!e.filled()) {
+ LOG(debug, "Failed to apply unretrieved entry %s to diff "
+ "locally on %s. Entry was probably compacted away.",
+ e.toString().c_str(), bucket.toString().c_str());
+ continue;
+ }
+ e._entry._hasMask |= nodeMask;
+ LOG(spam, "ApplyBucketDiff(%s): Adding slot %s",
+ bucket.toString().c_str(), e.toString().c_str());
+
+ applyDiffEntry(bucket, e, context, *repo);
+ byteCount += e._headerBlob.size() + e._bodyBlob.size();
+ }
+
+ if (byteCount + notNeededByteCount != 0) {
+ _env._metrics.mergeAverageDataReceivedNeeded.addValue(
+ static_cast<double>(byteCount) / (byteCount + notNeededByteCount));
+ }
+ _env._metrics.bytesMerged.inc(byteCount);
+ LOG(debug, "Merge(%s): Applied %u entries locally from ApplyBucketDiff.",
+ bucket.toString().c_str(), addedCount);
+
+ flushGuard.flush();
+
+ spi::BucketInfoResult infoResult(_spi.getBucketInfo(bucket));
+ if (infoResult.getErrorCode() != spi::Result::NONE) {
+ LOG(warning, "Failed to get bucket info for %s: %s",
+ bucket.toString().c_str(),
+ infoResult.getErrorMessage().c_str());
+ throw std::runtime_error("Failed to invoke getBucketInfo on "
+ "persistence provider");
+ }
+ const spi::BucketInfo& tmpInfo(infoResult.getBucketInfo());
+ api::BucketInfo providerInfo(tmpInfo.getChecksum(),
+ tmpInfo.getDocumentCount(),
+ tmpInfo.getDocumentSize(),
+ tmpInfo.getEntryCount(),
+ tmpInfo.getUsedSize(),
+ tmpInfo.isReady(),
+ tmpInfo.isActive());
+
+ _env.updateBucketDatabase(bucket.getBucketId(), providerInfo);
+ return providerInfo;
+}
+
+namespace {
+ void findCandidates(const document::BucketId& id, MergeStatus& status,
+ bool constrictHasMask, uint16_t hasMask,
+ uint16_t newHasMask,
+ uint32_t maxSize, api::ApplyBucketDiffCommand& cmd)
+ {
+ uint32_t chunkSize = 0;
+ for (std::deque<api::GetBucketDiffCommand::Entry>::const_iterator it
+ = status.diff.begin(); it != status.diff.end(); ++it)
+ {
+ if (constrictHasMask && it->_hasMask != hasMask) {
+ continue;
+ }
+ if (chunkSize != 0 &&
+ chunkSize + it->_bodySize + it->_headerSize > maxSize)
+ {
+ LOG(spam, "Merge of %s used %d bytes, max is %d. Will "
+ "fetch in next merge round.",
+ id.toString().c_str(),
+ chunkSize + it->_bodySize + it->_headerSize,
+ maxSize);
+ break;
+ }
+ chunkSize += it->_bodySize + it->_headerSize;
+ cmd.getDiff().push_back(api::ApplyBucketDiffCommand::Entry(*it));
+ if (constrictHasMask) {
+ cmd.getDiff().back()._entry._hasMask = newHasMask;
+ }
+ }
+ }
+}
+
+api::StorageReply::SP
+MergeHandler::processBucketMerge(const spi::Bucket& bucket, MergeStatus& status,
+ MessageSender& sender, spi::Context& context)
+{
+ // If last action failed, fail the whole merge
+ if (status.reply->getResult().failed()) {
+ LOG(warning, "Done with merge of %s (failed: %s) %s",
+ bucket.toString().c_str(),
+ status.reply->getResult().toString().c_str(),
+ status.toString().c_str());
+ return status.reply;
+ }
+
+ // If nothing to update, we're done.
+ if (status.diff.size() == 0) {
+ LOG(debug, "Done with merge of %s. No more entries in diff.",
+ bucket.toString().c_str());
+ return status.reply;
+ }
+
+ LOG(spam, "Processing merge of %s. %u entries left to merge.",
+ bucket.toString().c_str(), (uint32_t) status.diff.size());
+ std::shared_ptr<api::ApplyBucketDiffCommand> cmd;
+
+ // If we still have a source only node, eliminate that one from the
+ // merge.
+ while (status.nodeList.back().sourceOnly) {
+ std::vector<api::MergeBucketCommand::Node> nodes;
+ for (uint16_t i=0; i<status.nodeList.size(); ++i) {
+ if (!status.nodeList[i].sourceOnly) {
+ nodes.push_back(status.nodeList[i]);
+ }
+ }
+ nodes.push_back(status.nodeList.back());
+ assert(nodes.size() > 1);
+
+ // Add all the metadata, and thus use big limit. Max
+ // data to fetch parameter will control amount added.
+ uint32_t maxSize =
+ (_env._config.enableMergeLocalNodeChooseDocsOptimalization
+ ? std::numeric_limits<uint32_t>().max()
+ : _maxChunkSize);
+
+ cmd.reset(new api::ApplyBucketDiffCommand(
+ bucket.getBucketId(), nodes, maxSize));
+ cmd->setAddress(createAddress(_env._component.getClusterName(),
+ nodes[1].index));
+ findCandidates(bucket.getBucketId(),
+ status,
+ true,
+ 1 << (status.nodeList.size() - 1),
+ 1 << (nodes.size() - 1),
+ maxSize,
+ *cmd);
+ if (cmd->getDiff().size() != 0) break;
+ cmd.reset();
+ // If we found no data to merge from the last source only node,
+ // remove it and retry. (Clear it out of the hasmask such that we
+ // can match hasmask with operator==)
+ status.nodeList.pop_back();
+ uint16_t mask = ~(1 << status.nodeList.size());
+ for (std::deque<api::GetBucketDiffCommand::Entry>::iterator it
+ = status.diff.begin(); it != status.diff.end(); ++it)
+ {
+ it->_hasMask &= mask;
+ }
+ // If only one node left in the merge, return ok.
+ if (status.nodeList.size() == 1) {
+ LOG(debug, "Done with merge of %s as there is only one node "
+ "that is not source only left in the merge.",
+ bucket.toString().c_str());
+ return status.reply;
+ }
+ }
+ // If we did not have a source only node, check if we have a path with
+ // many documents within it that we'll merge separately
+ if (cmd.get() == 0) {
+ std::map<uint16_t, uint32_t> counts;
+ for (std::deque<api::GetBucketDiffCommand::Entry>::const_iterator it
+ = status.diff.begin(); it != status.diff.end(); ++it)
+ {
+ ++counts[it->_hasMask];
+ }
+ for (std::map<uint16_t, uint32_t>::const_iterator it = counts.begin();
+ it != counts.end(); ++it)
+ {
+ if (it->second >= uint32_t(
+ _env._config.commonMergeChainOptimalizationMinimumSize)
+ || counts.size() == 1)
+ {
+ LOG(spam, "Sending separate apply bucket diff for path %x "
+ "with size %u",
+ it->first, it->second);
+ std::vector<api::MergeBucketCommand::Node> nodes;
+ // This node always has to be first in chain.
+ nodes.push_back(status.nodeList[0]);
+ // Add all the nodes that lack the docs in question
+ for (uint16_t i=1; i<status.nodeList.size(); ++i) {
+ if ((it->first & (1 << i)) == 0) {
+ nodes.push_back(status.nodeList[i]);
+ }
+ }
+ uint16_t newMask = 1;
+ // If this node doesn't have the docs, add a node that has
+ // them to the end of the chain, so the data is applied
+ // going back.
+ if ((it->first & 1) == 0) {
+ for (uint16_t i=1; i<status.nodeList.size(); ++i) {
+ if ((it->first & (1 << i)) != 0) {
+ nodes.push_back(status.nodeList[i]);
+ break;
+ }
+ }
+ newMask = 1 << (nodes.size() - 1);
+ }
+ assert(nodes.size() > 1);
+ uint32_t maxSize =
+ (_env._config.enableMergeLocalNodeChooseDocsOptimalization
+ ? std::numeric_limits<uint32_t>().max()
+ : _maxChunkSize);
+ cmd.reset(new api::ApplyBucketDiffCommand(
+ bucket.getBucketId(), nodes, maxSize));
+ cmd->setAddress(
+ createAddress(_env._component.getClusterName(),
+ nodes[1].index));
+ // Add all the metadata, and thus use big limit. Max
+ // data to fetch parameter will control amount added.
+ findCandidates(bucket.getBucketId(), status, true,
+ it->first, newMask, maxSize, *cmd);
+ break;
+ }
+ }
+ }
+
+ // If we found no group big enough to handle on its own, do a common
+ // merge to merge the remaining data.
+ if (cmd.get() == 0) {
+ cmd.reset(new api::ApplyBucketDiffCommand(bucket.getBucketId(),
+ status.nodeList,
+ _maxChunkSize));
+ cmd->setAddress(createAddress(_env._component.getClusterName(),
+ status.nodeList[1].index));
+ findCandidates(bucket.getBucketId(), status, false, 0, 0,
+ _maxChunkSize, *cmd);
+ }
+ cmd->setPriority(status.context.getPriority());
+ cmd->setTimeout(status.timeout);
+ if (applyDiffNeedLocalData(cmd->getDiff(), 0, true)) {
+ framework::MilliSecTimer startTime(_env._component.getClock());
+ fetchLocalData(bucket, cmd->getLoadType(), cmd->getDiff(), 0, context);
+ _env._metrics.mergeDataReadLatency.addValue(startTime);
+ }
+ status.pendingId = cmd->getMsgId();
+ LOG(debug, "Sending %s", cmd->toString().c_str());
+ sender.sendCommand(cmd);
+ return api::StorageReply::SP();
+}
+
+/** Ensures merge states are deleted if we fail operation */
+class MergeStateDeleter {
+public:
+ FileStorHandler& _handler;
+ document::BucketId _bucket;
+ bool _active;
+
+ MergeStateDeleter(FileStorHandler& handler,
+ const document::BucketId& bucket)
+ : _handler(handler),
+ _bucket(bucket),
+ _active(true)
+ {
+ }
+
+ ~MergeStateDeleter() {
+ if (_active) {
+ _handler.clearMergeStatus(_bucket);
+ }
+ }
+
+ void deactivate() { _active = false; }
+};
+
+MessageTracker::UP
+MergeHandler::handleMergeBucket(api::MergeBucketCommand& cmd,
+ spi::Context& context)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.mergeBuckets,
+ _env._component.getClock()));
+
+ const document::BucketId& id(cmd.getBucketId());
+ spi::Bucket bucket(id, spi::PartitionId(_env._partition));
+ LOG(debug, "MergeBucket(%s) with max timestamp %" PRIu64 ".",
+ bucket.toString().c_str(), cmd.getMaxTimestamp());
+
+ if (cmd.getNodes().size() < 2) {
+ LOG(debug, "Attempt to merge a single instance of a bucket");
+ tracker->fail(ReturnCode::ILLEGAL_PARAMETERS,
+ "Cannot merge a single copy");
+ return tracker;
+ }
+
+ // Verify that first node is not source only, and that all source only
+ // nodes are at end of chain
+ for (uint16_t i=0; i<cmd.getNodes().size(); ++i) {
+ if (i == 0) {
+ if (cmd.getNodes()[i].sourceOnly) {
+ tracker->fail(ReturnCode::ILLEGAL_PARAMETERS,
+ "Attempted to merge a chain where the first node "
+ "in the chain is source only.");
+ return tracker;
+ }
+ } else {
+ if (!cmd.getNodes()[i].sourceOnly
+ && cmd.getNodes()[i-1].sourceOnly)
+ {
+ tracker->fail(ReturnCode::ILLEGAL_PARAMETERS,
+ "Attempted to merge a chain where the source only "
+ "copies are not in end of chain.");
+ return tracker;
+ }
+ }
+ }
+
+ if (_env._fileStorHandler.isMerging(id)) {
+ const char* err = "A merge is already running on this bucket.";
+ LOG(debug, err);
+ tracker->fail(ReturnCode::BUSY, err);
+ return tracker;
+ }
+ checkResult(_spi.createBucket(bucket, context), bucket, "create bucket");
+
+ MergeStateDeleter stateGuard(_env._fileStorHandler, id);
+ MergeStatus::SP s = MergeStatus::SP(new MergeStatus(
+ _env._component.getClock(), cmd.getLoadType(),
+ cmd.getPriority(), cmd.getTrace().getLevel()));
+ _env._fileStorHandler.addMergeStatus(id, s);
+ s->nodeList = cmd.getNodes();
+ s->maxTimestamp = Timestamp(cmd.getMaxTimestamp());
+ s->timeout = cmd.getTimeout();
+ s->startTime = framework::MilliSecTimer(_env._component.getClock());
+
+ std::shared_ptr<api::GetBucketDiffCommand> cmd2(
+ new api::GetBucketDiffCommand(id,
+ s->nodeList,
+ s->maxTimestamp.getTime()));
+ if (!buildBucketInfoList(bucket,
+ cmd.getLoadType(),
+ s->maxTimestamp,
+ 0,
+ cmd2->getDiff(),
+ context))
+ {
+ LOG(debug, "Bucket non-existing in db. Failing merge.");
+ tracker->fail(ReturnCode::BUCKET_DELETED,
+ "Bucket not found in buildBucketInfo step");
+ return tracker;
+ }
+ _env._metrics.mergeMetadataReadLatency.addValue(s->startTime);
+ LOG(spam, "Sending GetBucketDiff %" PRIu64 " for %s to next node %u "
+ "with diff of %u entries.",
+ cmd2->getMsgId(),
+ bucket.toString().c_str(),
+ s->nodeList[1].index,
+ uint32_t(cmd2->getDiff().size()));
+ cmd2->setAddress(createAddress(_env._component.getClusterName(),
+ s->nodeList[1].index));
+ cmd2->setPriority(s->context.getPriority());
+ cmd2->setTimeout(s->timeout);
+ cmd2->setSourceIndex(cmd.getSourceIndex());
+
+ s->pendingId = cmd2->getMsgId();
+ _env._fileStorHandler.sendCommand(cmd2);
+ // All went well. Dont delete state or send reply.
+ stateGuard.deactivate();
+ s->reply = api::StorageReply::SP(cmd.makeReply().release());
+ tracker->dontReply();
+ return tracker;
+}
+
+namespace {
+
+ uint8_t findOwnIndex(
+ const std::vector<api::MergeBucketCommand::Node>& nodeList,
+ uint16_t us)
+ {
+ for (uint32_t i=0, n=nodeList.size(); i<n; ++i) {
+ if (nodeList[i].index == us) return i;
+ }
+ throw vespalib::IllegalStateException(
+ "Got GetBucketDiff cmd on node not in nodelist in command",
+ VESPA_STRLOC);
+ }
+
+ struct DiffEntryTimestampOrder
+ : public std::binary_function<api::GetBucketDiffCommand::Entry,
+ api::GetBucketDiffCommand::Entry, bool>
+ {
+ bool operator()(const api::GetBucketDiffCommand::Entry& x,
+ const api::GetBucketDiffCommand::Entry& y) const
+ { return (x._timestamp < y._timestamp); }
+ };
+
+ /**
+ * Merges list A and list B together and puts the result in result.
+ * Result is swapped in as last step to keep function exception safe. Thus
+ * result can be listA or listB if wanted.
+ *
+ * listA and listB are assumed to be in the order found in the slotfile, or
+ * in the order given by a previous call to this function. (In both cases
+ * this will be sorted by timestamp)
+ *
+ * @return false if any suspect entries was found.
+ */
+ bool mergeLists(
+ const std::vector<api::GetBucketDiffCommand::Entry>& listA,
+ const std::vector<api::GetBucketDiffCommand::Entry>& listB,
+ std::vector<api::GetBucketDiffCommand::Entry>& finalResult)
+ {
+ bool suspect = false;
+ std::vector<api::GetBucketDiffCommand::Entry> result;
+ uint32_t i = 0, j = 0;
+ while (i < listA.size() && j < listB.size()) {
+ const api::GetBucketDiffCommand::Entry& a(listA[i]);
+ const api::GetBucketDiffCommand::Entry& b(listB[j]);
+ if (a._timestamp < b._timestamp) {
+ result.push_back(a);
+ ++i;
+ } else if (a._timestamp > b._timestamp) {
+ result.push_back(b);
+ ++j;
+ } else {
+ // If we find equal timestamped entries that are not the
+ // same.. Flag an error. But there is nothing we can do
+ // about it. Note it as if it is the same entry so we
+ // dont try to merge them.
+ if (!(a == b)) {
+ if (a._gid == b._gid && a._flags == b._flags) {
+ if ((a._flags & getDeleteFlag()) != 0 &&
+ (b._flags & getDeleteFlag()) != 0)
+ {
+ // Unfortunately this can happen, for instance
+ // if a remove comes to a bucket out of sync
+ // and reuses different headers in the two
+ // versions.
+ LOG(debug, "Found entries with equal timestamps of "
+ "the same gid who both are remove "
+ "entries: %s <-> %s.",
+ a.toString(true).c_str(),
+ b.toString(true).c_str());
+ } else {
+ LOG(error, "Found entries with equal timestamps of "
+ "the same gid. This is likely same "
+ "document where size of document varies:"
+ " %s <-> %s.",
+ a.toString(true).c_str(),
+ b.toString(true).c_str());
+ }
+ result.push_back(a);
+ result.back()._hasMask |= b._hasMask;
+ suspect = true;
+ } else if ((a._flags & getDeleteFlag())
+ != (b._flags & getDeleteFlag()))
+ {
+ // If we find one remove and one put entry on the
+ // same timestamp we are going to keep the remove
+ // entry to make the copies consistent.
+ const api::GetBucketDiffCommand::Entry& deletedEntry(
+ (a._flags & getDeleteFlag()) != 0 ? a : b);
+ result.push_back(deletedEntry);
+ LOG(debug,
+ "Found put and remove on same timestamp. Keeping"
+ "remove as it is likely caused by remove with "
+ "copies unavailable at the time: %s, %s.",
+ a.toString().c_str(), b.toString().c_str());
+ } else {
+ LOG(error, "Found entries with equal timestamps that "
+ "weren't the same entry: %s, %s.",
+ a.toString().c_str(), b.toString().c_str());
+ result.push_back(a);
+ result.back()._hasMask |= b._hasMask;
+ suspect = true;
+ }
+ } else {
+ result.push_back(a);
+ result.back()._hasMask |= b._hasMask;
+ }
+ ++i;
+ ++j;
+ }
+ }
+ if (i < listA.size()) {
+ assert(j >= listB.size());
+ for (uint32_t n = listA.size(); i<n; ++i) {
+ result.push_back(listA[i]);
+ }
+ } else if (j < listB.size()) {
+ assert(i >= listA.size());
+ for (uint32_t n = listB.size(); j<n; ++j) {
+ result.push_back(listB[j]);
+ }
+ }
+ result.swap(finalResult);
+ return !suspect;
+ }
+
+}
+
+MessageTracker::UP
+MergeHandler::handleGetBucketDiff(api::GetBucketDiffCommand& cmd,
+ spi::Context& context)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.getBucketDiff,
+ _env._component.getClock()));
+ const document::BucketId& id(cmd.getBucketId());
+ spi::Bucket bucket(id, spi::PartitionId(_env._partition));
+ LOG(debug, "GetBucketDiff(%s)", bucket.toString().c_str());
+ checkResult(_spi.createBucket(bucket, context), bucket, "create bucket");
+
+ if (_env._fileStorHandler.isMerging(id)) {
+ tracker->fail(ReturnCode::BUSY,
+ "A merge is already running on this bucket.");
+ return tracker;
+ }
+ uint8_t index = findOwnIndex(cmd.getNodes(), _env._nodeIndex);
+ // Merge info for retrieved and local info.
+ std::vector<api::GetBucketDiffCommand::Entry>& remote(cmd.getDiff());
+ std::vector<api::GetBucketDiffCommand::Entry> local;
+ framework::MilliSecTimer startTime(_env._component.getClock());
+ if (!buildBucketInfoList(bucket, cmd.getLoadType(),
+ Timestamp(cmd.getMaxTimestamp()),
+ index, local, context))
+ {
+ LOG(debug, "Bucket non-existing in db. Failing merge.");
+ tracker->fail(ReturnCode::BUCKET_DELETED,
+ "Bucket not found in buildBucketInfo step");
+ return tracker;
+ }
+ if (!mergeLists(remote, local, local)) {
+ LOG(error, "Diffing %s found suspect entries.",
+ bucket.toString().c_str());
+ }
+ _env._metrics.mergeMetadataReadLatency.addValue(startTime);
+
+ // If last node in merge chain, we can send reply straight away
+ if (index + 1u >= cmd.getNodes().size()) {
+ // Remove entries everyone has from list first.
+ uint16_t completeMask = 0;
+ for (uint32_t i=0; i<cmd.getNodes().size(); ++i) {
+ if (!cmd.getNodes()[i].sourceOnly) {
+ completeMask |= (1 << i);
+ }
+ }
+ std::vector<api::GetBucketDiffCommand::Entry> final;
+ for (uint32_t i=0, n=local.size(); i<n; ++i) {
+ if ((local[i]._hasMask & completeMask) != completeMask) {
+ final.push_back(local[i]);
+ }
+ }
+ // Send reply
+ LOG(spam, "Replying to GetBucketDiff %" PRIu64 " for %s to node %d"
+ ". Diff has %" PRIu64 " entries. (%" PRIu64 " before compaction)",
+ cmd.getMsgId(), bucket.toString().c_str(),
+ cmd.getNodes()[index - 1].index, final.size(), local.size());
+
+ api::GetBucketDiffReply* reply = new api::GetBucketDiffReply(cmd);
+ tracker->setReply(api::StorageReply::SP(reply));
+ reply->getDiff().swap(final);
+ } else {
+ // When not the last node in merge chain, we must save reply, and
+ // send command on.
+ MergeStateDeleter stateGuard(_env._fileStorHandler, id);
+ MergeStatus::SP s(new MergeStatus(_env._component.getClock(),
+ cmd.getLoadType(), cmd.getPriority(),
+ cmd.getTrace().getLevel()));
+ _env._fileStorHandler.addMergeStatus(id, s);
+
+ s->pendingGetDiff =
+ api::GetBucketDiffReply::SP(new api::GetBucketDiffReply(cmd));
+ s->pendingGetDiff->setPriority(cmd.getPriority());
+
+ LOG(spam, "Sending GetBucketDiff for %s on to node %d, "
+ "added %" PRIu64 " new entries to diff.",
+ bucket.toString().c_str(), cmd.getNodes()[index + 1].index,
+ local.size() - remote.size());
+ std::shared_ptr<api::GetBucketDiffCommand> cmd2(
+ new api::GetBucketDiffCommand(
+ id, cmd.getNodes(), cmd.getMaxTimestamp()));
+ cmd2->setAddress(createAddress(_env._component.getClusterName(),
+ cmd.getNodes()[index + 1].index));
+ cmd2->getDiff().swap(local);
+ cmd2->setPriority(cmd.getPriority());
+ cmd2->setTimeout(cmd.getTimeout());
+ s->pendingId = cmd2->getMsgId();
+ _env._fileStorHandler.sendCommand(cmd2);
+
+ // Everything went fine. Don't delete state but wait for reply
+ stateGuard.deactivate();
+ tracker->dontReply();
+ }
+
+ return tracker;
+}
+
+namespace {
+
+ struct DiffInfoTimestampOrder
+ : public std::binary_function<api::GetBucketDiffCommand::Entry,
+ api::GetBucketDiffCommand::Entry, bool>
+ {
+ bool operator()(const api::GetBucketDiffCommand::Entry& x,
+ const api::GetBucketDiffCommand::Entry& y)
+ {
+ return (x._timestamp < y._timestamp);
+ }
+ };
+
+ struct ApplyDiffInfoTimestampOrder
+ : public std::binary_function<api::ApplyBucketDiffCommand::Entry,
+ api::ApplyBucketDiffCommand::Entry, bool>
+ {
+ bool operator()(const api::ApplyBucketDiffCommand::Entry& x,
+ const api::ApplyBucketDiffCommand::Entry& y)
+ {
+ return (x._entry._timestamp
+ < y._entry._timestamp);
+ }
+ };
+
+} // End of anonymous namespace
+
+void
+MergeHandler::handleGetBucketDiffReply(api::GetBucketDiffReply& reply,
+ MessageSender& sender)
+{
+ ++_env._metrics.getBucketDiffReply;
+ document::BucketId id(reply.getBucketId());
+ spi::Bucket bucket(id, spi::PartitionId(_env._partition));
+ LOG(debug, "GetBucketDiffReply(%s)", bucket.toString().c_str());
+
+ if (!_env._fileStorHandler.isMerging(id)) {
+ LOG(warning, "Got GetBucketDiffReply for %s which we have no "
+ "merge state for.",
+ bucket.toString().c_str());
+ DUMP_LOGGED_BUCKET_OPERATIONS(id);
+ return;
+ }
+
+ MergeStatus& s = _env._fileStorHandler.editMergeStatus(id);
+ if (s.pendingId != reply.getMsgId()) {
+ LOG(warning, "Got GetBucketDiffReply for %s which had message "
+ "id %" PRIu64 " when we expected %" PRIu64 ". Ignoring reply.",
+ bucket.toString().c_str(), reply.getMsgId(), s.pendingId);
+ DUMP_LOGGED_BUCKET_OPERATIONS(id);
+ return;
+ }
+ api::StorageReply::SP replyToSend;
+ bool clearState = true;
+
+ try {
+ if (s.isFirstNode()) {
+ if (reply.getResult().failed()) {
+ // We failed, so we should reply to the pending message.
+ replyToSend = s.reply;
+ } else {
+ // If we didn't fail, reply should have good content
+ // Sanity check for nodes
+ assert(reply.getNodes().size() >= 2);
+
+ // Get bucket diff should retrieve all info at once
+ assert(s.diff.size() == 0);
+ s.diff.insert(s.diff.end(),
+ reply.getDiff().begin(),
+ reply.getDiff().end());
+
+ replyToSend = processBucketMerge(bucket, s, sender, s.context);
+
+ if (!replyToSend.get()) {
+ // We have sent something on, and shouldn't reply now.
+ clearState = false;
+ } else {
+ _env._metrics.mergeLatencyTotal.addValue(s.startTime);
+ }
+ }
+ } else {
+ // Exists in send on list, send on!
+ replyToSend = s.pendingGetDiff;
+ LOG(spam, "Received GetBucketDiffReply for %s with diff of "
+ "size %" PRIu64 ". Sending it on.",
+ bucket.toString().c_str(), reply.getDiff().size());
+ s.pendingGetDiff->getDiff().swap(reply.getDiff());
+ }
+ } catch (std::exception& e) {
+ _env._fileStorHandler.clearMergeStatus(
+ id,
+ api::ReturnCode(api::ReturnCode::INTERNAL_FAILURE,
+ e.what()));
+ throw;
+ } catch (...) {
+ assert(false);
+ }
+
+ if (clearState) {
+ _env._fileStorHandler.clearMergeStatus(id);
+ }
+ if (replyToSend.get()) {
+ replyToSend->setResult(reply.getResult());
+ sender.sendReply(replyToSend);
+ }
+}
+
+MessageTracker::UP
+MergeHandler::handleApplyBucketDiff(api::ApplyBucketDiffCommand& cmd,
+ spi::Context& context)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.applyBucketDiff,
+ _env._component.getClock()));
+
+ const document::BucketId& id(cmd.getBucketId());
+ spi::Bucket bucket(id, spi::PartitionId(_env._partition));
+ LOG(debug, "%s", cmd.toString().c_str());
+
+ if (_env._fileStorHandler.isMerging(id)) {
+ tracker->fail(ReturnCode::BUSY,
+ "A merge is already running on this bucket.");
+ return tracker;
+ }
+
+ uint8_t index = findOwnIndex(cmd.getNodes(), _env._nodeIndex);
+ bool lastInChain = index + 1u >= cmd.getNodes().size();
+ if (applyDiffNeedLocalData(cmd.getDiff(), index, !lastInChain)) {
+ framework::MilliSecTimer startTime(_env._component.getClock());
+ fetchLocalData(bucket, cmd.getLoadType(), cmd.getDiff(), index,
+ context);
+ _env._metrics.mergeDataReadLatency.addValue(startTime);
+ } else {
+ LOG(spam, "Merge(%s): Moving %" PRIu64 " entries, didn't need "
+ "local data on node %u (%u).",
+ bucket.toString().c_str(),
+ cmd.getDiff().size(),
+ _env._nodeIndex,
+ index);
+ }
+ if (applyDiffHasLocallyNeededData(cmd.getDiff(), index)) {
+ framework::MilliSecTimer startTime(_env._component.getClock());
+ api::BucketInfo info(applyDiffLocally(bucket, cmd.getLoadType(),
+ cmd.getDiff(), index, context));
+ _env._metrics.mergeDataWriteLatency.addValue(startTime);
+ } else {
+ LOG(spam, "Merge(%s): Didn't need fetched data on node %u (%u).",
+ bucket.toString().c_str(), _env._nodeIndex, index);
+ }
+
+ // If last node in merge chain, we can send reply straight away
+ if (lastInChain) {
+ // Unfill entries everyone has filled in before returning.
+ uint16_t completeMask = 0;
+ for (uint32_t i=0; i<cmd.getNodes().size(); ++i) {
+ if (!cmd.getNodes()[i].sourceOnly) {
+ completeMask |= (1 << i);
+ }
+ }
+ std::vector<api::ApplyBucketDiffCommand::Entry>& local(cmd.getDiff());
+ for (uint32_t i=0, n=local.size(); i<n; ++i) {
+ if ((local[i]._entry._hasMask & completeMask) == completeMask) {
+ local[i]._headerBlob.clear();
+ local[i]._bodyBlob.clear();
+ local[i]._docName.clear();
+ }
+ }
+
+ tracker->setReply(api::StorageReply::SP(new api::ApplyBucketDiffReply(cmd)));
+ static_cast<api::ApplyBucketDiffReply&>(*tracker->getReply()).getDiff().swap(
+ cmd.getDiff());
+ LOG(spam, "Replying to ApplyBucketDiff for %s to node %d.",
+ bucket.toString().c_str(), cmd.getNodes()[index - 1].index);
+ } else {
+ // When not the last node in merge chain, we must save reply, and
+ // send command on.
+ MergeStateDeleter stateGuard(_env._fileStorHandler, id);
+ MergeStatus::SP s(new MergeStatus(_env._component.getClock(),
+ cmd.getLoadType(), cmd.getPriority(),
+ cmd.getTrace().getLevel()));
+ _env._fileStorHandler.addMergeStatus(id, s);
+ s->pendingApplyDiff =
+ api::ApplyBucketDiffReply::SP(new api::ApplyBucketDiffReply(cmd));
+
+ LOG(spam, "Sending ApplyBucketDiff for %s on to node %d",
+ bucket.toString().c_str(), cmd.getNodes()[index + 1].index);
+ std::shared_ptr<api::ApplyBucketDiffCommand> cmd2(
+ new api::ApplyBucketDiffCommand(
+ id, cmd.getNodes(), cmd.getMaxBufferSize()));
+ cmd2->setAddress(createAddress(_env._component.getClusterName(),
+ cmd.getNodes()[index + 1].index));
+ cmd2->getDiff().swap(cmd.getDiff());
+ cmd2->setPriority(cmd.getPriority());
+ cmd2->setTimeout(cmd.getTimeout());
+ s->pendingId = cmd2->getMsgId();
+ _env._fileStorHandler.sendCommand(cmd2);
+ // Everything went fine. Don't delete state but wait for reply
+ stateGuard.deactivate();
+ tracker->dontReply();
+ }
+
+ return tracker;
+}
+
+void
+MergeHandler::handleApplyBucketDiffReply(api::ApplyBucketDiffReply& reply,
+ MessageSender& sender)
+{
+ ++_env._metrics.applyBucketDiffReply;
+ document::BucketId id(reply.getBucketId());
+ spi::Bucket bucket(id, spi::PartitionId(_env._partition));
+ std::vector<api::ApplyBucketDiffCommand::Entry>& diff(reply.getDiff());
+ LOG(debug, "%s", reply.toString().c_str());
+
+ if (!_env._fileStorHandler.isMerging(id)) {
+ LOG(warning, "Got ApplyBucketDiffReply for %s which we have no "
+ "merge state for.",
+ bucket.toString().c_str());
+ DUMP_LOGGED_BUCKET_OPERATIONS(id);
+ return;
+ }
+
+ MergeStatus& s = _env._fileStorHandler.editMergeStatus(id);
+ if (s.pendingId != reply.getMsgId()) {
+ LOG(warning, "Got ApplyBucketDiffReply for %s which had message "
+ "id %" PRIu64 " when we expected %" PRIu64 ". Ignoring reply.",
+ bucket.toString().c_str(), reply.getMsgId(), s.pendingId);
+ DUMP_LOGGED_BUCKET_OPERATIONS(id);
+ return;
+ }
+ bool clearState = true;
+ api::StorageReply::SP replyToSend;
+ // Process apply bucket diff locally
+ api::ReturnCode returnCode = reply.getResult();
+ try {
+ if (reply.getResult().failed()) {
+ LOG(debug, "Got failed apply bucket diff reply %s",
+ reply.toString().c_str());
+ } else {
+ assert(reply.getNodes().size() >= 2);
+ uint8_t index = findOwnIndex(reply.getNodes(), _env._nodeIndex);
+ if (applyDiffNeedLocalData(diff, index, false)) {
+ framework::MilliSecTimer startTime(_env._component.getClock());
+ fetchLocalData(bucket, reply.getLoadType(), diff, index,
+ s.context);
+ _env._metrics.mergeDataReadLatency.addValue(
+ startTime);
+ }
+ if (applyDiffHasLocallyNeededData(diff, index)) {
+ framework::MilliSecTimer startTime(_env._component.getClock());
+ api::BucketInfo info(
+ applyDiffLocally(bucket, reply.getLoadType(), diff,
+ index, s.context));
+ _env._metrics.mergeDataWriteLatency.addValue(
+ startTime);
+ } else {
+ LOG(spam, "Merge(%s): Didn't need fetched data on node %u (%u)",
+ bucket.toString().c_str(),
+ _env._nodeIndex,
+ static_cast<unsigned int>(index));
+ }
+ }
+
+ if (s.isFirstNode()) {
+ uint16_t hasMask = 0;
+ for (uint16_t i=0; i<reply.getNodes().size(); ++i) {
+ hasMask |= (1 << i);
+ }
+
+ const size_t diffSizeBefore = s.diff.size();
+ const bool altered = s.removeFromDiff(diff, hasMask);
+ if (reply.getResult().success()
+ && s.diff.size() == diffSizeBefore
+ && !altered)
+ {
+ std::string msg(
+ vespalib::make_string(
+ "Completed merge cycle without fixing "
+ "any entries (merge state diff at %zu entries)",
+ s.diff.size()));
+ returnCode = api::ReturnCode(api::ReturnCode::INTERNAL_FAILURE, msg);
+ LOG(warning,
+ "Got reply indicating merge cycle did not fix any entries: %s",
+ reply.toString(true).c_str());
+ LOG(warning,
+ "Merge state for which there was no progress across a "
+ "full merge cycle: %s",
+ s.toString().c_str());
+ }
+
+ if (returnCode.failed()) {
+ // Should reply now, since we failed.
+ replyToSend = s.reply;
+ } else {
+ replyToSend = processBucketMerge(bucket, s, sender, s.context);
+
+ if (!replyToSend.get()) {
+ // We have sent something on and shouldn't reply now.
+ clearState = false;
+ } else {
+ _env._metrics.mergeLatencyTotal.addValue(s.startTime);
+ }
+ }
+ } else {
+ replyToSend = s.pendingApplyDiff;
+ LOG(debug, "ApplyBucketDiff(%s) finished. Sending reply.",
+ bucket.toString().c_str());
+ s.pendingApplyDiff->getDiff().swap(reply.getDiff());
+ }
+ } catch (std::exception& e) {
+ _env._fileStorHandler.clearMergeStatus(
+ id,
+ api::ReturnCode(api::ReturnCode::INTERNAL_FAILURE,
+ e.what()));
+ throw;
+ } catch (...) {
+ assert(false);
+ }
+
+ if (clearState) {
+ _env._fileStorHandler.clearMergeStatus(id);
+ }
+ if (replyToSend.get()) {
+ // Send on
+ replyToSend->setResult(returnCode);
+ sender.sendReply(replyToSend);
+ }
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/mergehandler.h b/storage/src/vespa/storage/persistence/mergehandler.h
new file mode 100644
index 00000000000..f23dbe0c9b7
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/mergehandler.h
@@ -0,0 +1,103 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::MergeHandler
+ *
+ * @brief Handles a merge of a single bucket.
+ *
+ * A merge is a complex operation in many stages covering multiple nodes. It
+ * needs to track some state of ongoing merges, and it also needs quite a bit
+ * of logic.
+ *
+ * This class implements tracks the state and implements the logic, such that
+ * the rest of the provider layer does not need to concern itself with merges.
+ */
+#pragma once
+
+#include <vespa/persistence/spi/persistenceprovider.h>
+#include <vespa/storage/persistence/filestorage/mergestatus.h>
+#include <vespa/storage/persistence/persistenceutil.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storage/common/messagesender.h>
+
+namespace storage {
+
+class MergeHandler : public Types {
+
+public:
+ enum StateFlag {
+ IN_USE = 0x01,
+ DELETED = 0x02,
+ DELETED_IN_PLACE = 0x04
+ };
+
+ MergeHandler(spi::PersistenceProvider& spi, PersistenceUtil&);
+ /** Used for unit testing */
+ MergeHandler(spi::PersistenceProvider& spi,
+ PersistenceUtil& env,
+ uint32_t maxChunkSize);
+
+ bool buildBucketInfoList(
+ const spi::Bucket& bucket,
+ const documentapi::LoadType&,
+ Timestamp maxTimestamp,
+ uint8_t myNodeIndex,
+ std::vector<api::GetBucketDiffCommand::Entry>& output,
+ spi::Context& context);
+ void fetchLocalData(const spi::Bucket& bucket,
+ const documentapi::LoadType&,
+ std::vector<api::ApplyBucketDiffCommand::Entry>& diff,
+ uint8_t nodeIndex,
+ spi::Context& context);
+ api::BucketInfo applyDiffLocally(
+ const spi::Bucket& bucket,
+ const documentapi::LoadType&,
+ std::vector<api::ApplyBucketDiffCommand::Entry>& diff,
+ uint8_t nodeIndex,
+ spi::Context& context);
+
+ MessageTracker::UP handleMergeBucket(api::MergeBucketCommand&,
+ spi::Context&);
+ MessageTracker::UP handleGetBucketDiff(api::GetBucketDiffCommand&,
+ spi::Context&);
+ void handleGetBucketDiffReply(api::GetBucketDiffReply&, MessageSender&);
+ MessageTracker::UP handleApplyBucketDiff(api::ApplyBucketDiffCommand&,
+ spi::Context&);
+ void handleApplyBucketDiffReply(api::ApplyBucketDiffReply&, MessageSender&);
+
+private:
+ spi::PersistenceProvider& _spi;
+ PersistenceUtil& _env;
+ uint32_t _maxChunkSize;
+
+ /** Returns a reply if merge is complete */
+ api::StorageReply::SP processBucketMerge(const spi::Bucket& bucket,
+ MergeStatus& status,
+ MessageSender& sender,
+ spi::Context& context);
+
+ /**
+ * Invoke either put, remove or unrevertable remove on the SPI
+ * depending on the flags in the diff entry.
+ */
+ void applyDiffEntry(const spi::Bucket&,
+ const api::ApplyBucketDiffCommand::Entry&,
+ spi::Context& context,
+ const document::DocumentTypeRepo& repo);
+
+ /**
+ * Fill entries-vector with metadata for bucket up to maxTimestamp,
+ * sorted ascendingly on entry timestamp.
+ * Throws std::runtime_error upon iteration failure.
+ */
+ void populateMetaData(const spi::Bucket&,
+ Timestamp maxTimestamp,
+ std::vector<spi::DocEntry::LP>& entries,
+ spi::Context& context);
+
+ Document::UP deserializeDiffDocument(
+ const api::ApplyBucketDiffCommand::Entry& e,
+ const document::DocumentTypeRepo& repo) const;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/messages.h b/storage/src/vespa/storage/persistence/messages.h
new file mode 100644
index 00000000000..42fe174b14a
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/messages.h
@@ -0,0 +1,424 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <memory>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/vespalib/stllike/hash_set.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storageapi/message/internal.h>
+#include <vespa/persistence/spi/docentry.h>
+#include <vespa/persistence/spi/bucket.h>
+#include <vespa/persistence/spi/selection.h>
+#include <vespa/persistence/spi/read_consistency.h>
+
+namespace storage {
+
+class GetIterCommand : public api::InternalCommand {
+private:
+ mutable framework::MemoryToken::UP _token;
+ document::BucketId _bucketId;
+ spi::IteratorId _iteratorId;
+ uint32_t _maxByteSize;
+
+public:
+ static const uint32_t ID = 1001;
+ typedef std::unique_ptr<GetIterCommand> UP;
+ typedef std::shared_ptr<GetIterCommand> SP;
+
+ GetIterCommand(framework::MemoryToken::UP token,
+ const document::BucketId& bucketId,
+ const spi::IteratorId iteratorId,
+ uint32_t maxByteSize)
+ : api::InternalCommand(ID),
+ _token(std::move(token)),
+ _bucketId(bucketId),
+ _iteratorId(iteratorId),
+ _maxByteSize(maxByteSize)
+ {
+ assert(_token.get());
+ }
+
+ std::unique_ptr<api::StorageReply> makeReply();
+
+ document::BucketId getBucketId() const { return _bucketId; }
+ virtual bool hasSingleBucketId() const { return true; }
+
+ spi::IteratorId getIteratorId() const { return _iteratorId; }
+ void setIteratorId(spi::IteratorId iteratorId) { _iteratorId = iteratorId; }
+
+ void setMaxByteSize(uint32_t maxByteSize) { _maxByteSize = maxByteSize; }
+ uint32_t getMaxByteSize() const { return _maxByteSize; }
+
+
+ virtual void print(std::ostream& out, bool verbose, const std::string& indent) const
+ {
+ out << "GetIterCommand()";
+
+ if (verbose) {
+ out << " : ";
+ InternalCommand::print(out, true, indent);
+ }
+ }
+private:
+ framework::MemoryToken::UP releaseMemoryToken() { return std::move(_token); }
+ friend class GetIterReply;
+};
+
+class GetIterReply : public api::InternalReply {
+private:
+ framework::MemoryToken::UP _token;
+ document::BucketId _bucketId;
+ std::vector<spi::DocEntry::LP> _entries;
+ bool _completed;
+
+public:
+ typedef std::unique_ptr<GetIterReply> UP;
+ typedef std::shared_ptr<GetIterReply> SP;
+ static const uint32_t ID = 1002;
+
+ GetIterReply(GetIterCommand& cmd)
+ : api::InternalReply(ID, cmd),
+ _token(cmd.releaseMemoryToken()),
+ _bucketId(cmd.getBucketId()),
+ _completed(false)
+ {
+ }
+
+ virtual bool hasSingleBucketId() const { return true; }
+ document::BucketId getBucketId() const {
+ return _bucketId;
+ }
+
+ const std::vector<spi::DocEntry::LP>& getEntries() const {
+ return _entries;
+ }
+
+ std::vector<spi::DocEntry::LP>& getEntries() {
+ return _entries;
+ }
+
+ void setCompleted(bool completed = true) { _completed = completed; }
+ bool isCompleted() const { return _completed; }
+
+ virtual void print(std::ostream& out, bool verbose, const std::string& indent) const
+ {
+ out << "GetIterReply()";
+
+ if (verbose) {
+ out << " : ";
+ InternalReply::print(out, true, indent);
+ }
+ }
+};
+
+inline std::unique_ptr<api::StorageReply> GetIterCommand::makeReply() {
+ return std::unique_ptr<api::StorageReply>(new GetIterReply(*this));
+}
+
+class CreateIteratorCommand : public api::InternalCommand
+{
+ document::BucketId _bucketId;
+ spi::Selection _selection;
+ std::string _fieldSet;
+ spi::IncludedVersions _includedVersions;
+ spi::ReadConsistency _readConsistency;
+
+public:
+ static const uint32_t ID = 1003;
+ typedef std::unique_ptr<CreateIteratorCommand> UP;
+ typedef std::shared_ptr<CreateIteratorCommand> SP;
+
+ CreateIteratorCommand(const document::BucketId& bucketId,
+ const spi::Selection& selection,
+ const std::string& fields,
+ spi::IncludedVersions includedVersions)
+ : api::InternalCommand(ID),
+ _bucketId(bucketId),
+ _selection(selection),
+ _fieldSet(fields),
+ _includedVersions(includedVersions),
+ _readConsistency(spi::ReadConsistency::STRONG)
+ {
+ }
+
+ virtual bool hasSingleBucketId() const { return true; }
+ document::BucketId getBucketId() const { return _bucketId; }
+ const spi::Selection& getSelection() const { return _selection; }
+ spi::IncludedVersions getIncludedVersions() const { return _includedVersions; }
+ const std::string& getFields() const { return _fieldSet; }
+
+ void setReadConsistency(spi::ReadConsistency consistency) noexcept {
+ _readConsistency = consistency;
+ }
+ spi::ReadConsistency getReadConsistency() const noexcept {
+ return _readConsistency;
+ }
+
+ std::unique_ptr<api::StorageReply> makeReply();
+
+ void print(std::ostream& out,
+ bool /*verbose*/,
+ const std::string& /*indent*/) const
+ {
+ out << "CreateIteratorCommand(" << _bucketId << ")";
+ }
+};
+
+class CreateIteratorReply : public api::InternalReply
+{
+ document::BucketId _bucketId;
+ spi::IteratorId _iteratorId;
+public:
+ static const uint32_t ID = 1004;
+ typedef std::unique_ptr<CreateIteratorReply> UP;
+ typedef std::shared_ptr<CreateIteratorReply> SP;
+
+ CreateIteratorReply(const CreateIteratorCommand& cmd,
+ spi::IteratorId iteratorId)
+ : api::InternalReply(ID, cmd),
+ _bucketId(cmd.getBucketId()),
+ _iteratorId(iteratorId)
+ {
+ }
+
+ virtual bool hasSingleBucketId() const { return true; }
+ document::BucketId getBucketId() const { return _bucketId; }
+
+ spi::IteratorId getIteratorId() const { return _iteratorId; }
+
+ void print(std::ostream& out,
+ bool /*verbose*/,
+ const std::string& /*indent*/) const
+ {
+ out << "CreateIteratorReply(" << _bucketId << ")";
+ }
+};
+
+inline std::unique_ptr<api::StorageReply>
+CreateIteratorCommand::makeReply()
+{
+ spi::IteratorId id(0);
+ return std::unique_ptr<api::StorageReply>(
+ new CreateIteratorReply(*this, id));
+}
+
+class DestroyIteratorCommand : public api::InternalCommand
+{
+ spi::IteratorId _iteratorId;
+public:
+ static const uint32_t ID = 1005;
+ typedef std::unique_ptr<DestroyIteratorCommand> UP;
+ typedef std::shared_ptr<DestroyIteratorCommand> SP;
+
+ DestroyIteratorCommand(spi::IteratorId iteratorId)
+ : api::InternalCommand(ID),
+ _iteratorId(iteratorId)
+ {
+ }
+
+ spi::IteratorId getIteratorId() const { return _iteratorId; }
+
+ std::unique_ptr<api::StorageReply> makeReply();
+
+ void print(std::ostream& out,
+ bool /*verbose*/,
+ const std::string& /*indent*/) const
+ {
+ out << "DestroyIteratorCommand(id="
+ << _iteratorId
+ << ")";
+ }
+};
+
+class DestroyIteratorReply : public api::InternalReply
+{
+ spi::IteratorId _iteratorId;
+public:
+ static const uint32_t ID = 1006;
+ typedef std::unique_ptr<DestroyIteratorReply> UP;
+ typedef std::shared_ptr<DestroyIteratorReply> SP;
+
+ DestroyIteratorReply(const DestroyIteratorCommand& cmd)
+ : api::InternalReply(ID, cmd),
+ _iteratorId(cmd.getIteratorId())
+ {
+ }
+
+ void print(std::ostream& out,
+ bool /*verbose*/,
+ const std::string& /*indent*/) const
+ {
+ out << "DestroyIteratorReply(id="
+ << _iteratorId
+ << ")";
+ }
+};
+
+inline std::unique_ptr<api::StorageReply>
+DestroyIteratorCommand::makeReply() {
+ return std::unique_ptr<api::StorageReply>(new DestroyIteratorReply(*this));
+}
+
+class RecheckBucketInfoCommand : public api::InternalCommand
+{
+ document::BucketId _bucketId;
+public:
+ static const uint32_t ID = 1007;
+ typedef std::shared_ptr<RecheckBucketInfoCommand> SP;
+ typedef std::unique_ptr<RecheckBucketInfoCommand> UP;
+
+ RecheckBucketInfoCommand(const document::BucketId& bucketId)
+ : api::InternalCommand(ID),
+ _bucketId(bucketId)
+ {}
+
+ document::BucketId getBucketId() const {
+ return _bucketId;
+ }
+
+ std::unique_ptr<api::StorageReply> makeReply();
+
+ void print(std::ostream& out,
+ bool verbose,
+ const std::string& indent) const
+ {
+ (void) verbose;
+ (void) indent;
+ out << "RecheckBucketInfoCommand("
+ << _bucketId
+ << ")";
+ }
+};
+
+class RecheckBucketInfoReply : public api::InternalReply
+{
+ document::BucketId _bucketId;
+public:
+ static const uint32_t ID = 1008;
+ typedef std::shared_ptr<RecheckBucketInfoReply> SP;
+ typedef std::unique_ptr<RecheckBucketInfoReply> UP;
+
+ RecheckBucketInfoReply(const RecheckBucketInfoCommand& cmd)
+ : api::InternalReply(ID, cmd),
+ _bucketId(cmd.getBucketId())
+ {}
+
+ document::BucketId getBucketId() const {
+ return _bucketId;
+ }
+
+ void print(std::ostream& out,
+ bool verbose,
+ const std::string& indent) const
+ {
+ (void) verbose;
+ (void) indent;
+ out << "RecheckBucketInfoReply("
+ << _bucketId
+ << ")";
+ }
+};
+
+inline std::unique_ptr<api::StorageReply>
+RecheckBucketInfoCommand::makeReply() {
+ return std::unique_ptr<api::StorageReply>(new RecheckBucketInfoReply(*this));
+}
+
+class AbortBucketOperationsCommand : public api::InternalCommand
+{
+public:
+ class AbortPredicate {
+ virtual bool doShouldAbort(const document::BucketId&) const = 0;
+ public:
+ virtual ~AbortPredicate() {}
+ bool shouldAbort(const document::BucketId& bid) const {
+ return doShouldAbort(bid);
+ }
+ };
+
+ typedef vespalib::hash_set<
+ document::BucketId,
+ document::BucketId::hash
+ > BucketSet;
+
+ // Primarily for unit test mocking; actual predicate impl should do lazy
+ // evaluations based on previous and current cluster states.
+ class ExplicitBucketSetPredicate : public AbortPredicate {
+ BucketSet _bucketsToAbort;
+
+ bool doShouldAbort(const document::BucketId& bid) const override {
+ return _bucketsToAbort.find(bid) != _bucketsToAbort.end();
+ }
+ public:
+ explicit ExplicitBucketSetPredicate(const BucketSet& bucketsToAbort)
+ : _bucketsToAbort(bucketsToAbort)
+ {
+ }
+
+ template <typename Iterator>
+ ExplicitBucketSetPredicate(Iterator first, Iterator last)
+ : _bucketsToAbort(first, last)
+ {
+ }
+
+ const BucketSet& getBucketsToAbort() const {
+ return _bucketsToAbort;
+ }
+ };
+
+ static const uint32_t ID = 1009;
+ typedef std::shared_ptr<AbortBucketOperationsCommand> SP;
+ typedef std::shared_ptr<const AbortBucketOperationsCommand> CSP;
+private:
+ std::unique_ptr<AbortPredicate> _predicate;
+public:
+ AbortBucketOperationsCommand(std::unique_ptr<AbortPredicate> predicate)
+ : api::InternalCommand(ID),
+ _predicate(std::move(predicate))
+ {}
+
+
+ bool shouldAbort(const document::BucketId& bid) const {
+ return _predicate->shouldAbort(bid);
+ }
+
+ std::unique_ptr<api::StorageReply> makeReply();
+
+ void print(std::ostream& out,
+ bool verbose,
+ const std::string& indent) const
+ {
+ (void) verbose;
+ (void) indent;
+ out << "AbortBucketOperationsCommand()";
+ }
+};
+
+class AbortBucketOperationsReply : public api::InternalReply
+{
+public:
+ static const uint32_t ID = 1010;
+ typedef std::shared_ptr<AbortBucketOperationsReply> SP;
+ typedef std::shared_ptr<const AbortBucketOperationsReply> CSP;
+
+ AbortBucketOperationsReply(const AbortBucketOperationsCommand& cmd)
+ : api::InternalReply(ID, cmd)
+ {}
+
+ void print(std::ostream& out,
+ bool verbose,
+ const std::string& indent) const
+ {
+ (void) verbose;
+ (void) indent;
+ out << "AbortBucketOperationsReply()";
+ }
+};
+
+inline std::unique_ptr<api::StorageReply>
+AbortBucketOperationsCommand::makeReply() {
+ return std::unique_ptr<api::StorageReply>(new AbortBucketOperationsReply(*this));
+}
+
+} // ns storage
+
diff --git a/storage/src/vespa/storage/persistence/persistencethread.cpp b/storage/src/vespa/storage/persistence/persistencethread.cpp
new file mode 100644
index 00000000000..8a6f320ac3e
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/persistencethread.cpp
@@ -0,0 +1,1265 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/persistence/persistencethread.h>
+
+#include <vespa/log/log.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storageapi/message/bucketsplitting.h>
+#include <vespa/storage/persistence/splitbitdetector.h>
+#include <vespa/storage/persistence/bucketownershipnotifier.h>
+#include <vespa/storage/persistence/testandsethelper.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/common/bucketoperationlogger.h>
+#include <vespa/document/fieldset/fieldsetrepo.h>
+
+LOG_SETUP(".persistence.thread");
+
+namespace storage {
+
+PersistenceThread::PersistenceThread(ServiceLayerComponentRegister& compReg,
+ const config::ConfigUri & configUri,
+ spi::PersistenceProvider& provider,
+ FileStorHandler& filestorHandler,
+ FileStorThreadMetrics& metrics,
+ uint16_t deviceIndex,
+ uint8_t lowestPriority,
+ bool startThread)
+ : _env(configUri,
+ compReg,
+ filestorHandler,
+ metrics,
+ deviceIndex,
+ lowestPriority,
+ provider),
+ _warnOnSlowOperations(5000),
+ _spi(provider),
+ _processAllHandler(_env, provider),
+ _mergeHandler(_spi, _env),
+ _diskMoveHandler(_env, _spi),
+ _context(documentapi::LoadType::DEFAULT, 0, 0),
+ _bucketOwnershipNotifier(),
+ _flushMonitor(),
+ _closed(false)
+{
+ (void) startThread;
+ std::ostringstream threadName;
+ threadName << "Disk " << _env._partition << " thread "
+ << (void*) this;
+ _component.reset(new ServiceLayerComponent(compReg, threadName.str()));
+ _bucketOwnershipNotifier.reset(
+ new BucketOwnershipNotifier(*_component, filestorHandler));
+ framework::MilliSecTime maxProcessingTime(60 * 1000);
+ framework::MilliSecTime waitTime(1000);
+ _thread = _component->startThread(*this, maxProcessingTime, waitTime);
+}
+
+PersistenceThread::~PersistenceThread()
+{
+ LOG(debug, "Shutting down persistence thread. Waiting for current "
+ "operation to finish.");
+ _thread->interrupt();
+ LOG(debug, "Waiting for thread to terminate.");
+ _thread->join();
+ LOG(debug, "Persistence thread done with destruction");
+}
+
+spi::Bucket
+PersistenceThread::getBucket(const DocumentId& id,
+ const BucketId& bucket) const
+{
+ BucketId docBucket(_env._bucketFactory.getBucketId(id));
+ docBucket.setUsedBits(bucket.getUsedBits());
+ if (bucket != docBucket) {
+ docBucket = _env._bucketFactory.getBucketId(id);
+ throw vespalib::IllegalStateException("Document " + id.toString()
+ + " (bucket " + docBucket.toString() + ") does not belong in "
+ + "bucket " + bucket.toString() + ".", VESPA_STRLOC);
+ }
+
+ return spi::Bucket(bucket, spi::PartitionId(_env._partition));
+}
+
+bool
+PersistenceThread::checkForError(const spi::Result& response,
+ MessageTracker& tracker)
+{
+ uint32_t code = _env.convertErrorCode(response);
+
+ if (code != 0) {
+ tracker.fail(code, response.getErrorMessage());
+ return false;
+ }
+
+ return true;
+}
+
+
+bool PersistenceThread::tasConditionExists(const api::TestAndSetCommand & cmd) {
+ return cmd.getCondition().isPresent();
+}
+
+bool PersistenceThread::tasConditionMatches(const api::TestAndSetCommand & cmd, MessageTracker & tracker) {
+ try {
+ TestAndSetHelper helper(*this, cmd);
+
+ auto code = helper.retrieveAndMatch();
+ if (code.failed()) {
+ tracker.fail(code.getResult(), code.getMessage());
+ return false;
+ }
+ } catch (const TestAndSetException & e) {
+ auto code = e.getCode();
+ tracker.fail(code.getResult(), code.getMessage());
+ return false;
+ }
+
+ return true;
+}
+
+MessageTracker::UP
+PersistenceThread::handlePut(api::PutCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.put[cmd.getLoadType()],
+ _env._component.getClock()));
+
+ if (tasConditionExists(cmd) && !tasConditionMatches(cmd, *tracker)) {
+ return tracker;
+ }
+
+ spi::Result response =
+ _spi.put(getBucket(cmd.getDocumentId(), cmd.getBucketId()),
+ spi::Timestamp(cmd.getTimestamp()),
+ cmd.getDocument(),
+ _context);
+ checkForError(response, *tracker);
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleRemove(api::RemoveCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.remove[cmd.getLoadType()],
+ _env._component.getClock()));
+
+ if (tasConditionExists(cmd) && !tasConditionMatches(cmd, *tracker)) {
+ return tracker;
+ }
+
+ spi::RemoveResult response =
+ _spi.removeIfFound(getBucket(cmd.getDocumentId(), cmd.getBucketId()),
+ spi::Timestamp(cmd.getTimestamp()),
+ cmd.getDocumentId(), _context);
+ if (checkForError(response, *tracker)) {
+ api::RemoveReply* reply(new api::RemoveReply(
+ cmd, response.wasFound() ? cmd.getTimestamp() : 0));
+ tracker->setReply(api::StorageReply::SP(reply));
+ }
+ if (!response.wasFound()) {
+ ++_env._metrics.remove[cmd.getLoadType()].notFound;
+ }
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleUpdate(api::UpdateCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.update[cmd.getLoadType()],
+ _env._component.getClock()));
+
+ if (tasConditionExists(cmd) && !tasConditionMatches(cmd, *tracker)) {
+ return tracker;
+ }
+
+ spi::UpdateResult response =
+ _spi.update(getBucket(cmd.getUpdate()->getId(), cmd.getBucketId()),
+ spi::Timestamp(cmd.getTimestamp()),
+ cmd.getUpdate(), _context);
+ if (checkForError(response, *tracker)) {
+ api::UpdateReply* reply = new api::UpdateReply(cmd);
+ reply->setOldTimestamp(response.getExistingTimestamp());
+ tracker->setReply(api::StorageReply::SP(reply));
+ }
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleGet(api::GetCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.get[cmd.getLoadType()],
+ _env._component.getClock()));
+
+ document::FieldSetRepo repo;
+ document::FieldSet::UP fieldSet = repo.parse(*_env._component.getTypeRepo(),
+ cmd.getFieldSet());
+ spi::GetResult result =
+ _spi.get(getBucket(cmd.getDocumentId(), cmd.getBucketId()),
+ *fieldSet,
+ cmd.getDocumentId(),
+ _context);
+
+ if (checkForError(result, *tracker)) {
+ if (!result.hasDocument()) {
+ ++_env._metrics.get[cmd.getLoadType()].notFound;
+ }
+
+ api::GetReply::UP reply(
+ new api::GetReply(cmd,
+ Document::SP(result.getDocumentPtr()),
+ result.getTimestamp()));
+
+ tracker->setReply(api::StorageReply::SP(reply.release()));
+ }
+
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleRepairBucket(RepairBucketCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.repairs,
+ _env._component.getClock()));
+ NotificationGuard notifyGuard(*_bucketOwnershipNotifier);
+ LOG(debug, "Repair(%s): %s",
+ cmd.getBucketId().toString().c_str(),
+ (cmd.verifyBody() ? "Verifying body" : "Not verifying body"));
+ api::BucketInfo before = _env.getBucketInfo(cmd.getBucketId());
+ spi::Result result =
+ _spi.maintain(spi::Bucket(cmd.getBucketId(),
+ spi::PartitionId(_env._partition)),
+ cmd.verifyBody() ?
+ spi::HIGH : spi::LOW);
+ if (checkForError(result, *tracker)) {
+ api::BucketInfo after = _env.getBucketInfo(cmd.getBucketId());
+
+ RepairBucketReply::UP reply(new RepairBucketReply(cmd, after));
+ reply->setAltered(!(after == before));
+ if (reply->bucketAltered()) {
+ notifyGuard.notifyAlways(cmd.getBucketId(), after);
+ ++_env._metrics.repairFixed;
+ }
+
+ _env.updateBucketDatabase(cmd.getBucketId(), after);
+ tracker->setReply(api::StorageReply::SP(reply.release()));
+ }
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleMultiOperation(api::MultiOperationCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.multiOp[cmd.getLoadType()],
+ _env._component.getClock()));
+ spi::Bucket b = spi::Bucket(cmd.getBucketId(),
+ spi::PartitionId(_env._partition));
+ long puts = 0;
+ long removes = 0;
+ long updates = 0;
+ long updatesNotFound = 0;
+ long removesNotFound = 0;
+ for (vdslib::DocumentList::const_iterator it =
+ cmd.getOperations().begin();
+ it != cmd.getOperations().end(); ++it)
+ {
+ document::DocumentId docId = it->getDocumentId();
+ if (it->isRemoveEntry()) {
+ ++removes;
+ spi::RemoveResult result = _spi.removeIfFound(
+ b,
+ spi::Timestamp(it->getTimestamp()),
+ docId, _context);
+ if (!checkForError(result, *tracker)) {
+ return tracker;
+ }
+ if (!result.wasFound()) {
+ LOG(debug, "Cannot remove %s; document not found",
+ docId.toString().c_str());
+ ++removesNotFound;
+ }
+ } else if (it->isUpdateEntry()) {
+ ++updates;
+ document::DocumentUpdate::SP docUpdate = it->getUpdate();
+ spi::UpdateResult result =
+ _spi.update(b, spi::Timestamp(it->getTimestamp()), docUpdate,
+ _context);
+ if (!checkForError(result, *tracker)) {
+ return tracker;
+ }
+ if (result.getExistingTimestamp() == 0) {
+ ++updatesNotFound;
+ }
+ } else {
+ ++puts;
+ document::Document::SP doc = it->getDocument();
+ spi::Result result = _spi.put(b, spi::Timestamp(it->getTimestamp()),
+ doc, _context);
+ if (!checkForError(result, *tracker)) {
+ return tracker;
+ }
+ }
+ }
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleRevert(api::RevertCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.revert[cmd.getLoadType()],
+ _env._component.getClock()));
+ spi::Bucket b = spi::Bucket(cmd.getBucketId(),
+ spi::PartitionId(_env._partition));
+ const std::vector<api::Timestamp> tokens = cmd.getRevertTokens();
+ for (uint32_t i = 0; i < tokens.size(); ++i) {
+ spi::Result result = _spi.removeEntry(b,
+ spi::Timestamp(tokens[i]),
+ _context);
+ }
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleCreateBucket(api::CreateBucketCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.createBuckets,
+ _env._component.getClock()));
+ LOG(debug, "CreateBucket(%s)", cmd.getBucketId().toString().c_str());
+ if (_env._fileStorHandler.isMerging(cmd.getBucketId())) {
+ LOG(warning, "Bucket %s was merging at create time. Unexpected.",
+ cmd.getBucketId().toString().c_str());
+ DUMP_LOGGED_BUCKET_OPERATIONS(cmd.getBucketId());
+ }
+ spi::Bucket spiBucket(cmd.getBucketId(), spi::PartitionId(_env._partition));
+ _spi.createBucket(spiBucket, _context);
+ if (cmd.getActive()) {
+ _spi.setActiveState(spiBucket, spi::BucketInfo::ACTIVE);
+ }
+ return tracker;
+}
+
+bool
+PersistenceThread::checkProviderBucketInfoMatches(const spi::Bucket& bucket,
+ const api::BucketInfo& info) const
+{
+ spi::BucketInfoResult result(_spi.getBucketInfo(bucket));
+ if (result.hasError()) {
+ LOG(error,
+ "getBucketInfo(%s) failed before deleting bucket; got error '%s'",
+ bucket.toString().c_str(),
+ result.getErrorMessage().c_str());
+ return false;
+ }
+ api::BucketInfo providerInfo(
+ _env.convertBucketInfo(result.getBucketInfo()));
+ // Don't check meta fields or active/ready fields since these are not
+ // that important and ready may change under the hood in a race with
+ // getModifiedBuckets(). If bucket is empty it means it has already
+ // been deleted by a racing split/join.
+ if (!info.equalDocumentInfo(providerInfo) && !providerInfo.empty()) {
+ LOG(error,
+ "Service layer bucket database and provider out of sync before "
+ "deleting bucket %s! Service layer db had %s while provider says "
+ "bucket has %s. Deletion has been rejected to ensure data is not "
+ "lost, but bucket may remain out of sync until service has been "
+ "restarted.",
+ bucket.toString().c_str(),
+ info.toString().c_str(),
+ providerInfo.toString().c_str());
+ return false;
+ }
+ return true;
+}
+
+MessageTracker::UP
+PersistenceThread::handleDeleteBucket(api::DeleteBucketCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.deleteBuckets,
+ _env._component.getClock()));
+ LOG(debug, "DeletingBucket(%s)", cmd.getBucketId().toString().c_str());
+ LOG_BUCKET_OPERATION(cmd.getBucketId(), "deleteBucket()");
+ if (_env._fileStorHandler.isMerging(cmd.getBucketId())) {
+ _env._fileStorHandler.clearMergeStatus(cmd.getBucketId(),
+ api::ReturnCode(api::ReturnCode::ABORTED,
+ "Bucket was deleted during the merge"));
+ }
+ spi::Bucket bucket(cmd.getBucketId(), spi::PartitionId(_env._partition));
+ if (!checkProviderBucketInfoMatches(bucket, cmd.getBucketInfo())) {
+ return tracker;
+ }
+ _spi.deleteBucket(bucket, _context);
+ StorBucketDatabase& db(_env.getBucketDatabase());
+ {
+ StorBucketDatabase::WrappedEntry entry(db.get(
+ cmd.getBucketId(), "FileStorThread::onDeleteBucket"));
+ if (entry.exist() && entry->getMetaCount() > 0) {
+ LOG(debug, "onDeleteBucket(%s): Bucket DB entry existed. Likely "
+ "active operation when delete bucket was queued. "
+ "Updating bucket database to keep it in sync with file. "
+ "Cannot delete bucket from bucket database at this "
+ "point, as it can have been intentionally recreated "
+ "after delete bucket had been sent",
+ cmd.getBucketId().toString().c_str());
+ api::BucketInfo info(0, 0, 0);
+ // Only set document counts/size; retain ready/active state.
+ info.setReady(entry->getBucketInfo().isReady());
+ info.setActive(entry->getBucketInfo().isActive());
+
+ entry->setBucketInfo(info);
+ entry.write();
+ }
+ }
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleGetIter(GetIterCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.visit[cmd.getLoadType()],
+ _env._component.getClock()));
+ spi::IterateResult result(_spi.iterate(cmd.getIteratorId(),
+ cmd.getMaxByteSize(), _context));
+ if (checkForError(result, *tracker)) {
+ GetIterReply::SP reply(new GetIterReply(cmd));
+ reply->getEntries() = result.getEntries();
+ _env._metrics.visit[cmd.getLoadType()].
+ documentsPerIterate.addValue(reply->getEntries().size());
+ if (result.isCompleted()) {
+ reply->setCompleted();
+ }
+ tracker->setReply(reply);
+ }
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleReadBucketList(ReadBucketList& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.readBucketList,
+ _env._component.getClock()));
+
+ spi::BucketIdListResult result(_spi.listBuckets(cmd.getPartition()));
+ if (checkForError(result, *tracker)) {
+ ReadBucketListReply::SP reply(new ReadBucketListReply(cmd));
+ result.getList().swap(reply->getBuckets());
+ tracker->setReply(reply);
+ }
+
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleReadBucketInfo(ReadBucketInfo& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.readBucketInfo,
+ _env._component.getClock()));
+
+ _env.updateBucketDatabase(cmd.getBucketId(),
+ _env.getBucketInfo(cmd.getBucketId()));
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleCreateIterator(CreateIteratorCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.createIterator,
+ _env._component.getClock()));
+ document::FieldSetRepo repo;
+ document::FieldSet::UP fieldSet = repo.parse(*_env._component.getTypeRepo(),
+ cmd.getFields());
+ // _context is reset per command, so it's safe to modify it like this.
+ _context.setReadConsistency(cmd.getReadConsistency());
+ spi::CreateIteratorResult result(_spi.createIterator(
+ spi::Bucket(cmd.getBucketId(), spi::PartitionId(_env._partition)),
+ *fieldSet,
+ cmd.getSelection(),
+ cmd.getIncludedVersions(),
+ _context));
+ if (checkForError(result, *tracker)) {
+ tracker->setReply(CreateIteratorReply::SP(
+ new CreateIteratorReply(
+ cmd, spi::IteratorId(result.getIteratorId()))));
+ }
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleSplitBucket(api::SplitBucketCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.splitBuckets,
+ _env._component.getClock()));
+ NotificationGuard notifyGuard(*_bucketOwnershipNotifier);
+
+ // Calculate the various bucket ids involved.
+ if (cmd.getBucketId().getUsedBits() >= 58) {
+ tracker->fail(
+ api::ReturnCode::ILLEGAL_PARAMETERS,
+ "Can't split anymore since maximum split bits "
+ "is already reached");
+ return tracker;
+ }
+ if (cmd.getMaxSplitBits() <= cmd.getBucketId().getUsedBits()) {
+ tracker->fail(api::ReturnCode::ILLEGAL_PARAMETERS,
+ "Max lit bits must be set higher "
+ "than the number of bits used in the bucket to split");
+ return tracker;
+ }
+
+ spi::Bucket spiBucket(cmd.getBucketId(), spi::PartitionId(_env._partition));
+ SplitBitDetector::Result targetInfo;
+ if (_env._config.enableMultibitSplitOptimalization) {
+ targetInfo = SplitBitDetector::detectSplit(
+ _spi, spiBucket, cmd.getMaxSplitBits(),
+ _context, cmd.getMinDocCount(), cmd.getMinByteSize());
+ }
+ if (targetInfo.empty() || !_env._config.enableMultibitSplitOptimalization) {
+ document::BucketId src(cmd.getBucketId());
+ document::BucketId target1(src.getUsedBits() + 1, src.getId());
+ document::BucketId target2(src.getUsedBits() + 1, src.getId()
+ | (uint64_t(1) << src.getUsedBits()));
+ targetInfo = SplitBitDetector::Result(target1, target2, false);
+ }
+ if (targetInfo.failed()) {
+ tracker->fail(api::ReturnCode::INTERNAL_FAILURE,
+ targetInfo.getReason());
+ return tracker;
+ }
+ // If we get here, we're splitting data in two.
+ // (Possibly in special case where a target will be unused)
+ assert(targetInfo.success());
+ document::BucketId target1(targetInfo.getTarget1());
+ document::BucketId target2(targetInfo.getTarget2());
+
+ LOG(debug, "split(%s -> %s, %s)", cmd.getBucketId().toString().c_str(),
+ target1.toString().c_str(), target2.toString().c_str());
+
+ PersistenceUtil::LockResult lock1(_env.lockAndGetDisk(target1));
+ PersistenceUtil::LockResult lock2(_env.lockAndGetDisk(target2));
+
+#ifdef ENABLE_BUCKET_OPERATION_LOGGING
+ {
+ vespalib::string desc(
+ vespalib::make_vespa_string(
+ "split(%s -> %s, %s)",
+ cmd.getBucketId().toString().c_str(),
+ target1.toString().c_str(),
+ target2.toString().c_str()));
+ LOG_BUCKET_OPERATION(cmd.getBucketId(), desc);
+ LOG_BUCKET_OPERATION(target1, desc);
+ if (target2.getRawId() != 0) {
+ LOG_BUCKET_OPERATION(target2, desc);
+ }
+ }
+#endif
+ spi::Result result = _spi.split(
+ spiBucket,
+ spi::Bucket(target1, spi::PartitionId(lock1.disk)),
+ spi::Bucket(target2, spi::PartitionId(lock2.disk)), _context);
+ if (result.hasError()) {
+ tracker->fail(_env.convertErrorCode(result),
+ result.getErrorMessage());
+ return tracker;
+ }
+ // After split we need to take all bucket db locks to update them.
+ // Ensure to take them in rising order.
+ StorBucketDatabase::WrappedEntry sourceEntry(_env.getBucketDatabase().get(
+ cmd.getBucketId(), "PersistenceThread::handleSplitBucket-source"));
+ api::SplitBucketReply* splitReply(new api::SplitBucketReply(cmd));
+ tracker->setReply(api::StorageReply::SP(splitReply));
+
+ typedef std::pair<StorBucketDatabase::WrappedEntry,
+ FileStorHandler::RemapInfo> TargetInfo;
+ std::vector<TargetInfo> targets;
+ for (uint32_t i = 0; i < 2; i++) {
+ const document::BucketId& target(i == 0 ? target1 : target2);
+ uint16_t disk(i == 0 ? lock1.disk : lock2.disk);
+ assert(target.getRawId() != 0);
+ targets.push_back(TargetInfo(
+ _env.getBucketDatabase().get(
+ target, "PersistenceThread::handleSplitBucket - Target",
+ StorBucketDatabase::CREATE_IF_NONEXISTING),
+ FileStorHandler::RemapInfo(target, disk)));
+ targets.back().first->setBucketInfo(
+ _env.getBucketInfo(target, disk));
+ targets.back().first->disk = disk;
+ }
+ if (LOG_WOULD_LOG(spam)) {
+ api::BucketInfo targ1(targets[0].first->getBucketInfo());
+ api::BucketInfo targ2(targets[1].first->getBucketInfo());
+ LOG(spam, "split(%s - %u -> %s - %u, %s - %u)",
+ cmd.getBucketId().toString().c_str(),
+ targ1.getMetaCount() + targ2.getMetaCount(),
+ target1.toString().c_str(),
+ targ1.getMetaCount(),
+ target2.toString().c_str(),
+ targ2.getMetaCount());
+ }
+ FileStorHandler::RemapInfo source(cmd.getBucketId(), _env._partition);
+ _env._fileStorHandler.remapQueueAfterSplit(
+ source, targets[0].second, targets[1].second);
+ bool ownershipChanged(
+ !_bucketOwnershipNotifier->distributorOwns(
+ cmd.getSourceIndex(), cmd.getBucketId()));
+ // Now release all the bucketdb locks.
+ for (uint32_t i = 0; i < targets.size(); i++) {
+ if (ownershipChanged) {
+ notifyGuard.notifyAlways(targets[i].second.bid,
+ targets[i].first->getBucketInfo());
+ }
+ // The entries vector has the source bucket in element zero, so indexing
+ // that with i+1
+ if (targets[i].second.foundInQueue
+ || targets[i].first->getMetaCount() > 0)
+ {
+ if (targets[i].first->getMetaCount() == 0) {
+ // Fake that the bucket has content so it is not deleted.
+ targets[i].first->info.setMetaCount(1);
+ // Must make sure target bucket exists when we have pending ops
+ // to an empty target bucket, since the provider will have
+ // implicitly erased it by this point.
+ spi::Bucket createTarget(
+ spi::Bucket(targets[i].second.bid,
+ spi::PartitionId(targets[i].second.diskIndex)));
+ LOG(debug,
+ "Split target %s was empty, but re-creating it since "
+ "there are remapped operations queued to it",
+ createTarget.toString().c_str());
+ _spi.createBucket(createTarget, _context);
+ }
+ splitReply->getSplitInfo().push_back(
+ api::SplitBucketReply::Entry(
+ targets[i].second.bid,
+ targets[i].first->getBucketInfo()));
+ targets[i].first.write();
+ } else {
+ targets[i].first.remove();
+ }
+ }
+ if (sourceEntry.exist()) {
+ if (ownershipChanged) {
+ notifyGuard.notifyAlways(cmd.getBucketId(),
+ sourceEntry->getBucketInfo());
+ }
+ // Delete the old entry.
+ sourceEntry.remove();
+ }
+ return tracker;
+}
+
+bool
+PersistenceThread::validateJoinCommand(
+ const api::JoinBucketsCommand& cmd,
+ MessageTracker& tracker) const
+{
+ if (cmd.getSourceBuckets().size() != 2) {
+ tracker.fail(ReturnCode::ILLEGAL_PARAMETERS,
+ "Join needs exactly two buckets to be joined together"
+ + cmd.getBucketId().toString());
+ return false;
+ }
+ // Verify that source and target buckets look sane.
+ for (uint32_t i = 0; i < cmd.getSourceBuckets().size(); i++) {
+ if (cmd.getSourceBuckets()[i] == cmd.getBucketId()) {
+ tracker.fail(ReturnCode::ILLEGAL_PARAMETERS,
+ "Join had both source and target bucket "
+ + cmd.getBucketId().toString());
+ return false;
+ }
+ if (!cmd.getBucketId().contains(cmd.getSourceBuckets()[i])) {
+ tracker.fail(ReturnCode::ILLEGAL_PARAMETERS,
+ "Source bucket " +
+ cmd.getSourceBuckets()[i].toString()
+ + " is not contained in target "
+ + cmd.getBucketId().toString());
+ return false;
+ }
+ }
+ return true;
+}
+
+MessageTracker::UP
+PersistenceThread::handleJoinBuckets(api::JoinBucketsCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.joinBuckets,
+ _env._component.getClock()));
+ if (!validateJoinCommand(cmd, *tracker)) {
+ return tracker;
+ }
+ // To avoid a potential deadlock all operations locking multiple
+ // buckets must lock their buckets in the same order (sort order of
+ // bucket id, lowest countbits, lowest location first).
+ // Sort buckets to join in order to ensure we lock in correct order
+ std::sort(cmd.getSourceBuckets().begin(), cmd.getSourceBuckets().end());
+ {
+ // Create empty bucket for target.
+ StorBucketDatabase::WrappedEntry entry =
+ _env.getBucketDatabase().get(
+ cmd.getBucketId(),
+ "join",
+ StorBucketDatabase::CREATE_IF_NONEXISTING);
+
+ entry->disk = _env._partition;
+ entry.write();
+ }
+
+ document::BucketId firstBucket(cmd.getSourceBuckets()[0]);
+ document::BucketId secondBucket(cmd.getSourceBuckets()[1]);
+
+ PersistenceUtil::LockResult lock1(_env.lockAndGetDisk(firstBucket));
+ PersistenceUtil::LockResult lock2;
+ if (firstBucket != secondBucket) {
+ lock2 = _env.lockAndGetDisk(secondBucket);
+ }
+
+#ifdef ENABLE_BUCKET_OPERATION_LOGGING
+ {
+ vespalib::string desc(
+ vespalib::make_vespa_string(
+ "join(%s, %s -> %s)",
+ firstBucket.toString().c_str(),
+ secondBucket.toString().c_str(),
+ cmd.getBucketId().toString().c_str()));
+ LOG_BUCKET_OPERATION(cmd.getBucketId(), desc);
+ LOG_BUCKET_OPERATION(firstBucket, desc);
+ if (firstBucket != secondBucket) {
+ LOG_BUCKET_OPERATION(secondBucket, desc);
+ }
+ }
+#endif
+ spi::Result result =
+ _spi.join(spi::Bucket(firstBucket, spi::PartitionId(lock1.disk)),
+ spi::Bucket(secondBucket, spi::PartitionId(lock2.disk)),
+ spi::Bucket(cmd.getBucketId(),
+ spi::PartitionId(_env._partition)),
+ _context);
+ if (!checkForError(result, *tracker)) {
+ return tracker;
+ }
+ result = _spi.flush(spi::Bucket(cmd.getBucketId(),
+ spi::PartitionId(_env._partition)),
+ _context);
+ if (!checkForError(result, *tracker)) {
+ return tracker;
+ }
+ uint64_t lastModified = 0;
+ for (uint32_t i = 0; i < cmd.getSourceBuckets().size(); i++) {
+ document::BucketId bId = cmd.getSourceBuckets()[i];
+ uint16_t disk = (i == 0) ? lock1.disk : lock2.disk;
+ FileStorHandler::RemapInfo target(cmd.getBucketId(),
+ _env._partition);
+ _env._fileStorHandler.remapQueueAfterJoin(
+ FileStorHandler::RemapInfo(bId, disk),
+ target);
+ // Remove source from bucket db.
+ StorBucketDatabase::WrappedEntry entry(
+ _env.getBucketDatabase().get(
+ bId, "join-remove-source"));
+ if (entry.exist()) {
+ lastModified = std::max(lastModified,
+ entry->info.getLastModified());
+ entry.remove();
+ }
+ }
+ {
+ StorBucketDatabase::WrappedEntry entry =
+ _env.getBucketDatabase().get(
+ cmd.getBucketId(),
+ "join",
+ StorBucketDatabase::CREATE_IF_NONEXISTING);
+ if (entry->info.getLastModified() == 0) {
+ entry->info.setLastModified(
+ std::max(lastModified, entry->info.getLastModified()));
+ }
+ entry.write();
+ }
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleSetBucketState(api::SetBucketStateCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.setBucketStates,
+ _env._component.getClock()));
+ NotificationGuard notifyGuard(*_bucketOwnershipNotifier);
+
+ LOG(debug, "handleSetBucketState(): %s", cmd.toString().c_str());
+ spi::Bucket bucket(cmd.getBucketId(), spi::PartitionId(_env._partition));
+ bool shouldBeActive(cmd.getState() == api::SetBucketStateCommand::ACTIVE);
+ spi::BucketInfo::ActiveState newState(
+ shouldBeActive
+ ? spi::BucketInfo::ACTIVE
+ : spi::BucketInfo::NOT_ACTIVE);
+
+ spi::Result result(_spi.setActiveState(bucket, newState));
+ if (checkForError(result, *tracker)) {
+ StorBucketDatabase::WrappedEntry entry(_env.getBucketDatabase().get(
+ cmd.getBucketId(), "handleSetBucketState"));
+ if (entry.exist()) {
+ entry->info.setActive(newState == spi::BucketInfo::ACTIVE);
+ notifyGuard.notifyIfOwnershipChanged(cmd.getBucketId(),
+ cmd.getSourceIndex(),
+ entry->info);
+ entry.write();
+ } else {
+ LOG(warning, "Got OK setCurrentState result from provider for %s, "
+ "but bucket has disappeared from service layer database",
+ cmd.getBucketId().toString().c_str());
+ }
+
+ tracker->setReply(api::StorageReply::SP(
+ new api::SetBucketStateReply(cmd)));
+ }
+
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleInternalBucketJoin(InternalBucketJoinCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.internalJoin,
+ _env._component.getClock()));
+ {
+ // Create empty bucket for target.
+ StorBucketDatabase::WrappedEntry entry =
+ _env.getBucketDatabase().get(
+ cmd.getBucketId(),
+ "join",
+ StorBucketDatabase::CREATE_IF_NONEXISTING);
+
+ entry->disk = _env._partition;
+ entry.write();
+ }
+ spi::Result result =
+ _spi.join(spi::Bucket(cmd.getBucketId(),
+ spi::PartitionId(cmd.getDiskOfInstanceToJoin())),
+ spi::Bucket(cmd.getBucketId(),
+ spi::PartitionId(cmd.getDiskOfInstanceToJoin())),
+ spi::Bucket(cmd.getBucketId(),
+ spi::PartitionId(cmd.getDiskOfInstanceToKeep())),
+ _context);
+ if (checkForError(result, *tracker)) {
+ tracker->setReply(
+ api::StorageReply::SP(
+ new InternalBucketJoinReply(cmd,
+ _env.getBucketInfo(cmd.getBucketId()))));
+ }
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleRecheckBucketInfo(RecheckBucketInfoCommand& cmd)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.recheckBucketInfo, _env._component.getClock()));
+ document::BucketId bid(cmd.getBucketId());
+ api::BucketInfo info(_env.getBucketInfo(bid));
+ NotificationGuard notifyGuard(*_bucketOwnershipNotifier);
+ {
+ // Update bucket database
+ StorBucketDatabase::WrappedEntry entry(
+ _component->getBucketDatabase().get(
+ bid,
+ "handleRecheckBucketInfo"));
+
+ if (entry.exist()) {
+ api::BucketInfo prevInfo(entry->getBucketInfo());
+
+ if (!(prevInfo == info)) {
+ notifyGuard.notifyAlways(bid, info);
+ entry->info = info;
+ entry.write();
+ }
+ }
+ // else: there is a race condition where concurrent execution of
+ // DeleteBucket in the FileStorManager and this function can cause it
+ // to look like the provider has a bucket we do not know about, simply
+ // because this function was executed before the actual
+ // DeleteBucketCommand in the persistence thread (see ticket 6143025).
+ }
+ return tracker;
+}
+
+MessageTracker::UP
+PersistenceThread::handleCommandSplitByType(api::StorageCommand& msg)
+{
+ switch (msg.getType().getId()) {
+ case api::MessageType::GET_ID:
+ return handleGet(static_cast<api::GetCommand&>(msg));
+ case api::MessageType::PUT_ID:
+ return handlePut(static_cast<api::PutCommand&>(msg));
+ case api::MessageType::REMOVE_ID:
+ return handleRemove(static_cast<api::RemoveCommand&>(msg));
+ case api::MessageType::UPDATE_ID:
+ return handleUpdate(static_cast<api::UpdateCommand&>(msg));
+ case api::MessageType::MULTIOPERATION_ID:
+ return handleMultiOperation(
+ static_cast<api::MultiOperationCommand&>(msg));
+ case api::MessageType::REVERT_ID:
+ return handleRevert(static_cast<api::RevertCommand&>(msg));
+ case api::MessageType::CREATEBUCKET_ID:
+ return handleCreateBucket(static_cast<api::CreateBucketCommand&>(msg));
+ case api::MessageType::DELETEBUCKET_ID:
+ return handleDeleteBucket(static_cast<api::DeleteBucketCommand&>(msg));
+ case api::MessageType::JOINBUCKETS_ID:
+ return handleJoinBuckets(static_cast<api::JoinBucketsCommand&>(msg));
+ case api::MessageType::SPLITBUCKET_ID:
+ return handleSplitBucket(static_cast<api::SplitBucketCommand&>(msg));
+ // Depends on iterators
+ case api::MessageType::STATBUCKET_ID:
+ return _processAllHandler.handleStatBucket(
+ static_cast<api::StatBucketCommand&>(msg), _context);
+ case api::MessageType::REMOVELOCATION_ID:
+ return _processAllHandler.handleRemoveLocation(
+ static_cast<api::RemoveLocationCommand&>(msg), _context);
+ case api::MessageType::MERGEBUCKET_ID:
+ return _mergeHandler.handleMergeBucket(
+ static_cast<api::MergeBucketCommand&>(msg), _context);
+ case api::MessageType::GETBUCKETDIFF_ID:
+ return _mergeHandler.handleGetBucketDiff(
+ static_cast<api::GetBucketDiffCommand&>(msg), _context);
+ case api::MessageType::APPLYBUCKETDIFF_ID:
+ return _mergeHandler.handleApplyBucketDiff(
+ static_cast<api::ApplyBucketDiffCommand&>(msg), _context);
+ case api::MessageType::SETBUCKETSTATE_ID:
+ return handleSetBucketState(
+ static_cast<api::SetBucketStateCommand&>(msg));
+ case api::MessageType::INTERNAL_ID:
+ switch(static_cast<api::InternalCommand&>(msg).getType()) {
+ case GetIterCommand::ID:
+ return handleGetIter(static_cast<GetIterCommand&>(msg));
+ case CreateIteratorCommand::ID:
+ return handleCreateIterator(
+ static_cast<CreateIteratorCommand&>(msg));
+ case ReadBucketList::ID:
+ return handleReadBucketList(static_cast<ReadBucketList&>(msg));
+ case ReadBucketInfo::ID:
+ return handleReadBucketInfo(static_cast<ReadBucketInfo&>(msg));
+ case RepairBucketCommand::ID:
+ return handleRepairBucket(static_cast<RepairBucketCommand&>(msg));
+ case BucketDiskMoveCommand::ID:
+ return _diskMoveHandler.handleBucketDiskMove(
+ static_cast<BucketDiskMoveCommand&>(msg), _context);
+ case InternalBucketJoinCommand::ID:
+ return handleInternalBucketJoin(
+ static_cast<InternalBucketJoinCommand&>(msg));
+ case RecheckBucketInfoCommand::ID:
+ return handleRecheckBucketInfo(
+ static_cast<RecheckBucketInfoCommand&>(msg));
+ default:
+ LOG(warning,
+ "Persistence thread received unhandled internal command %s",
+ msg.toString().c_str());
+ break;
+ }
+ default:
+ break;
+ }
+ return MessageTracker::UP();
+}
+
+MessageTracker::UP
+PersistenceThread::handleCommand(api::StorageCommand& msg)
+{
+ _context = spi::Context(msg.getLoadType(), msg.getPriority(),
+ msg.getTrace().getLevel());
+ MessageTracker::UP mtracker(handleCommandSplitByType(msg));
+ if (mtracker.get() != 0) {
+ if (mtracker->getReply().get() != 0) {
+ mtracker->getReply()->getTrace().getRoot().addChild(
+ _context.getTrace().getRoot());
+ } else {
+ msg.getTrace().getRoot().addChild(_context.getTrace().getRoot());
+ }
+ }
+ return mtracker;
+}
+
+void
+PersistenceThread::handleReply(api::StorageReply& reply)
+{
+ switch (reply.getType().getId()) {
+ case api::MessageType::GETBUCKETDIFF_REPLY_ID:
+ _mergeHandler.handleGetBucketDiffReply(
+ static_cast<api::GetBucketDiffReply&>(reply),
+ _env._fileStorHandler);
+ break;
+ case api::MessageType::APPLYBUCKETDIFF_REPLY_ID:
+ _mergeHandler.handleApplyBucketDiffReply(
+ static_cast<api::ApplyBucketDiffReply&>(reply),
+ _env._fileStorHandler);
+ break;
+ default:
+ break;
+ }
+}
+
+MessageTracker::UP
+PersistenceThread::processMessage(api::StorageMessage& msg)
+{
+ MBUS_TRACE(msg.getTrace(), 5,
+ "PersistenceThread: Processing message in persistence layer");
+
+ ++_env._metrics.operations;
+ if (msg.getType().isReply()) {
+ try{
+ _env._pauseHandler.setPriority(msg.getPriority());
+ LOG(debug, "Handling reply: %s", msg.toString().c_str());
+ LOG(spam, "Message content: %s", msg.toString(true).c_str());
+ handleReply(static_cast<api::StorageReply&>(msg));
+ } catch (std::exception& e) {
+ // It's a reply, so nothing we can do.
+ LOG(debug, "Caught exception for %s: %s",
+ msg.toString().c_str(),
+ e.what());
+ }
+ } else {
+ api::StorageCommand& initiatingCommand =
+ static_cast<api::StorageCommand&>(msg);
+
+ try {
+ int64_t startTime(
+ _component->getClock().getTimeInMillis().getTime());
+
+ LOG(debug, "Handling command: %s", msg.toString().c_str());
+ LOG(spam, "Message content: %s", msg.toString(true).c_str());
+ std::unique_ptr<MessageTracker> tracker(
+ handleCommand(initiatingCommand));
+ if (!tracker.get()) {
+ LOG(debug, "Received unsupported command %s",
+ msg.getType().getName().c_str());
+ } else {
+ tracker->generateReply(initiatingCommand);
+ if ((tracker->getReply().get()
+ && tracker->getReply()->getResult().failed())
+ || tracker->getResult().failed())
+ {
+ ++_env._metrics.failedOperations;
+ }
+ }
+
+ int64_t stopTime(
+ _component->getClock().getTimeInMillis().getTime());
+ if (stopTime - startTime >= _warnOnSlowOperations) {
+ LOGBT(warning, msg.getType().toString(),
+ "Slow processing of message %s on disk %u. "
+ "Processing time: %" PRId64 " ms (>=%d ms)",
+ msg.toString().c_str(), _env._partition,
+ stopTime - startTime, _warnOnSlowOperations);
+ } else {
+ LOGBT(spam, msg.getType().toString(),
+ "Processing time of message %s on disk %u: %" PRId64 " ms",
+ msg.toString(true).c_str(), _env._partition,
+ stopTime - startTime);
+ }
+
+ return tracker;
+ } catch (std::exception& e) {
+ LOG(debug, "Caught exception for %s: %s",
+ msg.toString().c_str(),
+ e.what());
+ api::StorageReply::SP reply(initiatingCommand.makeReply().release());
+ reply->setResult(api::ReturnCode(
+ api::ReturnCode::INTERNAL_FAILURE, e.what()));
+ _env._fileStorHandler.sendReply(reply);
+ }
+ }
+
+ return MessageTracker::UP();
+}
+
+namespace {
+
+
+bool isBatchable(const api::StorageMessage& msg)
+{
+ return (msg.getType().getId() == api::MessageType::PUT_ID ||
+ msg.getType().getId() == api::MessageType::REMOVE_ID ||
+ msg.getType().getId() == api::MessageType::UPDATE_ID ||
+ msg.getType().getId() == api::MessageType::MULTIOPERATION_ID ||
+ msg.getType().getId() == api::MessageType::REVERT_ID);
+}
+
+bool hasBucketInfo(const api::StorageMessage& msg)
+{
+ return (isBatchable(msg) ||
+ (msg.getType().getId() == api::MessageType::REMOVELOCATION_ID ||
+ msg.getType().getId() == api::MessageType::JOINBUCKETS_ID));
+}
+
+}
+
+void
+PersistenceThread::flushAllReplies(
+ const document::BucketId& bucketId,
+ std::vector<vespalib::LinkedPtr<MessageTracker> >& replies)
+{
+ if (replies.empty()) {
+ return;
+ }
+
+ try {
+ if (replies.size() > 1) {
+ _env._metrics.batchingSize.addValue(replies.size());
+ }
+#ifdef ENABLE_BUCKET_OPERATION_LOGGING
+ {
+ size_t nputs = 0, nremoves = 0, nother = 0;
+ for (size_t i = 0; i < replies.size(); ++i) {
+ if (dynamic_cast<api::PutReply*>(replies[i]->getReply().get()))
+ {
+ ++nputs;
+ } else if (dynamic_cast<api::RemoveReply*>(
+ replies[i]->getReply().get()))
+ {
+ ++nremoves;
+ } else {
+ ++nother;
+ }
+ }
+ LOG_BUCKET_OPERATION(
+ bucketId,
+ vespalib::make_vespa_string(
+ "flushing %zu operations (%zu puts, %zu removes, "
+ "%zu other)",
+ replies.size(), nputs, nremoves, nother));
+ }
+#endif
+ spi::Bucket b(bucketId, spi::PartitionId(_env._partition));
+ spi::Result result = _spi.flush(b, _context);
+ uint32_t errorCode = _env.convertErrorCode(result);
+ if (errorCode != 0) {
+ for (uint32_t i = 0; i < replies.size(); ++i) {
+ replies[i]->getReply()->setResult(
+ api::ReturnCode(
+ (api::ReturnCode::Result)errorCode,
+ result.getErrorMessage()));
+ }
+ }
+ } catch (std::exception& e) {
+ for (uint32_t i = 0; i < replies.size(); ++i) {
+ replies[i]->getReply()->setResult(api::ReturnCode(
+ api::ReturnCode::INTERNAL_FAILURE, e.what()));
+ }
+ }
+
+ for (uint32_t i = 0; i < replies.size(); ++i) {
+ LOG(spam,
+ "Sending reply up (batched): %s %zu",
+ replies[i]->getReply()->toString().c_str(),
+ replies[i]->getReply()->getMsgId());
+ _env._fileStorHandler.sendReply(replies[i]->getReply());
+ }
+
+ replies.clear();
+}
+
+void PersistenceThread::processMessages(FileStorHandler::LockedMessage & lock)
+{
+ std::vector<MessageTracker::LP> trackers;
+ document::BucketId bucketId = lock.first->getBucketId();
+
+ while (lock.second.get() != 0) {
+ LOG(debug, "Inside while loop %d, nodeIndex %d, ptr=%p",
+ _env._partition, _env._nodeIndex, lock.second.get());
+ std::shared_ptr<api::StorageMessage> msg(lock.second);
+ bool batchable = isBatchable(*msg);
+
+ // If the next operation wasn't batchable, we should flush
+ // everything that came before.
+ if (!batchable) {
+ flushAllReplies(bucketId, trackers);
+ }
+
+ std::unique_ptr<MessageTracker> tracker = processMessage(*msg);
+ if (!tracker.get() || !tracker->getReply().get()) {
+ // Was a reply
+ break;
+ }
+
+ if (hasBucketInfo(*msg)) {
+ if (tracker->getReply()->getResult().success()) {
+ _env.setBucketInfo(*tracker, bucketId);
+ }
+ }
+ if (batchable) {
+ LOG(spam, "Adding reply %s to batch for bucket %s",
+ tracker->getReply()->toString().c_str(),
+ bucketId.toString().c_str());
+
+ trackers.push_back(MessageTracker::LP(tracker.release()));
+
+ if (trackers.back()->getReply()->getResult().success()) {
+ _env._fileStorHandler.getNextMessage(
+ _env._partition,
+ lock,
+ _env._lowestPriority);
+ } else {
+ break;
+ }
+ } else {
+ LOG(spam,
+ "Sending reply up: %s %zu",
+ tracker->getReply()->toString().c_str(),
+ tracker->getReply()->getMsgId());
+
+ _env._fileStorHandler.sendReply(tracker->getReply());
+ break;
+ }
+ }
+
+ flushAllReplies(bucketId, trackers);
+}
+
+void
+PersistenceThread::run(framework::ThreadHandle& thread)
+{
+ LOG(debug, "Started persistence thread with pid %d", getpid());
+
+ while (!thread.interrupted()
+ && !_env._fileStorHandler.closed(_env._partition))
+ {
+ thread.registerTick();
+
+ FileStorHandler::LockedMessage lock(
+ _env._fileStorHandler.getNextMessage(
+ _env._partition, _env._lowestPriority));
+
+ if (lock.first.get()) {
+ processMessages(lock);
+ }
+
+ vespalib::MonitorGuard flushMonitorGuard(_flushMonitor);
+ flushMonitorGuard.broadcast();
+ }
+ LOG(debug, "Closing down persistence thread %d", getpid());
+ vespalib::MonitorGuard flushMonitorGuard(_flushMonitor);
+ _closed = true;
+ flushMonitorGuard.broadcast();
+}
+
+void
+PersistenceThread::flush()
+{
+ vespalib::MonitorGuard flushMonitorGuard(_flushMonitor);
+ if (!_closed) {
+ flushMonitorGuard.wait();
+ }
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/persistencethread.h b/storage/src/vespa/storage/persistence/persistencethread.h
new file mode 100644
index 00000000000..032bc586342
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/persistencethread.h
@@ -0,0 +1,117 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/storage/common/statusmessages.h>
+#include <vespa/storage/persistence/diskthread.h>
+#include <vespa/storage/persistence/processallhandler.h>
+#include <vespa/storage/persistence/mergehandler.h>
+#include <vespa/storage/persistence/diskmoveoperationhandler.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storage/common/storagecomponent.h>
+#include <vespa/storage/persistence/persistenceutil.h>
+#include <vespa/storage/persistence/providershutdownwrapper.h>
+
+namespace storage {
+
+class BucketOwnershipNotifier;
+class TestAndSetHelper;
+
+class PersistenceThread : public DiskThread, public Types
+{
+public:
+ PersistenceThread(ServiceLayerComponentRegister&,
+ const config::ConfigUri & configUri,
+ spi::PersistenceProvider& provider,
+ FileStorHandler& filestorHandler,
+ FileStorThreadMetrics& metrics,
+ uint16_t deviceIndex,
+ uint8_t lowestPriority,
+ bool startThread = false);
+ ~PersistenceThread();
+
+ /** Waits for current operation to be finished. */
+ void flush();
+
+ bool isMerging(const BucketId& bucket) const;
+
+ virtual framework::Thread& getThread() { return *_thread; }
+
+ MessageTracker::UP handlePut(api::PutCommand& cmd);
+ MessageTracker::UP handleRemove(api::RemoveCommand& cmd);
+ MessageTracker::UP handleUpdate(api::UpdateCommand& cmd);
+ MessageTracker::UP handleGet(api::GetCommand& cmd);
+
+ MessageTracker::UP handleMultiOperation(api::MultiOperationCommand& cmd);
+ MessageTracker::UP handleRevert(api::RevertCommand& cmd);
+ MessageTracker::UP handleCreateBucket(api::CreateBucketCommand& cmd);
+ MessageTracker::UP handleDeleteBucket(api::DeleteBucketCommand& cmd);
+ MessageTracker::UP handleCreateIterator(CreateIteratorCommand& cmd);
+ MessageTracker::UP handleGetIter(GetIterCommand& cmd);
+ MessageTracker::UP handleReadBucketList(ReadBucketList& cmd);
+ MessageTracker::UP handleReadBucketInfo(ReadBucketInfo& cmd);
+ MessageTracker::UP handleJoinBuckets(api::JoinBucketsCommand& cmd);
+ MessageTracker::UP handleSetBucketState(api::SetBucketStateCommand& cmd);
+ MessageTracker::UP handleInternalBucketJoin(InternalBucketJoinCommand& cmd);
+ MessageTracker::UP handleSplitBucket(api::SplitBucketCommand& cmd);
+ MessageTracker::UP handleRepairBucket(RepairBucketCommand& cmd);
+ MessageTracker::UP handleRecheckBucketInfo(RecheckBucketInfoCommand& cmd);
+
+private:
+ PersistenceUtil _env;
+ uint32_t _warnOnSlowOperations;
+
+ spi::PersistenceProvider& _spi;
+ ProcessAllHandler _processAllHandler;
+ MergeHandler _mergeHandler;
+ DiskMoveOperationHandler _diskMoveHandler;
+ ServiceLayerComponent::UP _component;
+ framework::Thread::UP _thread;
+ spi::Context _context;
+ std::unique_ptr<BucketOwnershipNotifier> _bucketOwnershipNotifier;
+
+ vespalib::Monitor _flushMonitor;
+ bool _closed;
+
+ void setBucketInfo(MessageTracker& tracker,
+ const document::BucketId& bucketId);
+
+ bool checkProviderBucketInfoMatches(const spi::Bucket&,
+ const api::BucketInfo&) const;
+
+ void updateBucketDatabase(const document::BucketId& id,
+ const api::BucketInfo& info);
+
+ /**
+ * Sanity-checking of join command parameters. Invokes tracker.fail() with
+ * an appropriate error and returns false iff the command does not validate
+ * OK. Returns true and does not touch the tracker otherwise.
+ */
+ bool validateJoinCommand(const api::JoinBucketsCommand& cmd,
+ MessageTracker& tracker) const;
+
+ // Message handling functions
+ MessageTracker::UP handleCommand(api::StorageCommand&);
+ MessageTracker::UP handleCommandSplitByType(api::StorageCommand&);
+ void handleReply(api::StorageReply&);
+
+ MessageTracker::UP processMessage(api::StorageMessage& msg);
+ void processMessages(FileStorHandler::LockedMessage & lock);
+
+ // Thread main loop
+ virtual void run(framework::ThreadHandle&);
+
+ bool checkForError(const spi::Result& response, MessageTracker& tracker);
+
+ spi::Bucket getBucket(const DocumentId& id, const BucketId& bucket) const;
+
+ void flushAllReplies(const document::BucketId& bucketId,
+ std::vector<MessageTracker::LP>& trackers);
+
+ friend class TestAndSetHelper;
+ bool tasConditionExists(const api::TestAndSetCommand & cmd);
+ bool tasConditionMatches(const api::TestAndSetCommand & cmd, MessageTracker & tracker);
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/persistenceutil.cpp b/storage/src/vespa/storage/persistence/persistenceutil.cpp
new file mode 100644
index 00000000000..5c023707937
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/persistenceutil.cpp
@@ -0,0 +1,216 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/persistence/persistenceutil.h>
+#include <vespa/storage/persistence/filestorage/filestorhandler.h>
+#include <vespa/storage/storageutil/utils.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+
+namespace storage {
+
+LOG_SETUP(".persistence.util");
+
+namespace {
+ std::string generateName(void* p) {
+ std::ostringstream ost;
+ ost << "PersistenceUtil(" << p << ")";
+ return ost.str();
+ }
+}
+
+MessageTracker::MessageTracker(FileStorThreadMetrics::Op& metric,
+ framework::Clock& clock)
+ : _sendReply(true),
+ _metric(metric),
+ _result(api::ReturnCode::OK),
+ _timer(clock)
+{
+ ++_metric.count;
+}
+
+MessageTracker::~MessageTracker()
+{
+ if (_reply.get() && _reply->getResult().success()) {
+ _metric.latency.addValue(_timer);
+ }
+}
+
+void
+MessageTracker::fail(const ReturnCode& result)
+{
+ _result = result;
+ LOG(debug, "Failing operation with error: %s", _result.toString().c_str());
+}
+
+void
+MessageTracker::generateReply(api::StorageCommand& cmd)
+{
+ if (!_sendReply) {
+ return;
+ }
+
+ if (!_reply.get()) {
+ _reply.reset(cmd.makeReply().release());
+ _reply->setResult(_result);
+ }
+
+ if (!_reply->getResult().success()) {
+ ++_metric.failed;
+ LOGBP(debug, "Failed to handle command %s: %s",
+ cmd.toString().c_str(),
+ _result.toString().c_str());
+ }
+}
+
+PersistenceUtil::PersistenceUtil(
+ const config::ConfigUri & configUri,
+ ServiceLayerComponentRegister& compReg,
+ FileStorHandler& fileStorHandler,
+ FileStorThreadMetrics& metrics,
+ uint16_t partition,
+ uint8_t lowestPriority,
+ spi::PersistenceProvider& provider)
+ : _config(*config::ConfigGetter<vespa::config::content::StorFilestorConfig>::getConfig(configUri.getConfigId(), configUri.getContext())),
+ _compReg(compReg),
+ _component(compReg, generateName(this)),
+ _fileStorHandler(fileStorHandler),
+ _partition(partition),
+ _nodeIndex(_component.getIndex()),
+ _metrics(metrics),
+ _bucketFactory(_component.getBucketIdFactory()),
+ _repo(_component.getTypeRepo()),
+ _lowestPriority(lowestPriority),
+ _pauseHandler(),
+ _spi(provider)
+{
+}
+
+PersistenceUtil::~PersistenceUtil()
+{
+}
+
+void
+PersistenceUtil::updateBucketDatabase(const document::BucketId& id,
+ const api::BucketInfo& i)
+{
+ // Update bucket database
+ StorBucketDatabase::WrappedEntry entry(getBucketDatabase().get(
+ id,
+ "env::updatebucketdb"));
+ if (entry.exist()) {
+ api::BucketInfo info = i;
+
+ // Don't override last modified unless this is the first bucket
+ // info reading.
+ if (entry->info.getLastModified() != 0) {
+ info.setLastModified(entry->info.getLastModified());
+ }
+ entry->setBucketInfo(info);
+ entry.write();
+ } else {
+ LOG(debug,
+ "Bucket(%s).getBucketInfo: Bucket does not exist.",
+ id.toString().c_str());
+ }
+}
+
+uint16_t
+PersistenceUtil::getPreferredAvailableDisk(const document::BucketId& id) const
+{
+ return _component.getPreferredAvailablePartition(id);
+}
+
+PersistenceUtil::LockResult
+PersistenceUtil::lockAndGetDisk(const document::BucketId& bucket,
+ StorBucketDatabase::Flag flags)
+{
+ // To lock the bucket, we need to ensure that we don't conflict with
+ // bucket disk move command. First we fetch current disk index from
+ // bucket DB. When we attempt to lock that lock. And lastly we check
+ // the bucket DB again to verify that the bucket is still on that
+ // disk after locking it, or we will have to retry on new disk.
+ LockResult result;
+ result.disk = getPreferredAvailableDisk(bucket);
+
+ while (true) {
+ std::shared_ptr<FileStorHandler::BucketLockInterface> lock(
+ _fileStorHandler.lock(bucket, result.disk));
+
+ StorBucketDatabase::WrappedEntry entry(getBucketDatabase().get(
+ bucket, "join-lockAndGetDisk-1", flags));
+ if (entry.exist() && entry->disk != result.disk) {
+ result.disk = entry->disk;
+ continue;
+ }
+
+ result.lock = lock;
+ return result;
+ }
+}
+
+void
+PersistenceUtil::setBucketInfo(MessageTracker& tracker,
+ const document::BucketId& bucketId)
+{
+ api::BucketInfo info = getBucketInfo(bucketId, _partition);
+
+ static_cast<api::BucketInfoReply&>(*tracker.getReply()).
+ setBucketInfo(info);
+
+ updateBucketDatabase(bucketId, info);
+}
+
+api::BucketInfo
+PersistenceUtil::getBucketInfo(const document::BucketId& bId, int disk) const
+{
+ if (disk == -1) {
+ disk = _partition;
+ }
+
+ spi::BucketInfoResult response =
+ _spi.getBucketInfo(spi::Bucket(bId, spi::PartitionId(disk)));
+
+ return convertBucketInfo(response.getBucketInfo());
+}
+
+api::BucketInfo
+PersistenceUtil::convertBucketInfo(const spi::BucketInfo& info) const
+{
+ return api::BucketInfo(info.getChecksum(),
+ info.getDocumentCount(),
+ info.getDocumentSize(),
+ info.getEntryCount(),
+ info.getUsedSize(),
+ info.isReady(),
+ info.isActive(), 0);
+}
+
+uint32_t
+PersistenceUtil::convertErrorCode(const spi::Result& response)
+{
+ switch (response.getErrorCode()) {
+ case spi::Result::NONE:
+ return 0;
+ case spi::Result::TIMESTAMP_EXISTS:
+ return api::ReturnCode::TIMESTAMP_EXIST;
+ case spi::Result::TRANSIENT_ERROR:
+ case spi::Result::FATAL_ERROR:
+ return mbus::ErrorCode::APP_TRANSIENT_ERROR;
+ case spi::Result::RESOURCE_EXHAUSTED:
+ return api::ReturnCode::NO_SPACE;
+ case spi::Result::PERMANENT_ERROR:
+ default:
+ return mbus::ErrorCode::APP_FATAL_ERROR;
+ }
+
+ return 0;
+}
+
+void
+PersistenceUtil::shutdown(const std::string& reason)
+{
+ _component.requestShutdown(reason);
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/persistenceutil.h b/storage/src/vespa/storage/persistence/persistenceutil.h
new file mode 100644
index 00000000000..0007ebd9666
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/persistenceutil.h
@@ -0,0 +1,125 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/persistence/spi/persistenceprovider.h>
+#include <vespa/storage/common/servicelayercomponent.h>
+#include <vespa/storage/persistence/filestorage/filestorhandler.h>
+#include <vespa/storage/persistence/filestorage/filestormetrics.h>
+#include <vespa/storage/persistence/filestorage/pausehandler.h>
+#include <vespa/storage/persistence/types.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/vespalib/io/fileutil.h>
+#include <vespa/storage/storageutil/utils.h>
+#include <vespa/config-stor-filestor.h>
+#include <vespa/persistence/spi/persistenceprovider.h>
+
+namespace storage {
+
+class MessageTracker : protected Types {
+public:
+ typedef vespalib::LinkedPtr<MessageTracker> LP;
+ typedef std::unique_ptr<MessageTracker> UP;
+
+ MessageTracker(FileStorThreadMetrics::Op& metric,
+ framework::Clock& clock);
+
+ ~MessageTracker();
+
+ /**
+ * Called by operation handlers to set reply if they need to send a
+ * non-default reply. They should call this function as soon as they create
+ * a reply, to ensure it is stored in case of failure after reply creation.
+ */
+ void setReply(api::StorageReply::SP reply) {
+ assert(_reply.get() == 0);
+ _reply = reply;
+ }
+
+ /** Utility function to be able to write a bit less in client. */
+ void fail(uint32_t result, const String& message = "") {
+ fail(ReturnCode((api::ReturnCode::Result)result, message));
+ }
+ /** Set the request to fail with the given failure. */
+ void fail(const ReturnCode&);
+
+ /** Don't send reply for the command being processed. Used by multi chain
+ * commands like merge. */
+ void dontReply() { _sendReply = false; }
+
+ api::StorageReply::SP getReply() {
+ return _reply;
+ }
+
+ void generateReply(api::StorageCommand& cmd);
+
+ api::ReturnCode getResult() const { return _result; }
+
+private:
+ bool _sendReply;
+ FileStorThreadMetrics::Op& _metric;
+ api::StorageReply::SP _reply;
+ api::ReturnCode _result;
+ framework::MilliSecTimer _timer;
+};
+
+struct PersistenceUtil {
+ vespa::config::content::StorFilestorConfig _config;
+ ServiceLayerComponentRegister& _compReg;
+ ServiceLayerComponent _component;
+ FileStorHandler& _fileStorHandler;
+ uint16_t _partition;
+ uint16_t _nodeIndex;
+ FileStorThreadMetrics& _metrics;
+ const document::BucketIdFactory& _bucketFactory;
+ const document::DocumentTypeRepo::SP _repo;
+ uint8_t _lowestPriority;
+ PauseHandler _pauseHandler;
+ spi::PersistenceProvider& _spi;
+
+ PersistenceUtil(
+ const config::ConfigUri&,
+ ServiceLayerComponentRegister&,
+ FileStorHandler& fileStorHandler,
+ FileStorThreadMetrics& metrics,
+ uint16_t partition,
+ uint8_t lowestPriority,
+ spi::PersistenceProvider& provider);
+
+ ~PersistenceUtil();
+
+ StorBucketDatabase& getBucketDatabase()
+ { return _component.getBucketDatabase(); }
+
+ void updateBucketDatabase(const document::BucketId& id,
+ const api::BucketInfo& info);
+
+ uint16_t getPreferredAvailableDisk(const document::BucketId& id) const;
+
+ /** Lock the given bucket in the file stor handler. */
+ struct LockResult {
+ std::shared_ptr<FileStorHandler::BucketLockInterface> lock;
+ uint16_t disk;
+
+ LockResult() : lock(), disk(0) {}
+
+ bool bucketExisted() const { return (lock.get() != 0); }
+ };
+
+ LockResult lockAndGetDisk(
+ const document::BucketId& bucket,
+ StorBucketDatabase::Flag flags = StorBucketDatabase::NONE);
+
+ api::BucketInfo getBucketInfo(const document::BucketId& bId, int disk = -1) const;
+
+ api::BucketInfo convertBucketInfo(const spi::BucketInfo&) const;
+
+ void setBucketInfo(MessageTracker& tracker,
+ const document::BucketId& bucketId);
+
+ static uint32_t convertErrorCode(const spi::Result& response);
+
+ void shutdown(const std::string& reason);
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/processallhandler.cpp b/storage/src/vespa/storage/persistence/processallhandler.cpp
new file mode 100644
index 00000000000..5af2cef7b6e
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/processallhandler.cpp
@@ -0,0 +1,136 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storage/persistence/processallhandler.h>
+#include <vespa/storage/persistence/bucketprocessor.h>
+
+LOG_SETUP(".persistence.processall");
+
+namespace storage {
+
+ProcessAllHandler::ProcessAllHandler(PersistenceUtil& env,
+ spi::PersistenceProvider& spi)
+ : _env(env),
+ _spi(spi)
+{
+}
+
+namespace {
+
+class UnrevertableRemoveEntryProcessor : public BucketProcessor::EntryProcessor {
+public:
+ spi::PersistenceProvider& _provider;
+ const spi::Bucket& _bucket;
+ spi::Context& _context;
+
+ UnrevertableRemoveEntryProcessor(
+ spi::PersistenceProvider& provider,
+ const spi::Bucket& bucket,
+ spi::Context& context)
+ : _provider(provider),
+ _bucket(bucket),
+ _context(context) {}
+
+ void process(spi::DocEntry& entry) {
+ spi::RemoveResult removeResult = _provider.remove(
+ _bucket,
+ entry.getTimestamp(),
+ *entry.getDocumentId(),
+ _context);
+
+ if (removeResult.getErrorCode() != spi::Result::NONE) {
+ std::ostringstream ss;
+ ss << "Failed to do remove for removelocation: "
+ << removeResult.getErrorMessage();
+ throw std::runtime_error(ss.str());
+ }
+ }
+};
+
+class StatEntryProcessor : public BucketProcessor::EntryProcessor {
+public:
+ std::ostream& ost;
+ StatEntryProcessor(std::ostream& o)
+ : ost(o) {};
+
+ void process(spi::DocEntry& e) {
+ ost << " Timestamp: " << e.getTimestamp() << ", ";
+ if (e.getDocument() != 0) {
+ ost << "Doc(" << e.getDocument()->getId() << ")"
+ << ", " << e.getDocument()->getId().getGlobalId()
+ << ", size: " << e.getPersistedDocumentSize();
+ } else if (e.getDocumentId() != 0) {
+ ost << *e.getDocumentId()
+ << ", " << e.getDocumentId()->getGlobalId();
+ } else {
+ ost << "metadata only";
+ }
+ if (e.isRemove()) {
+ ost << " (remove)";
+ }
+ ost << "\n";
+ }
+};
+
+}
+
+MessageTracker::UP
+ProcessAllHandler::handleRemoveLocation(api::RemoveLocationCommand& cmd,
+ spi::Context& context)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.removeLocation[cmd.getLoadType()],
+ _env._component.getClock()));
+
+ LOG(debug, "RemoveLocation(%s): using selection '%s'",
+ cmd.getBucketId().toString().c_str(),
+ cmd.getDocumentSelection().c_str());
+
+ spi::Bucket bucket(cmd.getBucketId(),
+ spi::PartitionId(_env._partition));
+ UnrevertableRemoveEntryProcessor processor(_spi, bucket, context);
+ BucketProcessor::iterateAll(_spi,
+ bucket,
+ cmd.getDocumentSelection(),
+ processor,
+ spi::NEWEST_DOCUMENT_ONLY,
+ context);
+ spi::Result result = _spi.flush(bucket, context);
+ uint32_t code = _env.convertErrorCode(result);
+ if (code != 0) {
+ tracker->fail(code, result.getErrorMessage());
+ }
+
+ return tracker;
+}
+
+MessageTracker::UP
+ProcessAllHandler::handleStatBucket(api::StatBucketCommand& cmd,
+ spi::Context& context)
+{
+ MessageTracker::UP tracker(new MessageTracker(
+ _env._metrics.statBucket[cmd.getLoadType()],
+ _env._component.getClock()));
+ std::ostringstream ost;
+
+ ost << "Persistence bucket " << cmd.getBucketId()
+ << ", partition " << _env._partition << "\n";
+
+ spi::Bucket bucket(cmd.getBucketId(),
+ spi::PartitionId(_env._partition));
+ StatEntryProcessor processor(ost);
+ BucketProcessor::iterateAll(_spi,
+ bucket,
+ cmd.getDocumentSelection(),
+ processor,
+ spi::ALL_VERSIONS,
+ context);
+
+ api::StatBucketReply::UP reply(new api::StatBucketReply(cmd, ost.str()));
+ tracker->setReply(api::StorageReply::SP(reply.release()));
+ return tracker;
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/processallhandler.h b/storage/src/vespa/storage/persistence/processallhandler.h
new file mode 100644
index 00000000000..73c0c90a6a6
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/processallhandler.h
@@ -0,0 +1,33 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/storage/persistence/persistenceutil.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/removelocation.h>
+#include <vespa/storageapi/message/stat.h>
+#include <vespa/persistence/spi/persistenceprovider.h>
+
+namespace document {
+namespace select {
+class Node;
+}
+}
+
+namespace storage {
+
+class ProcessAllHandler : public Types {
+
+public:
+ ProcessAllHandler(PersistenceUtil&, spi::PersistenceProvider&);
+ MessageTracker::UP handleRemoveLocation(api::RemoveLocationCommand&,
+ spi::Context&);
+ MessageTracker::UP handleStatBucket(api::StatBucketCommand&, spi::Context&);
+
+protected:
+ PersistenceUtil& _env;
+ spi::PersistenceProvider& _spi;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/providershutdownwrapper.cpp b/storage/src/vespa/storage/persistence/providershutdownwrapper.cpp
new file mode 100644
index 00000000000..b3a44a2b41b
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/providershutdownwrapper.cpp
@@ -0,0 +1,207 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/persistence/providershutdownwrapper.h>
+#include <vespa/storage/persistence/persistenceutil.h>
+#include <vespa/log/log.h>
+
+LOG_SETUP(".persistence.shutdownwrapper");
+
+namespace storage {
+
+template <typename ResultType>
+ResultType
+ProviderShutdownWrapper::checkResult(ResultType&& result) const
+{
+ if (result.getErrorCode() == spi::Result::FATAL_ERROR) {
+ vespalib::LockGuard guard(_shutdownLock);
+ if (_shutdownTriggered) {
+ LOG(debug,
+ "Received FATAL_ERROR from persistence provider: %s. "
+ "Node has already been instructed to shut down so "
+ "not doing anything now.",
+ result.getErrorMessage().c_str());
+ } else {
+ LOG(info,
+ "Received FATAL_ERROR from persistence provider, "
+ "shutting down node: %s",
+ result.getErrorMessage().c_str());
+ const_cast<ProviderShutdownWrapper*>(this)->
+ _component.requestShutdown(result.getErrorMessage());
+ _shutdownTriggered = true;
+ }
+ }
+ return std::move(result);
+}
+
+spi::Result
+ProviderShutdownWrapper::initialize()
+{
+ return checkResult(_impl.initialize());
+}
+
+spi::PartitionStateListResult
+ProviderShutdownWrapper::getPartitionStates() const
+{
+ return checkResult(_impl.getPartitionStates());
+}
+
+spi::BucketIdListResult
+ProviderShutdownWrapper::listBuckets(spi::PartitionId partitionId) const
+{
+ return checkResult(_impl.listBuckets(partitionId));
+}
+
+spi::Result
+ProviderShutdownWrapper::setClusterState(const spi::ClusterState& state)
+{
+ return checkResult(_impl.setClusterState(state));
+}
+
+spi::Result
+ProviderShutdownWrapper::setActiveState(const spi::Bucket& bucket,
+ spi::BucketInfo::ActiveState newState)
+{
+ return checkResult(_impl.setActiveState(bucket, newState));
+}
+
+spi::BucketInfoResult
+ProviderShutdownWrapper::getBucketInfo(const spi::Bucket& bucket) const
+{
+ return checkResult(_impl.getBucketInfo(bucket));
+}
+
+spi::Result
+ProviderShutdownWrapper::put(const spi::Bucket& bucket,
+ spi::Timestamp ts,
+ const document::Document::SP& doc,
+ spi::Context& context)
+{
+ return checkResult(_impl.put(bucket, ts, doc, context));
+}
+
+spi::RemoveResult
+ProviderShutdownWrapper::remove(const spi::Bucket& bucket,
+ spi::Timestamp ts,
+ const document::DocumentId& docId,
+ spi::Context& context)
+{
+ return checkResult(_impl.remove(bucket, ts, docId, context));
+}
+
+spi::RemoveResult
+ProviderShutdownWrapper::removeIfFound(const spi::Bucket& bucket,
+ spi::Timestamp ts,
+ const document::DocumentId& docId,
+ spi::Context& context)
+{
+ return checkResult(_impl.removeIfFound(bucket, ts, docId, context));
+}
+
+spi::UpdateResult
+ProviderShutdownWrapper::update(const spi::Bucket& bucket,
+ spi::Timestamp ts,
+ const document::DocumentUpdate::SP& docUpdate,
+ spi::Context& context)
+{
+ return checkResult(_impl.update(bucket, ts, docUpdate, context));
+}
+
+spi::GetResult
+ProviderShutdownWrapper::get(const spi::Bucket& bucket,
+ const document::FieldSet& fieldSet,
+ const document::DocumentId& docId,
+ spi::Context& context) const
+{
+ return checkResult(_impl.get(bucket, fieldSet, docId, context));
+}
+
+spi::Result
+ProviderShutdownWrapper::flush(const spi::Bucket& bucket, spi::Context& context)
+{
+ return checkResult(_impl.flush(bucket, context));
+}
+
+spi::CreateIteratorResult
+ProviderShutdownWrapper::createIterator(const spi::Bucket& bucket,
+ const document::FieldSet& fieldSet,
+ const spi::Selection& selection,
+ spi::IncludedVersions versions,
+ spi::Context& context)
+{
+ return checkResult(_impl.createIterator(bucket, fieldSet, selection, versions, context));
+}
+
+spi::IterateResult
+ProviderShutdownWrapper::iterate(spi::IteratorId iteratorId,
+ uint64_t maxByteSize,
+ spi::Context& context) const
+{
+ return checkResult(_impl.iterate(iteratorId, maxByteSize, context));
+}
+
+spi::Result
+ProviderShutdownWrapper::destroyIterator(spi::IteratorId iteratorId,
+ spi::Context& context)
+{
+ return checkResult(_impl.destroyIterator(iteratorId, context));
+}
+
+spi::Result
+ProviderShutdownWrapper::createBucket(const spi::Bucket& bucket,
+ spi::Context& context)
+{
+ return checkResult(_impl.createBucket(bucket, context));
+}
+
+spi::Result
+ProviderShutdownWrapper::deleteBucket(const spi::Bucket& bucket,
+ spi::Context& context)
+{
+ return checkResult(_impl.deleteBucket(bucket, context));
+}
+
+spi::BucketIdListResult
+ProviderShutdownWrapper::getModifiedBuckets() const
+{
+ return checkResult(_impl.getModifiedBuckets());
+}
+
+spi::Result
+ProviderShutdownWrapper::maintain(const spi::Bucket& bucket,
+ spi::MaintenanceLevel level)
+{
+ return checkResult(_impl.maintain(bucket, level));
+}
+
+spi::Result
+ProviderShutdownWrapper::split(const spi::Bucket& source,
+ const spi::Bucket& target1,
+ const spi::Bucket& target2,
+ spi::Context& context)
+{
+ return checkResult(_impl.split(source, target1, target2, context));
+}
+
+spi::Result
+ProviderShutdownWrapper::join(const spi::Bucket& source1,
+ const spi::Bucket& source2,
+ const spi::Bucket& target, spi::Context& context)
+{
+ return checkResult(_impl.join(source1, source2, target, context));
+}
+
+spi::Result
+ProviderShutdownWrapper::move(const spi::Bucket& source,
+ spi::PartitionId target, spi::Context& context)
+{
+ return checkResult(_impl.move(source, target, context));
+}
+
+spi::Result
+ProviderShutdownWrapper::removeEntry(const spi::Bucket& bucket,
+ spi::Timestamp ts, spi::Context& context)
+{
+ return checkResult(_impl.removeEntry(bucket, ts, context));
+}
+
+} // ns storage
diff --git a/storage/src/vespa/storage/persistence/providershutdownwrapper.h b/storage/src/vespa/storage/persistence/providershutdownwrapper.h
new file mode 100644
index 00000000000..5c827130c2b
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/providershutdownwrapper.h
@@ -0,0 +1,124 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::ProviderShutdownWrapper
+ *
+ * \brief Utility class which forwards all calls to the real persistence
+ * provider implementation, transparently checking the result of each
+ * operation to see if the result is FATAL_ERROR. If so, it initiates a
+ * shutdown of the process (but still returns the response up to the caller
+ * as if it were just a non-wrapped call).
+ *
+ */
+#pragma once
+
+#include <vector>
+#include <string>
+#include <vespa/persistence/spi/persistenceprovider.h>
+#include <vespa/vespalib/util/sync.h>
+
+namespace storage {
+
+class ServiceLayerComponent;
+
+class ProviderShutdownWrapper : public spi::PersistenceProvider
+{
+public:
+ ProviderShutdownWrapper(spi::PersistenceProvider& impl,
+ ServiceLayerComponent& component)
+ : _impl(impl),
+ _component(component),
+ _shutdownLock(),
+ _shutdownTriggered(false)
+ {
+ }
+
+ spi::Result initialize();
+
+ spi::PartitionStateListResult getPartitionStates() const;
+
+ spi::BucketIdListResult listBuckets(spi::PartitionId) const;
+
+ spi::Result setClusterState(const spi::ClusterState&) ;
+
+ spi::Result setActiveState(const spi::Bucket& bucket,
+ spi::BucketInfo::ActiveState newState);
+
+ spi::BucketInfoResult getBucketInfo(const spi::Bucket&) const;
+
+ spi::Result put(const spi::Bucket&, spi::Timestamp,
+ const document::Document::SP&, spi::Context&);
+
+ spi::RemoveResult remove(const spi::Bucket&, spi::Timestamp,
+ const document::DocumentId&, spi::Context&);
+
+ spi::RemoveResult removeIfFound(const spi::Bucket&,
+ spi::Timestamp,
+ const document::DocumentId&, spi::Context&);
+
+ spi::UpdateResult update(const spi::Bucket&,
+ spi::Timestamp,
+ const document::DocumentUpdate::SP&, spi::Context&);
+
+ spi::GetResult get(const spi::Bucket&,
+ const document::FieldSet&,
+ const document::DocumentId&, spi::Context&) const;
+
+ spi::Result flush(const spi::Bucket&, spi::Context&);
+
+ spi::CreateIteratorResult createIterator(const spi::Bucket&,
+ const document::FieldSet&,
+ const spi::Selection&,
+ spi::IncludedVersions versions,
+ spi::Context&);
+
+ spi::IterateResult iterate(spi::IteratorId, uint64_t maxByteSize,
+ spi::Context&) const;
+
+ spi::Result destroyIterator(spi::IteratorId, spi::Context&);
+
+ spi::Result createBucket(const spi::Bucket&, spi::Context&);
+
+ spi::Result deleteBucket(const spi::Bucket&, spi::Context&);
+
+ spi::BucketIdListResult getModifiedBuckets() const;
+
+ spi::Result maintain(const spi::Bucket& bucket,
+ spi::MaintenanceLevel level);
+
+ spi::Result split(const spi::Bucket& source,
+ const spi::Bucket& target1,
+ const spi::Bucket& target2, spi::Context&);
+
+ spi::Result join(const spi::Bucket& source1,
+ const spi::Bucket& source2,
+ const spi::Bucket& target,
+ spi::Context&);
+
+ spi::Result move(const spi::Bucket& source, spi::PartitionId target,
+ spi::Context&);
+
+ spi::Result removeEntry(const spi::Bucket&, spi::Timestamp, spi::Context&);
+
+ spi::PersistenceProvider& getProviderImplementation() {
+ return _impl;
+ }
+ const spi::PersistenceProvider& getProviderImplementation() const {
+ return _impl;
+ }
+private:
+ /**
+ * Check whether result has a FATAL_ERROR return code and invoke
+ * requestShutdown with its error string if so. Will const_cast
+ * internally since it calls non-const on _component.
+ */
+ template <typename ResultType>
+ inline ResultType checkResult(ResultType&& result) const;
+
+ spi::PersistenceProvider& _impl;
+ ServiceLayerComponent& _component;
+ vespalib::Lock _shutdownLock;
+ mutable bool _shutdownTriggered;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/splitbitdetector.cpp b/storage/src/vespa/storage/persistence/splitbitdetector.cpp
new file mode 100644
index 00000000000..8fc49071cf6
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/splitbitdetector.cpp
@@ -0,0 +1,262 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+#include <vespa/storage/persistence/splitbitdetector.h>
+#include <vespa/storage/persistence/bucketprocessor.h>
+#include <vespa/vespalib/stllike/string.h>
+
+LOG_SETUP(".persistence.split.bitdetector");
+
+namespace storage {
+
+namespace {
+
+size_t keepFirstCount = 15;
+
+/**
+ * Detect what bit we need to use for splitting to split given bucket into
+ * two pieces.
+ *
+ * We do this by assuming it is bit 58. And then we go through all
+ * buckets, and reduce the bit number if both buckets arent equal below this
+ * bit. Thus we should end up pointing to the rightmost bit differing
+ * between the buckets, or 58 (which is an illegal number as only bits 0-57
+ * are possible, if it is impossible to split the bucket in two.
+ */
+struct BucketVisitor : public BucketProcessor::EntryProcessor {
+ const document::BucketIdFactory& _factory;
+ // Bit index of first bit that is not identical for all documents,
+ // starting at index 0.
+ mutable uint32_t _splitBit;
+ // Contain 1 for all bits lower than splitBit
+ mutable uint64_t _splitMask;
+ mutable document::DocumentId _refId;
+ mutable document::BucketId _refBucket;
+ mutable document::DocumentId _conflictId;
+ mutable document::BucketId _conflictBucket;
+ uint32_t _docCount;
+ uint64_t _docSize;
+ struct DocInfo {
+ uint64_t timestamp;
+ document::DocumentId docId;
+ document::BucketId bucketId;
+
+ DocInfo(uint64_t ts, const document::DocumentId& id,
+ document::BucketId& bid)
+ : timestamp(ts),
+ docId(id),
+ bucketId(bid)
+ {
+ }
+ };
+ std::vector<DocInfo> _firstDocs;
+
+ BucketVisitor(const document::BucketIdFactory& factory)
+ : _factory(factory), _splitBit(58),
+ _splitMask(0), _refId(), _refBucket(),
+ _conflictId(), _conflictBucket(),
+ _docCount(0), _docSize(0), _firstDocs()
+ {
+ _firstDocs.reserve(keepFirstCount);
+ // LOG(spam, "Checking out meta entries in bucket");
+ for (uint32_t i=0; i<_splitBit; ++i) {
+ _splitMask = (_splitMask << 1) | 1;
+ }
+ }
+
+ void process(spi::DocEntry& slot) {
+ assert(slot.getDocumentId());
+ ++_docCount;
+ _docSize += slot.getDocumentSize();
+
+ const document::DocumentId& id(*slot.getDocumentId());
+ document::BucketId bucket = _factory.getBucketId(id);
+ // LOG(spam, "Bucket %s", bucket.toString().c_str());
+ if (_firstDocs.size() < keepFirstCount) {
+ _firstDocs.push_back(DocInfo(slot.getTimestamp(), id, bucket));
+ }
+
+ if (_refBucket.getRawId() == 0) {
+ _refId = id;
+ _refBucket = bucket;
+ return;
+ }
+
+ while ((bucket.getRawId() & _splitMask) !=
+ (_refBucket.getRawId() & _splitMask))
+ {
+ --_splitBit;
+ _splitMask = _splitMask >> 1;
+ _conflictId = id;
+ _conflictBucket = bucket;
+ }
+
+ return;
+ }
+
+ void printEntrySummary(std::ostream& out) {
+ for (uint32_t i=0; i<_firstDocs.size(); ++i) {
+ out << "\n" << _firstDocs[i].timestamp << ' '
+ << _firstDocs[i].bucketId << ' '
+ << _firstDocs[i].docId;
+ }
+ }
+
+};
+
+bool
+smallerThanSizeLimit(uint32_t minCount,
+ uint32_t minSize,
+ const spi::Bucket& b,
+ spi::PersistenceProvider& provider)
+{
+ if (minCount == 0 && minSize == 0) return false;
+ spi::BucketInfo info = provider.getBucketInfo(b).getBucketInfo();
+ if ((minCount != 0 && info.getDocumentCount() < minCount)
+ && (minSize != 0 && (info.getDocumentCount() == 1
+ || info.getDocumentSize() < minSize)))
+ { // (A bucket with a single document is never too large size wise
+ return true;
+ }
+ return false;
+}
+
+bool
+deduceBucketIsInconsistentlySplit(uint32_t minCount)
+{
+ // If the bucket split command was sent with a minimum doc limit of 0,
+ // it was sent because the bucket is inconsistently split. Regular splits
+ // triggered by bucket size always contain values > 0 from the config.
+ return (minCount == 0);
+}
+
+} // anonymous
+
+SplitBitDetector::Result
+SplitBitDetector::detectSplit(spi::PersistenceProvider& provider,
+ const spi::Bucket& source,
+ uint32_t maxSplitBits,
+ spi::Context& context,
+ uint32_t minCount, uint32_t minSize)
+{
+ if (maxSplitBits <= source.getBucketId().getUsedBits()) {
+ std::ostringstream error;
+ error << "No use in trying to split " << source << " when max split "
+ << "bit is set to " << maxSplitBits << ".";
+ LOG(warning, "split(%s): %s",
+ source.getBucketId().toString().c_str(), error.str().c_str());
+ return Result(error.str());
+ }
+ document::BucketIdFactory factory;
+ BucketVisitor detector(factory);
+
+ BucketProcessor::iterateAll(
+ provider, source, "", detector, spi::ALL_VERSIONS, context);
+
+ uint16_t splitBit = detector._splitBit;
+
+ assert(splitBit <= 58);
+ // Handle empty source bucket case
+ if (detector._refBucket.getRawId() == 0) {
+ return Result();
+ }
+ // If max split bits is set and we are trying to split above that,
+ // correct
+ bool singleTarget = false;
+ if (maxSplitBits != 0 && maxSplitBits < splitBit) {
+ splitBit = maxSplitBits - 1;
+ singleTarget = true;
+ LOG(debug, "split(%s) - Found split bit %u but max is %u.",
+ source.toString().c_str(), splitBit, maxSplitBits);
+ }
+ // If size limits are set, but bucket is not too large, limit split to
+ // current + 1
+ if (smallerThanSizeLimit(minCount, minSize, source, provider)) {
+ if (LOG_WOULD_LOG(debug)) {
+ spi::BucketInfo info(
+ provider.getBucketInfo(source).getBucketInfo());
+ LOG(debug, "split(%s) - Bucket too small to trigger split. "
+ "%u docs, %u size. (Split size at %u/%u). "
+ "Only splitting to %u.",
+ source.toString().c_str(), info.getDocumentCount(),
+ info.getDocumentSize(), minCount, minSize,
+ source.getBucketId().getUsedBits());
+ }
+ splitBit = source.getBucketId().getUsedBits();
+ }
+ if (splitBit == 58) {
+ // We're in a situation where multiple unique documents map to the
+ // same 58-bit bucket ID and no differing bit may be found.
+ // If the split is sent in an inconsistent split context, we must
+ // always split the bucket or the bucket tree might forever remain
+ // inconsistent. If we're unable to deduce a split bit from the bucket
+ // contents (which is why we're in this branch) we have no other
+ // practical choice than to increase the split level by 1 bit to force
+ // the bucket further down into the tree.
+ // Otherwise, we can really do no better than either fail the operation
+ // or force the bucket to 58 bits. Failing the operation makes the
+ // distributor retry it ad inifinitum (forever smashing its head against
+ // the wall), so the latter is our chosen approach.
+ if (deduceBucketIsInconsistentlySplit(minCount)) {
+ splitBit = source.getBucketId().getUsedBits();
+ } else {
+ std::ostringstream error;
+ error << "Could not find differing bit to split bucket contents "
+ "around due to bucket ID collisions. Forcing resulting "
+ "bucket to be 58 bits. Bucket has "
+ << detector._docCount << " docs totalling "
+ << detector._docSize << " bytes. ";
+ detector.printEntrySummary(error);
+ LOGBT(warning,
+ source.getBucketId().toString(),
+ "split(%s): %s",
+ source.getBucketId().toString().c_str(),
+ error.str().c_str());
+ splitBit = 57; // + 1 below.
+ }
+ }
+
+ if (splitBit < source.getBucketId().getUsedBits()) {
+ LOG(error, "split(%s): Document(s) in wrong bucket, and thus "
+ "inaccessible! Split bit detector detected split bit %u but "
+ "the bucket is already split on %u bits. Conflicting "
+ "entries were document %s (%s) and document %s (%s).",
+ source.getBucketId().toString().c_str(),
+ splitBit,
+ source.getBucketId().getUsedBits(),
+ detector._refId.toString().c_str(),
+ detector._refBucket.toString().c_str(),
+ detector._conflictId.toString().c_str(),
+ detector._conflictBucket.toString().c_str());
+ assert(false);
+ }
+
+ document::BucketId base(splitBit,
+ detector._refBucket.getRawId());
+ document::BucketId target1(splitBit + 1, base.getId());
+ document::BucketId target2(splitBit + 1, base.getId()
+ | (uint64_t(1) << splitBit));
+ return Result(target1, target2, singleTarget);
+}
+
+void
+SplitBitDetector::Result::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "SplitTargets(";
+ switch (_result) {
+ case OK:
+ out << _target1.getUsedBits() << ": " << _target1 << ", ";
+ if (_singleTarget) out << "[ ";
+ out << _target2;
+ if (_singleTarget) out << " ]";
+ break;
+ case EMPTY: out << "source empty"; break;
+ case ERROR: out << "error: " << _reason; break;
+ }
+ out << ")";
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/splitbitdetector.h b/storage/src/vespa/storage/persistence/splitbitdetector.h
new file mode 100644
index 00000000000..66096a70f68
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/splitbitdetector.h
@@ -0,0 +1,66 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * Iterates metadata in the bucket using the SPI, and analyze where we need to
+ * split in order to split the bucket in two pieces. Possible results include.
+ *
+ * - Normal case: A set of two buckets (with same used bits count),
+ * splitting source bucket in half.
+ * - Empty source bucket. No data to split.
+ * - Error: Impossible to split data in two. All data has common bucket bits.
+ * - Single target split: Asked to limit bits used to less than max, and using
+ * this amount of bits won't split data in two. Currently, we return this
+ * as success and create the paired bucket, such that SPI can handle single
+ * target split just as a regular split, only that no data will actually be
+ * split into the other target. (And that target thus must be deleted
+ * afterwards if empty.)
+ *
+ */
+#pragma once
+
+#include <vespa/persistence/spi/persistenceprovider.h>
+
+namespace storage {
+
+struct SplitBitDetector
+{
+ enum ResultType {
+ OK,
+ EMPTY,
+ ERROR
+ };
+
+ class Result : public document::Printable {
+ ResultType _result;
+ document::BucketId _target1;
+ document::BucketId _target2;
+ vespalib::string _reason;
+ bool _singleTarget;
+
+ public:
+ Result() : _result(EMPTY), _singleTarget(false) {}
+ Result(vespalib::stringref error)
+ : _result(ERROR), _reason(error), _singleTarget(false) {}
+ Result(const document::BucketId& t1, const document::BucketId& t2,
+ bool single)
+ : _result(OK), _target1(t1), _target2(t2), _singleTarget(single) {}
+
+ bool success() const { return (_result == OK); }
+ bool failed() const { return (_result == ERROR); }
+ bool empty() const { return (_result == EMPTY); }
+ const vespalib::string& getReason() const { return _reason; }
+ const document::BucketId& getTarget1() const { return _target1; }
+ const document::BucketId& getTarget2() const { return _target2; }
+
+ // Printable implementation
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+ };
+
+ static Result detectSplit(spi::PersistenceProvider&, const spi::Bucket&,
+ uint32_t maxSplitBits,
+ spi::Context&,
+ uint32_t minCount = 0, uint32_t minSize = 0);
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/persistence/testandsethelper.cpp b/storage/src/vespa/storage/persistence/testandsethelper.cpp
new file mode 100644
index 00000000000..9b3f0b9855b
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/testandsethelper.cpp
@@ -0,0 +1,72 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// @author Vegard Sjonfjell
+
+#include <vespa/storage/persistence/fieldvisitor.h>
+#include <vespa/storage/persistence/testandsethelper.h>
+
+using namespace std::string_literals;
+
+namespace storage {
+
+void TestAndSetHelper::getDocumentType() {
+ if (!_docId.hasDocType()) {
+ throw TestAndSetException(api::ReturnCode(api::ReturnCode::ILLEGAL_PARAMETERS, "Document id has no doctype"));
+ }
+
+ _docTypePtr = _component.getTypeRepo()->getDocumentType(_docId.getDocType());
+ if (_docTypePtr == nullptr) {
+ throw TestAndSetException(api::ReturnCode(api::ReturnCode::ILLEGAL_PARAMETERS, "Document type does not exist"));
+ }
+}
+
+void TestAndSetHelper::parseDocumentSelection() {
+ document::select::Parser parser(*_component.getTypeRepo(), _component.getBucketIdFactory());
+
+ try {
+ _docSelectionUp = parser.parse(_cmd.getCondition().getSelection());
+ } catch (const document::select::ParsingFailedException & e) {
+ throw TestAndSetException(api::ReturnCode(api::ReturnCode::ILLEGAL_PARAMETERS, "Failed to parse test and set condition: "s + e.getMessage()));
+ }
+}
+
+spi::GetResult TestAndSetHelper::retrieveDocument(const document::FieldSet & fieldSet) {
+ return _thread._spi.get(
+ _thread.getBucket(_docId, _cmd.getBucketId()),
+ fieldSet,
+ _cmd.getDocumentId(),
+ _thread._context);
+}
+
+TestAndSetHelper::TestAndSetHelper(PersistenceThread & thread, const api::TestAndSetCommand & cmd)
+ : _thread(thread),
+ _component(thread._env._component),
+ _cmd(cmd),
+ _docId(cmd.getDocumentId())
+{
+ getDocumentType();
+ parseDocumentSelection();
+}
+
+api::ReturnCode TestAndSetHelper::retrieveAndMatch() {
+ // Walk document selection tree to build a minimal field set
+ FieldVisitor fieldVisitor(*_docTypePtr);
+ _docSelectionUp->visit(fieldVisitor);
+
+ // Retrieve document
+ auto result = retrieveDocument(fieldVisitor.getFieldSet());
+
+ // If document exists, match it with selection
+ if (result.hasDocument()) {
+ auto docPtr = result.getDocumentPtr();
+ if (_docSelectionUp->contains(*docPtr) != document::select::Result::True) {
+ return api::ReturnCode(api::ReturnCode::TEST_AND_SET_CONDITION_FAILED, "Condition did not match document");
+ }
+
+ // Document matches
+ return api::ReturnCode();
+ }
+
+ return api::ReturnCode(api::ReturnCode::TEST_AND_SET_CONDITION_FAILED, "Document does not exist");
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/testandsethelper.h b/storage/src/vespa/storage/persistence/testandsethelper.h
new file mode 100644
index 00000000000..9c220547c7f
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/testandsethelper.h
@@ -0,0 +1,39 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// @author Vegard Sjonfjell
+
+#pragma once
+#include <vespa/storage/persistence/persistencethread.h>
+
+namespace storage {
+
+class TestAndSetException : public std::runtime_error {
+ api::ReturnCode _code;
+
+public:
+ TestAndSetException(api::ReturnCode code)
+ : std::runtime_error(code.getMessage()),
+ _code(std::move(code))
+ {}
+
+ const api::ReturnCode & getCode() const { return _code; }
+};
+
+class TestAndSetHelper {
+ PersistenceThread & _thread;
+ ServiceLayerComponent & _component;
+ const api::TestAndSetCommand & _cmd;
+
+ const document::DocumentId _docId;
+ const document::DocumentType * _docTypePtr;
+ std::unique_ptr<document::select::Node> _docSelectionUp;
+
+ void getDocumentType();
+ void parseDocumentSelection();
+ spi::GetResult retrieveDocument(const document::FieldSet & fieldSet);
+
+public:
+ TestAndSetHelper(PersistenceThread & thread, const api::TestAndSetCommand & cmd);
+ api::ReturnCode retrieveAndMatch();
+};
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/types.cpp b/storage/src/vespa/storage/persistence/types.cpp
new file mode 100644
index 00000000000..ca8141a1ac6
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/types.cpp
@@ -0,0 +1,12 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <iostream>
+#include <vespa/storage/persistence/types.h>
+
+namespace storage {
+
+const framework::MicroSecTime Types::MAX_TIMESTAMP(framework::MicroSecTime::max());
+const framework::MicroSecTime Types::UNSET_TIMESTAMP(0);
+
+} // storage
diff --git a/storage/src/vespa/storage/persistence/types.h b/storage/src/vespa/storage/persistence/types.h
new file mode 100644
index 00000000000..747b9fbd074
--- /dev/null
+++ b/storage/src/vespa/storage/persistence/types.h
@@ -0,0 +1,38 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <iosfwd>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/document/base/documentid.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storageapi/buckets/bucketinfo.h>
+#include <vespa/storageapi/messageapi/returncode.h>
+#include <vespa/storageapi/defs.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/vespalib/stllike/string.h>
+
+namespace storage {
+
+struct Types {
+ typedef document::BucketId BucketId;
+ typedef document::Document Document;
+ typedef vespalib::LinkedPtr<Document> DocLP;
+ typedef document::DocumentId DocumentId;
+ typedef document::GlobalId GlobalId;
+ typedef framework::MicroSecTime Timestamp;
+ typedef Timestamp RevertToken;
+ typedef vespalib::string String;
+ typedef api::BucketInfo BucketInfo;
+ typedef api::ReturnCode ReturnCode;
+ typedef StorBucketDatabase::WrappedEntry BucketDBEntry;
+
+ static const framework::MicroSecTime MAX_TIMESTAMP;
+ static const framework::MicroSecTime UNSET_TIMESTAMP;
+
+protected:
+ ~Types() {} // Noone should refer to objects as Types objects
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageserver/.gitignore b/storage/src/vespa/storage/storageserver/.gitignore
new file mode 100644
index 00000000000..333f254ba10
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/.gitignore
@@ -0,0 +1,8 @@
+*.So
+*.lo
+.*.swp
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
diff --git a/storage/src/vespa/storage/storageserver/CMakeLists.txt b/storage/src/vespa/storage/storageserver/CMakeLists.txt
new file mode 100644
index 00000000000..2253fb1dcee
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/CMakeLists.txt
@@ -0,0 +1,28 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_storageserver OBJECT
+ SOURCES
+ priorityconverter.cpp
+ bucketintegritychecker.cpp
+ bouncer.cpp
+ messagesink.cpp
+ fnetlistener.cpp
+ rpcrequestwrapper.cpp
+ communicationmanager.cpp
+ statemanager.cpp
+ documentapiconverter.cpp
+ opslogger.cpp
+ mergethrottler.cpp
+ messageallocationtypes.cpp
+ storagenodecontext.cpp
+ distributornodecontext.cpp
+ servicelayernodecontext.cpp
+ storagenode.cpp
+ distributornode.cpp
+ servicelayernode.cpp
+ statereporter.cpp
+ changedbucketownershiphandler.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+ storage_visitor
+)
diff --git a/storage/src/vespa/storage/storageserver/applicationgenerationfetcher.h b/storage/src/vespa/storage/storageserver/applicationgenerationfetcher.h
new file mode 100644
index 00000000000..0e33c8c86c6
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/applicationgenerationfetcher.h
@@ -0,0 +1,23 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::ApplicationGenerationFetcher
+ * \ingroup storageserver
+ *
+ * \brief Interface for fetching application generation number and
+ * component name.
+ */
+
+#pragma once
+
+namespace storage {
+
+class ApplicationGenerationFetcher {
+public:
+ virtual ~ApplicationGenerationFetcher() {};
+
+ virtual int64_t getGeneration() const = 0;
+ virtual std::string getComponentName() const = 0;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageserver/bouncer.cpp b/storage/src/vespa/storage/storageserver/bouncer.cpp
new file mode 100644
index 00000000000..598d84b934d
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/bouncer.cpp
@@ -0,0 +1,295 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/bouncer.h>
+
+#include <vespa/log/log.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storageapi/message/persistence.h>
+
+LOG_SETUP(".bouncer");
+
+namespace storage {
+
+Bouncer::Bouncer(StorageComponentRegister& compReg, const config::ConfigUri & configUri)
+ : StorageLink("Bouncer"),
+ _config(new vespa::config::content::core::StorBouncerConfig()),
+ _component(compReg, "bouncer"),
+ _lock(),
+ _nodeState("s:i"),
+ _clusterState(&lib::State::UP),
+ _configFetcher(configUri.getContext())
+{
+ _component.getStateUpdater().addStateListener(*this);
+ // Register for config. Normally not critical, so catching config
+ // exception allowing program to continue if missing/faulty config.
+ try{
+ if (!configUri.empty()) {
+ _configFetcher.subscribe<vespa::config::content::core::StorBouncerConfig>(configUri.getConfigId(), this);
+ _configFetcher.start();
+ } else {
+ LOG(info, "No config id specified. Using defaults rather than "
+ "config");
+ }
+ } catch (config::InvalidConfigException& e) {
+ LOG(info, "Bouncer failed to load config '%s'. This "
+ "is not critical since it has sensible defaults: %s",
+ configUri.getConfigId().c_str(), e.what());
+ }
+}
+
+Bouncer::~Bouncer()
+{
+ closeNextLink();
+ LOG(debug, "Deleting link %s.", toString().c_str());
+}
+
+void
+Bouncer::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "Bouncer(" << _nodeState << ")";
+}
+
+void
+Bouncer::onClose()
+{
+ _configFetcher.close();
+ _component.getStateUpdater().removeStateListener(*this);
+}
+
+void
+Bouncer::configure(std::unique_ptr<vespa::config::content::core::StorBouncerConfig> config)
+{
+ validateConfig(*config);
+ vespalib::LockGuard lock(_lock);
+ _config = std::move(config);
+}
+
+void
+Bouncer::validateConfig(
+ const vespa::config::content::core::StorBouncerConfig& newConfig) const
+{
+ if (newConfig.feedRejectionPriorityThreshold != -1) {
+ if (newConfig.feedRejectionPriorityThreshold
+ > std::numeric_limits<api::StorageMessage::Priority>::max())
+ {
+ throw config::InvalidConfigException(
+ "feed_rejection_priority_threshold config value exceeds "
+ "maximum allowed value");
+ }
+ if (newConfig.feedRejectionPriorityThreshold
+ < std::numeric_limits<api::StorageMessage::Priority>::min())
+ {
+ throw config::InvalidConfigException(
+ "feed_rejection_priority_threshold config value lower than "
+ "minimum allowed value");
+ }
+ }
+}
+
+void
+Bouncer::abortCommandForUnavailableNode(api::StorageMessage& msg,
+ const lib::State& state)
+{
+ // If we're not up or retired, fail due to this nodes state.
+ std::shared_ptr<api::StorageReply> reply(
+ static_cast<api::StorageCommand&>(msg).makeReply().release());
+ std::ostringstream ost;
+ ost << "We don't allow command of type " << msg.getType()
+ << " when node is in state " << state.toString(true) << ".";
+ reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED, ost.str()));
+ sendUp(reply);
+}
+
+void
+Bouncer::abortCommandWithTooHighClockSkew(api::StorageMessage& msg,
+ int maxClockSkewInSeconds)
+{
+ std::shared_ptr<api::StorageReply> reply(
+ static_cast<api::StorageCommand&>(msg).makeReply().release());
+ std::ostringstream ost;
+ ost << "Message " << msg.getType() << " is more than "
+ << maxClockSkewInSeconds << " seconds in the future.";
+ reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED, ost.str()));
+ sendUp(reply);
+}
+
+void
+Bouncer::abortCommandDueToClusterDown(api::StorageMessage& msg)
+{
+ std::shared_ptr<api::StorageReply> reply(
+ static_cast<api::StorageCommand&>(msg).makeReply().release());
+ std::ostringstream ost;
+ ost << "We don't allow external load while cluster is in state "
+ << _clusterState->toString(true) << ".";
+ reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED, ost.str()));
+ sendUp(reply);
+}
+
+bool
+Bouncer::clusterIsUp() const
+{
+ return (*_clusterState == lib::State::UP);
+}
+
+uint64_t
+Bouncer::extractMutationTimestampIfAny(const api::StorageMessage& msg)
+{
+ switch (msg.getType().getId()) {
+ case api::MessageType::PUT_ID:
+ return static_cast<const api::PutCommand&>(msg).getTimestamp();
+ case api::MessageType::REMOVE_ID:
+ return static_cast<const api::RemoveCommand&>(msg).getTimestamp();
+ case api::MessageType::UPDATE_ID:
+ return static_cast<const api::UpdateCommand&>(msg).getTimestamp();
+ default:
+ return 0;
+ }
+}
+
+bool
+Bouncer::isExternalLoad(const api::MessageType& type) const noexcept
+{
+ switch (type.getId()) {
+ case api::MessageType::PUT_ID:
+ case api::MessageType::REMOVE_ID:
+ case api::MessageType::UPDATE_ID:
+ case api::MessageType::GET_ID:
+ case api::MessageType::VISITOR_CREATE_ID:
+ case api::MessageType::MULTIOPERATION_ID:
+ case api::MessageType::STATBUCKET_ID:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool
+Bouncer::isExternalWriteOperation(const api::MessageType& type) const noexcept {
+ switch (type.getId()) {
+ case api::MessageType::PUT_ID:
+ case api::MessageType::REMOVE_ID:
+ case api::MessageType::UPDATE_ID:
+ case api::MessageType::MULTIOPERATION_ID:
+ return true;
+ default:
+ return false;
+ }
+}
+
+void
+Bouncer::rejectDueToInsufficientPriority(
+ api::StorageMessage& msg,
+ api::StorageMessage::Priority feedPriorityLowerBound)
+{
+ std::shared_ptr<api::StorageReply> reply(
+ static_cast<api::StorageCommand&>(msg).makeReply().release());
+ std::ostringstream ost;
+ ost << "Operation priority (" << int(msg.getPriority())
+ << ") is lower than currently configured threshold ("
+ << int(feedPriorityLowerBound) << ") -- note that lower numbers "
+ "mean a higher priority. This usually means your application "
+ "has been reconfigured to deal with a transient upgrade or "
+ "load event";
+ reply->setResult(api::ReturnCode(api::ReturnCode::REJECTED, ost.str()));
+ sendUp(reply);
+}
+
+bool
+Bouncer::onDown(const std::shared_ptr<api::StorageMessage>& msg)
+{
+ const api::MessageType& type(msg->getType());
+ // All replies can come in.
+ if (type.isReply()) return false;
+
+ switch (type.getId()) {
+ case api::MessageType::SETNODESTATE_ID:
+ case api::MessageType::GETNODESTATE_ID:
+ case api::MessageType::SETSYSTEMSTATE_ID:
+ case api::MessageType::NOTIFYBUCKETCHANGE_ID:
+ // state commands are always ok
+ return false;
+ default:
+ break;
+ }
+ const lib::State* state;
+ int maxClockSkewInSeconds;
+ bool isInAvailableState;
+ bool abortLoadWhenClusterDown;
+ int feedPriorityLowerBound;
+ {
+ vespalib::LockGuard lock(_lock);
+ state = &_nodeState.getState();
+ maxClockSkewInSeconds = _config->maxClockSkewSeconds;
+ abortLoadWhenClusterDown = _config->stopExternalLoadWhenClusterDown;
+ isInAvailableState = state->oneOf(
+ _config->stopAllLoadWhenNodestateNotIn.c_str());
+ feedPriorityLowerBound = _config->feedRejectionPriorityThreshold;
+ }
+ // Special case for messages storage nodes are expected to get during
+ // initializing. Request bucket info will be queued so storage can
+ // answer them at the moment they are done initializing
+ if (*state == lib::State::INITIALIZING &&
+ type.getId() == api::MessageType::REQUESTBUCKETINFO_ID)
+ {
+ return false;
+ }
+ if (!isInAvailableState) {
+ abortCommandForUnavailableNode(*msg, *state);
+ return true;
+ }
+
+ // Allow all internal load to go through at this point
+ if (!isExternalLoad(type)) {
+ return false;
+ }
+ if (priorityRejectionIsEnabled(feedPriorityLowerBound)
+ && isExternalWriteOperation(type)
+ && (msg->getPriority() > feedPriorityLowerBound))
+ {
+ rejectDueToInsufficientPriority(*msg, feedPriorityLowerBound);
+ return true;
+ }
+
+ uint64_t timestamp = extractMutationTimestampIfAny(*msg);
+ if (timestamp != 0) {
+ timestamp /= 1000000;
+ uint64_t currentTime = _component.getClock().getTimeInSeconds().getTime();
+ if (timestamp > currentTime + maxClockSkewInSeconds) {
+ abortCommandWithTooHighClockSkew(*msg, maxClockSkewInSeconds);
+ return true;
+ }
+ }
+
+ // If cluster state is not up, fail external load
+ if (abortLoadWhenClusterDown && !clusterIsUp()) {
+ abortCommandDueToClusterDown(*msg);
+ return true;
+ }
+ return false;
+}
+
+void
+Bouncer::handleNewState()
+{
+ vespalib::LockGuard lock(_lock);
+ _nodeState = *_component.getStateUpdater().getReportedNodeState();
+ _clusterState = &_component.getStateUpdater().getSystemState()
+ ->getClusterState();
+ if (_config->useWantedStateIfPossible) {
+ // If current node state is more strict than our own reported state,
+ // set node state to our current state
+ lib::NodeState currState = _component.getStateUpdater().getSystemState()
+ ->getNodeState(lib::Node(_component.getNodeType(),
+ _component.getIndex()));
+ if (_nodeState.getState().maySetWantedStateForThisNodeState(
+ currState.getState()))
+ {
+ _nodeState = currState;
+ }
+ }
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/bouncer.h b/storage/src/vespa/storage/storageserver/bouncer.h
new file mode 100644
index 00000000000..8f6706d2cf8
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/bouncer.h
@@ -0,0 +1,87 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::Bouncer
+ * @ingroup storageserver
+ *
+ * @brief Denies messages from entering if state is not good.
+ *
+ * If we are not in up state, but process is still running, only a few
+ * messages should be allowed through. This link stops all messages not allowed.
+ */
+
+#pragma once
+
+#include <vespa/config/helper/configfetcher.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/common/storagecomponent.h>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storage/config/config-stor-bouncer.h>
+#include <vespa/vespalib/util/sync.h>
+
+namespace storage {
+
+class Bouncer : public StorageLink,
+ private StateListener,
+ private config::IFetcherCallback<vespa::config::content::core::StorBouncerConfig>
+{
+ std::unique_ptr<vespa::config::content::core::StorBouncerConfig> _config;
+ StorageComponent _component;
+ vespalib::Lock _lock;
+ lib::NodeState _nodeState;
+ const lib::State* _clusterState;
+ config::ConfigFetcher _configFetcher;
+
+public:
+ explicit Bouncer(StorageComponentRegister& compReg,
+ const config::ConfigUri & configUri);
+ ~Bouncer();
+
+ void print(std::ostream& out, bool verbose,
+ const std::string& indent) const override;
+
+ void configure(std::unique_ptr<vespa::config::content::core::StorBouncerConfig> config) override;
+
+private:
+ void validateConfig(
+ const vespa::config::content::core::StorBouncerConfig&) const;
+
+ void onClose() override;
+
+ void abortCommandForUnavailableNode(api::StorageMessage&,
+ const lib::State&);
+
+ void abortCommandWithTooHighClockSkew(api::StorageMessage& msg,
+ int maxClockSkewInSeconds);
+
+ void abortCommandDueToClusterDown(api::StorageMessage&);
+
+ void rejectDueToInsufficientPriority(api::StorageMessage&,
+ api::StorageMessage::Priority);
+
+ bool clusterIsUp() const;
+
+ bool isExternalLoad(const api::MessageType&) const noexcept;
+
+ bool isExternalWriteOperation(const api::MessageType&) const noexcept;
+
+ bool priorityRejectionIsEnabled(int configuredPriority) const noexcept {
+ return (configuredPriority != -1);
+ }
+
+ /**
+ * If msg is a command containing a mutating timestamp (put, remove or
+ * update commands), return that timestamp. Otherwise, return 0.
+ */
+ uint64_t extractMutationTimestampIfAny(const api::StorageMessage& msg);
+
+ bool onDown(const std::shared_ptr<api::StorageMessage>&) override;
+
+ void handleNewState() override;
+
+};
+
+} // storage
+
+
+
diff --git a/storage/src/vespa/storage/storageserver/bucketintegritychecker.cpp b/storage/src/vespa/storage/storageserver/bucketintegritychecker.cpp
new file mode 100644
index 00000000000..b39a44a857c
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/bucketintegritychecker.cpp
@@ -0,0 +1,671 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/bucketintegritychecker.h>
+
+#include <vespa/storage/common/bucketmessages.h>
+#include <vespa/storage/storageutil/log.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/config/config.h>
+
+LOG_SETUP(".bucketintegritychecker");
+
+using std::shared_ptr;
+
+namespace storage {
+
+namespace {
+ /*
+ std::string printDate(time_t time) {
+ char date[26];
+ struct tm datestruct;
+ struct tm* datestructptr = gmtime_r(&time, &datestruct);
+ assert(datestructptr);
+ char* result = asctime_r(&datestruct, date);
+ size_t size = strlen(result);
+ while (size > 0) {
+ bool stop = false;
+ switch (result[size - 1]) {
+ case '\n':
+ case '\r':
+ case '\f':
+ case '\t':
+ --size;
+ default:
+ stop = true;
+ break;
+ }
+ if (stop) break;
+ }
+ return std::string(result, size);
+ }
+ */
+
+ std::string printMinutesOfDay(uint32_t minutesOfDay) {
+ std::ostringstream ost;
+ uint32_t hours = minutesOfDay / 60;
+ uint32_t minutes = minutesOfDay % 60;
+ ost << (hours >= 10 ? hours / 10 : 0) << hours % 10 << ':'
+ << (minutes >= 10 ? minutes / 10 : 0) << minutes % 10;
+ return ost.str();
+ }
+
+ std::string printRunState(SchedulingOptions::RunState state) {
+ switch (state) {
+ case SchedulingOptions::DONT_RUN:
+ return "Not running";
+ case SchedulingOptions::RUN_FULL:
+ return "Running with full verification";
+ case SchedulingOptions::RUN_CHEAP:
+ return "Running with cheap verification";
+ case SchedulingOptions::CONTINUE:
+ return "Continuing any existing run";
+ default:
+ assert(false);
+ abort();
+ }
+ }
+}
+
+void
+SchedulingOptions::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose;
+ std::string ind = indent + " ";
+ out << "SchedulingOptions("
+ << "Daily cycle " << printMinutesOfDay(_dailyCycleStart)
+ << " - " << printMinutesOfDay(_dailyCycleStop)
+ << ",\n" << ind << "Weekly cycle"
+ << "\n" << ind << " Monday - " << printRunState(_dailyStates[1])
+ << "\n" << ind << " Tuesday - " << printRunState(_dailyStates[2])
+ << "\n" << ind << " Wednesday - " << printRunState(_dailyStates[3])
+ << "\n" << ind << " Thursday - " << printRunState(_dailyStates[4])
+ << "\n" << ind << " Friday - " << printRunState(_dailyStates[5])
+ << "\n" << ind << " Saturday - " << printRunState(_dailyStates[6])
+ << "\n" << ind << " Sunday - " << printRunState(_dailyStates[0])
+ << ",\n" << ind << "Max pending count " << _maxPendingCount
+ << ",\n" << ind << "Min cycle time "
+ << printMinutesOfDay(_minCycleTime.getTime() / 60)
+ << ",\n" << ind << "Request delay" << _requestDelay << " seconds."
+ << "\n" << indent << ")";
+}
+
+bool
+BucketIntegrityChecker::DiskData::done() const
+{
+ return (state == DONE && failedRepairs.empty() && pendingCount == 0);
+}
+
+bool
+BucketIntegrityChecker::DiskData::working() const
+{
+ return (state == IN_PROGRESS || !failedRepairs.empty()
+ || pendingCount != 0);
+}
+
+// Utilities to find next bucket in bucket list from a possibly non-existing one
+namespace {
+
+struct NextEntryFinder {
+ bool _first;
+ uint8_t _disk;
+ document::BucketId _last;
+ std::unique_ptr<document::BucketId> _next;
+
+ NextEntryFinder(const document::BucketId& id, uint8_t disk)
+ : _first(true), _disk(disk), _last(id), _next() {}
+
+ StorBucketDatabase::Decision operator()(document::BucketId::Type bucketId,
+ StorBucketDatabase::Entry& entry)
+ {
+ document::BucketId bucket(document::BucketId::keyToBucketId(bucketId));
+
+ if (entry.disk != _disk) {
+ return StorBucketDatabase::CONTINUE;
+ } else if (_first && bucket == _last) {
+ _first = false;
+ return StorBucketDatabase::CONTINUE;
+ } else {
+ _next.reset(new document::BucketId(bucket));
+ return StorBucketDatabase::ABORT;
+ }
+ }
+};
+
+std::unique_ptr<document::BucketId> getNextId(StorBucketDatabase& database,
+ const document::BucketId& last,
+ uint8_t disk)
+{
+ NextEntryFinder proc(last, disk);
+ database.each(proc, "BucketIntegrityChecker::getNextId", last.toKey());
+ return std::move(proc._next);
+}
+} // End of anonymous namespace
+
+document::BucketId
+BucketIntegrityChecker::DiskData::iterate(StorBucketDatabase& bucketDatabase)
+{
+ static uint32_t i=0;
+ // Resend failed buckets once in a while
+ if (failedRepairs.size() > 0 && ++i % 10 == 9)
+ {
+ document::BucketId bid(failedRepairs.front());
+ LOG(spam, "Scheduling next bucket %s from failed repairs list",
+ bid.toString().c_str());
+ failedRepairs.pop_front();
+ ++retriedBuckets;
+ return bid;
+ }
+ if (state == NOT_STARTED) {
+ // Guarantueed to be before all buckets.
+ currentBucket = document::BucketId(0, 0);
+ }
+ if (state != DONE) {
+ std::unique_ptr<document::BucketId> bid(
+ getNextId(bucketDatabase, currentBucket, disk));
+ if (bid.get()) {
+ state = IN_PROGRESS;
+ currentBucket = *bid;
+ return currentBucket;
+ } else {
+ state = DONE;
+ }
+ }
+ // If we didn't schedule repaired, but we ended up not having any other,
+ // take repaired once anyways
+ if (failedRepairs.size() > 0) {
+ document::BucketId bid(failedRepairs.front());
+ LOG(spam, "Done iterating, scheduling next bucket %s from failed "
+ "repairs list", bid.toString().c_str());
+ failedRepairs.pop_front();
+ ++retriedBuckets;
+ return bid;
+ }
+ return document::BucketId(0, 0);
+}
+
+BucketIntegrityChecker::BucketIntegrityChecker(
+ const config::ConfigUri & configUri,
+ ServiceLayerComponentRegister& compReg)
+ : StorageLinkQueued("Bucket integrity checker", compReg),
+ Runnable(),
+ framework::HtmlStatusReporter("bucketintegritychecker",
+ "Bucket integrity checker"),
+ _cycleCount(0),
+ _status(),
+ _lastCycleStart(0),
+ _cycleStartBucketCount(0),
+ _lastResponseTime(0),
+ _lastCycleCompleted(true),
+ _currentRunWithFullVerification(false),
+ _verifyAllRepairs(false),
+ _scheduleOptions(),
+ _systemState(),
+ _wait(),
+ _configFetcher(configUri.getContext()),
+ _maxThreadWaitTime(60 * 1000),
+ _component(compReg, "bucketintegritychecker")
+{
+ LOG(debug, "Configuring bucket integrity checker to work with %u disks.",
+ _component.getDiskCount());
+ _status.resize(_component.getDiskCount());
+ for (uint16_t i=0; i<_component.getDiskCount(); ++i) {
+ _status[i].disk = i;
+ }
+ if (_status.size() == 0) {
+ throw vespalib::IllegalStateException(
+ "Cannot have storage with no disks.", VESPA_STRLOC);
+ }
+ // Register for config. Normally not critical, so catching config
+ // exception allowing program to continue if missing/faulty config.
+ try{
+ if (!configUri.empty()) {
+ _configFetcher.subscribe<vespa::config::content::core::StorIntegritycheckerConfig>(configUri.getConfigId(), this);
+ _configFetcher.start();
+ } else {
+ LOG(info, "No config id specified. Using defaults rather than "
+ "config");
+ }
+ } catch (config::InvalidConfigException& e) {
+ LOG(info, "Bucket Integrity Checker failed to load config '%s'. This "
+ "is not critical since it has sensible defaults: %s",
+ configUri.getConfigId().c_str(), e.what());
+ }
+ _component.registerStatusPage(*this);
+}
+
+BucketIntegrityChecker::~BucketIntegrityChecker()
+{
+ // This can happen during unit testing
+ if (StorageLink::getState() == StorageLink::OPENED) {
+ LOG(error, "BucketIntegrityChecker deleted without calling close() "
+ "first");
+ close();
+ flush();
+ }
+ closeNextLink();
+}
+
+void
+BucketIntegrityChecker::onClose()
+{
+ // Avoid getting config during shutdown
+ _configFetcher.close();
+ // Close thread to ensure we don't send anything more down after
+ if (_thread.get() != 0) {
+ LOG(debug, "Waiting for bucket integrity worker thread to close.");
+ _thread->interruptAndJoin(&_wait);
+ LOG(debug, "Bucket integrity worker thread closed.");
+ }
+ StorageLinkQueued::onClose();
+}
+
+void BucketIntegrityChecker::bump() const {
+ vespalib::MonitorGuard monitor(_wait);
+ monitor.signal();
+}
+
+bool BucketIntegrityChecker::isWorkingOnCycle() const {
+ vespalib::MonitorGuard monitor(_wait);
+ for (uint32_t i=0; i<_status.size(); ++i) {
+ if (_status[i].working()) return true;
+ }
+ return (!_lastCycleCompleted);
+}
+
+uint32_t BucketIntegrityChecker::getCycleCount() const {
+ vespalib::MonitorGuard monitor(_wait);
+ return _cycleCount;
+}
+
+void
+BucketIntegrityChecker::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "BucketIntegrityChecker";
+}
+
+void
+BucketIntegrityChecker::configure(
+ std::unique_ptr<vespa::config::content::core::StorIntegritycheckerConfig> config)
+{
+ SchedulingOptions options;
+ options._dailyCycleStart = config->dailycyclestart;
+ options._dailyCycleStop = config->dailycyclestop;
+ options._maxPendingCount = config->maxpending;
+ options._minCycleTime = framework::SecondTime(60 * config->mincycletime);
+ options._requestDelay = framework::SecondTime(config->requestdelay);
+ std::string states = config->weeklycycle;
+ if (states.size() != 7) {
+ LOG(warning, "Not using integritychecker config: weeklycycle must "
+ "contain 7 characters, one for each week. Retrieved value:"
+ " '%s'.", states.c_str());
+ return;
+ }
+ for (uint32_t i=0; i<7; ++i) {
+ switch (states[i]) {
+ case 'R':
+ options._dailyStates[i] = SchedulingOptions::RUN_FULL; break;
+ case 'r':
+ options._dailyStates[i] = SchedulingOptions::RUN_CHEAP; break;
+ case 'c':
+ options._dailyStates[i] = SchedulingOptions::CONTINUE; break;
+ case '-':
+ options._dailyStates[i] = SchedulingOptions::DONT_RUN; break; default:
+ LOG(warning, "Not using integritychecker config: weeklycycle "
+ "contained illegal character %c.", states[i]);
+ return;
+ }
+ }
+ if (options._dailyCycleStart >= 24*60) {
+ LOG(warning, "Not using integritychecker config: dailycyclestart "
+ "is minutes since midnight and must be less than %u. "
+ "%u is out of range.", 24*60, options._dailyCycleStart);
+ return;
+ }
+ if (options._dailyCycleStop >= 24*60) {
+ LOG(warning, "Not using integritychecker config: dailycyclestart "
+ "is minutes since midnight and must be less than %u. "
+ "%u is out of range.", 24*60, options._dailyCycleStart);
+ return;
+ }
+ if (options._maxPendingCount > 1024) {
+ LOG(warning, "integritychecker config: Values above 1024 not "
+ "accepted. Got %u.", options._maxPendingCount);
+ return;
+ }
+ if (options._requestDelay > framework::SecondTime(60*60)) {
+ LOG(warning, "With a %" PRIu64 " second delay between each bucket "
+ "verification actually finishing a cycle will take a very "
+ "long time.",
+ options._requestDelay.getTime());
+ }
+ vespalib::MonitorGuard monitor(_wait);
+ if (options._minCycleTime.getMillis() < _maxThreadWaitTime) {
+ _maxThreadWaitTime = framework::MilliSecTime(1000);
+ monitor.signal();
+ } else {
+ _maxThreadWaitTime = framework::MilliSecTime(60 * 1000);
+ }
+ _scheduleOptions = options;
+}
+
+void BucketIntegrityChecker::onDoneInit()
+{
+ framework::MilliSecTime maxProcessingTime(60 * 1000);
+ _thread = _component.startThread(
+ *this, maxProcessingTime, _maxThreadWaitTime);
+}
+
+bool
+BucketIntegrityChecker::onInternalReply(
+ const std::shared_ptr<api::InternalReply>& internalReply)
+{
+ // We only care about repair bucket replies
+ shared_ptr<RepairBucketReply> reply(
+ std::dynamic_pointer_cast<RepairBucketReply>(internalReply));
+ if (!reply.get()) return false;
+
+ vespalib::MonitorGuard monitor(_wait);
+ _lastResponseTime = _component.getClock().getTimeInSeconds();
+ uint8_t disk = reply->getDisk();
+ --_status[disk].pendingCount;
+ LOG(spam, "Got repair reply for bucket %s: %s. %u messages still pending "
+ "for disk %u. Bucket altered ? %s",
+ reply->getBucketId().toString().c_str(),
+ reply->getResult().toString().c_str(),
+ _status[disk].pendingCount, disk,
+ (reply->bucketAltered() ? "true" : "false"));
+ if (reply->getResult().success()) {
+ LOG(spam, "Repaired handled ok");
+ ++_status[disk].checkedBuckets;
+ if (_status[disk].done()) {
+ bool completed = true;
+ for (uint32_t i=0; i<_status.size(); ++i) {
+ if (!_status[i].done()) {
+ completed = false;
+ break;
+ }
+ }
+ _lastCycleCompleted = completed;
+ }
+ } else if (reply->getResult().isNonCriticalForIntegrityChecker()) {
+ ++_status[disk].checkedBuckets;
+ LOGBP(debug, "Failed to repair bucket %s due to aborting request. "
+ "Likely bucket split/join or storage shutting down: %s",
+ reply->getBucketId().toString().c_str(),
+ reply->getResult().toString().c_str());
+ } else {
+ _status[disk].failedRepairs.push_back(reply->getBucketId());
+ LOGBP(warning, "Failed to perform maintenance on bucket %s, "
+ "scheduled to be retried: %s",
+ reply->getBucketId().toString().c_str(),
+ reply->getResult().toString().c_str());
+ }
+ if (_lastCycleCompleted) {
+ LOG(info, "Completed bucket integrity check cycle");
+ }
+ monitor.signal();
+ return true;
+}
+
+bool
+BucketIntegrityChecker::onSetSystemState(
+ const std::shared_ptr<api::SetSystemStateCommand>& cmd)
+{
+ vespalib::MonitorGuard monitor(_wait);
+ _systemState = cmd->getSystemState();
+ return false;
+}
+
+
+SchedulingOptions::RunState
+BucketIntegrityChecker::getCurrentRunState(
+ framework::SecondTime currentTime) const
+{
+ time_t currTime = currentTime.getTime();
+ struct tm date;
+ struct tm* dateptr = ::gmtime_r(&currTime, &date);
+ assert(dateptr);
+ (void) dateptr;
+ // Get initial state based on weekday
+ SchedulingOptions::RunState state(
+ _scheduleOptions._dailyStates[date.tm_wday]);
+ uint32_t minutesOfDay = 60 * date.tm_hour + date.tm_min;
+ if ((
+ _scheduleOptions._dailyCycleStart < _scheduleOptions._dailyCycleStop
+ && _scheduleOptions._dailyCycleStart <= minutesOfDay
+ && _scheduleOptions._dailyCycleStop > minutesOfDay
+ ) || (
+ _scheduleOptions._dailyCycleStart >= _scheduleOptions._dailyCycleStop
+ && (_scheduleOptions._dailyCycleStart <= minutesOfDay
+ || _scheduleOptions._dailyCycleStop > minutesOfDay)
+ )
+ )
+ { // If we're within region in day that we can run.
+//std::cerr << "We're inside time boundary. Current time: " << minutesOfDay << " (" << printMinutesOfDay(minutesOfDay) << "). Running between " << _scheduleOptions._dailyCycleStart << " (" << printMinutesOfDay(_scheduleOptions._dailyCycleStart) << ") - " << _scheduleOptions._dailyCycleStop << " (" << printMinutesOfDay(_scheduleOptions._dailyCycleStop) << ")\n";
+ if (state == SchedulingOptions::CONTINUE) {
+//std::cerr << "We're in continue state.\n";
+ // If we're in a continue state, set runstate if there's a current
+ // run active that isn't completed yet, don't run otherwise.
+ state = (_lastCycleCompleted
+ ? SchedulingOptions::DONT_RUN
+ : (_currentRunWithFullVerification
+ ? SchedulingOptions::RUN_FULL
+ : SchedulingOptions::RUN_CHEAP));
+ } else if (state == SchedulingOptions::RUN_FULL ||
+ state == SchedulingOptions::RUN_CHEAP) {
+ // If we're not currently in a run, and it's less than min cycle
+ // time since last run started, we might not want to run yet.
+ if (_lastCycleCompleted &&
+ currentTime - _lastCycleStart < _scheduleOptions._minCycleTime)
+ {
+ // Unless we didn't do full verification last and want to
+ // do full verification now, delay run.
+ if (_currentRunWithFullVerification ||
+ state == SchedulingOptions::RUN_CHEAP)
+ {
+//std::cerr << "Tagging dont run since too little time passed since last run\n" << "current time: " << currentTime << ", last start " << _lastCycleStart << ", min cycle time " << _scheduleOptions._minCycleTime << "\n";
+ state = SchedulingOptions::DONT_RUN;
+ } else {
+//std::cerr << "We can start new run. Last cycle started at " << _lastCycleStart.toString() << " current time is " << currentTime.toString() << " and min cycle time is " << _scheduleOptions._minCycleTime << "\n";
+ }
+ } else {
+//std::cerr << "Enough time passed? " << currentTime.toString() << " - " << _lastCycleStart.toString() << " >= " << _scheduleOptions._minCycleTime << "\n";
+ }
+ }
+ } else {
+ // If we're outside of time of day boundaries, don't run
+//std::cerr << "We're outside time boundary. Current time: " << minutesOfDay << " (" << printMinutesOfDay(minutesOfDay) << "). Only running between " << _scheduleOptions._dailyCycleStart << " (" << printMinutesOfDay(_scheduleOptions._dailyCycleStart) << ") - " << _scheduleOptions._dailyCycleStop << " (" << printMinutesOfDay(_scheduleOptions._dailyCycleStop) << ")\n";
+ state = SchedulingOptions::DONT_RUN;
+ }
+ return state;
+}
+
+void
+BucketIntegrityChecker::run(framework::ThreadHandle& thread)
+{
+ while (!thread.interrupted()) {
+ thread.registerTick(framework::PROCESS_CYCLE);
+ // Get the state based on the current time.
+ framework::SecondTime currentTime(
+ _component.getClock().getTimeInSeconds());
+
+ vespalib::MonitorGuard monitor(_wait);
+ SchedulingOptions::RunState state = getCurrentRunState(currentTime);
+ if (state != SchedulingOptions::RUN_FULL &&
+ state != SchedulingOptions::RUN_CHEAP)
+ {
+ // If we dont want to run at this hour, wait.
+ LOG(spam, "Not in a run state. Waiting.");
+ monitor.wait(_maxThreadWaitTime.getTime());
+ thread.registerTick(framework::WAIT_CYCLE);
+ } else if (state == SchedulingOptions::RUN_FULL && !_lastCycleCompleted
+ && !_currentRunWithFullVerification)
+ {
+ if (getTotalPendingCount() > 0) {
+ LOG(spam, "Waiting for last run to get pending to 0, before "
+ "restarting run to get full verification.");
+ monitor.wait(_maxThreadWaitTime.getTime());
+ thread.registerTick(framework::WAIT_CYCLE);
+ } else {
+ LOG(info, "Aborting current verification/repair cycle and "
+ "starting new one as we at this time want full "
+ "verification.");
+ for (uint32_t i=0; i<_status.size(); ++i) {
+ _status[i].state = DiskData::DONE;
+ }
+ _lastCycleCompleted = true;
+ }
+ } else if (_scheduleOptions._requestDelay.isSet()
+ && getTotalPendingCount() > 0)
+ {
+ LOG(spam, "Request delay. Waiting for 0 pending before possibly "
+ "sending new.");
+ // If request delay is used, we don't send anything new before
+ // all requests have been received.
+ monitor.wait(_maxThreadWaitTime.getTime());
+ thread.registerTick(framework::WAIT_CYCLE);
+ } else if (_scheduleOptions._requestDelay.isSet() &&
+ currentTime - _lastResponseTime
+ < _scheduleOptions._requestDelay)
+ {
+ LOG(spam, "Request delay. Waiting given seconds before sending "
+ "next.");
+ // If request delay is used and we haven't waited enough, wait more
+ framework::MilliSecTime delay(
+ (_scheduleOptions._requestDelay
+ - (currentTime - _lastResponseTime)).getMillis());
+ if (delay > _maxThreadWaitTime) delay = _maxThreadWaitTime;
+ monitor.wait(std::min(_maxThreadWaitTime.getTime(),
+ delay.getTime()));
+ thread.registerTick(framework::WAIT_CYCLE);
+ } else if (_lastCycleCompleted && getTotalPendingCount() > 0) {
+ LOG(spam, "Completed last cycle. Waiting until we have 0 pending "
+ "before possibly starting new cycle");
+ monitor.wait(_maxThreadWaitTime.getTime());
+ thread.registerTick(framework::WAIT_CYCLE);
+ } else {
+ LOG(spam, "Sending messages if we have less than max pending. "
+ "(Currently %u pending total, max is %u per disk)",
+ getTotalPendingCount(),
+ _scheduleOptions._maxPendingCount);
+ // Else we send up to max pending and wait for responses.
+ if (_lastCycleCompleted) {
+ for (uint32_t i=0; i<_status.size(); ++i) {
+ _status[i].state = DiskData::NOT_STARTED;
+ _status[i].failedRepairs.clear();
+ _status[i].checkedBuckets = 0;
+ _status[i].retriedBuckets = 0;
+ }
+ LOG(info, "Starting new verification/repair cycle at time %s.",
+ currentTime.toString().c_str());
+ _lastCycleStart = currentTime;
+ _cycleStartBucketCount = _component.getBucketDatabase().size();
+ _lastCycleCompleted = false;
+ _currentRunWithFullVerification
+ = (state == SchedulingOptions::RUN_FULL);
+ ++_cycleCount;
+ }
+ for (uint32_t i=0; i<_status.size(); ++i) {
+ while (_status[i].pendingCount
+ < _scheduleOptions._maxPendingCount)
+ {
+ document::BucketId bid(_status[i].iterate(
+ _component.getBucketDatabase()));
+ if (bid == document::BucketId(0, 0)) {
+ LOG(debug, "Completed repair cycle for disk %u.", i);
+ // If there is no next bucket, we might have completed
+ // run
+ bool completed = true;
+ for (uint32_t j=0; j<_status.size(); ++j) {
+ if (!_status[j].done()) {
+ completed = false;
+ break;
+ }
+ }
+ _lastCycleCompleted = completed;
+ if (_lastCycleCompleted) {
+ LOG(debug, "Repair cycle completed for all disks.");
+ }
+ break;
+ }
+
+ std::shared_ptr<RepairBucketCommand> cmd(
+ new RepairBucketCommand(bid, _status[i].disk));
+ cmd->verifyBody(_currentRunWithFullVerification);
+ cmd->moveToIdealDisk(true);
+ cmd->setPriority(230);
+ LOG(spam, "Sending new repair command for bucket %s. "
+ "After this, there will be %u pending on disk %u",
+ bid.toString().c_str(),
+ _status[i].pendingCount + 1, _status[i].disk);
+ ++_status[i].pendingCount;
+ dispatchDown(cmd);
+ }
+ }
+ monitor.wait(_maxThreadWaitTime.getTime());
+ thread.registerTick(framework::WAIT_CYCLE);
+ }
+ }
+}
+
+uint32_t
+BucketIntegrityChecker::getTotalPendingCount() const
+{
+ uint32_t total = 0;
+ for (uint32_t i=0; i<_status.size(); ++i) {
+ total += _status[i].pendingCount;
+ }
+ return total;
+}
+
+namespace {
+ template<typename T>
+ void printRow(std::ostream& out, const std::string& key, const T& val) {
+ out << "<tr><td>" << key << "</td><td><pre>" << val
+ << "</pre></td></tr>\n";
+ }
+}
+
+void
+BucketIntegrityChecker::reportHtmlStatus(std::ostream& out,
+ const framework::HttpUrlPath&) const
+{
+ vespalib::MonitorGuard monitor(_wait);
+ uint32_t totalChecked = 0, totalRetried = 0;
+ for (uint32_t i=0; i<_status.size(); ++i) {
+ totalChecked += _status[i].checkedBuckets;
+ totalRetried += _status[i].retriedBuckets;
+ }
+ out << "<table>\n";
+ printRow(out, "current status", _lastCycleCompleted
+ ? "Not running a cycle" : "Running a cycle");
+ printRow(out, "pending count", getTotalPendingCount());
+ std::string name = (_lastCycleCompleted ? "last" : "current");
+ if (_lastCycleStart.isSet()) {
+ printRow(out, name + " cycle start", _lastCycleStart.toString());
+ printRow(out, "buckets checked in " + name + " cycle",
+ totalChecked);
+ printRow(out, "buckets retried check in " + name + " cycle",
+ totalRetried);
+ printRow(out, "total buckets in database at start of " + name
+ + " cycle", _cycleStartBucketCount);
+ if (!_lastCycleCompleted) {
+ std::ostringstream ost;
+ ost << (100.0 * totalChecked / _cycleStartBucketCount) << " %";
+ printRow(out, "progress", ost.str());
+ }
+ }
+ if (_lastResponseTime.isSet()) {
+ printRow(out, "Last response time", _lastResponseTime.toString());
+ }
+ printRow(out, "Schedule options", _scheduleOptions);
+ out << "</table>\n";
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/bucketintegritychecker.h b/storage/src/vespa/storage/storageserver/bucketintegritychecker.h
new file mode 100644
index 00000000000..bbea3fab2a8
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/bucketintegritychecker.h
@@ -0,0 +1,160 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::BucketIntegrityChecker
+ * @ingroup storageserver
+ *
+ * @brief This class schedules buckets for integrity checks.
+ *
+ * @version $Id$
+ */
+
+#pragma once
+
+#include <vespa/storage/common/servicelayercomponent.h>
+#include <vespa/storage/common/storagelinkqueued.h>
+#include <vespa/storage/config/config-stor-integritychecker.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storageframework/storageframework.h>
+
+namespace storage {
+
+class RepairBucketReply;
+
+/** Options describing when and how parallel we should run. */
+struct SchedulingOptions : public document::Printable {
+ /** Time of day to start/resume cycle. Minutes after 00:00. 0 - 24*60-1. */
+ uint32_t _dailyCycleStart;
+ /** Time of day to pause cycle if it's still going. Minutes after 00:00. */
+ uint32_t _dailyCycleStop;
+
+ enum RunState { DONT_RUN, RUN_FULL, RUN_CHEAP, CONTINUE };
+ /** Which days to run cycle. */
+ RunState _dailyStates[7];
+
+ /** Max pending requests at the same time. */
+ uint32_t _maxPendingCount;
+ /** Minimum time between each cycle. */
+ framework::SecondTime _minCycleTime;
+ /** Seconds delay between requests if max pending == 1. */
+ framework::SecondTime _requestDelay;
+
+ SchedulingOptions()
+ : _dailyCycleStart(0),
+ _dailyCycleStop(0),
+ _maxPendingCount(5),
+ _minCycleTime(24 * 60 * 60), // One day
+ _requestDelay(0)
+ {
+ for (uint32_t i=0; i<7; ++i) { _dailyStates[i] = RUN_FULL; }
+ }
+
+ void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+};
+
+
+class BucketIntegrityChecker : public StorageLinkQueued,
+ private framework::Runnable,
+ public framework::HtmlStatusReporter,
+ private config::IFetcherCallback<vespa::config::content::core::StorIntegritycheckerConfig> {
+public:
+ struct DiskData {
+ /**
+ * State of bucket database iterating. If not started, we should
+ * take first bucket in bucket db, if in progress, take next after
+ * currentBucket, and if done, don't do anymore.
+ */
+ enum State { NOT_STARTED, IN_PROGRESS, DONE };
+
+ document::BucketId currentBucket;
+ uint32_t pendingCount;
+ State state;
+ uint8_t disk;
+ std::list<document::BucketId> failedRepairs;
+ uint32_t checkedBuckets;
+ uint32_t retriedBuckets;
+
+ DiskData() : currentBucket(0), pendingCount(0),
+ state(NOT_STARTED), disk(255),
+ checkedBuckets(0), retriedBuckets(0) {}
+
+ bool done() const; // Whether we're still working on this disk
+ bool working() const; // Whether we've stated and not finished
+ /**
+ * Get the next bucket to repair. If no more to iterate, random bucket
+ * is returned. Check if done() afterwards.
+ */
+ document::BucketId iterate(StorBucketDatabase&);
+ };
+
+private:
+ uint32_t _cycleCount;
+ std::vector<DiskData> _status;
+ framework::SecondTime _lastCycleStart;
+ uint32_t _cycleStartBucketCount;
+ framework::SecondTime _lastResponseTime;
+ bool _lastCycleCompleted;
+ bool _currentRunWithFullVerification;
+ bool _verifyAllRepairs;
+ SchedulingOptions _scheduleOptions;
+ lib::ClusterState _systemState;
+ vespalib::Monitor _wait;
+ config::ConfigFetcher _configFetcher;
+ framework::MilliSecTime _maxThreadWaitTime;
+ ServiceLayerComponent _component;
+ framework::Thread::UP _thread;
+
+ BucketIntegrityChecker(const BucketIntegrityChecker &);
+ BucketIntegrityChecker& operator=(const BucketIntegrityChecker &);
+
+public:
+ BucketIntegrityChecker(const config::ConfigUri & configUri,
+ ServiceLayerComponentRegister&);
+ ~BucketIntegrityChecker();
+
+ virtual void onClose();
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ SchedulingOptions& getSchedulingOptions() { return _scheduleOptions; }
+
+ bool isWorkingOnCycle() const;
+
+ uint32_t getCycleCount() const;
+
+ /** Give thread a bump by signalling it. */
+ void bump() const;
+
+ void setMaxThreadWaitTime(framework::MilliSecTime milliSecs)
+ { _maxThreadWaitTime = milliSecs; }
+
+ framework::Clock& getClock() { return _component.getClock(); }
+
+private:
+ virtual void configure(std::unique_ptr<vespa::config::content::core::StorIntegritycheckerConfig>);
+
+ void onDoneInit();
+
+ bool onInternalReply(const std::shared_ptr<api::InternalReply>&);
+ bool onSetSystemState(const std::shared_ptr<api::SetSystemStateCommand>&);
+ bool onNotifyBucketChangeReply(
+ const std::shared_ptr<api::NotifyBucketChangeReply>&)
+ { return true; }
+
+ SchedulingOptions::RunState getCurrentRunState(
+ framework::SecondTime time) const;
+
+ virtual void run(framework::ThreadHandle&);
+
+ uint32_t getTotalPendingCount() const;
+
+ // Status::Reporter implementation
+ virtual void reportHtmlStatus(std::ostream&,
+ const framework::HttpUrlPath&) const;
+
+};
+
+}
+
diff --git a/storage/src/vespa/storage/storageserver/changedbucketownershiphandler.cpp b/storage/src/vespa/storage/storageserver/changedbucketownershiphandler.cpp
new file mode 100644
index 00000000000..5b742ad524d
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/changedbucketownershiphandler.cpp
@@ -0,0 +1,398 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/common/messagebucketid.h>
+#include <vespa/storage/common/servicelayercomponent.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/storage/storageserver/changedbucketownershiphandler.h>
+#include <vespa/log/log.h>
+
+LOG_SETUP(".bucketownershiphandler");
+
+namespace storage {
+
+ChangedBucketOwnershipHandler::ChangedBucketOwnershipHandler(
+ const config::ConfigUri& configUri,
+ ServiceLayerComponentRegister& compReg)
+ : StorageLink("Changed bucket ownership handler"),
+ _component(compReg, "changedbucketownershiphandler"),
+ _metrics(),
+ _configFetcher(configUri.getContext()),
+ _stateLock(),
+ _currentDistribution(_component.getDistribution()),
+ _currentState(), // Not set yet, so ownership will not be valid
+ _currentOwnership(std::make_shared<OwnershipState>(
+ _currentDistribution, _currentState)),
+ _abortQueuedAndPendingOnStateChange(false),
+ _abortMutatingIdealStateOps(false),
+ _abortMutatingExternalLoadOps(false)
+{
+ _configFetcher.subscribe<vespa::config::content::PersistenceConfig>(configUri.getConfigId(), this);
+ _configFetcher.start();
+ _component.registerMetric(_metrics);
+}
+
+ChangedBucketOwnershipHandler::~ChangedBucketOwnershipHandler()
+{
+}
+
+void
+ChangedBucketOwnershipHandler::configure(
+ std::unique_ptr<vespa::config::content::PersistenceConfig> config)
+{
+ _abortQueuedAndPendingOnStateChange.store(
+ config->abortOperationsWithChangedBucketOwnership,
+ std::memory_order_relaxed);
+ _abortMutatingIdealStateOps.store(
+ config->abortOutdatedMutatingIdealStateOps,
+ std::memory_order_relaxed);
+ _abortMutatingExternalLoadOps.store(
+ config->abortOutdatedMutatingExternalLoadOps,
+ std::memory_order_relaxed);
+}
+
+void
+ChangedBucketOwnershipHandler::reloadClusterState()
+{
+ vespalib::LockGuard guard(_stateLock);
+ lib::ClusterState::CSP newState(_component.getStateUpdater()
+ .getSystemState());
+ setCurrentOwnershipWithStateNoLock(*newState);
+}
+
+void
+ChangedBucketOwnershipHandler::setCurrentOwnershipWithStateNoLock(
+ const lib::ClusterState& newState)
+{
+ _currentState = std::make_shared<lib::ClusterState>(newState);
+ _currentOwnership = std::make_shared<OwnershipState>(
+ _currentDistribution, _currentState);
+}
+
+namespace {
+
+bool
+allDistributorsDownInState(const lib::ClusterState& state) {
+ using lib::NodeType;
+ using lib::Node;
+ uint16_t nodeCount(state.getNodeCount(NodeType::DISTRIBUTOR));
+ for (uint16_t i = 0; i < nodeCount; ++i) {
+ if (state.getNodeState(Node(NodeType::DISTRIBUTOR, i)).getState().oneOf("ui")) {
+ return false;
+ }
+ }
+ return true;
+}
+
+}
+
+uint16_t
+ChangedBucketOwnershipHandler::OwnershipState::ownerOf(
+ const document::BucketId& bucket) const
+{
+ try {
+ return _distribution->getIdealDistributorNode(*_state, bucket);
+ } catch (lib::TooFewBucketBitsInUseException& e) {
+ LOGBP(debug,
+ "Too few bucket bits used for %s to be assigned to "
+ "a distributor.",
+ bucket.toString().c_str());
+ } catch (lib::NoDistributorsAvailableException& e) {
+ LOGBP(warning,
+ "Got exception with no distributors available when checking "
+ "bucket owner; this should not happen as we explicitly check "
+ "for available distributors before reaching this code path! "
+ "Cluster state is '%s', distribution is '%s'",
+ _state->toString().c_str(),
+ _distribution->toString().c_str());
+ } catch (const std::exception& e) {
+ LOG(error,
+ "Got unknown exception while resolving distributor: %s",
+ e.what());
+ }
+ return FAILED_TO_RESOLVE;
+}
+
+void
+ChangedBucketOwnershipHandler::logTransition(
+ const lib::ClusterState& currentState,
+ const lib::ClusterState& newState) const
+{
+ LOG(debug,
+ "State transition '%s' -> '%s' changes distributor bucket ownership, "
+ "so must abort queued operations for the affected buckets.",
+ currentState.toString().c_str(),
+ newState.toString().c_str());
+}
+
+namespace {
+
+class StateDiffLazyAbortPredicate
+ : public AbortBucketOperationsCommand::AbortPredicate
+{
+ // Ownership states wrap a couple of shared_ptrs and are thus cheap to
+ // copy and store.
+ ChangedBucketOwnershipHandler::OwnershipState _oldState;
+ ChangedBucketOwnershipHandler::OwnershipState _newState;
+ // Fast path to avoid trying (and failing) to compute owner in a state
+ // where all distributors are down.
+ bool _allDistributorsHaveGoneDown;
+
+ bool doShouldAbort(const document::BucketId& b) const override {
+ if (_allDistributorsHaveGoneDown) {
+ return true;
+ }
+ uint16_t oldOwner(_oldState.ownerOf(b));
+ uint16_t newOwner(_newState.ownerOf(b));
+ if (oldOwner != newOwner) {
+ LOG(spam, "Owner of %s was %u, now %u. Operation should be aborted",
+ b.toString().c_str(), oldOwner, newOwner);
+ return true;
+ }
+ return false;
+ }
+public:
+ StateDiffLazyAbortPredicate(
+ const ChangedBucketOwnershipHandler::OwnershipState& oldState,
+ const ChangedBucketOwnershipHandler::OwnershipState& newState)
+ : _oldState(oldState),
+ _newState(newState),
+ _allDistributorsHaveGoneDown(
+ allDistributorsDownInState(newState.getState()))
+ {
+ }
+};
+
+}
+
+std::unique_ptr<AbortBucketOperationsCommand::AbortPredicate>
+ChangedBucketOwnershipHandler::makeLazyAbortPredicate(
+ const OwnershipState::CSP& oldOwnership,
+ const OwnershipState::CSP& newOwnership) const
+{
+ return std::unique_ptr<AbortBucketOperationsCommand::AbortPredicate>(
+ new StateDiffLazyAbortPredicate(*oldOwnership, *newOwnership));
+}
+
+/*
+ * If we go from:
+ * 1) Not all down -> all distributors down
+ * - abort ops for _all_ buckets
+ * 2) All distributors down -> not down
+ * - no-op, since down edge must have been handled first
+ * 3) All down -> all down
+ * - no-op
+ * 4) Some nodes down or up
+ * - abort ops for buckets that have changed ownership between
+ * current and new cluster state.
+ */
+bool
+ChangedBucketOwnershipHandler::onSetSystemState(
+ const std::shared_ptr<api::SetSystemStateCommand>& stateCmd)
+{
+ if (!enabledOperationAbortingOnStateChange()) {
+ LOG(debug, "Operation aborting is config-disabled");
+ return false; // Early out.
+ }
+ OwnershipState::CSP oldOwnership;
+ OwnershipState::CSP newOwnership;
+ // Get old state and update own current cluster state _before_ it is
+ // applied to the rest of the system. This helps ensure that no message
+ // can get through in the off-case that the lower level storage links
+ // don't apply the state immediately for some reason.
+ {
+ vespalib::LockGuard guard(_stateLock);
+ oldOwnership = _currentOwnership;
+ setCurrentOwnershipWithStateNoLock(stateCmd->getSystemState());
+ newOwnership = _currentOwnership;
+ }
+ assert(newOwnership->valid());
+ // If we're going from not having a state to having a state, we per
+ // definition cannot possibly have gotten any load that needs aborting,
+ // as no such load is allowed through this component when this is the
+ // case.
+ if (!oldOwnership->valid()) {
+ return false;
+ }
+
+ if (allDistributorsDownInState(oldOwnership->getState())) {
+ LOG(debug, "No need to send aborts on transition '%s' -> '%s'",
+ oldOwnership->getState().toString().c_str(),
+ newOwnership->getState().toString().c_str());
+ return false;
+ }
+ logTransition(oldOwnership->getState(), newOwnership->getState());
+
+ metrics::MetricTimer durationTimer;
+ auto predicate(makeLazyAbortPredicate(oldOwnership, newOwnership));
+ AbortBucketOperationsCommand::SP cmd(
+ new AbortBucketOperationsCommand(std::move(predicate)));
+
+ // Will not return until all operation aborts have been performed
+ // on the lower level links, at which point it is safe to send down
+ // the SetSystemStateCommand.
+ sendDown(cmd);
+
+ durationTimer.stop(_metrics.averageAbortProcessingTime);
+ return false;
+}
+
+/**
+ * Invoked whenever a distribution config change happens and is called in the
+ * context of the config updater thread (which is why we have to lock).
+ */
+void
+ChangedBucketOwnershipHandler::storageDistributionChanged()
+{
+ vespalib::LockGuard guard(_stateLock);
+ _currentDistribution = _component.getDistribution();
+ _currentOwnership = std::make_shared<OwnershipState>(
+ _currentDistribution, _currentState);
+}
+
+bool
+ChangedBucketOwnershipHandler::isMutatingIdealStateOperation(
+ const api::StorageMessage& msg) const
+{
+ switch (msg.getType().getId()) {
+ case api::MessageType::CREATEBUCKET_ID:
+ case api::MessageType::MERGEBUCKET_ID:
+ case api::MessageType::DELETEBUCKET_ID:
+ case api::MessageType::SPLITBUCKET_ID:
+ case api::MessageType::JOINBUCKETS_ID:
+ // Note: RemoveLocation is external load, but is used to implement GC and
+ // must thus be treated as an ideal state operation for that purpose.
+ case api::MessageType::REMOVELOCATION_ID:
+ case api::MessageType::SETBUCKETSTATE_ID:
+ return true;
+ default:
+ return false;
+ }
+}
+
+
+bool
+ChangedBucketOwnershipHandler::isMutatingExternalOperation(
+ const api::StorageMessage& msg) const
+{
+ switch (msg.getType().getId()) {
+ case api::MessageType::PUT_ID:
+ case api::MessageType::REMOVE_ID:
+ case api::MessageType::MULTIOPERATION_ID:
+ case api::MessageType::UPDATE_ID:
+ case api::MessageType::REVERT_ID:
+ return true;
+ default:
+ return false;
+ }
+}
+
+ChangedBucketOwnershipHandler::OwnershipState::CSP
+ChangedBucketOwnershipHandler::getCurrentOwnershipState() const
+{
+ vespalib::LockGuard guard(_stateLock);
+ return _currentOwnership;
+}
+
+bool
+ChangedBucketOwnershipHandler::sendingDistributorOwnsBucketInCurrentState(
+ const api::StorageCommand& cmd) const
+{
+ OwnershipState::CSP current(getCurrentOwnershipState());
+ if (!current->valid()) {
+ LOG(debug, "No cluster state received yet, must bounce message '%s'",
+ cmd.toString().c_str());
+ return false;
+ }
+
+ try {
+ document::BucketId opBucket(getStorageMessageBucketId(cmd));
+ return (current->ownerOf(opBucket) == cmd.getSourceIndex());
+ } catch (vespalib::IllegalArgumentException& e) {
+ LOG(error,
+ "Precondition violation: unable to get bucket from "
+ "message: %s",
+ e.toString().c_str());
+ assert(false);
+ }
+ return false; // Unreachable statement.
+}
+
+void
+ChangedBucketOwnershipHandler::abortOperation(api::StorageCommand& cmd)
+{
+ api::StorageReply::SP reply(cmd.makeReply());
+ reply->setResult(api::ReturnCode(
+ api::ReturnCode::ABORTED,
+ "Operation aborted to prevent inconsistencies caused by a "
+ "change in bucket ownership"));
+ sendUp(reply);
+ if (isMutatingIdealStateOperation(cmd)) {
+ _metrics.idealStateOpsAborted.inc();
+ } else {
+ _metrics.externalLoadOpsAborted.inc();
+ }
+}
+
+bool
+ChangedBucketOwnershipHandler::isMutatingCommandAndNeedsChecking(
+ const api::StorageMessage& msg) const
+{
+ if (enabledIdealStateAborting() && isMutatingIdealStateOperation(msg)) {
+ return true;
+ }
+ if (enabledExternalLoadAborting() && isMutatingExternalOperation(msg)) {
+ return true;
+ }
+ return false;
+}
+
+bool
+ChangedBucketOwnershipHandler::onDown(
+ const std::shared_ptr<api::StorageMessage>& msg)
+{
+ if (msg->getType() == api::MessageType::SETSYSTEMSTATE) {
+ return onSetSystemState(
+ std::static_pointer_cast<api::SetSystemStateCommand>(msg));
+ }
+ if (!isMutatingCommandAndNeedsChecking(*msg)) {
+ return false;
+ }
+ api::StorageCommand& cmd(static_cast<api::StorageCommand&>(*msg));
+ if (!sendingDistributorOwnsBucketInCurrentState(cmd)) {
+ abortOperation(cmd);
+ return true;
+ }
+ return false;
+}
+
+bool
+ChangedBucketOwnershipHandler::enabledOperationAbortingOnStateChange() const
+{
+ return _abortQueuedAndPendingOnStateChange.load(std::memory_order_relaxed);
+}
+
+bool
+ChangedBucketOwnershipHandler::enabledIdealStateAborting() const
+{
+ return _abortMutatingIdealStateOps.load(std::memory_order_relaxed);
+}
+
+bool
+ChangedBucketOwnershipHandler::enabledExternalLoadAborting() const
+{
+ return _abortMutatingExternalLoadOps.load(std::memory_order_relaxed);
+}
+
+bool
+ChangedBucketOwnershipHandler::onInternalReply(
+ const std::shared_ptr<api::InternalReply>& reply)
+{
+ // Just swallow reply, we don't do anything with it.
+ return (reply->getType() == AbortBucketOperationsReply::ID);
+}
+
+}
diff --git a/storage/src/vespa/storage/storageserver/changedbucketownershiphandler.h b/storage/src/vespa/storage/storageserver/changedbucketownershiphandler.h
new file mode 100644
index 00000000000..6027704be61
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/changedbucketownershiphandler.h
@@ -0,0 +1,218 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/vespalib/util/sync.h>
+#include <vespa/metrics/metrics.h>
+#include <vespa/config/config.h>
+#include <vespa/config-persistence.h>
+#include <atomic>
+#include <vector>
+
+namespace storage {
+
+class ServiceLayerComponent;
+
+/**
+ * The changed bucket ownership handler is a storage link that synchronously
+ * intercepts attempts to change the state on the node and ensure any
+ * operations to buckets whose ownership is changed are aborted.
+ *
+ * If default config is used, all mutating ideal state operations for buckets
+ * that--upon time of checking in this handler--belong to a different
+ * distributor than the one specified as the sender will be aborted.
+ *
+ * We consider the following operations as mutating ideal state ops:
+ * - SplitBucketCommand
+ * - JoinBucketsCommand
+ * - MergeBucketsCommand (already blocked by throttler, but let's not
+ * let that stop us)
+ * - RemoveLocationCommand (technically an external load op, but is used by
+ * the GC functionality and must therefore be included here)
+ * - SetBucketStateCommand
+ * - DeleteBucketCommand
+ * - CreateBucketCommand
+ *
+ * If default config is used, all mutating external operations with altered
+ * bucket owneship will also be aborted.
+ *
+ * We consider the following external operations as mutating:
+ * - PutCommand
+ * - MultiOperationCommand
+ * - UpdateCommand
+ * - RemoveCommand
+ * - RevertCommand
+ */
+class ChangedBucketOwnershipHandler
+ : public StorageLink,
+ private config::IFetcherCallback<vespa::config::content::PersistenceConfig>
+{
+public:
+ class Metrics : public metrics::MetricSet
+ {
+ public:
+ metrics::LongAverageMetric averageAbortProcessingTime;
+ metrics::LongCountMetric idealStateOpsAborted;
+ metrics::LongCountMetric externalLoadOpsAborted;
+
+ Metrics(metrics::MetricSet* owner = 0)
+ : metrics::MetricSet("changedbucketownershiphandler",
+ "", "", owner),
+ averageAbortProcessingTime(
+ "avg_abort_processing_time", "",
+ "Average time spent aborting operations for changed "
+ "buckets", this),
+ idealStateOpsAborted(
+ "ideal_state_ops_aborted", "",
+ "Number of outdated ideal state operations aborted",
+ this),
+ externalLoadOpsAborted(
+ "external_load_ops_aborted", "",
+ "Number of outdated external load operations aborted",
+ this)
+ {}
+ };
+
+ /**
+ * Wrapper around the distribution & state pair that decides how to
+ * compute the owner distributor for a bucket. It's possible to have
+ * an ownership state with a nullptr cluster state when the node
+ * initially starts up, which is why no owership state must be used unless
+ * invoking valid() on it returns true.
+ */
+ class OwnershipState
+ {
+ lib::Distribution::SP _distribution;
+ lib::ClusterState::CSP _state;
+ public:
+ using SP = std::shared_ptr<OwnershipState>;
+ using CSP = std::shared_ptr<const OwnershipState>;
+
+ OwnershipState(const lib::Distribution::SP& distribution,
+ const lib::ClusterState::CSP& state)
+ : _distribution(distribution),
+ _state(state)
+ {
+ }
+
+ static const uint16_t FAILED_TO_RESOLVE = 0xffff;
+
+ bool valid() const {
+ return ((_distribution.get() != nullptr)
+ && (_state.get() != nullptr));
+ }
+
+ /**
+ * Precondition: valid() == true.
+ */
+ const lib::ClusterState& getState() const {
+ assert(valid());
+ return *_state;
+ }
+
+ uint16_t ownerOf(const document::BucketId& bucket) const;
+ };
+
+ /**
+ * For unit testing only; trigger a reload of the cluster state from the
+ * component registry, since tests may want to set the cluster state
+ * explicitly without sending a message through the chain.
+ */
+ void reloadClusterState();
+
+private:
+ ServiceLayerComponent _component;
+ Metrics _metrics;
+ config::ConfigFetcher _configFetcher;
+ vespalib::Lock _stateLock;
+ lib::Distribution::SP _currentDistribution;
+ lib::ClusterState::CSP _currentState;
+ OwnershipState::CSP _currentOwnership;
+
+ std::atomic<bool> _abortQueuedAndPendingOnStateChange;
+ std::atomic<bool> _abortMutatingIdealStateOps;
+ std::atomic<bool> _abortMutatingExternalLoadOps;
+
+ std::unique_ptr<AbortBucketOperationsCommand::AbortPredicate>
+ makeLazyAbortPredicate(
+ const OwnershipState::CSP& oldOwnership,
+ const OwnershipState::CSP& newOwnership) const;
+
+ void logTransition(const lib::ClusterState& currentState,
+ const lib::ClusterState& newState) const;
+
+ /**
+ * Creates a new immutable OwnershipState based on the current distribution
+ * and the provided cluster state and assigns it to _currentOwnership.
+ */
+ void setCurrentOwnershipWithStateNoLock(const lib::ClusterState&);
+
+ /**
+ * Grabs _stateLock and returns a shared_ptr to the current ownership
+ * state, which may or may not be valid().
+ */
+ OwnershipState::CSP getCurrentOwnershipState() const;
+
+ bool isMutatingCommandAndNeedsChecking(const api::StorageMessage&) const;
+
+ bool isMutatingIdealStateOperation(const api::StorageMessage&) const;
+
+ bool isMutatingExternalOperation(const api::StorageMessage&) const;
+ /**
+ * Returns whether the operation in cmd has a bucket whose ownership in
+ * the current cluster state does not match the distributor marked as
+ * being the sender in the message itself.
+ *
+ * Precondition: cmd is an instance of a message type containing a bucket
+ * identifier.
+ */
+ bool sendingDistributorOwnsBucketInCurrentState(
+ const api::StorageCommand& cmd) const;
+ /**
+ * Creates a reply for cmd, assigns an ABORTED return code and sends the
+ * reply back up the storage chain.
+ */
+ void abortOperation(api::StorageCommand& cmd);
+
+ /**
+ * Returns whether aborting queued, changed ops and waiting for pending
+ * changed ops is enabled through config.
+ */
+ bool enabledOperationAbortingOnStateChange() const;
+
+ /**
+ * Returns whether aborting outdated ideal state operations has been enabled
+ * through config.
+ */
+ bool enabledIdealStateAborting() const;
+
+ bool enabledExternalLoadAborting() const;
+
+public:
+ ChangedBucketOwnershipHandler(const config::ConfigUri& configUri,
+ ServiceLayerComponentRegister& compReg);
+ ~ChangedBucketOwnershipHandler();
+
+ bool onSetSystemState(
+ const std::shared_ptr<api::SetSystemStateCommand>&) override;
+ bool onDown(const std::shared_ptr<api::StorageMessage>&) override;
+
+ bool onInternalReply(
+ const std::shared_ptr<api::InternalReply>& reply) override;
+
+ void configure(std::unique_ptr<vespa::config::content::PersistenceConfig>) override;
+
+ /**
+ * We want to ensure distribution config changes are thread safe wrt. our
+ * own state, so we make sure to get notified when these happen so we can
+ * do explicit locked updates.
+ */
+ void storageDistributionChanged() override;
+
+ const Metrics& getMetrics() const { return _metrics; }
+};
+
+}
diff --git a/storage/src/vespa/storage/storageserver/communicationmanager.cpp b/storage/src/vespa/storage/storageserver/communicationmanager.cpp
new file mode 100644
index 00000000000..21c1e68cbcc
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/communicationmanager.cpp
@@ -0,0 +1,850 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/communicationmanager.h>
+
+#include <vespa/log/log.h>
+#include <queue>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/messagebus/emptyreply.h>
+#include <vespa/messagebus/rpcmessagebus.h>
+#include <vespa/messagebus/sourcesessionparams.h>
+#include <vespa/storage/storageserver/documentapiconverter.h>
+#include <vespa/storage/config/config-stor-server.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/storageserver/storagemetricsset.h>
+#include <vespa/storageframework/storageframework.h>
+
+LOG_SETUP(".communication.manager");
+
+namespace storage {
+
+PriorityQueue::PriorityQueue() :
+ _queue(),
+ _queueMonitor(),
+ _msgCounter(0)
+{
+}
+
+PriorityQueue::~PriorityQueue()
+{
+}
+
+bool PriorityQueue::getNext(std::shared_ptr<api::StorageMessage>& msg,
+ int timeout)
+{
+ vespalib::MonitorGuard sync(_queueMonitor);
+ bool first = true;
+ while (true) { // Max twice
+ if (!_queue.empty()) {
+ LOG(spam, "Picking message from queue");
+ msg = _queue.top().second;
+ _queue.pop();
+ return true;
+ }
+ if (timeout == 0 || !first) {
+ return false;
+ }
+ sync.wait(timeout);
+ first = false;
+ }
+
+ return false;
+}
+
+void
+PriorityQueue::enqueue(const std::shared_ptr<api::StorageMessage>& msg)
+{
+ vespalib::MonitorGuard sync(_queueMonitor);
+ const uint8_t priority(msg->getType().isReply()
+ ? FIXED_REPLY_PRIORITY
+ : msg->getPriority());
+ Key key(priority, _msgCounter);
+ // We make a simplifying--though reasonable--assumption that we'll never
+ // process more than UINT64_MAX replies before process restart.
+ ++_msgCounter;
+ _queue.push(std::make_pair(key, msg));
+ sync.unsafeSignalUnlock();
+}
+
+void
+PriorityQueue::signal()
+{
+ vespalib::MonitorGuard sync(_queueMonitor);
+ sync.unsafeSignalUnlock();
+}
+
+int
+PriorityQueue::size()
+{
+ vespalib::MonitorGuard sync(_queueMonitor);
+ return _queue.size();
+}
+
+const framework::MemoryAllocationType&
+CommunicationManager::getAllocationType(api::StorageMessage& msg) const
+{
+ return _messageAllocTypes.getType(msg.getType().getId());
+}
+
+
+void
+CommunicationManager::receiveStorageReply(
+ const std::shared_ptr<api::StorageReply>& reply)
+{
+ assert(reply.get());
+ enqueue(reply);
+}
+
+namespace {
+ vespalib::string getNodeId(StorageComponent& sc) {
+ vespalib::asciistream ost;
+ ost << sc.getClusterName() << "/" << sc.getNodeType()
+ << "/" << sc.getIndex();
+ return ost.str();
+ }
+
+ framework::SecondTime TEN_MINUTES(600);
+
+}
+
+void
+CommunicationManager::handleMessage(std::unique_ptr<mbus::Message> msg)
+{
+ MBUS_TRACE(msg->getTrace(), 4, getNodeId(_component)
+ + " CommunicationManager: Received message from message bus");
+ // Relaxed load since we're not doing any dependent reads that aren't
+ // already covered by some other form of explicit synchronization.
+ if (_closed.load(std::memory_order_relaxed)) {
+ LOG(debug, "Not handling command of type %d as we have closed down",
+ msg->getType());
+ MBUS_TRACE(msg->getTrace(), 6,
+ "Communication manager: Failing message as we are closed");
+ std::unique_ptr<mbus::Reply> reply(new mbus::EmptyReply());
+ reply->addError(mbus::Error(
+ documentapi::DocumentProtocol::ERROR_ABORTED,
+ "Node shutting down"));
+ msg->swapState(*reply);
+ _messageBusSession->reply(std::move(reply));
+ return;
+ }
+ const vespalib::string & protocolName = msg->getProtocol();
+
+ if (protocolName == documentapi::DocumentProtocol::NAME) {
+ std::unique_ptr<documentapi::DocumentMessage> docMsgPtr(
+ static_cast<documentapi::DocumentMessage*>(msg.release()));
+
+ assert(docMsgPtr.get());
+
+ std::unique_ptr<api::StorageCommand> cmd(
+ _docApiConverter.toStorageAPI(
+ static_cast<documentapi::DocumentMessage&>(*docMsgPtr),
+ _component.getTypeRepo()));
+
+ if (!cmd.get()) {
+ LOGBM(warning, "Unsupported message: StorageApi could not convert "
+ "message of type %d to a storageapi message",
+ docMsgPtr->getType());
+ _metrics.convertToStorageAPIFailures.inc();
+ return;
+ }
+
+ cmd->setTrace(docMsgPtr->getTrace());
+ cmd->setTransportContext(std::unique_ptr<api::TransportContext>(
+ new StorageTransportContext(std::move(docMsgPtr))));
+
+ enqueue(std::shared_ptr<api::StorageCommand>(cmd.release()));
+ } else if (protocolName == mbusprot::StorageProtocol::NAME) {
+ std::unique_ptr<mbusprot::StorageCommand> storMsgPtr(
+ static_cast<mbusprot::StorageCommand*>(msg.release()));
+
+ assert(storMsgPtr.get());
+
+ const std::shared_ptr<api::StorageCommand> & cmd = storMsgPtr->getCommand();
+ cmd->setTimeout(storMsgPtr->getTimeRemaining());
+ cmd->setTrace(storMsgPtr->getTrace());
+ cmd->setTransportContext(std::unique_ptr<api::TransportContext>(
+ new StorageTransportContext(std::move(storMsgPtr))));
+
+ enqueue(cmd);
+ } else {
+ LOGBM(warning, "Received unsupported message type %d for protocol '%s'",
+ msg->getType(), msg->getProtocol().c_str());
+ }
+}
+
+void
+CommunicationManager::handleReply(std::unique_ptr<mbus::Reply> reply)
+{
+ MBUS_TRACE(reply->getTrace(), 4, getNodeId(_component)
+ + "Communication manager: Received reply from message bus");
+ // Relaxed load since we're not doing any dependent reads that aren't
+ // already covered by some other form of explicit synchronization.
+ if (_closed.load(std::memory_order_relaxed)) {
+ LOG(debug, "Not handling reply of type %d as we have closed down",
+ reply->getType());
+ return;
+ }
+ LOG(spam, "Got reply of type %d, trace is %s",
+ reply->getType(), reply->getTrace().toString().c_str());
+ // EmptyReply must be converted to real replies before processing.
+ if (reply->getType() == 0) {
+ std::unique_ptr<mbus::Message> message(reply->getMessage().release());
+
+ if (message.get()) {
+ std::unique_ptr<mbus::Reply> convertedReply;
+
+ const vespalib::string& protocolName = message->getProtocol();
+ if (protocolName == documentapi::DocumentProtocol::NAME) {
+ convertedReply.reset(static_cast<documentapi::DocumentMessage*>(
+ message.get())->createReply().release());
+ } else if (protocolName == mbusprot::StorageProtocol::NAME) {
+ std::shared_ptr<api::StorageReply> repl(
+ static_cast<mbusprot::StorageCommand*>(message.get())
+ ->getCommand()->makeReply().release());
+ mbusprot::StorageReply::UP sreply(
+ new mbusprot::StorageReply(repl));
+
+ if (reply->hasErrors()) {
+ // Convert only the first error since storageapi only
+ // supports one return code.
+ uint32_t mbuscode = reply->getError(0).getCode();
+ api::ReturnCode::Result code(
+ (api::ReturnCode::Result) mbuscode);
+ // Encode mbuscode into message not to lose it
+ sreply->getReply()->setResult(storage::api::ReturnCode(
+ code,
+ mbus::ErrorCode::getName(mbuscode)
+ + vespalib::string(": ")
+ + reply->getError(0).getMessage()
+ + vespalib::string(" (from ")
+ + reply->getError(0).getService()
+ + vespalib::string(")")));
+ }
+ convertedReply.reset(sreply.release());
+ } else {
+ LOG(warning, "Received reply of unhandled protocol '%s'",
+ protocolName.c_str());
+ return;
+ }
+
+ convertedReply->swapState(*reply);
+ convertedReply->setMessage(mbus::Message::UP(message.release()));
+ reply.reset(convertedReply.release());
+ }
+ if (reply->getType() == 0) {
+ LOG(warning, "Failed to convert empty reply by reflecting on "
+ "local message copy.");
+ return;
+ }
+ }
+
+ if (reply->getContext().value.UINT64 != FORWARDED_MESSAGE) {
+ const vespalib::string& protocolName = reply->getProtocol();
+
+ if (protocolName == documentapi::DocumentProtocol::NAME) {
+ std::shared_ptr<api::StorageCommand> originalCommand;
+
+ {
+ vespalib::LockGuard lock(_messageBusSentLock);
+ typedef std::map<api::StorageMessage::Id,
+ api::StorageCommand::SP> MessageMap;
+ MessageMap::iterator iter(
+ _messageBusSent.find(reply->getContext().value.UINT64));
+ if (iter != _messageBusSent.end()) {
+ originalCommand.swap(iter->second);
+ _messageBusSent.erase(iter);
+ } else {
+ LOG(warning, "Failed to convert reply - original sent "
+ "command doesn't exist");
+ return;
+ }
+ }
+
+ std::shared_ptr<api::StorageReply> sar(
+ _docApiConverter.toStorageAPI(
+ static_cast<documentapi::DocumentReply&>(*reply),
+ *originalCommand).release());
+
+ if (sar.get()) {
+ sar->setTrace(reply->getTrace());
+ receiveStorageReply(sar);
+ }
+ } else if (protocolName == mbusprot::StorageProtocol::NAME) {
+ mbusprot::StorageReply* sr(
+ static_cast<mbusprot::StorageReply*>(reply.get()));
+ sr->getReply()->setTrace(reply->getTrace());
+ receiveStorageReply(sr->getReply());
+ } else {
+ LOGBM(warning, "Received unsupported reply type %d for protocol "
+ "'%s'.",
+ reply->getType(), reply->getProtocol().c_str());
+ }
+ }
+}
+
+CommunicationManager::CommunicationManager(
+ StorageComponentRegister& compReg,
+ const config::ConfigUri & configUri)
+ : StorageLink("Communication manager"),
+ _component(compReg, "communicationmanager"),
+ _metrics(_component.getLoadTypes()->getMetricLoadTypes()),
+ _listener(),
+ _eventQueue(),
+ _mbus(),
+ _count(0),
+ _configUri(configUri),
+ _closed(false),
+ _docApiConverter(configUri),
+ _messageAllocTypes(_component.getMemoryManager())
+{
+ _component.registerMetricUpdateHook(*this, framework::SecondTime(5));
+ _component.registerMetric(_metrics);
+}
+
+void
+CommunicationManager::onOpen()
+{
+ _configFetcher.reset(new config::ConfigFetcher(_configUri.getContext()));
+ _configFetcher->subscribe<vespa::config::content::core::StorCommunicationmanagerConfig>(_configUri.getConfigId(), this);
+ _configFetcher->start();
+ framework::MilliSecTime maxProcessingTime(60 * 1000);
+ _thread = _component.startThread(*this, maxProcessingTime);
+
+ if (_listener.get()) {
+ _listener->registerHandle(_component.getIdentity());
+ }
+}
+
+CommunicationManager::~CommunicationManager()
+{
+ if (!_closed && StorageLink::getState() >= StorageLink::OPENED) {
+ // We can reach this state if onOpen fails due to network problems or
+ // other exceptions. The storage link will be in an opened state,
+ // but it cannot in general call onClose on a link that failed onOpen,
+ // as this would violate the assumption that close should always follow
+ // open. We can allow ourselves to explicitly close in the constructor
+ // because our onClose handles closing a partially initialized state.
+ onClose();
+ }
+
+ _sourceSession.reset(0);
+ _messageBusSession.reset(0);
+ _mbus.reset(0);
+
+ // Clear map of sent messages _before_ we delete any visitor threads to
+ // avoid any issues where unloading shared libraries causes messages
+ // created by dynamic visitors to point to unmapped memory
+ _messageBusSent.clear();
+
+ closeNextLink();
+ LOG(debug, "Deleting link %s.", toString().c_str());
+}
+
+void CommunicationManager::onClose()
+{
+ // Avoid getting config during shutdown
+ _configFetcher.reset(0);
+
+ _closed = true;
+
+ if (_mbus.get()) {
+ if (_messageBusSession.get()) {
+ _messageBusSession->close();
+ }
+ }
+
+ if (_listener.get()) {
+ _listener->close();
+ }
+
+ // Stopping pumper thread should stop all incoming messages from being
+ // processed.
+ if (_thread.get() != 0) {
+ _thread->interrupt();
+ _eventQueue.signal();
+ _thread->join();
+ _thread.reset(0);
+ }
+
+ // Emptying remaining queued messages
+ std::shared_ptr<api::StorageMessage> msg;
+ api::ReturnCode code(api::ReturnCode::ABORTED, "Node shutting down");
+ while (_eventQueue.size() > 0) {
+ assert(_eventQueue.getNext(msg, 0));
+ if (!msg->getType().isReply()) {
+ std::shared_ptr<api::StorageReply> reply(
+ static_cast<api::StorageCommand&>(*msg).makeReply().release());
+ reply->setResult(code);
+ sendReply(reply);
+ }
+ }
+}
+
+void
+CommunicationManager::configureMessageBusLimits(
+ const CommunicationManagerConfig& cfg)
+{
+ const bool isDist(_component.getNodeType() == lib::NodeType::DISTRIBUTOR);
+ auto& mbus(_mbus->getMessageBus());
+ mbus.setMaxPendingCount(isDist ? cfg.mbusDistributorNodeMaxPendingCount
+ : cfg.mbusContentNodeMaxPendingCount);
+ mbus.setMaxPendingSize(isDist ? cfg.mbusDistributorNodeMaxPendingSize
+ : cfg.mbusContentNodeMaxPendingSize);
+}
+
+void CommunicationManager::configure(
+ std::unique_ptr<CommunicationManagerConfig> config)
+{
+ // Only allow dynamic (live) reconfiguration of message bus limits.
+ if (_mbus.get()) {
+ configureMessageBusLimits(*config);
+ return;
+ };
+
+ if (!_configUri.empty()) {
+ mbus::RPCNetworkParams params;
+ LOG(debug, "setting up slobrok config from id: '%s", _configUri.getConfigId().c_str());
+
+ params.setSlobrokConfig(_configUri);
+
+ params.setIdentity(mbus::Identity(_component.getIdentity()));
+ if (config->mbusport != -1) {
+ params.setListenPort(config->mbusport);
+ }
+
+ // Configure messagebus here as we for legacy reasons have
+ // config here.
+ _mbus.reset(new mbus::RPCMessageBus(
+ mbus::ProtocolSet()
+ .add(mbus::IProtocol::SP(
+ new documentapi::DocumentProtocol(
+ *_component.getLoadTypes(),
+ _component.getTypeRepo())))
+ .add(mbus::IProtocol::SP(
+ new mbusprot::StorageProtocol(
+ _component.getTypeRepo(),
+ *_component.getLoadTypes()))),
+ params,
+ _configUri));
+
+ configureMessageBusLimits(*config);
+ }
+
+ _listener.reset(new FNetListener(*this, _configUri, config->rpcport));
+
+ if (_mbus.get()) {
+ mbus::DestinationSessionParams dstParams;
+ dstParams.setName("default");
+ dstParams.setBroadcastName(true);
+ dstParams.setMessageHandler(*this);
+ _messageBusSession = _mbus->getMessageBus().createDestinationSession(dstParams);
+
+ mbus::SourceSessionParams srcParams;
+ srcParams.setThrottlePolicy(mbus::IThrottlePolicy::SP());
+ srcParams.setReplyHandler(*this);
+ _sourceSession = _mbus->getMessageBus().createSourceSession(srcParams);
+ }
+}
+
+void
+CommunicationManager::process(const std::shared_ptr<api::StorageMessage>& msg)
+{
+ MBUS_TRACE(msg->getTrace(), 9,
+ "Communication manager: Sending message down chain.");
+ framework::MilliSecTimer startTime(_component.getClock());
+ try {
+ LOG(spam, "Process: %s", msg->toString().c_str());
+
+ if (!onDown(msg)) {
+ sendDown(msg);
+ }
+
+ LOG(spam, "Done processing: %s", msg->toString().c_str());
+ _metrics.messageProcessTime[msg->getLoadType()].addValue(startTime);
+ } catch (std::exception& e) {
+ LOGBP(error, "When running command %s, caught exception %s. "
+ "Discarding message",
+ msg->toString().c_str(), e.what());
+ _metrics.exceptionMessageProcessTime[msg->getLoadType()].addValue(
+ startTime);
+ } catch (...) {
+ LOG(fatal, "Caught fatal exception in communication manager");
+ throw;
+ }
+}
+
+void
+CommunicationManager::enqueue(const std::shared_ptr<api::StorageMessage> & msg)
+{
+ assert(msg.get());
+
+ const uint32_t memoryFootprint = msg->getMemoryFootprint();
+ framework::MemoryToken::UP token = _component.getMemoryManager().allocate(
+ getAllocationType(*msg),
+ memoryFootprint * 2, memoryFootprint * 2,
+ msg->getPriority());
+
+ if (token.get()) {
+ msg->setMemoryToken(std::unique_ptr<framework::MemoryToken>(token.release()));
+
+ LOG(spam, "Enq storage message %s, priority %d",
+ msg->toString().c_str(), msg->getPriority());
+ _eventQueue.enqueue(msg);
+ } else {
+ _metrics.failedDueToTooLittleMemory.inc();
+ std::ostringstream ost;
+ ost << "Failed to aquire " << (memoryFootprint * 2)
+ << " bytes of memory to handle command of type "
+ << msg->getType() << "\n";
+ LOG(spam, ost.str().c_str());
+ api::StorageCommand* cmd(dynamic_cast<api::StorageCommand*>(msg.get()));
+
+ if (cmd) {
+ std::shared_ptr<api::StorageReply> reply(
+ cmd->makeReply().release());
+ reply->setResult(api::ReturnCode(
+ api::ReturnCode::BUSY, ost.str()));
+ sendReply(reply);
+ }
+ }
+}
+
+bool
+CommunicationManager::onUp(const std::shared_ptr<api::StorageMessage> & msg)
+{
+ MBUS_TRACE(msg->getTrace(), 6,
+ "Communication manager: Sending " + msg->toString());
+ if (msg->getType().isReply()) {
+ if (static_cast<api::StorageReply&>(*msg).getResult().failed()) {
+ LOG(debug, "Request %s failed: %s",
+ msg->getType().toString().c_str(),
+ static_cast<api::StorageReply&>(*msg)
+ .getResult().toString().c_str());
+ }
+ return sendReply(
+ std::static_pointer_cast<api::StorageReply>(msg));
+ } else {
+ return sendCommand(
+ std::static_pointer_cast<api::StorageCommand>(msg));
+ }
+}
+
+void
+CommunicationManager::sendMessageBusMessage(
+ const std::shared_ptr<api::StorageCommand>& msg,
+ std::unique_ptr<mbus::Message> mbusMsg,
+ const mbus::Route& route)
+{
+ // Relaxed load since we're not doing any dependent reads that aren't
+ // already covered by some other form of explicit synchronization.
+ if (_closed.load(std::memory_order_relaxed)) {
+ return;
+ }
+
+ LOG(spam, "Sending message bus msg of type %d", mbusMsg->getType());
+
+ MBUS_TRACE(mbusMsg->getTrace(), 6,
+ "Communication manager: Passing message to source session");
+ mbus::Result result = _sourceSession->send(std::move(mbusMsg), route);
+
+ if (!result.isAccepted()) {
+ std::shared_ptr<api::StorageReply> reply(msg->makeReply().release());
+ if (reply.get()) {
+ if (result.getError().getCode() > mbus::ErrorCode::FATAL_ERROR) {
+ reply->setResult(api::ReturnCode(
+ api::ReturnCode::ABORTED,
+ result.getError().getMessage()));
+ } else {
+ reply->setResult(api::ReturnCode(
+ api::ReturnCode::BUSY, result.getError().getMessage()));
+ }
+ } else {
+ LOG(spam, "Failed to synthesize reply");
+ }
+
+ sendDown(reply);
+ }
+}
+
+bool
+CommunicationManager::sendCommand(
+ const std::shared_ptr<api::StorageCommand> & msg)
+{
+ if (!msg->getAddress()) {
+ LOGBP(warning, "Got command without address of type %s in "
+ "CommunicationManager::sendCommand",
+ msg->getType().getName().c_str());
+ return false;
+ }
+ if (!msg->sourceIndexSet()) {
+ msg->setSourceIndex(_component.getIndex());
+ }
+ // Components can not specify what storage node to send to
+ // without specifying protocol. This is a workaround, such that code
+ // doesn't have to care whether message is in documentapi or storage
+ // protocol.
+ api::StorageMessageAddress address(*msg->getAddress());
+ switch (msg->getType().getId()) {
+ case api::MessageType::STATBUCKET_ID: {
+ if (address.getProtocol() == api::StorageMessageAddress::STORAGE) {
+ address.setProtocol(api::StorageMessageAddress::DOCUMENT);
+ }
+ }
+ default:
+ break;
+ }
+
+ framework::MilliSecTimer startTime(_component.getClock());
+ switch (address.getProtocol()) {
+ case api::StorageMessageAddress::STORAGE:
+ {
+ LOG(spam, "Send to %s: %s",
+ address.toString().c_str(),
+ msg->toString().c_str());
+
+ std::unique_ptr<mbus::Message> cmd(new mbusprot::StorageCommand(msg));
+
+ cmd->setContext(mbus::Context(msg->getMsgId()));
+ cmd->setRetryEnabled(address.retryEnabled());
+ cmd->setTimeRemaining(msg->getTimeout());
+ cmd->setTrace(msg->getTrace());
+ sendMessageBusMessage(msg, std::move(cmd), address.getRoute());
+ break;
+ }
+ case api::StorageMessageAddress::DOCUMENT:
+ {
+ MBUS_TRACE(msg->getTrace(), 7,
+ "Communication manager: Converting storageapi message to "
+ "documentapi");
+
+ std::unique_ptr<mbus::Message> mbusMsg(
+ _docApiConverter.toDocumentAPI(*msg, _component.getTypeRepo()));
+
+ if (mbusMsg.get()) {
+ MBUS_TRACE(msg->getTrace(), 7,
+ "Communication manager: Converted OK");
+ mbusMsg->setTrace(msg->getTrace());
+ mbusMsg->setRetryEnabled(address.retryEnabled());
+
+ {
+ vespalib::LockGuard lock(_messageBusSentLock);
+ _messageBusSent[msg->getMsgId()] = msg;
+ }
+ sendMessageBusMessage(msg, std::move(mbusMsg), address.getRoute());
+ break;
+ } else {
+ LOGBM(warning, "This type of message can't be sent via messagebus");
+ return false;
+ }
+ }
+ default:
+ return false;
+ }
+ _metrics.sendCommandLatency.addValue(startTime);
+ return true;
+}
+
+void
+CommunicationManager::serializeNodeState(
+ const api::GetNodeStateReply& gns,
+ std::ostream& os,
+ bool includeDescription,
+ bool includeDiskDescription,
+ bool useOldFormat) const
+{
+ vespalib::asciistream tmp;
+ if (gns.hasNodeState()) {
+ gns.getNodeState().serialize(
+ tmp, "", includeDescription,
+ includeDiskDescription, useOldFormat);
+ } else {
+ _component.getStateUpdater().getReportedNodeState()->serialize(
+ tmp, "", includeDescription,
+ includeDiskDescription, useOldFormat);
+ }
+ os << tmp.str();
+}
+
+void
+CommunicationManager::sendDirectRPCReply(
+ RPCRequestWrapper& request,
+ const std::shared_ptr<api::StorageReply>& reply)
+{
+ std::string requestName(request.getMethodName());
+ if (requestName == "getnodestate3") {
+ api::GetNodeStateReply& gns(
+ static_cast<api::GetNodeStateReply&>(*reply));
+ std::ostringstream ns;
+ serializeNodeState(gns, ns, true, true, false);
+ request.addReturnString(ns.str().c_str());
+ request.addReturnString(gns.getNodeInfo().c_str());
+ LOGBP(debug, "Sending getnodestate3 reply with host info '%s'.",
+ gns.getNodeInfo().c_str());
+ } else if (requestName == "getnodestate2") {
+ api::GetNodeStateReply& gns(
+ static_cast<api::GetNodeStateReply&>(*reply));
+ std::ostringstream ns;
+ serializeNodeState(gns, ns, true, true, false);
+ request.addReturnString(ns.str().c_str());
+ LOGBP(debug, "Sending getnodestate2 reply with no host info.");
+ } else if (requestName == "setsystemstate2") {
+ // No data to return
+ } else {
+ request.addReturnInt(reply->getResult().getResult());
+ request.addReturnString(reply->getResult().getMessage().c_str());
+
+ if (reply->getType() == api::MessageType::GETNODESTATE_REPLY) {
+ api::GetNodeStateReply& gns(
+ static_cast<api::GetNodeStateReply&>(*reply));
+ std::ostringstream ns;
+ serializeNodeState(gns, ns, false, false, true);
+ request.addReturnString(ns.str().c_str());
+ request.addReturnInt(static_cast<int>(
+ gns.getNodeState().getInitProgress().getValue() * 100));
+ }
+ }
+
+ request.returnRequest();
+}
+
+void
+CommunicationManager::sendMessageBusReply(
+ StorageTransportContext& context,
+ const std::shared_ptr<api::StorageReply>& reply)
+{
+ // Using messagebus for communication.
+ mbus::Reply::UP replyUP;
+
+ LOG(spam, "Sending message bus reply %s", reply->toString().c_str());
+
+ // If this was originally documentapi, create a reply now and transfer the
+ // state.
+ if (context._docAPIMsg.get()) {
+ if (reply->getResult().getResult()
+ == api::ReturnCode::WRONG_DISTRIBUTION)
+ {
+ replyUP.reset(new documentapi::WrongDistributionReply(
+ reply->getResult().getMessage()));
+ replyUP->swapState(*context._docAPIMsg);
+ replyUP->setTrace(reply->getTrace());
+ replyUP->addError(mbus::Error(
+ documentapi::DocumentProtocol::ERROR_WRONG_DISTRIBUTION,
+ reply->getResult().getMessage()));
+ } else {
+ replyUP = context._docAPIMsg->createReply();
+ replyUP->swapState(*context._docAPIMsg);
+ replyUP->setTrace(reply->getTrace());
+ replyUP->setMessage(std::unique_ptr<mbus::Message>(
+ context._docAPIMsg.release()));
+ _docApiConverter.transferReplyState(*reply, *replyUP);
+ }
+ } else if (context._storageProtocolMsg.get()) {
+ replyUP.reset(new mbusprot::StorageReply(reply));
+ if (reply->getResult().getResult() != api::ReturnCode::OK) {
+ replyUP->addError(mbus::Error(reply->getResult().getResult(),
+ reply->getResult().getMessage()));
+ }
+
+ replyUP->swapState(*context._storageProtocolMsg);
+ replyUP->setTrace(reply->getTrace());
+ replyUP->setMessage(mbus::Message::UP(
+ context._storageProtocolMsg.release()));
+ }
+
+ if (replyUP.get() != NULL) {
+ // Forward message only if it was successfully stored in storage.
+ if (!replyUP->hasErrors()) {
+ mbus::Message::UP messageUP = replyUP->getMessage();
+
+ if (messageUP.get() && messageUP->getRoute().hasHops()) {
+ messageUP->setContext(mbus::Context(FORWARDED_MESSAGE));
+ _sourceSession->send(std::move(messageUP));
+ }
+ }
+
+ _messageBusSession->reply(std::move(replyUP));
+ }
+}
+
+bool
+CommunicationManager::sendReply(
+ const std::shared_ptr<api::StorageReply>& reply)
+{
+ // Relaxed load since we're not doing any dependent reads that aren't
+ // already covered by some other form of explicit synchronization.
+ if (_closed.load(std::memory_order_relaxed)) {
+ reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED,
+ "Node is shutting down"));
+ }
+
+ std::unique_ptr<StorageTransportContext> context(
+ static_cast<StorageTransportContext*>(
+ reply->getTransportContext().release()));
+
+ if (!context.get()) {
+ LOG(spam,
+ "No transport context in reply %s",
+ reply->toString().c_str());
+
+ return false;
+ }
+
+ framework::MilliSecTimer startTime(_component.getClock());
+ if (context->_request.get()) {
+ sendDirectRPCReply(*(context->_request.get()), reply);
+ } else {
+ sendMessageBusReply(*context, reply);
+ }
+ _metrics.sendReplyLatency.addValue(startTime);
+ return true;
+}
+
+
+void
+CommunicationManager::run(framework::ThreadHandle& thread)
+{
+ while (!thread.interrupted()) {
+ thread.registerTick();
+ std::shared_ptr<api::StorageMessage> msg;
+ if (_eventQueue.getNext(msg, 100)) {
+ process(msg);
+ }
+ for (Protocols::iterator it(_earlierGenerations.begin());
+ !_earlierGenerations.empty() &&
+ ((it->first + TEN_MINUTES) < _component.getClock().getTimeInSeconds());
+ _earlierGenerations.begin())
+ {
+ _earlierGenerations.erase(it);
+ }
+ }
+}
+
+void
+CommunicationManager::updateMetrics(const MetricLockGuard &)
+{
+ _metrics.queueSize.addValue(_eventQueue.size());
+}
+
+void
+CommunicationManager::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "CommunicationManager";
+}
+
+void CommunicationManager::updateMessagebusProtocol(
+ const document::DocumentTypeRepo::SP &repo) {
+ if (_mbus.get()) {
+ framework::SecondTime now(_component.getClock().getTimeInSeconds());
+ mbus::IProtocol::SP newDocumentProtocol(new documentapi::DocumentProtocol( *_component.getLoadTypes(), repo));
+ _earlierGenerations.push_back(std::make_pair(now, _mbus->getMessageBus().putProtocol(newDocumentProtocol)));
+
+ mbus::IProtocol::SP newStorageProtocol(new mbusprot::StorageProtocol(repo, *_component.getLoadTypes()));
+ _earlierGenerations.push_back(std::make_pair(now, _mbus->getMessageBus().putProtocol(newStorageProtocol)));
+ }
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/communicationmanager.h b/storage/src/vespa/storage/storageserver/communicationmanager.h
new file mode 100644
index 00000000000..458e1b209d8
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/communicationmanager.h
@@ -0,0 +1,236 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+
+
+* @class CommunicationManager
+ * @ingroup storageserver
+ *
+ * @brief Class used for sending messages over the network.
+ *
+ * @version $Id$
+ */
+
+#pragma once
+
+#include <vespa/vespalib/util/document_runnable.h>
+#include <map>
+#include <memory>
+#include <queue>
+#include <vector>
+#include <atomic>
+#include <vespa/metrics/metrics.h>
+#include <vespa/messagebus/rpcmessagebus.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storage/config/config-stor-communicationmanager.h>
+#include <vespa/storageapi/messageapi/storagecommand.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/storageapi/mbusprot/storagecommand.h>
+#include <vespa/storageapi/mbusprot/storagereply.h>
+#include <vespa/documentapi/documentapi.h>
+#include <vespa/storage/storageserver/communicationmanagermetrics.h>
+#include <vespa/storage/storageserver/fnetlistener.h>
+#include <vespa/storage/storageserver/messageallocationtypes.h>
+#include <vespa/storage/storageserver/rpcrequestwrapper.h>
+#include "documentapiconverter.h"
+#include <vespa/storageframework/storageframework.h>
+
+namespace storage {
+
+class VisitorMbusSession;
+class Visitor;
+class VisitorThread;
+
+class PriorityQueue {
+private:
+ struct Key {
+ uint8_t priority {255};
+ uint64_t seqNum {0};
+
+ Key(uint8_t pri, uint64_t seq)
+ : priority(pri), seqNum(seq)
+ {
+ }
+ };
+ using ValueType = std::pair<Key, api::StorageMessage::SP>;
+
+ struct PriorityThenFifoCmp {
+ bool operator()(const ValueType& lhs,
+ const ValueType& rhs) const noexcept
+ {
+ // priority_queue has largest element on top, so reverse order
+ // since our semantics have 0 as the highest priority.
+ if (lhs.first.priority != rhs.first.priority) {
+ return (lhs.first.priority > rhs.first.priority);
+ }
+ return (lhs.first.seqNum > rhs.first.seqNum);
+ }
+ };
+
+ using QueueType = std::priority_queue<
+ ValueType,
+ std::vector<ValueType>,
+ PriorityThenFifoCmp>;
+
+ // Sneakily chosen priority such that effectively only RPC commands are
+ // allowed in front of replies. Replies must have the same effective
+ // priority or they will get reordered and all hell breaks loose.
+ static constexpr uint8_t FIXED_REPLY_PRIORITY = 1;
+
+ QueueType _queue;
+ vespalib::Monitor _queueMonitor;
+ uint64_t _msgCounter;
+
+public:
+ PriorityQueue();
+ virtual ~PriorityQueue();
+
+ /**
+ * Returns the next event from the event queue
+ * @param msg The next event
+ * @param timeout Millisecs to wait if the queue is empty
+ * (0 = don't wait, -1 = forever)
+ * @return true or false if the queue was empty.
+ */
+ bool getNext(std::shared_ptr<api::StorageMessage>& msg, int timeout);
+
+ /**
+ * If `msg` is a StorageCommand, enqueues it using the priority stored in
+ * the command. If it's a reply, enqueues it using a fixed but very high
+ * priority that ensure replies are processed before commands but also
+ * ensures that replies are FIFO-ordered relative to each other.
+ */
+ void enqueue(const std::shared_ptr<api::StorageMessage>& msg);
+
+ /** Signal queue monitor. */
+ void signal();
+
+ int size();
+};
+
+class StorageTransportContext : public api::TransportContext {
+public:
+ StorageTransportContext(std::unique_ptr<documentapi::DocumentMessage> msg)
+ : _docAPIMsg(std::move(msg)) {};
+
+ StorageTransportContext(std::unique_ptr<mbusprot::StorageCommand> msg)
+ : _storageProtocolMsg(std::move(msg)) {};
+
+ StorageTransportContext(std::unique_ptr<RPCRequestWrapper> request)
+ : _request(std::move(request)) {};
+
+ std::unique_ptr<documentapi::DocumentMessage> _docAPIMsg;
+ std::unique_ptr<mbusprot::StorageCommand> _storageProtocolMsg;
+ std::unique_ptr<RPCRequestWrapper> _request;
+};
+
+class CommunicationManager : public StorageLink,
+ public framework::Runnable,
+ private config::IFetcherCallback<vespa::config::content::core::StorCommunicationmanagerConfig>,
+ public mbus::IMessageHandler,
+ public mbus::IReplyHandler,
+ private framework::MetricUpdateHook
+{
+private:
+ CommunicationManager(const CommunicationManager&);
+ CommunicationManager& operator=(const CommunicationManager&);
+
+ StorageComponent _component;
+ CommunicationManagerMetrics _metrics;
+
+ std::unique_ptr<FNetListener> _listener;
+ PriorityQueue _eventQueue;
+ // XXX: Should perhaps use a configsubscriber and poll from StorageComponent ?
+ std::unique_ptr<config::ConfigFetcher> _configFetcher;
+ typedef std::vector< std::pair<framework::SecondTime, mbus::IProtocol::SP> > Protocols;
+ Protocols _earlierGenerations;
+
+ void onOpen();
+ void onClose();
+
+ void process(const std::shared_ptr<api::StorageMessage>& msg);
+
+ using CommunicationManagerConfig
+ = vespa::config::content::core::StorCommunicationmanagerConfig;
+
+ void configureMessageBusLimits(const CommunicationManagerConfig& cfg);
+
+ void configure(std::unique_ptr<CommunicationManagerConfig> config);
+
+ void receiveStorageReply(const std::shared_ptr<api::StorageReply>&);
+
+ void serializeNodeState(
+ const api::GetNodeStateReply& gns,
+ std::ostream& os,
+ bool includeDescription,
+ bool includeDiskDescription,
+ bool useOldFormat) const;
+
+ static const uint64_t FORWARDED_MESSAGE = 0;
+
+ std::unique_ptr<mbus::RPCMessageBus> _mbus;
+ mbus::DestinationSession::UP _messageBusSession;
+ mbus::SourceSession::UP _sourceSession;
+ mbus::SourceSession::UP _visitorSourceSession;
+ uint32_t _count;
+
+ vespalib::Lock _messageBusSentLock;
+ std::map<api::StorageMessage::Id, std::shared_ptr<api::StorageCommand> > _messageBusSent;
+
+ config::ConfigUri _configUri;
+ std::atomic<bool> _closed;
+ DocumentApiConverter _docApiConverter;
+ framework::Thread::UP _thread;
+ MessageAllocationTypes _messageAllocTypes;
+
+ const framework::MemoryAllocationType&
+ getAllocationType(api::StorageMessage& msg) const;
+ void updateMetrics(const MetricLockGuard &) override;
+
+ // Test needs access to configure() for live reconfig testing.
+ friend class CommunicationManagerTest;
+
+public:
+ CommunicationManager(StorageComponentRegister& compReg,
+ const config::ConfigUri & configUri);
+ virtual ~CommunicationManager();
+
+ void enqueue(const std::shared_ptr<api::StorageMessage> & msg);
+
+ mbus::RPCMessageBus& getMessageBus() { assert(_mbus.get()); return *_mbus; }
+
+ const PriorityConverter& getPriorityConverter() const { return _docApiConverter.getPriorityConverter(); }
+
+ /**
+ * From StorageLink. Called when messages arrive from storage
+ * modules. Will convert and dispatch messages to MessageServer
+ */
+ virtual bool onUp(const std::shared_ptr<api::StorageMessage>&);
+
+ bool sendCommand(const std::shared_ptr<api::StorageCommand>& command);
+
+ bool sendReply(const std::shared_ptr<api::StorageReply>& reply);
+ void sendDirectRPCReply(RPCRequestWrapper& request, const std::shared_ptr<api::StorageReply>& reply);
+ void sendMessageBusReply(StorageTransportContext& context, const std::shared_ptr<api::StorageReply>& reply);
+
+ // Pump thread
+ void run(framework::ThreadHandle&);
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ /** Get messages from messagebus. */
+ void handleMessage(std::unique_ptr<mbus::Message> msg);
+
+ void sendMessageBusMessage(const std::shared_ptr<api::StorageCommand>& msg,
+ std::unique_ptr<mbus::Message> mbusMsg, const mbus::Route& route);
+
+ /** Get replies from messagebus. */
+ void handleReply(std::unique_ptr<mbus::Reply> msg);
+
+ void updateMessagebusProtocol(const document::DocumentTypeRepo::SP &repo);
+
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageserver/communicationmanagermetrics.h b/storage/src/vespa/storage/storageserver/communicationmanagermetrics.h
new file mode 100644
index 00000000000..03f934c606a
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/communicationmanagermetrics.h
@@ -0,0 +1,56 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class CommunicationManagerMetrics
+ * \ingroup storageserver
+ *
+ * \brief Metrics for the communication manager.
+ */
+
+#pragma once
+
+#include <vespa/metrics/metrics.h>
+#include <vespa/documentapi/loadtypes/loadtypeset.h>
+
+namespace storage {
+
+struct CommunicationManagerMetrics : public metrics::MetricSet {
+ metrics::LongAverageMetric queueSize;
+ metrics::LoadMetric<metrics::LongAverageMetric> messageProcessTime;
+ metrics::LoadMetric<metrics::LongAverageMetric> exceptionMessageProcessTime;
+ metrics::LongCountMetric failedDueToTooLittleMemory;
+ metrics::LongCountMetric convertToStorageAPIFailures;
+ metrics::LongAverageMetric sendCommandLatency;
+ metrics::LongAverageMetric sendReplyLatency;
+
+ CommunicationManagerMetrics(const metrics::LoadTypeSet& loadTypes,
+ metrics::MetricSet* owner = 0)
+ : metrics::MetricSet("communication", "",
+ "Metrics for the communication manager", owner),
+ queueSize("messagequeue", "", "Size of input message queue.", this),
+ messageProcessTime(loadTypes, metrics::LongAverageMetric(
+ "messageprocesstime", "",
+ "Time transport thread uses to process a single message"),
+ this),
+ exceptionMessageProcessTime(loadTypes, metrics::LongAverageMetric(
+ "exceptionmessageprocesstime", "",
+ "Time transport thread uses to process a single message "
+ "that fails with an exception thrown into communication "
+ "manager"),
+ this),
+ failedDueToTooLittleMemory("toolittlememory", "",
+ "Number of messages failed due to too little memory "
+ "available", this),
+ convertToStorageAPIFailures("convertfailures", "",
+ "Number of messages that failed to get converted to "
+ "storage API messages", this),
+ sendCommandLatency("sendcommandlatency", "",
+ "Average ms used to send commands to MBUS", this),
+ sendReplyLatency("sendreplylatency", "",
+ "Average ms used to send replies to MBUS", this)
+ {
+ }
+
+};
+
+}
+
diff --git a/storage/src/vespa/storage/storageserver/distributornode.cpp b/storage/src/vespa/storage/storageserver/distributornode.cpp
new file mode 100644
index 00000000000..308747b6b10
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/distributornode.cpp
@@ -0,0 +1,143 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/distributornode.h>
+
+#include <vespa/storage/distributor/distributor.h>
+#include <vespa/storage/storageserver/bouncer.h>
+#include <vespa/storage/storageserver/communicationmanager.h>
+#include <vespa/storage/storageserver/opslogger.h>
+#include <vespa/storage/storageserver/statemanager.h>
+#include <vespa/storage/common/hostreporter/hostinfo.h>
+
+#include <vespa/log/log.h>
+
+LOG_SETUP(".node.distributor");
+
+namespace storage {
+
+DistributorNode::DistributorNode(
+ const config::ConfigUri& configUri,
+ DistributorNodeContext& context,
+ ApplicationGenerationFetcher& generationFetcher,
+ NeedActiveState activeState,
+ StorageLink::UP communicationManager)
+ : StorageNode(configUri, context, generationFetcher,
+ std::unique_ptr<HostInfo>(new HostInfo()),
+ communicationManager.get() == 0 ? NORMAL
+ : SINGLE_THREADED_TEST_MODE),
+ _threadPool(framework::TickingThreadPool::createDefault("distributor")),
+ _context(context),
+ _lastUniqueTimestampRequested(0),
+ _uniqueTimestampCounter(0),
+ _manageActiveBucketCopies(activeState == NEED_ACTIVE_BUCKET_STATES_SET),
+ _retrievedCommunicationManager(std::move(communicationManager))
+{
+ try{
+ initialize();
+ } catch (const vespalib::NetworkSetupFailureException & e) {
+ LOG(warning, "Network failure: '%s'", e.what());
+ throw;
+ } catch (const vespalib::Exception & e) {
+ LOG(error, "Caught exception %s during startup. Calling destruct "
+ "functions in hopes of dying gracefully.",
+ e.getMessage().c_str());
+ requestShutdown("Failed to initialize: " + e.getMessage());
+ shutdownDistributor();
+ throw;
+ }
+}
+
+DistributorNode::~DistributorNode()
+{
+ shutdownDistributor();
+}
+
+void
+DistributorNode::shutdownDistributor()
+{
+ _threadPool->stop();
+ shutdown();
+}
+
+void
+DistributorNode::initializeNodeSpecific()
+{
+ _context.getComponentRegister().setTimeCalculator(*this);
+}
+
+void
+DistributorNode::handleConfigChange(vespa::config::content::core::StorDistributormanagerConfig& c)
+{
+ framework::TickingLockGuard guard(_threadPool->freezeAllTicks());
+ _context.getComponentRegister().setDistributorConfig(c);
+ framework::MilliSecTime ticksWaitTime(c.ticksWaitTimeMs);
+ framework::MilliSecTime maxProcessTime(c.maxProcessTimeMs);
+ _threadPool->updateParametersAllThreads(
+ ticksWaitTime,
+ maxProcessTime,
+ c.ticksBeforeWait);
+}
+
+void
+DistributorNode::handleConfigChange(vespa::config::content::core::StorVisitordispatcherConfig& c)
+{
+ framework::TickingLockGuard guard(_threadPool->freezeAllTicks());
+ _context.getComponentRegister().setVisitorConfig(c);
+}
+
+StorageLink::UP
+DistributorNode::createChain()
+{
+ DistributorComponentRegister& dcr(_context.getComponentRegister());
+ // TODO: All components in this chain should use a common thread instead of
+ // each having its own configfetcher.
+ StorageLink::UP chain;
+ if (_retrievedCommunicationManager.get()) {
+ chain = std::move(_retrievedCommunicationManager);
+ } else {
+ chain.reset(_communicationManager
+ = new CommunicationManager(dcr, _configUri));
+ }
+ std::unique_ptr<StateManager> stateManager(releaseStateManager());
+
+ chain->push_back(StorageLink::UP(new Bouncer(dcr, _configUri)));
+ chain->push_back(StorageLink::UP(new OpsLogger(dcr, _configUri)));
+ // Distributor instance registers a host info reporter with the state
+ // manager, which is safe since the lifetime of said state manager
+ // extends to the end of the process.
+ chain->push_back(StorageLink::UP(
+ new storage::distributor::Distributor(
+ dcr, *_threadPool, getDoneInitializeHandler(),
+ _manageActiveBucketCopies,
+ stateManager->getHostInfo())));
+
+ chain->push_back(StorageLink::UP(stateManager.release()));
+ return chain;
+}
+
+api::Timestamp
+DistributorNode::getUniqueTimestamp()
+{
+ uint64_t timeNow(_component->getClock().getTimeInSeconds().getTime());
+ if (timeNow == _lastUniqueTimestampRequested) {
+ ++_uniqueTimestampCounter;
+ } else {
+ if (timeNow < _lastUniqueTimestampRequested) {
+ LOG(error, "Time has moved backwards, from %" PRIu64 " to %" PRIu64 ".",
+ _lastUniqueTimestampRequested, timeNow);
+ }
+ _lastUniqueTimestampRequested = timeNow;
+ _uniqueTimestampCounter = 0;
+ }
+
+ return _lastUniqueTimestampRequested * 1000000ll + _uniqueTimestampCounter;
+}
+
+ResumeGuard
+DistributorNode::pause()
+{
+ return ResumeGuard();
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/distributornode.h b/storage/src/vespa/storage/storageserver/distributornode.h
new file mode 100644
index 00000000000..c0b6db67b95
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/distributornode.h
@@ -0,0 +1,66 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::DistributorNode
+ * \ingroup storageserver
+ *
+ * \brief Class for setting up a distributor node.
+ */
+
+#pragma once
+
+#include <vespa/storage/common/distributorcomponent.h>
+#include <vespa/storage/storageserver/distributornodecontext.h>
+#include <vespa/storage/storageserver/storagenode.h>
+#include <vespa/storageframework/generic/thread/tickingthread.h>
+
+namespace storage {
+
+class DistributorNode
+ : public StorageNode,
+ private UniqueTimeCalculator
+{
+ framework::TickingThreadPool::UP _threadPool;
+ DistributorNodeContext& _context;
+ uint64_t _lastUniqueTimestampRequested;
+ uint32_t _uniqueTimestampCounter;
+ bool _manageActiveBucketCopies;
+ StorageLink::UP _retrievedCommunicationManager;
+
+public:
+ typedef std::unique_ptr<DistributorNode> UP;
+ enum NeedActiveState
+ {
+ NEED_ACTIVE_BUCKET_STATES_SET,
+ NO_NEED_FOR_ACTIVE_STATES
+ };
+
+ DistributorNode(const config::ConfigUri & configUri,
+ DistributorNodeContext&,
+ ApplicationGenerationFetcher& generationFetcher,
+ NeedActiveState,
+ StorageLink::UP communicationManager = StorageLink::UP());
+ ~DistributorNode();
+
+ virtual const lib::NodeType& getNodeType() const
+ { return lib::NodeType::DISTRIBUTOR; }
+
+ virtual ResumeGuard pause();
+
+ void handleConfigChange(vespa::config::content::core::StorDistributormanagerConfig&);
+ void handleConfigChange(vespa::config::content::core::StorVisitordispatcherConfig&);
+
+private:
+ virtual void initializeNodeSpecific();
+ virtual StorageLink::UP createChain();
+
+ virtual api::Timestamp getUniqueTimestamp();
+
+ /**
+ * Shut down necessary distributor-specific components before shutting
+ * down general content node components.
+ */
+ void shutdownDistributor();
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageserver/distributornodecontext.cpp b/storage/src/vespa/storage/storageserver/distributornodecontext.cpp
new file mode 100644
index 00000000000..f44f7ecbbf2
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/distributornodecontext.cpp
@@ -0,0 +1,16 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/distributornodecontext.h>
+
+namespace storage {
+
+DistributorNodeContext::DistributorNodeContext(
+ framework::Clock::UP clock)
+ : StorageNodeContext(StorageComponentRegisterImpl::UP(new DistributorComponentRegisterImpl),
+ std::move(clock)),
+ _componentRegister(dynamic_cast<ComponentRegister&>(StorageNodeContext::getComponentRegister()))
+{
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/distributornodecontext.h b/storage/src/vespa/storage/storageserver/distributornodecontext.h
new file mode 100644
index 00000000000..609f965487d
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/distributornodecontext.h
@@ -0,0 +1,48 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::DistributorNodeContext
+ * @ingroup storageserver
+ *
+ * @brief Context needed by node, that can also be used by others
+ *
+ * This utility class sets up the default component register implementation.
+ * It also sets up the clock and the threadpool, such that the most basic
+ * features are available to the provider, before the service layer is set up.
+ *
+ * The service layer still provides the memory manager functionality though,
+ * so you cannot retrieve the memory manager before the service layer has
+ * started up. (Before getPartitionStates() have been called on provider)
+ */
+
+#pragma once
+
+#include <vespa/storage/frameworkimpl/component/distributorcomponentregisterimpl.h>
+#include <vespa/storage/storageserver/storagenodecontext.h>
+
+namespace storage {
+
+struct DistributorNodeContext : public StorageNodeContext {
+ // Typedefs to simplify the remainder of the interface
+ typedef DistributorComponentRegisterImpl ComponentRegister;
+
+ /**
+ * You can provide your own clock implementation. Useful in testing where
+ * you want to fake the clock.
+ */
+ DistributorNodeContext(
+ framework::Clock::UP clock = framework::Clock::UP(new RealClock));
+
+ /**
+ * Get the actual component register. Available as the actual type as the
+ * storage server need to set implementations, and the components need the
+ * actual component register interface.
+ */
+ ComponentRegister& getComponentRegister() { return _componentRegister; }
+
+private:
+ ComponentRegister& _componentRegister;
+};
+
+} // storage
+
+
diff --git a/storage/src/vespa/storage/storageserver/documentapiconverter.cpp b/storage/src/vespa/storage/storageserver/documentapiconverter.cpp
new file mode 100644
index 00000000000..c2965bcddd5
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/documentapiconverter.cpp
@@ -0,0 +1,516 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/documentapiconverter.h>
+#include <vespa/documentapi/documentapi.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <vespa/storageapi/message/datagram.h>
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storageapi/message/searchresult.h>
+#include <vespa/storageapi/message/queryresult.h>
+#include <vespa/storageapi/message/documentsummary.h>
+#include <vespa/storageapi/message/multioperation.h>
+#include <vespa/storageapi/message/removelocation.h>
+#include <vespa/storageapi/message/stat.h>
+#include <vespa/storageapi/message/batch.h>
+#include <vespa/messagebus/errorcode.h>
+#include <vespa/storageapi/messageapi/returncode.h>
+#include <vespa/log/log.h>
+#include <vespa/vdslib/container/documentlist.h>
+#include <vespa/document/bucket/bucketidfactory.h>
+
+LOG_SETUP(".documentapiconverter");
+
+namespace storage {
+
+std::unique_ptr<api::StorageCommand>
+DocumentApiConverter::toStorageAPI(documentapi::DocumentMessage& fromMsg,
+ const document::DocumentTypeRepo::SP &repo)
+{
+ api::StorageCommand::UP toMsg;
+
+ using documentapi::DocumentProtocol;
+ switch (fromMsg.getType()) {
+ case DocumentProtocol::MESSAGE_PUTDOCUMENT:
+ {
+ documentapi::PutDocumentMessage& from(
+ static_cast<documentapi::PutDocumentMessage&>(fromMsg));
+ api::PutCommand::UP to(new api::PutCommand(
+ document::BucketId(0), from.getDocument(),
+ from.getTimestamp()));
+ to->setCondition(from.getCondition());
+ toMsg.reset(to.release());
+ break;
+ }
+ case DocumentProtocol::MESSAGE_UPDATEDOCUMENT:
+ {
+ documentapi::UpdateDocumentMessage& from(
+ static_cast<documentapi::UpdateDocumentMessage&>(fromMsg));
+ api::UpdateCommand::UP to(new api::UpdateCommand(
+ document::BucketId(0), from.getDocumentUpdate(),
+ from.getNewTimestamp()));
+ to->setOldTimestamp(from.getOldTimestamp());
+ to->setCondition(from.getCondition());
+ toMsg.reset(to.release());
+ break;
+ }
+ case DocumentProtocol::MESSAGE_REMOVEDOCUMENT:
+ {
+ documentapi::RemoveDocumentMessage& from(
+ static_cast<documentapi::RemoveDocumentMessage&>(fromMsg));
+ api::RemoveCommand::UP to(new api::RemoveCommand(
+ document::BucketId(0), from.getDocumentId(), 0));
+ to->setCondition(from.getCondition());
+ toMsg.reset(to.release());
+ break;
+ }
+ case DocumentProtocol::MESSAGE_GETDOCUMENT:
+ {
+ documentapi::GetDocumentMessage& from(
+ static_cast<documentapi::GetDocumentMessage&>(fromMsg));
+ api::GetCommand::UP to(new api::GetCommand(
+ document::BucketId(0), from.getDocumentId(),
+ from.getFieldSet()));
+ toMsg.reset(to.release());
+ break;
+ }
+ case DocumentProtocol::MESSAGE_CREATEVISITOR:
+ {
+ documentapi::CreateVisitorMessage& from(
+ static_cast<documentapi::CreateVisitorMessage&>(fromMsg));
+ api::CreateVisitorCommand::UP to(new api::CreateVisitorCommand(
+ from.getLibraryName(),
+ from.getInstanceId(),
+ from.getDocumentSelection()));
+
+ to->setControlDestination(from.getControlDestination());
+ to->setDataDestination(from.getDataDestination());
+ to->setMaximumPendingReplyCount(from.getMaximumPendingReplyCount());
+ to->setParameters(from.getParameters());
+ to->setFromTime(from.getFromTimestamp());
+ to->setToTime(from.getToTimestamp());
+ to->setVisitRemoves(from.visitRemoves());
+ to->setFieldSet(from.getFieldSet());
+ to->setVisitInconsistentBuckets(from.visitInconsistentBuckets());
+ to->getBuckets() = from.getBuckets();
+ to->setVisitorDispatcherVersion(from.getVisitorDispatcherVersion());
+ to->setVisitorOrdering(from.getVisitorOrdering());
+ to->setMaxBucketsPerVisitor(from.getMaxBucketsPerVisitor());
+ toMsg.reset(to.release());
+ break;
+ }
+ case DocumentProtocol::MESSAGE_DESTROYVISITOR:
+ {
+ documentapi::DestroyVisitorMessage& from(
+ static_cast<documentapi::DestroyVisitorMessage&>(fromMsg));
+ api::DestroyVisitorCommand::UP to(new api::DestroyVisitorCommand(
+ from.getInstanceId()));
+ toMsg.reset(to.release());
+ break;
+ }
+ case DocumentProtocol::MESSAGE_MULTIOPERATION:
+ {
+ documentapi::MultiOperationMessage& from(
+ static_cast<documentapi::MultiOperationMessage&>(fromMsg));
+ api::MultiOperationCommand::UP to(new api::MultiOperationCommand(repo,
+ from.getBucketId(), from.getBuffer(),
+ from.keepTimeStamps()));
+ toMsg.reset(to.release());
+ break;
+ }
+ case DocumentProtocol::MESSAGE_BATCHDOCUMENTUPDATE:
+ {
+ documentapi::BatchDocumentUpdateMessage& from(
+ static_cast<documentapi::BatchDocumentUpdateMessage&>(fromMsg));
+ api::BatchDocumentUpdateCommand::UP to(
+ new api::BatchDocumentUpdateCommand(from.getUpdates()));
+ toMsg.reset(to.release());
+ break;
+ }
+ case DocumentProtocol::MESSAGE_STATBUCKET:
+ {
+ documentapi::StatBucketMessage& from(
+ static_cast<documentapi::StatBucketMessage&>(fromMsg));
+ api::StatBucketCommand::UP to(new api::StatBucketCommand(
+ from.getBucketId(), from.getDocumentSelection()));
+ toMsg.reset(to.release());
+ break;
+ }
+ case DocumentProtocol::MESSAGE_GETBUCKETLIST:
+ {
+ documentapi::GetBucketListMessage& from(
+ static_cast<documentapi::GetBucketListMessage&>(fromMsg));
+ api::GetBucketListCommand::UP to(new api::GetBucketListCommand(
+ from.getBucketId()));
+ toMsg.reset(to.release());
+ break;
+ }
+ case DocumentProtocol::MESSAGE_VISITORINFO:
+ {
+ documentapi::VisitorInfoMessage& from(
+ static_cast<documentapi::VisitorInfoMessage&>(fromMsg));
+ api::VisitorInfoCommand::UP to(new api::VisitorInfoCommand);
+ for (uint32_t i = 0; i < from.getFinishedBuckets().size(); ++i) {
+ to->setBucketCompleted(from.getFinishedBuckets()[i], 0);
+ }
+ if (!from.getErrorMessage().empty()) {
+ to->setErrorCode(api::ReturnCode(
+ api::ReturnCode::INTERNAL_FAILURE, from.getErrorMessage()));
+ }
+ toMsg.reset(to.release());
+ break;
+ }
+ case DocumentProtocol::MESSAGE_REMOVELOCATION:
+ {
+ documentapi::RemoveLocationMessage& from(
+ static_cast<documentapi::RemoveLocationMessage&>(fromMsg));
+ api::RemoveLocationCommand::UP to(new api::RemoveLocationCommand(
+ from.getDocumentSelection(), document::BucketId(0)));
+ toMsg.reset(to.release());
+ break;
+ }
+ default:
+ break;
+ }
+
+ if (toMsg.get() != 0) {
+ int64_t timeout = fromMsg.getTimeRemaining();
+ if (timeout > INT_MAX) {
+ timeout = INT_MAX;
+ }
+ toMsg->setTimeout(timeout);
+ toMsg->setPriority(
+ _priConverter.toStoragePriority(fromMsg.getPriority()));
+ toMsg->setLoadType(fromMsg.getLoadType());
+
+ LOG(spam, "Converted command %s, loadtype %d, mapped priority %d to %d",
+ toMsg->toString().c_str(), toMsg->getLoadType().getId(),
+ fromMsg.getPriority(), toMsg->getPriority());
+ }
+ return std::move(toMsg);
+}
+
+std::unique_ptr<api::StorageReply>
+DocumentApiConverter::toStorageAPI(documentapi::DocumentReply& fromReply,
+ api::StorageCommand& fromCommand)
+{
+ if (LOG_WOULD_LOG(spam)) {
+ LOG(spam, "Trace for reply:\n%s",
+ fromReply.getTrace().toString().c_str());
+ }
+ std::unique_ptr<api::StorageReply> toMsg;
+
+ switch (fromReply.getType()) {
+ case documentapi::DocumentProtocol::REPLY_CREATEVISITOR:
+ {
+ documentapi::CreateVisitorReply& fromRep(
+ static_cast<documentapi::CreateVisitorReply&>(fromReply));
+ const api::CreateVisitorCommand& fromCmd(
+ static_cast<const api::CreateVisitorCommand&>(fromCommand));
+
+ api::CreateVisitorReply::UP to(new api::CreateVisitorReply(fromCmd));
+ to->setVisitorStatistics(fromRep.getVisitorStatistics());
+ toMsg.reset(to.release());
+ break;
+ }
+ case documentapi::DocumentProtocol::REPLY_STATBUCKET:
+ {
+ documentapi::StatBucketReply& fromRep(
+ static_cast<documentapi::StatBucketReply&>(fromReply));
+ const api::StatBucketCommand& fromCmd(
+ static_cast<const api::StatBucketCommand&>(fromCommand));
+
+ api::StatBucketReply::UP to(
+ new api::StatBucketReply(fromCmd, fromRep.getResults()));
+ toMsg.reset(to.release());
+ break;
+ }
+ default:
+ toMsg = fromCommand.makeReply();
+ break;
+ }
+
+ if (toMsg.get()) {
+ if (fromReply.hasErrors()) {
+ toMsg->setResult(api::ReturnCode(
+ (api::ReturnCode::Result) fromReply.getError(0).getCode(),
+ fromReply.getError(0).getMessage()));
+ toMsg->setPriority(
+ _priConverter.toStoragePriority(fromReply.getPriority()));
+ }
+ }
+ return std::move(toMsg);
+}
+
+std::unique_ptr<mbus::Message>
+DocumentApiConverter::toDocumentAPI(api::StorageCommand& fromMsg,
+ const document::DocumentTypeRepo::SP &repo)
+{
+ std::unique_ptr<mbus::Message> toMsg;
+ switch (fromMsg.getType().getId()) {
+ case api::MessageType::PUT_ID:
+ {
+ api::PutCommand& from(static_cast<api::PutCommand&>(fromMsg));
+ documentapi::PutDocumentMessage::UP to(
+ new documentapi::PutDocumentMessage(from.getDocument()));
+ to->setTimestamp(from.getTimestamp());
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::UPDATE_ID:
+ {
+ api::UpdateCommand& from(static_cast<api::UpdateCommand&>(fromMsg));
+ documentapi::UpdateDocumentMessage::UP to(
+ new documentapi::UpdateDocumentMessage(from.getUpdate()));
+ to->setOldTimestamp(from.getOldTimestamp());
+ to->setNewTimestamp(from.getTimestamp());
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::REMOVE_ID:
+ {
+ api::RemoveCommand& from(static_cast<api::RemoveCommand&>(fromMsg));
+ documentapi::RemoveDocumentMessage::UP to(
+ new documentapi::RemoveDocumentMessage(from.getDocumentId()));
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::VISITOR_INFO_ID:
+ {
+ api::VisitorInfoCommand& from(
+ static_cast<api::VisitorInfoCommand&>(fromMsg));
+ documentapi::VisitorInfoMessage::UP to(
+ new documentapi::VisitorInfoMessage);
+
+ for (uint32_t i = 0; i < from.getCompletedBucketsList().size(); ++i) {
+ to->getFinishedBuckets().push_back(
+ from.getCompletedBucketsList()[i].bucketId);
+ }
+ to->setErrorMessage(from.getErrorCode().getMessage());
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::DOCBLOCK_ID:
+ {
+ api::DocBlockCommand& from(static_cast<api::DocBlockCommand&>(fromMsg));
+ documentapi::MultiOperationMessage::UP to(
+ new documentapi::MultiOperationMessage(
+ from.getBucketId(),
+ from.getDocumentBlock(),
+ from.keepTimeStamps()));
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::SEARCHRESULT_ID:
+ {
+ api::SearchResultCommand& from(
+ static_cast<api::SearchResultCommand&>(fromMsg));
+ documentapi::SearchResultMessage::UP to(
+ new documentapi::SearchResultMessage(from));
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::QUERYRESULT_ID:
+ {
+ api::QueryResultCommand& from(
+ static_cast<api::QueryResultCommand&>(fromMsg));
+ documentapi::QueryResultMessage::UP to(
+ new documentapi::QueryResultMessage(
+ from.getSearchResult(), from.getDocumentSummary()));
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::DOCUMENTSUMMARY_ID:
+ {
+ api::DocumentSummaryCommand& from(
+ static_cast<api::DocumentSummaryCommand&>(fromMsg));
+ documentapi::DocumentSummaryMessage::UP to(
+ new documentapi::DocumentSummaryMessage(from));
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::MULTIOPERATION_ID:
+ {
+ api::MultiOperationCommand& from(
+ static_cast<api::MultiOperationCommand&>(fromMsg));
+ documentapi::MultiOperationMessage::UP to(
+ new documentapi::MultiOperationMessage(repo,
+ from.getBucketId(),
+ from.getBuffer(),
+ from.keepTimeStamps()));
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::MAPVISITOR_ID:
+ {
+ api::MapVisitorCommand& from(
+ static_cast<api::MapVisitorCommand&>(fromMsg));
+ documentapi::MapVisitorMessage::UP to(
+ new documentapi::MapVisitorMessage);
+ to->getData() = from.getData();
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::DOCUMENTLIST_ID:
+ {
+ api::DocumentListCommand& from(
+ static_cast<api::DocumentListCommand&>(fromMsg));
+ documentapi::DocumentListMessage::UP to(
+ new documentapi::DocumentListMessage(from.getBucketId()));
+
+ for (uint32_t i = 0; i < from.getDocuments().size(); i++) {
+ to->getDocuments().push_back(
+ documentapi::DocumentListMessage::Entry(
+ from.getDocuments()[i]._lastModified,
+ from.getDocuments()[i]._doc,
+ from.getDocuments()[i]._removeEntry));
+ }
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::EMPTYBUCKETS_ID:
+ {
+ api::EmptyBucketsCommand& from(
+ static_cast<api::EmptyBucketsCommand&>(fromMsg));
+ std::unique_ptr<documentapi::EmptyBucketsMessage> to(
+ new documentapi::EmptyBucketsMessage(from.getBuckets()));
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::VISITOR_CREATE_ID:
+ {
+ api::CreateVisitorCommand& from(
+ static_cast<api::CreateVisitorCommand&>(fromMsg));
+ documentapi::CreateVisitorMessage::UP to(
+ new documentapi::CreateVisitorMessage(
+ from.getLibraryName(),
+ from.getInstanceId(),
+ from.getControlDestination(),
+ from.getDataDestination()));
+ to->setDocumentSelection(from.getDocumentSelection());
+ to->setMaximumPendingReplyCount(from.getMaximumPendingReplyCount());
+ to->setParameters(from.getParameters());
+ to->setFromTimestamp(from.getFromTime());
+ to->setToTimestamp(from.getToTime());
+ to->setVisitRemoves(from.visitRemoves());
+ to->setFieldSet(from.getFieldSet());
+ to->setVisitInconsistentBuckets(from.visitInconsistentBuckets());
+ to->getBuckets() = from.getBuckets();
+ to->setVisitorOrdering(from.getVisitorOrdering());
+ to->setMaxBucketsPerVisitor(from.getMaxBucketsPerVisitor());
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::VISITOR_DESTROY_ID:
+ {
+ api::DestroyVisitorCommand& from(
+ static_cast<api::DestroyVisitorCommand&>(fromMsg));
+ documentapi::DestroyVisitorMessage::UP to(
+ new documentapi::DestroyVisitorMessage);
+ to->setInstanceId(from.getInstanceId());
+ toMsg.reset(to.release());
+ break;
+ }
+ case api::MessageType::STATBUCKET_ID:
+ {
+ api::StatBucketCommand& from(
+ static_cast<api::StatBucketCommand&>(fromMsg));
+ documentapi::StatBucketMessage::UP to(
+ new documentapi::StatBucketMessage(
+ from.getBucketId(), from.getDocumentSelection()));
+ toMsg.reset(to.release());
+ break;
+ }
+ default:
+ break;
+ }
+
+ if (toMsg.get()) {
+ toMsg->setTimeRemaining(fromMsg.getTimeout());
+ toMsg->setContext(mbus::Context(fromMsg.getMsgId()));
+ if (LOG_WOULD_LOG(spam)) {
+ toMsg->getTrace().setLevel(9);
+ }
+ }
+ return std::move(toMsg);
+}
+
+void
+DocumentApiConverter::transferReplyState(api::StorageReply& fromMsg,
+ mbus::Reply& toMsg)
+{
+ // First map error codes.
+ if (fromMsg.getResult().failed()) {
+ mbus::Error error(mbus::Error(fromMsg.getResult().getResult(),
+ fromMsg.getResult().toString()));
+ toMsg.addError(error);
+ LOG(debug, "Converted storageapi error code %d to %s",
+ fromMsg.getResult().getResult(), error.toString().c_str());
+ }
+ // Then map specifics for specific types of messages needing it
+ using documentapi::DocumentProtocol;
+ if (toMsg.getType() == DocumentProtocol::REPLY_GETDOCUMENT) {
+ api::GetReply& from(static_cast<api::GetReply&>(fromMsg));
+ documentapi::GetDocumentReply& to(
+ static_cast<documentapi::GetDocumentReply&>(toMsg));
+ if (from.getDocument().get() != 0) {
+ to.setDocument(from.getDocument());
+ to.setLastModified(from.getLastModifiedTimestamp());
+ }
+ } else if (toMsg.getType() == DocumentProtocol::REPLY_REMOVEDOCUMENT) {
+ api::RemoveReply& from(static_cast<api::RemoveReply&>(fromMsg));
+ documentapi::RemoveDocumentReply& to(
+ static_cast<documentapi::RemoveDocumentReply&>(toMsg));
+ to.setWasFound(from.wasFound());
+ to.setHighestModificationTimestamp(from.getTimestamp());
+ } else if (toMsg.getType() == DocumentProtocol::REPLY_PUTDOCUMENT) {
+ api::PutReply& from(static_cast<api::PutReply&>(fromMsg));
+ documentapi::WriteDocumentReply& to(
+ static_cast<documentapi::WriteDocumentReply&>(toMsg));
+ to.setHighestModificationTimestamp(from.getTimestamp());
+ } else if (toMsg.getType() == DocumentProtocol::REPLY_MULTIOPERATION) {
+ api::MultiOperationReply& from(
+ static_cast<api::MultiOperationReply&>(fromMsg));
+ documentapi::WriteDocumentReply& to(
+ static_cast<documentapi::WriteDocumentReply&>(toMsg));
+ to.setHighestModificationTimestamp(
+ from.getHighestModificationTimestamp());
+ } else if (toMsg.getType() == DocumentProtocol::REPLY_UPDATEDOCUMENT) {
+ api::UpdateReply& from(static_cast<api::UpdateReply&>(fromMsg));
+ documentapi::UpdateDocumentReply& to(
+ static_cast<documentapi::UpdateDocumentReply&>(toMsg));
+ to.setWasFound(from.wasFound());
+ to.setHighestModificationTimestamp(from.getTimestamp());
+ } else if (toMsg.getType() == DocumentProtocol::REPLY_STATBUCKET) {
+ api::StatBucketReply& from(static_cast<api::StatBucketReply&>(fromMsg));
+ documentapi::StatBucketReply& to(
+ static_cast<documentapi::StatBucketReply&>(toMsg));
+ to.setResults(from.getResults());
+ } else if (toMsg.getType() == DocumentProtocol::REPLY_GETBUCKETLIST) {
+ api::GetBucketListReply& from(
+ static_cast<api::GetBucketListReply&>(fromMsg));
+ documentapi::GetBucketListReply& to(
+ static_cast<documentapi::GetBucketListReply&>(toMsg));
+ const std::vector<api::GetBucketListReply::BucketInfo>& buckets(
+ from.getBuckets());
+ for (uint32_t i = 0; i < buckets.size(); i++) {
+ to.getBuckets().push_back(
+ documentapi::GetBucketListReply::BucketInfo(
+ buckets[i]._bucket, buckets[i]._bucketInformation));
+ }
+ } else if (toMsg.getType() == DocumentProtocol::REPLY_CREATEVISITOR) {
+ api::CreateVisitorReply& from(
+ static_cast<api::CreateVisitorReply&>(fromMsg));
+ documentapi::CreateVisitorReply& to(
+ static_cast<documentapi::CreateVisitorReply&>(toMsg));
+ to.setLastBucket(from.getLastBucket());
+ to.setVisitorStatistics(from.getVisitorStatistics());
+ } else if (toMsg.getType() == DocumentProtocol::REPLY_BATCHDOCUMENTUPDATE) {
+ api::BatchDocumentUpdateReply& from(
+ static_cast<api::BatchDocumentUpdateReply&>(fromMsg));
+ documentapi::BatchDocumentUpdateReply& to(
+ static_cast<documentapi::BatchDocumentUpdateReply&>(toMsg));
+ to.getDocumentsNotFound() = from.getDocumentsNotFound();
+ }
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/documentapiconverter.h b/storage/src/vespa/storage/storageserver/documentapiconverter.h
new file mode 100644
index 00000000000..96d10c8bbad
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/documentapiconverter.h
@@ -0,0 +1,41 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/document/repo/documenttyperepo.h>
+#include <vespa/documentapi/messagebus/messages/documentmessage.h>
+#include <vespa/documentapi/messagebus/messages/documentreply.h>
+#include <vespa/storage/storageserver/priorityconverter.h>
+#include <vespa/storageapi/messageapi/storagecommand.h>
+#include <vespa/storageapi/messageapi/storagereply.h>
+
+namespace storage {
+
+/**
+ Converts messages from storageapi to documentapi and
+ vice versa.
+*/
+class DocumentApiConverter
+{
+public:
+ DocumentApiConverter(const config::ConfigUri & configUri)
+ : _priConverter(configUri) {}
+
+ std::unique_ptr<storage::api::StorageCommand> toStorageAPI(
+ documentapi::DocumentMessage& msg,
+ const document::DocumentTypeRepo::SP &repo);
+
+ std::unique_ptr<storage::api::StorageReply> toStorageAPI(documentapi::DocumentReply& reply, api::StorageCommand& originalCommand);
+
+ void transferReplyState(storage::api::StorageReply& from, mbus::Reply& to);
+
+ std::unique_ptr<mbus::Message> toDocumentAPI(
+ storage::api::StorageCommand& cmd,
+ const document::DocumentTypeRepo::SP &repo);
+
+ const PriorityConverter& getPriorityConverter() const { return _priConverter; }
+private:
+ PriorityConverter _priConverter;
+};
+
+} // namespace storage
+
diff --git a/storage/src/vespa/storage/storageserver/fnetlistener.cpp b/storage/src/vespa/storage/storageserver/fnetlistener.cpp
new file mode 100644
index 00000000000..46558c6842a
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/fnetlistener.cpp
@@ -0,0 +1,178 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/fnetlistener.h>
+#include <vespa/storage/storageserver/communicationmanager.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/log/log.h>
+
+LOG_SETUP(".rpc.listener");
+
+namespace storage {
+
+
+FNetListener::FNetListener(
+ CommunicationManager& comManager, const config::ConfigUri & configUri, uint32_t port)
+ : _comManager(comManager),
+ _orb(),
+ _closed(false),
+ _slobrokRegister(_orb, configUri)
+{
+ initRPC();
+ if (!_orb.Listen(port)) {
+ std::ostringstream ost;
+ ost << "Failed to listen to RPC port " << port << ".";
+ throw vespalib::IllegalStateException(ost.str(), VESPA_STRLOC);
+ }
+ _orb.Start();
+}
+
+FNetListener::~FNetListener()
+{
+ if (!_closed) {
+ close();
+ }
+}
+
+void
+FNetListener::registerHandle(const vespalib::stringref & handle) {
+ _slobrokRegister.registerName(handle);
+ while (_slobrokRegister.busy()) {
+ LOG(debug, "Waiting to register in slobrok");
+ FastOS_Thread::Sleep(50);
+ }
+ _handle = handle;
+}
+
+void
+FNetListener::close()
+{
+ _closed = true;
+ _slobrokRegister.unregisterName(_handle);
+ _orb.ShutDown(true);
+}
+
+void
+FNetListener::initRPC()
+{
+ FRT_ReflectionBuilder rb(&_orb);
+
+ rb.DefineMethod(
+ "getnodestate3", "sii", "ss", true,
+ FRT_METHOD(FNetListener::RPC_getNodeState2),
+ this);
+ rb.MethodDesc("Get state of this node");
+ rb.ParamDesc("nodestate", "Expected state of given node. If correct, the "
+ "request will be queued on target until it changes. To not give "
+ "any state use the string 'unknown', enforcing a direct reply.");
+ rb.ParamDesc("timeout", "Timeout of message in milliseconds, set by the "
+ "state requester");
+ rb.ReturnDesc("nodestate", "State string for this node");
+ rb.ReturnDesc("hostinfo", "Information about host this node is running on");
+ //-------------------------------------------------------------------------
+ rb.DefineMethod(
+ "getnodestate2", "si", "s", true,
+ FRT_METHOD(FNetListener::RPC_getNodeState2),
+ this);
+ rb.MethodDesc("Get state of this node");
+ rb.ParamDesc("nodestate", "Expected state of given node. If correct, the "
+ "request will be queued on target until it changes. To not give "
+ "any state use the string 'unknown', enforcing a direct reply.");
+ rb.ParamDesc("timeout", "Timeout of message in milliseconds, set by the "
+ "state requester");
+ rb.ReturnDesc("nodestate", "State string for this node");
+ //-------------------------------------------------------------------------
+ rb.DefineMethod(
+ "setsystemstate2", "s", "", true,
+ FRT_METHOD(FNetListener::RPC_setSystemState2),
+ this);
+ rb.MethodDesc("Set systemstate on this node");
+ rb.ParamDesc("systemstate", "New systemstate to set");
+ //-------------------------------------------------------------------------
+ rb.DefineMethod(
+ "getcurrenttime", "", "lis", true,
+ FRT_METHOD(FNetListener::RPC_getCurrentTime),
+ this);
+ rb.MethodDesc("Get current time on this node");
+ rb.ReturnDesc("seconds", "Current time in seconds since epoch");
+ rb.ReturnDesc("nanoseconds", "additional nanoseconds since epoch");
+ rb.ReturnDesc("hostname", "Host name");
+ //-------------------------------------------------------------------------
+}
+
+
+void
+FNetListener::RPC_getCurrentTime(FRT_RPCRequest *req)
+{
+ if (_closed) {
+ LOG(debug, "Not handling RPC call getCurrentTime() as we have closed");
+ req->SetError(RPCRequestWrapper::ERR_NODE_SHUTTING_DOWN, "Node shutting down");
+ return;
+ }
+ struct timespec t;
+ clock_gettime(CLOCK_REALTIME, &t);
+ req->GetReturn()->AddInt64(t.tv_sec);
+ req->GetReturn()->AddInt32(t.tv_nsec);
+ vespalib::string hostname = FastOS_Socket::getHostName();
+ req->GetReturn()->AddString(hostname.c_str());
+ // all handled, will return immediately
+ return;
+}
+
+void
+FNetListener::RPC_getNodeState2(FRT_RPCRequest *req)
+{
+ if (_closed) {
+ LOG(debug, "Not handling RPC call getNodeState2() as we have closed");
+ req->SetError(RPCRequestWrapper::ERR_NODE_SHUTTING_DOWN, "Node shutting down");
+ return;
+ }
+
+ vespalib::string expected(req->GetParams()->GetValue(0)._string._str,
+ req->GetParams()->GetValue(0)._string._len);
+
+ std::shared_ptr<api::GetNodeStateCommand> cmd(
+ new api::GetNodeStateCommand(
+ expected != "unknown" ?
+ std::unique_ptr<lib::NodeState>(new lib::NodeState(expected)) :
+ std::unique_ptr<lib::NodeState>()));
+
+ cmd->setPriority(api::StorageMessage::VERYHIGH);
+ cmd->setTimeout(req->GetParams()->GetValue(1)._intval32);
+ if (req->GetParams()->GetNumValues() > 2) {
+ cmd->setSourceIndex(req->GetParams()->GetValue(2)._intval32);
+ }
+ // Create a request object to avoid needing a separate transport type
+ std::unique_ptr<RPCRequestWrapper> request(new RPCRequestWrapper(req));
+ cmd->setTransportContext(std::unique_ptr<api::TransportContext>(
+ new StorageTransportContext(std::move(request))));
+ req->Detach();
+ _comManager.enqueue(cmd);
+}
+
+void
+FNetListener::RPC_setSystemState2(FRT_RPCRequest *req)
+{
+ if (_closed) {
+ LOG(debug, "Not handling RPC call setSystemState2() as we have closed");
+ req->SetError(RPCRequestWrapper::ERR_NODE_SHUTTING_DOWN, "Node shutting down");
+ return;
+ }
+ vespalib::string systemStateStr(req->GetParams()->GetValue(0)._string._str,
+ req->GetParams()->GetValue(0)._string._len);
+ lib::ClusterState systemState(systemStateStr);
+
+ std::shared_ptr<api::SetSystemStateCommand> cmd(
+ new api::SetSystemStateCommand(systemState));
+ cmd->setPriority(api::StorageMessage::VERYHIGH);
+
+ // Create a request object to avoid needing a separate transport type
+ std::unique_ptr<RPCRequestWrapper> request(new RPCRequestWrapper(req));
+ cmd->setTransportContext(std::unique_ptr<api::TransportContext>(
+ new StorageTransportContext(std::move(request))));
+ req->Detach();
+ _comManager.enqueue(cmd);
+}
+
+
+
+}
diff --git a/storage/src/vespa/storage/storageserver/fnetlistener.h b/storage/src/vespa/storage/storageserver/fnetlistener.h
new file mode 100644
index 00000000000..40cd53eaa63
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/fnetlistener.h
@@ -0,0 +1,42 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/fnet/frt/frt.h>
+#include <vespa/slobrok/sbmirror.h>
+#include <vespa/slobrok/sbregister.h>
+
+
+namespace storage {
+
+class CommunicationManager;
+class StorageServerInterface;
+
+class FNetListener : public FRT_Invokable
+{
+public:
+ FNetListener(CommunicationManager& comManager,
+ const config::ConfigUri & configUri, uint32_t port);
+ ~FNetListener();
+
+ void initRPC();
+ void RPC_getNodeState2(FRT_RPCRequest *req);
+ void RPC_setSystemState2(FRT_RPCRequest *req);
+ void RPC_getCurrentTime(FRT_RPCRequest *req);
+
+ void registerHandle(const vespalib::stringref & handle);
+ void close();
+
+
+ // Used by unit tests.
+ bool serviceExists(const vespalib::stringref & connectionSpec);
+
+private:
+ CommunicationManager& _comManager;
+ FRT_Supervisor _orb;
+ bool _closed;
+ slobrok::api::RegisterAPI _slobrokRegister;
+ vespalib::string _handle;
+};
+
+}
+
diff --git a/storage/src/vespa/storage/storageserver/framework.cpp b/storage/src/vespa/storage/storageserver/framework.cpp
new file mode 100644
index 00000000000..1bdd3ad9e04
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/framework.cpp
@@ -0,0 +1,34 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/framework.h>
+
+#include <vespa/storageframework/defaultimplementation/memory/prioritymemorylogic.h>
+
+using storage::framework::defaultimplementation::AllocationLogic;
+
+namespace storage {
+
+Framework::Framework(framework::Clock::UP clock)
+ : _componentRegister(),
+ _clock(clock),
+ _threadPool(*_clock),
+ _memoryLogic(new framework::defaultimplementation::PriorityMemoryLogic(
+ *_clock, 1024 * 1024 * 1024)),
+ _memoryManager(AllocationLogic::UP(_memoryLogic))
+{
+ framework::defaultimplementation::ComponentRegisterImpl& cri(
+ _componentRegister.getComponentRegisterImpl());
+ cri.setClock(*_clock);
+ cri.setThreadPool(_threadPool);
+ cri.setMemoryManager(_memoryManager);
+}
+
+void
+Framework::setMaximumMemoryUsage(uint64_t max)
+{
+ using storage::framework::defaultimplementation::PriorityMemoryLogic;
+ static_cast<PriorityMemoryLogic*>(_memoryLogic)->setMaximumMemoryUsage(max);
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/framework.h b/storage/src/vespa/storage/storageserver/framework.h
new file mode 100644
index 00000000000..5355ce1a5e4
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/framework.h
@@ -0,0 +1,69 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::Framework
+ * @ingroup storageserver
+ *
+ * @brief Data available to both provider implementations and storage server
+ *
+ * This utility class sets up the default component register implementation.
+ * It also sets up the clock and the threadpool, such that the most basic
+ * features are available to the provider, before the service layer is set up.
+ *
+ * The service layer still provides the memory manager functionality though,
+ * so you cannot retrieve the memory manager before the service layer has
+ * started up. (Before getPartitionStates() have been called on provider)
+ */
+
+#pragma once
+
+#include <vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+#include <vespa/storageframework/defaultimplementation/memory/memorymanager.h>
+#include <vespa/storageframework/defaultimplementation/thread/threadpoolimpl.h>
+
+namespace storage {
+
+struct Framework {
+ // Typedefs to simplify the remainder of the interface
+ typedef StorageComponentRegisterImpl CompReg;
+ typedef framework::defaultimplementation::RealClock RealClock;
+ typedef framework::defaultimplementation::MemoryManager MemoryManager;
+
+ /**
+ * You can provide your own clock implementation. Useful in testing where
+ * you want to fake the clock.
+ */
+ Framework(framework::Clock::UP clock = framework::Clock::UP(new RealClock));
+
+ /**
+ * Get the actual component register. Available as the actual type as the
+ * storage server need to set implementations, and the components need the
+ * actual component register interface.
+ */
+ CompReg& getComponentRegister() { return _componentRegister; }
+
+ /**
+ * There currently exist threads that doesn't use the component model.
+ * Let the backend threadpool be accessible for now.
+ */
+ FastOS_ThreadPool& getThreadPool() { return _threadPool.getThreadPool(); }
+
+ /**
+ * Get the memory manager. Components that wants to print status of memory
+ * manager need access to the actual implementation.
+ */
+ MemoryManager& getMemoryManager() { return _memoryManager; }
+
+ void setMaximumMemoryUsage(uint64_t max);
+
+private:
+ CompReg _componentRegister;
+ framework::Clock::UP _clock;
+ framework::defaultimplementation::ThreadPoolImpl _threadPool;
+ framework::defaultimplementation::AllocationLogic* _memoryLogic;
+ MemoryManager _memoryManager;
+
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageserver/mergethrottler.cpp b/storage/src/vespa/storage/storageserver/mergethrottler.cpp
new file mode 100644
index 00000000000..c8cfd7f3214
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/mergethrottler.cpp
@@ -0,0 +1,1225 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/mergethrottler.h>
+
+#include <iostream>
+#include <sstream>
+#include <iterator>
+#include <vespa/vespalib/stllike/asciistream.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/storage/storageserver/storagemetricsset.h>
+#include <vespa/log/log.h>
+
+LOG_SETUP(".mergethrottler");
+
+namespace storage {
+
+namespace {
+
+struct NodeComparator
+{
+ bool operator()(const api::MergeBucketCommand::Node& a,
+ const api::MergeBucketCommand::Node& b) const
+ {
+ return a.index < b.index;
+ }
+};
+
+// Class used to sneakily get around IThrottlePolicy only accepting
+// messagebus objects
+template <typename Base>
+class DummyMbusMessage : public Base
+{
+private:
+ static const mbus::string NAME;
+public:
+ const mbus::string& getProtocol() const { return NAME; }
+ uint32_t getType() const { return 0x1badb007; }
+
+ uint8_t priority() const { return 255; }
+};
+
+template <typename Base>
+const mbus::string DummyMbusMessage<Base>::NAME = "SkyNet";
+
+}
+
+MergeThrottler::MergeNodeSequence::MergeNodeSequence(
+ const api::MergeBucketCommand& cmd,
+ uint16_t thisIndex)
+ : _cmd(cmd),
+ _sortedNodes(cmd.getNodes()),
+ _sortedIndex(std::numeric_limits<std::size_t>::max()),
+ _thisIndex(thisIndex)
+{
+ // Sort the node vector so that we can find out if we're the
+ // last node in the chain or if we should forward the merge
+ std::sort(_sortedNodes.begin(), _sortedNodes.end(), NodeComparator());
+ assert(!_sortedNodes.empty());
+ for (std::size_t i = 0; i < _sortedNodes.size(); ++i) {
+ if (_sortedNodes[i].index == _thisIndex) {
+ _sortedIndex = i;
+ break;
+ }
+ }
+}
+
+uint16_t
+MergeThrottler::MergeNodeSequence::getNextNodeInChain() const
+{
+ assert(_cmd.getChain().size() < _sortedNodes.size());
+ // assert(_sortedNodes[_cmd.getChain().size()].index == _thisIndex);
+ if (_sortedNodes[_cmd.getChain().size()].index != _thisIndex) {
+ // Some added paranoia output
+ LOG(error, "For %s;_sortedNodes[%" PRIu64 "].index (%u) != %u",
+ _cmd.toString().c_str(), _cmd.getChain().size(),
+ _sortedNodes[_cmd.getChain().size()].index, _thisIndex);
+ assert(!"_sortedNodes[_cmd.getChain().size()].index != _thisIndex) failed");
+ }
+ return _sortedNodes[_cmd.getChain().size() + 1].index;
+}
+
+bool
+MergeThrottler::MergeNodeSequence::isChainCompleted() const
+{
+ if (_cmd.getChain().size() != _sortedNodes.size()) return false;
+
+ for (std::size_t i = 0; i < _cmd.getChain().size(); ++i) {
+ if (_cmd.getChain()[i] != _sortedNodes[i].index) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool
+MergeThrottler::MergeNodeSequence::chainContainsIndex(uint16_t idx) const
+{
+ for (std::size_t i = 0; i < _cmd.getChain().size(); ++i) {
+ if (_cmd.getChain()[i] == idx) {
+ return true;
+ }
+ }
+ return false;
+}
+
+std::string
+MergeThrottler::MergeNodeSequence::getSequenceString() const
+{
+ std::ostringstream oss;
+ oss << '[';
+ for (std::size_t i = 0; i < _cmd.getNodes().size(); ++i) {
+ if (i > 0) {
+ oss << ", ";
+ }
+ oss << _cmd.getNodes()[i].index;
+ }
+ oss << ']';
+ return oss.str();
+}
+
+MergeThrottler::MergeThrottler(
+ const config::ConfigUri & configUri,
+ StorageComponentRegister& compReg)
+ : StorageLink("Merge Throttler"),
+ framework::HtmlStatusReporter("merges", "Merge Throttler"),
+ _merges(),
+ _queue(),
+ _maxQueueSize(1024),
+ _throttlePolicy(new mbus::StaticThrottlePolicy()),
+ _queueSequence(0),
+ _messageLock(),
+ _stateLock(),
+ _configFetcher(configUri.getContext()),
+ _metrics(new Metrics),
+ _component(compReg, "mergethrottler"),
+ _thread(),
+ _rendezvous(RENDEZVOUS_NONE),
+ _closing(false)
+{
+ _throttlePolicy->setMaxPendingCount(20);
+ _configFetcher.subscribe<vespa::config::content::core::StorServerConfig>(configUri.getConfigId(), this);
+ _configFetcher.start();
+ _component.registerStatusPage(*this);
+ _component.registerMetric(*_metrics);
+}
+
+void
+MergeThrottler::configure(std::unique_ptr<vespa::config::content::core::StorServerConfig> newConfig)
+{
+ vespalib::LockGuard lock(_stateLock);
+
+ if (newConfig->maxMergesPerNode < 1) {
+ throw config::InvalidConfigException(
+ "Cannot have a max merge count of less than 1");
+ }
+ if (newConfig->maxMergeQueueSize < 0) {
+ throw config::InvalidConfigException(
+ "Max merge queue size cannot be less than 0");
+ }
+ if (static_cast<double>(newConfig->maxMergesPerNode)
+ != _throttlePolicy->getMaxPendingCount())
+ {
+ LOG(debug, "Setting new max pending count from max_merges_per_node: %d",
+ newConfig->maxMergesPerNode);
+ _throttlePolicy->setMaxPendingCount(newConfig->maxMergesPerNode);
+ }
+ LOG(debug, "Setting new max queue size to %d",
+ newConfig->maxMergeQueueSize);
+ _maxQueueSize = newConfig->maxMergeQueueSize;
+}
+
+MergeThrottler::~MergeThrottler()
+{
+ LOG(debug, "Deleting link %s", toString().c_str());
+ if (StorageLink::getState() == StorageLink::OPENED) {
+ LOG(error, "Deleted MergeThrottler before calling close()");
+ close();
+ flush();
+ }
+ closeNextLink();
+
+ // Sanity checking to find shutdown bug where not all messages have been flushed
+ assert(_merges.empty());
+ assert(_queue.empty());
+ assert(_messagesUp.empty());
+ assert(_messagesDown.empty());
+}
+
+void
+MergeThrottler::onOpen()
+{
+ framework::MilliSecTime maxProcessingTime(30 * 1000);
+ framework::MilliSecTime waitTime(1000);
+ _thread = _component.startThread(*this, maxProcessingTime, waitTime);
+}
+
+void
+MergeThrottler::onClose()
+{
+ // Avoid getting config on shutdown
+ _configFetcher.close();
+ {
+ vespalib::MonitorGuard guard(_messageLock);
+ // Note: used to prevent taking locks in different order if onFlush
+ // and abortOutdatedMerges are called concurrently, as these need to
+ // take both locks in differing orders.
+ _closing = true;
+ }
+ if (LOG_WOULD_LOG(debug)) {
+ vespalib::LockGuard lock(_stateLock);
+ LOG(debug, "onClose; active: %" PRIu64 ", queued: %" PRIu64,
+ _merges.size(), _queue.size());
+ }
+ if (_thread.get() != 0) {
+ _thread->interruptAndJoin(&_messageLock);
+ _thread.reset(0);
+ }
+}
+
+void
+MergeThrottler::onFlush(bool /*downwards*/)
+{
+ // Lock state before messages since the latter must be unlocked
+ // before the guard starts hauling messages up the chain.
+ MessageGuard msgGuard(_stateLock, *this);
+ vespalib::MonitorGuard lock(_messageLock);
+
+ // Abort active merges, queued and up/down pending
+ std::vector<api::StorageMessage::SP> flushable;
+
+ ActiveMergeMap::iterator mergeEnd = _merges.end();
+ for (ActiveMergeMap::iterator i = _merges.begin(); i != mergeEnd; ++i) {
+ // Only generate a reply if the throttler owns the command
+ if (i->second.getMergeCmd().get()) {
+ flushable.push_back(i->second.getMergeCmd());
+ } else {
+ LOG(debug, "Not generating flush-reply for %s since we don't "
+ "own the command", i->first.toString().c_str());
+ }
+
+ DummyMbusMessage<mbus::Reply> dummyReply;
+ _throttlePolicy->processReply(dummyReply);
+ }
+ MergePriorityQueue::iterator queueEnd = _queue.end();
+ for (MergePriorityQueue::iterator i = _queue.begin(); i != queueEnd; ++i) {
+ flushable.push_back(i->_msg);
+ }
+
+ // Just pass-through everything in the up-queue, since the messages
+ // are either replies or commands _we_ have sent and thus cannot
+ // send a meaningful reply for
+ for (std::size_t i = 0; i < _messagesUp.size(); ++i) {
+ msgGuard.sendUp(_messagesUp[i]);
+ }
+
+ std::back_insert_iterator<
+ std::vector<api::StorageMessage::SP>
+ > inserter(flushable);
+ std::copy(_messagesDown.begin(), _messagesDown.end(), inserter);
+
+ for (std::size_t i = 0; i < flushable.size(); ++i) {
+ // Down-bound merge may be a reply, in which case we ignore it
+ // since we can't actually do anything with it now
+ if (flushable[i]->getType() == api::MessageType::MERGEBUCKET) {
+ std::shared_ptr<api::MergeBucketReply> reply(
+ std::make_shared<api::MergeBucketReply>(
+ static_cast<const api::MergeBucketCommand&>(
+ *flushable[i])));
+ reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED,
+ "Storage node is shutting down"));
+ LOG(debug, "Aborted merge since we're flushing: %s",
+ flushable[i]->toString().c_str());
+ msgGuard.sendUp(reply);
+ } else {
+ assert(flushable[i]->getType() == api::MessageType::MERGEBUCKET_REPLY);
+ LOG(debug, "Ignored merge reply since we're flushing: %s",
+ flushable[i]->toString().c_str());
+ }
+ }
+
+ LOG(debug, "Flushed %" PRIu64 " unfinished or pending merge operations",
+ flushable.size());
+
+ _merges.clear();
+ _queue.clear();
+ _messagesUp.clear();
+ _messagesDown.clear();
+}
+
+void
+MergeThrottler::forwardCommandToNode(
+ const api::MergeBucketCommand& mergeCmd,
+ uint16_t nodeIndex,
+ MessageGuard& msgGuard)
+{
+ // Push this node onto the chain trace
+ std::vector<uint16_t> newChain(mergeCmd.getChain());
+ newChain.push_back(_component.getIndex());
+
+ std::shared_ptr<api::MergeBucketCommand> fwdMerge(
+ std::make_shared<api::MergeBucketCommand>(
+ mergeCmd.getBucketId(),
+ mergeCmd.getNodes(),
+ mergeCmd.getMaxTimestamp(),
+ mergeCmd.getClusterStateVersion(),
+ newChain));
+ fwdMerge->setAddress(
+ api::StorageMessageAddress(
+ _component.getClusterName(),
+ lib::NodeType::STORAGE,
+ nodeIndex));
+ fwdMerge->setSourceIndex(mergeCmd.getSourceIndex());
+ fwdMerge->setPriority(mergeCmd.getPriority());
+ fwdMerge->setTimeout(mergeCmd.getTimeout());
+ msgGuard.sendUp(fwdMerge);
+}
+
+void
+MergeThrottler::removeActiveMerge(ActiveMergeMap::iterator mergeIter)
+{
+ LOG(debug, "Removed merge for %s from internal state",
+ mergeIter->first.toString().c_str());
+ _merges.erase(mergeIter);
+}
+
+api::StorageMessage::SP
+MergeThrottler::getNextQueuedMerge()
+{
+ if (_queue.empty()) {
+ return api::StorageMessage::SP();
+ }
+
+ MergePriorityQueue::iterator iter = _queue.begin();
+ MergePriorityQueue::value_type entry = *iter;
+ entry._startTimer.stop(_metrics->averageQueueWaitingTime);
+ _queue.erase(iter);
+ return entry._msg;
+}
+
+void
+MergeThrottler::enqueueMerge(
+ const api::StorageMessage::SP& msg,
+ MessageGuard& msgGuard)
+{
+ LOG(spam, "Enqueuing %s", msg->toString().c_str());
+ const api::MergeBucketCommand& mergeCmd
+ = static_cast<const api::MergeBucketCommand&>(*msg);
+ MergeNodeSequence nodeSeq(mergeCmd, _component.getIndex());
+ if (!validateNewMerge(mergeCmd, nodeSeq, msgGuard)) {
+ return;
+ }
+ _queue.insert(MergePriorityQueue::value_type(msg, _queueSequence++));
+}
+
+bool
+MergeThrottler::canProcessNewMerge() const
+{
+ DummyMbusMessage<mbus::Message> dummyMsg;
+ return _throttlePolicy->canSend(dummyMsg, _merges.size());
+}
+
+bool
+MergeThrottler::isMergeAlreadyKnown(const api::StorageMessage::SP& msg) const
+{
+ const api::MergeBucketCommand& mergeCmd
+ = static_cast<const api::MergeBucketCommand&>(*msg);
+ return _merges.find(mergeCmd.getBucketId()) != _merges.end();
+}
+
+bool
+MergeThrottler::rejectMergeIfOutdated(
+ const api::StorageMessage::SP& msg,
+ uint32_t rejectLessThanVersion,
+ MessageGuard& msgGuard) const
+{
+ // Only reject merge commands! never reject replies (for obvious reasons..)
+ assert(msg->getType() == api::MessageType::MERGEBUCKET);
+
+ const api::MergeBucketCommand& cmd(
+ static_cast<const api::MergeBucketCommand&>(*msg));
+
+ if (cmd.getClusterStateVersion() == 0
+ || cmd.getClusterStateVersion() >= rejectLessThanVersion)
+ {
+ return false;
+ }
+ std::ostringstream oss;
+ oss << "Rejected merge due to outdated cluster state; merge has "
+ << "version " << cmd.getClusterStateVersion()
+ << ", storage node has version "
+ << rejectLessThanVersion;
+ sendReply(cmd,
+ api::ReturnCode(
+ api::ReturnCode::WRONG_DISTRIBUTION,
+ oss.str()),
+ msgGuard, _metrics->chaining);
+ LOG(debug, "Immediately rejected %s, due to it having state version < %u",
+ cmd.toString().c_str(), rejectLessThanVersion);
+ return true;
+}
+
+void
+MergeThrottler::updateOperationMetrics(
+ const api::ReturnCode& result,
+ MergeOperationMetrics& metrics) const
+{
+ switch (result.getResult()) {
+ case api::ReturnCode::OK:
+ ++metrics.ok;
+ break;
+ case api::ReturnCode::NOT_READY:
+ ++metrics.failures.notready;
+ break;
+ case api::ReturnCode::TIMEOUT:
+ ++metrics.failures.timeout;
+ break;
+ case api::ReturnCode::ABORTED:
+ ++metrics.failures.aborted;
+ break;
+ case api::ReturnCode::WRONG_DISTRIBUTION:
+ ++metrics.failures.wrongdistribution;
+ break;
+ case api::ReturnCode::EXISTS:
+ ++metrics.failures.exists;
+ break;
+ case api::ReturnCode::REJECTED:
+ ++metrics.failures.rejected;
+ break;
+ default:
+ if (result.isBusy()) {
+ ++metrics.failures.busy;
+ } else if (result.isBucketDisappearance()) {
+ ++metrics.failures.bucketnotfound;
+ } else {
+ ++metrics.failures.other;
+ }
+ }
+}
+
+void
+MergeThrottler::sendReply(const api::MergeBucketCommand& cmd,
+ const api::ReturnCode& result,
+ MessageGuard& msgGuard,
+ MergeOperationMetrics& metrics) const
+{
+ updateOperationMetrics(result, metrics);
+ std::shared_ptr<api::MergeBucketReply> reply(
+ std::make_shared<api::MergeBucketReply>(cmd));
+ reply->setResult(result);
+ msgGuard.sendUp(reply);
+}
+
+void
+MergeThrottler::rejectOutdatedQueuedMerges(
+ MessageGuard& msgGuard,
+ uint32_t rejectLessThanVersion)
+{
+ // Flush all queued merges that have an outdated version
+ MergePriorityQueue::iterator queueEnd = _queue.end();
+ for (MergePriorityQueue::iterator i = _queue.begin(); i != queueEnd;) {
+ MergePriorityQueue::iterator erase_iter = i;
+ ++i;
+ if (rejectMergeIfOutdated(
+ erase_iter->_msg, rejectLessThanVersion, msgGuard))
+ {
+ _queue.erase(erase_iter);
+ }
+ }
+}
+
+// If there's a merge queued and the throttling policy allows for
+// the merge to be processed, do so.
+bool
+MergeThrottler::attemptProcessNextQueuedMerge(
+ MessageGuard& msgGuard)
+{
+ if (!canProcessNewMerge()) {
+ // Should never reach a non-sending state when there are
+ // no to-be-replied merges that can trigger a new processing
+ assert(!_merges.empty());
+ return false;
+ }
+
+ api::StorageMessage::SP msg = getNextQueuedMerge();
+ if (msg) {
+ // In case of resends and whatnot, it's possible for a merge
+ // command to be in the queue while another higher priority
+ // command for the same bucket sneaks in front of it and gets
+ // a slot. Send BUSY in this case to make the distributor retry
+ // later, at which point the existing merge has hopefully gone
+ // through and the new one will be effectively a no-op to perform
+ if (!isMergeAlreadyKnown(msg)) {
+ LOG(spam, "Processing queued merge %s", msg->toString().c_str());
+ processNewMergeCommand(msg, msgGuard);
+ } else {
+ std::stringstream oss;
+ oss << "Queued merge " << *msg << " is out of date; it has already "
+ "been started by someone else since it was queued";
+ LOG(debug, "%s", oss.str().c_str());
+ sendReply(dynamic_cast<const api::MergeBucketCommand&>(*msg),
+ api::ReturnCode(api::ReturnCode::BUSY, oss.str()),
+ msgGuard, _metrics->chaining);
+ }
+ return true;
+ } else {
+ if (_queue.empty()) {
+ LOG(spam, "Queue empty - no merges to process");
+ } else {
+ LOG(spam, "Merges queued, but throttle policy disallows further "
+ "merges at this time");
+ }
+ }
+ return false;
+}
+
+bool
+MergeThrottler::processQueuedMerges(MessageGuard& msgGuard)
+{
+ bool processed = attemptProcessNextQueuedMerge(msgGuard);
+ if (!processed) {
+ return false;
+ }
+
+ while (processed) {
+ processed = attemptProcessNextQueuedMerge(msgGuard);
+ }
+
+ return true;
+}
+
+void
+MergeThrottler::handleRendezvous(vespalib::MonitorGuard& guard)
+{
+ if (_rendezvous != RENDEZVOUS_NONE) {
+ LOG(spam, "rendezvous requested by external thread; establishing");
+ assert(_rendezvous == RENDEZVOUS_REQUESTED);
+ _rendezvous = RENDEZVOUS_ESTABLISHED;
+ guard.broadcast();
+ while (_rendezvous != RENDEZVOUS_RELEASED) {
+ guard.wait();
+ }
+ LOG(spam, "external thread rendezvous released");
+ _rendezvous = RENDEZVOUS_NONE;
+ guard.broadcast();
+ }
+}
+
+void
+MergeThrottler::run(framework::ThreadHandle& thread)
+{
+ while (!thread.interrupted()) {
+ thread.registerTick(framework::PROCESS_CYCLE);
+ std::vector<api::StorageMessage::SP> up;
+ std::vector<api::StorageMessage::SP> down;
+ {
+ vespalib::MonitorGuard msgLock(_messageLock);
+ // If a rendezvous is requested, we must do this here _before_ we
+ // swap the message queues. This is so the caller can remove aborted
+ // messages from the queues when it knows exactly where this thread
+ // is paused and that there cannot be any messages in flight from this
+ // runner thread causing race conditions.
+ while (_messagesDown.empty()
+ && _messagesUp.empty()
+ && !thread.interrupted()
+ && _rendezvous == RENDEZVOUS_NONE)
+ {
+ msgLock.wait(1000);
+ thread.registerTick(framework::WAIT_CYCLE);
+ }
+ handleRendezvous(msgLock);
+ down.swap(_messagesDown);
+ up.swap(_messagesUp);
+ }
+
+ LOG(spam, "messages up: %" PRIu64 ", down: %" PRIu64,
+ up.size(), down.size());
+
+ // Message lock has been relinquished. Now actually do something
+ // with the messages (which are now owned by this thread). All internal
+ // ops are protected by _stateLock.
+ MessageGuard msgGuard(_stateLock, *this);
+ for (std::size_t i = 0; i < down.size(); ++i) {
+ handleMessageDown(down[i], msgGuard);
+ }
+ for (std::size_t i = 0; i < up.size(); ++i) {
+ handleMessageUp(up[i], msgGuard);
+ }
+ }
+ LOG(debug, "Returning from MergeThrottler working thread");
+}
+
+// Must be run from worker thread
+void
+MergeThrottler::handleMessageDown(
+ const std::shared_ptr<api::StorageMessage>& msg,
+ MessageGuard& msgGuard)
+{
+ if (msg->getType() == api::MessageType::MERGEBUCKET) {
+ const api::MergeBucketCommand& mergeCmd
+ = static_cast<const api::MergeBucketCommand&>(*msg);
+
+ uint32_t ourVersion(
+ _component.getStateUpdater().getSystemState()->getVersion());
+
+ if (mergeCmd.getClusterStateVersion() > ourVersion) {
+ LOG(debug, "Merge %s with newer cluster state than us arrived",
+ mergeCmd.toString().c_str());
+ rejectOutdatedQueuedMerges(
+ msgGuard, mergeCmd.getClusterStateVersion());
+ } else if (rejectMergeIfOutdated(msg, ourVersion, msgGuard)) {
+ // Skip merge entirely
+ return;
+ }
+
+ if (isMergeAlreadyKnown(msg)) {
+ processCycledMergeCommand(msg, msgGuard);
+ } else if (canProcessNewMerge()) {
+ processNewMergeCommand(msg, msgGuard);
+ } else if (_queue.size() < _maxQueueSize) {
+ enqueueMerge(msg, msgGuard); // Queue for later processing
+ } else {
+ // No more room at the inn. Return BUSY so that the
+ // distributor will wait a bit before retrying
+ LOG(debug, "Queue is full; busy-returning %s",
+ mergeCmd.toString().c_str());
+ sendReply(mergeCmd,
+ api::ReturnCode(api::ReturnCode::BUSY,
+ "Merge queue is full"),
+ msgGuard,
+ _metrics->local);
+ }
+ } else {
+ assert(msg->getType() == api::MessageType::MERGEBUCKET_REPLY);
+ // Will create new unwind reply and send it back in the chain
+ processMergeReply(msg, false, msgGuard);
+ }
+}
+
+void
+MergeThrottler::handleMessageUp(
+ const std::shared_ptr<api::StorageMessage>& msg,
+ MessageGuard& msgGuard)
+{
+ assert(msg->getType() == api::MessageType::MERGEBUCKET_REPLY);
+ const api::MergeBucketReply& mergeReply
+ = static_cast<const api::MergeBucketReply&>(*msg);
+
+ LOG(debug, "Processing %s from persistence layer",
+ mergeReply.toString().c_str());
+
+ if (mergeReply.getResult().getResult() != api::ReturnCode::OK) {
+ LOG(debug, "Merging failed for %s (%s)",
+ mergeReply.toString().c_str(),
+ mergeReply.getResult().getMessage().c_str());
+ }
+
+ processMergeReply(msg, true, msgGuard);
+
+ // Always send up original reply
+ msgGuard.sendUp(msg);
+}
+
+bool
+MergeThrottler::validateNewMerge(
+ const api::MergeBucketCommand& mergeCmd,
+ const MergeNodeSequence& nodeSeq,
+ MessageGuard& msgGuard) const
+{
+ bool valid = false;
+ vespalib::asciistream oss;
+
+ if (nodeSeq.isIndexUnknown()) {
+ // Sanity check failure! Merge has been sent to a node
+ // not in the node set somehow. Whine to the sender.
+ oss << mergeCmd.toString() << " sent to node "
+ << _component.getIndex()
+ << ", which is not in its forwarding chain";
+ LOG(error, "%s", oss.str().c_str());
+ } else if (mergeCmd.getChain().size()
+ >= nodeSeq.getSortedNodes().size())
+ {
+ // Chain is full but we haven't seen the merge! This means
+ // the node has probably gone down with a merge it previously
+ // forwarded only now coming back to haunt it.
+ oss << mergeCmd.toString()
+ << " is not in node's internal state, but has a "
+ << "full chain, meaning it cannot be forwarded.";
+ LOG(debug, "%s", oss.str().c_str());
+ } else if (nodeSeq.chainContainsIndex(nodeSeq.getThisNodeIndex())) {
+ oss << mergeCmd.toString()
+ << " is not in node's internal state, but contains "
+ << "this node in its non-full chain. This should not happen!";
+ LOG(error, "%s", oss.str().c_str());
+ } else {
+ valid = true;
+ }
+
+ if (!valid) {
+ sendReply(mergeCmd,
+ api::ReturnCode(api::ReturnCode::REJECTED, oss.str()),
+ msgGuard,
+ _metrics->local);
+ }
+ return valid;
+}
+
+void
+MergeThrottler::processNewMergeCommand(
+ const api::StorageMessage::SP& msg,
+ MessageGuard& msgGuard)
+{
+ const api::MergeBucketCommand& mergeCmd
+ = static_cast<const api::MergeBucketCommand&>(*msg);
+
+ MergeNodeSequence nodeSeq(mergeCmd, _component.getIndex());
+
+ if (!validateNewMerge(mergeCmd, nodeSeq, msgGuard)) {
+ return;
+ }
+
+ // Caller guarantees that there is no merge registered for this bucket yet
+ // and that we can fit it into our window.
+ // Register the merge now so that it will contribute to filling up our
+ // merge throttling window.
+ assert(_merges.find(mergeCmd.getBucketId()) == _merges.end());
+ ActiveMergeMap::iterator state = _merges.insert(
+ std::make_pair(mergeCmd.getBucketId(),
+ ChainedMergeState(msg))).first;
+
+ LOG(debug, "Added merge %s to internal state",
+ mergeCmd.toString().c_str());
+
+ DummyMbusMessage<mbus::Message> dummyMsg;
+ _throttlePolicy->processMessage(dummyMsg);
+
+ bool execute = false;
+
+ // If chain is empty and this node is not the lowest
+ // index in the nodeset, immediately execute. Required for
+ // backwards compatibility with older distributor versions.
+ if (mergeCmd.getChain().empty()
+ && (nodeSeq.getSortedNodes()[0].index != _component.getIndex()))
+ {
+ LOG(debug, "%s has empty chain and was sent to node that "
+ "is not the lowest in its node set. Assuming 4.2 distributor "
+ "source and performing merge.",
+ mergeCmd.toString().c_str());
+ execute = true;
+ } else {
+ if (!nodeSeq.isLastNode()) {
+ // When we're not the last node and haven't seen the merge before,
+ // we cannot possible execute the merge yet. Forward to next.
+ uint16_t nextNodeInChain = nodeSeq.getNextNodeInChain();
+ LOG(debug, "Forwarding merge %s to storage node %u",
+ mergeCmd.toString().c_str(), nextNodeInChain);
+
+ forwardCommandToNode(mergeCmd, nextNodeInChain, msgGuard);
+ } else if (!nodeSeq.isMergeExecutor()) {
+ // Last node, but not the merge executor. Send a final forward
+ // to the designated executor node.
+ LOG(debug, "%s: node is last in chain, but not merge executor; doing final "
+ "forwarding to node %u", mergeCmd.toString().c_str(),
+ nodeSeq.getExecutorNodeIndex());
+
+ forwardCommandToNode(
+ mergeCmd, nodeSeq.getExecutorNodeIndex(), msgGuard);
+ } else {
+ // We are the last node and the designated executor. Make it so!
+ // Send down to persistence layer, which will trigger the actual
+ // merge operation itself. A MergeBucketReply will be sent up the
+ // link once it has been completed
+ LOG(debug, "%s: node is last in the chain and designated merge "
+ "executor; performing merge", mergeCmd.toString().c_str());
+ execute = true;
+ }
+ }
+
+ // If execute == true, message will be propagated down
+ if (execute) {
+ state->second.setExecutingLocally(true); // Set as currently executing
+ // Relinquish ownership of this message. Otherwise, it would
+ // be owned by both the throttler and the persistence layer
+ state->second.setMergeCmd(api::StorageCommand::SP());
+ msgGuard.sendDown(msg);
+ }
+}
+
+bool
+MergeThrottler::processCycledMergeCommand(
+ const api::StorageMessage::SP& msg,
+ MessageGuard& msgGuard)
+{
+ // Since we've already got state registered for this merge, the case
+ // here is pretty simple: either we're the executor and the chain
+ // is completed, in which case we execute the merge, OR we're not, in
+ // which case it means a resend took place. In the latter case, we
+ // really have no option but to reject the command.
+ // Additionally, there is the case where a merge has been explicitly
+ // aborted, in which case we have to immediately send an abortion reply
+ // so the cycle can be unwound.
+
+ const api::MergeBucketCommand& mergeCmd
+ = static_cast<const api::MergeBucketCommand&>(*msg);
+
+ MergeNodeSequence nodeSeq(mergeCmd, _component.getIndex());
+
+ ActiveMergeMap::iterator mergeIter(
+ _merges.find(mergeCmd.getBucketId()));
+ assert(mergeIter != _merges.end());
+
+ if (mergeIter->second.isAborted()) {
+ LOG(debug,
+ "%s: received cycled merge where state indicates merge "
+ "has been aborted",
+ mergeCmd.toString().c_str());
+ sendReply(mergeCmd,
+ api::ReturnCode(api::ReturnCode::ABORTED, "merge marked as "
+ "aborted due to bucket ownership change"),
+ msgGuard,
+ _metrics->chaining);
+ return true;
+ }
+
+ // Have to check if merge is already executing to remove chance
+ // of resend from previous chain link to mess up our internal state
+ if (nodeSeq.isChainCompleted()
+ && !mergeIter->second.isExecutingLocally())
+ {
+ assert(mergeIter->second.getMergeCmd().get() != msg.get());
+
+ mergeIter->second.setExecutingLocally(true);
+ // Have to signal that we're in a cycle in order to do unwinding
+ mergeIter->second.setInCycle(true);
+ LOG(debug, "%s: received cycled merge command and this "
+ "node is the designated executor. Performing merge.",
+ mergeCmd.toString().c_str());
+
+ // Message should be sent down
+ msgGuard.sendDown(msg);
+ return false;
+ } else {
+ LOG(debug, "%s failed: already active merge for this bucket",
+ mergeCmd.toString().c_str());
+ // Send BUSY, as this is what the persistence layer does for this case
+ sendReply(mergeCmd, api::ReturnCode(api::ReturnCode::BUSY,
+ "Already active merge for this bucket"),
+ msgGuard, _metrics->chaining);
+ }
+
+ return true;
+}
+
+void
+MergeThrottler::processMergeReply(
+ const std::shared_ptr<api::StorageMessage>& msg,
+ bool fromPersistenceLayer,
+ MessageGuard& msgGuard)
+{
+ const api::MergeBucketReply& mergeReply
+ = dynamic_cast<const api::MergeBucketReply&>(*msg);
+
+ ActiveMergeMap::iterator mergeIter(
+ _merges.find(mergeReply.getBucketId()));
+ if (mergeIter == _merges.end()) {
+ LOG(warning, "Received %s, which has no command mapped "
+ "for it. Cannot send chained reply!",
+ mergeReply.toString().c_str());
+ return;
+ }
+
+ ChainedMergeState& mergeState = mergeIter->second;
+
+ if (fromPersistenceLayer) {
+ assert(mergeState.isExecutingLocally());
+ mergeState.setExecutingLocally(false);
+ mergeState.setUnwinding(true);
+
+ // If we've cycled around, do NOT remove merge entry yet, as it
+ // will be removed during the proper chain unwinding
+ if (mergeState.isInCycle()) {
+ assert(mergeState.getMergeCmd().get());
+ LOG(debug, "Not removing %s yet, since we're in a chain cycle",
+ mergeReply.toString().c_str());
+ // Next time we encounter the merge, however, it should be removed
+ mergeState.setInCycle(false);
+ return;
+ }
+ } else {
+ if (mergeState.isExecutingLocally()) {
+ assert(mergeState.getMergeCmd().get());
+ // If we get a reply for a merge that is not from the persistence layer
+ // although it's still being processed there, it means the cycle has
+ // been broken, e.g by a node going down/being restarted/etc.
+ // Both unwind reply as well as reply to original will be sent
+ // when we finally get a reply from the persistence layer
+ mergeState.setInCycle(false);
+ mergeState.setCycleBroken(true);
+ LOG(debug, "Got non-persistence reply for a %s which is currently "
+ "executing on this node; marking merge cycle as broken and replying "
+ "to both unwind and chain source once we get a reply from persistence",
+ mergeReply.toString().c_str());
+ return;
+ }
+ }
+
+ LOG(debug, "Found merge entry for %s, proceeding to unwind chain.",
+ mergeReply.toString().c_str());
+ // Send reply to the command associated with the merge, if requested.
+ // If we have received the reply from the persistence layer, we should
+ // not create a new reply since the one we got will already suffice
+ // for sending back to the previous link in the chain, UNLESS the
+ // cycle has been broken (see above), in which case we MUST send a reply
+ // immediately, or there will be merges forever stuck on nodes earlier
+ // in the chain
+ if (!fromPersistenceLayer || mergeState.isCycleBroken()) {
+ assert(mergeState.getMergeCmd().get());
+ if (!mergeState.isCycleBroken()) {
+ LOG(spam, "Creating new unwind reply to send back for %s",
+ mergeState.getMergeCmd()->toString().c_str());
+ } else {
+ assert(fromPersistenceLayer);
+ LOG(debug, "Creating new (broken cycle) unwind reply to send back for %s",
+ mergeState.getMergeCmd()->toString().c_str());
+ }
+
+ sendReply(static_cast<const api::MergeBucketCommand&>(
+ *mergeState.getMergeCmd()),
+ mergeReply.getResult(), msgGuard, _metrics->chaining);
+ } else {
+ LOG(spam, "Not creating new unwind reply; using existing "
+ "reply from persistence layer");
+ updateOperationMetrics(mergeReply.getResult(), _metrics->local);
+ }
+
+ DummyMbusMessage<mbus::Reply> dummyReply;
+ if (mergeReply.getResult().failed()) {
+ // Must be sure to add an error if reply contained a failure, since
+ // DynamicThrottlePolicy penalizes on failed transmissions
+ dummyReply.addError(mbus::Error(mergeReply.getResult().getResult(),
+ mergeReply.getResult().getMessage()));
+ }
+ _throttlePolicy->processReply(dummyReply);
+
+ // Remove merge now that we've done our part to unwind the chain
+ removeActiveMerge(mergeIter);
+ processQueuedMerges(msgGuard);
+}
+
+bool
+MergeThrottler::onSetSystemState(
+ const std::shared_ptr<api::SetSystemStateCommand>& stateCmd)
+{
+
+ LOG(debug,
+ "New cluster state arrived with version %u, flushing "
+ "all outdated queued merges",
+ stateCmd->getSystemState().getVersion());
+ handleOutdatedMerges(*stateCmd);
+
+ return false;
+}
+
+bool
+MergeThrottler::onDown(const std::shared_ptr<api::StorageMessage>& msg)
+{
+ if (isMergeCommand(*msg) || isMergeReply(*msg)) {
+ vespalib::MonitorGuard lock(_messageLock);
+ _messagesDown.push_back(msg);
+ lock.broadcast();
+ return true;
+ } else if (isDiffCommand(*msg)) {
+ vespalib::LockGuard lock(_stateLock);
+ api::StorageCommand& cmd(static_cast<api::StorageCommand&>(*msg));
+ if (bucketIsUnknownOrAborted(cmd.getBucketId())) {
+ sendUp(makeAbortReply(cmd, "no state recorded for bucket in merge "
+ "throttler, source merge probably aborted earlier"));
+ return true;
+ }
+ }
+ return StorageLink::onDown(msg);
+}
+
+bool
+MergeThrottler::isDiffCommand(const api::StorageMessage& msg) const
+{
+ return (msg.getType() == api::MessageType::GETBUCKETDIFF
+ || msg.getType() == api::MessageType::APPLYBUCKETDIFF);
+}
+
+bool
+MergeThrottler::isMergeCommand(const api::StorageMessage& msg) const
+{
+ return (msg.getType() == api::MessageType::MERGEBUCKET);
+}
+
+bool
+MergeThrottler::isMergeReply(const api::StorageMessage& msg) const
+{
+ return (msg.getType() == api::MessageType::MERGEBUCKET_REPLY);
+}
+
+bool
+MergeThrottler::bucketIsUnknownOrAborted(const document::BucketId& bucket) const
+{
+ ActiveMergeMap::const_iterator it(_merges.find(bucket));
+ if (it == _merges.end()) {
+ return true;
+ }
+ return it->second.isAborted();
+}
+
+std::shared_ptr<api::StorageMessage>
+MergeThrottler::makeAbortReply(api::StorageCommand& cmd,
+ vespalib::stringref reason) const
+{
+ LOG(debug, "Aborting message %s with reason '%s'",
+ cmd.toString().c_str(), reason.c_str());
+ std::unique_ptr<api::StorageReply> reply(cmd.makeReply());
+ reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED, reason));
+ return std::shared_ptr<api::StorageMessage>(reply.release());
+}
+
+bool
+MergeThrottler::onUp(const std::shared_ptr<api::StorageMessage>& msg)
+{
+ if (isMergeReply(*msg)) {
+ const api::MergeBucketReply& mergeReply
+ = dynamic_cast<const api::MergeBucketReply&>(*msg);
+
+ LOG(spam, "Received %s from persistence layer",
+ mergeReply.toString().c_str());
+
+ vespalib::MonitorGuard lock(_messageLock);
+ _messagesUp.push_back(msg);
+ lock.broadcast();
+ return true;
+ }
+ return false;
+}
+
+void
+MergeThrottler::rendezvousWithWorkerThread(vespalib::MonitorGuard& guard)
+{
+ LOG(spam, "establishing rendezvous with worker thread");
+ assert(_rendezvous == RENDEZVOUS_NONE);
+ _rendezvous = RENDEZVOUS_REQUESTED;
+ guard.broadcast();
+ while (_rendezvous != RENDEZVOUS_ESTABLISHED) {
+ guard.wait();
+ }
+ LOG(spam, "rendezvous established with worker thread");
+}
+
+void
+MergeThrottler::releaseWorkerThreadRendezvous(vespalib::MonitorGuard& guard)
+{
+ _rendezvous = RENDEZVOUS_RELEASED;
+ guard.broadcast();
+ while (_rendezvous != RENDEZVOUS_NONE) {
+ guard.wait();
+ }
+}
+
+class ThreadRendezvousGuard
+{
+ MergeThrottler& _throttler;
+ vespalib::MonitorGuard& _guard;
+public:
+ ThreadRendezvousGuard(MergeThrottler& throttler,
+ vespalib::MonitorGuard& guard)
+ : _throttler(throttler),
+ _guard(guard)
+ {
+ _throttler.rendezvousWithWorkerThread(_guard);
+ }
+
+ ~ThreadRendezvousGuard() {
+ _throttler.releaseWorkerThreadRendezvous(_guard);
+ }
+};
+
+void
+MergeThrottler::handleOutdatedMerges(const api::SetSystemStateCommand& cmd)
+{
+ // When aborting merges, we must--before allowing message to go
+ // through--ensure that there are no queued or active merges for
+ // any of the aborted buckets. We must also rendezvous with the
+ // worker thread to ensure it does not have any concurrent messages
+ // in flight that can slip by our radar.
+ // Ideally, we'd be able to just rely on the existing version check when
+ // receiving merges, but this uses the _server_ object's cluster state,
+ // which isn't set yet at the time we get the new state command, so
+ // there exists a time window where outdated merges can be accepted. Blarg!
+ vespalib::MonitorGuard guard(_messageLock);
+ ThreadRendezvousGuard rzGuard(*this, guard);
+
+ if (_closing) return; // Shutting down anyway.
+
+ // No other code than this function and onFlush() should ever take both the
+ // message monitor and state lock at the same time, and onFlush() should
+ // never be called unless _closing is true. So it's impossible for this to
+ // deadlock given these assumptions, despite using differing acquisition
+ // ordering.
+ try {
+ MessageGuard stateGuard(_stateLock, *this);
+
+ uint32_t minimumVersion = cmd.getSystemState().getVersion();
+ rejectOperationsInThreadQueue(stateGuard, minimumVersion);
+ rejectOutdatedQueuedMerges(stateGuard, minimumVersion);
+ markActiveMergesAsAborted(minimumVersion);
+ } catch (std::exception& e) {
+ LOG(error, "Received exception during merge aborting: %s", e.what());
+ abort();
+ }
+
+ // Rendezvous released on scope exit
+}
+
+void
+MergeThrottler::rejectOperationsInThreadQueue(
+ MessageGuard& guard,
+ uint32_t minimumStateVersion)
+{
+ std::vector<api::StorageMessage::SP> messagesToLetThrough;
+ for (uint32_t i = 0; i < _messagesDown.size(); ++i) {
+ api::StorageMessage::SP& msg(_messagesDown[i]);
+ if (isMergeCommand(*msg)
+ && rejectMergeIfOutdated(msg, minimumStateVersion, guard))
+ {
+ } else {
+ messagesToLetThrough.push_back(msg);
+ }
+ }
+ _messagesDown.swap(messagesToLetThrough);
+}
+
+void
+MergeThrottler::markActiveMergesAsAborted(uint32_t minimumStateVersion)
+{
+ // Since actually sending abort replies for the merges already chained
+ // would pretty seriously mess up the assumptions we've made in the
+ // rest of the code, merely mark the merges as aborted. This will ensure
+ // that no diff commands can get through for them and that cycled merges
+ // are cut short.
+ for (auto& activeMerge : _merges) {
+ if (activeMerge.second._clusterStateVersion < minimumStateVersion) {
+ LOG(spam,
+ "Marking merge state for bucket %s as aborted",
+ activeMerge.first.toString().c_str());
+ activeMerge.second.setAborted(true);
+ }
+ }
+}
+
+void
+MergeThrottler::print(std::ostream& out, bool /*verbose*/,
+ const std::string& /*indent*/) const
+{
+ out << "MergeThrottler";
+}
+
+void
+MergeThrottler::reportHtmlStatus(std::ostream& out,
+ const framework::HttpUrlPath&) const
+{
+ vespalib::LockGuard lock(_stateLock);
+ {
+ out << "<p>Max pending: "
+ << _throttlePolicy->getMaxPendingCount()
+ << "</p>\n";
+ out << "<p>Please see node metrics for performance numbers</p>\n";
+ out << "<h3>Active merges ("
+ << _merges.size()
+ << ")</h3>\n";
+ ActiveMergeMap::const_iterator end = _merges.end();
+ if (!_merges.empty()) {
+ out << "<ul>\n";
+ for (ActiveMergeMap::const_iterator i = _merges.begin(); i != end; ++i) {
+ out << "<li>" << i->second.getMergeCmdString();
+ if (i->second.isExecutingLocally()) {
+ out << " <strong>(";
+ if (i->second.isInCycle()) {
+ out << "cycled - ";
+ } else if (i->second.isCycleBroken()) {
+ out << "broken cycle (another node in the chain likely went down) - ";
+ }
+ out << "executing on this node)</strong>";
+ } else if (i->second.isUnwinding()) {
+ out << " <strong>(was executed here, now unwinding)</strong>";
+ }
+ if (i->second.isAborted()) {
+ out << " <strong>aborted</strong>";
+ }
+ out << "</li>\n";
+ }
+ out << "</ul>\n";
+ } else {
+ out << "<p>None</p>\n";
+ }
+ }
+
+ {
+ out << "<h3>Queued merges (in priority order) ("
+ << _queue.size()
+ << ")</h3>\n";
+ MergePriorityQueue::const_iterator end = _queue.end();
+ if (!_queue.empty()) {
+ out << "<ol>\n";
+ for (MergePriorityQueue::const_iterator i = _queue.begin(); i != end; ++i) {
+ // The queue always owns its messages, thus this is safe
+ out << "<li>Pri "
+ << static_cast<unsigned int>(i->_msg->getPriority())
+ << ": " << *i->_msg;
+ out << "</li>\n";
+ }
+ out << "</ol>\n";
+ } else {
+ out << "<p>None</p>\n";
+ }
+ }
+}
+
+} // namespace storage
diff --git a/storage/src/vespa/storage/storageserver/mergethrottler.h b/storage/src/vespa/storage/storageserver/mergethrottler.h
new file mode 100644
index 00000000000..c58c11d48e0
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/mergethrottler.h
@@ -0,0 +1,475 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::MergeThrottler
+ * @ingroup storageserver
+ *
+ * @brief Throttler and forwarder of merge commands
+ */
+#pragma once
+
+#include <map>
+#include <utility>
+#include <vector>
+#include <set>
+#include <memory>
+#include <vespa/vespalib/util/document_runnable.h>
+#include <vespa/storage/config/config-stor-server.h>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storage/distributor/messageguard.h>
+#include <vespa/storageapi/message/bucket.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/messagebus/staticthrottlepolicy.h>
+#include <vespa/metrics/metrics.h>
+#include <vespa/config/config.h>
+
+namespace storage {
+
+class AbortBucketOperationsCommand;
+
+class MergeThrottler : public framework::Runnable,
+ public StorageLink,
+ public framework::HtmlStatusReporter,
+ private config::IFetcherCallback<vespa::config::content::core::StorServerConfig>
+{
+public:
+ class MergeFailureMetrics : public metrics::MetricSet
+ {
+ public:
+ metrics::SumMetric<metrics::LongCountMetric> sum;
+ metrics::LongCountMetric notready;
+ metrics::LongCountMetric timeout;
+ metrics::LongCountMetric aborted;
+ metrics::LongCountMetric wrongdistribution;
+ metrics::LongCountMetric bucketnotfound;
+ metrics::LongCountMetric busy;
+ metrics::LongCountMetric exists;
+ metrics::LongCountMetric rejected;
+ metrics::LongCountMetric other;
+
+ MergeFailureMetrics(metrics::MetricSet* owner)
+ : metrics::MetricSet("failures", "", "Detailed failure statistics", owner),
+ sum("total", "", "Sum of all failures", this),
+ notready("notready", "", "The number of merges discarded "
+ "because distributor was not ready", this),
+ timeout("timeout", "", "The number of merges that failed because "
+ "they timed out towards storage", this),
+ aborted("aborted", "", "The number of merges that failed "
+ "because the storage node was (most likely) shutting down", this),
+ wrongdistribution("wrongdistribution", "", "The number of merges that "
+ "were discarded (flushed) because they were initiated at an "
+ "older cluster state than the current", this),
+ bucketnotfound("bucketnotfound", "", "The number of operations that failed "
+ "because the bucket did not exist", this),
+ busy("busy", "", "The number of merges that failed because the "
+ "storage node was busy", this),
+ exists("exists", "", "The number of merges that were rejected due to a "
+ "merge operation for their bucket already being processed", this),
+ rejected("rejected", "", "The number of merges that were rejected", this),
+ other("other", "", "The number of other failures", this)
+ {
+ sum.addMetricToSum(notready);
+ sum.addMetricToSum(timeout);
+ sum.addMetricToSum(aborted);
+ sum.addMetricToSum(wrongdistribution);
+ sum.addMetricToSum(bucketnotfound);
+ sum.addMetricToSum(busy);
+ sum.addMetricToSum(exists);
+ sum.addMetricToSum(rejected);
+ sum.addMetricToSum(other);
+ }
+ };
+
+ class MergeOperationMetrics : public metrics::MetricSet
+ {
+ public:
+ metrics::LongCountMetric ok;
+ MergeFailureMetrics failures;
+
+ MergeOperationMetrics(const std::string& name, metrics::MetricSet* owner)
+ : metrics::MetricSet(name, "", vespalib::make_string("Statistics for %s", name.c_str()), owner),
+ ok("ok", "", vespalib::make_string("The number of successful merges for '%s'", name.c_str()), this),
+ failures(this)
+ {
+ }
+ };
+
+ class Metrics : public metrics::MetricSet
+ {
+ public:
+ metrics::DoubleAverageMetric averageQueueWaitingTime;
+ MergeOperationMetrics chaining;
+ MergeOperationMetrics local;
+
+ Metrics(metrics::MetricSet* owner = 0)
+ : metrics::MetricSet("mergethrottler", "", "", owner),
+ averageQueueWaitingTime(
+ "averagequeuewaitingtime", "", "Average time a merge spends in "
+ "the throttler queue", this),
+ chaining("mergechains", this),
+ local("locallyexecutedmerges", this)
+ {
+ }
+ };
+
+private:
+ // TODO: make PQ with stable ordering into own, generic class
+ template <class MessageType>
+ struct StablePriorityOrderingWrapper
+ {
+ MessageType _msg;
+ metrics::MetricTimer _startTimer;
+ uint64_t _sequence;
+
+ StablePriorityOrderingWrapper(const MessageType& msg, uint64_t sequence)
+ : _msg(msg), _startTimer(), _sequence(sequence)
+ {
+ }
+
+ bool operator==(const StablePriorityOrderingWrapper& other) const {
+ return (*_msg == *other._msg
+ && _sequence == other._sequence);
+ }
+
+ bool operator<(const StablePriorityOrderingWrapper& other) const {
+ if (_msg->getPriority() < other._msg->getPriority()) {
+ return true;
+ }
+ return (_sequence < other._sequence);
+ }
+ };
+
+ struct ChainedMergeState
+ {
+ api::StorageMessage::SP _cmd;
+ std::string _cmdString; // For being able to print message even when we don't own it
+ uint64_t _clusterStateVersion;
+ bool _inCycle;
+ bool _executingLocally;
+ bool _unwinding;
+ bool _cycleBroken;
+ bool _aborted;
+
+ ChainedMergeState()
+ : _cmd(),
+ _cmdString(),
+ _clusterStateVersion(0),
+ _inCycle(false),
+ _executingLocally(false),
+ _unwinding(false),
+ _cycleBroken(false),
+ _aborted(false)
+ {
+ }
+
+ ChainedMergeState(const api::StorageMessage::SP& cmd, bool executing = false)
+ : _cmd(cmd),
+ _cmdString(cmd->toString()),
+ _clusterStateVersion(static_cast<const api::MergeBucketCommand&>(
+ *cmd).getClusterStateVersion()),
+ _inCycle(false),
+ _executingLocally(executing),
+ _unwinding(false),
+ _cycleBroken(false),
+ _aborted(false)
+ {
+ }
+ // Use default copy-constructor/assignment operator
+
+ bool isExecutingLocally() const { return _executingLocally; }
+ void setExecutingLocally(bool execLocally) { _executingLocally = execLocally; }
+
+ const api::StorageMessage::SP& getMergeCmd() const { return _cmd; }
+ void setMergeCmd(const api::StorageMessage::SP& cmd) {
+ _cmd = cmd;
+ if (cmd.get()) {
+ _cmdString = cmd->toString();
+ }
+ }
+
+ bool isInCycle() const { return _inCycle; }
+ void setInCycle(bool inCycle) { _inCycle = inCycle; }
+
+ bool isUnwinding() const { return _unwinding; }
+ void setUnwinding(bool unwinding) { _unwinding = unwinding; }
+
+ bool isCycleBroken() const { return _cycleBroken; }
+ void setCycleBroken(bool cycleBroken) { _cycleBroken = cycleBroken; }
+
+ bool isAborted() const { return _aborted; }
+ void setAborted(bool aborted) { _aborted = aborted; }
+
+ const std::string& getMergeCmdString() const { return _cmdString; }
+ };
+
+ typedef std::map<document::BucketId, ChainedMergeState> ActiveMergeMap;
+
+ // Use a set rather than a priority_queue, since we want to be
+ // able to iterate over the collection during status rendering
+ typedef std::set<
+ StablePriorityOrderingWrapper<api::StorageMessage::SP>
+ > MergePriorityQueue;
+
+ enum RendezvousState {
+ RENDEZVOUS_NONE,
+ RENDEZVOUS_REQUESTED,
+ RENDEZVOUS_ESTABLISHED,
+ RENDEZVOUS_RELEASED
+ };
+
+ ActiveMergeMap _merges;
+ MergePriorityQueue _queue;
+ std::size_t _maxQueueSize;
+ mbus::StaticThrottlePolicy::UP _throttlePolicy;
+ uint64_t _queueSequence; // TODO: move into a stable priority queue class
+ vespalib::Monitor _messageLock;
+ vespalib::Lock _stateLock;
+ config::ConfigFetcher _configFetcher;
+ // Messages pending to be processed by the worker thread
+ std::vector<api::StorageMessage::SP> _messagesDown;
+ std::vector<api::StorageMessage::SP> _messagesUp;
+ std::unique_ptr<Metrics> _metrics;
+ StorageComponent _component;
+ framework::Thread::UP _thread;
+ RendezvousState _rendezvous;
+ bool _closing;
+public:
+ /**
+ * windowSizeIncrement used for allowing unit tests to start out with more
+ * than 1 as their window size.
+ */
+ MergeThrottler(const config::ConfigUri & configUri,
+ StorageComponentRegister&);
+
+ ~MergeThrottler();
+
+ /** Implements document::Runnable::run */
+ void run(framework::ThreadHandle&);
+
+ void onOpen();
+ void onClose();
+ void onFlush(bool downwards);
+ bool onUp(const std::shared_ptr<api::StorageMessage>& msg);
+ bool onDown(const std::shared_ptr<api::StorageMessage>& msg);
+
+ bool onSetSystemState(const std::shared_ptr<api::SetSystemStateCommand>& stateCmd);
+
+ // For unit testing only
+ const ActiveMergeMap& getActiveMerges() const { return _merges; }
+ // For unit testing only
+ const MergePriorityQueue& getMergeQueue() const { return _queue; }
+ // For unit testing only
+ const mbus::StaticThrottlePolicy& getThrottlePolicy() const { return *_throttlePolicy; }
+ mbus::StaticThrottlePolicy& getThrottlePolicy() { return *_throttlePolicy; }
+ // For unit testing only
+ vespalib::Monitor& getMonitor() { return _messageLock; }
+ vespalib::Lock& getStateLock() { return _stateLock; }
+
+ Metrics& getMetrics() { return *_metrics; }
+
+ std::size_t getMaxQueueSize() const { return _maxQueueSize; }
+
+ void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ // HtmlStatusReporter implementation
+ void reportHtmlStatus(std::ostream&, const framework::HttpUrlPath&) const;
+private:
+ friend class ThreadRendezvousGuard; // impl in .cpp file
+
+ // Simple helper class for centralizing chaining logic
+ struct MergeNodeSequence
+ {
+ const api::MergeBucketCommand& _cmd;
+ std::vector<api::MergeBucketCommand::Node> _sortedNodes;
+ std::size_t _sortedIndex; // Index of current storage node in the sorted node sequence
+ const uint16_t _thisIndex; // Index of the current storage node
+
+ MergeNodeSequence(
+ const api::MergeBucketCommand& cmd,
+ uint16_t thisIndex);
+
+ std::size_t getSortedIndex() const { return _sortedIndex; }
+ const std::vector<api::MergeBucketCommand::Node>& getSortedNodes() const {
+ return _sortedNodes;
+ }
+ bool isIndexUnknown() const {
+ return (_sortedIndex == std::numeric_limits<std::size_t>::max());
+ }
+ /**
+ * This node is the merge executor if it's the first element in the
+ * _unsorted_ node sequence.
+ */
+ bool isMergeExecutor() const {
+ return (_cmd.getNodes()[0].index == _thisIndex);
+ }
+ uint16_t getExecutorNodeIndex() const{
+ return _cmd.getNodes()[0].index;
+ }
+ bool isLastNode() const {
+ return (_sortedIndex == _sortedNodes.size() - 1);
+ }
+ bool chainContainsIndex(uint16_t idx) const;
+ uint16_t getThisNodeIndex() const { return _thisIndex; }
+ /**
+ * Gets node to forward to in strictly increasing order.
+ */
+ uint16_t getNextNodeInChain() const;
+
+ /**
+ * Returns true iff the chain vector (which is implicitly sorted)
+ * pairwise compares equally to the vector of sorted node indices
+ */
+ bool isChainCompleted() const;
+ std::string getSequenceString() const;
+ };
+
+ /**
+ * Callback method for config system (IFetcherCallback)
+ */
+ void configure(std::unique_ptr<vespa::config::content::core::StorServerConfig> newConfig);
+
+ // NOTE: unless explicitly specified, all the below functions require
+ // _sync lock to be held upon call (usually implicitly via MessageGuard)
+
+ void handleMessageDown(
+ const std::shared_ptr<api::StorageMessage>& msg,
+ MessageGuard& msgGuard);
+
+ void handleMessageUp(
+ const std::shared_ptr<api::StorageMessage>& msg,
+ MessageGuard& msgGuard);
+
+ /**
+ * Handle the receival of MergeBucketReply, be it from another node
+ * or from the persistence layer on the current node itself. In the
+ * case of the former, fromPersistenceLayer must be false, since we have
+ * to generate a new reply to pass back to the unwind chain. In
+ * case of the latter, fromPersistenceLayer must be true since the
+ * reply from the persistence layer will be automatically sent
+ * back in the chain.
+ */
+ void processMergeReply(
+ const std::shared_ptr<api::StorageMessage>& msg,
+ bool fromPersistenceLayer,
+ MessageGuard& msgGuard);
+
+ /**
+ * Validate that the merge command is consistent with our current
+ * state.
+ * @return true if message is valid and may be further processed.
+ * If false is returned, a rejection reply will have been sent up
+ * on the message guard.
+ */
+ bool validateNewMerge(
+ const api::MergeBucketCommand& mergeCmd,
+ const MergeNodeSequence& nodeSeq,
+ MessageGuard& msgGuard) const;
+ /**
+ * Register a new merge bucket command with the internal state and
+ * either forward or execute it, depending on where the current node
+ * is located in the merge chain.
+ *
+ * Precondition: no existing merge state exists for msg's bucketid.
+ */
+ void processNewMergeCommand(
+ const api::StorageMessage::SP& msg,
+ MessageGuard& msgGuard);
+
+ /**
+ * Precondition: an existing merge state exists for msg's bucketid.
+ * @return true if message was handled, false otherwise (see onUp/onDown).
+ */
+ bool processCycledMergeCommand(
+ const api::StorageMessage::SP& msg,
+ MessageGuard& msgGuard);
+
+ /**
+ * Forwards the given MergeBucketCommand to the storage node given
+ * by nodeIndex. New forwarded message will inherit mergeCmd's priority.
+ * The current node's index will be added to the end of the merge
+ * chain vector.
+ */
+ void forwardCommandToNode(
+ const api::MergeBucketCommand& mergeCmd,
+ uint16_t nodeIndex,
+ MessageGuard& msgGuard);
+
+ void removeActiveMerge(ActiveMergeMap::iterator);
+
+ /**
+ * Gets (and pops) the highest priority merge waiting in the queue,
+ * if one exists.
+ * @return Highest priority waiting merge or null SP if queue is empty
+ */
+ api::StorageMessage::SP getNextQueuedMerge();
+
+ void enqueueMerge(
+ const api::StorageMessage::SP& msg,
+ MessageGuard& msgGuard);
+
+ /**
+ * @return true if throttle policy says at least one additional
+ * merge can be processed.
+ */
+ bool canProcessNewMerge() const;
+
+ void sendReply(const api::MergeBucketCommand& cmd,
+ const api::ReturnCode& result,
+ MessageGuard& msgGuard,
+ MergeOperationMetrics& metrics) const;
+
+ /**
+ * @return true if a merge for msg's bucketid is already registered
+ * in the internal merge throttler state.
+ */
+ bool isMergeAlreadyKnown(const api::StorageMessage::SP& msg) const;
+
+ bool rejectMergeIfOutdated(
+ const api::StorageMessage::SP& msg,
+ uint32_t rejectLessThanVersion,
+ MessageGuard& msgGuard) const;
+
+ /**
+ * Immediately reject all queued merges whose cluster state version is
+ * less than that of rejectLessThanVersion
+ */
+ void rejectOutdatedQueuedMerges(MessageGuard& msgGuard,
+ uint32_t rejectLessThanVersion);
+
+ bool attemptProcessNextQueuedMerge(MessageGuard& msgGuard);
+
+ bool processQueuedMerges(MessageGuard& msgGuard);
+
+ void handleRendezvous(vespalib::MonitorGuard& guard);
+
+ void rendezvousWithWorkerThread(vespalib::MonitorGuard&);
+
+ void releaseWorkerThreadRendezvous(vespalib::MonitorGuard&);
+
+ bool isDiffCommand(const api::StorageMessage& msg) const;
+
+ bool isMergeCommand(const api::StorageMessage& msg) const;
+
+ bool isMergeReply(const api::StorageMessage& msg) const;
+
+ bool bucketIsUnknownOrAborted(const document::BucketId& bucket) const;
+
+ std::shared_ptr<api::StorageMessage> makeAbortReply(
+ api::StorageCommand& cmd,
+ vespalib::stringref reason) const;
+
+ void handleOutdatedMerges(const api::SetSystemStateCommand&);
+ void rejectOperationsInThreadQueue(MessageGuard&,
+ uint32_t minimumStateVersion);
+ void markActiveMergesAsAborted(uint32_t minimumStateVersion);
+
+ // const function, but metrics are mutable
+ void updateOperationMetrics(
+ const api::ReturnCode& result,
+ MergeOperationMetrics& metrics) const;
+};
+
+} // namespace storage
+
diff --git a/storage/src/vespa/storage/storageserver/messageallocationtypes.cpp b/storage/src/vespa/storage/storageserver/messageallocationtypes.cpp
new file mode 100644
index 00000000000..ee98740c133
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/messageallocationtypes.cpp
@@ -0,0 +1,90 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/messageallocationtypes.h>
+
+#include <vespa/storageapi/messageapi/storagemessage.h>
+
+namespace storage {
+
+MessageAllocationTypes::MessageAllocationTypes(framework::MemoryManagerInterface& manager)
+{
+ using api::MessageType;
+ using framework::MemoryAllocationType;
+
+ _types.resize(MessageType::MESSAGETYPE_MAX_ID);
+ _types[MessageType::DOCBLOCK_ID] = &manager.registerAllocationType(MemoryAllocationType("MESSAGE_DOCBLOCK"));
+ _types[MessageType::DOCBLOCK_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MESSAGE_DOCBLOCK_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::GET_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::GET", framework::MemoryAllocationType::EXTERNAL_LOAD));
+ _types[MessageType::GET_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::GET_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::INTERNAL_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::INTERNAL"));
+ _types[MessageType::INTERNAL_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::INTERNAL_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::PUT_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::PUT", framework::MemoryAllocationType::EXTERNAL_LOAD));
+ _types[MessageType::PUT_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::PUT_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::REMOVE_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::REMOVE", framework::MemoryAllocationType::EXTERNAL_LOAD));
+ _types[MessageType::REMOVE_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::REMOVE_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::REVERT_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::REVERT"));
+ _types[MessageType::REVERT_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::REVERT_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::VISITOR_CREATE_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::VISITOR_CREATE", framework::MemoryAllocationType::EXTERNAL_LOAD));
+ _types[MessageType::VISITOR_CREATE_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::VISITOR_CREATE_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::VISITOR_DESTROY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::VISITOR_DESTROY"));
+ _types[MessageType::VISITOR_DESTROY_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::VISITOR_DESTROY_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::REQUESTBUCKETINFO_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::REQUESTBUCKETINFO"));
+ _types[MessageType::REQUESTBUCKETINFO_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::REQUESTBUCKETINFO_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::NOTIFYBUCKETCHANGE_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::NOTIFYBUCKETCHANGE"));
+ _types[MessageType::NOTIFYBUCKETCHANGE_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::NOTIFYBUCKETCHANGE_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::CREATEBUCKET_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::CREATEBUCKET"));
+ _types[MessageType::CREATEBUCKET_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::CREATEBUCKET_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::MERGEBUCKET_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::MERGEBUCKET"));
+ _types[MessageType::MERGEBUCKET_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::MERGEBUCKET_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::DELETEBUCKET_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::DELETEBUCKET"));
+ _types[MessageType::DELETEBUCKET_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::DELETEBUCKET_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::SETNODESTATE_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::SETNODESTATE", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::SETNODESTATE_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::SETNODESTATE_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::GETNODESTATE_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::GETNODESTATE", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::GETNODESTATE_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::GETNODESTATE_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::SETSYSTEMSTATE_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::SETSYSTEMSTATE", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::SETSYSTEMSTATE_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::SETSYSTEMSTATE_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::GETSYSTEMSTATE_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::GETSYSTEMSTATE", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::GETSYSTEMSTATE_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::GETSYSTEMSTATE_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::GETBUCKETDIFF_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::GETBUCKETDIFF", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::GETBUCKETDIFF_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::GETBUCKETDIFF_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::APPLYBUCKETDIFF_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::APPLYBUCKETDIFF", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::APPLYBUCKETDIFF_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::APPLYBUCKETDIFF_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::VISITOR_INFO_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::VISITOR_INFO"));
+ _types[MessageType::VISITOR_INFO_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::VISITOR_INFO_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::SEARCHRESULT_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::SEARCHRESULT"));
+ _types[MessageType::SEARCHRESULT_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::SEARCHRESULT_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::SPLITBUCKET_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::SPLITBUCKET"));
+ _types[MessageType::SPLITBUCKET_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::SPLITBUCKET_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::JOINBUCKETS_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::JOINBUCKETS"));
+ _types[MessageType::JOINBUCKETS_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::JOINBUCKETS_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::SETBUCKETSTATE_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::SETBUCKETSTATE"));
+ _types[MessageType::SETBUCKETSTATE_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::SETBUCKETSTATE_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::MULTIOPERATION_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::MULTIOPERATION", framework::MemoryAllocationType::EXTERNAL_LOAD));
+ _types[MessageType::MULTIOPERATION_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::MULTIOPERATION_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::DOCUMENTSUMMARY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::DOCUMENTSUMMARY"));
+ _types[MessageType::DOCUMENTSUMMARY_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::DOCUMENTSUMMARY_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::MAPVISITOR_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::MAPVISITOR"));
+ _types[MessageType::MAPVISITOR_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::MAPVISITOR_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::STATBUCKET_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::STATBUCKET", framework::MemoryAllocationType::EXTERNAL_LOAD));
+ _types[MessageType::STATBUCKET_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::STATBUCKET_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::GETBUCKETLIST_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::GETBUCKETLIST", framework::MemoryAllocationType::EXTERNAL_LOAD));
+ _types[MessageType::GETBUCKETLIST_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::GETBUCKETLIST_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::DOCUMENTLIST_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::DOCUMENTLIST"));
+ _types[MessageType::DOCUMENTLIST_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::DOCUMENTLIST_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::UPDATE_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::UPDATE", framework::MemoryAllocationType::EXTERNAL_LOAD));
+ _types[MessageType::UPDATE_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::UPDATE_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::EMPTYBUCKETS_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::EMPTYBUCKETS"));
+ _types[MessageType::EMPTYBUCKETS_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::EMPTYBUCKETS_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::REMOVELOCATION_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::REMOVELOCATION", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::REMOVELOCATION_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::REMOVELOCATION_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::QUERYRESULT_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::QUERYRESULT"));
+ _types[MessageType::QUERYRESULT_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::QUERYRESULT_REPLY", framework::MemoryAllocationType::FORCE_ALLOCATE));
+ _types[MessageType::BATCHPUTREMOVE_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::BATCHPUTREMOVE", framework::MemoryAllocationType::EXTERNAL_LOAD));
+ _types[MessageType::BATCHPUTREMOVE_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::BATCHPUTREMOVE_REPLY", framework::MemoryAllocationType::EXTERNAL_LOAD));
+ _types[MessageType::BATCHDOCUMENTUPDATE_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::BATCHDOCUMENTUPDATE", framework::MemoryAllocationType::EXTERNAL_LOAD));
+ _types[MessageType::BATCHDOCUMENTUPDATE_REPLY_ID] = &manager.registerAllocationType(MemoryAllocationType("MessageType::BATCHDOCUMENTUPDATE_REPLY", framework::MemoryAllocationType::EXTERNAL_LOAD));
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/messageallocationtypes.h b/storage/src/vespa/storage/storageserver/messageallocationtypes.h
new file mode 100644
index 00000000000..3e9f15c2458
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/messageallocationtypes.h
@@ -0,0 +1,33 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::MessageAllocationTypes
+ *
+ * \brief Memory allocation types for messages in storage.
+ */
+#pragma once
+
+#include <sstream>
+#include <vespa/storageframework/generic/memory/memorymanagerinterface.h>
+#include <vector>
+#include <vespa/vespalib/util/exceptions.h>
+
+namespace storage {
+
+class MessageAllocationTypes {
+ std::vector<const framework::MemoryAllocationType*> _types;
+
+public:
+ MessageAllocationTypes(framework::MemoryManagerInterface& manager);
+
+ const framework::MemoryAllocationType& getType(uint32_t type) const {
+ if (_types.size() > size_t(type) && _types[type] != 0) {
+ return *_types[type];
+ }
+ std::ostringstream ost;
+ ost << "No type registered with value " << type << ".";
+ throw vespalib::IllegalArgumentException(ost.str(), VESPA_STRLOC);
+ }
+};
+
+}
+
diff --git a/storage/src/vespa/storage/storageserver/messagedispatcher.cpp b/storage/src/vespa/storage/storageserver/messagedispatcher.cpp
new file mode 100644
index 00000000000..a35b2c4157d
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/messagedispatcher.cpp
@@ -0,0 +1,234 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+/* $Id$ */
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/messagedispatcher.h>
+
+#include <vespa/log/log.h>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/storageapi/message/state.h>
+#include <storageapi/messageapi/chainedcommand.h>
+#include <storageapi/messageapi/chainedreply.h>
+
+LOG_SETUP(".message.dispatcher");
+
+using std::shared_ptr;
+
+namespace storage {
+
+MessageDispatcher::MessageDispatcher(StorageServerInterface& server)
+ : StorageLink(),
+ _access(),
+ _cache(),
+ _systemState(""),
+ _server(server)
+{
+}
+
+MessageDispatcher::~MessageDispatcher()
+{
+ closeNextLink();
+ LOG(debug, "Deleting link %s.", toString().c_str());
+}
+
+void
+MessageDispatcher::onClose()
+{
+ vespalib::LockGuard lock(_access);
+ for (std::map<api::StorageMessage::Id, std::shared_ptr<ReplyPair> >
+ ::iterator it = _cache.begin(); it != _cache.end(); ++it)
+ {
+ std::shared_ptr<api::ChainedReply> reply(it->second->first);
+ if (it->second->second != 0) {
+ reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED,
+ "Storage node closing down. Aborting command."));
+ sendUp(reply);
+ it->second->second = 0;
+ }
+ }
+
+}
+
+void
+MessageDispatcher::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "MessageDispatcher()";
+}
+
+bool MessageDispatcher::onDown(const shared_ptr<api::StorageMessage> & msg)
+{
+ if (msg->getType().isReply()) {
+ shared_ptr<api::ChainedReply> reply(
+ std::dynamic_pointer_cast<api::ChainedReply>(msg));
+ if (reply.get()) {
+ return handleReply(reply, false);
+ }
+ } else {
+ shared_ptr<api::ChainedCommand> cmd(
+ std::dynamic_pointer_cast<api::ChainedCommand>(msg));
+ if (cmd.get()) {
+ return handleCommand(cmd);
+ }
+ if (msg->getType() == api::MessageType::SETSYSTEMSTATE) {
+ shared_ptr<api::SetSystemStateCommand> stateCmd(
+ std::dynamic_pointer_cast<api::SetSystemStateCommand>(
+ msg));
+ assert(stateCmd.get());
+ _systemState = stateCmd->getSystemState();
+ LOG(debug, "Got new distributor state %s.",
+ _systemState.toString().c_str());
+ }
+ }
+ return false;
+}
+
+bool MessageDispatcher::onUp(const std::shared_ptr<api::StorageMessage> & msg)
+{
+ if (msg->getType().isReply()) {
+ shared_ptr<api::ChainedReply> reply(
+ std::dynamic_pointer_cast<api::ChainedReply>(msg));
+ if (reply.get()) {
+ return handleReply(reply, true);
+ }
+ }
+ return false;
+}
+
+bool MessageDispatcher::
+handleCommand(const std::shared_ptr<api::ChainedCommand> & cmd)
+{
+ // If we're the first node in the chain,
+ // the message has a bucket id related to it,
+ // and message came from wrong distributor, fail the message.
+ uint16_t expectedNode = 0xFFFF;
+ if (cmd->getSourceIndex() != 0xFFFF &&
+ cmd->hasBucketId() &&
+ !isCorrectDistributor(cmd->getBucketId(), cmd->getSourceIndex(),
+ expectedNode))
+ {
+ std::string msg;
+
+ if (expectedNode != 0xFFFF) {
+ msg = vespalib::make_string(
+ "Got chained command %s with bucket id %s from distributor "
+ "%d, which is wrong given our state. Correct should be %d. "
+ "Ignoring since we're primary node.",
+ cmd->getType().getName().c_str(),
+ cmd->getBucketId().toString().c_str(),
+ cmd->getSourceIndex(),
+ expectedNode);
+ } else {
+ msg = vespalib::make_string(
+ "Got chained command %s with bucket id %s, but no "
+ "distributors in system state. Haven't received system "
+ "state yet?",
+ cmd->getType().getName().c_str(),
+ cmd->getBucketId().toString().c_str());
+ }
+
+ LOG(debug, msg.c_str());
+ shared_ptr<api::StorageReply> reply(cmd->makeReply().release());
+ reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED, msg));
+ sendUp(reply);
+ return true;
+
+ }
+ // If not used chained, just pass it through
+ if (!cmd->hasNodes()) {
+ LOG(spam, "Chained command contains no nodes, passing it through");
+ return false;
+ }
+ bool runLocally = cmd->getNodes().back()._run;
+ // If last node in chain, handle directly
+ if (cmd->getNodeCount() == 1) {
+ if (runLocally) {
+ LOG(spam, "Last node in chain, running it locally.");
+ return false;
+ } else {
+ LOG(spam, "Last node in chain, not running locally, so returning.");
+ shared_ptr<api::StorageReply> reply(cmd->makeReply().release());
+ sendUp(reply);
+ return true;
+ }
+ }
+ // Create commands first, as we need ids for cache.
+ shared_ptr<api::ChainedCommand> extCmd(cmd->clone());
+ shared_ptr<api::ChainedCommand> localCmd(runLocally ? cmd->clone() : 0);
+
+ // When stuff in cache, to be sure it's there when reply comes.
+ shared_ptr<api::ChainedReply> reply(dynamic_cast<api::ChainedReply*>(
+ cmd->makeReply().release()));
+ assert(reply.get());
+ {
+ vespalib::LockGuard lock(_access);
+ shared_ptr<ReplyPair> pair(new ReplyPair(reply, runLocally ? 2 : 1));
+ _cache[extCmd->getMsgId()] = pair;
+ if (localCmd.get()) {
+ _cache[localCmd->getMsgId()] = pair;
+ }
+ }
+ // Send external first since it will probably use the most time
+ extCmd->setSourceIndex(0xFFFF);
+ extCmd->getNodes().pop_back();
+ extCmd->setAddress(api::ServerAddress(_server.getClusterName(), "storage", extCmd->getNodes().back()._node));
+
+ LOG(spam, "Sending chained command on to node %d.",
+ extCmd->getNodes().back()._node);
+ sendUp(extCmd);
+ // Send internal copy if run locally flag is set
+ if (runLocally) {
+ LOG(spam, "Running chained command locally.");
+ localCmd->setSourceIndex(0xFFFF);
+ sendDown(localCmd);
+ }
+ return true;
+}
+
+bool
+MessageDispatcher::handleReply(
+ const std::shared_ptr<api::ChainedReply>& reply, bool localSource)
+{
+ // Ignore replies on their way up in the storage chain, with a
+ // destination object set. These are replies on commands not sent
+ // locally, thus not replies possibly for the message dispatcher.
+ if (localSource && !reply->isLocal()) return false;
+
+ vespalib::LockGuard lock(_access);
+ std::map<api::StorageMessage::Id, shared_ptr<ReplyPair> >::iterator it
+ = _cache.find(reply->getMsgId());
+ if (it == _cache.end()) {
+ return false; // Not for us
+ }
+ if (it->second.get() == 0) {
+ LOG(debug, "Reply already sent back (probably due to shutdown)");
+ return true; // Already sent
+ }
+ bool lastReply = (--it->second->second == 0);
+ if (!lastReply || localSource) {
+ it->second->first->appendState(*reply);
+ } else {
+ it->second->first->prependState(*reply);
+ }
+ if (lastReply) {
+ LOG(spam, "Last chained reply retrieved, sending original reply.");
+ sendUp(it->second->first);
+ } else {
+ LOG(spam, "Got chained reply, waiting for next");
+ }
+ _cache.erase(it);
+ return true;
+}
+
+bool
+MessageDispatcher::isCorrectDistributor(
+ const document::BucketId& id, uint16_t distributor, uint16_t& expected)
+{
+ std::vector<uint16_t> distributors;
+ (id).getIdealNodes(lib::NodeType::DISTRIBUTOR, _systemState, _server.getBucketIdFactory(), distributors);
+ return (distributors.size() > 0 && (expected = distributors[0]) == distributor);
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/messagedispatcher.h b/storage/src/vespa/storage/storageserver/messagedispatcher.h
new file mode 100644
index 00000000000..637356af3a4
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/messagedispatcher.h
@@ -0,0 +1,75 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::MessageDispatcher
+ * @ingroup storageserver
+ *
+ * @brief Sends messages through to multiple hosts.
+ *
+ * In VDS, some messages are sent to the first storage node, and the node itself
+ * should send the request on to another storage node and so on (put/remove).
+ * This link is responsible for receiving such messages, and send it through to
+ * next host, as well as through to the local host, wait for both responses and
+ * reply back. If one of the responses fails, it should issue a revert command.
+ *
+ * @author H�kon Humberset
+ * @date 2006-01-16
+ * @version $Id$
+ */
+
+#pragma once
+
+#include <vespa/vespalib/util/sync.h>
+#include <map>
+#include <vdslib/state/systemstate.h>
+#include <vespa/storage/common/storagelink.h>
+
+namespace storage {
+namespace api {
+ class BucketId;
+ class ChainedCommand;
+ class ChainedReply;
+}
+
+class MessageDispatcher : public StorageLink {
+ mutable vespalib::Lock _access;
+ typedef std::pair<std::shared_ptr<api::ChainedReply>, uint32_t> ReplyPair;
+ std::map<api::StorageMessage::Id, std::shared_ptr<ReplyPair> > _cache;
+ lib::ClusterState _systemState;
+ StorageServerInterface& _server;
+
+public:
+ explicit MessageDispatcher(StorageServerInterface& server);
+ ~MessageDispatcher();
+
+ virtual void onClose();
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ class Factory : public StorageLink::Factory {
+ public:
+ std::unique_ptr<StorageLink> create(const std::string& configId,
+ StorageServerInterface& server) const
+ {
+ (void) configId;
+ return std::unique_ptr<StorageLink>(new MessageDispatcher(server));
+ }
+ };
+
+private:
+
+ bool onDown(const std::shared_ptr<api::StorageMessage> & msg);
+ bool onUp(const std::shared_ptr<api::StorageMessage> & msg);
+
+ bool handleCommand(const std::shared_ptr<api::ChainedCommand>& cmd);
+ bool handleReply(const std::shared_ptr<api::ChainedReply>& reply,
+ bool localSource);
+
+ bool isCorrectDistributor(const document::BucketId& id, uint16_t distributor,
+ uint16_t& expected);
+
+};
+
+} // storage
+
+
diff --git a/storage/src/vespa/storage/storageserver/messagesink.cpp b/storage/src/vespa/storage/storageserver/messagesink.cpp
new file mode 100644
index 00000000000..617b81102c4
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/messagesink.cpp
@@ -0,0 +1,87 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/messagesink.h>
+
+#include <vespa/storageapi/message/persistence.h>
+#include <vespa/storage/storageutil/log.h>
+
+LOG_SETUP(".message.sink");
+
+using std::shared_ptr;
+
+namespace storage {
+
+MessageSink::MessageSink()
+ : StorageLink("Message Sink")
+{
+}
+
+MessageSink::~MessageSink()
+{
+ closeNextLink();
+}
+
+void
+MessageSink::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "MessageSink";
+}
+
+namespace {
+#if 0
+ std::string getTimeString() {
+ char timeBuf[200];
+ time_t tm;
+ struct tm tms;
+ time(&tm);
+ gmtime_r(&tm, &tms);
+ strftime(timeBuf, sizeof(timeBuf), "%Y-%m-%d:%H:%M:%S %Z", &tms);
+ return std::string(timeBuf);
+ }
+#endif
+}
+
+IMPL_MSG_COMMAND_H(MessageSink, Get)
+{
+ //LOG(event, "[%s] Get %s", getTimeString().c_str(),
+ // cmd->getDocumentId()->toString());
+ shared_ptr<api::StorageReply> rmsg(new api::GetReply(*cmd));
+ rmsg->setResult(api::ReturnCode::NOT_IMPLEMENTED);
+ sendUp(rmsg);
+ return true;
+}
+
+IMPL_MSG_COMMAND_H(MessageSink, Put)
+{
+ //LOG(event, "[%s] Put %s", getTimeString().c_str(),
+ // cmd->getDocumentId()->toString());
+ shared_ptr<api::StorageReply> rmsg(new api::PutReply(*cmd));
+ rmsg->setResult(api::ReturnCode::OK);
+ sendUp(rmsg);
+ return true;
+}
+
+IMPL_MSG_COMMAND_H(MessageSink, Remove)
+{
+ //LOG(event, "[%s] Remove %s", getTimeString().c_str(),
+ // cmd->getDocumentId()->toString());
+ shared_ptr<api::StorageReply> rmsg(new api::RemoveReply(*cmd));
+ rmsg->setResult(api::ReturnCode::OK);
+ sendUp(rmsg);
+ return true;
+}
+
+IMPL_MSG_COMMAND_H(MessageSink, Revert)
+{
+ //LOG(event, "[%s] Revert %s", getTimeString().c_str(),
+ // cmd->getDocumentId()->toString());
+ shared_ptr<api::StorageReply> rmsg(new api::RevertReply(*cmd));
+ rmsg->setResult(api::ReturnCode::OK);
+ sendUp(rmsg);
+ return true;
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/messagesink.h b/storage/src/vespa/storage/storageserver/messagesink.h
new file mode 100644
index 00000000000..80a39e19538
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/messagesink.h
@@ -0,0 +1,38 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::MessageSink
+ * @ingroup storageserver
+ *
+ * @brief This class grabs persistence messages, and answers them without doing anything.
+ *
+ * @version $Id$
+ */
+
+#pragma once
+
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storage/common/storagelink.h>
+
+namespace storage {
+
+class MessageSink : public StorageLink {
+private:
+ MessageSink(const MessageSink &);
+ MessageSink& operator=(const MessageSink &);
+
+public:
+ explicit MessageSink();
+ ~MessageSink();
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+private:
+ DEF_MSG_COMMAND_H(Get);
+ DEF_MSG_COMMAND_H(Put);
+ DEF_MSG_COMMAND_H(Remove);
+ DEF_MSG_COMMAND_H(Revert);
+};
+
+}
+
diff --git a/storage/src/vespa/storage/storageserver/opslogger.cpp b/storage/src/vespa/storage/storageserver/opslogger.cpp
new file mode 100644
index 00000000000..b5d66c837c5
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/opslogger.cpp
@@ -0,0 +1,142 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/opslogger.h>
+
+#include <vespa/log/log.h>
+#include <vespa/storageapi/message/persistence.h>
+
+LOG_SETUP(".operationslogger");
+
+namespace storage {
+
+OpsLogger::OpsLogger(StorageComponentRegister& compReg,
+ const config::ConfigUri & configUri)
+ : StorageLink("Operations logger"),
+ _lock(),
+ _fileName(),
+ _targetFile(0),
+ _component(compReg, "opslogger"),
+ _configFetcher(configUri.getContext())
+{
+ _configFetcher.subscribe<vespa::config::content::core::StorOpsloggerConfig>(configUri.getConfigId(), this);
+ _configFetcher.start();
+}
+
+OpsLogger::~OpsLogger()
+{
+ closeNextLink();
+ LOG(debug, "Deleting link %s.", toString().c_str());
+
+ if (_targetFile) {
+ fclose(_targetFile);
+ }
+}
+
+void
+OpsLogger::onClose()
+{
+ // Avoid getting config during shutdown
+ _configFetcher.close();
+}
+
+void
+OpsLogger::configure(std::unique_ptr<vespa::config::content::core::StorOpsloggerConfig> config)
+{
+ vespalib::LockGuard lock(_lock);
+ // If no change in state, ignore
+ if (config->targetfile == _fileName) return;
+ // If a change we need to close old handle if open
+ if (_targetFile != 0) {
+ fclose(_targetFile);
+ _targetFile = 0;
+ }
+ // Set up the new operations log file
+ _fileName = config->targetfile;
+ if (_fileName.length() > 0) {
+ _targetFile = fopen(_fileName.c_str(), "a+b");
+
+ if (!_targetFile) {
+ LOG(warning, "Could not open file %s for operations logging",
+ _fileName.c_str());
+ }
+ }
+}
+
+void
+OpsLogger::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "OpsLogger()";
+}
+
+bool
+OpsLogger::onPutReply(const std::shared_ptr<api::PutReply>& msg)
+{
+ if (_targetFile == 0) return false;
+ std::ostringstream ost;
+ ost << _component.getClock().getTimeInSeconds().getTime()
+ << "\tPUT\t" << msg->getDocumentId() << "\t"
+ << msg->getResult().toString() << "\n";
+ {
+ vespalib::LockGuard lock(_lock);
+ if (_targetFile == 0) return false;
+ fwrite(ost.str().c_str(), ost.str().length(), 1, _targetFile);
+ fflush(_targetFile);
+ }
+ return false;
+}
+
+bool
+OpsLogger::onUpdateReply(const std::shared_ptr<api::UpdateReply>& msg)
+{
+ if (_targetFile == 0) return false;
+ std::ostringstream ost;
+ ost << _component.getClock().getTimeInSeconds().getTime()
+ << "\tUPDATE\t" << msg->getDocumentId() << "\t"
+ << msg->getResult().toString() << "\n";
+ {
+ vespalib::LockGuard lock(_lock);
+ if (_targetFile == 0) return false;
+ fwrite(ost.str().c_str(), ost.str().length(), 1, _targetFile);
+ fflush(_targetFile);
+ }
+ return false;
+}
+
+bool
+OpsLogger::onRemoveReply(const std::shared_ptr<api::RemoveReply>& msg)
+{
+ if (_targetFile == 0) return false;
+ std::ostringstream ost;
+ ost << _component.getClock().getTimeInSeconds().getTime()
+ << "\tREMOVE\t" << msg->getDocumentId() << "\t"
+ << msg->getResult().toString() << "\n";
+ {
+ vespalib::LockGuard lock(_lock);
+ if (_targetFile == 0) return false;
+ fwrite(ost.str().c_str(), ost.str().length(), 1, _targetFile);
+ fflush(_targetFile);
+ }
+ return false;
+}
+
+bool
+OpsLogger::onGetReply(const std::shared_ptr<api::GetReply>& msg)
+{
+ if (_targetFile == 0) return false;
+ std::ostringstream ost;
+ ost << _component.getClock().getTimeInSeconds().getTime()
+ << "\tGET\t" << msg->getDocumentId() << "\t"
+ << msg->getResult().toString() << "\n";
+ {
+ vespalib::LockGuard lock(_lock);
+ if (_targetFile == 0) return false;
+ fwrite(ost.str().c_str(), ost.str().length(), 1, _targetFile);
+ fflush(_targetFile);
+ }
+ return false;
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/opslogger.h b/storage/src/vespa/storage/storageserver/opslogger.h
new file mode 100644
index 00000000000..6f6b52c4607
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/opslogger.h
@@ -0,0 +1,55 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+/**
+ * \class storage::OpsLogger
+ *
+ * \brief Storage link that can be configured to log all storage operations to
+ * a file.
+*/
+#pragma once
+
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storageframework/storageframework.h>
+
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storage/config/config-stor-opslogger.h>
+#include <vespa/vespalib/util/sync.h>
+#include <vespa/config/config.h>
+
+namespace storage {
+
+class OpsLogger : public StorageLink,
+ public config::IFetcherCallback<vespa::config::content::core::StorOpsloggerConfig> {
+public:
+ explicit OpsLogger(StorageComponentRegister&,
+ const config::ConfigUri & configUri);
+ ~OpsLogger();
+
+ void onClose();
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ bool onPutReply(const std::shared_ptr<api::PutReply>& msg);
+ bool onUpdateReply(const std::shared_ptr<api::UpdateReply>& msg);
+ bool onRemoveReply(const std::shared_ptr<api::RemoveReply>& msg);
+ bool onGetReply(const std::shared_ptr<api::GetReply>& msg);
+
+ /** Ignore all replies on the way down the storage chain. */
+ bool onDown(const std::shared_ptr<api::StorageMessage>&)
+ { return false; };
+
+ void configure(std::unique_ptr<vespa::config::content::core::StorOpsloggerConfig> config);
+
+private:
+ vespalib::Lock _lock;
+ std::string _fileName;
+ FILE* _targetFile;
+ framework::Component _component;
+
+ config::ConfigFetcher _configFetcher;
+};
+
+}
+
diff --git a/storage/src/vespa/storage/storageserver/priorityconverter.cpp b/storage/src/vespa/storage/storageserver/priorityconverter.cpp
new file mode 100644
index 00000000000..607dac95cb3
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/priorityconverter.cpp
@@ -0,0 +1,81 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/documentapi/messagebus/documentprotocol.h>
+#include "priorityconverter.h"
+
+namespace storage {
+
+PriorityConverter::PriorityConverter(const config::ConfigUri & configUri)
+ : _configFetcher(configUri.getContext())
+{
+ _configFetcher.subscribe<vespa::config::content::core::StorPrioritymappingConfig>(configUri.getConfigId(), this);
+ _configFetcher.start();
+}
+
+uint8_t
+PriorityConverter::toStoragePriority(documentapi::Priority::Value documentApiPriority) const
+{
+ const uint32_t index(static_cast<uint32_t>(documentApiPriority));
+ if (index >= PRI_ENUM_SIZE) {
+ return 255;
+ }
+
+ return _mapping[index];
+}
+
+documentapi::Priority::Value
+PriorityConverter::toDocumentPriority(uint8_t storagePriority) const
+{
+ vespalib::LockGuard guard(_mutex);
+ std::map<uint8_t, documentapi::Priority::Value>::const_iterator iter =
+ _reverseMapping.lower_bound(storagePriority);
+
+ if (iter != _reverseMapping.end()) {
+ return iter->second;
+ }
+
+ return documentapi::Priority::PRI_LOWEST;
+}
+
+void
+PriorityConverter::configure(std::unique_ptr<vespa::config::content::core::StorPrioritymappingConfig> config)
+{
+ // Data race free; _mapping is an array of std::atomic.
+ _mapping[documentapi::Priority::PRI_HIGHEST] = config->highest;
+ _mapping[documentapi::Priority::PRI_VERY_HIGH] = config->veryHigh;
+ _mapping[documentapi::Priority::PRI_HIGH_1] = config->high1;
+ _mapping[documentapi::Priority::PRI_HIGH_2] = config->high2;
+ _mapping[documentapi::Priority::PRI_HIGH_3] = config->high3;
+ _mapping[documentapi::Priority::PRI_NORMAL_1] = config->normal1;
+ _mapping[documentapi::Priority::PRI_NORMAL_2] = config->normal2;
+ _mapping[documentapi::Priority::PRI_NORMAL_3] = config->normal3;
+ _mapping[documentapi::Priority::PRI_NORMAL_4] = config->normal4;
+ _mapping[documentapi::Priority::PRI_NORMAL_5] = config->normal5;
+ _mapping[documentapi::Priority::PRI_NORMAL_6] = config->normal6;
+ _mapping[documentapi::Priority::PRI_LOW_1] = config->low1;
+ _mapping[documentapi::Priority::PRI_LOW_2] = config->low2;
+ _mapping[documentapi::Priority::PRI_LOW_3] = config->low3;
+ _mapping[documentapi::Priority::PRI_VERY_LOW] = config->veryLow;
+ _mapping[documentapi::Priority::PRI_LOWEST] = config->lowest;
+
+ vespalib::LockGuard guard(_mutex);
+ _reverseMapping.clear();
+ _reverseMapping[config->highest] = documentapi::Priority::PRI_HIGHEST;
+ _reverseMapping[config->veryHigh] = documentapi::Priority::PRI_VERY_HIGH;
+ _reverseMapping[config->high1] = documentapi::Priority::PRI_HIGH_1;
+ _reverseMapping[config->high2] = documentapi::Priority::PRI_HIGH_2;
+ _reverseMapping[config->high3] = documentapi::Priority::PRI_HIGH_3;
+ _reverseMapping[config->normal1] = documentapi::Priority::PRI_NORMAL_1;
+ _reverseMapping[config->normal2] = documentapi::Priority::PRI_NORMAL_2;
+ _reverseMapping[config->normal3] = documentapi::Priority::PRI_NORMAL_3;
+ _reverseMapping[config->normal4] = documentapi::Priority::PRI_NORMAL_4;
+ _reverseMapping[config->normal5] = documentapi::Priority::PRI_NORMAL_5;
+ _reverseMapping[config->normal6] = documentapi::Priority::PRI_NORMAL_6;
+ _reverseMapping[config->low1] = documentapi::Priority::PRI_LOW_1;
+ _reverseMapping[config->low2] = documentapi::Priority::PRI_LOW_2;
+ _reverseMapping[config->low3] = documentapi::Priority::PRI_LOW_3;
+ _reverseMapping[config->veryLow] = documentapi::Priority::PRI_VERY_LOW;
+ _reverseMapping[config->lowest] = documentapi::Priority::PRI_LOWEST;
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/priorityconverter.h b/storage/src/vespa/storage/storageserver/priorityconverter.h
new file mode 100644
index 00000000000..dbc51fc490a
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/priorityconverter.h
@@ -0,0 +1,43 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/storage/config/config-stor-prioritymapping.h>
+#include <vespa/config/config.h>
+#include <atomic>
+#include <array>
+
+namespace storage {
+
+class PriorityConverter
+ : public config::IFetcherCallback<
+ vespa::config::content::core::StorPrioritymappingConfig>
+{
+public:
+ typedef vespa::config::content::core::StorPrioritymappingConfig Config;
+
+ PriorityConverter(const config::ConfigUri& configUri);
+
+ /** Converts the given priority into a storage api priority number. */
+ uint8_t toStoragePriority(documentapi::Priority::Value) const;
+
+ /** Converts the given priority into a document api priority number. */
+ documentapi::Priority::Value toDocumentPriority(uint8_t) const;
+
+ void configure(std::unique_ptr<Config> config);
+
+private:
+ static_assert(documentapi::Priority::PRI_ENUM_SIZE == 16,
+ "Unexpected size of priority enumeration");
+ static_assert(documentapi::Priority::PRI_LOWEST == 15,
+ "Priority enum value out of bounds");
+ static constexpr size_t PRI_ENUM_SIZE = documentapi::Priority::PRI_ENUM_SIZE;
+
+ std::array<std::atomic<uint8_t>, PRI_ENUM_SIZE> _mapping;
+ std::map<uint8_t, documentapi::Priority::Value> _reverseMapping;
+ vespalib::Lock _mutex;
+
+ config::ConfigFetcher _configFetcher;
+};
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/prioritymapper.h b/storage/src/vespa/storage/storageserver/prioritymapper.h
new file mode 100644
index 00000000000..2a8b17d75af
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/prioritymapper.h
@@ -0,0 +1,43 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/documentapi/loadtypes/loadtype.h>
+#include <vespa/storage/config/config-stor-prioritymapping.h>
+
+namespace storage {
+
+class PriorityMapper
+{
+ std::vector<uint8_t> _priorities;
+
+public:
+ typedef vespa::config::content::core::internal::InternalStorPrioritymappingType Config;
+
+ PriorityMapper() : _priorities(16, 120) {}
+
+ void setConfig(const Config c) {
+ _priorities[documentapi::Priority::PRI_HIGHEST] = c.highest;
+ _priorities[documentapi::Priority::PRI_VERY_HIGH] = c.veryHigh;
+ _priorities[documentapi::Priority::PRI_HIGH_1] = c.high1;
+ _priorities[documentapi::Priority::PRI_HIGH_2] = c.high2;
+ _priorities[documentapi::Priority::PRI_HIGH_3] = c.high3;
+ _priorities[documentapi::Priority::PRI_NORMAL_1] = c.normal1;
+ _priorities[documentapi::Priority::PRI_NORMAL_2] = c.normal2;
+ _priorities[documentapi::Priority::PRI_NORMAL_3] = c.normal3;
+ _priorities[documentapi::Priority::PRI_NORMAL_4] = c.normal4;
+ _priorities[documentapi::Priority::PRI_NORMAL_5] = c.normal5;
+ _priorities[documentapi::Priority::PRI_NORMAL_6] = c.normal6;
+ _priorities[documentapi::Priority::PRI_LOW_1] = c.low1;
+ _priorities[documentapi::Priority::PRI_LOW_2] = c.low2;
+ _priorities[documentapi::Priority::PRI_LOW_3] = c.low3;
+ _priorities[documentapi::Priority::PRI_VERY_LOW] = c.veryLow;
+ _priorities[documentapi::Priority::PRI_LOWEST] = c.lowest;
+ }
+
+ uint8_t getPriority(const documentapi::LoadType& lt) const {
+ return _priorities[lt.getPriority()];
+ }
+};
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/rpcrequestwrapper.cpp b/storage/src/vespa/storage/storageserver/rpcrequestwrapper.cpp
new file mode 100644
index 00000000000..46d518b90c3
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/rpcrequestwrapper.cpp
@@ -0,0 +1,93 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+LOG_SETUP(".api.rpc.request");
+#include "rpcrequestwrapper.h"
+
+namespace storage {
+
+RPCRequestWrapper::RPCRequestWrapper(FRT_RPCRequest *req)
+ : _req(req)
+{
+}
+
+RPCRequestWrapper::~RPCRequestWrapper()
+{
+ if (_req != 0) {
+ _req->SetError(ERR_REQUEST_DELETED,
+ "Request deleted without having been replied to");
+ _req->Return();
+ }
+}
+
+const char *
+RPCRequestWrapper::getParam() const
+{
+ assert(_req != 0);
+ return _req->GetParams()->GetValue(0)._data._buf;
+}
+
+
+uint32_t
+RPCRequestWrapper::getParamLen() const
+{
+ assert(_req != 0);
+ return _req->GetParams()->GetValue(0)._data._len;
+}
+
+
+void
+RPCRequestWrapper::returnData(const char *pt, uint32_t len)
+{
+ assert(_req != 0);
+ _req->GetReturn()->AddData(pt, len);
+ _req->Return();
+ _req = 0;
+}
+
+
+void
+RPCRequestWrapper::returnError(uint32_t errorCode, const char *errorMessage)
+{
+ assert(_req != 0);
+ _req->SetError(errorCode, errorMessage);
+ _req->Return();
+ _req = 0;
+}
+
+void
+RPCRequestWrapper::addReturnString(const char *str, uint32_t len)
+{
+ assert(_req != 0);
+ if (len !=0) {
+ _req->GetReturn()->AddString(str, len);
+ } else {
+ _req->GetReturn()->AddString(str);
+ }
+}
+
+void
+RPCRequestWrapper::addReturnInt(uint32_t value)
+{
+ assert(_req != 0);
+ _req->GetReturn()->AddInt32(value);
+}
+
+void
+RPCRequestWrapper::returnRequest()
+{
+ assert(_req != 0);
+ _req->Return();
+ _req = 0;
+
+}
+
+void
+RPCRequestWrapper::discardBlobs()
+{
+ if (_req != 0) {
+ _req->DiscardBlobs();
+ }
+}
+
+} // namespace storage
diff --git a/storage/src/vespa/storage/storageserver/rpcrequestwrapper.h b/storage/src/vespa/storage/storageserver/rpcrequestwrapper.h
new file mode 100644
index 00000000000..a1e067ca974
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/rpcrequestwrapper.h
@@ -0,0 +1,70 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/fnet/frt/frt.h>
+
+namespace storage {
+
+/**
+ * This event wraps a request received from a remote rpc client.
+ **/
+class RPCRequestWrapper {
+public:
+ enum {
+ ERR_HANDLE_NOT_CONNECTED = 75000, // > 0xffff
+ ERR_HANDLE_GONE = 75001,
+ ERR_REQUEST_DELETED = 75002,
+ ERR_HANDLE_DISABLED = 75003,
+ ERR_NODE_SHUTTING_DOWN = 75004
+ };
+
+ RPCRequestWrapper(FRT_RPCRequest *req);
+ ~RPCRequestWrapper();
+
+ /**
+ * @return request parameter data
+ **/
+ const char *getParam() const;
+
+ /**
+ * @return request parameter length
+ **/
+ uint32_t getParamLen() const;
+
+ /**
+ * Return data for this request.
+ *
+ * @param pt return data
+ * @param len return data length
+ **/
+ void returnData(const char *pt, uint32_t len);
+
+ /**
+ * Return an error for this request
+ *
+ * @param errorCode numeric error code
+ * @param errorMessage human readable error message
+ **/
+ void returnError(uint32_t errorCode, const char *errorMessage);
+
+ const char *getMethodName() { return _req->GetMethodName(); }
+ void addReturnString(const char *str, uint32_t len=0);
+ void addReturnInt(uint32_t value);
+ void returnRequest();
+
+ /**
+ * Discard any large blobs from the underlying rpc request. This
+ * may be done after interpreting any parameters in order to save
+ * memory on the server.
+ **/
+ void discardBlobs();
+
+private:
+ RPCRequestWrapper(const RPCRequestWrapper &);
+ RPCRequestWrapper &operator=(const RPCRequestWrapper &);
+
+ FRT_RPCRequest *_req; // underlying RPC request
+};
+
+} // namespace storage
+
diff --git a/storage/src/vespa/storage/storageserver/servicelayernode.cpp b/storage/src/vespa/storage/storageserver/servicelayernode.cpp
new file mode 100644
index 00000000000..457001698ba
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/servicelayernode.cpp
@@ -0,0 +1,311 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/servicelayernode.h>
+
+#include <vespa/log/log.h>
+#include <vespa/persistence/spi/exceptions.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/bucketdb/bucketmanager.h>
+#include <vespa/storage/bucketdb/storagebucketdbinitializer.h>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+#include <vespa/storage/persistence/filestorage/modifiedbucketchecker.h>
+#include <vespa/storage/storageserver/bouncer.h>
+#include <vespa/storage/storageserver/bucketintegritychecker.h>
+#include <vespa/storage/bucketmover/bucketmover.h>
+#include <vespa/storage/storageserver/communicationmanager.h>
+#include <vespa/storage/storageserver/changedbucketownershiphandler.h>
+#include <vespa/storage/storageserver/mergethrottler.h>
+#include <vespa/storage/storageserver/opslogger.h>
+#include <vespa/storage/storageserver/statemanager.h>
+#include <vespa/storage/visiting/messagebusvisitormessagesession.h>
+#include <vespa/storage/visiting/visitormanager.h>
+#include <vespa/storage/common/hostreporter/hostinfo.h>
+
+LOG_SETUP(".node.servicelayer");
+
+namespace storage {
+
+ServiceLayerNode::ServiceLayerNode(
+ const config::ConfigUri & configUri,
+ ServiceLayerNodeContext& context,
+ ApplicationGenerationFetcher& generationFetcher,
+ spi::PersistenceProvider& persistenceProvider,
+ const VisitorFactory::Map& externalVisitors)
+ : StorageNode(configUri, context, generationFetcher,
+ std::unique_ptr<HostInfo>(new HostInfo)),
+ _context(context),
+ _persistenceProvider(persistenceProvider),
+ _partitions(0),
+ _externalVisitors(externalVisitors),
+ _fileStorManager(0),
+ _init_has_been_called(false),
+ _noUsablePartitionMode(false)
+{
+}
+
+void ServiceLayerNode::init()
+{
+ assert( ! _init_has_been_called);
+ _init_has_been_called = true;
+ spi::Result initResult(_persistenceProvider.initialize());
+ if (initResult.hasError()) {
+ LOG(error, "Failed to initialize persistence provider: %s",
+ initResult.toString().c_str());
+ throw spi::HandledException(
+ "Failed provider init: " + initResult.toString(), VESPA_STRLOC);
+ }
+
+ spi::PartitionStateListResult result(
+ _persistenceProvider.getPartitionStates());
+ if (result.hasError()) {
+ LOG(error, "Failed to get partition list from persistence provider: %s",
+ result.toString().c_str());
+ throw spi::HandledException("Failed to get partition list: "
+ + result.toString(), VESPA_STRLOC);
+ }
+ _partitions = result.getList();
+ if (_partitions.size() == 0) {
+ LOG(error, "No partitions in persistence provider. See documentation "
+ "for your persistence provider as to how to set up "
+ "partitions in it.");
+ throw spi::HandledException("No partitions in provider", VESPA_STRLOC);
+ }
+ try{
+ initialize();
+ } catch (spi::HandledException& e) {
+ requestShutdown("Failed to initialize: " + e.getMessage());
+ throw;
+ } catch (const vespalib::NetworkSetupFailureException & e) {
+ LOG(warning, "Network failure: '%s'", e.what());
+ throw;
+ } catch (const vespalib::Exception & e) {
+ LOG(error, "Caught exception %s during startup. Calling destruct "
+ "functions in hopes of dying gracefully.",
+ e.getMessage().c_str());
+ requestShutdown("Failed to initialize: " + e.getMessage());
+ throw;
+ }
+}
+
+ServiceLayerNode::~ServiceLayerNode()
+{
+ assert(_init_has_been_called);
+ shutdown();
+}
+
+void
+ServiceLayerNode::subscribeToConfigs()
+{
+ StorageNode::subscribeToConfigs();
+ _configFetcher.reset(new config::ConfigFetcher(_configUri.getContext()));
+ _configFetcher->subscribe<vespa::config::storage::StorDevicesConfig>(_configUri.getConfigId(), this);
+
+ vespalib::LockGuard configLockGuard(_configLock);
+ _deviceConfig = std::move(_newDevicesConfig);
+ // Verify and set disk count
+ if (_serverConfig->diskCount != 0
+ && _serverConfig->diskCount != _partitions.size())
+ {
+ std::ostringstream ost;
+ ost << "Storage is configured to have " << _serverConfig->diskCount
+ << " disks but persistence provider states it has "
+ << _partitions.size() << " disks.";
+ throw vespalib::IllegalStateException(ost.str(), VESPA_STRLOC);
+ }
+ _context.getComponentRegister().setDiskCount(_partitions.size());
+}
+
+void
+ServiceLayerNode::removeConfigSubscriptions()
+{
+ StorageNode::removeConfigSubscriptions();
+ _configFetcher.reset(0);
+}
+
+void
+ServiceLayerNode::initializeNodeSpecific()
+{
+ // Give node state to mount point initialization, such that we can
+ // get disk count and state of unavailable disks set in reported
+ // node state.
+ NodeStateUpdater::Lock::SP lock(
+ _component->getStateUpdater().grabStateChangeLock());
+ lib::NodeState ns(*_component->getStateUpdater().getReportedNodeState());
+ ns.setDiskCount(_partitions.size());
+
+ uint32_t usablePartitions = 0;
+ for (uint32_t i = 0; i < _partitions.size(); ++i) {
+ if (_partitions[i].getState() == spi::PartitionState::UP) {
+ ++usablePartitions;
+ } else {
+ lib::DiskState diskState(lib::State::DOWN,
+ _partitions[i].getReason());
+ ns.setDiskState(i, diskState);
+ }
+ }
+
+ if (usablePartitions == 0) {
+ _noUsablePartitionMode = true;
+ ns.setState(lib::State::DOWN);
+ ns.setDescription("All partitions are down");
+ }
+ ns.setCapacity(_serverConfig->nodeCapacity);
+ ns.setReliability(_serverConfig->nodeReliability);
+ for (uint16_t i=0; i<_serverConfig->diskCapacity.size(); ++i) {
+ if (i >= ns.getDiskCount()) {
+ LOG(warning, "Capacity configured for partition %" PRIu64 " but only "
+ "%u partitions found.",
+ _serverConfig->diskCapacity.size(), ns.getDiskCount());
+ continue;
+ }
+ lib::DiskState ds(ns.getDiskState(i));
+ ds.setCapacity(_serverConfig->diskCapacity[i]);
+ ns.setDiskState(i, ds);
+ }
+ LOG(debug, "Adjusting reported node state to include partition count and "
+ "states, capacity and reliability: %s",
+ ns.toString().c_str());
+ _component->getStateUpdater().setReportedNodeState(ns);
+}
+
+#define DIFFER(a) (!(oldC.a == newC.a))
+#define ASSIGN(a) { oldC.a = newC.a; updated = true; }
+#define DIFFERWARN(a, b) \
+ if (DIFFER(a)) { LOG(warning, "Live config failure: %s.", b); }
+
+void
+ServiceLayerNode::handleLiveConfigUpdate()
+{
+ if (_newServerConfig.get() != 0) {
+ bool updated = false;
+ vespa::config::content::core::StorServerConfigBuilder oldC(*_serverConfig);
+ vespa::config::content::core::StorServerConfig& newC(*_newServerConfig);
+ DIFFERWARN(diskCount, "Cannot alter partition count of node live");
+ {
+ updated = false;
+ NodeStateUpdater::Lock::SP lock(
+ _component->getStateUpdater().grabStateChangeLock());
+ lib::NodeState ns(
+ *_component->getStateUpdater().getReportedNodeState());
+ if (DIFFER(nodeCapacity)) {
+ LOG(info, "Live config update: Updating node capacity "
+ "from %f to %f.",
+ oldC.nodeCapacity, newC.nodeCapacity);
+ ASSIGN(nodeCapacity);
+ ns.setCapacity(newC.nodeCapacity);
+ }
+ if (DIFFER(diskCapacity)) {
+ for (uint32_t i=0;
+ i<newC.diskCapacity.size() && i<ns.getDiskCount(); ++i)
+ {
+ if (newC.diskCapacity[i] != oldC.diskCapacity[i]) {
+ lib::DiskState ds(ns.getDiskState(i));
+ ds.setCapacity(newC.diskCapacity[i]);
+ ns.setDiskState(i, ds);
+ LOG(info, "Live config update: Disk capacity of "
+ "disk %u changed from %f to %f.",
+ i, oldC.diskCapacity[i], newC.diskCapacity[i]);
+ }
+ }
+ ASSIGN(diskCapacity);
+ }
+ if (DIFFER(nodeReliability)) {
+ LOG(info, "Live config update: Node reliability changed "
+ "from %u to %u.",
+ oldC.nodeReliability, newC.nodeReliability);
+ ASSIGN(nodeReliability);
+ ns.setReliability(newC.nodeReliability);
+ }
+ if (updated) {
+ _serverConfig.reset(new vespa::config::content::core::StorServerConfig(oldC));
+ _component->getStateUpdater().setReportedNodeState(ns);
+ }
+ }
+ }
+ StorageNode::handleLiveConfigUpdate();
+}
+
+void
+ServiceLayerNode::configure(
+ std::unique_ptr<vespa::config::storage::StorDevicesConfig> config)
+{
+ // When we get config, we try to grab the config lock to ensure noone
+ // else is doing configuration work, and then we write the new config
+ // to a variable where we can find it later when processing config
+ // updates
+ {
+ vespalib::LockGuard configLockGuard(_configLock);
+ _newDevicesConfig.reset(config.release());
+ }
+ if (_distributionConfig.get() != 0) handleLiveConfigUpdate();
+}
+
+VisitorMessageSession::UP
+ServiceLayerNode::createSession(Visitor& visitor, VisitorThread& thread)
+{
+ MessageBusVisitorMessageSession::UP mbusSession(
+ new MessageBusVisitorMessageSession(visitor, thread));
+ mbus::SourceSessionParams srcParams;
+ srcParams.setThrottlePolicy(mbus::IThrottlePolicy::SP());
+ srcParams.setReplyHandler(*mbusSession);
+ mbusSession->setSourceSession(
+ _communicationManager->getMessageBus().getMessageBus()
+ .createSourceSession(srcParams));
+ return VisitorMessageSession::UP(std::move(mbusSession));
+}
+
+documentapi::Priority::Value
+ServiceLayerNode::toDocumentPriority(uint8_t storagePriority) const
+{
+ return _communicationManager->getPriorityConverter().
+ toDocumentPriority(storagePriority);
+}
+
+StorageLink::UP
+ServiceLayerNode::createChain()
+{
+ ServiceLayerComponentRegister& compReg(_context.getComponentRegister());
+ StorageLink::UP chain;
+
+ chain.reset(_communicationManager = new CommunicationManager(
+ compReg, _configUri));
+ chain->push_back(StorageLink::UP(new Bouncer(compReg, _configUri)));
+ if (_noUsablePartitionMode) {
+ /*
+ * No usable partitions. Use minimal chain. Still needs to be
+ * able to report state back to cluster controller.
+ */
+ chain->push_back(StorageLink::UP(releaseStateManager().release()));
+ return chain;
+ }
+ chain->push_back(StorageLink::UP(new OpsLogger(compReg, _configUri)));
+ chain->push_back(StorageLink::UP(new MergeThrottler(_configUri, compReg)));
+ chain->push_back(StorageLink::UP(new ChangedBucketOwnershipHandler(_configUri, compReg)));
+ chain->push_back(StorageLink::UP(new BucketIntegrityChecker(_configUri, compReg)));
+ chain->push_back(StorageLink::UP(
+ new bucketmover::BucketMover(_configUri, compReg)));
+ chain->push_back(StorageLink::UP(new StorageBucketDBInitializer(
+ _configUri, _partitions, getDoneInitializeHandler(), compReg)));
+ chain->push_back(StorageLink::UP(new BucketManager(
+ _configUri, _context.getComponentRegister())));
+ chain->push_back(StorageLink::UP(new VisitorManager(
+ _configUri, _context.getComponentRegister(),
+ *this, _externalVisitors)));
+ chain->push_back(StorageLink::UP(new ModifiedBucketChecker(
+ _context.getComponentRegister(), _persistenceProvider,
+ _configUri)));
+ chain->push_back(StorageLink::UP(_fileStorManager = new FileStorManager(
+ _configUri, _partitions, _persistenceProvider,
+ _context.getComponentRegister())));
+ chain->push_back(StorageLink::UP(releaseStateManager().release()));
+ return chain;
+}
+
+ResumeGuard
+ServiceLayerNode::pause()
+{
+ return _fileStorManager->getFileStorHandler().pause();
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/servicelayernode.h b/storage/src/vespa/storage/storageserver/servicelayernode.h
new file mode 100644
index 00000000000..9c634e22c23
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/servicelayernode.h
@@ -0,0 +1,76 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::ServiceLayerNode
+ * \ingroup storageserver
+ *
+ * \brief Class for setting up a service layer node.
+ */
+
+#pragma once
+
+#include <vespa/persistence/spi/persistenceprovider.h>
+#include <vespa/storage/bucketdb/minimumusedbitstracker.h>
+#include <vespa/storage/storageserver/applicationgenerationfetcher.h>
+#include <vespa/storage/storageserver/servicelayernodecontext.h>
+#include <vespa/storage/storageserver/storagenode.h>
+#include <vespa/config-stor-devices.h>
+#include <vespa/config/config.h>
+
+namespace storage {
+
+class FileStorManager;
+
+class ServiceLayerNode
+ : public StorageNode,
+ private VisitorMessageSessionFactory,
+ private config::IFetcherCallback<vespa::config::storage::StorDevicesConfig>
+
+{
+ ServiceLayerNodeContext& _context;
+ spi::PersistenceProvider& _persistenceProvider;
+ spi::PartitionStateList _partitions;
+ VisitorFactory::Map _externalVisitors;
+ MinimumUsedBitsTracker _minUsedBitsTracker;
+
+ // FIXME: Should probably use the fetcher in StorageNode
+ std::unique_ptr<config::ConfigFetcher> _configFetcher;
+ std::unique_ptr<vespa::config::storage::StorDevicesConfig> _deviceConfig;
+ std::unique_ptr<vespa::config::storage::StorDevicesConfig> _newDevicesConfig;
+ FileStorManager* _fileStorManager;
+ bool _init_has_been_called;
+ bool _noUsablePartitionMode;
+
+public:
+ typedef std::unique_ptr<ServiceLayerNode> UP;
+
+ ServiceLayerNode(const config::ConfigUri & configUri,
+ ServiceLayerNodeContext& context,
+ ApplicationGenerationFetcher& generationFetcher,
+ spi::PersistenceProvider& persistenceProvider,
+ const VisitorFactory::Map& externalVisitors);
+ ~ServiceLayerNode();
+ /**
+ * Init must be called exactly once after construction and before destruction.
+ */
+ void init();
+
+ virtual const lib::NodeType& getNodeType() const
+ { return lib::NodeType::STORAGE; }
+
+ virtual ResumeGuard pause();
+
+private:
+ virtual void subscribeToConfigs();
+ virtual void initializeNodeSpecific();
+ virtual void handleLiveConfigUpdate();
+ virtual void configure(std::unique_ptr<vespa::config::storage::StorDevicesConfig> config);
+ virtual VisitorMessageSession::UP createSession(Visitor&, VisitorThread&);
+ virtual documentapi::Priority::Value toDocumentPriority(
+ uint8_t storagePriority) const;
+
+ virtual StorageLink::UP createChain();
+ virtual void removeConfigSubscriptions();
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageserver/servicelayernodecontext.cpp b/storage/src/vespa/storage/storageserver/servicelayernodecontext.cpp
new file mode 100644
index 00000000000..afdab9b6bce
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/servicelayernodecontext.cpp
@@ -0,0 +1,16 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/servicelayernodecontext.h>
+
+namespace storage {
+
+ServiceLayerNodeContext::ServiceLayerNodeContext(
+ framework::Clock::UP clock)
+ : StorageNodeContext(StorageComponentRegisterImpl::UP(new ServiceLayerComponentRegisterImpl),
+ std::move(clock)),
+ _componentRegister(dynamic_cast<ComponentRegister&>(StorageNodeContext::getComponentRegister()))
+{
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/servicelayernodecontext.h b/storage/src/vespa/storage/storageserver/servicelayernodecontext.h
new file mode 100644
index 00000000000..2890e76a4a3
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/servicelayernodecontext.h
@@ -0,0 +1,47 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::ServiceLayerNodeContext
+ * @ingroup storageserver
+ *
+ * @brief Context needed by node, that can also be used by others
+ *
+ * This utility class sets up the default component register implementation.
+ * It also sets up the clock and the threadpool, such that the most basic
+ * features are available to the provider, before the service layer is set up.
+ *
+ * The service layer still provides the memory manager functionality though,
+ * so you cannot retrieve the memory manager before the service layer has
+ * started up. (Before getPartitionStates() have been called on provider)
+ */
+
+#pragma once
+
+#include <vespa/storage/frameworkimpl/component/servicelayercomponentregisterimpl.h>
+#include <vespa/storage/storageserver/storagenodecontext.h>
+
+namespace storage {
+
+struct ServiceLayerNodeContext : public StorageNodeContext {
+ // Typedefs to simplify the remainder of the interface
+ typedef ServiceLayerComponentRegisterImpl ComponentRegister;
+
+ /**
+ * You can provide your own clock implementation. Useful in testing where
+ * you want to fake the clock.
+ */
+ ServiceLayerNodeContext(
+ framework::Clock::UP clock = framework::Clock::UP(new RealClock));
+
+ /**
+ * Get the actual component register. Available as the actual type as the
+ * storage server need to set implementations, and the components need the
+ * actual component register interface.
+ */
+ ComponentRegister& getComponentRegister() { return _componentRegister; }
+
+private:
+ ComponentRegister& _componentRegister;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageserver/statemanager.cpp b/storage/src/vespa/storage/storageserver/statemanager.cpp
new file mode 100644
index 00000000000..c55afb03788
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/statemanager.cpp
@@ -0,0 +1,573 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/defaults.h>
+#include <vespa/storage/storageserver/statemanager.h>
+
+#include <vespa/log/log.h>
+#include <fstream>
+#include <vespa/metrics/jsonwriter.h>
+#include <vespa/metrics/metricmanager.h>
+#include <vespa/storage/config/config-stor-server.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/storage/storageserver/storagemetricsset.h>
+#include <vespa/storage/common/bucketoperationlogger.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/vespalib/io/fileutil.h>
+
+LOG_SETUP(".state.manager");
+
+namespace storage {
+
+StateManager::StateManager(StorageComponentRegister& compReg,
+ metrics::MetricManager& metricManager,
+ std::unique_ptr<HostInfo> hostInfo,
+ bool testMode)
+ : StorageLink("State manager"),
+ framework::HtmlStatusReporter("systemstate", "Node and system state"),
+ _noThreadTestMode(testMode),
+ _component(compReg, "statemanager"),
+ _metricManager(metricManager),
+ _stateLock(),
+ _listenerLock(),
+ _grabbedExternalLock(false),
+ _notifyingListeners(false),
+ _nodeState(new lib::NodeState(
+ _component.getNodeType(), lib::State::INITIALIZING)),
+ _nextNodeState(),
+ _systemState(new lib::ClusterState),
+ _nextSystemState(),
+ _stateListeners(),
+ _queuedStateRequests(),
+ _threadMonitor(),
+ _lastProgressUpdateCausingSend(0),
+ _progressLastInitStateSend(-1),
+ _systemStateHistory(),
+ _systemStateHistorySize(50),
+ _hostInfo(std::move(hostInfo))
+{
+ _nodeState->setMinUsedBits(58);
+ _nodeState->setStartTimestamp(
+ _component.getClock().getTimeInSeconds().getTime());
+ _component.registerStatusPage(*this);
+}
+
+StateManager::~StateManager()
+{
+ closeNextLink();
+ LOG(debug, "Deleting link %s.", toString().c_str());
+ if (_thread.get() != 0) {
+ LOG(debug, "onClose() not called before destructor");
+ _thread->interruptAndJoin(&_threadMonitor);
+ }
+}
+
+void
+StateManager::onOpen()
+{
+ framework::MilliSecTime maxProcessingTime(30 * 1000);
+ if (!_noThreadTestMode) {
+ _thread = _component.startThread(*this, maxProcessingTime);
+ }
+}
+
+void
+StateManager::onClose()
+{
+ if (_thread.get() != 0) {
+ _thread->interruptAndJoin(&_threadMonitor);
+ _thread.reset(0);
+ }
+ sendGetNodeStateReplies();
+}
+
+void
+StateManager::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "StateManager()";
+}
+
+#ifdef ENABLE_BUCKET_OPERATION_LOGGING
+namespace {
+
+vespalib::string
+escapeHtml(vespalib::stringref str)
+{
+ vespalib::asciistream ss;
+ for (size_t i = 0; i < str.size(); ++i) {
+ switch (str[i]) {
+ case '<':
+ ss << "&lt;";
+ break;
+ case '>':
+ ss << "&gt;";
+ break;
+ case '&':
+ ss << "&amp;";
+ break;
+ default:
+ ss << str[i];
+ }
+ }
+ return ss.str();
+}
+
+}
+#endif
+
+void
+StateManager::reportHtmlStatus(std::ostream& out,
+ const framework::HttpUrlPath& path) const
+{
+ (void) path;
+#ifdef ENABLE_BUCKET_OPERATION_LOGGING
+ if (path.hasAttribute("history")) {
+ std::istringstream iss(path.getAttribute("history"), std::istringstream::in);
+ uint64_t rawId;
+ iss >> std::hex >> rawId;
+ document::BucketId bid(rawId);
+ out << "<h3>History for " << bid << "</h3>\n";
+ vespalib::string history(
+ debug::BucketOperationLogger::getInstance().getHistory(bid));
+ out << "<pre>" << escapeHtml(history) << "</pre>\n";
+ return;
+ } else if (path.hasAttribute("search")) {
+ vespalib::string substr(path.getAttribute("search"));
+ out << debug::BucketOperationLogger::getInstance()
+ .searchBucketHistories(substr, "/systemstate?history=");
+ return;
+ }
+#endif
+
+ {
+ vespalib::LockGuard lock(_stateLock);
+ out << "<h1>Current system state</h1>\n"
+ << "<code>" << _systemState->toString(true) << "</code>\n"
+ << "<h1>Current node state</h1>\n"
+ << "<code>" << _systemState->getNodeState(lib::Node(
+ _component.getNodeType(), _component.getIndex())
+ ).toString(true)
+ << "</code>\n"
+ << "<h1>Reported node state</h1>\n"
+ << "<code>" << _nodeState->toString(true) << "</code>\n"
+ << "<h1>Pending state requests</h1>\n"
+ << _queuedStateRequests.size() << "\n"
+ << "<h1>System state history</h1>\n"
+ << "<table border=\"1\"><tr>"
+ << "<th>Received at time</th><th>State</th></tr>\n";
+ for (std::deque<TimeSysStatePair>::const_reverse_iterator it
+ = _systemStateHistory.rbegin();
+ it != _systemStateHistory.rend(); ++it)
+ {
+ out << "<tr><td>" << it->first << "</td><td>"
+ << *it->second << "</td></tr>\n";
+ }
+ out << "</table>\n";
+ }
+}
+
+lib::Node
+StateManager::thisNode() const
+{
+ return lib::Node(_component.getNodeType(), _component.getIndex());
+}
+
+lib::NodeState::CSP
+StateManager::getReportedNodeState() const
+{
+ vespalib::LockGuard lock(_stateLock);
+ return _nodeState;
+}
+
+lib::NodeState::CSP
+StateManager::getCurrentNodeState() const
+{
+ vespalib::LockGuard lock(_stateLock);
+ return lib::NodeState::SP(new lib::NodeState(
+ _systemState->getNodeState(thisNode())));
+}
+
+lib::ClusterState::CSP
+StateManager::getSystemState() const
+{
+ vespalib::LockGuard lock(_stateLock);
+ return _systemState;
+}
+
+void
+StateManager::addStateListener(StateListener& listener)
+{
+ vespalib::LockGuard lock(_listenerLock);
+ _stateListeners.push_back(&listener);
+}
+
+void
+StateManager::removeStateListener(StateListener& listener)
+{
+ vespalib::LockGuard lock(_listenerLock);
+ for (std::list<StateListener*>::iterator it = _stateListeners.begin();
+ it != _stateListeners.end();)
+ {
+ if (*it == &listener) {
+ it = _stateListeners.erase(it);
+ } else {
+ ++it;
+ }
+ }
+}
+
+struct StateManager::ExternalStateLock : public NodeStateUpdater::Lock {
+ StateManager& _manager;
+
+ ExternalStateLock(StateManager& manager) : _manager(manager) {}
+ ~ExternalStateLock() {
+ {
+ vespalib::MonitorGuard lock(_manager._stateLock);
+ _manager._grabbedExternalLock = false;
+ lock.broadcast();
+ }
+ _manager.notifyStateListeners();
+ }
+};
+
+NodeStateUpdater::Lock::SP
+StateManager::grabStateChangeLock()
+{
+ vespalib::MonitorGuard lock(_stateLock);
+ while (_grabbedExternalLock || _nextNodeState.get()) {
+ lock.wait();
+ }
+ _grabbedExternalLock = true;
+ return Lock::SP(new ExternalStateLock(*this));
+}
+
+void
+StateManager::setReportedNodeState(const lib::NodeState& state)
+{
+ vespalib::LockGuard lock(_stateLock);
+ if (!_grabbedExternalLock) {
+ LOG(error,
+ "Cannot set reported node state without first having "
+ "grabbed external lock");
+ assert(false);
+ }
+ if (_nodeState->getDiskCount() != 0 &&
+ state.getDiskCount() != _nodeState->getDiskCount())
+ {
+ std::ostringstream ost;
+ ost << "Illegal to alter disk count after initialization. Tried to "
+ << "alter disk count from " << _nodeState->getDiskCount()
+ << " to " << state.getDiskCount();
+ throw vespalib::IllegalArgumentException(ost.str(), VESPA_STRLOC);
+ }
+ LOG(debug, "Adjusting reported node state to %s -> %s",
+ _nodeState->toString().c_str(), state.toString().c_str());
+ _nextNodeState.reset(new lib::NodeState(state));
+}
+
+/**
+ * Allows reentrent calls, in case a listener calls setNodeState or similar.
+ * We solve this by detecting that we're already notifying listeners, and then
+ * doing it over and over again until noone alters the state in the callback.
+ */
+void
+StateManager::notifyStateListeners()
+{
+ using lib::State;
+ if (_notifyingListeners) return;
+ vespalib::LockGuard listenerLock(_listenerLock);
+ _notifyingListeners = true;
+ lib::NodeState::SP newState;
+ while (true) {
+ {
+ vespalib::MonitorGuard stateLock(_stateLock);
+ if (_nextNodeState.get() == 0 && _nextSystemState.get() == 0) {
+ _notifyingListeners = false;
+ stateLock.broadcast();
+ break; // No change
+ }
+ if (_nextNodeState.get() != 0) {
+ assert(!(_nodeState->getState() == State::UP
+ && _nextNodeState->getState() == State::INITIALIZING));
+
+ if (_nodeState->getState() == State::INITIALIZING
+ && _nextNodeState->getState() == State::INITIALIZING
+ && _component.getClock().getTimeInMillis()
+ - _lastProgressUpdateCausingSend
+ < framework::MilliSecTime(1000)
+ && _nextNodeState->getInitProgress() < 1
+ && _nextNodeState->getInitProgress()
+ - _progressLastInitStateSend < 0.01)
+ {
+ // For this special case, where we only have gotten a little
+ // initialization progress and we have reported recently,
+ // don't trigger sending get node state reply yet.
+ } else {
+ newState = _nextNodeState;
+ if (!_queuedStateRequests.empty()
+ && _nextNodeState->getState() == State::INITIALIZING)
+ {
+ _lastProgressUpdateCausingSend
+ = _component.getClock().getTimeInMillis();
+ _progressLastInitStateSend
+ = newState->getInitProgress();
+ } else {
+ _lastProgressUpdateCausingSend
+ = framework::MilliSecTime(0);
+ _progressLastInitStateSend = -1;
+ }
+ }
+ _nodeState = _nextNodeState;
+ _nextNodeState.reset();
+ }
+ if (_nextSystemState.get() != 0) {
+ enableNextClusterState();
+ }
+ stateLock.broadcast();
+ }
+ for (std::list<StateListener*>::iterator it = _stateListeners.begin();
+ it != _stateListeners.end(); ++it)
+ {
+ (**it).handleNewState();
+ // If one of them actually altered the state again, abort
+ // sending events, update states and send new one to all.
+ if (_nextNodeState.get() != 0 || _nextSystemState.get() != 0) break;
+ }
+ }
+ if (newState.get() != 0) sendGetNodeStateReplies();
+ _notifyingListeners = false;
+}
+
+void
+StateManager::enableNextClusterState()
+{
+ if (_systemStateHistory.size() >= _systemStateHistorySize) {
+ _systemStateHistory.pop_front();
+ }
+ // _systemState must be non-null due to being initially set to an empty,
+ // new cluster state upon construction and because it can only be
+ // overwritten by a non-null pending cluster state afterwards.
+ logNodeClusterStateTransition(*_systemState, *_nextSystemState);
+ _systemState = _nextSystemState;
+ _nextSystemState.reset();
+ _systemStateHistory.push_back(TimeSysStatePair(
+ _component.getClock().getTimeInMillis(), _systemState));
+}
+
+void
+StateManager::logNodeClusterStateTransition(
+ const lib::ClusterState& currentState,
+ const lib::ClusterState& newState) const
+{
+ lib::Node self(thisNode());
+ const lib::State& before(currentState.getNodeState(self).getState());
+ const lib::State& after(newState.getNodeState(self).getState());
+ if (before != after) {
+ LOG(info, "Transitioning from state '%s' to '%s' "
+ "(cluster state version %u)",
+ before.getName().c_str(),
+ after.getName().c_str(),
+ newState.getVersion());
+ }
+}
+
+bool
+StateManager::onGetNodeState(const api::GetNodeStateCommand::SP& cmd)
+{
+ bool sentReply = false;
+ if (cmd->getSourceIndex() != 0xffff) {
+ sentReply = sendGetNodeStateReplies(framework::MilliSecTime(0),
+ cmd->getSourceIndex());
+ }
+ std::shared_ptr<api::GetNodeStateReply> reply;
+ {
+ vespalib::LockGuard lock(_stateLock);
+ if (cmd->getExpectedState() != 0
+ && (*cmd->getExpectedState() == *_nodeState || sentReply))
+ {
+ LOG(debug, "Received get node state request with timeout of "
+ "%" PRIu32 " milliseconds. Scheduling to be answered in "
+ "%" PRIu32 " milliseconds unless a node state change "
+ "happens before that time.",
+ cmd->getTimeout(), cmd->getTimeout() * 800 / 1000);
+ TimeStatePair pair(
+ _component.getClock().getTimeInMillis()
+ + framework::MilliSecTime(cmd->getTimeout() * 800 / 1000),
+ cmd);
+ _queuedStateRequests.push_back(pair);
+ } else {
+ LOG(debug, "Answered get node state request right away since it "
+ "thought we were in nodestate %s, while our actual "
+ "node state is currently %s and we didn't just reply to "
+ "existing request.",
+ cmd->getExpectedState() == 0 ? "unknown"
+ : cmd->getExpectedState()->toString().c_str(),
+ _nodeState->toString().c_str());
+ reply.reset(new api::GetNodeStateReply(*cmd, *_nodeState));
+ lock.unlock();
+ std::string nodeInfo(getNodeInfo());
+ reply->setNodeInfo(nodeInfo);
+ }
+ }
+ if (reply.get()) {
+ sendUp(reply);
+ }
+ return true;
+}
+
+void
+StateManager::setClusterState(const lib::ClusterState& c)
+{
+ {
+ vespalib::LockGuard lock(_stateLock);
+ _nextSystemState.reset(new lib::ClusterState(c));
+ }
+ notifyStateListeners();
+}
+
+bool
+StateManager::onSetSystemState(
+ const std::shared_ptr<api::SetSystemStateCommand>& cmd)
+{
+ setClusterState(cmd->getSystemState());
+ std::shared_ptr<api::SetSystemStateReply> reply(
+ new api::SetSystemStateReply(*cmd));
+ sendUp(reply);
+ return true;
+}
+
+void
+StateManager::run(framework::ThreadHandle& thread)
+{
+ while (true) {
+ thread.registerTick();
+ vespalib::MonitorGuard guard(_threadMonitor);
+ // Take lock before doing stuff, to be sure we don't wait after
+ // destructor have grabbed lock to stop() us.
+ if (thread.interrupted()) break;
+ tick();
+ guard.wait(1000);
+ }
+
+}
+
+void
+StateManager::tick() {
+ framework::MilliSecTime time(_component.getClock().getTimeInMillis());
+ sendGetNodeStateReplies(time);
+}
+
+bool
+StateManager::sendGetNodeStateReplies(framework::MilliSecTime olderThanTime,
+ uint16_t node)
+{
+ std::list<std::shared_ptr<api::GetNodeStateReply> > replies;
+ {
+ vespalib::MonitorGuard guard(_stateLock);
+ for (std::list<TimeStatePair>::iterator it
+ = _queuedStateRequests.begin();
+ it != _queuedStateRequests.end();)
+ {
+ if (node != 0xffff && node != it->second->getSourceIndex()) {
+ ++it;
+ } else if (!olderThanTime.isSet() || it->first < olderThanTime) {
+ LOG(debug, "Sending reply to msg with id %" PRIu64,
+ it->second->getMsgId());
+
+ std::shared_ptr<api::GetNodeStateReply> reply(
+ new api::GetNodeStateReply(*it->second, *_nodeState));
+ replies.push_back(reply);
+ std::list<TimeStatePair>::iterator eraseIt = it++;
+ _queuedStateRequests.erase(eraseIt);
+ } else {
+ ++it;
+ }
+ }
+ if (replies.empty()) return false;
+ }
+ std::string nodeInfo(getNodeInfo());
+ for (std::list<std::shared_ptr<api::GetNodeStateReply> >::iterator it
+ = replies.begin(); it != replies.end(); ++it)
+ {
+ (**it).setNodeInfo(nodeInfo);
+ sendUp(*it);
+ }
+ return true;
+}
+
+namespace {
+ std::string getHostInfoFilename(bool advanceCount) {
+ static uint32_t fileCounter = 0;
+ static pid_t pid = getpid();
+ if (advanceCount) ++fileCounter;
+ uint32_t fileIndex = fileCounter % 8;
+ std::ostringstream fileName;
+ fileName << vespa::Defaults::vespaHome()
+ << "tmp/hostinfo." << pid << "." << fileIndex
+ << ".report";
+ return fileName.str();
+ }
+}
+
+std::string
+StateManager::getNodeInfo() const
+{
+ // Generate report from last to info
+ vespalib::asciistream json;
+ vespalib::JsonStream stream(json, true);
+ stream << Object();
+ { // Print metrics
+ stream << "metrics";
+ try {
+ metrics::MetricLockGuard lock(_metricManager.getMetricLock());
+ std::vector<uint32_t> periods(
+ _metricManager.getSnapshotPeriods(lock));
+ if (periods.size() > 0) {
+ uint32_t period = periods[0];
+ const metrics::MetricSnapshot& snapshot(
+ _metricManager.getMetricSnapshot(lock, period));
+ metrics::JsonWriter metricJsonWriter(stream);
+ _metricManager.visit(lock, snapshot, metricJsonWriter, "fleetcontroller");
+ } else {
+ stream << Object() << "error" << "no snapshot periods" << End();
+ }
+ } catch (vespalib::Exception& e) {
+ stream << Object() << "error" << e.getMessage() << End();
+ }
+ }
+
+ // Report cluster version. It would have been tricky to encapsulate this in
+ // a HostReporter, because:
+ // - That HostReporter would somehow need to get hold of the version
+ // from the cluster state from this StateManager.
+ // - the public getSystemState() need (and should) grab a lock on
+ // _systemLock.
+ // - getNodeInfo() (this function) always acquires the same lock.
+ vespalib::MonitorGuard guard(_stateLock);
+ stream << "cluster-state-version" << _systemState->getVersion();
+
+ _hostInfo->printReport(stream);
+ stream << End();
+ stream.finalize();
+ // Add deadlock detector data.
+ //ost << "Deadlock detector data from "
+ // << _component.getClock().getTimeInSeconds().toString() << "\n\n";
+ //framework::HttpUrlPath path("");
+ //_storageServer.getDeadLockDetector().getStatus(ost, path);
+ // Dump report to new report file.
+ std::string oldFile(getHostInfoFilename(false));
+ std::string newFile(getHostInfoFilename(true));
+ std::ofstream of(newFile.c_str());
+ of << json.str();
+ of.close();
+ // If dumping went ok, delete old report file
+ vespalib::unlink(oldFile);
+ // Return report
+ return json.str();
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/statemanager.h b/storage/src/vespa/storage/storageserver/statemanager.h
new file mode 100644
index 00000000000..d1838e851d1
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/statemanager.h
@@ -0,0 +1,146 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::StateManager
+ * @ingroup storageserver
+ *
+ * @brief Keeps and updates node and system states.
+ *
+ * This component implements the NodeStateUpdater interface to handle states
+ * for all components. See that interface for documentation.
+ *
+ * In addition, this manager is a storage link such that it can handle the
+ * various commands for setting and retrieving states.
+ */
+#pragma once
+
+#include <map>
+#include <atomic>
+#include <vespa/storage/common/hostreporter/hostinfo.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storageapi/message/state.h>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/vespalib/util/sync.h>
+#include <vespa/vespalib/objects/floatingpointtype.h>
+
+namespace metrics {
+ class MetricManager;
+}
+
+namespace storage {
+
+class StateManager : public NodeStateUpdater,
+ public StorageLink,
+ public framework::HtmlStatusReporter,
+ private framework::Runnable,
+ private vespalib::JsonStreamTypes
+{
+ bool _noThreadTestMode;
+ StorageComponent _component;
+ metrics::MetricManager& _metricManager;
+ vespalib::Monitor _stateLock;
+ vespalib::Lock _listenerLock;
+ bool _grabbedExternalLock;
+ std::atomic<bool> _notifyingListeners;
+ std::shared_ptr<lib::NodeState> _nodeState;
+ std::shared_ptr<lib::NodeState> _nextNodeState;
+ std::shared_ptr<lib::ClusterState> _systemState;
+ std::shared_ptr<lib::ClusterState> _nextSystemState;
+ std::list<StateListener*> _stateListeners;
+ typedef std::pair<framework::MilliSecTime,
+ api::GetNodeStateCommand::SP> TimeStatePair;
+ std::list<TimeStatePair> _queuedStateRequests;
+ mutable vespalib::Monitor _threadMonitor;
+ framework::MilliSecTime _lastProgressUpdateCausingSend;
+ vespalib::Double _progressLastInitStateSend;
+ typedef std::pair<framework::MilliSecTime,
+ lib::ClusterState::SP> TimeSysStatePair;
+ std::deque<TimeSysStatePair> _systemStateHistory;
+ uint32_t _systemStateHistorySize;
+ std::unique_ptr<HostInfo> _hostInfo;
+ framework::Thread::UP _thread;
+
+public:
+ explicit StateManager(StorageComponentRegister&, metrics::MetricManager&,
+ std::unique_ptr<HostInfo>, bool testMode = false);
+ ~StateManager();
+
+ void onOpen();
+ void onClose();
+
+ void tick();
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ /** Implementation of HtmlStatusReporter */
+ virtual void reportHtmlStatus(std::ostream&,
+ const framework::HttpUrlPath&) const;
+
+ virtual lib::NodeState::CSP getReportedNodeState() const;
+ virtual lib::NodeState::CSP getCurrentNodeState() const;
+ virtual lib::ClusterState::CSP getSystemState() const;
+
+ virtual void addStateListener(StateListener&);
+ virtual void removeStateListener(StateListener&);
+
+ virtual Lock::SP grabStateChangeLock();
+ virtual void setReportedNodeState(const lib::NodeState& state);
+
+ void setClusterState(const lib::ClusterState& c);
+
+ HostInfo& getHostInfo() { return *_hostInfo; }
+
+private:
+ class ExternalStateLock;
+ friend class ExternalStateLock;
+ friend class StateManagerTest;
+
+ void notifyStateListeners();
+ bool sendGetNodeStateReplies(
+ framework::MilliSecTime olderThanTime = framework::MilliSecTime(0),
+ uint16_t index = 0xffff);
+
+ lib::Node thisNode() const;
+
+ /**
+ * Overwrite the current cluster state with the one that is currently
+ * pending.
+ *
+ * Appends the pending cluster state to a circular buffer of historic
+ * states.
+ *
+ * Preconditions:
+ * - _stateLock is held
+ * - _systemState.get() != nullptr
+ * - _nextSystemState.get() != nullptr
+ * Postconditions:
+ * - _systemState = old(_nextSystemState)
+ * - _nextSystemState.get() == nullptr
+ */
+ void enableNextClusterState();
+
+ /**
+ * Log this node's state transition as given by the cluster state iff the
+ * state differs between currentState and newState.
+ */
+ void logNodeClusterStateTransition(
+ const lib::ClusterState& currentState,
+ const lib::ClusterState& newState) const;
+
+ bool onGetNodeState(const std::shared_ptr<api::GetNodeStateCommand>&);
+ bool onSetSystemState(const std::shared_ptr<api::SetSystemStateCommand>&);
+
+ /**
+ * _stateLock MUST NOT be held while calling.
+ */
+ std::string getNodeInfo() const;
+
+ virtual void run(framework::ThreadHandle&);
+
+};
+
+} // storage
+
+
diff --git a/storage/src/vespa/storage/storageserver/statereporter.cpp b/storage/src/vespa/storage/storageserver/statereporter.cpp
new file mode 100644
index 00000000000..a5ee956953f
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/statereporter.cpp
@@ -0,0 +1,119 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/statereporter.h>
+
+#include <vespa/log/log.h>
+#include <vespa/metrics/jsonwriter.h>
+#include <vespa/storage/common/nodestateupdater.h>
+#include <vespa/vdslib/state/nodestate.h>
+
+LOG_SETUP(".status.statereporter");
+
+namespace storage {
+
+StateReporter::StateReporter(
+ StorageComponentRegister& compReg,
+ metrics::MetricManager& manager,
+ ApplicationGenerationFetcher& generationFetcher,
+ const std::string& name)
+ : framework::StatusReporter("state", "State reporter"),
+ _manager(manager),
+ _metricsAdapter(manager),
+ _stateApi(*this, *this, *this),
+ _component(compReg, "statereporter"),
+ _generationFetcher(generationFetcher),
+ _name(name)
+{
+ LOG(debug, "Started state reporter");
+ _component.registerStatusPage(*this);
+}
+
+StateReporter::~StateReporter()
+{
+}
+
+vespalib::string
+StateReporter::getReportContentType(
+ const framework::HttpUrlPath& /*path*/) const
+{
+ return "application/json";
+}
+
+namespace {
+
+std::map<vespalib::string, vespalib::string>
+getParams(const framework::HttpUrlPath &path)
+{
+ std::map<vespalib::string, vespalib::string> params = path.getAttributes();
+ if (params.find("consumer") == params.end()) {
+ params.insert(std::make_pair("consumer", "statereporter"));
+ }
+ return params;
+}
+
+}
+
+bool
+StateReporter::reportStatus(std::ostream& out,
+ const framework::HttpUrlPath& path) const
+{
+ vespalib::string status = _stateApi.get(path.getServerSpec(), path.getPath(), getParams(path));
+ if (status.empty()) {
+ return false;
+ }
+ out << status;
+ return true;
+}
+
+vespalib::string
+StateReporter::getMetrics(const vespalib::string &consumer)
+{
+ metrics::MetricLockGuard guard(_manager.getMetricLock());
+ std::vector<uint32_t> periods = _manager.getSnapshotPeriods(guard);
+ if (periods.empty()) {
+ return ""; // no configuration yet
+ }
+ uint32_t interval = periods[0];
+
+ // To get unset metrics, we have to copy active metrics, clear them
+ // and then assign the snapshot
+ metrics::MetricSnapshot snapshot(
+ _manager.getMetricSnapshot(guard, interval).getName(), interval,
+ _manager.getActiveMetrics(guard).getMetrics(), true);
+
+ snapshot.reset(0);
+ _manager.getMetricSnapshot(guard, interval).addToSnapshot(
+ snapshot, _component.getClock().getTimeInSeconds().getTime());
+
+ vespalib::asciistream json;
+ vespalib::JsonStream stream(json);
+ metrics::JsonWriter metricJsonWriter(stream);
+ _manager.visit(guard, snapshot, metricJsonWriter, consumer);
+ stream.finalize();
+ return json.str();
+}
+
+vespalib::string
+StateReporter::getTotalMetrics(const vespalib::string &consumer)
+{
+ return _metricsAdapter.getTotalMetrics(consumer);
+}
+
+vespalib::HealthProducer::Health
+StateReporter::getHealth() const
+{
+ lib::NodeState cns(*_component.getStateUpdater().getCurrentNodeState());
+ bool up = cns.getState().oneOf("u");
+ std::string message = up ? "" : "Node state: " + cns.toString(true);
+ return vespalib::HealthProducer::Health(up, message);
+}
+
+void
+StateReporter::getComponentConfig(Consumer &consumer)
+{
+ consumer.add(ComponentConfigProducer::Config(_generationFetcher.getComponentName(),
+ _generationFetcher.getGeneration()));
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/statereporter.h b/storage/src/vespa/storage/storageserver/statereporter.h
new file mode 100644
index 00000000000..dfef97e85d0
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/statereporter.h
@@ -0,0 +1,65 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+/**
+ * @class storage::StateReporter
+ * @ingroup storageserver
+ *
+ * @brief Writes config generation or health status and metrics
+ * as json to status page.
+ */
+
+#pragma once
+
+#include <vespa/metrics/metrics.h>
+#include <vespa/metrics/state_api_adapter.h>
+#include <vespa/storage/common/storagecomponent.h>
+#include <vespa/storage/storageserver/applicationgenerationfetcher.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/vespalib/net/metrics_producer.h>
+#include <vespa/vespalib/net/state_api.h>
+
+namespace vespalib {
+ class StringTokenizer;
+}
+
+namespace storage {
+
+class StateReporter : public framework::StatusReporter,
+ public vespalib::MetricsProducer,
+ public vespalib::HealthProducer,
+ public vespalib::ComponentConfigProducer
+{
+public:
+ StateReporter(
+ StorageComponentRegister&,
+ metrics::MetricManager&,
+ ApplicationGenerationFetcher& generationFetcher,
+ const std::string& name = "status");
+ ~StateReporter();
+
+ vespalib::string getReportContentType(
+ const framework::HttpUrlPath&) const;
+ bool reportStatus(std::ostream& out,
+ const framework::HttpUrlPath& path) const;
+
+private:
+ metrics::MetricManager &_manager;
+ metrics::StateApiAdapter _metricsAdapter;
+ vespalib::StateApi _stateApi;
+ StorageComponent _component;
+ ApplicationGenerationFetcher& _generationFetcher;
+ std::string _name;
+
+ // Implements vespalib::MetricsProducer
+ virtual vespalib::string getMetrics(const vespalib::string &consumer) override;
+ virtual vespalib::string getTotalMetrics(const vespalib::string &consumer) override;
+
+ // Implements vespalib::HealthProducer
+ virtual Health getHealth() const override;
+
+ // Implements vespalib::ComponentConfigProducer
+ virtual void getComponentConfig(Consumer &consumer) override;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageserver/storagemetricsset.h b/storage/src/vespa/storage/storageserver/storagemetricsset.h
new file mode 100644
index 00000000000..18be3e21ada
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/storagemetricsset.h
@@ -0,0 +1,116 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/metrics/metrics.h>
+#include <vespa/document/fieldvalue/serializablearray.h>
+
+namespace storage {
+
+class MessageMemoryUseMetricSet : public metrics::MetricSet
+{
+public:
+ metrics::LongValueMetric total;
+ metrics::LongValueMetric lowpri;
+ metrics::LongValueMetric normalpri;
+ metrics::LongValueMetric highpri;
+ metrics::LongValueMetric veryhighpri;
+
+ MessageMemoryUseMetricSet(metrics::MetricSet* owner)
+ : metrics::MetricSet("message_memory_use", "memory",
+ "Message use from storage messages", owner),
+ total("total", "memory",
+ "Message use from storage messages", this),
+ lowpri("lowpri", "memory",
+ "Message use from low priority storage messages", this),
+ normalpri("normalpri", "memory",
+ "Message use from normal priority storage messages", this),
+ highpri("highpri", "memory",
+ "Message use from high priority storage messages", this),
+ veryhighpri("veryhighpri", "memory",
+ "Message use from very high priority storage messages", this)
+ {
+ }
+
+};
+
+struct DocumentSerializationMetricSet : public metrics::MetricSet
+{
+ metrics::LongCountMetric usedCachedSerializationCount;
+ metrics::LongCountMetric compressedDocumentCount;
+ metrics::LongCountMetric compressionDidntHelpCount;
+ metrics::LongCountMetric uncompressableCount;
+ metrics::LongCountMetric serializedUncompressed;
+ metrics::LongCountMetric inputWronglySerialized;
+
+ DocumentSerializationMetricSet(metrics::MetricSet* owner)
+ : metrics::MetricSet("document_serialization", "docserialization",
+ "Counts of document serialization of various types", owner),
+ usedCachedSerializationCount(
+ "cached_serialization_count", "docserialization",
+ "Number of times we didn't need to serialize the document as "
+ "we already had serialized version cached", this),
+ compressedDocumentCount(
+ "compressed_serialization_count", "docserialization",
+ "Number of times we compressed document when serializing",
+ this),
+ compressionDidntHelpCount(
+ "compressed_didnthelp_count", "docserialization",
+ "Number of times we compressed document when serializing, but "
+ "the compressed version was bigger, so it was dumped", this),
+ uncompressableCount(
+ "uncompressable_serialization_count", "docserialization",
+ "Number of times we didn't attempt compression as document "
+ "had already been tagged uncompressable", this),
+ serializedUncompressed(
+ "uncompressed_serialization_count", "docserialization",
+ "Number of times we serialized a document uncompressed", this),
+ inputWronglySerialized(
+ "input_wrongly_serialized_count", "docserialization",
+ "Number of times we reserialized a document because the "
+ "compression it had in cache did not match what was configured",
+ this)
+ {
+ }
+
+};
+
+struct StorageMetricSet : public metrics::MetricSet
+{
+ metrics::LongValueMetric memoryUse;
+ MessageMemoryUseMetricSet memoryUse_messages;
+ metrics::LongValueMetric memoryUse_visiting;
+ DocumentSerializationMetricSet documentSerialization;
+
+ StorageMetricSet()
+ : metrics::MetricSet("server", "memory",
+ "Metrics for VDS applications"),
+ memoryUse("memoryusage", "memory", "", this),
+ memoryUse_messages(this),
+ memoryUse_visiting("memoryusage_visiting", "memory",
+ "Message use from visiting", this),
+ documentSerialization(this)
+ {
+ }
+
+ void updateMetrics() {
+ document::SerializableArray::Statistics stats(
+ document::SerializableArray::getStatistics());
+
+ documentSerialization.usedCachedSerializationCount.set(
+ stats._usedCachedSerializationCount);
+ documentSerialization.compressedDocumentCount.set(
+ stats._compressedDocumentCount);
+ documentSerialization.compressionDidntHelpCount.set(
+ stats._compressionDidntHelpCount);
+ documentSerialization.uncompressableCount.set(
+ stats._uncompressableCount);
+ documentSerialization.serializedUncompressed.set(
+ stats._serializedUncompressed);
+ documentSerialization.inputWronglySerialized.set(
+ stats._inputWronglySerialized);
+ }
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageserver/storagenode.cpp b/storage/src/vespa/storage/storageserver/storagenode.cpp
new file mode 100644
index 00000000000..8690e95c647
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/storagenode.cpp
@@ -0,0 +1,626 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/storagenode.h>
+
+#include <vespa/storage/bucketdb/bucketmanager.h>
+#include <vespa/storage/bucketdb/storagebucketdbinitializer.h>
+#include <vespa/storage/bucketmover/bucketmover.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storageframework/defaultimplementation/memory/prioritymemorylogic.h>
+#include <vespa/storage/common/statusmetricconsumer.h>
+#include <vespa/storage/common/hostreporter/hostinfo.h>
+#include <vespa/storage/distributor/bucketdbupdater.h>
+#include <vespa/storage/distributor/distributor.h>
+#include <vespa/storage/distributor/pendingmessagetracker.h>
+#include <vespa/storage/persistence/filestorage/filestormanager.h>
+#include <vespa/storage/storageserver/bouncer.h>
+#include <vespa/storage/storageserver/bucketintegritychecker.h>
+#include <vespa/storage/storageserver/communicationmanager.h>
+#include <vespa/storage/storageserver/mergethrottler.h>
+#include <vespa/storage/storageserver/opslogger.h>
+#include <vespa/storage/storageserver/statemanager.h>
+#include <vespa/storage/storageserver/statereporter.h>
+#include <vespa/storage/storageserver/storagemetricsset.h>
+#include <vespa/storage/storageutil/functor.h>
+#include <vespa/storage/storageutil/log.h>
+#include <vespa/storage/visiting/visitormanager.h>
+#include <vespa/storage/visiting/messagebusvisitormessagesession.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/vespalib/io/fileutil.h>
+#include <fstream>
+#include <sstream>
+
+LOG_SETUP(".node.server");
+
+namespace storage {
+
+namespace {
+
+ using vespalib::getLastErrorString;
+
+ void writePidFile(const vespalib::string& pidfile)
+ {
+ int rv = -1;
+ vespalib::string mypid = vespalib::make_string("%d\n", getpid());
+ size_t lastSlash = pidfile.rfind('/');
+ if (lastSlash != vespalib::string::npos) {
+ vespalib::mkdir(pidfile.substr(0, lastSlash));
+ }
+ int fd = open(pidfile.c_str(), O_WRONLY | O_CREAT | O_TRUNC, 0644);
+ if (fd != -1) {
+ rv = write(fd, mypid.c_str(), mypid.size());
+ close(fd);
+ }
+ if (rv < 1) {
+ LOG(warning, "Failed to write pidfile '%s': %s",
+ pidfile.c_str(), getLastErrorString().c_str());
+ }
+ }
+
+ void removePidFile(const vespalib::string& pidfile)
+ {
+ if (unlink(pidfile.c_str()) != 0) {
+ LOG(warning, "Failed to delete pidfile '%s': %s",
+ pidfile.c_str(), getLastErrorString().c_str());
+ }
+ }
+
+
+bool
+allDisksDown(const lib::NodeState &nodeState)
+{
+ for (uint32_t i = 0; i < nodeState.getDiskCount(); ++i) {
+ if (nodeState.getDiskState(i).getState() != lib::State::DOWN)
+ return false;
+ }
+ return true;
+}
+
+
+} // End of anonymous namespace
+
+StorageNode::StorageNode(
+ const config::ConfigUri & configUri,
+ StorageNodeContext& context,
+ ApplicationGenerationFetcher& generationFetcher,
+ std::unique_ptr<HostInfo> hostInfo,
+ RunMode mode)
+ : _singleThreadedDebugMode(mode == SINGLE_THREADED_TEST_MODE),
+ _hostInfo(std::move(hostInfo)),
+ _context(context),
+ _generationFetcher(generationFetcher),
+ _attemptedStopped(false),
+ _configUri(configUri),
+ _communicationManager(0)
+{
+}
+
+void
+StorageNode::subscribeToConfigs()
+{
+ _configFetcher.reset(new config::ConfigFetcher(_configUri.getContext()));
+ _configFetcher->subscribe<vespa::config::content::StorDistributionConfig>(_configUri.getConfigId(), this);
+ _configFetcher->subscribe<vespa::config::content::UpgradingConfig>(_configUri.getConfigId(), this);
+ _configFetcher->subscribe<vespa::config::content::core::StorServerConfig>(_configUri.getConfigId(), this);
+ _configFetcher->subscribe<vespa::config::content::core::StorPrioritymappingConfig>(_configUri.getConfigId(), this);
+ _configFetcher->start();
+
+ vespalib::LockGuard configLockGuard(_configLock);
+ _serverConfig = std::move(_newServerConfig);
+ _clusterConfig = std::move(_newClusterConfig);
+ _distributionConfig = std::move(_newDistributionConfig);
+ _priorityConfig = std::move(_newPriorityConfig);
+}
+
+
+void
+StorageNode::initialize()
+{
+ // Fetch configs needed first. These functions will just grab the config
+ // and store them away, while having the config lock.
+ subscribeToConfigs();
+
+ _context.getMemoryManager().setMaximumMemoryUsage(
+ _serverConfig->memorytouse);
+ _context.getComponentRegister().registerShutdownListener(*this);
+ updateUpgradeFlag(*_clusterConfig);
+
+ // First update some basics that doesn't depend on anything else to be
+ // available
+ _rootFolder = _serverConfig->rootFolder;
+
+ _context.getComponentRegister().setNodeInfo(
+ _serverConfig->clusterName, getNodeType(),
+ _serverConfig->nodeIndex);
+ _context.getComponentRegister().setLoadTypes(
+ documentapi::LoadTypeSet::SP(
+ new documentapi::LoadTypeSet(_configUri)));
+ _context.getComponentRegister().setBucketIdFactory(
+ document::BucketIdFactory());
+ _context.getComponentRegister().setDistribution(
+ lib::Distribution::SP(new lib::Distribution(*_distributionConfig)));
+ _context.getComponentRegister().setPriorityConfig(*_priorityConfig);
+
+ _metrics.reset(new StorageMetricSet);
+ _component.reset(new StorageComponent(
+ _context.getComponentRegister(), "storagenode"));
+ _component->registerMetric(*_metrics);
+ if (!_context.getComponentRegister().hasMetricManager()) {
+ _metricManager.reset(new metrics::MetricManager);
+ _context.getComponentRegister().setMetricManager(*_metricManager);
+ }
+ _component->registerMetricUpdateHook(*this, framework::SecondTime(300));
+
+ // Initializing state manager early, as others use it init time to
+ // update node state according to disk count and min used bits etc.
+ // Needs node type to be set right away. Needs thread pool, index and
+ // dead lock detector too, but not before open()
+ _stateManager.reset(new StateManager(
+ _context.getComponentRegister(),
+ _context.getComponentRegister().getMetricManager(),
+ std::move(_hostInfo),
+ _singleThreadedDebugMode));
+ _context.getComponentRegister().setNodeStateUpdater(*_stateManager);
+
+ // Create VDS root folder, in case it doesn't already exist.
+ // Maybe better to rather fail if it doesn't exist, but tests
+ // might break if we do that. Might alter later.
+ vespalib::mkdir(_rootFolder);
+
+ initializeNodeSpecific();
+
+ _memoryStatusViewer.reset(new MemoryStatusViewer(
+ _context.getMemoryManager(),
+ _context.getComponentRegister().getMetricManager(),
+ _context.getComponentRegister()));
+
+ _statusMetrics.reset(new StatusMetricConsumer(
+ _context.getComponentRegister(), _context.getComponentRegister().getMetricManager()));
+ _stateReporter.reset(new StateReporter(
+ _context.getComponentRegister(), _context.getComponentRegister().getMetricManager(),
+ _generationFetcher));
+
+ // Start deadlock detector
+ _deadLockDetector.reset(new DeadLockDetector(
+ _context.getComponentRegister()));
+ _deadLockDetector->enableWarning(
+ _serverConfig->enableDeadLockDetectorWarnings);
+ _deadLockDetector->enableShutdown(_serverConfig->enableDeadLockDetector);
+ _deadLockDetector->setProcessSlack(framework::MilliSecTime(
+ static_cast<uint32_t>(
+ _serverConfig->deadLockDetectorTimeoutSlack * 1000)));
+ _deadLockDetector->setWaitSlack(framework::MilliSecTime(
+ static_cast<uint32_t>(
+ _serverConfig->deadLockDetectorTimeoutSlack * 1000)));
+
+ _chain.reset(createChain().release());
+
+ // Start the metric manager, such that it starts generating snapshots
+ // and the like. Note that at this time, all metrics should hopefully
+ // have been created, such that we don't need to pay the extra cost of
+ // reinitializing metric manager often.
+ _context.getComponentRegister().getMetricManager().init(_configUri, _context.getThreadPool());
+
+ if (_chain.get() != 0) {
+ LOG(debug, "Storage chain configured. Calling open()");
+ _chain->open();
+ }
+
+ initializeStatusWebServer();
+
+ // Write pid file as the last thing we do. If we fail initialization
+ // due to an exception we won't run shutdown. Thus we won't remove the
+ // pid file if something throws after writing it in initialization.
+ // Initialize _pidfile here, such that we can know that we didn't create
+ // it in shutdown code for shutdown during init.
+ _pidFile = _rootFolder + "/pidfile";
+ writePidFile(_pidFile);
+}
+
+void
+StorageNode::initializeStatusWebServer()
+{
+ if (_singleThreadedDebugMode) return;
+ _statusWebServer.reset(new StatusWebServer(
+ _context.getComponentRegister(), _context.getComponentRegister(),
+ _configUri));
+}
+
+#define DIFFER(a) (!(oldC.a == newC.a))
+#define ASSIGN(a) { oldC.a = newC.a; updated = true; }
+#define DIFFERWARN(a, b) \
+ if (DIFFER(a)) { LOG(warning, "Live config failure: %s.", b); }
+
+void
+StorageNode::setNewDocumentRepo(const document::DocumentTypeRepo::SP& repo)
+{
+ vespalib::LockGuard configLockGuard(_configLock);
+ _context.getComponentRegister().setDocumentTypeRepo(repo);
+ if (_communicationManager != 0) {
+ _communicationManager->updateMessagebusProtocol(repo);
+ }
+}
+
+void
+StorageNode::updateUpgradeFlag(const vespa::config::content::UpgradingConfig& config)
+{
+ framework::UpgradeFlags flag(framework::NO_UPGRADE_SPECIAL_HANDLING_ACTIVE);
+ if (config.upgradingMajorTo) {
+ flag = framework::UPGRADING_TO_MAJOR_VERSION;
+ } else if (config.upgradingMinorTo) {
+ flag = framework::UPGRADING_TO_MINOR_VERSION;
+ } else if (config.upgradingMajorFrom) {
+ flag = framework::UPGRADING_FROM_MAJOR_VERSION;
+ } else if (config.upgradingMinorFrom) {
+ flag = framework::UPGRADING_FROM_MINOR_VERSION;
+ }
+ _context.getComponentRegister().setUpgradeFlag(flag);
+}
+
+void
+StorageNode::handleLiveConfigUpdate()
+{
+ // Make sure we don't conflict with initialize or shutdown threads.
+ vespalib::LockGuard configLockGuard(_configLock);
+ // If storage haven't initialized, ignore. Initialize code will handle
+ // this config.
+ if (_chain.get() == 0) return;
+ // If we get here, initialize is done running. We have to handle changes
+ // we want to handle.
+
+ if (_newServerConfig.get() != 0) {
+ bool updated = false;
+ vespa::config::content::core::StorServerConfigBuilder oldC(*_serverConfig);
+ vespa::config::content::core::StorServerConfig& newC(*_newServerConfig);
+ DIFFERWARN(rootFolder, "Cannot alter root folder of node live");
+ DIFFERWARN(clusterName, "Cannot alter cluster name of node live");
+ DIFFERWARN(nodeIndex, "Cannot alter node index of node live");
+ DIFFERWARN(isDistributor, "Cannot alter role of node live");
+ {
+ if (DIFFER(memorytouse)) {
+ LOG(info, "Live config update: Memory to use changed "
+ "from %" PRId64 " to %" PRId64 ".",
+ oldC.memorytouse, newC.memorytouse);
+ ASSIGN(memorytouse);
+ _context.getMemoryManager().setMaximumMemoryUsage(
+ newC.memorytouse);
+ }
+ }
+ _serverConfig.reset(new vespa::config::content::core::StorServerConfig(oldC));
+ _newServerConfig.reset(0);
+ (void)updated;
+ }
+ if (_newDistributionConfig.get() != 0) {
+ vespa::config::content::StorDistributionConfigBuilder oldC(*_distributionConfig);
+ vespa::config::content::StorDistributionConfig& newC(*_newDistributionConfig);
+ bool updated = false;
+ if (DIFFER(redundancy)) {
+ LOG(info, "Live config update: Altering redundancy from %u to %u.",
+ oldC.redundancy, newC.redundancy);
+ ASSIGN(redundancy);
+ }
+ if (DIFFER(initialRedundancy)) {
+ LOG(info, "Live config update: Altering initial redundancy "
+ "from %u to %u.",
+ oldC.initialRedundancy, newC.initialRedundancy);
+ ASSIGN(initialRedundancy);
+ }
+ if (DIFFER(ensurePrimaryPersisted)) {
+ LOG(info, "Live config update: Now%s requiring primary copy to "
+ "succeed for n of m operation to succeed.",
+ newC.ensurePrimaryPersisted ? "" : " not");
+ ASSIGN(ensurePrimaryPersisted);
+ }
+ if (DIFFER(activePerLeafGroup)) {
+ LOG(info, "Live config update: Active per leaf group setting "
+ "altered from %s to %s",
+ oldC.activePerLeafGroup ? "true" : "false",
+ newC.activePerLeafGroup ? "true" : "false");
+ ASSIGN(activePerLeafGroup);
+ }
+ if (DIFFER(readyCopies)) {
+ LOG(info, "Live config update: Altering number of searchable "
+ "copies from %u to %u",
+ oldC.readyCopies, newC.readyCopies);
+ ASSIGN(readyCopies);
+ }
+ if (DIFFER(group)) {
+ LOG(info, "Live config update: Group structure altered.");
+ ASSIGN(group);
+ }
+ if (DIFFER(diskDistribution)) {
+ LOG(info, "Live config update: Disk distribution altered from "
+ "%s to %s.",
+ vespa::config::content::StorDistributionConfig::getDiskDistributionName(
+ oldC.diskDistribution).c_str(),
+ vespa::config::content::StorDistributionConfig::getDiskDistributionName(
+ newC.diskDistribution).c_str());
+ ASSIGN(diskDistribution);
+ }
+ _distributionConfig.reset(new vespa::config::content::StorDistributionConfig(oldC));
+ _newDistributionConfig.reset(0);
+ if (updated) {
+ _context.getComponentRegister().setDistribution(
+ lib::Distribution::SP(new lib::Distribution(oldC)));
+ for (StorageLink* link = _chain.get(); link != 0;
+ link = link->getNextLink())
+ {
+ link->storageDistributionChanged();
+ }
+ }
+ }
+ if (_newClusterConfig.get() != 0) {
+ updateUpgradeFlag(*_newClusterConfig);
+ if (*_clusterConfig != *_newClusterConfig) {
+ LOG(warning, "Live config failure: "
+ "Cannot alter cluster config of node live.");
+ }
+ _newClusterConfig.reset(0);
+ }
+ if (_newPriorityConfig.get() != 0) {
+ _priorityConfig = std::move(_newPriorityConfig);
+ _context.getComponentRegister().setPriorityConfig(*_priorityConfig);
+ }
+}
+
+void
+StorageNode::notifyDoneInitializing()
+{
+ bool isDistributor = (getNodeType() == lib::NodeType::DISTRIBUTOR);
+ LOG(info, "%s node ready. Done initializing. Giving out of sequence "
+ "metric event. Config id is %s",
+ isDistributor ? "Distributor" : "Storage", _configUri.getConfigId().c_str());
+ _context.getComponentRegister().getMetricManager().forceEventLogging();
+ if (!_singleThreadedDebugMode) {
+ EV_STARTED(isDistributor ? "distributor" : "storagenode");
+ }
+
+ NodeStateUpdater::Lock::SP lock(
+ _component->getStateUpdater().grabStateChangeLock());
+ lib::NodeState ns(*_component->getStateUpdater().getReportedNodeState());
+ ns.setState(lib::State::UP);
+ _component->getStateUpdater().setReportedNodeState(ns);
+ _chain->doneInit();
+}
+
+StorageNode::~StorageNode()
+{
+}
+
+void
+StorageNode::removeConfigSubscriptions()
+{
+ LOG(debug, "Removing config subscribers");
+ _configFetcher.reset(0);
+}
+
+void
+StorageNode::shutdown()
+{
+ // Try to shut down in opposite order of initialize. Bear in mind that
+ // we might be shutting down after init exception causing only parts
+ // of the server to have initialize
+ LOG(debug, "Shutting down storage node of type %s",
+ getNodeType().toString().c_str());
+ if (!_attemptedStopped) {
+ LOG(warning, "Storage killed before requestShutdown() was called. No "
+ "reason has been given for why we're stopping.");
+ }
+ // Remove the subscription to avoid more callbacks from config
+ removeConfigSubscriptions();
+
+ if (_chain.get()) {
+ LOG(debug, "Closing storage chain");
+ _chain->close();
+ LOG(debug, "Flushing storage chain");
+ _chain->flush();
+ }
+
+ if (_pidFile != "") {
+ LOG(debug, "Removing pid file");
+ removePidFile(_pidFile);
+ }
+
+ if (!_singleThreadedDebugMode) {
+ EV_STOPPING(getNodeType() == lib::NodeType::DISTRIBUTOR
+ ? "distributor" : "storagenode", "Stopped");
+ }
+
+ if (_context.getComponentRegister().hasMetricManager()) {
+ LOG(debug, "Stopping metric manager. "
+ "(Deleting chain may remove metrics)");
+ _context.getComponentRegister().getMetricManager().stop();
+ }
+
+ // Delete the status web server before the actual status providers, to
+ // ensure that web server does not query providers during shutdown
+ _statusWebServer.reset(0);
+
+ // For this to be safe, noone can touch the state updater after we start
+ // deleting the storage chain
+ LOG(debug, "Removing state updater pointer as we're about to delete it.");
+ if (_chain.get()) {
+ LOG(debug, "Deleting storage chain");
+ _chain.reset(0);
+ }
+ if (_statusMetrics.get()) {
+ LOG(debug, "Deleting status metrics consumer");
+ _statusMetrics.reset(0);
+ }
+ if (_stateReporter.get()) {
+ LOG(debug, "Deleting state reporter");
+ _stateReporter.reset(0);
+ }
+ if (_memoryStatusViewer.get()) {
+ LOG(debug, "Deleting memory status viewer");
+ _memoryStatusViewer.reset(0);
+ }
+ if (_stateManager.get()) {
+ LOG(debug, "Deleting state manager");
+ _stateManager.reset(0);
+ }
+ if (_deadLockDetector.get()) {
+ LOG(debug, "Deleting dead lock detector");
+ _deadLockDetector.reset(0);
+ }
+ if (_metricManager.get()) {
+ LOG(debug, "Deleting metric manager");
+ _metricManager.reset(0);
+ }
+ if (_metrics.get()) {
+ LOG(debug, "Deleting metric set");
+ _metrics.reset();
+ }
+ if (_component.get()) {
+ LOG(debug, "Deleting component");
+ _component.reset();
+ }
+
+ LOG(debug, "Done shutting down node");
+}
+
+void StorageNode::configure(std::unique_ptr<vespa::config::content::core::StorServerConfig> config)
+{
+ // When we get config, we try to grab the config lock to ensure noone
+ // else is doing configuration work, and then we write the new config
+ // to a variable where we can find it later when processing config
+ // updates
+ {
+ vespalib::LockGuard configLockGuard(_configLock);
+ _newServerConfig.reset(config.release());
+ }
+ if (_serverConfig.get() != 0) handleLiveConfigUpdate();
+}
+
+void
+StorageNode::configure(std::unique_ptr<vespa::config::content::UpgradingConfig> config)
+{
+ // When we get config, we try to grab the config lock to ensure noone
+ // else is doing configuration work, and then we write the new config
+ // to a variable where we can find it later when processing config
+ // updates
+ {
+ vespalib::LockGuard configLockGuard(_configLock);
+ _newClusterConfig.reset(config.release());
+ }
+ if (_clusterConfig.get() != 0) handleLiveConfigUpdate();
+}
+
+void
+StorageNode::configure(std::unique_ptr<vespa::config::content::StorDistributionConfig> config)
+{
+ // When we get config, we try to grab the config lock to ensure noone
+ // else is doing configuration work, and then we write the new config
+ // to a variable where we can find it later when processing config
+ // updates
+ {
+ vespalib::LockGuard configLockGuard(_configLock);
+ _newDistributionConfig.reset(config.release());
+ }
+ if (_distributionConfig.get() != 0) handleLiveConfigUpdate();
+}
+
+void
+StorageNode::configure(std::unique_ptr<vespa::config::content::core::StorPrioritymappingConfig> config)
+{
+ {
+ vespalib::LockGuard configLockGuard(_configLock);
+ _newPriorityConfig.reset(config.release());
+ }
+ if (_priorityConfig.get() != 0) handleLiveConfigUpdate();
+}
+
+void StorageNode::configure(std::unique_ptr<document::DocumenttypesConfig> config,
+ bool hasChanged, int64_t generation)
+{
+ (void) generation;
+ if (!hasChanged)
+ return;
+ {
+ vespalib::LockGuard configLockGuard(_configLock);
+ _newDoctypesConfig.reset(config.release());
+ }
+ if (_doctypesConfig.get() != 0) handleLiveConfigUpdate();
+}
+
+bool
+StorageNode::attemptedStopped() const
+{
+ return _attemptedStopped;
+}
+
+void
+StorageNode::updateMetrics(const MetricLockGuard &) {
+ _metrics->updateMetrics();
+}
+
+void
+StorageNode::waitUntilInitialized(uint32_t timeout) {
+ framework::defaultimplementation::RealClock clock;
+ framework::MilliSecTime endTime(
+ clock.getTimeInMillis() + framework::MilliSecTime(1000 * timeout));
+ while (true) {
+ {
+ NodeStateUpdater::Lock::SP lock(
+ _component->getStateUpdater().grabStateChangeLock());
+ lib::NodeState nodeState(
+ *_component->getStateUpdater().getReportedNodeState());
+ if (nodeState.getState() == lib::State::UP) break;
+ }
+ FastOS_Thread::Sleep(10);
+ if (clock.getTimeInMillis() >= endTime) {
+ std::ostringstream ost;
+ ost << "Storage server not initialized after waiting timeout of "
+ << timeout << " seconds.";
+ throw vespalib::IllegalStateException(ost.str(), VESPA_STRLOC);
+ }
+ }
+}
+
+void
+StorageNode::requestShutdown(vespalib::stringref reason)
+{
+ if (_attemptedStopped) return;
+ if (_component) {
+ NodeStateUpdater::Lock::SP lock(_component->getStateUpdater().grabStateChangeLock());
+ lib::NodeState nodeState(*_component->getStateUpdater().getReportedNodeState());
+ if (nodeState.getState() != lib::State::STOPPING) {
+ nodeState.setState(lib::State::STOPPING);
+ nodeState.setDescription(reason);
+ _component->getStateUpdater().setReportedNodeState(nodeState);
+ }
+ }
+ _attemptedStopped = true;
+}
+
+
+void
+StorageNode::notifyPartitionDown(int partId, vespalib::stringref reason)
+{
+ if (!_component)
+ return;
+ NodeStateUpdater::Lock::SP lock(_component->getStateUpdater().grabStateChangeLock());
+ lib::NodeState nodeState(*_component->getStateUpdater().getReportedNodeState());
+ if (partId >= nodeState.getDiskCount())
+ return;
+ lib::DiskState diskState(nodeState.getDiskState(partId));
+ if (diskState.getState() == lib::State::DOWN)
+ return;
+ diskState.setState(lib::State::DOWN);
+ diskState.setDescription(reason);
+ nodeState.setDiskState(partId, diskState);
+ if (allDisksDown(nodeState)) {
+ nodeState.setState(lib::State::DOWN);
+ nodeState.setDescription("All partitions are down");
+ }
+ _component->getStateUpdater().setReportedNodeState(nodeState);
+}
+
+
+std::unique_ptr<StateManager>
+StorageNode::releaseStateManager() {
+ return std::move(_stateManager);
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/storagenode.h b/storage/src/vespa/storage/storageserver/storagenode.h
new file mode 100644
index 00000000000..a63bd1ce4e9
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/storagenode.h
@@ -0,0 +1,195 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::StorageNode
+ * @ingroup storageserver
+ *
+ * @brief Main storage server class.
+ *
+ * This class sets up the entire storage server.
+ *
+ * @author H?kon Humberset
+ * @date 2005-05-13
+ * @version $Id: storageserver.h 131081 2011-12-16 18:44:06Z lulf $
+ */
+
+#pragma once
+
+#include <boost/utility.hpp>
+#include <vespa/document/bucket/bucketidfactory.h>
+#include <vespa/fastos/fastos.h>
+#include <memory>
+#include <string>
+#include <vespa/storage/config/config-stor-server.h>
+
+#include <vespa/config/helper/legacysubscriber.h>
+#include <vespa/document/bucket/bucketid.h>
+#include <vespa/document/config/config-documenttypes.h>
+#include <vespa/documentapi/loadtypes/loadtypeset.h>
+#include <vespa/metrics/metrics.h>
+#include <vespa/storage/bucketdb/distrbucketdb.h>
+#include <vespa/storage/bucketdb/storbucketdb.h>
+#include <vespa/storage/common/doneinitializehandler.h>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storage/common/visitorfactory.h>
+#include <vespa/storage/config/config-stor-prioritymapping.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+#include <vespa/storageframework/defaultimplementation/component/componentregisterimpl.h>
+#include <vespa/storage/frameworkimpl/status/statuswebserver.h>
+#include <vespa/storage/frameworkimpl/thread/deadlockdetector.h>
+#include <vespa/storageframework/defaultimplementation/memory/memorymanager.h>
+#include <vespa/storageframework/defaultimplementation/thread/threadpoolimpl.h>
+#include <vespa/storage/frameworkimpl/memory/memorystatusviewer.h>
+#include <vespa/storage/storageserver/applicationgenerationfetcher.h>
+#include <vespa/storage/storageserver/storagenodecontext.h>
+#include <vespa/storage/storageserver/storagemetricsset.h>
+#include <vespa/storage/visiting/visitormessagesessionfactory.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storage/storageutil/resumeguard.h>
+#include <vespa/config-upgrading.h>
+#include <vespa/config-stor-distribution.h>
+
+namespace storage {
+
+class StatusMetricConsumer;
+class StateReporter;
+class CommunicationManager;
+class FileStorManager;
+class HostInfo;
+class StateManager;
+
+class StorageNode : private boost::noncopyable,
+ private config::IFetcherCallback<vespa::config::content::core::StorServerConfig>,
+ private config::IFetcherCallback<vespa::config::content::StorDistributionConfig>,
+ private config::IFetcherCallback<vespa::config::content::UpgradingConfig>,
+ private config::IFetcherCallback<vespa::config::content::core::StorPrioritymappingConfig>,
+ private framework::MetricUpdateHook,
+ private DoneInitializeHandler,
+ private framework::defaultimplementation::ShutdownListener
+{
+public:
+ enum RunMode { NORMAL, SINGLE_THREADED_TEST_MODE };
+
+ /**
+ * @param excludeStorageChain With this option set, no chain will be set
+ * up. This can be useful in unit testing if you need a storage server
+ * instance, but you want to have full control over the components yourself.
+ */
+ StorageNode(const config::ConfigUri & configUri,
+ StorageNodeContext& context,
+ ApplicationGenerationFetcher& generationFetcher,
+ std::unique_ptr<HostInfo> hostInfo,
+ RunMode = NORMAL);
+ virtual ~StorageNode();
+
+ virtual const lib::NodeType& getNodeType() const = 0;
+
+ bool attemptedStopped() const;
+
+ virtual void notifyDoneInitializing();
+ void waitUntilInitialized(uint32_t timeoutSeconds = 15);
+
+ void updateMetrics(const MetricLockGuard & guard);
+
+ /** Updates the document type repo. */
+ void setNewDocumentRepo(const document::DocumentTypeRepo::SP& repo);
+
+ /**
+ * Pauses the persistence processing. While the returned ResumeGuard
+ * is alive, no calls will be made towards the persistence provider.
+ */
+ virtual ResumeGuard pause() = 0;
+
+ void requestShutdown(vespalib::stringref reason);
+
+ void
+ notifyPartitionDown(int partId, vespalib::stringref reason);
+
+ DoneInitializeHandler& getDoneInitializeHandler() { return *this; }
+
+ // For testing
+ StorageLink* getChain() { return _chain.get(); }
+
+ virtual void initializeStatusWebServer();
+
+private:
+ bool _singleThreadedDebugMode;
+ // Subscriptions to config
+ std::unique_ptr<config::ConfigFetcher> _configFetcher;
+
+ std::unique_ptr<HostInfo> _hostInfo;
+
+ StorageNodeContext& _context;
+ ApplicationGenerationFetcher& _generationFetcher;
+ vespalib::string _rootFolder;
+ bool _attemptedStopped;
+ vespalib::string _pidFile;
+
+ // First components that doesn't depend on others
+ std::unique_ptr<StatusWebServer> _statusWebServer;
+ std::shared_ptr<StorageMetricSet> _metrics;
+ std::unique_ptr<metrics::MetricManager> _metricManager;
+
+ // Depends on bucket databases and stop() functionality
+ std::unique_ptr<DeadLockDetector> _deadLockDetector;
+ // Depends on dead lock detector and threadpool
+ std::unique_ptr<MemoryStatusViewer> _memoryStatusViewer;
+ // Depends on metric manager
+ std::unique_ptr<StatusMetricConsumer> _statusMetrics;
+ // Depends on metric manager
+ std::unique_ptr<StateReporter> _stateReporter;
+
+ std::unique_ptr<StateManager> _stateManager;
+
+ // The storage chain can depend on anything.
+ std::unique_ptr<StorageLink> _chain;
+
+ /** Implementation of config callbacks. */
+ virtual void configure(std::unique_ptr<vespa::config::content::core::StorServerConfig> config);
+ virtual void configure(std::unique_ptr<vespa::config::content::UpgradingConfig> config);
+ virtual void configure(std::unique_ptr<vespa::config::content::StorDistributionConfig> config);
+ virtual void configure(std::unique_ptr<vespa::config::content::core::StorPrioritymappingConfig>);
+ virtual void configure(std::unique_ptr<document::DocumenttypesConfig> config,
+ bool hasChanged, int64_t generation);
+ void updateUpgradeFlag(const vespa::config::content::UpgradingConfig&);
+
+protected:
+ // Lock taken while doing configuration of the server.
+ vespalib::Lock _configLock;
+ // Current running config. Kept, such that we can see what has been
+ // changed in live config updates.
+ std::unique_ptr<vespa::config::content::core::StorServerConfig> _serverConfig;
+ std::unique_ptr<vespa::config::content::UpgradingConfig> _clusterConfig;
+ std::unique_ptr<vespa::config::content::StorDistributionConfig> _distributionConfig;
+ std::unique_ptr<vespa::config::content::core::StorPrioritymappingConfig> _priorityConfig;
+ std::unique_ptr<document::DocumenttypesConfig> _doctypesConfig;
+ // New configs gotten that has yet to have been handled
+ std::unique_ptr<vespa::config::content::core::StorServerConfig> _newServerConfig;
+ std::unique_ptr<vespa::config::content::UpgradingConfig> _newClusterConfig;
+ std::unique_ptr<vespa::config::content::StorDistributionConfig> _newDistributionConfig;
+ std::unique_ptr<vespa::config::content::core::StorPrioritymappingConfig> _newPriorityConfig;
+ std::unique_ptr<document::DocumenttypesConfig> _newDoctypesConfig;
+ StorageComponent::UP _component;
+ config::ConfigUri _configUri;
+ CommunicationManager* _communicationManager;
+
+ /**
+ * Node subclasses currently need to explicitly acquire ownership of state
+ * manager so that they can add it to the end of their processing chains,
+ * which this method allows for.
+ * Any component releasing the state manager must ensure it lives for as
+ * long as the node instance itself lives.
+ */
+ std::unique_ptr<StateManager> releaseStateManager();
+
+ void initialize();
+ virtual void subscribeToConfigs();
+ virtual void initializeNodeSpecific() = 0;
+ virtual StorageLink::UP createChain() = 0;
+ virtual void handleLiveConfigUpdate();
+ void shutdown();
+ virtual void removeConfigSubscriptions();
+
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageserver/storagenodecontext.cpp b/storage/src/vespa/storage/storageserver/storagenodecontext.cpp
new file mode 100644
index 00000000000..90aefcaceae
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/storagenodecontext.cpp
@@ -0,0 +1,33 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageserver/storagenodecontext.h>
+
+#include <vespa/storageframework/defaultimplementation/memory/prioritymemorylogic.h>
+
+using storage::framework::defaultimplementation::AllocationLogic;
+
+namespace storage {
+
+StorageNodeContext::StorageNodeContext(ComponentRegister::UP compReg, framework::Clock::UP clock)
+ : _componentRegister(std::move(compReg)),
+ _clock(std::move(clock)),
+ _threadPool(*_clock),
+ _memoryLogic(new framework::defaultimplementation::PriorityMemoryLogic(
+ *_clock, 1024 * 1024 * 1024)),
+ _memoryManager(AllocationLogic::UP(_memoryLogic))
+{
+ _componentRegister->setClock(*_clock);
+ _componentRegister->setThreadPool(_threadPool);
+ _componentRegister->setMemoryManager(_memoryManager);
+}
+
+void
+StorageNodeContext::setMaximumMemoryUsage(uint64_t max)
+{
+ using storage::framework::defaultimplementation::PriorityMemoryLogic;
+ dynamic_cast<PriorityMemoryLogic*>(_memoryLogic)
+ ->setMaximumMemoryUsage(max);
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageserver/storagenodecontext.h b/storage/src/vespa/storage/storageserver/storagenodecontext.h
new file mode 100644
index 00000000000..dfd8b93f08e
--- /dev/null
+++ b/storage/src/vespa/storage/storageserver/storagenodecontext.h
@@ -0,0 +1,69 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::StorageNodeContext
+ * @ingroup storageserver
+ *
+ * @brief Data available to both provider implementations and storage server
+ *
+ * This utility class sets up the default component register implementation.
+ * It also sets up the clock and the threadpool, such that the most basic
+ * features are available to the provider, before the service layer is set up.
+ *
+ * The service layer still provides the memory manager functionality though,
+ * so you cannot retrieve the memory manager before the service layer has
+ * started up. (Before getPartitionStates() have been called on provider)
+ */
+
+#pragma once
+
+#include <vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h>
+#include <vespa/storageframework/defaultimplementation/clock/realclock.h>
+#include <vespa/storageframework/defaultimplementation/memory/memorymanager.h>
+#include <vespa/storageframework/defaultimplementation/thread/threadpoolimpl.h>
+
+namespace storage {
+
+struct StorageNodeContext {
+ // Typedefs to simplify the remainder of the interface
+ typedef StorageComponentRegisterImpl ComponentRegister;
+ typedef framework::defaultimplementation::RealClock RealClock;
+ typedef framework::defaultimplementation::MemoryManager MemoryManager;
+
+ /**
+ * Get the actual component register. Available as the actual type as the
+ * storage server need to set implementations, and the components need the
+ * actual component register interface.
+ */
+ ComponentRegister& getComponentRegister() { return *_componentRegister; }
+
+ /**
+ * There currently exist threads that doesn't use the component model.
+ * Let the backend threadpool be accessible for now.
+ */
+ FastOS_ThreadPool& getThreadPool() { return _threadPool.getThreadPool(); }
+
+ /**
+ * Get the memory manager. Components that wants to print status of memory
+ * manager need access to the actual implementation.
+ */
+ MemoryManager& getMemoryManager() { return _memoryManager; }
+
+ void setMaximumMemoryUsage(uint64_t max);
+
+protected:
+ // Initialization has been split in two as subclass needs to initialize
+ // component register before sending it on.
+ StorageNodeContext(ComponentRegister::UP,
+ framework::Clock::UP);
+
+private:
+ ComponentRegister::UP _componentRegister;
+ framework::Clock::UP _clock;
+ framework::defaultimplementation::ThreadPoolImpl _threadPool;
+ framework::defaultimplementation::AllocationLogic* _memoryLogic;
+ MemoryManager _memoryManager;
+
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageutil/.gitignore b/storage/src/vespa/storage/storageutil/.gitignore
new file mode 100644
index 00000000000..dfa09296ddb
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/.gitignore
@@ -0,0 +1,4 @@
+*.So
+.*.swp
+.depend
+Makefile
diff --git a/storage/src/vespa/storage/storageutil/CMakeLists.txt b/storage/src/vespa/storage/storageutil/CMakeLists.txt
new file mode 100644
index 00000000000..43f556f5066
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_storageutil OBJECT
+ SOURCES
+ bloomfilter.cpp
+ piechart.cpp
+ palette.cpp
+ graph.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/storageutil/bloomfilter.cpp b/storage/src/vespa/storage/storageutil/bloomfilter.cpp
new file mode 100644
index 00000000000..26e9e4012d7
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/bloomfilter.cpp
@@ -0,0 +1,41 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include "bloomfilter.h"
+#include <stdlib.h>
+#include <string.h>
+
+BloomFilter::BloomFilter(int size, int hashes, uint32_t *buf)
+ : _size(size),
+ _hashes(hashes),
+ _buf(buf),
+ _mine(false)
+{
+ if (!_buf) {
+ _buf = new uint32_t[(_size / 32) + 1];
+ memset(_buf, 0, ((_size / 32) + 1) * sizeof(uint32_t));
+ _mine = true;
+ }
+}
+
+BloomFilter::~BloomFilter()
+{
+ if (_mine) {
+ delete [] _buf;
+ }
+}
+
+/*
+int main(int argc, char **argv)
+{
+ int size = atoi(argv[1]);
+ int hashes = atoi(argv[2]);
+ char buf[1000];
+ BloomFilter bloom(size, hashes);
+
+ while (fgets(buf, sizeof(buf), stdin)) {
+ if (bloom.check(buf, true)) {
+ printf("matched %s\n", buf);
+ }
+ }
+}
+*/
diff --git a/storage/src/vespa/storage/storageutil/bloomfilter.h b/storage/src/vespa/storage/storageutil/bloomfilter.h
new file mode 100644
index 00000000000..090877ae925
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/bloomfilter.h
@@ -0,0 +1,118 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <stdio.h>
+#include <inttypes.h>
+
+class BloomFilter
+{
+private:
+ BloomFilter(const BloomFilter &);
+ BloomFilter& operator=(const BloomFilter &);
+
+public:
+ BloomFilter(int size, int hashes, uint32_t *buf = NULL);
+ ~BloomFilter();
+
+ bool check(const uint32_t *data, int len, bool add);
+ bool check(const char *data, int len, bool add);
+ bool check(const char *data, bool add);
+
+
+ private:
+ int _size;
+ int _hashes;
+ uint32_t *_buf;
+ bool _mine;
+
+ static const uint32_t MULT1 = 1500450271;
+ static const uint32_t MULT2 = 2860486313U;
+ uint32_t hash(const uint32_t *data, int len, uint32_t multiplier, uint32_t max);
+ uint32_t hash(const char *data, int len, uint32_t multiplier, uint32_t max);
+ uint32_t hash(const char *data, uint32_t multiplier, uint32_t max);
+
+ bool check(uint32_t hash1, uint32_t hash2, bool add);
+ bool isSet(uint32_t pos, bool set);
+
+};
+
+uint32_t
+BloomFilter::hash(const uint32_t *data, int len, uint32_t multiplier, uint32_t max)
+{
+ uint32_t val = 1;
+ for (int i = 0; i < len; i++) {
+ val = (multiplier * val + data[i]) % max;
+ }
+ return val;
+}
+
+uint32_t
+BloomFilter::hash(const char *data, int len, uint32_t multiplier, uint32_t max)
+{
+ uint32_t val = 1;
+ for (int i = 0; i < len; i++) {
+ val = (multiplier * val + data[i]) % max;
+ }
+ return val;
+}
+
+uint32_t
+BloomFilter::hash(const char *data, uint32_t multiplier, uint32_t max)
+{
+ uint32_t val = 1;
+ for (int i = 0; data[i]; i++) {
+ val = (multiplier * val + data[i]) % max;
+ }
+ return val;
+}
+
+
+bool
+BloomFilter::check(const uint32_t *data, int len, bool add)
+{
+ uint32_t hash1 = hash(data, len, MULT1, _size);
+ uint32_t hash2 = hash(data, len, MULT2, _size);
+ return check(hash1, hash2, add);
+}
+
+bool
+BloomFilter::check(const char *data, int len, bool add)
+{
+ uint32_t hash1 = hash(data, len, MULT1, _size);
+ uint32_t hash2 = hash(data, len, MULT2, _size);
+ return check(hash1, hash2, add);
+}
+bool
+BloomFilter::check(const char *data, bool add)
+{
+ uint32_t hash1 = hash(data, MULT1, _size);
+ uint32_t hash2 = hash(data, MULT2, _size);
+ return check(hash1, hash2, add);
+}
+
+bool
+BloomFilter::check(uint32_t hash1, uint32_t hash2, bool add)
+{
+ bool found = true;
+ for (int i = 0; i < _hashes; i++) {
+ hash1 = (hash1 + hash2) % _size;
+ hash2 = (hash2 + i) % _size;
+ if (!isSet(hash1, add)) {
+ if (!add) {
+ return false;
+ }
+ found = false;
+ }
+ }
+ return found;
+}
+
+bool
+BloomFilter::isSet(uint32_t pos, bool add)
+{
+ if ((_buf[pos >> 5] & (1 << (pos & 31))) == 0) {
+ if (add) {
+ _buf[pos >> 5] |= (1 << (pos & 31));
+ }
+ return false;
+ }
+ return true;
+}
diff --git a/storage/src/vespa/storage/storageutil/distributorstatecache.h b/storage/src/vespa/storage/storageutil/distributorstatecache.h
new file mode 100644
index 00000000000..ec6a614d9d7
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/distributorstatecache.h
@@ -0,0 +1,58 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/vdslib/distribution/distribution.h>
+
+namespace storage {
+
+class DistributorStateCache
+{
+public:
+ DistributorStateCache(
+ const lib::Distribution& distr,
+ const lib::ClusterState& state)
+ : _distribution(distr),
+ _state(state),
+ _distrBitMask(0xffffffffffffffffull),
+ _lastDistrBits(0xffffffffffffffffull),
+ _lastResult(0xffff)
+ {
+ _distrBitMask <<= (64 - state.getDistributionBitCount());
+ _distrBitMask >>= (64 - state.getDistributionBitCount());
+ }
+
+ uint16_t getOwner(const document::BucketId& bid,
+ const char* upStates = "ui")
+ {
+ uint64_t distributionBits = bid.getRawId() & _distrBitMask;
+
+ uint16_t i;
+ if (distributionBits == _lastDistrBits) {
+ i = _lastResult;
+ } else {
+ i = _distribution.getIdealDistributorNode(_state, bid, upStates);
+ }
+ _lastDistrBits = distributionBits;
+ _lastResult = i;
+ return i;
+ }
+
+ const lib::Distribution& getDistribution() const {
+ return _distribution;
+ }
+
+ const lib::ClusterState& getClusterState() const {
+ return _state;
+ }
+
+private:
+ const lib::Distribution& _distribution;
+ const lib::ClusterState& _state;
+ uint64_t _distrBitMask;
+ uint64_t _lastDistrBits;
+ uint16_t _lastResult;
+};
+
+}
+
diff --git a/storage/src/vespa/storage/storageutil/functor.h b/storage/src/vespa/storage/storageutil/functor.h
new file mode 100644
index 00000000000..504f9fbd57d
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/functor.h
@@ -0,0 +1,60 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @ingroup storageutil
+ *
+ * @brief Functors ards storage, not dependent on external messaging.
+ *
+ * @author H�kon Humberset
+ * @date 2005-05-13
+ * @version $Id$
+ */
+
+#pragma once
+
+namespace storage {
+
+class Functor {
+public:
+
+ /**
+ * For instance, using this functor you can say:
+ *
+ * string mystring("this is a test");
+ * for_each(mystring.begin(), mystring.end(),
+ * Functor.Replace<char>(' ', '_'));
+ *
+ * or
+ *
+ * vector<string> myvector;
+ * for_each(myvector.begin(), myvector.end(),
+ * Functor.Replace<string>("this", "that"));
+ */
+ template<class T>
+ class Replace {
+ private:
+ const T& _what;
+ const T& _with;
+
+ public:
+ Replace(const T& what, const T& with)
+ : _what(what),
+ _with(with) {}
+
+ void operator()(T& element) const
+ { if (element == _what) element = _with; }
+ };
+
+ /**
+ * To easily delete containers of pointers.
+ *
+ * for_each(myvec.begin(), myvec.end(), Functor::DeletePointer());
+ */
+ class DeletePointer {
+ public:
+ template<class T> void operator()(T *ptr) const { delete ptr; }
+ };
+
+};
+
+}
+
diff --git a/storage/src/vespa/storage/storageutil/graph.cpp b/storage/src/vespa/storage/storageutil/graph.cpp
new file mode 100644
index 00000000000..b1b748ffd84
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/graph.cpp
@@ -0,0 +1,201 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageutil/graph.h>
+
+#include <iomanip>
+#include <sstream>
+#include <vespa/vespalib/util/exceptions.h>
+
+namespace storage {
+
+
+void
+Graph::printHtmlHeadAdditions(std::ostream& out, const std::string& indent)
+{
+ (void) out;
+ (void) indent;
+ // FIXME this used to reference Yahoo-internal JS URIs
+}
+
+Graph::Graph(const std::string& name, ColorScheme cs)
+ : _name(name),
+ _graphs(),
+ _colors(cs),
+ _leftPad(50),
+ _rightPad(0),
+ _topPad(0),
+ _bottomPad(0)
+{
+}
+
+void
+Graph::add(const std::vector<Point>& values, const std::string& name)
+{
+ if (_colors == SCHEME_CUSTOM) {
+ throw vespalib::IllegalArgumentException(
+ "Using custom color scheme you need to supply a color for each "
+ "graph.", VESPA_STRLOC);
+ }
+ _graphs.push_back(Entry(values, name, UNDEFINED));
+}
+
+void
+Graph::add(const std::vector<Point>& values, const std::string& name, Color c)
+{
+ if (_colors != SCHEME_CUSTOM) {
+ throw vespalib::IllegalArgumentException(
+ "Not using custom color scheme you cannot supply a custom "
+ "color for a graph.", VESPA_STRLOC);
+ }
+ _graphs.push_back(Entry(values, name, c));
+}
+
+void
+Graph::add(const std::vector<Point>& values, const std::string& name, int32_t c)
+{
+ if (_colors != SCHEME_CUSTOM) {
+ throw vespalib::IllegalArgumentException(
+ "Not using custom color scheme you cannot supply a custom "
+ "color for a graph.", VESPA_STRLOC);
+ }
+ _graphs.push_back(Entry(values, name, (Color) c));
+}
+
+void
+Graph::printCanvas(std::ostream& out, uint32_t width, uint32_t height) const
+{
+ out << "<div><canvas id=\"" << _name << "\" width=\"" << width
+ << "\" height=\"" << height << "\"/></div>";
+}
+
+namespace {
+ void printDatasetDefinition(std::ostream& o, const std::string& i,
+ const std::string& n, const std::vector<Graph::Entry>& e)
+ {
+ o << i << " var " << n << "_dataset = {\n" << std::dec;
+ bool first = true;
+ for (std::vector<Graph::Entry>::const_iterator it = e.begin();
+ it != e.end(); ++it)
+ {
+ if (!first) o << ",\n";
+ first = false;
+ o << i << " '" << it->_name << "': [";
+ for (uint32_t j=0; j<it->points.size(); ++j) {
+ if (j != 0) o << ", ";
+ o << "[" << it->points[j].x << ", " << it->points[j].y << "]";
+ }
+ o << "]";
+ }
+ o << "\n" << i << " };";
+ }
+
+ void printCustomColorScheme(std::ostream& o, const std::string& i,
+ const std::string& n, const std::vector<Graph::Entry>& e)
+ {
+ o << " var " << n << "_customScheme = new Hash({\n" << std::hex;
+ bool first = true;
+ for (std::vector<Graph::Entry>::const_iterator it = e.begin();
+ it != e.end(); ++it)
+ {
+ if (!first) o << ",\n";
+ first = false;
+ o << i << " '" << it->_name << "': '#" << std::setw(6)
+ << std::setfill('0') << (it->_color & 0x00FFFFFF) << "'";
+ }
+ o << "\n" << i << " });" << std::dec;
+ }
+
+ void printOptions(std::ostream& o, const std::string& i,
+ const std::string& n, Graph::ColorScheme c,
+ const std::vector<Graph::Axis>& xAxis,
+ const std::vector<Graph::Axis>& yAxis,
+ uint32_t leftpad, uint32_t rightpad,
+ uint32_t toppad, uint32_t bottompad,
+ uint32_t legendXPos, uint32_t legendYPos)
+ {
+ o << " var " << n << "_options = {\n"
+ << i << " padding: {\n"
+ << i << " left: " << leftpad << ",\n"
+ << i << " right: " << rightpad << ",\n"
+ << i << " top: " << toppad << ",\n"
+ << i << " bottom: " << bottompad << ",\n"
+ << i << " },\n"
+ << i << " background: {\n"
+ << i << " color: '#ffffff'\n"
+ << i << " },\n"
+ << i << " shouldFill: true,\n";
+ if (c == Graph::SCHEME_CUSTOM) {
+ o << i << " \"colorScheme\": " << n << "_customScheme,\n";
+ } else {
+ o << i << " colorScheme: '";
+ switch (c) {
+ case Graph::SCHEME_RED: o << "red"; break;
+ case Graph::SCHEME_BLUE: o << "blue"; break;
+ case Graph::SCHEME_CUSTOM: break;
+ }
+ o << "',\n";
+ }
+ o << i << " legend: {\n"
+ << i << " opacity: 0.9,\n"
+ << i << " position: {\n"
+ << i << " top: " << legendYPos << ",\n"
+ << i << " left: " << legendXPos << "\n"
+ << i << " }\n"
+ << i << " },\n"
+ << i << " axis: {\n"
+ << i << " labelColor: '#000000',\n"
+ << i << " x: {\n";
+ if (xAxis.size() > 0) {
+ o << i << " ticks: [\n";
+ for (uint32_t j=0; j<xAxis.size(); ++j) {
+ o << i << " {v:" << xAxis[j].value << ", label:'"
+ << xAxis[j].name << "'},\n";
+ }
+ o << i << " ]\n";
+ }
+ o << i << " },\n"
+ << i << " y: {\n";
+ if (yAxis.size() > 0) {
+ o << i << " ticks: [\n";
+ for (uint32_t j=0; j<yAxis.size(); ++j) {
+ o << i << " {v:" << yAxis[j].value << ", label:'"
+ << yAxis[j].name << "'},\n";
+ }
+ o << i << " ]\n";
+ }
+
+ o << i << " }\n"
+ << i << " }\n"
+ << i << " };";
+ }
+
+ void printChart(std::ostream& o, const std::string& i, const std::string& n)
+ {
+ o << " var " << n << "_chart = new Plotr.LineChart('" << n
+ << "', " << n << "_options);\n"
+ << i << " " << n << "_chart.addDataset(" << n << "_dataset);\n"
+ << i << " " << n << "_chart.render();";
+ }
+}
+
+void
+Graph::printScript(std::ostream& out, const std::string& indent) const
+{
+ out << "<script type=\"text/javascript\">\n";
+ printDatasetDefinition(out, indent, _name, _graphs);
+ if (_colors == SCHEME_CUSTOM) {
+ out << "\n" << indent;
+ printCustomColorScheme(out, indent, _name, _graphs);
+ }
+ out << "\n" << indent;
+ printOptions(out, indent, _name, _colors, _xAxis, _yAxis,
+ _leftPad, _rightPad, _topPad, _bottomPad,
+ _legendXPos, _legendYPos);
+ out << "\n" << indent;
+ printChart(out, indent, _name);
+ out << "\n" << indent << "</script>";
+}
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageutil/graph.h b/storage/src/vespa/storage/storageutil/graph.h
new file mode 100644
index 00000000000..2892f36031a
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/graph.h
@@ -0,0 +1,96 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::Graph
+ * \ingroup util
+ *
+ * \brief Helper library to print graphs in HTML.
+ */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <ostream>
+
+namespace storage {
+
+class Graph {
+public:
+ enum ColorScheme {
+ SCHEME_CUSTOM,
+ SCHEME_RED,
+ SCHEME_BLUE
+ };
+ enum Color {
+ UNDEFINED = -1,
+ BLACK = 0x000000,
+ RED = 0xFF0000,
+ GREEN = 0x00FF00,
+ BLUE = 0x0000FF,
+ WHITE = 0xFFFFFF,
+ YELLOW = 0xFFFF00
+ };
+ struct Point {
+ double x;
+ double y;
+
+ Point(double x_, double y_) : x(x_), y(y_) {}
+ };
+ struct Entry {
+ std::vector<Point> points;
+ std::string _name;
+ int32_t _color;
+
+ Entry(const std::vector<Point>& v, const std::string& name, int32_t col)
+ : points(v), _name(name), _color(col) {}
+ };
+ struct Axis {
+ double value;
+ std::string name;
+
+ Axis(double val, const std::string& name_) : value(val), name(name_) {}
+ };
+
+ static void printHtmlHeadAdditions(
+ std::ostream& out, const std::string& indent = "");
+
+private:
+ const std::string _name;
+ std::vector<Entry> _graphs;
+ ColorScheme _colors;
+ std::vector<Axis> _xAxis;
+ std::vector<Axis> _yAxis;
+ uint32_t _leftPad;
+ uint32_t _rightPad;
+ uint32_t _topPad;
+ uint32_t _bottomPad;
+ uint32_t _legendXPos;
+ uint32_t _legendYPos;
+
+public:
+ Graph(const std::string&, ColorScheme = SCHEME_BLUE);
+
+ void add(const std::vector<Point>&, const std::string& name);
+ void add(const std::vector<Point>&, const std::string& name, Color c);
+ void add(const std::vector<Point>&, const std::string& name, int32_t color);
+
+ void addXAxisLabel(double value, const std::string& name)
+ { _xAxis.push_back(Axis(value, name)); }
+ void addYAxisLabel(double value, const std::string& name)
+ { _yAxis.push_back(Axis(value, name)); }
+
+ void setBorders(uint32_t left, uint32_t right,
+ uint32_t top, uint32_t bottom)
+ {
+ _leftPad = left; _rightPad = right; _topPad = top; _bottomPad = bottom;
+ }
+
+ void setLegendPos(uint32_t left, uint32_t top)
+ { _legendXPos = left; _legendYPos = top; }
+
+ void printCanvas(std::ostream& out, uint32_t width, uint32_t height) const;
+ void printScript(std::ostream& out, const std::string& indent = "") const;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageutil/log.h b/storage/src/vespa/storage/storageutil/log.h
new file mode 100644
index 00000000000..5d82afed640
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/log.h
@@ -0,0 +1,30 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+#include <vespa/log/log.h>
+
+#define STORAGE_LOG_INTERVAL 30
+
+#define STORAGE_LOG_COUNT(name, interval) do { \
+ static uint64_t C_count ## name = 0; \
+ static time_t C_last ## name = time(NULL); \
+ C_count ## name ++; \
+ time_t C_now ## name = time(NULL); \
+ if (C_now ## name - C_last ## name >= interval) { \
+ EV_COUNT(#name, C_count ## name); \
+ C_last ## name = C_now ## name; \
+ } } while (false)
+
+#define STORAGE_LOG_AVERAGE(name, value, interval) do { \
+ static uint64_t A_count ## name = 0; \
+ static float A_total ## name = 0.0; \
+ static time_t A_last ## name = time(NULL); \
+ A_count ## name ++; \
+ A_total ## name += value; \
+ time_t A_now ## name = time(NULL); \
+ if (A_now ## name - A_last ## name >= interval) { \
+ EV_VALUE(#name, A_total ## name / A_count ## name); \
+ A_count ## name = 0; \
+ A_total ## name = 0; \
+ A_last ## name = A_now ## name; \
+ }} while (false)
+
diff --git a/storage/src/vespa/storage/storageutil/palette.cpp b/storage/src/vespa/storage/storageutil/palette.cpp
new file mode 100644
index 00000000000..76998bd7677
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/palette.cpp
@@ -0,0 +1,111 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageutil/palette.h>
+
+#include <iostream>
+#include <iomanip>
+
+namespace storage {
+
+namespace {
+ struct Col {
+ int16_t red;
+ int16_t green;
+ int16_t blue;
+
+ Col(int16_t r, int16_t g, int16_t b) : red(r), green(g), blue(b) {}
+ };
+
+ std::vector<Col> createMainColors() {
+ std::vector<Col> v;
+ v.push_back(Col(128, 128, 128));
+ v.push_back(Col(255, 0, 0));
+ v.push_back(Col(255, 255, 0));
+ v.push_back(Col(255, 0, 255));
+ v.push_back(Col(0, 255, 0));
+ v.push_back(Col(0, 255, 255));
+ v.push_back(Col(0, 0, 255));
+ v.push_back(Col(128, 64, 192));
+ v.push_back(Col(192, 128, 64));
+ v.push_back(Col(64, 192, 128));
+ return v;
+ }
+
+ std::vector<Col> mainColors(createMainColors());
+}
+
+Palette::Palette(uint32_t colorCount)
+{
+
+ uint32_t variations = (colorCount + mainColors.size() - 1)
+ / (mainColors.size());
+ int16_t darkvars = variations / 2;
+ int16_t lightvars = (variations - 1) / 2;
+
+ std::vector<Col> darkVars;
+ if (darkvars > 0) {
+ for (int32_t i=darkvars; i>0; --i) {
+ for (uint32_t j=0; j<mainColors.size(); ++j) {
+ Col& main(mainColors[j]);
+ int rdiff = main.red / (darkvars + 1);
+ int gdiff = main.green / (darkvars + 1);
+ int bdiff = main.blue / (darkvars + 1);
+ darkVars.push_back(Col(
+ std::max(0, main.red - rdiff * i),
+ std::max(0, main.green - gdiff * i),
+ std::max(0, main.blue - bdiff * i)));
+ }
+ }
+ }
+ std::vector<Col> lightVars;
+ if (lightvars > 0) {
+ for (int32_t i=1; i<=lightvars; ++i) {
+ for (uint32_t j=0; j<mainColors.size(); ++j) {
+ Col& main(mainColors[j]);
+ int rdiff = (255 - main.red) / (lightvars + 1);
+ int gdiff = (255 - main.green) / (lightvars + 1);
+ int bdiff = (255 - main.blue) / (lightvars + 1);
+ lightVars.push_back(Col(
+ std::min(255, main.red + rdiff * i),
+ std::min(255, main.green + gdiff * i),
+ std::min(255, main.blue + bdiff * i)));
+ }
+ }
+ }
+ for (std::vector<Col>::const_iterator it = darkVars.begin();
+ it != darkVars.end(); ++it)
+ {
+ _colors.push_back((it->red << 16) | (it->green << 8) | it->blue);
+ }
+ for (std::vector<Col>::const_iterator it = mainColors.begin();
+ it != mainColors.end(); ++it)
+ {
+ _colors.push_back((it->red << 16) | (it->green << 8) | it->blue);
+ }
+ for (std::vector<Col>::const_iterator it = lightVars.begin();
+ it != lightVars.end(); ++it)
+ {
+ _colors.push_back((it->red << 16) | (it->green << 8) | it->blue);
+ }
+}
+
+void
+Palette::printHtmlTablePalette(std::ostream& out) const
+{
+ out << "<table>" << std::hex << std::setfill('0');
+ uint32_t col = 0;
+ while (col < _colors.size()) {
+ out << "\n<tr>";
+ for (uint32_t i=0; i<mainColors.size(); ++i) {
+ out << "\n <td bgcolor=\"#" << std::setw(6) << _colors[col++]
+ << "\">";
+ for (uint32_t j=0; j<6; ++j) out << "&nbsp;";
+ out << "</td>";
+ }
+ out << "\n</tr>";
+ }
+ out << "\n</table>" << std::dec;
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/storageutil/palette.h b/storage/src/vespa/storage/storageutil/palette.h
new file mode 100644
index 00000000000..fe42eae0d31
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/palette.h
@@ -0,0 +1,30 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::Palette
+ *
+ * \brief Contains a set of distinct colors.
+ *
+ * When writing graphics like charts one wants to use distinct colors.
+ * This class defines some distinct colors.
+ */
+
+#pragma once
+
+#include <vector>
+
+namespace storage {
+
+class Palette {
+ std::vector<uint32_t> _colors;
+
+public:
+ Palette(uint32_t colorCount);
+
+ uint32_t operator[](uint32_t colorIndex) const
+ { return _colors[colorIndex]; }
+
+ void printHtmlTablePalette(std::ostream& out) const;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageutil/piechart.cpp b/storage/src/vespa/storage/storageutil/piechart.cpp
new file mode 100644
index 00000000000..63fe69bf944
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/piechart.cpp
@@ -0,0 +1,202 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageutil/piechart.h>
+
+#include <iomanip>
+#include <sstream>
+#include <vespa/vespalib/util/exceptions.h>
+
+namespace storage {
+
+double PieChart::_minValue = 0.0000001;
+
+PieChart::Entry::Entry(double val, const std::string& name, int32_t col)
+ : _value(val), _name(name), _color(col)
+{
+}
+
+void
+PieChart::printHtmlHeadAdditions(std::ostream& out, const std::string& indent)
+{
+ (void) out;
+ (void) indent;
+ // FIXME this used to reference Yahoo-internal JS URIs.
+ // Deprecated functionality either way.
+}
+
+PieChart::PieChart(const std::string& name, ColorScheme cs)
+ : _name(name),
+ _values(),
+ _colors(cs),
+ _printLabels(true)
+{
+}
+
+void
+PieChart::add(double value, const std::string& name)
+{
+ if (value < _minValue) {
+ std::ostringstream ost;
+ ost << "Value of " << value << " is below the minimum supported value "
+ << "of the pie chart (" << _minValue << ")";
+ throw vespalib::IllegalArgumentException(ost.str(), VESPA_STRLOC);
+ }
+ if (_colors == SCHEME_CUSTOM) {
+ throw vespalib::IllegalArgumentException(
+ "Using custom color scheme you need to supply a color for each "
+ "value.", VESPA_STRLOC);
+ }
+ _values.push_back(Entry(value, name, UNDEFINED));
+}
+
+void
+PieChart::add(double value, const std::string& name, Color c)
+{
+ if (value < _minValue) {
+ std::ostringstream ost;
+ ost << "Value of " << value << " is below the minimum supported value "
+ << "of the pie chart (" << _minValue << ")";
+ throw vespalib::IllegalArgumentException(ost.str(), VESPA_STRLOC);
+ }
+ if (_colors != SCHEME_CUSTOM) {
+ throw vespalib::IllegalArgumentException(
+ "Not using custom color scheme you cannot supply a custom "
+ "color for a value.", VESPA_STRLOC);
+ }
+ _values.push_back(Entry(value, name, c));
+}
+
+void
+PieChart::add(double value, const std::string& name, int32_t color)
+{
+ if (value < _minValue) {
+ std::ostringstream ost;
+ ost << "Value of " << value << " is below the minimum supported value "
+ << "of the pie chart (" << _minValue << ")";
+ throw vespalib::IllegalArgumentException(ost.str(), VESPA_STRLOC);
+ }
+ if (_colors != SCHEME_CUSTOM) {
+ throw vespalib::IllegalArgumentException(
+ "Not using custom color scheme you cannot supply a custom "
+ "color for a value.", VESPA_STRLOC);
+ }
+ _values.push_back(Entry(value, name, (Color) color));
+}
+
+void
+PieChart::printCanvas(std::ostream& out, uint32_t width, uint32_t height) const
+{
+ out << "<div><canvas id=\"" << _name << "\" width=\"" << width
+ << "\" height=\"" << height << "\"/></div>";
+}
+
+namespace {
+ void printDatasetDefinition(std::ostream& o, const std::string& i,
+ const std::string& n, const std::vector<PieChart::Entry>& e)
+ {
+ o << i << " var " << n << "_dataset = {\n" << std::dec;
+ bool first = true;
+ for (std::vector<PieChart::Entry>::const_iterator it = e.begin();
+ it != e.end(); ++it)
+ {
+ if (!first) o << ",\n";
+ first = false;
+ o << i << " '" << it->_name << "': [[0," << it->_value
+ << "]]";
+ }
+ o << "\n" << i << " };";
+ }
+
+ void printCustomColorScheme(std::ostream& o, const std::string& i,
+ const std::string& n, const std::vector<PieChart::Entry>& e)
+ {
+ o << " var " << n << "_customScheme = new Hash({\n" << std::hex;
+ bool first = true;
+ for (std::vector<PieChart::Entry>::const_iterator it = e.begin();
+ it != e.end(); ++it)
+ {
+ if (!first) o << ",\n";
+ first = false;
+ o << i << " '" << it->_name << "': '#" << std::setw(6)
+ << std::setfill('0') << (it->_color & 0x00FFFFFF) << "'";
+ }
+ o << "\n" << i << " });" << std::dec;
+ }
+
+ void printOptions(std::ostream& o, const std::string& i,
+ const std::string& n, const std::vector<PieChart::Entry>& e,
+ PieChart::ColorScheme c, bool printLabels)
+ {
+ o << " var " << n << "_options = {\n"
+ << i << " padding: {\n"
+ << i << " left: 0,\n"
+ << i << " right: 0,\n"
+ << i << " top: 0,\n"
+ << i << " bottom: 0,\n"
+ << i << " },\n"
+ << i << " background: {\n"
+ << i << " color: '#ffffff'\n"
+ << i << " },\n"
+ << i << " pieRadius: '0.4',\n";
+ if (c == PieChart::SCHEME_CUSTOM) {
+ o << i << " \"colorScheme\": " << n << "_customScheme,\n";
+ } else {
+ o << i << " colorScheme: '";
+ switch (c) {
+ case PieChart::SCHEME_RED: o << "red"; break;
+ case PieChart::SCHEME_BLUE: o << "blue"; break;
+ case PieChart::SCHEME_CUSTOM: break;
+ }
+ o << "',\n";
+ }
+ o << i << " axis: {\n"
+ << i << " labelColor: '#000000',\n"
+ << i << " x: {\n";
+ if (!printLabels) {
+ o << i << " hide: true,\n";
+ }
+ o << i << " ticks: [\n";
+ bool first = true;
+ uint32_t tmp = 0;
+ for (std::vector<PieChart::Entry>::const_iterator it = e.begin();
+ it != e.end(); ++it)
+ {
+ if (!first) o << ",\n";
+ first = false;
+ o << i << " {v:" << tmp++ << ", label:'" << it->_name
+ << "'}";
+ }
+ o << "\n" << i << " ]\n";
+ o << i << " }\n"
+ << i << " }\n"
+ << i << " };";
+ }
+
+ void printPie(std::ostream& o, const std::string& i, const std::string& n)
+ {
+ o << " var " << n << "_pie = new Plotr.PieChart('" << n << "', "
+ << n << "_options);\n"
+ << i << " " << n << "_pie.addDataset(" << n << "_dataset);\n"
+ << i << " " << n << "_pie.render();";
+ }
+}
+
+void
+PieChart::printScript(std::ostream& out, const std::string& indent) const
+{
+ out << "<script type=\"text/javascript\">\n";
+ printDatasetDefinition(out, indent, _name, _values);
+ if (_colors == SCHEME_CUSTOM) {
+ out << "\n" << indent;
+ printCustomColorScheme(out, indent, _name, _values);
+ }
+ out << "\n" << indent;
+ printOptions(out, indent, _name, _values, _colors, _printLabels);
+ out << "\n" << indent;
+ printPie(out, indent, _name);
+ out << "\n" << indent << "</script>";
+}
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageutil/piechart.h b/storage/src/vespa/storage/storageutil/piechart.h
new file mode 100644
index 00000000000..3f0e61a0966
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/piechart.h
@@ -0,0 +1,65 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::PieChart
+ * \ingroup util
+ *
+ * \brief Helper library to print pie charts in HTML.
+ */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include <ostream>
+
+namespace storage {
+
+class PieChart {
+public:
+ static double _minValue;
+
+ enum ColorScheme {
+ SCHEME_CUSTOM,
+ SCHEME_RED,
+ SCHEME_BLUE
+ };
+ enum Color {
+ UNDEFINED = -1,
+ BLACK = 0x000000,
+ RED = 0xFF0000,
+ GREEN = 0x00FF00,
+ BLUE = 0x0000FF,
+ WHITE = 0xFFFFFF
+ };
+ struct Entry {
+ double _value;
+ std::string _name;
+ int32_t _color;
+
+ Entry(double val, const std::string& name, int32_t col);
+ };
+
+ static void printHtmlHeadAdditions(
+ std::ostream& out, const std::string& indent = "");
+
+private:
+ const std::string _name;
+ std::vector<Entry> _values;
+ ColorScheme _colors;
+ bool _printLabels;
+
+public:
+ PieChart(const std::string&, ColorScheme = SCHEME_BLUE);
+
+ void printLabels(bool doprint) { _printLabels = doprint; }
+
+ void add(double value, const std::string& name);
+ void add(double value, const std::string& name, Color c);
+ void add(double value, const std::string& name, int32_t color);
+
+ void printCanvas(std::ostream& out, uint32_t width, uint32_t height) const;
+ void printScript(std::ostream& out, const std::string& indent = "") const;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/storageutil/recordflatfile.cpp b/storage/src/vespa/storage/storageutil/recordflatfile.cpp
new file mode 100644
index 00000000000..2951ae702e9
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/recordflatfile.cpp
@@ -0,0 +1,155 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/storageutil/recordflatfile.h>
+
+#include <sys/stat.h>
+#include <sys/types.h>
+
+using document::IoException;
+using namespace std;
+using namespace storage;
+
+namespace {
+
+ string getLastError(FastOS_File&) {
+ // Potential memory leak if string's operator new throws bad_alloc
+ // or other exception.
+ char *ptr = FastOS_File::GetLastErrorString();
+ string error(ptr);
+ free(ptr);
+ return error;
+ }
+
+}
+
+ExceptionThrowingFile::
+ExceptionThrowingFile(const string& filename)
+ : _file(filename.c_str())
+{
+}
+
+void ExceptionThrowingFile::openReadOnly()
+throw (IoException)
+{
+ if (!_file.OpenReadOnly()) {
+ throw IoException(
+ "FastOS_File.OpenReadOnly reported: "+getLastError(_file),
+ VESPA_STRLOC);
+ }
+}
+
+void ExceptionThrowingFile::openWriteOnly()
+throw (IoException)
+{
+ if (!_file.OpenWriteOnly()) {
+ throw IoException(
+ "FastOS_File.OpenWriteOnly reported: "+getLastError(_file),
+ VESPA_STRLOC);
+ }
+}
+
+void ExceptionThrowingFile::openReadWrite()
+throw (IoException)
+{
+ if (!_file.OpenReadWrite()) {
+ throw IoException(
+ "FastOS_File.OpenReadWrite reported: "+getLastError(_file),
+ VESPA_STRLOC);
+ }
+}
+
+void ExceptionThrowingFile::read(void* buffer, unsigned int length)
+throw (IoException)
+{
+ // Can't do arithmetics on void*, so casting first.
+ char* cbuffer = static_cast<char*>(buffer);
+ unsigned int totalRead = 0;
+ while (totalRead < length) {
+ unsigned int readSize = length - totalRead;
+ ssize_t count = _file.Read(cbuffer + totalRead, readSize);
+ if (count == -1) {
+ throw IoException(
+ "FastOS_File.Read reported: "+getLastError(_file),
+ VESPA_STRLOC);
+ } else if (count == 0) {
+ throw IoException("FastOS_File.Read returned 0",
+ VESPA_STRLOC);
+ }
+ totalRead += count;
+ }
+}
+
+void ExceptionThrowingFile::
+write(const void* buffer, unsigned int length)
+throw (IoException)
+{
+ if (!_file.CheckedWrite(buffer, length)) {
+ throw IoException(
+ "Call to FastOS_File.CheckedWrite() failed: "+getLastError(_file),
+ VESPA_STRLOC);
+ }
+}
+
+void ExceptionThrowingFile::setPosition(int64_t position)
+throw (IoException)
+{
+ if (!_file.SetPosition(position)) {
+ throw IoException(
+ "Call to FastOS_File.SetPosition() failed: "+getLastError(_file),
+ VESPA_STRLOC);
+ }
+}
+
+int64_t ExceptionThrowingFile::getPosition()
+throw (IoException)
+{
+ int64_t position = _file.GetPosition();
+ if (position == -1) {
+ throw IoException(
+ "Call to FastOS_File.GetPosition() failed: "+getLastError(_file),
+ VESPA_STRLOC);
+ }
+ assert(position >= 0);
+ return position;
+}
+
+int64_t ExceptionThrowingFile::getSize()
+throw (IoException)
+{
+ int64_t size = _file.GetSize();
+ if (size == -1) {
+ throw IoException(
+ "Call to FastOS_File.GetSize() failed: "+getLastError(_file),
+ VESPA_STRLOC);
+ }
+ assert(size >= 0);
+ return size;
+}
+
+void ExceptionThrowingFile::setSize(int64_t size)
+throw (IoException)
+{
+ if (!_file.SetSize(size)) {
+ throw IoException(
+ "Call to FastOS_File.SetSize() failed: "+getLastError(_file),
+ VESPA_STRLOC);
+ }
+}
+
+void ExceptionThrowingFile::remove()
+throw (IoException)
+{
+ if (!_file.Delete()) {
+ throw IoException(
+ "Call to FastOS_File.Remove() failed: "+getLastError(_file),
+ VESPA_STRLOC);
+ }
+}
+
+bool ExceptionThrowingFile::exists()
+throw (IoException)
+{
+ struct stat fileinfo;
+ return (stat(_file.GetFileName(), &fileinfo) == 0);
+}
diff --git a/storage/src/vespa/storage/storageutil/recordflatfile.h b/storage/src/vespa/storage/storageutil/recordflatfile.h
new file mode 100644
index 00000000000..b5eba556f5c
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/recordflatfile.h
@@ -0,0 +1,340 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::RecordFlatFile
+ * @ingroup allocator
+ *
+ * @brief Templated class for keeping fixed sized records of primitives on disk.
+ *
+ * This file is used to keep a number of fixed sized records on disk,
+ * it provides an abstraction layer, such that one doesn't have to worry
+ * about the disk access.
+ *
+ * It implements the disk access using FastOS_File and opens the file
+ * in combined Read/Write mode if writing is necessary.
+ *
+ * New entries are appended without checking if they previous exist.
+ * Updating entries change them in place. Deleting entries, moves the
+ * last entry in the file into the position of the entry that is being
+ * deleted, and file is truncated to fit.
+ *
+ * The class is defined to be a template, to prevent the need for the
+ * extra resources consumed by using inheritance.
+ *
+ * A record implementation should look something like this:
+ *
+ * class Record {
+ * public:
+ * Record(const Record&);
+ *
+ * Id getId() const;
+ * Record& operator=(const Record&);
+ * bool isValid();
+ * };
+ *
+ * class Id {
+ * public:
+ * operator==(const Id&) const;
+ * };
+ *
+ * ostream& operator<<(ostream& out, const Id&);
+ *
+ * NB: As records are written directly from memory to disk, and are
+ * reconstructed merely by copying disk content back into memory, they
+ * cannot include pointers or references as these types of variables would
+ * not be correctly saved and restored. It is thus safes to only use
+ * primitives.
+ *
+ * Note that this interface is not threadsafe. The class keeps a memory
+ * area for buffering, that is used during both read and write operations.
+ * Thus only one operation can be performed at a time.
+ *
+ * @author H�kon Humberset
+ * @date 2005-04-28
+ * @version $Id$
+ */
+
+#pragma once
+
+#include <cassert>
+#include <vespa/vespalib/util/exceptions.h>
+#include <vespa/fastos/fastos.h>
+#include <list>
+#include <memory>
+#include <stdexcept>
+#include <sstream>
+#include <string>
+
+namespace storage {
+
+/**
+ * Helper class to get a FastOS file that throws exceptions.
+ */
+class ExceptionThrowingFile {
+private:
+ FastOS_File _file;
+
+public:
+ ExceptionThrowingFile(const std::string& filename);
+
+ void openReadOnly() throw (vespalib::IoException);
+ void openWriteOnly() throw (vespalib::IoException);
+ void openReadWrite() throw (vespalib::IoException);
+ void read(void* _buffer, unsigned int length) throw (vespalib::IoException);
+ void write(const void* _buffer, unsigned int length)
+ throw (vespalib::IoException);
+ void setPosition(int64_t position) throw (vespalib::IoException);
+ int64_t getPosition() throw (vespalib::IoException);
+ void setSize(int64_t size) throw (vespalib::IoException);
+ int64_t getSize() throw (vespalib::IoException);
+ void remove() throw (vespalib::IoException);
+ bool exists() throw (vespalib::IoException);
+};
+
+template<class Record, class Id>
+class RecordFlatFile {
+private:
+ RecordFlatFile(const RecordFlatFile &);
+ RecordFlatFile& operator=(const RecordFlatFile &);
+
+ Record *_record; // Cache of a chunk of records
+ const std::string _path;
+ const unsigned int _chunkSize; // In kilobytes
+ const unsigned int _maxChunkRecordCount;
+ mutable std::list<std::string> _nonFatalErrors;
+
+public:
+ RecordFlatFile(const std::string& path, unsigned int chunksize = 4096)
+ throw (vespalib::IllegalArgumentException, vespalib::FatalException,
+ vespalib::IoException);
+ ~RecordFlatFile();
+
+ bool exists(const Id& id) const throw (vespalib::IoException);
+ std::unique_ptr<Record> getRecord(const Id& id) const
+ throw (vespalib::IoException);
+
+ bool update(const Record& record) throw (vespalib::IoException);
+ void add(const Record& record) throw (vespalib::IoException);
+ bool remove(const Id& id) throw (vespalib::IoException);
+ void clear() throw (vespalib::IoException);
+
+ // Functions to get entries in the flatfile by index. Used by tests to
+ // ensure correct operation.
+ unsigned int getSize() const throw (vespalib::IoException);
+ std::unique_ptr<Record> operator[](unsigned int index) const
+ throw (document::IllegalArgumentException, vespalib::IoException);
+
+ bool errorsFound() const { return (_nonFatalErrors.size() > 0); }
+ const std::list<std::string>& getErrors() const { return _nonFatalErrors; }
+ void clearErrors() { _nonFatalErrors.clear(); }
+};
+
+template<class Record, class Id> RecordFlatFile<Record, Id>::
+RecordFlatFile(const std::string& path,unsigned int chunksize)
+throw (document::IllegalArgumentException, document::FatalException,
+ vespalib::IoException)
+ : _record(0),
+ _path(path),
+ _chunkSize(chunksize * sizeof(Record)),
+ _maxChunkRecordCount(chunksize),
+ _nonFatalErrors()
+{
+ if (_maxChunkRecordCount == 0) {
+ throw document::IllegalArgumentException(
+ "RecordFlatFile("+_path+"): Chunksize given doesn't allow for any "
+ "records. Increase chunksize to at least sizeof(Record)", VESPA_STRLOC);
+ }
+ _record = new Record[chunksize];
+ if (!_record) {
+ throw document::FatalException(
+ "RecordFlatFile("+_path+"): Failed to allocate buffer", VESPA_STRLOC);
+ }
+ // Make sure file exists
+ ExceptionThrowingFile file(_path);
+ file.openReadWrite();
+}
+
+template<class Record, class Id>
+RecordFlatFile<Record, Id>::~RecordFlatFile()
+{
+ delete[] _record;
+}
+
+template<class Record, class Id>
+bool RecordFlatFile<Record, Id>::exists(const Id& id) const
+throw (vespalib::IoException)
+{
+ return (getRecord(id).get() != (Record*) 0);
+}
+
+template<class Record, class Id> std::unique_ptr<Record>
+RecordFlatFile<Record, Id>::getRecord(const Id& id) const
+throw (vespalib::IoException)
+{
+ ExceptionThrowingFile file(_path);
+ if (file.exists()) {
+ file.openReadOnly();
+ unsigned int recordCount = file.getSize() / sizeof(Record);
+ unsigned int currentRecord = 0;
+ while (currentRecord < recordCount) {
+ unsigned int chunkRecordCount =
+ std::min(_maxChunkRecordCount, recordCount - currentRecord);
+ file.read(_record, chunkRecordCount * sizeof(Record));
+ for (unsigned int i=0; i<chunkRecordCount; ++i) {
+ if (_record[i].getId() == id) {
+ if (!_record[i].isValid()) {
+ std::ostringstream ost;
+ ost << "Entry requested '" << id << "' is corrupted "
+ << "in file " << _path;
+ throw vespalib::IoException(ost.str(), VESPA_STRLOC);
+ }
+ return std::unique_ptr<Record>(new Record(_record[i]));
+ }
+ if (!_record[i].isValid()) {
+ _nonFatalErrors.push_back(
+ "Found corrupted entry in file "+_path);
+ }
+ }
+ currentRecord += chunkRecordCount;
+ }
+ }
+ return std::unique_ptr<Record>(0);
+}
+
+template<class Record, class Id>
+bool RecordFlatFile<Record, Id>::update(const Record& record)
+throw (vespalib::IoException)
+{
+ if (!record.isValid()) {
+ std::ostringstream ost;
+ ost << "Updating " << _path << " using invalid record '"
+ << record.getId() << "'.";
+ _nonFatalErrors.push_back(ost.str());
+ }
+ ExceptionThrowingFile file(_path);
+ file.openReadWrite();
+ unsigned int recordCount = file.getSize() / sizeof(Record);
+ unsigned int currentRecord = 0;
+ while (currentRecord < recordCount) {
+ unsigned int chunkRecordCount =
+ std::min(_maxChunkRecordCount, recordCount - currentRecord);
+ file.read(_record, chunkRecordCount * sizeof(Record));
+ for (unsigned int i=0; i<chunkRecordCount; ++i) {
+ if (_record[i].getId() == record.getId()) {
+ _record[i] = record;
+ file.setPosition(file.getPosition()
+ - (chunkRecordCount - i) * sizeof(Record));
+ file.write(&_record[i], sizeof(Record));
+ return true;
+ }
+ }
+ currentRecord += chunkRecordCount;
+ }
+ return false;
+}
+
+template<class Record, class Id>
+void RecordFlatFile<Record, Id>::add(const Record& record)
+throw (vespalib::IoException)
+{
+ if (!record.isValid()) {
+ std::ostringstream ost;
+ ost << "Adding invalid record '"
+ << record.getId() << "' to file " << _path << ".";
+ _nonFatalErrors.push_back(ost.str());
+ }
+ ExceptionThrowingFile file(_path);
+ file.openWriteOnly();
+ file.setPosition(file.getSize());
+ file.write(&record, sizeof(Record));
+}
+
+template<class Record, class Id>
+bool RecordFlatFile<Record, Id>::remove(const Id& id)
+throw (vespalib::IoException)
+{
+ ExceptionThrowingFile file(_path);
+ file.openReadWrite();
+ int64_t fileSize = file.getSize();
+ if (fileSize == 0) return false;
+ Record last;
+ { // Read the last entry
+ file.setPosition(file.getSize() - sizeof(Record));
+ file.read(&last, sizeof(Record));
+ if (!last.isValid()) {
+ _nonFatalErrors.push_back(
+ "Last entry in file "+_path+" is invalid");
+ }
+ if (last.getId() == id) {
+ file.setSize(file.getSize() - sizeof(Record));
+ return true;
+ }
+ file.setPosition(0);
+ }
+
+ unsigned int recordCount = file.getSize() / sizeof(Record);
+ unsigned int currentRecord = 0;
+ while (currentRecord < recordCount) {
+ unsigned int chunkRecordCount =
+ std::min(_maxChunkRecordCount, recordCount - currentRecord);
+ file.read(_record, chunkRecordCount * sizeof(Record));
+ for (unsigned int i=0; i<chunkRecordCount; ++i) {
+ if (_record[i].getId() == id) {
+ _record[i] = last;
+ file.setPosition(file.getPosition()
+ - (chunkRecordCount - i) * sizeof(Record));
+ file.write(&_record[i], sizeof(Record));
+ file.setSize(file.getSize() - sizeof(Record));
+ return true;
+ }
+ if (!_record[i].isValid()) {
+ _nonFatalErrors.push_back(
+ "Found corrupted entry in file "+_path);
+ }
+ }
+ currentRecord += chunkRecordCount;
+ }
+ return false;
+}
+
+template<class Record, class Id>
+void RecordFlatFile<Record, Id>::clear()
+throw (vespalib::IoException)
+{
+ ExceptionThrowingFile file(_path);
+ file.remove();
+}
+
+template<class Record, class Id>
+unsigned int RecordFlatFile<Record, Id>::getSize() const
+throw (vespalib::IoException)
+{
+ ExceptionThrowingFile file(_path);
+ file.openReadOnly();
+ int64_t fileSize = file.getSize();
+ if (fileSize % sizeof(Record) != 0) {
+ _nonFatalErrors.push_back(
+ "Filesize is not a whole number of records. "
+ "File "+_path+" corrupted or wrong size gotten.");
+ }
+ return static_cast<unsigned int>(fileSize / sizeof(Record));
+}
+
+template<class Record, class Id>
+std::unique_ptr<Record> RecordFlatFile<Record, Id>::
+operator[](unsigned int index) const
+throw (document::IllegalArgumentException, vespalib::IoException)
+{
+ ExceptionThrowingFile file(_path);
+ file.openReadOnly();
+ unsigned int recordCount = file.getSize() / sizeof(Record);
+ if (index >= recordCount) {
+ throw document::IllegalArgumentException(
+ "RecordFlatFile.operator[]: Access outside of bounds", VESPA_STRLOC);
+ }
+ file.setPosition(index * sizeof(Record));
+ file.read(_record, sizeof(Record));
+ return std::unique_ptr<Record>(new Record(_record[0]));
+}
+
+}
+
diff --git a/storage/src/vespa/storage/storageutil/resumeguard.h b/storage/src/vespa/storage/storageutil/resumeguard.h
new file mode 100644
index 00000000000..9b60fa78c6d
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/resumeguard.h
@@ -0,0 +1,38 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+namespace storage {
+
+class ResumeGuard {
+public:
+ class Callback {
+ public:
+ virtual ~Callback() {};
+
+ virtual void resume() = 0;
+ };
+
+ ResumeGuard()
+ : _cb(NULL)
+ {}
+
+ ResumeGuard(Callback& cb)
+ : _cb(&cb) {};
+
+ ResumeGuard(const ResumeGuard& other) {
+ _cb = other._cb;
+ const_cast<ResumeGuard&>(other)._cb = NULL;
+ }
+
+ ~ResumeGuard() {
+ if (_cb) {
+ _cb->resume();
+ }
+ }
+
+private:
+ Callback* _cb;
+};
+
+}
+
diff --git a/storage/src/vespa/storage/storageutil/utils.h b/storage/src/vespa/storage/storageutil/utils.h
new file mode 100644
index 00000000000..357063ff29b
--- /dev/null
+++ b/storage/src/vespa/storage/storageutil/utils.h
@@ -0,0 +1,90 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vector>
+
+namespace storage {
+
+/**
+ * Creates a vector of the given type with one entry in it.
+ */
+template<class A>
+std::vector<A> toVector(A entry) {
+ std::vector<A> entries;
+ entries.push_back(entry);
+ return entries;
+};
+
+/**
+ * Creates a vector of the given type with two entries in it.
+ */
+template<class A>
+std::vector<A> toVector(A entry, A entry2) {
+ std::vector<A> entries;
+ entries.push_back(entry);
+ entries.push_back(entry2);
+ return entries;
+};
+
+/**
+ * Creates a vector of the given type with two entries in it.
+ */
+template<class A>
+std::vector<A> toVector(A entry, A entry2, A entry3) {
+ std::vector<A> entries;
+ entries.push_back(entry);
+ entries.push_back(entry2);
+ entries.push_back(entry3);
+ return entries;
+};
+
+/**
+ * Creates a vector of the given type with two entries in it.
+ */
+template<class A>
+std::vector<A> toVector(A entry, A entry2, A entry3, A entry4) {
+ std::vector<A> entries;
+ entries.push_back(entry);
+ entries.push_back(entry2);
+ entries.push_back(entry3);
+ entries.push_back(entry4);
+ return entries;
+};
+
+template<class A>
+std::string dumpVector(const std::vector<A>& vec) {
+ std::ostringstream ost;
+ for (uint32_t i = 0; i < vec.size(); ++i) {
+ if (!ost.str().empty()) {
+ ost << ",";
+ }
+ ost << vec[i];
+ }
+
+ return ost.str();
+}
+
+template<class A>
+bool hasItem(const std::vector<A>& vec, A entry) {
+ for (uint32_t i = 0; i < vec.size(); ++i) {
+ if (vec[i] == entry) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+template<typename T>
+struct ConfigReader : public T::Subscriber, public T
+{
+ T& config; // Alter to inherit T to simplify but kept this for compatability
+
+ ConfigReader(const std::string& configId) : config(*this) {
+ T::subscribe(configId, *this);
+ }
+ void configure(const T& c) { config = c; }
+};
+
+}
+
diff --git a/storage/src/vespa/storage/subscriptions/.gitignore b/storage/src/vespa/storage/subscriptions/.gitignore
new file mode 100644
index 00000000000..04a221b8052
--- /dev/null
+++ b/storage/src/vespa/storage/subscriptions/.gitignore
@@ -0,0 +1,8 @@
+*.So
+*.lo
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
+features.h
diff --git a/storage/src/vespa/storage/tools/.gitignore b/storage/src/vespa/storage/tools/.gitignore
new file mode 100644
index 00000000000..6f27b7c441d
--- /dev/null
+++ b/storage/src/vespa/storage/tools/.gitignore
@@ -0,0 +1,30 @@
+*.So
+*core
+.*.swp
+.depend
+Makefile
+analyzedistribution
+distbitreport.html
+distributionsim
+dumpslotfile
+generatedistributionbits
+generatemailbuckets
+getidealstate
+idealstate
+pingstorage
+populatenode
+slotfilefeeder
+statfs
+stoccart
+storage-cmd
+throttlingsim
+vdsclient
+vdsdisktool
+vdsidealstate
+vdsstat
+vesparemovelocation
+storage_analyzedistribution_app
+storage_generatedistributionbits_app
+storage_getidealstate_app
+storage_statfs_app
+storage_throttlingsim_app
diff --git a/storage/src/vespa/storage/tools/CMakeLists.txt b/storage/src/vespa/storage/tools/CMakeLists.txt
new file mode 100644
index 00000000000..3f5febcd28a
--- /dev/null
+++ b/storage/src/vespa/storage/tools/CMakeLists.txt
@@ -0,0 +1,53 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(storage_getidealstate_app
+ SOURCES
+ getidealstate.cpp
+ INSTALL bin
+ DEPENDS
+ storage
+ AFTER
+ storage_storageconfig
+)
+vespa_add_executable(storage_generatedistributionbits_app
+ SOURCES
+ generatedistributionbits.cpp
+ INSTALL bin
+ DEPENDS
+ storage
+ AFTER
+ storage_storageconfig
+)
+vespa_add_executable(storage_analyzedistribution_app
+ SOURCES
+ analyzedistribution.cpp
+ INSTALL bin
+ DEPENDS
+ storage
+ AFTER
+ storage_storageconfig
+)
+vespa_add_executable(storage_storage-cmd_app
+ SOURCES
+ storage-cmd.cpp
+ OUTPUT_NAME storage-cmd
+ INSTALL bin
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
+vespa_add_executable(storage_throttlingsim_app
+ SOURCES
+ throttlingsim.cpp
+ INSTALL bin
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
+vespa_add_executable(storage_statfs_app
+ SOURCES
+ statfs.cpp
+ INSTALL bin
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
diff --git a/storage/src/vespa/storage/tools/analyzedistribution.cpp b/storage/src/vespa/storage/tools/analyzedistribution.cpp
new file mode 100644
index 00000000000..86844e245f0
--- /dev/null
+++ b/storage/src/vespa/storage/tools/analyzedistribution.cpp
@@ -0,0 +1,523 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/config/helper/configfetcher.h>
+#include <vespa/document/bucket/bucketidfactory.h>
+#include <vespa/vespalib/util/programoptions.h>
+#include <vespa/vespalib/util/guard.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/vdslib/state/nodestate.h>
+#include <vespa/storage/bucketdb/judyarray.h>
+
+namespace storage {
+
+struct Options : public vespalib::ProgramOptions {
+ bool verbose;
+ bool showSyntaxPage;
+ std::string systemState;
+ int numDisks;
+ int diskDistribution;
+ double redundancy;
+ std::string testdir;
+
+ Options(int argc, const char* const* argv)
+ : vespalib::ProgramOptions(argc, argv),
+ showSyntaxPage(false),
+ systemState(""),
+ numDisks(0),
+ diskDistribution(1),
+ redundancy(2.0)
+ {
+ setSyntaxMessage(
+ "Analyzes distribution from a real cluster. "
+ "This tool reads gzipped files containing directory "
+ "listings from a live system and analyze how current "
+ "distribution and ideal distribution is in that cluster."
+ "The tool is typically run from the perl check_cluster script "
+ "to create raw data for further analysis of cluster "
+ "distribution."
+ );
+ addOption("h help", showSyntaxPage, false,
+ "Shows this help page");
+ addOption("v verbose", verbose, false,
+ "Show verbose progress");
+ addOption("c clusterstate", systemState,
+ "Cluster state to use for ideal state calculations");
+ addOption("n numdisks", numDisks,
+ "The number of disks on each node");
+ addOption("r redundancy", redundancy, 2.0,
+ "The redundancy used");
+ addOption("d distribution", diskDistribution, 1,
+ "The disk distribution to use (0 = MODULO, 1 = "
+ "MODULO_INDEX, 2 = MODULO_KNUTH, 3 = MODULO_BID");
+ addArgument("Test directory", testdir, std::string("."),
+ "The directory within to find gzipped file listings named "
+ "storage.*.shell.filelist.gz");
+ }
+};
+
+struct Disk {
+ struct Count {
+ uint32_t bucketCount;
+ uint64_t totalByteSize;
+
+ Count() : bucketCount(0), totalByteSize(0) {}
+ void add(uint32_t size) { ++bucketCount; totalByteSize += size; }
+ std::string toString() const {
+ std::ostringstream ost;
+ ost << bucketCount << '/' << totalByteSize;
+ return ost.str();
+ }
+ };
+ lib::DiskState state;
+ Count current;
+ Count wrongDisk;
+ Count wrongNode;
+ Count ideal;
+
+ Disk(const lib::DiskState& state_)
+ : state(state_) {}
+
+ void addBucket(uint32_t size, bool currentDistr,
+ bool correctDisk, bool correctNode)
+ {
+ if (currentDistr) {
+ current.add(size);
+ if (!correctNode) {
+ wrongNode.add(size);
+ } else if (!correctDisk) {
+ wrongDisk.add(size);
+ }
+ } else {
+ ideal.add(size);
+ }
+ }
+
+ void print(std::ostream& out, uint32_t nodeIndex, uint32_t diskIndex) {
+ if (state.getState() == lib::State::UP) {
+ out << "N " << nodeIndex << " D " << diskIndex << ": "
+ << current.toString() << ' ' << ideal.toString() << ' '
+ << wrongNode.toString() << ' ' << wrongDisk.toString() << "\n";
+ }
+ }
+};
+
+struct Node {
+ lib::NodeState distributorState;
+ lib::NodeState storageState;
+ std::vector<Disk> disks;
+ Disk::Count distributor;
+
+ Node(const lib::NodeState& dstate, const lib::NodeState& sstate,
+ uint32_t diskCount)
+ : distributorState(dstate),
+ storageState(sstate),
+ disks()
+ {
+ for (uint32_t i=0; i<diskCount; ++i) {
+ disks.push_back(Disk(storageState.getDiskState(i)));
+ }
+ }
+
+ void print(std::ostream& out, uint32_t nodeIndex) {
+ if (distributorState.getState().oneOf("ui")) {
+ out << "N " << nodeIndex << ": " << distributor.toString() << "\n";
+ }
+ if (storageState.getState().oneOf("uir")) {
+ for (uint32_t i=0; i<disks.size(); ++i) {
+ disks[i].print(out, nodeIndex, i);
+ }
+ }
+ }
+};
+
+struct Distribution {
+ std::vector<Node> nodes;
+ enum Type { INDEX, BID, TEST };
+ Type type;
+ document::BucketIdFactory factory;
+ lib::NodeState nodeState;
+ uint32_t diskCount;
+ lib::ClusterState state;
+ std::unique_ptr<lib::Distribution> distribution;
+
+ static vespa::config::content::StorDistributionConfig::DiskDistribution getDistr(Type t) {
+ switch (t) {
+ case INDEX: return vespa::config::content::StorDistributionConfig::MODULO_INDEX;
+ case BID: return vespa::config::content::StorDistributionConfig::MODULO_BID;
+ case TEST: return vespa::config::content::StorDistributionConfig::MODULO_BID;
+ }
+ // Compiler refuse to detect that the above is all possibilities
+ assert(false);
+ return vespa::config::content::StorDistributionConfig::MODULO_BID;
+ }
+
+ static uint8_t getDistributionBits(const lib::ClusterState& state, Type t)
+ {
+ switch (t) {
+ case INDEX:
+ case BID: return 16;
+ case TEST:
+ {
+ uint32_t nodeCount(
+ state.getNodeCount(lib::NodeType::STORAGE));
+ uint32_t minBuckets = 65536 * nodeCount;
+ uint32_t distributionBits = 16;
+ uint32_t buckets = 65536;
+ while (buckets < minBuckets) {
+ ++distributionBits;
+ buckets *= 2;
+ }
+ return distributionBits;
+ }
+ }
+ // Compiler refuse to detect that the above is all possibilities
+ assert(false);
+ return vespa::config::content::StorDistributionConfig::MODULO_BID;
+ }
+
+ Distribution(const lib::ClusterState& state_, uint32_t diskCount_, Type t)
+ : nodes(),
+ type(t),
+ factory(), // getDistributionBits(state, t), 26, getDistr(t)),
+ nodeState(),
+ diskCount(diskCount_),
+ state(state_),
+ distribution(new lib::Distribution(*config::ConfigGetter<vespa::config::content::StorDistributionConfig>::getConfig("storage/cluster.storage")))
+ {
+ for (uint32_t i=0, n=state.getNodeCount(lib::NodeType::STORAGE);
+ i < n; ++i)
+ {
+ nodes.push_back(Node(state.getNodeState(
+ lib::Node(lib::NodeType::DISTRIBUTOR, i)),
+ state.getNodeState(
+ lib::Node(lib::NodeType::STORAGE, i)), diskCount));
+ }
+ nodeState.setDiskCount(diskCount);
+ }
+
+ std::vector<uint16_t> getIdealStorageNodes(const document::BucketId& bucket,
+ double reliability) const
+ {
+ (void) reliability;
+ std::vector<uint16_t> nodes_;
+ switch (type) {
+ case INDEX:
+ case BID:
+ case TEST:
+ nodes_ = distribution->getIdealStorageNodes(state, bucket);
+ }
+ return nodes_;
+ }
+
+ uint16_t getIdealDistributorNode(const document::BucketId& bucket) const {
+ std::vector<uint16_t> nodes_;
+ switch (type) {
+ case INDEX:
+ case BID:
+ case TEST:
+ return distribution->getIdealDistributorNode(state, bucket);
+ }
+ // Compiler refuse to detect that the above is all possibilities
+ assert(false);
+ return 0;
+ }
+
+ uint16_t getDisk(const document::BucketId& bucket, uint16_t nodeIndex) const
+ {
+ uint16_t disk = 65535;
+ switch (type) {
+ case INDEX:
+ case BID:
+ case TEST:
+ disk = distribution->getIdealDisk(
+ nodeState, nodeIndex, bucket,
+ lib::Distribution::IDEAL_DISK_EVEN_IF_DOWN);
+ break;
+ default:
+ assert(false);
+ }
+ return disk;
+ }
+
+ void print(std::ostream& out) {
+ switch (type) {
+ case INDEX: out << "Modulo index distribution\n"; break;
+ case BID: out << "Modulo BID distribution\n"; break;
+ case TEST: out << "Test distribution\n"; break;
+ }
+ for (uint32_t i=0; i<nodes.size(); ++i) {
+ nodes[i].print(out, i);
+ }
+ }
+};
+
+struct BucketDatabase {
+ JudyArray _judyArray;
+
+ BucketDatabase() {}
+
+ bool add(const document::BucketId& id) {
+ bool preExisted;
+ JudyArray::iterator it = _judyArray.find(id.getId(), true, preExisted);
+ if (it.value() == 0) {
+ it.setValue(1);
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ uint64_t size() const {
+ return _judyArray.size();
+ }
+};
+
+std::vector<std::string> getFileNames(const std::string& testdir) {
+ std::vector<std::string> files;
+ vespalib::DirPointer dir(opendir(testdir.c_str()));
+ struct dirent* entry;
+ assert(dir);
+ while ((entry = readdir(dir))) {
+ assert(entry != 0);
+ std::string name(reinterpret_cast<char*>(&entry->d_name));
+ assert(name.size() > 0);
+ if (name.size() < 27) {
+ // std::cerr << "Ignoring file with too short name: " << name
+ // << "\n";
+ continue;
+ }
+ if (name.substr(0, 8) != "storage.") {
+ // std::cerr << "Ignoring non-storage file: " << name << "\n";
+ continue;
+ }
+ std::string::size_type pos = name.find('.', 8);
+ if (pos == std::string::npos) {
+ // std::cerr << "Ignoring file without two dots: " << name << "\n";
+ continue;
+ }
+ if (name.substr(pos) != ".shell.filelist.gz") {
+ // std::cerr << "Ignoring file not filelisting: " << name << "\n";
+ continue;
+ }
+ files.push_back(name);
+ }
+ return files;
+}
+
+struct Analyzer {
+ const Options& o;
+ const lib::ClusterState& state;
+ BucketDatabase bucketdb;
+ std::vector<std::shared_ptr<Distribution> > distributions;
+
+ Analyzer(const lib::ClusterState& state_, const Options& o_)
+ : o(o_),
+ state(state_),
+ bucketdb(),
+ distributions()
+ {
+ distributions.push_back(std::shared_ptr<Distribution>(
+ new Distribution(state, o.numDisks, Distribution::INDEX)));
+ distributions.push_back(std::shared_ptr<Distribution>(
+ new Distribution(state, o.numDisks, Distribution::BID)));
+ distributions.push_back(std::shared_ptr<Distribution>(
+ new Distribution(state, o.numDisks, Distribution::TEST)));
+ }
+
+ void recordBucket(const document::BucketId& bucket, uint32_t size,
+ uint16_t nodeIndex, uint16_t diskIndex)
+ {
+ bool newBucket = bucketdb.add(bucket);
+ //std::cout << "Recording file " << nodeIndex << " " << diskIndex
+ // << ": " << size << ' ' << bucket << "\n";
+ for (uint32_t i=0; i<distributions.size(); ++i) {
+ std::vector<uint16_t> ideal(distributions[i]->getIdealStorageNodes(
+ bucket, o.redundancy));
+ bool correctNode = false;
+ for (uint32_t j=0; j<ideal.size(); ++j) {
+ if (ideal[j] == nodeIndex) correctNode = true;
+ }
+ uint16_t idealDisk = distributions[i]->getDisk(bucket, nodeIndex);
+ distributions[i]->nodes[nodeIndex].disks[diskIndex].addBucket(
+ size, true, diskIndex == idealDisk, correctNode);
+ if (newBucket) {
+ for (uint32_t j=0; j<ideal.size(); ++j) {
+ idealDisk = distributions[i]->getDisk(bucket, ideal[j]);
+ distributions[i]->nodes[ideal[j]].disks[idealDisk]
+ .addBucket(size, false, true, true);
+ }
+ uint16_t distributor(
+ distributions[i]->getIdealDistributorNode(bucket));
+ distributions[i]->nodes[distributor].distributor.add(size);
+ }
+ }
+ }
+ void recordDirectory(const std::string& name, uint32_t size,
+ uint16_t nodeIndex, uint16_t diskIndex)
+ {
+ (void) name; (void) size; (void) nodeIndex; (void) diskIndex;
+ //std::cerr << "Recording dir " << nodeIndex << " " << diskIndex << ": "
+ // << size << ' ' << name << "\n";
+ }
+ void report() {
+ std::cout << "Found " << bucketdb.size() << " buckets\n";
+ for (uint32_t i=0; i<distributions.size(); ++i) {
+ distributions[i]->print(std::cout);
+ }
+ }
+};
+
+void analyze(const Options& o) {
+ lib::ClusterState state(o.systemState);
+
+ if (o.verbose) {
+ std::cerr << "Using test directory " << o.testdir << "\n";
+ }
+
+ Analyzer analyzer(state, o);
+ std::vector<std::string> filenames(getFileNames(o.testdir));
+
+ std::vector<char> buffer(256);
+ std::string path;
+ uint32_t nodeIndex = 0x10000;
+ uint32_t diskIndex = 0x10000;
+ double shownProgress = 0.0001;
+ for (uint32_t j=0; j<filenames.size(); ++j) {
+ std::string cmd("zcat " + o.testdir + "/" + filenames[j]);
+ if (o.verbose) {
+ std::cerr << "Running '" << cmd << "'.\n";
+ } else {
+ double currentProgress = 79.0 * j / filenames.size();
+ while (currentProgress > shownProgress) {
+ std::cerr << ".";
+ shownProgress += 1;
+ }
+ }
+ FILE* file = popen(cmd.c_str(), "r");
+ assert(file);
+ while (fgets(&buffer[0], buffer.size(), file)) {
+ //std::cout << "Read line: " << &buffer[0];
+ if (buffer[0] == '/') {
+ nodeIndex = 0x10000;
+ diskIndex = 0x10000;
+ uint32_t slashcount = 0;
+ uint32_t lastslash = 0;
+ for (uint32_t i=1; i<buffer.size(); ++i) {
+ if (buffer[i] == ':') {
+ path = std::string(&buffer[0], i);
+ break;
+ } else if (buffer[i] == '\n' || buffer[i] == '\0') {
+ assert(0);
+ } else if (buffer[i] == '/') {
+ if (slashcount == 8) {
+ std::string indexs(&buffer[lastslash] + 1,
+ i - lastslash - 1);
+ char* endp;
+ nodeIndex = strtoul(indexs.c_str(), &endp, 10);
+ if (*endp != '\0') {
+ std::cerr << "'" << indexs
+ << "' is not a number.\n";
+ }
+ assert(*endp == '\0');
+ } else if (slashcount == 10) {
+ assert(buffer[lastslash + 1] == 'd');
+ std::string indexs(&buffer[lastslash] + 2,
+ i - lastslash - 2);
+ char* endp;
+ diskIndex = strtoul(indexs.c_str(), &endp, 10);
+ if (*endp != '\0') {
+ std::cerr << "'" << indexs
+ << "' is not a number.\n";
+ }
+ assert(*endp == '\0');
+ }
+ lastslash = i;
+ ++slashcount;
+ }
+ }
+ } else {
+ uint32_t firstDigit, space, dot;
+ firstDigit = space = dot = buffer.size();
+ bool isDirectory = false;
+ for (uint32_t i=0; i<buffer.size(); ++i) {
+ if (firstDigit == buffer.size()) {
+ if (buffer[i] >= '0' && buffer[i] <= '9') {
+ firstDigit = i;
+ } else if (buffer[i] == ' ' || buffer[i] == '\t') {
+ continue;
+ } else {
+ break;
+ }
+ } else if (space == buffer.size()) {
+ if (buffer[i] >= '0' && buffer[i] <= '9') {
+ continue;
+ } else if (buffer[i] == ' ') {
+ space = i;
+ } else {
+ break;
+ }
+ } else if (dot == buffer.size()) {
+ if ( (buffer[i] >= '0' && buffer[i] <= '9')
+ || (buffer[i] >= 'a' && buffer[i] <= 'f')
+ || (buffer[i] >= 'A' && buffer[i] <= 'F'))
+ {
+ continue;
+ } else if (buffer[i] == '.') {
+ dot = i;
+ } else if (buffer[i] == '\n' || buffer[i] == '\0') {
+ isDirectory = true;
+ dot = i;
+ }
+ break;
+ }
+ }
+ if (dot != buffer.size()) {
+ std::string sizes(&buffer[firstDigit], space - firstDigit);
+ char* endp;
+ uint32_t size = strtoul(sizes.c_str(), &endp, 10);
+ assert(*endp == '\0');
+ std::string bucket(&buffer[space + 1], dot - space - 1);
+ if (isDirectory) {
+ analyzer.recordDirectory(path + '/' + bucket, size,
+ nodeIndex, diskIndex);
+ } else {
+ uint64_t bid = strtoull(bucket.c_str(), &endp, 16);
+ assert(*endp == '\0');
+ document::BucketId bucketid(bid);
+ analyzer.recordBucket(bucketid, size,
+ nodeIndex, diskIndex);
+ }
+ } else {
+ // std::cout << "Did not find bucket from line: "
+ // << &buffer[0] << "\n";
+ // std::cout << " " << firstDigit << " " << space << " "
+ // << dot << "\n";
+ }
+ }
+ }
+ assert(ferror(file) == 0);
+ assert(feof(file));
+ assert(pclose(file) == 0);
+ }
+ if (!o.verbose) {
+ std::cerr << "\n";
+ }
+ analyzer.report();
+}
+
+} // storage
+
+int main(int argc, char** argv) {
+ storage::Options o(argc, argv);
+ o.parse();
+
+ if (o.showSyntaxPage) {
+ o.writeSyntaxPage(std::cerr);
+ return 1;
+ }
+ analyze(o);
+ return 0;
+}
+
diff --git a/storage/src/vespa/storage/tools/generate_distribution_doc.sh b/storage/src/vespa/storage/tools/generate_distribution_doc.sh
new file mode 100755
index 00000000000..1b6ad50c00f
--- /dev/null
+++ b/storage/src/vespa/storage/tools/generate_distribution_doc.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+./generatedistributionbits -s -r 1 -b 32 --html > distbitreport.html
+./generatedistributionbits -s -r 2 -b 32 --html >> distbitreport.html
+./generatedistributionbits -s -r 2 -b 32 --highrange --html >> distbitreport.html
diff --git a/storage/src/vespa/storage/tools/generatedistributionbits.cpp b/storage/src/vespa/storage/tools/generatedistributionbits.cpp
new file mode 100644
index 00000000000..f2ac61241b7
--- /dev/null
+++ b/storage/src/vespa/storage/tools/generatedistributionbits.cpp
@@ -0,0 +1,264 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <boost/assign.hpp>
+#include <vespa/document/bucket/bucketidfactory.h>
+#include <vespa/vespalib/util/programoptions.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/vdslib/state/nodestate.h>
+#include <vespa/storage/bucketdb/judyarray.h>
+#include <stdio.h>
+#include <iomanip>
+#include <math.h>
+#include <vespa/vespalib/util/programoptions.h>
+#include <vespa/config-stor-distribution.h>
+
+namespace storage {
+
+ struct Options : public vespalib::ProgramOptions {
+ uint32_t redundancy;
+ uint32_t maxBit;
+ std::vector<uint32_t> nodeCounts;
+ std::vector<uint32_t> bitCounts;
+ double hideUtilizationAbove;
+ bool skipGood;
+ bool highRange;
+ bool printHtml;
+ double htmlErrAbove;
+ double htmlWarnAbove;
+ double htmlInfoAbove;
+ uint32_t skipBitsBelow;
+ uint32_t skipNodeCountsBelow;
+ uint32_t startAtNodeCount;
+
+ Options(int argc, const char* const* argv)
+ : vespalib::ProgramOptions(argc, argv)
+ {
+ setSyntaxMessage(
+ "Utility program for calculating skew of buckets stored on "
+ "storage nodes."
+ );
+ addOption("r redundancy", redundancy, 2u,
+ "Number of copies stored on the nodes.");
+ addOption("b maxbit", maxBit, 32u,
+ "Maximum distribution bit count to calculate for.");
+ addOption("h hide", hideUtilizationAbove, 0.3,
+ "Hide utilizations worse than this.");
+ addOption("s skip", skipGood, false,
+ "Attempt to skip computations for node counts that "
+ "already have good distributions");
+ addOption("highrange", highRange, false,
+ "Compute distribution for large systems instead of small "
+ "systems");
+ addOption("html", printHtml, false,
+ "Print result as an HTML table");
+ addOption("skipbitsbelow", skipBitsBelow, 0u,
+ "Skip calculating for bits below given value");
+ addOption("skipnodecountsbelow", skipNodeCountsBelow, 0u,
+ "Skip calculating for node counts below given value");
+ addOption("startatnodecount", startAtNodeCount, 0u,
+ "Start calculating for first bit at given node count");
+ }
+
+ void finalize() {
+ using namespace boost::assign;
+ if (highRange) {
+ nodeCounts += 16, 20, 32, 48, 64, 100, 128, 160, 200, 256, 350,
+ 500, 800, 1000, 5000;
+ } else {
+ nodeCounts += 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15;
+ }
+ for (uint32_t i=1; i<=maxBit; ++i) {
+ bitCounts.push_back(i);
+ }
+ htmlErrAbove = hideUtilizationAbove;
+ htmlWarnAbove = 0.10;
+ htmlInfoAbove = 0.01;
+ }
+ };
+
+ double generateSkew(uint32_t nodes, uint32_t distributionBits,
+ uint16_t redundancy)
+ {
+ lib::Distribution distr(lib::Distribution::getDefaultDistributionConfig(
+ redundancy, nodes));
+ lib::ClusterState state(
+ vespalib::make_string("bits:%d storage:%d",
+ distributionBits, nodes));
+
+ std::vector<uint32_t> nodeList(nodes);
+ uint32_t lastbucket = (distributionBits == 32
+ ? 0xffffffff : (1 << distributionBits) - 1);
+
+ for (uint32_t i = 0; i <= lastbucket; ++i) {
+ std::vector<uint16_t> curr(
+ distr.getIdealStorageNodes(state,
+ document::BucketId(distributionBits, i).stripUnused()));
+ for (uint32_t j = 0; j < curr.size(); ++j) {
+ ++nodeList[curr[j]];
+ }
+ if (i == 0xffffffff) break;
+ }
+
+ std::sort(nodeList.begin(), nodeList.end());
+ uint64_t max = nodeList[nodeList.size() - 1];
+
+ uint64_t maxArea = max * nodes;
+ uint64_t wastedArea = 0;
+
+ for (uint32_t i = 0; i < nodes; i++) {
+ wastedArea += max - nodeList[i];
+ }
+
+ //std::cerr << "Least " << nodeList[0] << " Most "
+ // << nodeList[nodeList.size() - 1] << " " << "Total: "
+ // << buckets << " Max area " << maxArea << " Wasted area "
+ // << wastedArea << "\n";
+ if (maxArea == 0) {
+ return 0;
+ } else {
+ return ((double) wastedArea) / maxArea;
+ }
+ }
+
+} // storage
+
+int main(int argc, char** argv) {
+ storage::Options o(argc, argv);
+ try{
+ o.parse();
+ } catch (vespalib::InvalidCommandLineArgumentsException& e) {
+ std::cerr << e.getMessage() << "\n\n";
+ o.writeSyntaxPage(std::cerr);
+ std::cerr << "\n";
+ exit(1);
+ }
+ o.finalize();
+ if (o.printHtml) { std::cout << "<b>"; }
+ std::cout << "Distribution with redundancy " << std::setprecision(2)
+ << o.redundancy << ":\n";
+ if (o.printHtml) { std::cout << "</b>"; }
+ if (o.printHtml) {
+ std::cout << "<table border=\"1\">\n"
+ << "<tr>\n"
+ << " <th><nobr>Bits \\ Nodes</nobr></th>\n";
+ for (uint32_t i = 0; i<o.nodeCounts.size(); ++i) {
+ std::cout << " <td>" << o.nodeCounts[i] << "</td>\n";
+ }
+ std::cout << "</tr>\n";
+ } else {
+ std::cout << "\t";
+ for (uint32_t i = 0; i<o.nodeCounts.size(); ++i) {
+ std::cout << std::setw(8) << std::setfill(' ') << o.nodeCounts[i];
+ }
+ std::cout << "\nBits\n";
+ }
+
+ std::vector<double> tmpV(o.bitCounts.size(), -1);
+ std::vector<std::vector<double> > results(o.nodeCounts.size(), tmpV);
+
+ bool firstBitCalculated = true;
+ int32_t firstBitIndex = -1;
+ for (uint32_t bitIndex = 0; bitIndex < o.bitCounts.size();
+ ++bitIndex)
+ {
+ uint32_t bits = o.bitCounts[bitIndex];
+ if (bits < o.skipBitsBelow) {
+ std::cerr << "Skipping calculating data for " << bits << " bit\n";
+ continue;
+ } else {
+ if (firstBitIndex == -1) {
+ firstBitIndex = bitIndex;
+ } else {
+ firstBitIndex = false;
+ }
+ }
+ bool printedStart = false;
+ std::ostringstream start;
+
+ if (o.printHtml) {
+ start << "<tr>\n"
+ << " <td>" << bits << "</td>\n";
+ } else {
+ start << bits << "\t";
+ }
+ for (uint32_t nodeIndex = 0; nodeIndex < o.nodeCounts.size();
+ ++nodeIndex)
+ {
+ uint32_t nodes = o.nodeCounts[nodeIndex];
+ if (nodes < o.skipNodeCountsBelow ||
+ (nodes < o.startAtNodeCount && firstBitCalculated))
+ {
+ std::cerr << "Skipping calculating data for " << bits
+ << " bits and " << nodes << " nodes\n";
+ if (o.printHtml) {
+ (printedStart ? std::cout : start) << " <td>-</td>\n";
+ } else {
+ (printedStart ? std::cout : start)
+ << std::setw(8) << std::setfill(' ') << "-";
+ }
+ } else if (bitIndex - firstBitIndex > 3
+ && results[nodeIndex][bitIndex - 1] <= o.htmlInfoAbove
+ && results[nodeIndex][bitIndex - 2] <= o.htmlInfoAbove
+ && results[nodeIndex][bitIndex - 3] <= o.htmlInfoAbove
+ && results[nodeIndex][bitIndex - 4] <= o.htmlInfoAbove)
+ {
+ if (o.printHtml) {
+ (printedStart ? std::cout : start) << " <td>-</td>\n";
+ } else {
+ (printedStart ? std::cout : start)
+ << std::setw(8) << std::setfill(' ') << "-";
+ }
+ } else {
+ double skew = storage::generateSkew(nodes, bits, o.redundancy);
+ results[nodeIndex][bitIndex] = skew;
+ std::string color = "";
+ if (skew > o.htmlErrAbove) {
+ color = " bgcolor=\"red\"";
+ } else if (skew > o.htmlWarnAbove) {
+ color = " bgcolor=\"#ffa500\""; // orange
+ } else if (skew > o.htmlInfoAbove) {
+ color = " bgcolor=\"yellow\"";
+ } else {
+ color = " bgcolor=\"#adff2f\""; // green
+ }
+ if (skew > o.hideUtilizationAbove) {
+ if (o.printHtml) {
+ (printedStart ? std::cout : start)
+ << " <td" << color << ">"
+ << std::setprecision(4) << std::fixed << skew
+ << "</td>\n" << std::flush;
+ continue;
+ } else {
+ break;
+ }
+ }
+ if (!printedStart) {
+ std::cout << start.str();
+ printedStart = true;
+ }
+ if (o.printHtml) {
+ std::cout << " <td" << color << ">" << std::setprecision(4)
+ << std::fixed << skew << "</td>\n" << std::flush;
+ } else {
+ std::cout << std::setw(8) << std::setfill(' ')
+ << std::setprecision(4) << std::fixed << skew
+ << std::flush;
+ }
+ }
+ }
+ if (printedStart) {
+ if (o.printHtml) {
+ std::cout << "</tr>\n";
+ } else {
+ std::cout << "\n";
+ }
+ }
+ }
+ if (o.printHtml) {
+ std::cout << "</table>\n";
+ }
+
+ return 0;
+}
diff --git a/storage/src/vespa/storage/tools/getidealstate.cpp b/storage/src/vespa/storage/tools/getidealstate.cpp
new file mode 100644
index 00000000000..7c04e094965
--- /dev/null
+++ b/storage/src/vespa/storage/tools/getidealstate.cpp
@@ -0,0 +1,199 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/document/bucket/bucketidfactory.h>
+#include <vespa/vdslib/distribution/distribution.h>
+#include <vespa/vdslib/state/clusterstate.h>
+#include <vespa/vespalib/util/programoptions.h>
+#include <vespa/config/config.h>
+#include <vespa/config/print/ostreamconfigwriter.h>
+
+namespace storage {
+
+struct Options : public vespalib::ProgramOptions {
+ bool showSyntaxPage;
+ std::string clusterName;
+ std::string clusterState;
+ uint32_t diskCount;
+ std::string diskDistribution;
+ uint32_t redundancy;
+ std::string bucket;
+ std::string upStates;
+ bool bucketsOnStdIn;
+ bool verbose;
+
+ Options(int argc, const char* const* argv)
+ : vespalib::ProgramOptions(argc, argv)
+ {
+ setSyntaxMessage(
+ "Utility program for calculating the ideal state of "
+ "buckets. Useful to verify correctness of distribution "
+ "operations."
+ );
+ addOption("h help", showSyntaxPage, false,
+ "Shows this help page");
+ addOption("s clusterstate", clusterState, std::string(""),
+ "The state of the cluster to calculate position in");
+ addOption("n diskcount", diskCount, uint32_t(0),
+ "The number of disks on each node");
+ addOption("r redundancy", redundancy, uint32_t(2),
+ "The redundancy to keep for each bucket");
+ addOption("diskdistribution", diskDistribution,
+ std::string("MODULO_BID"),
+ "Disk distribution algorithm used");
+ addOption("u upstates", upStates, std::string("uims"),
+ "States to consider as up in ideal state calculations");
+ addOption("i stdin", bucketsOnStdIn, false,
+ "Read stdin to get buckets to calculate ideal position for");
+ addOption("v verbose", verbose, false,
+ "Print extra information while running");
+ addArgument("bucket", bucket, std::string(""),
+ "Bucket for which to calculate ideal state");
+
+ addOptionHeader(
+ "By default, it will be assumed that all nodes are in one top "
+ "group, and no config will be read to calculate bucket "
+ "positions. If a cluster name is specified, config will be "
+ "read to get group hierarchy correctly for cluster.");
+ addOption("c clustername", clusterName, std::string(""),
+ "Name of the cluster to get config from");
+ }
+
+ bool useConfig() const { return !clusterName.empty(); }
+
+ std::string getConfigId() const {
+ std::ostringstream ost;
+ ost << "storage/cluster." << clusterName << "/distributor/0";
+ return ost.str();
+ }
+};
+
+void processBucket(const lib::Distribution& distribution,
+ const lib::ClusterState& clusterState,
+ const std::string& upStates,
+ const document::BucketId& bucket)
+{
+ std::ostringstream ost;
+ std::vector<uint16_t> storageNodes(distribution.getIdealStorageNodes(
+ clusterState, bucket, upStates.c_str()));
+ uint16_t distributorNode(distribution.getIdealDistributorNode(
+ clusterState, bucket, upStates.c_str()));
+ ost << bucket << " distributor: " << distributorNode
+ << ", storage:";
+ for (uint32_t i=0; i<storageNodes.size(); ++i) {
+ ost << " " << storageNodes[i];
+ }
+ ost << "\n";
+ std::cout << ost.str() << std::flush;
+}
+
+int run(int argc, char** argv) {
+ Options o(argc, argv);
+ try{
+ o.parse();
+ } catch (vespalib::InvalidCommandLineArgumentsException& e) {
+ if (!o.showSyntaxPage) {
+ std::cerr << e.getMessage() << "\n\n";
+ o.writeSyntaxPage(std::cerr);
+ std::cerr << "\n";
+ return 1;
+ }
+ }
+ if (o.showSyntaxPage) {
+ o.writeSyntaxPage(std::cerr);
+ std::cerr << "\n";
+ return 0;
+ }
+
+ uint16_t redundancy(o.redundancy);
+ vespa::config::content::StorDistributionConfig::DiskDistribution diskDistribution(
+ vespa::config::content::StorDistributionConfig::getDiskDistribution(
+ o.diskDistribution));
+ std::unique_ptr<lib::Distribution> distribution;
+ lib::ClusterState clusterState(o.clusterState);
+
+ if (o.useConfig()) {
+ try{
+ if (o.verbose) {
+ std::cerr << "Fetching distribution config using config id '"
+ << o.getConfigId() << "'.\n";
+ }
+ config::ConfigUri uri(o.getConfigId());
+ std::unique_ptr<vespa::config::content::StorDistributionConfig> config = config::ConfigGetter<vespa::config::content::StorDistributionConfig>::getConfig(uri.getConfigId(), uri.getContext());
+ redundancy = config->redundancy;
+ diskDistribution = config->diskDistribution;
+ distribution.reset(new lib::Distribution(*config));
+ if (o.verbose) {
+ std::cerr << "Using distribution config: '";
+ config::OstreamConfigWriter ocw(std::cerr);
+ ocw.write(*config);
+ std::cerr << "'.\n";
+ }
+ } catch (std::exception& e) {
+ std::cerr << "Failed to initialize from config:\n" << e.what()
+ << "\n";
+ return 1;
+ }
+ } else {
+ uint16_t distributorCount(
+ clusterState.getNodeCount(lib::NodeType::DISTRIBUTOR));
+ if (o.verbose) {
+ std::cerr << "Not reading config. Assuming one top group with all "
+ << distributorCount << " distributors having redundancy "
+ << redundancy << " with cluster state " << clusterState
+ << "\n";
+ }
+ vespa::config::content::StorDistributionConfig config(
+ lib::Distribution::getDefaultDistributionConfig(
+ redundancy,
+ clusterState.getNodeCount(lib::NodeType::DISTRIBUTOR),
+ diskDistribution));
+ distribution.reset(new lib::Distribution(config));
+ if (o.verbose) {
+ std::cerr << "Using distribution config: '";
+ config::OstreamConfigWriter ocw(std::cerr);
+ ocw.write(config);
+ std::cerr << "'.\n";
+ }
+ }
+ if (o.verbose) {
+ std::cerr << "Using cluster state '" << clusterState.toString(true)
+ << "'.\n";
+ }
+
+ if (!o.bucket.empty()) {
+ char* endp;
+ document::BucketId bucket(strtoull(o.bucket.c_str(), &endp, 16));
+ if (*endp == '\0') {
+ processBucket(*distribution, clusterState, o.upStates, bucket);
+ } else {
+ std::cerr << "Skipping bucket " << o.bucket
+ << " which failed to parse as a bucket. Failed to "
+ << "parse: " << endp << "\n";
+ }
+ } else if (o.bucketsOnStdIn) {
+ std::string line;
+ while (getline(std::cin, line)) {
+ char* endp;
+ document::BucketId bucket(strtoull(line.c_str(), &endp, 16));
+ if (*endp == '\0') {
+ processBucket(*distribution, clusterState, o.upStates, bucket);
+ } else {
+ std::cerr << "Skipping bucket " << line
+ << " which failed to parse as a bucket. Failed to "
+ << "parse: " << endp << "\n";
+ }
+ }
+ } else {
+ std::cerr << "Bucket not specified. Option for using stdin not used.\n"
+ << "No buckets to calculate ideal state for.\n";
+ return 1;
+ }
+ return 0;
+}
+
+} // storage
+
+int main(int argc, char** argv) {
+ return storage::run(argc, argv);
+}
diff --git a/storage/src/vespa/storage/tools/lib/.gitignore b/storage/src/vespa/storage/tools/lib/.gitignore
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/storage/src/vespa/storage/tools/lib/.gitignore
diff --git a/storage/src/vespa/storage/tools/statfs.cpp b/storage/src/vespa/storage/tools/statfs.cpp
new file mode 100644
index 00000000000..d23a3037a7a
--- /dev/null
+++ b/storage/src/vespa/storage/tools/statfs.cpp
@@ -0,0 +1,64 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <errno.h>
+#include <iostream>
+#include <sys/vfs.h>
+#include <vespa/vespalib/util/programoptions.h>
+#include <vespa/vespalib/io/fileutil.h>
+
+struct Options : public vespalib::ProgramOptions {
+ bool showSyntaxPage;
+ std::string _filename;
+
+ Options(int argc, const char* const* argv)
+ : vespalib::ProgramOptions(argc, argv),
+ showSyntaxPage(false)
+ {
+ setSyntaxMessage(
+ "Utility program for checking output of statfs."
+ );
+ addOption("h help", showSyntaxPage, false,
+ "Shows this help page");
+ addArgument("file", _filename, "File to use when calling statfs()");
+ }
+ };
+
+int main(int argc, char** argv) {
+ Options o(argc, argv);
+ o.parse();
+
+ if (o.showSyntaxPage) {
+ o.writeSyntaxPage(std::cerr);
+ exit(1);
+ }
+
+ if (!vespalib::fileExists(o._filename)) {
+ std::cerr << "Cannot use statfs on non-existing file '" << o._filename
+ << "'.\n";
+ exit(1);
+ }
+
+ struct statfs buf;
+ if (statfs(o._filename.c_str(), &buf) == 0) {
+ std::cerr << "f_type " << buf.f_type << "\n"
+ << "f_bsize " << buf.f_bsize << "\n"
+ << "f_blocks " << buf.f_blocks << "\n"
+ << "f_bfree " << buf.f_bfree << "\n"
+ << "f_bavail " << buf.f_bavail << "\n"
+ << "f_files " << buf.f_files << "\n"
+ << "f_ffree " << buf.f_ffree << "\n"
+ << "f_namelen " << buf.f_namelen << "\n";
+
+ uint64_t available = buf.f_bavail;
+ uint64_t total = buf.f_blocks;
+ available *= buf.f_bsize;
+ total *= buf.f_bsize;
+
+ std::cerr << "\nAvailable " << available << " of total " << total
+ << "\n" << (100.0 * (total - available) / (double) total)
+ << " % full\n";
+ } else {
+ std::cerr << "statfs() failed: " << errno << "\n";
+ }
+}
diff --git a/storage/src/vespa/storage/tools/storage-cmd.cpp b/storage/src/vespa/storage/tools/storage-cmd.cpp
new file mode 100644
index 00000000000..021e5ed9b03
--- /dev/null
+++ b/storage/src/vespa/storage/tools/storage-cmd.cpp
@@ -0,0 +1,126 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+LOG_SETUP("rpc_invoke");
+#include <vespa/fnet/frt/frt.h>
+#include <vespa/slobrok/sbmirror.h>
+
+class RPCClient : public FastOS_Application
+{
+private:
+ static bool addArg(FRT_RPCRequest *req, const char *param) {
+ int len = strlen(param);
+ if (len < 2 || param[1] != ':') {
+ return false;
+ }
+ const char *value = param + 2;
+ switch (param[0]) {
+ case 'b':
+ req->GetParams()->AddInt8(strtoll(value, NULL, 0));
+ break;
+ case 'h':
+ req->GetParams()->AddInt16(strtoll(value, NULL, 0));
+ break;
+ case 'i':
+ req->GetParams()->AddInt32(strtoll(value, NULL, 0));
+ break;
+ case 'l':
+ req->GetParams()->AddInt64(strtoll(value, NULL, 0));
+ break;
+ case 'f':
+ req->GetParams()->AddFloat(strtod(value, NULL));
+ break;
+ case 'd':
+ req->GetParams()->AddDouble(strtod(value, NULL));
+ break;
+ case 's':
+ req->GetParams()->AddString(value);
+ break;
+ default:
+ return false;
+ }
+ return true;
+ }
+
+public:
+ int Main() {
+ if (_argc < 3) {
+ fprintf(stderr, "usage: storage-cmd <connectspec> <method> [args]\n");
+ fprintf(stderr, "Calls RPC method on a storage/distributor process\n");
+ fprintf(stderr, "Call frt.rpc.getMethodList to get available RPC methods\n");
+ fprintf(stderr, " each arg must be on the form <type>:<value>\n");
+ fprintf(stderr, " supported types: {'b','h','i','l','f','d','s'}\n");
+ return 1;
+ }
+ int retCode = 0;
+ FRT_Supervisor supervisor;
+ supervisor.Start();
+
+ slobrok::ConfiguratorFactory sbcfg("admin/slobrok.0");
+ slobrok::api::MirrorAPI mirror(supervisor, sbcfg);
+
+ while (!mirror.ready()) {
+ FastOS_Thread::Sleep(10);
+ }
+
+ slobrok::api::MirrorAPI::SpecList list = mirror.lookup(_argv[1]);
+
+ if (list.size() == 0) {
+ fprintf(stderr, "No servers found matching %s\n", _argv[1]);
+ }
+
+ for (size_t j = 0; j < list.size(); j++) {
+ FRT_Target *target = supervisor.GetTarget(list[j].second.c_str());
+
+ // If not fleet controller, need to connect first.
+ if (strstr(_argv[1], "fleetcontroller") == NULL) {
+ FRT_RPCRequest *req = supervisor.AllocRPCRequest();
+ req->SetMethodName("vespa.storage.connect");
+ req->GetParams()->AddString(_argv[1]);
+ target->InvokeSync(req, 10.0);
+
+ if (req->GetErrorCode() != FRTE_NO_ERROR) {
+ fprintf(stderr, "error(%d): %s\n",
+ req->GetErrorCode(),
+ req->GetErrorMessage());
+ continue;
+ }
+ req->SubRef();
+ }
+
+ FRT_RPCRequest *req = supervisor.AllocRPCRequest();
+ req->SetMethodName(_argv[2]);
+
+ for (int i = 3; i < _argc; ++i) {
+ if (!addArg(req, _argv[i])) {
+ fprintf(stderr, "could not parse parameter: '%s'\n", _argv[i]);
+ retCode = 2;
+ break;
+ }
+ }
+ if (retCode == 0) {
+ target->InvokeSync(req, 10.0);
+ if (req->GetErrorCode() == FRTE_NO_ERROR) {
+ fprintf(stdout, "RETURN VALUES FOR %s:\n", list[j].first.c_str());
+ req->GetReturn()->Print();
+ retCode = 3;
+ } else {
+ fprintf(stderr, "error(%d): %s\n",
+ req->GetErrorCode(),
+ req->GetErrorMessage());
+ }
+ }
+ req->SubRef();
+ target->SubRef();
+ }
+ supervisor.ShutDown(true);
+ return retCode;
+ }
+};
+
+int
+main(int argc, char **argv)
+{
+ RPCClient myapp;
+ return myapp.Entry(argc, argv);
+}
diff --git a/storage/src/vespa/storage/tools/throttlingsim.cpp b/storage/src/vespa/storage/tools/throttlingsim.cpp
new file mode 100644
index 00000000000..cd9e5a29d81
--- /dev/null
+++ b/storage/src/vespa/storage/tools/throttlingsim.cpp
@@ -0,0 +1,474 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include "throttlingsim.h"
+#include <algorithm>
+#include <vespa/vespalib/util/stringfmt.h>
+
+bool Receiver::enqueue(const Message& msg) {
+ vespalib::MonitorGuard lock(sync);
+ if (queue.size() < maxqueuesize) {
+ queue.push_back(msg);
+ lock.broadcast();
+ return true;
+ } else {
+ return false;
+ }
+}
+
+void Receiver::run() {
+ while (running()) {
+ vespalib::MonitorGuard lock(sync);
+ if (!queue.empty()) {
+ Message m = queue.front();
+ queue.pop_front();
+ lock.unlock();
+
+ int maxwaittime = (int)(meanwaitms * 1.5);
+ int minwaittime = (int)(meanwaitms * 0.5);
+ int wait = random() % (maxwaittime - minwaittime) + minwaittime;
+ processed++;
+
+ FastOS_Thread::Sleep(wait);
+ m.client->returnMessage(m);
+ } else {
+ lock.wait();
+ }
+ }
+}
+
+void Receiver::print()
+{
+ vespalib::MonitorGuard lock(sync);
+ fprintf(stderr, "Proc time %d, Processed %d, Queue size: %d\n", meanwaitms, processed, (int)queue.size());
+}
+
+void Messaging::sendMessage(const Message& m)
+{
+ vespalib::MonitorGuard lock(sync);
+ queue.push_back(m);
+ lock.broadcast();
+}
+
+void Messaging::run()
+{
+ while (running()) {
+ vespalib::MonitorGuard lock(sync);
+ if (!queue.empty()) {
+ Message m = queue.front();
+
+ FastOS_Time tm;
+ tm.SetNow();
+ double timestamp = tm.MicroSecs() / 1000;
+
+ double wait = m.timestamp - timestamp + meanwaitms;
+
+ if (wait > 0) {
+ lock.unlock();
+ FastOS_Thread::Sleep(static_cast<int>(wait));
+ continue;
+ }
+
+ queue.pop_front();
+ if (!receivers[m.target]->enqueue(m)) {
+ m.busy = true;
+ lock.unlock();
+ m.client->returnMessage(m);
+ }
+ } else {
+ lock.wait();
+ }
+ }
+}
+
+void Messaging::print()
+{
+ double startT = startTime.MilliSecsToNow();
+ double per = period.MilliSecsToNow();
+
+ fprintf(stderr, "\n\n"
+ "Statistics after %G milliseconds\n"
+ "--------------------------------------------------\n",
+ startT);
+
+ for (size_t i = 0; i < receivers.size(); i++) {
+ fprintf(stderr, "Server %ld\t", i);
+ receivers[i]->print();
+ }
+
+ fprintf(stderr, "--------------------------------------------------\n");
+
+ int ok = 0;
+ int failed = 0;
+ for (size_t i = 0; i < clients.size(); i++) {
+ ok += clients[i]->ok;
+ failed += clients[i]->failed;
+ fprintf(stderr, "Client %ld\t", i);
+ clients[i]->print(startT);
+ }
+
+ fprintf(stderr, "\nThroughput last period %G docs/second\n", 1000 * (ok - lastOk) / per);
+ fprintf(stderr, "Throughput %G docs/second\n", 1000 * (ok / startT));
+
+ if (ok + failed > 0) {
+ fprintf(stderr, "Total OK %d, total failed %d, %% failed %G\n", ok, failed, (100 * (double)failed) / (double)(ok + failed));
+ }
+
+ lastOk = ok;
+}
+
+
+void Client::run() {
+ while (running()) {
+ {
+ vespalib::MonitorGuard lock(sync);
+
+ if (pending < windowsize) {
+ Message m;
+
+ FastOS_Time tm;
+ tm.SetNow();
+ m.timestamp = tm.MicroSecs() / 1000;
+
+ m.client = this;
+ m.target = random() % messaging.receivers.size();
+ messaging.sendMessage(m);
+ pending++;
+ }
+ }
+ FastOS_Thread::Sleep(2);
+ }
+}
+
+void Client::print(double timenow)
+{
+ vespalib::MonitorGuard lock(sync);
+ fprintf(stderr, "Ok %d, failures %d, busy %d, pending %d, windowsize %G, throughput %G max_diff %G\n", ok, failed, busy, pending, windowsize, 1000 * ok/timenow, max_diff);
+}
+
+void FixedClient::returnMessage(const Message& m) {
+ vespalib::MonitorGuard lock(sync);
+
+ pending--;
+
+ FastOS_Time tm;
+ tm.SetNow();
+ double timestamp = tm.MicroSecs() / 1000;
+ double diff = timestamp - m.timestamp;
+
+ if (m.busy) {
+ busy++;
+ } else if (diff < timeout) {
+ ok++;
+ } else {
+ failed++;
+ }
+
+ max_diff = std::max(diff, max_diff);
+
+ lock.broadcast();
+}
+
+LoadBalancingClient::LoadBalancingClient(Messaging& msgng, int winsize, int to)
+ : Client(msgng, winsize, to)
+{
+ for (uint32_t i = 0 ; i < msgng.receivers.size(); i++) {
+ weights.push_back(1.0);
+ }
+};
+
+
+void LoadBalancingClient::run() {
+ while (running()) {
+ {
+ vespalib::MonitorGuard lock(sync);
+
+ if (pending < windowsize) {
+ Message m;
+
+ FastOS_Time tm;
+ tm.SetNow();
+ m.timestamp = tm.MicroSecs() / 1000;
+
+ m.client = this;
+
+ double sum = 0;
+ for (uint32_t i = 0; i < weights.size(); i++) {
+ sum += weights[i];
+ }
+
+ float r = sum * (float)random()/(float)RAND_MAX;
+
+ double curr = 0;
+ for (uint32_t i = 0; i < weights.size(); i++) {
+ curr += weights[i];
+
+ if (curr >= r) {
+ m.target = i;
+ break;
+ }
+
+ }
+
+ messaging.sendMessage(m);
+ pending++;
+ }
+ }
+ FastOS_Thread::Sleep(2);
+ }
+}
+
+void LoadBalancingClient::print(double timenow)
+{
+ vespalib::MonitorGuard lock(sync);
+
+ std::string s;
+ for (uint32_t i = 0; i < weights.size(); i++) {
+ s += vespalib::make_string("%G ", weights[i]);
+ }
+ fprintf(stderr, "Ok %d, failures %d, busy %d, pending %d, windowsize %G, throughput %G max_diff %G\n Weights: [ %s]\n", ok, failed, busy, pending, windowsize, 1000 * ok/timenow, max_diff, s.c_str());
+
+}
+
+void LoadBalancingClient::returnMessage(const Message& m) {
+ vespalib::MonitorGuard lock(sync);
+
+ pending--;
+
+ FastOS_Time tm;
+ tm.SetNow();
+ double timestamp = tm.MicroSecs() / 1000;
+ double diff = timestamp - m.timestamp;
+
+ if (m.busy) {
+ weights[m.target] -= 0.01;
+
+ for (uint32_t i = 1; i < weights.size(); i++) {
+ weights[i] = weights[i] / weights[0];
+ }
+ weights[0] = 1.0;
+
+ busy++;
+ } else if (diff < timeout) {
+ ok++;
+ } else {
+ failed++;
+ }
+
+ max_diff = std::max(diff, max_diff);
+
+ lock.broadcast();
+}
+
+BusyCounterBalancingClient::BusyCounterBalancingClient(Messaging& msgng, int winsize, int to)
+ : Client(msgng, winsize, to)
+{
+ for (uint32_t i = 0 ; i < msgng.receivers.size(); i++) {
+ busyCount.push_back(0);
+ }
+};
+
+
+void BusyCounterBalancingClient::run() {
+ // int startTime = time(NULL);
+
+ while (running()) {
+ {
+ vespalib::MonitorGuard lock(sync);
+
+ if (pending < windowsize) {
+ Message m;
+ FastOS_Time tm;
+ tm.SetNow();
+ m.timestamp = tm.MicroSecs() / 1000;
+
+ m.client = this;
+
+ m.target = 0;
+ for (uint32_t i = 1; i < busyCount.size(); i++) {
+ if (busyCount[i] < busyCount[m.target]) {
+ m.target = i;
+ }
+ }
+
+ messaging.sendMessage(m);
+ pending++;
+ }
+ }
+
+ FastOS_Thread::Sleep(3);
+ }
+}
+
+void BusyCounterBalancingClient::print(double timenow)
+{
+ vespalib::MonitorGuard lock(sync);
+
+ std::string s;
+ for (uint32_t i = 0; i < busyCount.size(); i++) {
+ s += vespalib::make_string("%d ", busyCount[i]);
+ }
+ fprintf(stderr, "Ok %d, failures %d, busy %d, pending %d, windowsize %G, throughput %G max_diff %G\n BusyCount: [ %s]\n", ok, failed, busy, pending, windowsize, 1000 * ok/timenow, max_diff, s.c_str());
+
+}
+
+void BusyCounterBalancingClient::returnMessage(const Message& m) {
+ vespalib::MonitorGuard lock(sync);
+
+ pending--;
+
+ FastOS_Time tm;
+ tm.SetNow();
+ double timestamp = tm.MicroSecs() / 1000;
+ double diff = timestamp - m.timestamp;
+
+ if (m.busy) {
+ busyCount[m.target]++;
+ busy++;
+ } else if (diff < timeout) {
+ ok++;
+ } else {
+ failed++;
+ }
+
+ max_diff = std::max(diff, max_diff);
+
+ lock.broadcast();
+}
+
+
+
+void DynamicClient::returnMessage(const Message& m) {
+ vespalib::MonitorGuard lock(sync);
+
+ pending--;
+
+ FastOS_Time tm;
+ tm.SetNow();
+ double timestamp = tm.MicroSecs() / 1000;
+ double diff = timestamp - m.timestamp;
+
+ if (diff < timeout) {
+ ok++;
+ } else {
+ //ffprintf(stderr, stderr, "Message took %G ms to process, more than %G\n", diff, timeout);
+ failed++;
+ }
+
+ if (diff < timeout / 2) {
+ if (windowsize < maxwinsize) {
+ if (windowsize > threshold) {
+ windowsize += (1/windowsize);
+ } else {
+ windowsize++;
+ }
+ }
+ } else if (m.timestamp > lastFailTimestamp) {
+ threshold = std::max(2, (int)(windowsize / 2));
+ windowsize = 1;
+ lastFailTimestamp = timestamp;
+ }
+
+ lock.broadcast();
+}
+
+void LatencyControlClient::returnMessage(const Message& m) {
+ vespalib::MonitorGuard lock(sync);
+
+ pending--;
+
+ FastOS_Time tm;
+ tm.SetNow();
+ double timestamp = tm.MicroSecs() / 1000;
+ double diff = timestamp - m.timestamp;
+
+ if (diff < timeout) {
+ ok++;
+ } else {
+ //ffprintf(stderr, stderr, "Message took %G ms to process, more than %G\n", diff, timeout);
+ failed++;
+ }
+
+ max_diff = std::max(diff, max_diff);
+
+ ++count;
+
+ if(count >= windowsize) {
+ if (max_diff < timeout/4) {
+ windowsize+=10;
+ }
+ if (timeout/4 <= max_diff && max_diff <= timeout/1.5) {
+ ++windowsize;
+ }
+ if (max_diff > timeout/1.5) {
+ windowsize= std::max(1.0, 0.66*windowsize);
+ }
+ max_diff = 0;
+ count = 0;
+ }
+
+ lock.broadcast();
+}
+
+void LatencyControlClient::print(double timenow)
+{
+ vespalib::MonitorGuard lock(sync);
+ fprintf(stderr, "Ok %d, failures %d, pending %d, busy %d, windowsize %G, throu %G max_diff %G\n", ok, failed, pending, busy, windowsize, 1000 * ok/timenow, max_diff);
+}
+
+int
+ThrottlingApp::Main()
+{
+ FastOS_ThreadPool threadPool(512*1024);
+ Messaging m(5);
+
+ m.start(threadPool);
+ m.startTime.SetNow();
+
+ for (int i = 0; i < 3; i++) {
+ Receiver* r = new Receiver(20, 16);
+ r->start(threadPool);
+ m.receivers.push_back(r);
+ }
+
+ for (int i = 0; i < 3; i++) {
+ Receiver* r = new Receiver(60, 16);
+ r->start(threadPool);
+ m.receivers.push_back(r);
+ }
+
+ {
+ BusyCounterBalancingClient* c = new BusyCounterBalancingClient(m, 400, 5000);
+ c->start(threadPool);
+ m.clients.push_back(c);
+ }
+/*
+ {
+ LoadBalancingClient* c = new LoadBalancingClient(m, 400, 5000);
+ c->start(threadPool);
+ m.clients.push_back(c);
+ }
+*/
+/*
+ {
+ FixedClient* c = new FixedClient(m, 400, 5000);
+ c->start(threadPool);
+ m.clients.push_back(c);
+ }
+*/
+ int timeNow = time(NULL);
+
+ while (time(NULL) - timeNow < 240) {
+ m.print();
+ m.period.SetNow();
+ sleep(2);
+ }
+
+ exit(0);
+}
+
+int main(int argc, char** argv)
+{
+ ThrottlingApp app;
+ return app.Entry(argc, argv);
+}
+
diff --git a/storage/src/vespa/storage/tools/throttlingsim.h b/storage/src/vespa/storage/tools/throttlingsim.h
new file mode 100644
index 00000000000..765b007ab99
--- /dev/null
+++ b/storage/src/vespa/storage/tools/throttlingsim.h
@@ -0,0 +1,150 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/vespalib/util/sync.h>
+#include <vespa/vespalib/util/document_runnable.h>
+#include <deque>
+#include <vector>
+
+class Client;
+
+class Message {
+public:
+ Message() : timestamp(0), client(0), busy(false), target(0) {};
+ Message(const Message& other) : timestamp(other.timestamp), client(other.client), busy(other.busy), target(other.target) {};
+
+ void operator=(const Message& other) { timestamp = other.timestamp; client = other.client; };
+
+ double timestamp;
+ Client* client;
+ bool busy;
+ int target;
+};
+
+
+class Receiver : public document::Runnable
+{
+public:
+ std::deque<Message> queue;
+ vespalib::Monitor sync;
+
+ int meanwaitms;
+ int processed;
+ size_t maxqueuesize;
+
+ Receiver(int meanwait, int max) : meanwaitms(meanwait), processed(0), maxqueuesize(max) {};
+
+ bool enqueue(const Message& msg);
+ void run();
+ void print();
+};
+
+class Messaging : public document::Runnable
+{
+public:
+ std::deque<Message> queue;
+ std::vector<Receiver*> receivers;
+ std::vector<Client*> clients;
+ vespalib::Monitor sync;
+ FastOS_Time startTime;
+ FastOS_Time period;
+
+ int lastOk;
+ int meanwaitms;
+
+ Messaging(int meanwait) : lastOk(0), meanwaitms(meanwait) {};
+
+ void sendMessage(const Message& m);
+ void print();
+ void run();
+};
+
+class Client : public document::Runnable {
+public:
+ vespalib::Monitor sync;
+
+ int ok;
+ int failed;
+ int busy;
+ int pending;
+ double windowsize;
+ Messaging& messaging;
+ int timeout;
+ double max_diff;
+
+ virtual void returnMessage(const Message& m) = 0;
+ virtual void run();
+ virtual void print(double timenow);
+
+ Client(Messaging& msgng, double windowSize, int to) :
+ ok(0), failed(0), busy(0), pending(0), windowsize(windowSize), messaging(msgng), timeout(to), max_diff(0) {}
+};
+
+
+class FixedClient : public Client {
+public:
+ FixedClient(Messaging& msgng, int winsize, int to)
+ : Client(msgng, winsize, to) {};
+
+ virtual void returnMessage(const Message& m);
+};
+
+class LoadBalancingClient : public Client {
+public:
+ LoadBalancingClient(Messaging& msgng, int winsize, int to);
+
+ virtual void returnMessage(const Message& m);
+ virtual void run();
+ virtual void print(double timenow);
+
+ std::vector<double> weights;
+};
+
+class BusyCounterBalancingClient : public Client {
+public:
+ BusyCounterBalancingClient(Messaging& msgng, int winsize, int to);
+
+ virtual void returnMessage(const Message& m);
+ virtual void run();
+ virtual void print(double timenow);
+
+ std::vector<int> busyCount;
+};
+
+
+class DynamicClient : public Client {
+public:
+ int maxwinsize;
+ int threshold;
+ double lastFailTimestamp;
+
+ DynamicClient(Messaging& msgng, int maxWinSize, double to)
+ : Client(msgng, 1, static_cast<int>(to)), maxwinsize(maxWinSize), threshold(maxWinSize / 2), lastFailTimestamp(0) {};
+
+ virtual void returnMessage(const Message& m);
+};
+
+class LatencyControlClient : public Client {
+public:
+ int count;
+
+ LatencyControlClient(Messaging& msgng, double to)
+ : Client(msgng, 1, static_cast<int>(to)),
+ count(0){};
+
+ virtual void returnMessage(const Message& m);
+
+ virtual void print(double timenow);
+};
+
+
+class ThrottlingApp : public FastOS_Application
+{
+private:
+ ThrottlingApp(const ThrottlingApp &);
+ ThrottlingApp& operator=(const ThrottlingApp &);
+
+public:
+ ThrottlingApp() {};
+
+ int Main();
+
+};
diff --git a/storage/src/vespa/storage/visiting/.gitignore b/storage/src/vespa/storage/visiting/.gitignore
new file mode 100644
index 00000000000..e61a8edd9d0
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/.gitignore
@@ -0,0 +1,10 @@
+*.So
+*.lo
+.*.swp
+.depend
+.depend.NEW
+.deps
+.libs
+Makefile
+config-stor-visitor.cpp
+config-stor-visitor.h
diff --git a/storage/src/vespa/storage/visiting/CMakeLists.txt b/storage/src/vespa/storage/visiting/CMakeLists.txt
new file mode 100644
index 00000000000..87406a0b6bb
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/CMakeLists.txt
@@ -0,0 +1,19 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(storage_visitor OBJECT
+ SOURCES
+ ${CMAKE_CURRENT_BINARY_DIR}/config-stor-visitor.h
+ visitor.cpp
+ visitormanager.cpp
+ visitorthread.cpp
+ testvisitor.cpp
+ recoveryvisitor.cpp
+ dumpvisitor.cpp
+ countvisitor.cpp
+ dumpvisitorsingle.cpp
+ memory_bounded_trace.cpp
+ DEPENDS
+ AFTER
+ storage_storageconfig
+)
+vespa_generate_config(storage_visitor stor-visitor.def)
+install(FILES stor-visitor.def DESTINATION var/db/vespa/config_server/serverdb/classes)
diff --git a/storage/src/vespa/storage/visiting/commandqueue.h b/storage/src/vespa/storage/visiting/commandqueue.h
new file mode 100644
index 00000000000..07677fdcd38
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/commandqueue.h
@@ -0,0 +1,250 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class CommandQueue
+ * @ingroup visiting
+ *
+ * @brief Keep an ordered queue of messages that can time out individually.
+ * Messages are ordered by priority and arrival sequence.
+ */
+
+#pragma once
+
+#include <boost/multi_index_container.hpp>
+#include <boost/multi_index/identity.hpp>
+#include <boost/multi_index/member.hpp>
+#include <boost/multi_index/mem_fun.hpp>
+#include <boost/multi_index/ordered_index.hpp>
+#include <boost/multi_index/sequenced_index.hpp>
+#include <vespa/vespalib/util/printable.h>
+#include <vespa/fastos/timestamp.h>
+#include <list>
+#include <vespa/storageframework/storageframework.h>
+
+namespace storage {
+
+ template<class Command>
+ class CommandQueue : public vespalib::Printable
+ {
+ public:
+ struct CommandEntry {
+ typedef typename Command::Priority PriorityType;
+ std::shared_ptr<Command> _command;
+ uint64_t _time;
+ uint64_t _sequenceId;
+ PriorityType _priority;
+
+ CommandEntry(const std::shared_ptr<Command>& cmd,
+ uint64_t time,
+ uint64_t sequenceId,
+ PriorityType priority)
+ : _command(cmd), _time(time), _sequenceId(sequenceId), _priority(priority)
+ {}
+
+ // Sort on both priority and sequence ID
+ bool operator<(const CommandEntry& entry) const {
+ if (_priority != entry._priority) {
+ return (_priority < entry._priority);
+ }
+ return (_sequenceId < entry._sequenceId);
+ }
+ };
+
+ private:
+ typedef boost::multi_index::multi_index_container<
+ CommandEntry,
+ boost::multi_index::indexed_by<
+ boost::multi_index::ordered_unique<
+ boost::multi_index::identity<CommandEntry>
+ >,
+ boost::multi_index::ordered_non_unique<
+ boost::multi_index::member<CommandEntry, uint64_t, &CommandEntry::_time>
+ >
+ >
+ > CommandList;
+ typedef typename boost::multi_index
+ ::nth_index<CommandList, 1>::type timelist;
+
+ framework::Clock& _clock;
+ mutable CommandList _commands;
+ uint64_t _sequenceId;
+
+ public:
+ typedef typename CommandList::iterator iterator;
+ typedef typename CommandList::reverse_iterator reverse_iterator;
+ typedef typename CommandList::const_iterator const_iterator;
+ typedef typename CommandList::const_reverse_iterator const_reverse_iterator;
+ typedef typename timelist::const_iterator const_titerator;
+
+ CommandQueue(framework::Clock& clock)
+ : _clock(clock),
+ _sequenceId(0) {}
+
+ const framework::Clock& getTimer() const { return _clock; }
+
+ iterator begin() { return _commands.begin(); }
+ iterator end() { return _commands.end(); }
+
+ const_iterator begin() const { return _commands.begin(); }
+ const_iterator end() const { return _commands.end(); }
+
+ const_titerator tbegin() const {
+ timelist& tl = boost::multi_index::get<1>(_commands);
+ return tl.begin();
+ }
+ const_titerator tend() const {
+ timelist& tl = boost::multi_index::get<1>(_commands);
+ return tl.end();
+ }
+
+ bool empty() const;
+
+ uint32_t size() const;
+
+ std::pair<std::shared_ptr<Command>, time_t> releaseNextCommand();
+
+ std::shared_ptr<Command> peekNextCommand() const;
+
+ void add(const std::shared_ptr<Command>& msg);
+
+ void erase(iterator it);
+
+ std::list<CommandEntry> releaseTimedOut();
+
+ std::pair<std::shared_ptr<Command>, time_t>
+ releaseLowestPriorityCommand();
+
+ std::shared_ptr<Command> peekLowestPriorityCommand() const;
+
+ void clear();
+
+ void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+ };
+
+ template<class Command>
+ inline bool
+ CommandQueue<Command>::empty() const
+ {
+ return _commands.empty();
+ }
+
+ template<class Command>
+ inline uint32_t
+ CommandQueue<Command>::size() const
+ {
+ return _commands.size();
+ }
+
+ template<class Command>
+ inline std::pair<std::shared_ptr<Command>, time_t>
+ CommandQueue<Command>::releaseNextCommand()
+ {
+ std::pair<std::shared_ptr<Command>, time_t> retVal(
+ std::shared_ptr<Command>(), 0);
+ if (!_commands.empty()) {
+ iterator first = _commands.begin();
+ retVal.first = first->_command;
+ retVal.second = first->_time;
+ _commands.erase(first);
+ }
+ return retVal;
+ }
+
+ template<class Command>
+ inline std::shared_ptr<Command>
+ CommandQueue<Command>::peekNextCommand() const
+ {
+ if (!_commands.empty()) {
+ const_iterator first = _commands.begin();
+ return first->_command;
+ } else {
+ return std::shared_ptr<Command>();
+ }
+ }
+
+ template<class Command>
+ inline void
+ CommandQueue<Command>::add(
+ const std::shared_ptr<Command>& cmd)
+ {
+ framework::MicroSecTime time(_clock.getTimeInMicros()
+ + framework::MicroSecTime(cmd->getQueueTimeout() * 1000000));
+ _commands.insert(CommandEntry(cmd, time.getTime(), ++_sequenceId, cmd->getPriority()));
+ }
+
+ template<class Command>
+ inline void
+ CommandQueue<Command>::erase(iterator it)
+ {
+ _commands.erase(it);
+ }
+
+ template<class Command>
+ inline std::list<typename CommandQueue<Command>::CommandEntry>
+ CommandQueue<Command>::releaseTimedOut()
+ {
+ std::list<CommandEntry> mylist;
+ framework::MicroSecTime time(_clock.getTimeInMicros());
+ while (!empty() && tbegin()->_time <= time.getTime()) {
+ mylist.push_back(*tbegin());
+ timelist& tl = boost::multi_index::get<1>(_commands);
+ tl.erase(tbegin());
+ }
+ return mylist;
+ }
+
+ template <class Command>
+ inline std::pair<std::shared_ptr<Command>, time_t>
+ CommandQueue<Command>::releaseLowestPriorityCommand()
+ {
+ if (!_commands.empty()) {
+ iterator last = (++_commands.rbegin()).base();
+ time_t time = last->_time;
+ std::shared_ptr<Command> cmd(last->_command);
+ _commands.erase(last);
+ return std::pair<std::shared_ptr<Command>, time_t>(cmd, time);
+ } else {
+ return std::pair<std::shared_ptr<Command>, time_t>(
+ std::shared_ptr<Command>(), 0);
+ }
+ }
+
+ template <class Command>
+ inline std::shared_ptr<Command>
+ CommandQueue<Command>::peekLowestPriorityCommand() const
+ {
+ if (!_commands.empty()) {
+ const_reverse_iterator last = _commands.rbegin();
+ return last->_command;
+ } else {
+ return std::shared_ptr<Command>();
+ }
+ }
+
+ template<class Command>
+ inline void
+ CommandQueue<Command>::clear()
+ {
+ _commands.clear();
+ }
+
+ template<class Command>
+ inline void
+ CommandQueue<Command>::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+ {
+ (void) verbose;
+ out << "Insert order:\n";
+ for (const_iterator it = begin(); it != end(); ++it) {
+ out << indent << *it->_command << ", priority " << it->_priority
+ << ", time " << it->_time << "\n";
+ }
+ out << indent << "Time order:";
+ for (const_titerator it = tbegin(); it != tend(); ++it) {
+ out << "\n" << indent << *it->_command << ", priority " << it->_priority
+ << ", time " << it->_time;
+ }
+ }
+
+} // storage
+
diff --git a/storage/src/vespa/storage/visiting/countvisitor.cpp b/storage/src/vespa/storage/visiting/countvisitor.cpp
new file mode 100644
index 00000000000..4c1a082893c
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/countvisitor.cpp
@@ -0,0 +1,117 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/visiting/countvisitor.h>
+
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/log/log.h>
+#include <vespa/documentapi/messagebus/messages/visitor.h>
+
+LOG_SETUP(".visitor.instance.countvisitor");
+
+namespace storage {
+
+CountVisitor::CountVisitor(StorageComponent& component,
+ const vdslib::Parameters& params)
+ : Visitor(component),
+ _doScheme(params.get("scheme") == "true"),
+ _doNamespace(params.get("namespace") == "true"),
+ _doUser(params.get("user") == "true"),
+ _doGroup(params.get("group") == "true")
+{
+}
+
+void
+CountVisitor::handleDocuments(const document::BucketId& /*bucketId*/,
+ std::vector<spi::DocEntry::LP>& entries,
+ HitCounter& hitCounter)
+{
+ for (size_t i = 0; i < entries.size(); ++i) {
+ const spi::DocEntry& entry(*entries[i]);
+ if (!entry.isRemove()) {
+ const document::Document* doc = entry.getDocument();
+
+ if (doc) {
+ const document::IdString& idString = doc->getId().getScheme();
+ hitCounter.addHit(doc->getId(), 0);
+
+ if (_doNamespace) {
+ _namespaceCount[idString.getNamespace()]++;
+ }
+
+ if (_doUser && idString.hasNumber()) {
+ _userCount[idString.getNumber()]++;
+ }
+
+ if (_doGroup && idString.hasGroup()) {
+ _groupCount[idString.getGroup()]++;
+ }
+
+ switch (idString.getType()) {
+ case document::IdString::DOC:
+ if (_doScheme) {
+ _schemeCount["doc"]++;
+ }
+ break;
+ case document::IdString::USERDOC:
+ if (_doScheme) {
+ _schemeCount["userdoc"]++;
+ }
+ break;
+ case document::IdString::GROUPDOC:
+ if (_doScheme) {
+ _schemeCount["groupdoc"]++;
+ }
+ break;
+ case document::IdString::ORDERDOC:
+ if (_doScheme) {
+ _schemeCount["orderdoc"]++;
+ }
+ break;
+ case document::IdString::ID:
+ if (_doScheme) {
+ _schemeCount["id"]++;
+ }
+ break;
+ case document::IdString::NULLID:
+ if (_doScheme) {
+ _schemeCount["null"]++;
+ }
+ break;
+ }
+ }
+ }
+ }
+}
+
+void CountVisitor::completedVisiting(HitCounter&) {
+ documentapi::MapVisitorMessage* cmd(new documentapi::MapVisitorMessage());
+
+ for (std::map<std::string, int>::iterator iter = _schemeCount.begin();
+ iter != _schemeCount.end();
+ iter++) {
+ cmd->getData().set(vespalib::make_string("scheme.%s", iter->first.c_str()), iter->second);
+ }
+
+ for (NamespaceCountMap::const_iterator iter = _namespaceCount.begin();
+ iter != _namespaceCount.end();
+ iter++) {
+ cmd->getData().set(vespalib::make_string("namespace.%s", iter->first.c_str()), iter->second);
+ }
+
+ for (GroupCountMap::const_iterator iter = _groupCount.begin();
+ iter != _groupCount.end();
+ iter++) {
+ cmd->getData().set(vespalib::make_string("group.%s", iter->first.c_str()), iter->second);
+ }
+
+ for (std::map<uint64_t, int>::iterator iter = _userCount.begin();
+ iter != _userCount.end();
+ iter++) {
+ cmd->getData().set(vespalib::make_string("user.%" PRIu64, iter->first), iter->second);
+ }
+
+ sendMessage(documentapi::DocumentMessage::UP(cmd));
+}
+
+}
diff --git a/storage/src/vespa/storage/visiting/countvisitor.h b/storage/src/vespa/storage/visiting/countvisitor.h
new file mode 100644
index 00000000000..c2b85c35419
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/countvisitor.h
@@ -0,0 +1,62 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::CountVisitor
+ * @ingroup visitors
+ *
+ * @brief A count visitor is a visitor that sends documentid statistics
+ * to the client.
+ *
+ */
+#pragma once
+
+#include <vespa/storage/visiting/visitor.h>
+
+namespace storage {
+
+class CountVisitor : public Visitor {
+public:
+ CountVisitor(StorageComponent&,
+ const vdslib::Parameters& params);
+
+ virtual void completedVisiting(HitCounter&);
+
+private:
+ void handleDocuments(const document::BucketId& bucketId,
+ std::vector<spi::DocEntry::LP>& entries,
+ HitCounter& hitCounter);
+
+ bool _doScheme;
+ std::map<std::string, int> _schemeCount;
+
+ bool _doNamespace;
+ typedef std::map<vespalib::string, int> NamespaceCountMap;
+ NamespaceCountMap _namespaceCount;
+
+ bool _doUser;
+ std::map<uint64_t, int> _userCount;
+
+ bool _doGroup;
+ typedef std::map<vespalib::string, int> GroupCountMap;
+ GroupCountMap _groupCount;
+};
+
+struct CountVisitorFactory : public VisitorFactory {
+
+ VisitorEnvironment::UP
+ makeVisitorEnvironment(StorageComponent&) {
+ return VisitorEnvironment::UP(new VisitorEnvironment);
+ };
+
+ Visitor*
+ makeVisitor(StorageComponent& c, VisitorEnvironment&,
+ const vdslib::Parameters& params)
+ {
+ return new CountVisitor(c, params);
+ }
+
+};
+
+}
+
+
+
diff --git a/storage/src/vespa/storage/visiting/dumpvisitor.cpp b/storage/src/vespa/storage/visiting/dumpvisitor.cpp
new file mode 100644
index 00000000000..8d94de010ae
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/dumpvisitor.cpp
@@ -0,0 +1,134 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/visiting/dumpvisitor.h>
+#include <vespa/documentapi/messagebus/messages/multioperationmessage.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/log/log.h>
+#include <vespa/vdslib/container/mutabledocumentlist.h>
+#include <vespa/vespalib/text/stringtokenizer.h>
+
+LOG_SETUP(".visitor.instance.dumpvisitor");
+
+namespace storage {
+
+DumpVisitor::DumpVisitor(StorageComponent& component,
+ const vdslib::Parameters& params)
+ : Visitor(component),
+ _keepTimeStamps(false)
+{
+ if (params.hasValue("requestfields")) {
+ std::string fields = params.get("requestfields");
+
+ _requestedFields.reset(new std::set<std::string>());
+ vespalib::StringTokenizer tokenizer(fields);
+ for (uint32_t i = 0; i < tokenizer.size(); i++) {
+ _requestedFields->insert(tokenizer[i]);
+ }
+ }
+
+ if (params.hasValue("requestdocuments")) {
+ std::string documents = params.get("requestdocuments");
+
+ _requestedDocuments.reset(new std::set<std::string>());
+ vespalib::StringTokenizer tokenizer(documents, " \t");
+ for (uint32_t i = 0; i < tokenizer.size(); i++) {
+ _requestedDocuments->insert(tokenizer[i]);
+ }
+ }
+
+ if (params.hasValue("keeptimestamps")) {
+ _keepTimeStamps = true;
+ }
+
+ LOG(debug, "Created DumpVisitor");
+}
+
+std::unique_ptr<documentapi::MultiOperationMessage>
+DumpVisitor::createMultiOperation(const document::BucketId& bucketId,
+ const std::vector<const document::Document*>& docs)
+{
+ for (int multiplier = 1; ; multiplier *= 2) {
+ std::vector<char> buffer(getDocBlockSize() * multiplier);
+ vdslib::MutableDocumentList newBlock(_component.getTypeRepo(),
+ &buffer[0], buffer.size(), false);
+ bool mustResizeBuffer = false;
+ for (uint32_t i = 0; i < docs.size(); i++) {
+ bool ok = newBlock.addPut(*docs[i], docs[i]->getLastModified());
+ if (!ok) {
+ mustResizeBuffer = true;
+ break;
+ }
+ }
+
+ if (!mustResizeBuffer) {
+ return std::unique_ptr<documentapi::MultiOperationMessage>(
+ new documentapi::MultiOperationMessage(bucketId, newBlock, _keepTimeStamps));
+ }
+ }
+ assert(false);
+ return std::unique_ptr<documentapi::MultiOperationMessage>();
+}
+
+void DumpVisitor::handleDocuments(const document::BucketId& bucketId,
+ std::vector<spi::DocEntry::LP>& entries,
+ HitCounter& hitCounter)
+{
+ LOG(debug, "Visitor %s handling block of %zu documents.",
+ _id.c_str(), entries.size());
+
+ std::unique_ptr<documentapi::MultiOperationMessage> cmd;
+ if (_requestedFields.get() || _requestedDocuments.get()) {
+ std::vector<const document::Document*> newDocuments;
+
+ // Remove all fields from the document that are not listed in
+ // requestedFields.
+ for (size_t i = 0; i < entries.size(); ++i) {
+ std::unique_ptr<document::Document> d(entries[i]->getDocument()->clone());
+
+ if (!_requestedDocuments.get()
+ || _requestedDocuments->find(d->getId().toString())
+ != _requestedDocuments->end())
+ {
+ if (_requestedFields.get()) {
+ for (document::Document::const_iterator docIter
+ = d->begin(); docIter != d->end(); docIter++)
+ {
+ if (_requestedFields->find(docIter.field().getName())
+ == _requestedFields->end())
+ {
+ d->remove(docIter.field());
+ }
+ }
+ }
+ newDocuments.push_back(d.release());
+ }
+ }
+
+ cmd = createMultiOperation(bucketId, newDocuments);
+
+ // FIXME: not exception safe
+ for (uint32_t i = 0; i < newDocuments.size(); i++) {
+ delete newDocuments[i];
+ }
+ } else {
+ std::vector<const document::Document*> docs;
+ docs.reserve(entries.size());
+ for (size_t i = 0; i < entries.size(); ++i) {
+ docs.push_back(entries[i]->getDocument());
+ assert(docs.back() != 0);
+ }
+ cmd = createMultiOperation(bucketId, docs);
+ }
+
+ for (vdslib::DocumentList::const_iterator iter
+ = cmd->getOperations().begin();
+ iter != cmd->getOperations().end(); iter++)
+ {
+ hitCounter.addHit(iter->getDocumentId(), iter->getSerializedSize());
+ }
+
+ sendMessage(documentapi::DocumentMessage::UP(cmd.release()));
+}
+
+}
diff --git a/storage/src/vespa/storage/visiting/dumpvisitor.h b/storage/src/vespa/storage/visiting/dumpvisitor.h
new file mode 100644
index 00000000000..c4572766a3e
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/dumpvisitor.h
@@ -0,0 +1,58 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::DumpVisitor
+ * @ingroup visitors
+ *
+ * @brief A dump visitor is a visitor that sends documents to the client.
+ *
+ */
+#pragma once
+
+#include <vespa/storage/visiting/visitor.h>
+
+namespace documentapi {
+class MultiOperationMessage;
+}
+
+namespace storage {
+
+class DumpVisitor : public Visitor {
+public:
+ DumpVisitor(StorageComponent& component, const vdslib::Parameters&);
+
+private:
+ std::unique_ptr<documentapi::MultiOperationMessage>
+ createMultiOperation(const document::BucketId& bucketId,
+ const std::vector<const document::Document*>& docs);
+
+ void handleDocuments(const document::BucketId& bucketId,
+ std::vector<spi::DocEntry::LP>& entries,
+ HitCounter& hitCounter);
+
+ std::unique_ptr<std::set<std::string> > _requestedFields;
+ std::unique_ptr<std::set<std::string> > _requestedDocuments;
+
+ bool _keepTimeStamps;
+};
+
+class DumpVisitorFactory : public VisitorFactory {
+public:
+ DumpVisitorFactory() {}
+
+ VisitorEnvironment::UP
+ makeVisitorEnvironment(StorageComponent&) {
+ return VisitorEnvironment::UP(new VisitorEnvironment);
+ };
+
+ storage::Visitor*
+ makeVisitor(StorageComponent& component, storage::VisitorEnvironment&,
+ const vdslib::Parameters& params)
+ {
+ return new DumpVisitor(component, params);
+ }
+};
+
+}
+
+
+
diff --git a/storage/src/vespa/storage/visiting/dumpvisitorsingle.cpp b/storage/src/vespa/storage/visiting/dumpvisitorsingle.cpp
new file mode 100644
index 00000000000..d323dcf20d5
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/dumpvisitorsingle.cpp
@@ -0,0 +1,47 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/visiting/dumpvisitorsingle.h>
+#include <vespa/documentapi/messagebus/messages/multioperationmessage.h>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/log/log.h>
+#include <vespa/vespalib/text/stringtokenizer.h>
+#include <vespa/documentapi/messagebus/messages/putdocumentmessage.h>
+#include <vespa/documentapi/messagebus/messages/removedocumentmessage.h>
+#include <memory>
+
+LOG_SETUP(".visitor.instance.dumpvisitorsingle");
+
+namespace storage {
+
+DumpVisitorSingle::DumpVisitorSingle(StorageComponent& component,
+ const vdslib::Parameters&)
+ : Visitor(component)
+{
+}
+
+void DumpVisitorSingle::handleDocuments(const document::BucketId& /*bucketId*/,
+ std::vector<spi::DocEntry::LP>& entries,
+ HitCounter& hitCounter)
+{
+ LOG(debug, "Visitor %s handling block of %zu documents.",
+ _id.c_str(), entries.size());
+
+ for (size_t i = 0; i < entries.size(); ++i) {
+ spi::DocEntry& entry(*entries[i]);
+ const uint32_t docSize = entry.getDocumentSize();
+ if (entry.isRemove()) {
+ hitCounter.addHit(*entry.getDocumentId(), docSize);
+ sendMessage(std::make_unique<documentapi::RemoveDocumentMessage>(
+ *entry.getDocumentId()));
+ } else {
+ hitCounter.addHit(*entry.getDocumentId(), docSize);
+ auto msg = std::make_unique<documentapi::PutDocumentMessage>(
+ entry.releaseDocument());
+ msg->setApproxSize(docSize);
+ sendMessage(std::move(msg));
+ }
+ }
+}
+
+}
diff --git a/storage/src/vespa/storage/visiting/dumpvisitorsingle.h b/storage/src/vespa/storage/visiting/dumpvisitorsingle.h
new file mode 100644
index 00000000000..f7b3fe66b88
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/dumpvisitorsingle.h
@@ -0,0 +1,45 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::DumpVisitorSingle
+ * @ingroup visitors
+ *
+ * @brief A dump visitor is a visitor that sends documents to the client.
+ * Each document is sent as a single message
+ *
+ */
+#pragma once
+
+#include <vespa/storage/visiting/visitor.h>
+
+namespace storage {
+
+class DumpVisitorSingle : public Visitor {
+public:
+ DumpVisitorSingle(StorageComponent&,
+ const vdslib::Parameters& params);
+
+private:
+ void handleDocuments(const document::BucketId&,
+ std::vector<spi::DocEntry::LP>&,
+ HitCounter&);
+};
+
+struct DumpVisitorSingleFactory : public VisitorFactory {
+
+ VisitorEnvironment::UP
+ makeVisitorEnvironment(StorageComponent&) {
+ return VisitorEnvironment::UP(new VisitorEnvironment);
+ };
+
+ Visitor*
+ makeVisitor(StorageComponent& c, VisitorEnvironment&,
+ const vdslib::Parameters& params)
+ {
+ return new DumpVisitorSingle(c, params);
+ }
+};
+
+}
+
+
+
diff --git a/storage/src/vespa/storage/visiting/memory_bounded_trace.cpp b/storage/src/vespa/storage/visiting/memory_bounded_trace.cpp
new file mode 100644
index 00000000000..f29c07fdc03
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/memory_bounded_trace.cpp
@@ -0,0 +1,71 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/visiting/memory_bounded_trace.h>
+#include <vespa/vespalib/util/stringfmt.h>
+
+namespace storage {
+
+MemoryBoundedTrace::MemoryBoundedTrace(size_t softMemoryUpperBound)
+ : _node(),
+ _currentMemoryUsed(0),
+ _omittedNodes(0),
+ _omittedBytes(0),
+ _softMemoryUpperBound(softMemoryUpperBound)
+{
+}
+
+namespace {
+
+size_t
+computeTraceTreeMemoryUsage(const mbus::TraceNode& node)
+{
+ if (node.isLeaf()) {
+ return node.getNote().size();
+ }
+ size_t childSum = 0;
+ const uint32_t childCount = node.getNumChildren();
+ for (uint32_t i = 0; i < childCount; ++i) {
+ childSum += computeTraceTreeMemoryUsage(node.getChild(i));
+ }
+ return childSum;
+}
+
+} // anon ns
+
+bool
+MemoryBoundedTrace::add(const mbus::TraceNode& node)
+{
+ const size_t nodeFootprint = computeTraceTreeMemoryUsage(node);
+
+ if (_currentMemoryUsed >= _softMemoryUpperBound) {
+ ++_omittedNodes;
+ _omittedBytes += nodeFootprint;
+ return false;
+ }
+ _node.addChild(node);
+ _currentMemoryUsed += nodeFootprint;
+ return true;
+}
+
+void
+MemoryBoundedTrace::moveTraceTo(mbus::TraceNode& out)
+{
+ if (_node.isEmpty()) {
+ return;
+ }
+ if (_omittedNodes > 0) {
+ _node.addChild(vespalib::make_string(
+ "Trace too large; omitted %zu subsequent trace trees "
+ "containing a total of %zu bytes",
+ _omittedNodes, _omittedBytes));
+ }
+ out.addChild(_node); // XXX rvalue support should be added to TraceNode.
+ _node.clear();
+ _currentMemoryUsed = 0;
+ _omittedNodes = 0;
+ _omittedBytes = 0;
+}
+
+} // storage
+
diff --git a/storage/src/vespa/storage/visiting/memory_bounded_trace.h b/storage/src/vespa/storage/visiting/memory_bounded_trace.h
new file mode 100644
index 00000000000..3ec0aff2ece
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/memory_bounded_trace.h
@@ -0,0 +1,51 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/messagebus/trace.h>
+#include <stdint.h>
+
+namespace storage {
+
+class MemoryBoundedTrace {
+public:
+ MemoryBoundedTrace(size_t softMemoryUpperBound);
+ /**
+ * Attempt to append the given trace node to the internal trace tree.
+ * If the amount of memory currently being used exceeds that of the upper
+ * bound used when constructing `this`, the node will not be added to
+ * the tree. Note that this only takes place on the granularity of full
+ * trees; either the entire trace tree given by `node` is added or nothing
+ * at all. This means it's possible to exceed the upper bound if the node
+ * is sufficiently large when added before memory has hit the limit; only
+ * subsequent adds will fail.
+ *
+ * Returns true if `node` was added to internal trace state, false
+ * otherwise.
+ */
+ bool add(const mbus::TraceNode& node);
+
+ /**
+ * Append current trace tree to the output trace node and clear internal
+ * tree in the process. In the case that at least 1 node has been
+ * omitted due to memory bounds being exceeded, the trace will contain a
+ * node at its end detailing the number of traces and bytes that have been
+ * omitted from the output.
+ *
+ * If current trace is empty, no nodes are added to `out`.
+ */
+ void moveTraceTo(mbus::TraceNode& out);
+
+ size_t getApproxMemoryUsed() const noexcept {
+ return _currentMemoryUsed;
+ }
+
+private:
+ mbus::TraceNode _node;
+ size_t _currentMemoryUsed;
+ size_t _omittedNodes;
+ size_t _omittedBytes;
+ size_t _softMemoryUpperBound;
+};
+
+} // storage
diff --git a/storage/src/vespa/storage/visiting/messagebusvisitormessagesession.h b/storage/src/vespa/storage/visiting/messagebusvisitormessagesession.h
new file mode 100644
index 00000000000..4cd6b25b1fb
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/messagebusvisitormessagesession.h
@@ -0,0 +1,59 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::MessageBusVisitorMessageSession
+ *
+ * \brief Implementation of a visitor message session using messagebus.
+ */
+#pragma once
+
+#include <vespa/messagebus/sourcesession.h>
+#include <vespa/storage/visiting/visitormessagesession.h>
+#include <vespa/storage/visiting/visitorthread.h>
+#include <vespa/storage/visiting/visitor.h>
+
+namespace documentapi {
+ class DocumentMessage;
+}
+
+namespace storage {
+
+class MessageBusVisitorMessageSession : public VisitorMessageSession,
+ public mbus::IReplyHandler
+{
+public:
+ typedef std::unique_ptr<MessageBusVisitorMessageSession> UP;
+
+ MessageBusVisitorMessageSession(Visitor& visitor, VisitorThread& thread)
+ : _visitor(visitor),
+ _visitorThread(thread)
+ {
+ }
+
+ void setSourceSession(mbus::SourceSession::UP sourceSession) {
+ _sourceSession = std::move(sourceSession);
+ }
+
+ virtual mbus::Result send(std::unique_ptr<documentapi::DocumentMessage> msg) {
+ msg->setRetryEnabled(false);
+ return _sourceSession->send(std::move(msg));
+ }
+
+ /**
+ @return Returns the number of pending messages this session has.
+ */
+ virtual uint32_t pending() {
+ return _sourceSession->getPendingCount();
+ }
+
+ virtual void handleReply(mbus::Reply::UP reply) {
+ _visitorThread.handleMessageBusReply(std::move(reply), _visitor);
+ }
+
+private:
+ Visitor& _visitor;
+ VisitorThread& _visitorThread;
+ mbus::SourceSession::UP _sourceSession;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/visiting/messages.h b/storage/src/vespa/storage/visiting/messages.h
new file mode 100644
index 00000000000..ab5628b6669
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/messages.h
@@ -0,0 +1,79 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * Messages used internally within visitor implementation. Sent from visitor
+ * manager to visitor threads, to avoid any locking issues generated by calling
+ * visitor threads directly.
+ */
+
+#pragma once
+
+#include <vespa/storageapi/message/internal.h>
+#include <vespa/storage/visiting/config-stor-visitor.h>
+#include <vespa/storageframework/storageframework.h>
+
+namespace storage {
+
+/**
+ * @class PropagateVisitorConfig
+ * @ingroup visiting
+ *
+ * @brief Used to propagate visitor config to visitor threads.
+ */
+class PropagateVisitorConfig : public api::InternalCommand {
+ vespa::config::content::core::StorVisitorConfig _config;
+public:
+ static const uint32_t ID = 3001;
+
+ PropagateVisitorConfig(const vespa::config::content::core::StorVisitorConfig& config)
+ : api::InternalCommand(ID),
+ _config(config) {}
+
+ std::unique_ptr<api::StorageReply> makeReply();
+
+ const vespa::config::content::core::StorVisitorConfig& getConfig() const { return _config; }
+
+ virtual void print(std::ostream& out, bool verbose, const std::string& indent) const
+ {
+ out << "PropagateVisitorConfig()";
+
+ if (verbose) {
+ out << " : ";
+ api::InternalCommand::print(out, true, indent);
+ }
+ }
+};
+
+/**
+ * @class PropagateVisitorConfigReply
+ * @ingroup visiting
+ */
+class PropagateVisitorConfigReply : public api::InternalReply {
+public:
+ static const int ID = 3002;
+
+ PropagateVisitorConfigReply(const PropagateVisitorConfig& cmd)
+ : api::InternalReply(ID, cmd)
+ {
+ }
+
+
+ virtual void print(std::ostream& out, bool verbose, const std::string& indent) const
+ {
+ out << "PropagateVisitorConfigReply()";
+
+ if (verbose) {
+ out << " : ";
+ api::InternalReply::print(out, true, indent);
+ }
+ }
+};
+
+inline std::unique_ptr<api::StorageReply>
+PropagateVisitorConfig::makeReply()
+{
+ return std::unique_ptr<api::StorageReply>(
+ new PropagateVisitorConfigReply(*this));
+}
+
+} // storage
+
diff --git a/storage/src/vespa/storage/visiting/recoveryvisitor.cpp b/storage/src/vespa/storage/visiting/recoveryvisitor.cpp
new file mode 100644
index 00000000000..f133b2a7c0c
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/recoveryvisitor.cpp
@@ -0,0 +1,106 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/visiting/recoveryvisitor.h>
+
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/log/log.h>
+#include <vespa/documentapi/messagebus/messages/visitor.h>
+#include <vespa/vespalib/text/stringtokenizer.h>
+
+LOG_SETUP(".visitor.instance.recoveryvisitor");
+
+namespace storage {
+
+RecoveryVisitor::RecoveryVisitor(StorageComponent& component,
+ const vdslib::Parameters& params)
+ : Visitor(component)
+{
+ if (params.hasValue("requestfields")) {
+ std::string fields = params.get("requestfields");
+
+ vespalib::StringTokenizer tokenizer(fields);
+ for (uint32_t i = 0; i < tokenizer.size(); i++) {
+ _requestedFields.insert(tokenizer[i]);
+ }
+ }
+
+
+ LOG(debug, "Created RecoveryVisitor with %d requested fields", (int)_requestedFields.size());
+}
+
+void
+RecoveryVisitor::handleDocuments(const document::BucketId& bid,
+ std::vector<spi::DocEntry::LP>& entries,
+ HitCounter& hitCounter)
+{
+ vespalib::LockGuard guard(_mutex);
+
+ LOG(debug, "Visitor %s handling block of %zu documents.",
+ _id.c_str(), entries.size());
+
+ documentapi::DocumentListMessage* cmd = NULL;
+
+ {
+ CommandMap::iterator iter = _activeCommands.find(bid);
+
+ if (iter == _activeCommands.end()) {
+ CommandPtr ptr(new documentapi::DocumentListMessage(bid));
+ cmd = ptr.get();
+ _activeCommands[bid] = ptr;
+ } else {
+ cmd = iter->second.get();
+ }
+ }
+
+ // Remove all fields from the document that are not listed in requestedFields.
+ for (size_t i = 0; i < entries.size(); ++i) {
+ const spi::DocEntry& entry(*entries[i]);
+ std::unique_ptr<document::Document> doc(entry.getDocument()->clone());
+ if (_requestedFields.empty()) {
+ doc->clear();
+ } else {
+ for (document::Document::const_iterator docIter = doc->begin();
+ docIter != doc->end();
+ docIter++) {
+ if (_requestedFields.find(docIter.field().getName())
+ == _requestedFields.end())
+ {
+ doc->remove(docIter.field());
+ }
+ }
+ }
+
+ hitCounter.addHit(doc->getId(), doc->serialize()->getLength());
+
+ int64_t timestamp = doc->getLastModified();
+ cmd->getDocuments().push_back(documentapi::DocumentListMessage::Entry(
+ timestamp,
+ document::Document::SP(doc.release()),
+ entry.isRemove()));
+ }
+}
+
+void RecoveryVisitor::completedBucket(const document::BucketId& bid, HitCounter&)
+{
+ documentapi::DocumentMessage::UP _msgToSend;
+
+ LOG(debug, "Finished bucket %s", bid.toString().c_str());
+
+ {
+ vespalib::LockGuard guard(_mutex);
+
+ CommandMap::iterator iter = _activeCommands.find(bid);
+
+ if (iter != _activeCommands.end()) {
+ _msgToSend.reset(iter->second.release());
+ _activeCommands.erase(iter);
+ }
+ }
+
+ if (_msgToSend.get()) {
+ sendMessage(std::move(_msgToSend));
+ }
+}
+
+}
diff --git a/storage/src/vespa/storage/visiting/recoveryvisitor.h b/storage/src/vespa/storage/visiting/recoveryvisitor.h
new file mode 100644
index 00000000000..b2771c2c64c
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/recoveryvisitor.h
@@ -0,0 +1,61 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::RecoveryVisitor
+ * @ingroup visitors
+ *
+ * @brief A recovery visitor is a visitor that sends messages with bucketid
+ * and a list of minimal documents to the client.
+ *
+ */
+#pragma once
+
+#include <vespa/storage/visiting/visitor.h>
+#include <vespa/storageapi/message/datagram.h>
+
+namespace documentapi {
+class DocumentListMessage;
+}
+
+namespace storage {
+
+class RecoveryVisitor : public Visitor {
+public:
+ RecoveryVisitor(StorageComponent&,
+ const vdslib::Parameters& params);
+
+private:
+ void handleDocuments(const document::BucketId& bucketId,
+ std::vector<spi::DocEntry::LP>& entries,
+ HitCounter& hitCounter);
+
+ void completedBucket(const document::BucketId&, HitCounter&);
+
+ std::set<std::string> _requestedFields;
+
+ typedef vespalib::LinkedPtr<documentapi::DocumentListMessage> CommandPtr;
+ typedef std::map<document::BucketId, CommandPtr> CommandMap;
+ CommandMap _activeCommands;
+
+ vespalib::Lock _mutex;
+};
+
+struct RecoveryVisitorFactory : public VisitorFactory {
+
+ VisitorEnvironment::UP
+ makeVisitorEnvironment(StorageComponent&) {
+ return VisitorEnvironment::UP(new VisitorEnvironment);
+ };
+
+ Visitor*
+ makeVisitor(StorageComponent& c, VisitorEnvironment&,
+ const vdslib::Parameters& params)
+ {
+ return new RecoveryVisitor(c, params);
+ }
+
+};
+
+}
+
+
+
diff --git a/storage/src/vespa/storage/visiting/stor-visitor.def b/storage/src/vespa/storage/visiting/stor-visitor.def
new file mode 100644
index 00000000000..99a415cc9e4
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/stor-visitor.def
@@ -0,0 +1,72 @@
+# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.content.core
+
+## Number of separate threads that runs visitors.
+visitorthreads int default=4 restart
+
+## Default timeout of visitors that loses contact with client (in seconds)
+disconnectedvisitortimeout int default=0 restart
+
+## Time period (in seconds) in which to ignore requests to visitors that doesnt
+## exist anymore. (Normal for visitors to get some messages right after
+## aborting, logging them as faults instead after this timeout has passed.)
+ignorenonexistingvisitortimelimit int default=300 restart
+
+## The number of buckets that are visited in parallel in a visitor visiting
+## multiple buckets. Default is 8, meaning if you send a create visitor to visit
+## 100 buckets, 8 of them will be visited in parallel.
+defaultparalleliterators int default=8
+
+## The number of iterators we send for each bucket being visited from visitor
+## thread. For streaming search we would likely want two or three. Since
+## supporting more than one is a new feature, default is still one.
+## (If you visit 8 buckets in parallel and have 2 iterators per bucket, this
+## will be 16 requests to persistence layer, but only 8 will be able to execute
+## at the same time, since only one operation can be executed at the same time
+## for one bucket)
+iterators_per_bucket int default=3
+
+## Default number of maximum client replies pending.
+defaultpendingmessages int default=8
+
+## Default size of docblocks used to transfer visitor data.
+defaultdocblocksize int default=4190208
+
+## Default docblock timeout in ms used to transfer visitor data.
+## Currently defaults to a day. This is to avoid slow visitor target problems,
+## getting data resent faster than it can process, and since there are very few
+## reasons to actually time out
+defaultdocblocktimeout int default=180000
+
+## Default timeout of visitor info messages: Progress and error reports.
+## If these time out, the visitor will be aborted on the storage node.
+defaultinfotimeout int default=60000
+
+## Max concurrent visitors (legacy)
+maxconcurrentvisitors int default=64
+
+## Priority-based max concurrent visitors. Fixed is the total number of
+## concurrent visitors that can run for any priorities. Variable
+## increases the concurrency limit for higher priorities, the limit
+## being linear with a messages priority. Example: if Fixed is 16
+## and Variable is 64, maxconcurrent for a pri 255 message is 16 and
+## maxconcurrent for a pri 0 message is 16+64=80.
+## If fixed is left as 0, it will take the value of maxconcurrentvisitors,
+## allowing backwards compatability
+maxconcurrentvisitors_fixed int default=16
+maxconcurrentvisitors_variable int default=64
+
+## Max size of visitor priority queue
+maxvisitorqueuesize int default=1024
+
+# Limit of memory used _per visitor_ in bytes.
+# Due to optimistic parallelization, it is possible for this limit to be
+# initially violated when the visitor is first started. This can happen since
+# the visitor does not know the size of the bucket contents before fetching
+# any data from it and it will do so based on parallelization factors specified
+# in the CreateVisitor command. If 3 buckets are initially visited in parallel
+# and these both contain a single 100 MiB document, the memory usage of the
+# visitor will peak at 300 MiB even if the configured limit is e.g. 20 MiB.
+# Default value is set to 20 MiB, which attempts to keep a reasonably safe
+# level in the face of a default number of max concurrent visitors (64).
+visitor_memory_usage_limit int default=25165824
diff --git a/storage/src/vespa/storage/visiting/testvisitor.cpp b/storage/src/vespa/storage/visiting/testvisitor.cpp
new file mode 100644
index 00000000000..5b02ac1271a
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/testvisitor.cpp
@@ -0,0 +1,84 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/visiting/testvisitor.h>
+
+#include <sstream>
+#include <vespa/document/fieldvalue/document.h>
+#include <vespa/log/log.h>
+#include <vespa/documentapi/messagebus/messages/visitor.h>
+#include <vespa/vdslib/container/parameters.h>
+
+
+LOG_SETUP(".visitor.instance.testvisitor");
+
+namespace storage {
+
+TestVisitor::TestVisitor(StorageComponent& c,
+ const vdslib::Parameters& params)
+ : Visitor(c),
+ _params()
+{
+ std::ostringstream ost;
+ for (vdslib::Parameters::ParametersMap::const_iterator
+ it(params.begin()), mt(params.end()); it != mt; ++it)
+ {
+ ost << "\n " << it->first << " = " << it->second.c_str();
+ }
+ _params = ost.str();
+ LOG(info, "Created TestVisitor: %s", _params.c_str());
+}
+
+void
+TestVisitor::startingVisitor(const std::vector<document::BucketId>& buckets)
+{
+ std::ostringstream ost;
+ ost << "Starting visitor with given parameters:" << _params << "\n"
+ << "Visiting the following bucket time intervals:\n";
+ for (uint32_t i=0, n=buckets.size(); i<n; ++i) {
+ ost << " " << buckets[i] << "\n";
+ }
+ LOG(info, "%s", ost.str().c_str());
+ report(ost.str());
+}
+
+void
+TestVisitor::handleDocuments(const document::BucketId& /*bucketId*/,
+ std::vector<spi::DocEntry::LP>& entries,
+ HitCounter& /*hitCounter*/)
+{
+ std::ostringstream ost;
+ ost << "Handling block of " << entries.size() << " documents.\n";
+ LOG(info, "%s", ost.str().c_str());
+ report(ost.str());
+}
+
+void TestVisitor::completedBucket(const document::BucketId& bucket, HitCounter&)
+{
+ std::ostringstream ost;
+ ost << "completedBucket(" << bucket.getId() << ")\n";
+ LOG(info, "%s", ost.str().c_str());
+ report(ost.str());
+}
+
+void TestVisitor::completedVisiting(HitCounter&)
+{
+ LOG(info, "completedVisiting()");
+ report("completedVisiting()\n");
+}
+
+void TestVisitor::abortedVisiting()
+{
+ LOG(info, "abortedVisiting()");
+ report("abortedVisiting()\n");
+}
+
+void TestVisitor::report(const std::string& message) {
+ // As we have no existing way of sending a single message back to the
+ // client, use a map visitor command
+ documentapi::MapVisitorMessage* cmd = new documentapi::MapVisitorMessage();
+ cmd->getData().set("msg", message);
+ sendMessage(documentapi::DocumentMessage::UP(cmd));
+}
+
+}
diff --git a/storage/src/vespa/storage/visiting/testvisitor.h b/storage/src/vespa/storage/visiting/testvisitor.h
new file mode 100644
index 00000000000..5b32ec62906
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/testvisitor.h
@@ -0,0 +1,60 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::TestVisitor
+ * @ingroup visitors
+ *
+ * @brief A visitor used purely for testing the visitor framework.
+ *
+ */
+#pragma once
+
+#include <vespa/storage/visiting/visitor.h>
+
+namespace storage {
+
+class TestVisitor : public Visitor {
+public:
+ TestVisitor(StorageComponent&, const vdslib::Parameters&);
+
+private:
+ void startingVisitor(const std::vector<document::BucketId>& buckets);
+
+ void handleDocuments(const document::BucketId& bucketId,
+ std::vector<spi::DocEntry::LP>& entries,
+ HitCounter& hitCounter);
+
+ void completedBucket(const document::BucketId& bucket, HitCounter& hitCounter);
+
+ spi::ReadConsistency getRequiredReadConsistency() const override {
+ return spi::ReadConsistency::WEAK;
+ }
+
+ void completedVisiting(HitCounter& hitCounter);
+
+ void abortedVisiting();
+
+ // Send datagram with message back to client
+ void report(const std::string& message);
+
+ std::string _params;
+};
+
+struct TestVisitorFactory : public VisitorFactory {
+
+ VisitorEnvironment::UP
+ makeVisitorEnvironment(StorageComponent&) {
+ return VisitorEnvironment::UP(new VisitorEnvironment);
+ };
+
+ Visitor*
+ makeVisitor(StorageComponent& c, VisitorEnvironment&,
+ const vdslib::Parameters& params) {
+ return new TestVisitor(c, params);
+ }
+
+};
+
+}
+
+
+
diff --git a/storage/src/vespa/storage/visiting/visitor.cpp b/storage/src/vespa/storage/visiting/visitor.cpp
new file mode 100644
index 00000000000..9639c0768dd
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/visitor.cpp
@@ -0,0 +1,1295 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/visiting/visitor.h>
+
+#include <vespa/log/log.h>
+#include <vespa/storageapi/message/datagram.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/storage/visiting/visitormetrics.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/documentapi/messagebus/messages/documentmessage.h>
+#include <vespa/documentapi/messagebus/messages/visitor.h>
+#include <ostream>
+#include <sstream>
+#include <unordered_map>
+
+LOG_SETUP(".visitor.instance");
+
+namespace storage {
+
+Visitor::HitCounter::HitCounter(const document::OrderingSpecification* ordering)
+ : _firstPassHits(0),
+ _firstPassBytes(0),
+ _secondPassHits(0),
+ _secondPassBytes(0),
+ _ordering(ordering)
+{
+}
+
+void
+Visitor::HitCounter::addHit(const document::DocumentId& hit, uint32_t size)
+{
+ bool firstPass = false;
+
+ if (_ordering && _ordering->getWidthBits() > 0
+ && hit.getScheme().getType() == document::IdString::ORDERDOC)
+ {
+ const document::OrderDocIdString& order(
+ static_cast<const document::OrderDocIdString&>(hit.getScheme()));
+
+ int32_t width = (int32_t)pow(2, order.getWidthBits());
+ int32_t division = (int32_t)pow(2, order.getDivisionBits());
+
+ if (_ordering->getOrder() == document::OrderingSpecification::ASCENDING) {
+ uint64_t upperLimit = UINT64_MAX;
+ if (_ordering->getOrderingStart() < upperLimit - (width - division)) {
+ upperLimit = _ordering->getOrderingStart() + width - division;
+ }
+ if (order.getOrdering() >= _ordering->getOrderingStart() &&
+ order.getOrdering() <= upperLimit) {
+ firstPass = true;
+ /*std::cerr << "First pass because ordering (+) "
+ << order.getOrdering() << " is between "
+ << _ordering->getOrderingStart()
+ << " and " << upperLimit << "\n";*/
+ } else {
+ /*std::cerr << "Not first pass because ordering (+) "
+ << order.getOrdering() << " is not between "
+ << _ordering->getOrderingStart()
+ << " and " << upperLimit << "\n";*/
+ }
+ } else {
+ uint64_t lowerLimit = 0;
+ if (_ordering->getOrderingStart() > (uint64_t)(width - division)) {
+ lowerLimit = _ordering->getOrderingStart() - (width - division);
+ }
+ if (order.getOrdering() <= _ordering->getOrderingStart() &&
+ order.getOrdering() >= lowerLimit) {
+ firstPass = true;
+ /*std::cerr << "First pass because ordering (-) "
+ << order.getOrdering() << " is between "
+ << lowerLimit << " and "
+ << _ordering->getOrderingStart() << "\n";*/
+ } else {
+ /*std::cerr << "Not first pass because ordering (-) "
+ << order.getOrdering() << " is not between "
+ << lowerLimit << " and "
+ << _ordering->getOrderingStart() << "\n";*/
+ }
+ }
+ } else {
+// std::cerr << "Not counting first pass: " << _ordering->getWidthBits() << "\n";
+ }
+
+ if (firstPass) {
+ _firstPassHits++;
+ _firstPassBytes += size;
+ } else {
+ _secondPassHits++;
+ _secondPassBytes += size;
+ }
+
+}
+
+void
+Visitor::HitCounter::updateVisitorStatistics(
+ vdslib::VisitorStatistics& statistics)
+{
+ statistics.setDocumentsReturned(
+ statistics.getDocumentsReturned() + _firstPassHits);
+ statistics.setBytesReturned(
+ statistics.getBytesReturned() + _firstPassBytes);
+ statistics.setSecondPassDocumentsReturned(
+ statistics.getSecondPassDocumentsReturned() + _secondPassHits);
+ statistics.setSecondPassBytesReturned(
+ statistics.getSecondPassBytesReturned() + _secondPassBytes);
+}
+
+Visitor::VisitorTarget::MessageMeta::MessageMeta(
+ uint64_t msgId,
+ std::unique_ptr<documentapi::DocumentMessage> msg)
+ : messageId(msgId),
+ retryCount(0),
+ memoryUsage(msg->getApproxSize()),
+ message(std::move(msg)),
+ messageText(message->toString())
+{
+}
+
+Visitor::VisitorTarget::MessageMeta::MessageMeta(
+ Visitor::VisitorTarget::MessageMeta&& rhs) noexcept
+ : messageId(rhs.messageId),
+ retryCount(rhs.retryCount),
+ memoryUsage(rhs.memoryUsage),
+ message(std::move(rhs.message)),
+ messageText(std::move(rhs.messageText))
+{
+}
+
+Visitor::VisitorTarget::MessageMeta::~MessageMeta()
+{
+}
+
+Visitor::VisitorTarget::MessageMeta&
+Visitor::VisitorTarget::MessageMeta::operator=(
+ Visitor::VisitorTarget::MessageMeta&& rhs) noexcept
+{
+ messageId = rhs.messageId;
+ retryCount = rhs.retryCount;
+ memoryUsage = rhs.memoryUsage;
+ message = std::move(rhs.message);
+ messageText = std::move(rhs.messageText);
+ return *this;
+}
+
+Visitor::VisitorTarget::MessageMeta&
+Visitor::VisitorTarget::insertMessage(
+ std::unique_ptr<documentapi::DocumentMessage> msg)
+{
+ ++_pendingMessageId;
+ const uint64_t id = _pendingMessageId;
+ MessageMeta value(id, std::move(msg));
+ _memoryUsage += value.memoryUsage;
+ auto inserted = _messageMeta.insert(std::make_pair(id, std::move(value)));
+ assert(inserted.second);
+ return inserted.first->second;
+}
+
+Visitor::VisitorTarget::MessageMeta
+Visitor::VisitorTarget::releaseMetaForMessageId(uint64_t msgId)
+{
+ auto iter = _messageMeta.find(msgId);
+ assert(iter != _messageMeta.end());
+ MessageMeta meta = std::move(iter->second);
+ assert(_memoryUsage >= meta.memoryUsage);
+ _memoryUsage -= meta.memoryUsage;
+ _messageMeta.erase(iter);
+ return meta;
+}
+
+void
+Visitor::VisitorTarget::reinsertMeta(MessageMeta meta)
+{
+ _memoryUsage += meta.memoryUsage;
+ auto inserted = _messageMeta.insert(
+ std::make_pair(meta.messageId, std::move(meta)));
+ (void) inserted;
+ assert(inserted.second);
+}
+
+Visitor::VisitorTarget::MessageMeta&
+Visitor::VisitorTarget::metaForMessageId(uint64_t msgId)
+{
+ return _messageMeta.find(msgId)->second;
+}
+
+void
+Visitor::VisitorTarget::discardQueuedMessages()
+{
+ for (MessageQueue::iterator
+ it(_queuedMessages.begin()), e(_queuedMessages.end());
+ it != e; ++it)
+ {
+ LOG(spam, "Erasing queued message with id %zu", it->second);
+ releaseMetaForMessageId(it->second);
+ }
+ _queuedMessages.clear();
+}
+
+Visitor::BucketIterationState::~BucketIterationState()
+{
+ if (_iteratorId != 0) {
+ // Making the assumption that this is effectively nothrow.
+ std::shared_ptr<DestroyIteratorCommand> cmd(
+ new DestroyIteratorCommand(_iteratorId));
+ cmd->setLoadType(_visitor._initiatingCmd->getLoadType());
+ cmd->getTrace().setLevel(_visitor._traceLevel);
+ cmd->setPriority(0);
+
+ LOG(debug, "Visitor '%s' sending DestroyIteratorCommand for %s, "
+ "iterator id %zu.",
+ _visitor._id.c_str(),
+ _bucketId.toString().c_str(),
+ uint64_t(_iteratorId));
+ _messageHandler.send(cmd, _visitor);
+ }
+}
+
+Visitor::VisitorOptions::VisitorOptions()
+ : _fromTime(0),
+ _toTime(framework::MicroSecTime::max()),
+ _maxParallel(1),
+ _maxParallelOneBucket(2),
+ _maxPending(1),
+ _fieldSet("[all]"),
+ _visitRemoves(false)
+{
+}
+
+Visitor::VisitorTarget::VisitorTarget()
+ : _pendingMessageId(0),
+ _memoryUsage(0)
+{
+}
+
+Visitor::Visitor(StorageComponent& component)
+ : _component(component),
+ _visitorOptions(),
+ _visitorTarget(),
+ _state(STATE_NOT_STARTED),
+ _buckets(),
+ _currentBucket(),
+ _bucketStates(),
+ _calledStartingVisitor(false),
+ _calledCompletedVisitor(false),
+ _startTime(_component.getClock().getTimeInMicros()),
+ _hasSentReply(false),
+ _docBlockSize(1024),
+ _memoryUsageLimit(UINT32_MAX),
+ _docBlockTimeout(180 * 1000),
+ _visitorInfoTimeout(60 * 1000),
+ _serialNumber(0),
+ _traceLevel(0),
+ _ownNodeIndex(0xffff),
+ _visitorCmdId(0),
+ _visitorId(0),
+ _priority(api::StorageMessage::NORMAL),
+ _result(api::ReturnCode::OK),
+ _trace(DEFAULT_TRACE_MEMORY_LIMIT),
+ _messageHandler(0),
+ _id(),
+ _controlDestination(),
+ _dataDestination(),
+ _documentSelection(),
+ _memoryManager(0)
+{
+}
+
+Visitor::~Visitor()
+{
+ assert(_bucketStates.empty());
+}
+
+void
+Visitor::sendMessage(documentapi::DocumentMessage::UP cmd)
+{
+ assert(cmd.get());
+ if (!isRunning()) return;
+ cmd->setRoute(_dataDestination->getRoute());
+
+ cmd->setPriority(_documentPriority);
+ cmd->setLoadType(_initiatingCmd->getLoadType());
+
+ framework::MicroSecTime time(_component.getClock().getTimeInMicros());
+
+ if (time + _docBlockTimeout.getMicros() > _timeToDie) {
+ cmd->setTimeRemaining((_timeToDie > time)
+ ? (_timeToDie - time).getMillis().getTime()
+ : 0);
+ } else {
+ cmd->setTimeRemaining(_docBlockTimeout.getTime());
+ }
+ cmd->getTrace().setLevel(_traceLevel);
+
+ auto& msgMeta = _visitorTarget.insertMessage(std::move(cmd));
+ sendDocumentApiMessage(msgMeta);
+}
+
+void
+Visitor::sendDocumentApiMessage(VisitorTarget::MessageMeta& msgMeta) {
+ documentapi::DocumentMessage& cmd(*msgMeta.message);
+ // Just enqueue if it's not time to send this message yet
+ if (_messageSession->pending() >= _visitorOptions._maxPending
+ && cmd.getType() != documentapi::DocumentProtocol::MESSAGE_VISITORINFO)
+ {
+ MBUS_TRACE(cmd.getTrace(), 5, vespalib::make_string(
+ "Enqueueing message because the visitor already "
+ "had %d pending messages",
+ _visitorOptions._maxPending));
+
+ LOG(spam,
+ "Visitor '%s' enqueueing message with id %zu",
+ _id.c_str(),
+ msgMeta.messageId);
+ _visitorTarget._queuedMessages.insert(std::make_pair(
+ framework::MicroSecTime(0), msgMeta.messageId));
+ } else {
+ LOG(spam,
+ "Visitor '%s' immediately sending message '%s' with id %zu",
+ _id.c_str(),
+ cmd.toString().c_str(),
+ msgMeta.messageId);
+ cmd.setContext(msgMeta.messageId);
+ mbus::Result res(_messageSession->send(std::move(msgMeta.message)));
+ if (res.isAccepted()) {
+ _visitorTarget._pendingMessages.insert(msgMeta.messageId);
+ } else {
+ LOG(warning,
+ "Visitor '%s' failed to send DocumentAPI message: %s",
+ _id.c_str(),
+ res.getError().toString().c_str());
+ api::ReturnCode returnCode(
+ static_cast<api::ReturnCode::Result>(
+ res.getError().getCode()),
+ res.getError().getMessage());
+ fail(returnCode, true);
+ close();
+ }
+ }
+}
+
+void
+Visitor::sendInfoMessage(documentapi::VisitorInfoMessage::UP cmd)
+{
+ assert(cmd.get());
+ if (!isRunning()) return;
+
+ if (_controlDestination->toString().length()) {
+ cmd->setRoute(_controlDestination->getRoute());
+ cmd->setPriority(_documentPriority);
+ cmd->setTimeRemaining(_visitorInfoTimeout.getTime());
+ auto& msgMeta = _visitorTarget.insertMessage(std::move(cmd));
+ sendDocumentApiMessage(msgMeta);
+ }
+}
+
+void
+Visitor::close()
+{
+ if (_state != STATE_COMPLETED) {
+ transitionTo(STATE_CLOSING);
+ }
+ sendReplyOnce();
+}
+
+const char*
+Visitor::getStateName(VisitorState s)
+{
+ switch (s) {
+ case STATE_NOT_STARTED:
+ return "NOT_STARTED";
+ case STATE_RUNNING:
+ return "RUNNING";
+ case STATE_CLOSING:
+ return "CLOSING";
+ case STATE_COMPLETED:
+ return "COMPLETED";
+ default:
+ assert(!"Unknown visitor state");
+ return NULL;
+ }
+}
+
+Visitor::VisitorState
+Visitor::transitionTo(VisitorState newState)
+{
+ LOG(debug, "Visitor '%s' state transition %s -> %s",
+ _id.c_str(),
+ getStateName(_state),
+ getStateName(newState));
+ VisitorState oldState = _state;
+ _state = newState;
+ return oldState;
+}
+
+bool
+Visitor::mayTransitionToCompleted() const
+{
+ return (!isRunning()
+ && !hasPendingIterators()
+ && _visitorTarget._pendingMessages.empty()
+ && _visitorTarget._queuedMessages.empty()
+ && _messageSession->pending() == 0);
+}
+
+void
+Visitor::forceClose()
+{
+ for (std::list<BucketIterationState*>::iterator it = _bucketStates.begin();
+ it != _bucketStates.end(); ++it)
+ {
+ // Reset iterator id so no destroy iterator will be sent
+ (*it)->setIteratorId(spi::IteratorId(0));
+ delete *it;
+ }
+ _bucketStates.clear();
+ transitionTo(STATE_COMPLETED);
+}
+
+void
+Visitor::sendReplyOnce()
+{
+ assert(_initiatingCmd.get());
+ if (!_hasSentReply) {
+ std::shared_ptr<api::StorageReply> reply(
+ _initiatingCmd->makeReply().release());
+
+ _hitCounter->updateVisitorStatistics(_visitorStatistics);
+ static_cast<api::CreateVisitorReply*>(reply.get())
+ ->setVisitorStatistics(_visitorStatistics);
+ if (shouldAddMbusTrace()) {
+ _trace.moveTraceTo(reply->getTrace().getRoot());
+ }
+ reply->setResult(_result);
+ LOG(debug, "Sending %s", reply->toString(true).c_str());
+ _messageHandler->send(reply);
+ _hasSentReply = true;
+ }
+}
+
+void
+Visitor::finalize()
+{
+ if (_state != STATE_COMPLETED) {
+ LOG(error, "Attempting to finalize non-completed visitor %s",
+ _id.c_str());
+ assert(false);
+ }
+ assert(_bucketStates.empty());
+
+ if (_result.success()) {
+ if (_messageSession->pending() > 0)
+ {
+ _result = api::ReturnCode(api::ReturnCode::ABORTED);
+ try{
+ abortedVisiting();
+ } catch (std::exception& e) {
+ LOG(warning, "Visitor %s had a problem in abortVisiting(). As "
+ "visitor is already complete, this has been ignored: %s",
+ _id.c_str(), e.what());
+ }
+ }
+ }
+ sendReplyOnce();
+ _initiatingCmd.reset();
+}
+
+/**
+ * If a bucket state has no pending iterators or control commands,
+ * we can safely discard it when a visitor fails. No need to push
+ * more traffic to the persistence layer.
+ */
+void
+Visitor::discardAllNoPendingBucketStates()
+{
+ for (BucketStateList::iterator
+ it(_bucketStates.begin()), e(_bucketStates.end());
+ it != e;)
+ {
+ BucketIterationState& bstate(**it);
+ if (bstate.hasPendingControlCommand() || bstate.hasPendingIterators()) {
+ LOG(debug,
+ "Visitor '%s' not discarding bucket state %s "
+ "since it has pending operations",
+ _id.c_str(),
+ bstate.toString().c_str());
+ ++it;
+ continue;
+ }
+ LOG(debug, "Visitor '%s' discarding bucket state %s",
+ _id.c_str(), bstate.toString().c_str());
+ delete *it;
+ it = _bucketStates.erase(it);
+ }
+}
+
+void
+Visitor::fail(const api::ReturnCode& reason,
+ bool overrideExistingError)
+{
+ assert(_state != STATE_COMPLETED);
+ if (_result.getResult() < reason.getResult() || overrideExistingError) {
+ LOG(debug, "Setting result of visitor '%s' to %s",
+ _id.c_str(), reason.toString().c_str());
+ _result = reason;
+ }
+ if (_visitorTarget.hasQueuedMessages()) {
+ LOG(debug, "Visitor '%s' dropping %zu queued messages bound to %s "
+ "since visitor has failed",
+ _id.c_str(),
+ _visitorTarget._queuedMessages.size(),
+ _controlDestination->toString().c_str());
+ _visitorTarget.discardQueuedMessages();
+ }
+ discardAllNoPendingBucketStates();
+ transitionTo(STATE_CLOSING);
+}
+
+bool
+Visitor::shouldReportProblemToClient(const api::ReturnCode& code,
+ size_t retryCount) const
+{
+ // Report _once_ per message if we reach a certain retry threshold.
+ if (retryCount == TRANSIENT_ERROR_RETRIES_BEFORE_NOTIFY) {
+ return true;
+ }
+ return !(code.isBucketDisappearance()
+ || code.isBusy()
+ || code == api::ReturnCode::WRONG_DISTRIBUTION);
+}
+
+void
+Visitor::reportProblem(const std::string& problem)
+{
+ framework::MicroSecTime time(_component.getClock().getTimeInMicros());
+ std::map<std::string, framework::MicroSecTime>::iterator it(
+ _recentlySentErrorMessages.find(problem));
+ // Ignore errors already reported last minute
+ if (it != _recentlySentErrorMessages.end() &&
+ it->second + framework::MicroSecTime(60*1000*1000) > time)
+ {
+ return;
+ }
+ LOG(debug, "Visitor '%s' sending VisitorInfo with message \"%s\" to %s",
+ _id.c_str(),
+ problem.c_str(),
+ _controlDestination->toString().c_str());
+ _recentlySentErrorMessages[problem] = time;
+ documentapi::VisitorInfoMessage::UP cmd(
+ new documentapi::VisitorInfoMessage());
+ cmd->setErrorMessage(problem);
+ sendInfoMessage(std::move(cmd));
+
+ // Clear list if it grows too large
+ if (_recentlySentErrorMessages.size() > 40) {
+ _recentlySentErrorMessages.clear();
+ }
+}
+
+void
+Visitor::reportProblem(const api::ReturnCode& problemCode)
+{
+ std::ostringstream os;
+ os << "[From content node " << _ownNodeIndex << "] ";
+ os << api::ReturnCode::getResultString(problemCode.getResult())
+ << ": " << problemCode.getMessage();
+ reportProblem(os.str());
+}
+
+void
+Visitor::start(api::VisitorId id, api::StorageMessage::Id cmdId,
+ const std::string& name,
+ const std::vector<document::BucketId>& buckets,
+ framework::MicroSecTime fromTimestamp,
+ framework::MicroSecTime toTimestamp,
+ std::unique_ptr<document::select::Node> docSelection,
+ const std::string& docSelectionString,
+ std::unique_ptr<document::OrderingSpecification> ordering,
+ VisitorMessageHandler& handler,
+ VisitorMessageSession::UP messageSession,
+ documentapi::Priority::Value documentPriority)
+{
+ assert(_state == STATE_NOT_STARTED);
+ _visitorId = id;
+ _visitorCmdId = cmdId;
+ _id = name;
+ _messageHandler = &handler;
+ _ordering = std::move(ordering);
+ _documentSelection.reset(docSelection.release());
+ _documentSelectionString = docSelectionString;
+ _buckets = buckets;
+ _visitorOptions._fromTime = fromTimestamp;
+ _visitorOptions._toTime = toTimestamp;
+ _currentBucket = 0;
+ _hitCounter.reset(new HitCounter(_ordering.get()));
+ _messageSession = std::move(messageSession);
+ _documentPriority = documentPriority;
+
+ _state = STATE_RUNNING;
+ if (_memoryAllocType == 0) {
+ _memoryAllocType = &_component.getMemoryManager()
+ .getAllocationType("VISITOR_BUFFER");
+ }
+
+ LOG(debug, "Starting visitor '%s' for %" PRIu64 " buckets from %" PRIu64 " to "
+ "%" PRIu64 ". First is %s. Max pending replies: %u, include "
+ "removes: %s, field set: %s.",
+ _id.c_str(),
+ _buckets.size(),
+ _visitorOptions._fromTime.getTime(),
+ _visitorOptions._toTime.getTime(),
+ (buckets.size() > 0 ? _buckets[0].toString().c_str() : ""),
+ _visitorOptions._maxPending,
+ (_visitorOptions._visitRemoves ? "true" : "false"),
+ _visitorOptions._fieldSet.c_str());
+}
+
+void
+Visitor::attach(std::shared_ptr<api::StorageCommand> initiatingCmd,
+ const api::StorageMessageAddress& controlAddress,
+ const api::StorageMessageAddress& dataAddress,
+ framework::MilliSecTime timeout)
+{
+ _priority = initiatingCmd->getPriority();
+ _timeToDie = _component.getClock().getTimeInMicros() + timeout.getMicros();
+ if (_initiatingCmd.get()) {
+ std::shared_ptr<api::StorageReply> reply(
+ _initiatingCmd->makeReply().release());
+ reply->setResult(api::ReturnCode::ABORTED);
+ _messageHandler->send(reply);
+ }
+ _initiatingCmd = initiatingCmd;
+ _traceLevel = _initiatingCmd->getTrace().getLevel();
+ {
+ // Set new address
+ _controlDestination.reset(
+ new api::StorageMessageAddress(controlAddress));
+ _dataDestination.reset(new api::StorageMessageAddress(dataAddress));
+ }
+ LOG(debug, "Visitor '%s' has control destination %s and data "
+ "destination %s.",
+ _id.c_str(), _controlDestination->toString().c_str(),
+ _dataDestination->toString().c_str());
+ if (!_calledStartingVisitor) {
+ _calledStartingVisitor = true;
+ try{
+ startingVisitor(_buckets);
+ } catch (std::exception& e) {
+ std::ostringstream ost;
+ ost << "Failed to start visitor: " << e.what();
+ fail(api::ReturnCode(api::ReturnCode::ABORTED, ost.str()));
+ return;
+ }
+ }
+
+ // In case there was no messages to resend we need to call
+ // continueVisitor to provoke it to resume.
+ for (uint32_t i=0; i<_visitorOptions._maxParallelOneBucket; ++i) {
+ if (!continueVisitor()) return;
+ }
+}
+
+void
+Visitor::handleDocumentApiReply(mbus::Reply::UP reply,
+ VisitorThreadMetrics& metrics)
+{
+ if (shouldAddMbusTrace()) {
+ _trace.add(reply->getTrace().getRoot());
+ }
+
+ mbus::Message::UP message = reply->getMessage();
+ uint64_t messageId = reply->getContext().value.UINT64;
+ uint32_t removed = _visitorTarget._pendingMessages.erase(messageId);
+
+ LOG(spam, "Visitor '%s' reply %s for message ID %" PRIu64, _id.c_str(),
+ reply->toString().c_str(), messageId);
+
+ assert(removed == 1);
+ (void) removed;
+ // Always remove message from target mapping. We will reinsert it if the
+ // message needs to be retried.
+ auto meta = _visitorTarget.releaseMetaForMessageId(messageId);
+
+ if (!reply->hasErrors()) {
+ metrics.averageMessageSendTime[getLoadType()].addValue(
+ (message->getTimeRemaining() - message->getTimeRemainingNow()) / 1000.0);
+ LOG(debug, "Visitor '%s' reply %s for message ID %" PRIu64 " was OK", _id.c_str(),
+ reply->toString().c_str(), messageId);
+
+ continueVisitor();
+ return;
+ }
+
+ metrics.visitorDestinationFailureReplies[getLoadType()].inc();
+
+ if (message->getType() == documentapi::DocumentProtocol::MESSAGE_VISITORINFO) {
+ LOG(debug, "Aborting visitor as we failed to talk to "
+ "controller: %s",
+ reply->getError(0).toString().c_str());
+ api::ReturnCode returnCode(
+ static_cast<api::ReturnCode::Result>(
+ reply->getError(0).getCode()),
+ reply->getError(0).getMessage());
+ fail(returnCode, true);
+ close();
+ return;
+ }
+
+ api::ReturnCode returnCode(
+ static_cast<api::ReturnCode::Result>(reply->getError(0).getCode()),
+ reply->getError(0).getMessage());
+ if (returnCode.isCriticalForVisitor()) {
+ // Abort - something is wrong with target.
+ fail(returnCode, true);
+ close();
+ return;
+ }
+
+ if (failed()) {
+ LOG(debug, "Failed to send message from visitor '%s', due to "
+ "%s. Not resending since visitor has failed",
+ _id.c_str(), returnCode.toString().c_str());
+ return;
+ }
+ assert(!meta.message);
+ meta.message.reset(
+ static_cast<documentapi::DocumentMessage*>(message.release()));
+ meta.retryCount++;
+ const size_t retryCount = meta.retryCount;
+
+ // Tag time for later resending. nextSendAttemptTime != 0 indicates
+ // that the message is not pending, but should be sent later.
+ framework::MicroSecTime delay(
+ (1 << std::min(12u, meta.retryCount)) * 10000);
+
+ _visitorTarget.reinsertMeta(std::move(meta));
+ _visitorTarget._queuedMessages.insert(
+ std::make_pair(_component.getClock().getTimeInMicros() + delay,
+ messageId));
+ if (shouldReportProblemToClient(returnCode, retryCount)) {
+ reportProblem(returnCode);
+ }
+
+ // Creates delay in the following fashion based on retry count.
+ // Max delay is then 40 seconds. At which time, retrying should not
+ // use up that much resources.
+ // 20, 40, 80, 160, 320, 640, 1280, 2560, 5120, 10240, 20480, 40960
+ LOG(debug, "Failed to send message from visitor '%s', due to "
+ "%s. Resending in %" PRIu64 " ms",
+ _id.c_str(), returnCode.toString().c_str(),
+ delay.getMillis().getTime());
+}
+
+void
+Visitor::onCreateIteratorReply(
+ const std::shared_ptr<CreateIteratorReply>& reply,
+ VisitorThreadMetrics& /*metrics*/)
+{
+ std::list<BucketIterationState*>::reverse_iterator it = _bucketStates.rbegin();
+
+ document::BucketId bucketId(reply->getBucketId());
+ for (; it != _bucketStates.rend(); ++it) {
+ if ((*it)->getBucketId() == bucketId) {
+ break;
+ }
+ }
+ assert(it != _bucketStates.rend());
+ BucketIterationState& bucketState(**it);
+
+ if (reply->getResult().failed()) {
+ LOG(debug, "Failed to create iterator for bucket %s: %s",
+ bucketId.toString().c_str(),
+ reply->getResult().toString().c_str());
+ fail(reply->getResult());
+ delete *it;
+ _bucketStates.erase((++it).base());
+ return;
+ }
+ bucketState.setIteratorId(reply->getIteratorId());
+ if (failed()) {
+ LOG(debug, "Create iterator for bucket %s is OK, "
+ "but visitor has failed: %s",
+ bucketId.toString().c_str(),
+ _result.toString().c_str());
+ delete *it;
+ _bucketStates.erase((++it).base());
+ return;
+ }
+
+ LOG(debug, "Visitor '%s' starting to visit bucket %s.",
+ _id.c_str(), bucketId.toString().c_str());
+ framework::MemoryToken::UP token(
+ _memoryManager->allocate(
+ *_memoryAllocType, _docBlockSize, _docBlockSize, _priority));
+ if (token.get() == 0) {
+ // Not enough memory
+ return;
+ }
+ std::shared_ptr<GetIterCommand> cmd(
+ new GetIterCommand(std::move(token), bucketId,
+ bucketState.getIteratorId(),
+ _docBlockSize));
+ cmd->setLoadType(_initiatingCmd->getLoadType());
+ cmd->getTrace().setLevel(_traceLevel);
+ cmd->setPriority(_priority);
+ ++bucketState._pendingIterators;
+ _messageHandler->send(cmd, *this);
+}
+
+void
+Visitor::onGetIterReply(const std::shared_ptr<GetIterReply>& reply,
+ VisitorThreadMetrics& metrics)
+{
+ LOG(debug, "Visitor '%s' got get iter reply for bucket %s: %s",
+ _id.c_str(),
+ reply->getBucketId().toString().c_str(),
+ reply->getResult().toString().c_str());
+ std::list<BucketIterationState*>::reverse_iterator it = _bucketStates.rbegin();
+
+ // New requests will be pushed on end of list.. So searching
+ // in reverse order should quickly get correct result.
+ for (; it != _bucketStates.rend(); ++it) {
+ if ((*it)->getBucketId() == reply->getBucketId()) {
+ break;
+ }
+ }
+ assert(it != _bucketStates.rend());
+
+ if (reply->getResult().failed() || !isRunning()) {
+ // Don't log warnings for BUCKET_NOT_FOUND and BUCKET_DELETED,
+ // since this can happen during normal splits.
+ // Don't log for ABORT, due to storage shutdown.
+ if (!reply->getResult().success() &&
+ !reply->getResult().isShutdownRelated() &&
+ !reply->getResult().isBucketDisappearance())
+ {
+ LOG(warning, "Failed to talk to persistence layer for bucket "
+ "%s. Aborting visitor '%s': %s",
+ reply->getBucketId().toString().c_str(),
+ _id.c_str(), reply->getResult().toString().c_str());
+ }
+ fail(reply->getResult());
+ BucketIterationState& bucketState(**it);
+ assert(bucketState._pendingIterators > 0);
+ --bucketState._pendingIterators;
+ if (bucketState._pendingIterators == 0) {
+ delete *it;
+ _bucketStates.erase((++it).base());
+ }
+ return;
+ }
+
+ BucketIterationState& bucketState(**it);
+ bucketState.setCompleted(reply->isCompleted());
+ --bucketState._pendingIterators;
+ if (!reply->getEntries().empty()) {
+ LOG(debug, "Processing documents in handle given from bucket %s.",
+ reply->getBucketId().toString().c_str());
+ // While handling documents we should not keep locks, such
+ // that visitor may process several things at once.
+ if (isRunning()) {
+ MBUS_TRACE(reply->getTrace(), 5,
+ vespalib::make_string("Visitor %s handling block of %zu documents.",
+ _id.c_str(),
+ reply->getEntries().size()));
+ LOG(debug, "Visitor %s handling block of %zu documents.",
+ _id.c_str(),
+ reply->getEntries().size());
+ try{
+ framework::MilliSecTimer processingTimer(_component.getClock());
+ handleDocuments(reply->getBucketId(),
+ reply->getEntries(),
+ *_hitCounter);
+ metrics.averageProcessingTime[reply->getLoadType()]
+ .addValue(processingTimer);
+
+ MBUS_TRACE(reply->getTrace(), 5, "Done processing data block in visitor plugin");
+
+ uint64_t size = 0;
+ for (size_t i = 0; i < reply->getEntries().size(); ++i) {
+ size += reply->getEntries()[i]->getPersistedDocumentSize();
+ }
+
+ _visitorStatistics.setDocumentsVisited(
+ _visitorStatistics.getDocumentsVisited()
+ + reply->getEntries().size());
+ _visitorStatistics.setBytesVisited(
+ _visitorStatistics.getBytesVisited() + size);
+ } catch (std::exception& e) {
+ LOG(warning, "handleDocuments threw exception %s",
+ e.what());
+ reportProblem(e.what());
+ }
+ }
+ } else {
+ LOG(debug, "No documents to process in handle given for bucket %s.",
+ reply->getBucketId().toString().c_str());
+ }
+
+ if (shouldAddMbusTrace()) {
+ _trace.add(reply->getTrace().getRoot());
+ }
+
+ LOG(debug, "Continuing visitor %s.", _id.c_str());
+ continueVisitor();
+}
+
+void
+Visitor::sendDueQueuedMessages(framework::MicroSecTime timeNow)
+{
+ // Assuming few messages in sent queue, so cheap to go through all.
+ while (!_visitorTarget._queuedMessages.empty()
+ && (_visitorTarget._pendingMessages.size()
+ < _visitorOptions._maxPending)) {
+ VisitorTarget::MessageQueue::iterator it(
+ _visitorTarget._queuedMessages.begin());
+ if (it->first < timeNow) {
+ auto& msgMeta = _visitorTarget.metaForMessageId(it->second);
+ _visitorTarget._queuedMessages.erase(it);
+ sendDocumentApiMessage(msgMeta);
+ } else {
+ break;
+ }
+ }
+}
+
+bool
+Visitor::continueVisitor()
+{
+ if (mayTransitionToCompleted()) {
+ transitionTo(STATE_COMPLETED);
+ return false;
+ }
+ framework::MicroSecTime time(_component.getClock().getTimeInMicros());
+ if (time > _timeToDie) { // If we have timed out, just shut down.
+ if (isRunning()) {
+ LOG(debug, "Visitor %s timed out. Closing it.", _id.c_str());
+ fail(api::ReturnCode(api::ReturnCode::ABORTED,
+ "Visitor timed out"));
+ close();
+ }
+ return false;
+ }
+
+ sendDueQueuedMessages(time);
+
+ // No need to do more work if we already have maximum pending towards data handler
+ if (_messageSession->pending() + _visitorTarget._queuedMessages.size()
+ >= _visitorOptions._maxPending)
+ {
+ LOG(spam, "Number of pending messages (%zu pending, %zu queued) "
+ "already >= max pending (%u)",
+ _visitorTarget._pendingMessages.size(),
+ _visitorTarget._queuedMessages.size(),
+ _visitorOptions._maxPending);
+ return false;
+ }
+
+ if (_visitorTarget.getMemoryUsage() >= _memoryUsageLimit) {
+ LOG(spam,
+ "Visitor already using maximum amount of memory "
+ "(using %u, limit %u)",
+ _visitorTarget.getMemoryUsage(),
+ _memoryUsageLimit);
+ return false;
+ }
+
+ // If there are no more buckets to visit and no pending messages
+ // to the client, mark visitor as complete.
+ if (!getIterators()) {
+ if (_visitorTarget._pendingMessages.empty()
+ && _visitorTarget._queuedMessages.empty())
+ {
+ if (isRunning()) {
+ LOG(debug, "Visitor '%s' has not been aborted", _id.c_str());
+ if (!_calledCompletedVisitor) {
+ _calledCompletedVisitor = true;
+ try{
+ completedVisiting(*_hitCounter);
+ } catch (std::exception& e) {
+ LOG(warning, "Visitor %s failed in completedVisiting() "
+ "callback. As visitor is already complete, this "
+ "has been ignored: %s", _id.c_str(), e.what());
+ }
+
+ // Visitor could create messages in completed visiting.
+ if (_messageSession->pending() > 0) {
+ return false;
+ }
+ }
+ }
+
+ LOG(debug, "No pending messages, tagging visitor '%s' complete",
+ _id.c_str());
+ transitionTo(STATE_COMPLETED);
+ } else {
+ LOG(debug, "Visitor %s waiting for all commands to be replied to "
+ "(pending=%zu, queued=%zu)",
+ _id.c_str(),
+ _visitorTarget._pendingMessages.size(),
+ _visitorTarget._queuedMessages.size());
+ }
+ return false;
+ } else {
+ return true;
+ }
+}
+
+void
+Visitor::getStatus(std::ostream& out, bool verbose) const
+{
+ out << "<table border=\"1\"><tr><td>Property</td><td>Value</td></tr>\n";
+
+ out << "<tr><td>Visitor id</td><td>" << _visitorId << "</td></tr>\n";
+ out << "<tr><td>Visitor name</td><td>" << _id << "</td></tr>\n";
+
+ out << "<tr><td>Number of buckets to visit</td><td>" << _buckets.size()
+ << "</td></tr>\n";
+ out << "<tr><td>Next bucket to visit</td><td>"
+ << "#" << _currentBucket << ": ";
+ if (_currentBucket >= _buckets.size()) {
+ out << "Out of bounds";
+ } else {
+ out << _buckets[_currentBucket].toString();
+ }
+ out << "</td></tr>\n";
+
+ out << "<tr><td>State</td><td>\n"
+ << getStateName(_state)
+ << "</td></tr>\n";
+
+ out << "<tr><td>Current status</td><td>"
+ << _result << "</td></tr>\n";
+
+ out << "<tr><td>Failed</td><td>" << (failed() ? "true" : "false")
+ << "</td></tr>\n";
+
+ if (verbose) {
+ out << "<tr><td>Max messages pending to client</td><td>"
+ << _visitorOptions._maxPending
+ << "</td></tr>\n";
+ out << "<tr><td>Max parallel buckets visited</td><td>"
+ << _visitorOptions._maxParallel
+ << "</td></tr>\n";
+ out << "<tr><td>Max parallel getiter requests per bucket visited"
+ << "</td><td>" << _visitorOptions._maxParallelOneBucket
+ << "</td></tr>\n";
+ out << "<tr><td>Called starting visitor</td><td>"
+ << (_calledStartingVisitor ? "true" : "false") << "</td></tr>\n";
+ out << "<tr><td>Called completed visitor</td><td>"
+ << (_calledCompletedVisitor ? "true" : "false") << "</td></tr>\n";
+ out << "<tr><td>Visiting fields</td><td>"
+ << _visitorOptions._fieldSet
+ << "</td></tr>\n";
+ out << "<tr><td>Visiting removes</td><td>"
+ << (_visitorOptions._visitRemoves ? "true" : "false")
+ << "</td></tr>\n";
+ out << "<tr><td>Control destination</td><td>";
+ if (_controlDestination.get()) {
+ out << _controlDestination->toString();
+ } else {
+ out << "nil";
+ }
+ out << "</td></tr>\n";
+ out << "<tr><td>Data destination</td><td>";
+ if (_dataDestination.get()) {
+ out << _dataDestination->toString();
+ } else {
+ out << "nil";
+ }
+ out << "</td></tr>\n";
+ out << "<tr><td>Document selection</td><td>";
+ if (_documentSelection.get()) {
+ out << *_documentSelection;
+ } else {
+ out << "nil";
+ }
+ out << "</td></tr>\n";
+
+ out << "<tr><td>Time period(" << _visitorOptions._fromTime << ", "
+ << _visitorOptions._toTime << "):<br>\n";
+ out << "<tr><td>Message id of create visitor command</td><td>"
+ << _visitorCmdId << "</td></tr>\n";
+ out << "<tr><td>Doc block timeout</td><td>"
+ << _docBlockTimeout << "</td></tr>\n";
+ out << "<tr><td>Visitor info timeout</td><td>"
+ << _visitorInfoTimeout << "</td></tr>\n";
+ out << "<tr><td>Visitor priority</td><td>"
+ << static_cast<uint32_t>(_priority) << "</td></tr>\n";
+ out << "<tr><td>Trace level</td><td>"
+ << _traceLevel << "</td></tr>\n";
+
+ framework::MicroSecTime time(_component.getClock().getTimeInMicros());
+
+ out << "<tr><td>Time left until timeout</td><td>";
+ if (time <= _timeToDie) {
+ out << (_timeToDie - time).getMillis().getTime() << " ms";
+ } else {
+ out << "(expired "
+ << (time - _timeToDie).getMillis().getTime()
+ << " ms ago)";
+ }
+ out << "</td></tr>\n";
+ }
+ out << "</table>\n";
+
+ out << "<h4>Buckets to visit</h4>";
+ typedef std::pair<api::Timestamp, api::Timestamp> TimePair;
+ TimePair lastTime;
+ for (uint32_t i=0; i<_buckets.size(); ++i) {
+ out << _buckets[i] << "\n<br>";
+ }
+
+ out << "<h4>States of buckets currently being visited</h4>";
+ if (_bucketStates.size() == 0) {
+ out << "None\n";
+ }
+ for (auto* state : _bucketStates) {
+ out << " " << *state << "<br>\n";
+ }
+
+ std::unordered_map<uint64_t, framework::MicroSecTime> idToSendTime;
+ for (auto& sendTimeToId : _visitorTarget._queuedMessages) {
+ idToSendTime[sendTimeToId.second] = sendTimeToId.first;
+ }
+
+ out << "<h4>Messages being sent to client</h4>\n";
+ out << "<p>Estimated memory usage: "
+ << _visitorTarget.getMemoryUsage()
+ << "</p>\n";
+ for (auto& idAndMeta : _visitorTarget._messageMeta) {
+ const VisitorTarget::MessageMeta& meta(idAndMeta.second);
+ out << "Message #" << idAndMeta.first << " <b>"
+ << meta.messageText << "</b> ";
+ if (meta.retryCount > 0) {
+ out << "Retried " << meta.retryCount << " times. ";
+ }
+ if (_visitorTarget._pendingMessages.find(idAndMeta.first)
+ != _visitorTarget._pendingMessages.end())
+ {
+ out << "<i>pending</i>";
+ };
+ auto queued = idToSendTime.find(idAndMeta.first);
+ if (queued != idToSendTime.end()) {
+ out << "Scheduled for sending at timestamp "
+ << (queued->second.getSeconds());
+ }
+
+ out << "<br/>\n";
+ }
+
+ out << "\n";
+}
+
+bool
+Visitor::getIterators()
+{
+ LOG(debug, "getIterators, visitor %s, _buckets = %zu , _bucketStates = %zu, "
+ "_currentBucket = %d",
+ _id.c_str(), _buckets.size(),
+ _bucketStates.size(), _currentBucket);
+
+ // Don't send any further GetIters if we're closing
+ if (!isRunning()) {
+ if (hasPendingIterators()) {
+ LOG(debug, "Visitor has failed but waiting for %zu "
+ "buckets to finish processing",
+ _bucketStates.size());
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ // Go through buckets found. Take the first that doesn't have requested
+ // state and request a new piece.
+ for (std::list<BucketIterationState*>::iterator it = _bucketStates.begin();
+ it != _bucketStates.end();)
+ {
+ assert(*it);
+ BucketIterationState& bucketState(**it);
+ if ((bucketState._pendingIterators
+ >= _visitorOptions._maxParallelOneBucket)
+ || bucketState.hasPendingControlCommand())
+ {
+ ++it;
+ continue;
+ }
+ if (bucketState.isCompleted()) {
+ if (bucketState._pendingIterators > 0) {
+ // Wait to process finished with bucket stuff until we have
+ // gotten responses for all the getIters pending to bucket
+ ++it;
+ continue;
+ }
+ try{
+ completedBucket(bucketState.getBucketId(), *_hitCounter);
+ _visitorStatistics.setBucketsVisited(
+ _visitorStatistics.getBucketsVisited() + 1);
+ } catch (std::exception& e) {
+ std::ostringstream ost;
+ ost << "Visitor fail to run completedBucket() notification: "
+ << e.what();
+ reportProblem(ost.str());
+ }
+ delete *it;
+ it = _bucketStates.erase(it);
+ continue;
+ }
+ framework::MemoryToken::UP token(
+ _memoryManager->allocate(
+ *_memoryAllocType, _docBlockSize, _docBlockSize,
+ _priority));
+ if (token.get() == 0) {
+ // Not enough memory
+ return true;
+ }
+ std::shared_ptr<GetIterCommand> cmd(
+ new GetIterCommand(std::move(token),
+ bucketState.getBucketId(),
+ bucketState.getIteratorId(),
+ _docBlockSize));
+ cmd->setLoadType(_initiatingCmd->getLoadType());
+ cmd->getTrace().setLevel(_traceLevel);
+ cmd->setPriority(_priority);
+ _messageHandler->send(cmd, *this);
+ ++bucketState._pendingIterators;
+ _bucketStates.erase(it);
+ _bucketStates.push_back(&bucketState);
+ LOG(debug, "Requested new iterator for visitor '%s'.", _id.c_str());
+ return true;
+ }
+
+ // If there aren't anymore buckets to iterate, we're done
+ if (_bucketStates.size() == 0 && _currentBucket >= _buckets.size()) {
+ LOG(debug, "No more buckets to visit for visitor '%s'.", _id.c_str());
+ return false;
+ }
+
+ // If all current buckets have request state and we're below maxParallel
+ // and below maxPending
+ // start iterating a new bucket
+ uint32_t sentCount = 0;
+ while (_bucketStates.size() < _visitorOptions._maxParallel &&
+ _bucketStates.size() < _visitorOptions._maxPending &&
+ _currentBucket < _buckets.size())
+ {
+ document::BucketId bucketId(_buckets[_currentBucket]);
+ std::unique_ptr<BucketIterationState> newBucketState(
+ new BucketIterationState(*this, *_messageHandler, bucketId));
+ LOG(debug, "Visitor '%s': Sending create iterator for bucket %s.",
+ _id.c_str(), bucketId.toString().c_str());
+
+ spi::Selection selection
+ = spi::Selection(spi::DocumentSelection(_documentSelectionString));
+ selection.setFromTimestamp(
+ spi::Timestamp(_visitorOptions._fromTime.getTime()));
+ selection.setToTimestamp(
+ spi::Timestamp(_visitorOptions._toTime.getTime()));
+
+ std::shared_ptr<CreateIteratorCommand> cmd(
+ new CreateIteratorCommand(bucketId,
+ selection,
+ _visitorOptions._fieldSet,
+ _visitorOptions._visitRemoves ?
+ spi::NEWEST_DOCUMENT_OR_REMOVE :
+ spi::NEWEST_DOCUMENT_ONLY));
+
+ cmd->setLoadType(_initiatingCmd->getLoadType());
+ cmd->getTrace().setLevel(_traceLevel);
+ cmd->setPriority(0);
+ cmd->setReadConsistency(getRequiredReadConsistency());
+ _bucketStates.push_back(newBucketState.release());
+ _messageHandler->send(cmd, *this);
+ ++_currentBucket;
+ ++sentCount;
+ }
+ if (sentCount == 0) {
+ if (LOG_WOULD_LOG(debug)) {
+ LOG(debug, "Enough iterators being processed. Doing nothing for "
+ "visitor '%s' bucketStates = %d.",
+ _id.c_str(), (int)_bucketStates.size());
+ for (std::list<BucketIterationState*>::iterator it(
+ _bucketStates.begin());
+ it != _bucketStates.end(); ++it)
+ {
+ LOG(debug, "Existing: %s", (*it)->toString().c_str());
+ }
+ }
+ }
+ return true;
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/visiting/visitor.h b/storage/src/vespa/storage/visiting/visitor.h
new file mode 100644
index 00000000000..cea4a590f3f
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/visitor.h
@@ -0,0 +1,584 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::Visitor
+ * @ingroup storageserver
+ *
+ * @brief Base class for all visitors.
+ *
+ * A visitor is a piece of code existing in a shared library linked in, that
+ * iterates serialized documents from the persistence layer
+ */
+
+#pragma once
+
+#include <list>
+#include <deque>
+#include <vespa/storageapi/messageapi/storagemessage.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <vespa/document/select/orderingspecification.h>
+#include <vespa/storage/common/storagecomponent.h>
+#include <vespa/storage/common/visitorfactory.h>
+#include <vespa/storage/visiting/visitormessagesession.h>
+#include <vespa/storage/visiting/memory_bounded_trace.h>
+#include <vespa/documentapi/messagebus/messages/documentmessage.h>
+#include <vespa/persistence/spi/docentry.h>
+#include <vespa/persistence/spi/selection.h>
+#include <vespa/persistence/spi/read_consistency.h>
+#include <vespa/storageframework/storageframework.h>
+
+namespace document {
+class Document;
+class DocumentId;
+namespace select {
+class Node;
+}
+}
+namespace vdslib {
+class Parameters;
+}
+
+namespace documentapi {
+class DocumentMessage;
+class VisitorInfoMessage;
+}
+
+namespace storage {
+
+namespace api {
+class ReturnCode;
+class StorageCommand;
+class StorageReply;
+}
+
+class GetIterReply;
+class CreateIteratorReply;
+class Visitor;
+class VisitorThreadMetrics;
+
+/**
+ * To prevent circular dependency between visitors and visitor manager, this
+ * interface is used to give visitor access to the functionality needed from
+ * the manager.
+ */
+class VisitorMessageHandler {
+public:
+ virtual void send(const std::shared_ptr<api::StorageCommand>&,
+ Visitor& visitor) = 0;
+ virtual void send(const std::shared_ptr<api::StorageReply>&) = 0;
+ /**
+ * Called once when visitor shuts down and won't call this handler again.
+ * The visitor may still have pending requests sent but not received though.
+ */
+ virtual void closed(api::VisitorId id) = 0;
+
+ virtual ~VisitorMessageHandler() {}
+};
+
+/**
+ * Base class for Visitor implementations.
+ *
+ * Each visitor will implement this base class to become a visitor.
+ * This base class takes care of talking to the persistence layer and
+ * processing all the documents, calling the virtual functions each visitor
+ * must implement. It also provides functions for sending data back to the
+ * client.
+ */
+class Visitor
+{
+public:
+
+ class HitCounter {
+ public:
+ HitCounter(const document::OrderingSpecification* ordering);
+
+ void addHit(const document::DocumentId& hit, uint32_t size);
+
+ void updateVisitorStatistics(vdslib::VisitorStatistics& statistics);
+
+ uint32_t getFirstPassHits() const { return _firstPassHits; }
+
+ uint64_t getFirstPassBytes() const { return _firstPassBytes; }
+
+ uint32_t getSecondPassHits() const { return _secondPassHits; }
+
+ uint64_t getSecondPassBytes() const { return _secondPassBytes; }
+
+ private:
+ uint32_t _firstPassHits;
+ uint64_t _firstPassBytes;
+ uint32_t _secondPassHits;
+ uint64_t _secondPassBytes;
+ const document::OrderingSpecification* _ordering;
+ bool _allowFirstPass;
+ };
+
+ enum VisitorState
+ {
+ STATE_NOT_STARTED,
+ STATE_RUNNING,
+ STATE_CLOSING,
+ STATE_COMPLETED
+ };
+
+ static constexpr size_t TRANSIENT_ERROR_RETRIES_BEFORE_NOTIFY = 7;
+
+private:
+ friend class BucketIterationState;
+ /** Holds status information on progress visiting a single bucket.
+ * Also serves as a guard for ensuring we send down a DestroyVisitor
+ * command when a state instance is destroyed and its iterator id is
+ * non-zero.
+ */
+ class BucketIterationState : public document::Printable
+ {
+ private:
+ Visitor& _visitor;
+ VisitorMessageHandler& _messageHandler;
+ public:
+ document::BucketId _bucketId;
+ spi::IteratorId _iteratorId;
+ uint32_t _pendingIterators;
+ bool _completed;
+
+ BucketIterationState(Visitor& visitor,
+ VisitorMessageHandler& messageHandler,
+ const document::BucketId& id)
+ : _visitor(visitor),
+ _messageHandler(messageHandler),
+ _bucketId(id),
+ _iteratorId(0),
+ _pendingIterators(0),
+ _completed(false)
+ {}
+
+ /** Sends DestroyIterator over _messageHandler if _iteratorId != 0 */
+ ~BucketIterationState();
+
+ void setCompleted(bool completed = true) { _completed = completed; }
+ bool isCompleted() const { return _completed; }
+
+ const document::BucketId& getBucketId() const { return _bucketId; }
+
+ void setIteratorId(spi::IteratorId iteratorId) {
+ _iteratorId = iteratorId;
+ }
+ spi::IteratorId getIteratorId() const { return _iteratorId; }
+
+ void setPendingControlCommand() {
+ _iteratorId = spi::IteratorId(0);
+ }
+
+ bool hasPendingControlCommand() const {
+ return _iteratorId == spi::IteratorId(0);
+ }
+
+ bool hasPendingIterators() const { return _pendingIterators > 0; }
+
+ void print(std::ostream& out, bool /*verbose*/,
+ const std::string& /*indent*/) const
+ {
+ out << "BucketIterationState("
+ << _bucketId
+ << ", pending GetIters: " << _pendingIterators
+ << ", iterator id: " << _iteratorId
+ << ", completed: " << (_completed ? "yes" : "no")
+ << ")";
+ }
+ };
+
+ struct VisitorOptions
+ {
+ // Minimum timestamp to visit.
+ framework::MicroSecTime _fromTime;
+ // Maximum timestamp to visit.
+ framework::MicroSecTime _toTime;
+
+ // Maximum number of buckets that can be visited in parallel
+ uint32_t _maxParallel;
+ // Number of pending get iter operations per bucket
+ uint32_t _maxParallelOneBucket;
+
+ // Maximum number of messages sent to clients that have not yet been
+ // replied to (max size to _sentMessages map)
+ uint32_t _maxPending;
+
+ std::string _fieldSet;
+ bool _visitRemoves;
+
+ VisitorOptions();
+ };
+
+ struct VisitorTarget
+ {
+ uint64_t _pendingMessageId;
+
+ struct MessageMeta {
+ MessageMeta(uint64_t msgId,
+ std::unique_ptr<documentapi::DocumentMessage> msg);
+ MessageMeta(MessageMeta&&) noexcept;
+ ~MessageMeta();
+
+ MessageMeta& operator=(MessageMeta&&) noexcept;
+
+ MessageMeta(const MessageMeta&) = delete;
+ MessageMeta& operator=(const MessageMeta&) = delete;
+
+ uint64_t messageId;
+ uint32_t retryCount;
+ // Memory usage for message the meta object was created with.
+ uint32_t memoryUsage;
+ std::unique_ptr<documentapi::DocumentMessage> message;
+ std::string messageText;
+ };
+
+ /**
+ * Keeps track of all the metadata for both pending and queued messages.
+ */
+ std::map<uint64_t, MessageMeta> _messageMeta;
+
+ /**
+ * Invariants:
+ * _memoryUsage == sum of m.memoryUsage for all m in _messageMeta
+ */
+ uint32_t _memoryUsage;
+
+ /**
+ * Contains the list of messages currently being sent to the client.
+ * Value refers to the message id (key in _messageMeta).
+ */
+ std::set<uint64_t> _pendingMessages;
+
+ // Maps from time sent to message to send.
+ // Value refers to message id (key in _messageMeta).
+ typedef std::multimap<framework::MicroSecTime, uint64_t> MessageQueue;
+
+ MessageQueue _queuedMessages;
+
+ MessageMeta& insertMessage(
+ std::unique_ptr<documentapi::DocumentMessage>);
+ /**
+ * Preconditions:
+ * msgId exists as a key in _messageMeta
+ */
+ MessageMeta& metaForMessageId(uint64_t msgId);
+ MessageMeta releaseMetaForMessageId(uint64_t msgId);
+ void reinsertMeta(MessageMeta);
+
+ bool hasQueuedMessages() const { return !_queuedMessages.empty(); }
+ void discardQueuedMessages();
+
+ uint32_t getMemoryUsage() const noexcept {
+ return _memoryUsage;
+ }
+
+ VisitorTarget();
+ };
+
+protected:
+ StorageComponent& _component;
+
+private:
+ VisitorOptions _visitorOptions;
+ VisitorTarget _visitorTarget;
+ VisitorState _state;
+
+ // The list of buckets to visit.
+ std::vector<document::BucketId> _buckets;
+
+ // The iterator iterating the buckets to visit.
+ uint32_t _currentBucket;
+ // The states of the buckets currently being visited.
+ typedef std::list<BucketIterationState*> BucketStateList;
+ BucketStateList _bucketStates;
+ // Set to true after performing given callbacks
+ bool _calledStartingVisitor;
+ bool _calledCompletedVisitor;
+
+ framework::MicroSecTime _startTime;
+
+ bool _hasSentReply;
+
+ uint32_t _docBlockSize;
+ uint32_t _memoryUsageLimit;
+ framework::MilliSecTime _docBlockTimeout;
+ framework::MilliSecTime _visitorInfoTimeout;
+ uint32_t _serialNumber;
+ // Keep trace level independent of _initiatingCmd, since we might want to
+ // print out the trace level even after the command's ownership has been
+ // released away from us.
+ uint32_t _traceLevel;
+ uint16_t _ownNodeIndex;
+
+ // Used by visitor client to identify what visitor messages belong to
+ api::StorageMessage::Id _visitorCmdId;
+ api::VisitorId _visitorId;
+ std::shared_ptr<api::StorageCommand> _initiatingCmd;
+ api::StorageMessage::Priority _priority;
+
+ api::ReturnCode _result;
+ std::map<std::string, framework::MicroSecTime> _recentlySentErrorMessages;
+ framework::MicroSecTime _timeToDie; // Visitor will time out to distributor at this time
+
+ std::unique_ptr<HitCounter> _hitCounter;
+
+ static constexpr size_t DEFAULT_TRACE_MEMORY_LIMIT = 65536;
+ MemoryBoundedTrace _trace;
+
+ Visitor(const Visitor &);
+ Visitor& operator=(const Visitor &);
+
+protected:
+ // These variables should not be altered after visitor starts. This not
+ // controlled by locks.
+ const framework::MemoryAllocationType* _memoryAllocType;
+ VisitorMessageHandler* _messageHandler;
+ VisitorMessageSession::UP _messageSession;
+ documentapi::Priority::Value _documentPriority;
+
+ std::string _id;
+ std::unique_ptr<api::StorageMessageAddress> _controlDestination;
+ std::unique_ptr<api::StorageMessageAddress> _dataDestination;
+ std::shared_ptr<document::select::Node> _documentSelection;
+ std::string _documentSelectionString;
+ std::unique_ptr<document::OrderingSpecification> _ordering;
+ vdslib::VisitorStatistics _visitorStatistics;
+ framework::MemoryManagerInterface* _memoryManager;
+
+ bool isCompletedCalled() const { return _calledCompletedVisitor; }
+public:
+ Visitor(StorageComponent& component);
+ virtual ~Visitor();
+
+ framework::MicroSecTime getStartTime() const { return _startTime; }
+ api::VisitorId getVisitorId() const { return _visitorId; }
+ const std::string& getVisitorName() const { return _id; }
+ const api::StorageMessageAddress* getControlDestination() const
+ { return _controlDestination.get(); } // Can't be null if attached
+ const api::StorageMessageAddress* getDataDestination() const
+ { return _dataDestination.get(); } // Can't be null if attached
+
+ void setAllocationType(const framework::MemoryAllocationType& mat)
+ { _memoryAllocType = &mat; }
+ void setMaxPending(unsigned int maxPending)
+ { _visitorOptions._maxPending = maxPending; }
+
+ void setFieldSet(const std::string& fieldSet) { _visitorOptions._fieldSet = fieldSet; }
+ void visitRemoves() { _visitorOptions._visitRemoves = true; }
+ void setDocBlockSize(uint32_t size) { _docBlockSize = size; }
+ uint32_t getDocBlockSize() const { return _docBlockSize; }
+ void setMemoryUsageLimit(uint32_t limit) noexcept {
+ _memoryUsageLimit = limit;
+ }
+ uint32_t getMemoryUsageLimit() const noexcept {
+ return _memoryUsageLimit;
+ }
+ void setDocBlockTimeout(framework::MilliSecTime timeout)
+ { _docBlockTimeout = timeout; }
+ void setVisitorInfoTimeout(framework::MilliSecTime timeout)
+ { _visitorInfoTimeout = timeout; }
+ void setMemoryManager(framework::MemoryManagerInterface& mm)
+ { _memoryManager = &mm; }
+ void setOwnNodeIndex(uint16_t nodeIndex) { _ownNodeIndex = nodeIndex; }
+
+ const documentapi::LoadType& getLoadType() const {
+ return _initiatingCmd->getLoadType();
+ }
+
+ /** Override this to know which buckets are currently being visited. */
+ virtual void startingVisitor(const std::vector<document::BucketId>&) {}
+
+ /**
+ * Override this method to receive a callback whenever a new
+ * vector of documents arrive from the persistence layer.
+ */
+ virtual void handleDocuments(const document::BucketId&,
+ std::vector<spi::DocEntry::LP>& entries,
+ HitCounter& hitCounter) = 0;
+
+ /**
+ * Override this if you want to do anything special after bucket completes.
+ */
+ virtual void completedBucket(const document::BucketId&, HitCounter&) {}
+
+ /**
+ * Override this if you want to know if visiting is aborted. Note that you
+ * cannot use this callback to send anything.
+ */
+ virtual void abortedVisiting() {}
+
+ /**
+ * Override if you want to know when the whole visitor has completed.
+ */
+ virtual void completedVisiting(HitCounter&) {}
+
+ /**
+ * By default a visitor requires strong consistency on its reads, i.e.
+ * previously ACKed writes MUST be visible to the operation. Visitor
+ * subclasses might choose to override this if their requirements are more
+ * lax than the deafult of STRONG.
+ *
+ * The consistency level provided here is propagated through the SPI
+ * Context object for createIterator calls.
+ */
+ virtual spi::ReadConsistency getRequiredReadConsistency() const {
+ return spi::ReadConsistency::STRONG;
+ }
+
+ /** Subclass should call this to indicate error conditions. */
+ void fail(const api::ReturnCode& reason,
+ bool overrideExistingError = false);
+
+ /**
+ * Used to silence transient errors that can happen during normal operation.
+ */
+ bool shouldReportProblemToClient(const api::ReturnCode&,
+ size_t retryCount) const;
+
+ /** Called to send report to client of potential non-critical problems. */
+ void reportProblem(const std::string& problem);
+
+ /**
+ * Wrapper for reportProblem which reports string representation of
+ * result code and message
+ **/
+ void reportProblem(const api::ReturnCode& problemCode);
+
+ /** Call to gracefully close visitor */
+ void close();
+
+ /**
+ * Called before deleting this visitor.
+ * Precondition: visitor state must be STATE_COMPLETED.
+ **/
+ void finalize();
+
+ /** Call -ONLY- during process shutdown case where you don't care if
+ * we end up leaking persistence provider layer iterators. Cannot
+ * gracefully close in this case since we shut down the event handler
+ * thread in advance.
+ */
+ void forceClose();
+
+ void start(api::VisitorId id, api::StorageMessage::Id cmdId,
+ const std::string& name,
+ const std::vector<document::BucketId>&,
+ framework::MicroSecTime fromTimestamp,
+ framework::MicroSecTime toTimestamp,
+ std::unique_ptr<document::select::Node> docSelection,
+ const std::string& docSelectionString,
+ std::unique_ptr<document::OrderingSpecification>,
+ VisitorMessageHandler&,
+ VisitorMessageSession::UP,
+ documentapi::Priority::Value);
+
+ void attach(std::shared_ptr<api::StorageCommand> initiatingCmd,
+ const api::StorageMessageAddress& controlAddress,
+ const api::StorageMessageAddress& dataAddress,
+ framework::MilliSecTime timeout);
+
+ void handleDocumentApiReply(mbus::Reply::UP reply,
+ VisitorThreadMetrics& metrics);
+
+ void onGetIterReply(const std::shared_ptr<GetIterReply>& reply,
+ VisitorThreadMetrics& metrics);
+
+ void onCreateIteratorReply(
+ const std::shared_ptr<CreateIteratorReply>& reply,
+ VisitorThreadMetrics& metrics);
+
+ bool failed() const { return _result.failed(); }
+
+ /**
+ * This function will check current state and make the visitor move on, if
+ * there are any space left in queues.
+ */
+ bool continueVisitor();
+
+ void getStatus(std::ostream& out, bool verbose) const;
+
+ void setMaxParallel(uint32_t maxParallel)
+ { _visitorOptions._maxParallel = maxParallel; }
+ void setMaxParallelPerBucket(uint32_t max)
+ { _visitorOptions._maxParallelOneBucket = max; }
+
+ /**
+ * Sends a message to the data handler for this visitor.
+ */
+ void sendMessage(std::unique_ptr<documentapi::DocumentMessage> documentMessage);
+
+ bool isRunning() const { return _state == STATE_RUNNING; }
+ bool isCompleted() const { return _state == STATE_COMPLETED; }
+
+private:
+ /**
+ * Sends a message to the control handler for this visitor.
+ * Utility function used by fail() and reportProblem() for instance.
+ */
+ void sendInfoMessage(std::unique_ptr<documentapi::VisitorInfoMessage> cmd);
+
+ /**
+ * This function will inspect the bucket states and possibly request
+ * new iterators. It is called fairly often (everytime there are free spots
+ * on message queue), thus it is unnecessary to process all buckets at once.
+ * Buckets are thus processed in a round robin fashion.
+ *
+ * @return False if there is no more to iterate.
+ */
+ bool getIterators();
+
+ /**
+ * Attempt to send the message kept in msgMeta over the destination session,
+ * automatically queuing for future transmission if a maximum number of
+ * messages are already pending.
+ *
+ * Preconditions:
+ * msgMeta must be in _visitorTarget._messageMeta
+ * msgMeta.message.get() != nullptr
+ * Postconditions:
+ * case enqueued:
+ * msgMeta.messageId in _visitorTarget._queuedMessages
+ * case sent:
+ * msgMeta.message.get() == nullptr (released to message bus)
+ * case send failure:
+ * visitor transition to STATE_FAILURE
+ */
+ void sendDocumentApiMessage(VisitorTarget::MessageMeta& msgMeta);
+
+ void sendReplyOnce();
+
+ bool hasFailedVisiting() const { return _result.failed(); }
+
+ bool hasPendingIterators() const { return !_bucketStates.empty(); }
+
+ bool mayTransitionToCompleted() const;
+
+ void discardAllNoPendingBucketStates();
+
+ static const char* getStateName(VisitorState);
+
+ /**
+ * (Re-)send any queued messages whose time-to-send has been reached.
+ * Ensures number of resulting pending messages from visitor does not
+ * violate maximum pending options.
+ */
+ void sendDueQueuedMessages(framework::MicroSecTime timeNow);
+
+ /**
+ * Whether visitor should enable and forward message bus traces for messages
+ * going via DocumentAPI or through the SPI.
+ *
+ * Precondition: attach() must have been called on `this`.
+ */
+ bool shouldAddMbusTrace() const noexcept {
+ return _traceLevel != 0;
+ }
+
+ /**
+ * Set internal state to the given state value.
+ * @return Old state.
+ */
+ VisitorState transitionTo(VisitorState newState);
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/visiting/visitorlibraries.cpp b/storage/src/vespa/storage/visiting/visitorlibraries.cpp
new file mode 100644
index 00000000000..8141250f5f7
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/visitorlibraries.cpp
@@ -0,0 +1,70 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/defaults.h>
+#include <vespa/storage/visiting/visitorlibraries.h>
+#include <vespa/log/log.h>
+
+LOG_SETUP(".visiting.libraryloader");
+
+namespace storage {
+
+VisitorLibraries::LibMap VisitorLibraries::_libs;
+vespalib::Lock VisitorLibraries::_libLock;
+
+/**
+ * Utility function to get a dynamic library.
+ * Assumes _libLock has been grabbed before calling.
+ */
+VisitorLibraries::LibraryRef
+VisitorLibraries::getLibrary(StorageServerInterface& storageServer, const std::string& libName, const std::string& libraryPath)
+{
+ vespalib::LockGuard guard(_libLock);
+
+ LibMap::iterator it = _libs.find(libName);
+ if (it != _libs.end()) {
+ return LibraryRef(it->second.factory, it->second.environment.get());
+ }
+
+ std::shared_ptr<FastOS_DynamicLibrary> lib(new FastOS_DynamicLibrary);
+ std::string file = libraryPath + "lib" + libName + ".so";
+ if (!lib->Open(file.c_str())) {
+ std::string error = lib->GetLastErrorString();
+ std::string absfile = vespa::Defaults::vespaHome();
+ absfile.append("libexec/vespa/storage/lib" + libName + ".so");
+ if (!lib->Open(absfile.c_str())) {
+ LOG(error, "Could not load library %s: %s",
+ file.c_str(), error.c_str());
+ return LibraryRef();
+ }
+ }
+ std::shared_ptr<VisitorEnvironment> env(
+ getVisitorEnvironment(storageServer, *lib, libName));
+
+ LibMapEntry entry;
+ entry.library = lib;
+ entry.environment = env;
+ entry.factory = lib.get() ? (VisitorFactoryFuncT) lib->GetSymbol("makeVisitor") : 0;
+ _libs[libName] = entry;
+
+ return LibraryRef(entry.factory, env.get());
+}
+
+std::shared_ptr<VisitorEnvironment>
+VisitorLibraries::getVisitorEnvironment(StorageServerInterface& storageServer, FastOS_DynamicLibrary& lib,
+ const std::string& libName)
+{
+ typedef VisitorEnvironment::UP
+ (*VisitorEnvFuncT)(StorageServerInterface& server);
+ VisitorEnvFuncT factoryFunc
+ = (VisitorEnvFuncT) lib.GetSymbol("makeVisitorEnvironment");
+ if (factoryFunc == 0) {
+ std::string err = lib.GetLastErrorString();
+ LOG(error, "Unable to load symbol 'makeVisitorEnvironment' from "
+ "'%s': %s", libName.c_str(), err.c_str());
+ return std::shared_ptr<VisitorEnvironment>();
+ }
+ return std::shared_ptr<VisitorEnvironment>(
+ factoryFunc(storageServer).release());
+}
+
+}
diff --git a/storage/src/vespa/storage/visiting/visitorlibraries.h b/storage/src/vespa/storage/visiting/visitorlibraries.h
new file mode 100644
index 00000000000..50d96f7849e
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/visitorlibraries.h
@@ -0,0 +1,39 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+/**
+ This class handles ownership and creation of dynamic visitor libraries.
+*/
+
+#include <vespa/storage/visiting/visitor.h>
+
+namespace storage {
+
+class VisitorLibraries {
+public:
+ typedef Visitor* (*VisitorFactoryFuncT)(StorageServerInterface& server,
+ VisitorEnvironment& env,
+ const vdslib::Parameters& params);
+
+ struct LibMapEntry {
+ std::shared_ptr<FastOS_DynamicLibrary> library;
+ std::shared_ptr<VisitorEnvironment> environment;
+ VisitorFactoryFuncT factory;
+ };
+
+ typedef std::map<std::string, LibMapEntry> LibMap;
+ typedef std::pair<VisitorFactoryFuncT, VisitorEnvironment*> LibraryRef;
+
+ static LibraryRef getLibrary(StorageServerInterface& storageServer, const std::string& libName, const std::string& libraryPath);
+
+private:
+ static LibMap _libs;
+ static vespalib::Lock _libLock;
+
+ static std::shared_ptr<VisitorEnvironment> getVisitorEnvironment(StorageServerInterface& storageServer,
+ FastOS_DynamicLibrary& lib,
+ const std::string& libName);
+};
+
+}
+
diff --git a/storage/src/vespa/storage/visiting/visitormanager.cpp b/storage/src/vespa/storage/visiting/visitormanager.cpp
new file mode 100644
index 00000000000..3ef64c5c177
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/visitormanager.cpp
@@ -0,0 +1,716 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/visiting/visitormanager.h>
+
+#include <cstdlib>
+#include <vespa/log/log.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <vespa/storage/common/statusmessages.h>
+#include <vespa/storage/storageserver/storagemetricsset.h>
+#include <vespa/storage/visiting/messages.h>
+
+#include <vespa/storage/visiting/dumpvisitor.h>
+#include <vespa/storage/visiting/dumpvisitorsingle.h>
+#include <vespa/storage/visiting/countvisitor.h>
+#include <vespa/storage/visiting/testvisitor.h>
+#include <vespa/storage/visiting/recoveryvisitor.h>
+
+LOG_SETUP(".visitor.manager");
+
+namespace storage {
+
+VisitorManager::VisitorManager(const config::ConfigUri & configUri,
+ StorageComponentRegister& componentRegister,
+ VisitorMessageSessionFactory& messageSF,
+ const VisitorFactory::Map& externalFactories)
+ : StorageLink("Visitor Manager"),
+ framework::HtmlStatusReporter("visitorman", "Visitor Manager"),
+ _componentRegister(componentRegister),
+ _messageSessionFactory(messageSF),
+ _visitorThread(),
+ _visitorMessages(),
+ _visitorLock(),
+ _visitorCounter(0),
+ _configFetcher(configUri.getContext()),
+ _metrics(new VisitorMetrics),
+ _maxFixedConcurrentVisitors(1),
+ _maxVariableConcurrentVisitors(0),
+ _maxVisitorQueueSize(1024),
+ _nameToId(),
+ _component(componentRegister, "visitormanager"),
+ _visitorQueue(_component.getClock()),
+ _recentlyDeletedVisitors(),
+ _recentlyDeletedMaxTime(5 * 1000 * 1000),
+ _statusLock(),
+ _statusMonitor(),
+ _statusRequest(),
+ _enforceQueueUse(false),
+ _visitorFactories(externalFactories)
+{
+ _component.getMemoryManager().registerAllocationType(
+ framework::MemoryAllocationType(
+ "VISITOR_BUFFER",
+ framework::MemoryAllocationType::EXTERNAL_LOAD));
+ _configFetcher.subscribe<vespa::config::content::core::StorVisitorConfig>(configUri.getConfigId(), this);
+ _configFetcher.start();
+ _component.registerMetric(*_metrics);
+ framework::MilliSecTime maxProcessTime(30 * 1000);
+ framework::MilliSecTime waitTime(1000);
+ _thread = _component.startThread(*this, maxProcessTime, waitTime);
+ _component.registerMetricUpdateHook(*this, framework::SecondTime(5));
+
+ // Register built-in visitors.
+ if (_component.isUpgradingToMajorVersion()) {
+ _visitorFactories["dumpvisitor"].reset(new DumpVisitorFactory);
+ } else {
+ _visitorFactories["dumpvisitor"].reset(new DumpVisitorSingleFactory);
+ }
+ _visitorFactories["dumpvisitorsingle"].reset(new DumpVisitorSingleFactory);
+ _visitorFactories["testvisitor"].reset(new TestVisitorFactory);
+ _visitorFactories["countvisitor"].reset(new CountVisitorFactory);
+ _visitorFactories["recoveryvisitor"].reset(new RecoveryVisitorFactory);
+ _component.registerStatusPage(*this);
+}
+
+VisitorManager::~VisitorManager() {
+ closeNextLink();
+ LOG(debug, "Deleting link %s.", toString().c_str());
+ if (_thread.get() != 0) {
+ _thread->interrupt();
+ {
+ vespalib::MonitorGuard waiter(_visitorLock);
+ waiter.signal();
+ }
+ _thread->join();
+ }
+ _visitorThread.clear();
+}
+
+void
+VisitorManager::updateMetrics(const MetricLockGuard &)
+{
+ _metrics->queueSize.addValue(_visitorQueue.size());
+}
+
+void
+VisitorManager::onClose()
+{
+ // Avoid getting config during shutdown
+ _configFetcher.close();
+ {
+ vespalib::MonitorGuard sync(_visitorLock);
+ for (CommandQueue<api::CreateVisitorCommand>::iterator it
+ = _visitorQueue.begin(); it != _visitorQueue.end(); ++it)
+ {
+ std::shared_ptr<api::CreateVisitorReply> reply(
+ new api::CreateVisitorReply(*it->_command));
+ reply->setResult(api::ReturnCode(
+ api::ReturnCode::ABORTED,
+ "Shutting down storage node."));
+ sendUp(reply);
+ }
+ _visitorQueue.clear();
+ }
+ for (uint32_t i=0; i<_visitorThread.size(); ++i) {
+ _visitorThread[i].first->shutdown();
+ }
+}
+
+void
+VisitorManager::print(std::ostream& out, bool verbose,
+ const std::string& indent) const
+{
+ (void) verbose; (void) indent;
+ out << "VisitorManager";
+}
+
+void
+VisitorManager::configure(std::unique_ptr<vespa::config::content::core::StorVisitorConfig> config)
+{
+ vespalib::MonitorGuard sync(_visitorLock);
+ if (config->defaultdocblocksize % 512 != 0) {
+ throw config::InvalidConfigException(
+ "The default docblock size needs to be a multiplum of the "
+ "disk block size. (512b)");
+ }
+
+ // Do some sanity checking of input. Cannot haphazardly mix and match
+ // old and new max concurrency config values
+ if (config->maxconcurrentvisitors == 0
+ && config->maxconcurrentvisitorsFixed == 0)
+ {
+ throw config::InvalidConfigException(
+ "Maximum concurrent visitor count cannot be 0.");
+ }
+ else if (config->maxconcurrentvisitorsFixed == 0
+ && config->maxconcurrentvisitorsVariable != 0)
+ {
+ throw config::InvalidConfigException(
+ "Cannot specify 'variable' parameter for max concurrent "
+ " visitors without also specifying 'fixed'.");
+ }
+
+ uint32_t maxConcurrentVisitorsFixed;
+ uint32_t maxConcurrentVisitorsVariable;
+
+ // Concurrency parameter fixed takes precedence over legacy maxconcurrent
+ if (config->maxconcurrentvisitorsFixed > 0) {
+ maxConcurrentVisitorsFixed = config->maxconcurrentvisitorsFixed;
+ maxConcurrentVisitorsVariable = config->maxconcurrentvisitorsVariable;
+ } else {
+ maxConcurrentVisitorsFixed = config->maxconcurrentvisitors;
+ maxConcurrentVisitorsVariable = 0;
+ }
+
+ bool liveUpdate = (_visitorThread.size() > 0);
+ if (liveUpdate) {
+ if (_visitorThread.size() != static_cast<uint32_t>(config->visitorthreads)) {
+ LOG(warning, "Ignoring config change requesting %u visitor "
+ "threads, still running %u. Restart storage to apply "
+ "change.",
+ config->visitorthreads,
+ (uint32_t) _visitorThread.size());
+ }
+
+ if (_maxFixedConcurrentVisitors != maxConcurrentVisitorsFixed
+ || _maxVariableConcurrentVisitors != maxConcurrentVisitorsVariable)
+ {
+ LOG(info, "Altered max concurrent visitors setting from "
+ "(fixed=%u, variable=%u) to (fixed=%u, variable=%u).",
+ _maxFixedConcurrentVisitors, _maxVariableConcurrentVisitors,
+ maxConcurrentVisitorsFixed, maxConcurrentVisitorsVariable);
+ }
+
+ if (_maxVisitorQueueSize != static_cast<uint32_t>(config->maxvisitorqueuesize)) {
+ LOG(info, "Altered max visitor queue size setting from %u to %u.",
+ _maxVisitorQueueSize, config->maxvisitorqueuesize);
+ }
+ } else {
+ if (config->visitorthreads == 0) {
+ throw config::InvalidConfigException(
+ "No visitor threads configured. If you don't want visitors "
+ "to run, don't use visitormanager.", VESPA_STRLOC);
+ }
+ _metrics->initThreads(config->visitorthreads,
+ _component.getLoadTypes()->getMetricLoadTypes());
+ for (int32_t i=0; i<config->visitorthreads; ++i) {
+ _visitorThread.push_back(std::make_pair(
+ std::shared_ptr<VisitorThread>(
+ new VisitorThread(i, _componentRegister,
+ _messageSessionFactory,
+ _visitorFactories,
+ *_metrics->threads[i], *this)),
+ std::map<api::VisitorId, std::string>()));
+ }
+ }
+ _maxFixedConcurrentVisitors = maxConcurrentVisitorsFixed;
+ _maxVariableConcurrentVisitors = maxConcurrentVisitorsVariable;
+ _maxVisitorQueueSize = config->maxvisitorqueuesize;
+ std::shared_ptr<PropagateVisitorConfig> cmd(
+ new PropagateVisitorConfig(*config));
+ for (int32_t i=0; i<config->visitorthreads; ++i) {
+ _visitorThread[i].first->processMessage(0, cmd);
+ }
+}
+
+void
+VisitorManager::run(framework::ThreadHandle& thread)
+{
+ LOG(debug, "Started visitor manager thread with pid %d.", getpid());
+ typedef CommandQueue<api::CreateVisitorCommand> CQ;
+ std::list<CQ::CommandEntry> timedOut;
+ // Run forever, dump messages in the visitor queue that times out.
+ while (true) {
+ thread.registerTick(framework::PROCESS_CYCLE);
+ {
+ vespalib::LockGuard waiter(_visitorLock);
+ if (thread.interrupted()) {
+ break;
+ }
+ timedOut = _visitorQueue.releaseTimedOut();
+ }
+ framework::MicroSecTime currentTime(
+ _component.getClock().getTimeInMicros());
+ for (std::list<CQ::CommandEntry>::iterator it = timedOut.begin();
+ it != timedOut.end(); ++it)
+ {
+ _metrics->queueTimeoutWaitTime.addValue(
+ currentTime.getTime() - it->_time);
+ std::shared_ptr<api::StorageReply> reply(
+ it->_command->makeReply().release());
+ reply->setResult(api::ReturnCode(api::ReturnCode::BUSY,
+ "Visitor timed out in visitor queue"));
+ sendUp(reply);
+ }
+ {
+ vespalib::MonitorGuard waiter(_visitorLock);
+ if (thread.interrupted()) {
+ break;
+ } else if (_visitorQueue.empty()) {
+ waiter.wait(1000);
+ thread.registerTick(framework::WAIT_CYCLE);
+ } else {
+ uint64_t timediff = (_visitorQueue.tbegin()->_time
+ - currentTime.getTime())
+ / 1000000;
+ timediff = std::min(timediff, uint64_t(1000));
+ if (timediff > 0) {
+ waiter.wait(timediff);
+ thread.registerTick(framework::WAIT_CYCLE);
+ }
+ }
+ }
+ }
+ LOG(debug, "Stopped visitor manager thread with pid %d.", getpid());
+}
+
+namespace {
+ template<typename T>
+ uint32_t getLeastLoadedThread(const T& t, uint32_t& totalCount) {
+ uint32_t min = 0xFFFFFFFF;
+ totalCount = 0;
+ for (uint32_t i=0; i<t.size(); ++i) {
+ totalCount += t[i].second.size();
+ if (t[i].second.size() < min) {
+ min = t[i].second.size();
+ }
+ }
+ return min;
+ }
+}
+
+uint32_t
+VisitorManager::getActiveVisitorCount() const
+{
+ vespalib::MonitorGuard sync(_visitorLock);
+ uint32_t totalCount = 0;
+ for (uint32_t i=0; i<_visitorThread.size(); ++i) {
+ totalCount += _visitorThread[i].second.size();
+ }
+ return totalCount;
+}
+
+/** For unit testing that we don't leak memory from message tracking. */
+bool
+VisitorManager::hasPendingMessageState() const
+{
+ vespalib::MonitorGuard sync(_visitorLock);
+ return !_visitorMessages.empty();
+}
+
+void
+VisitorManager::setTimeBetweenTicks(uint32_t time)
+{
+ vespalib::MonitorGuard sync(_visitorLock);
+ for (uint32_t i=0; i<_visitorThread.size(); ++i) {
+ _visitorThread[i].first->setTimeBetweenTicks(time);
+ }
+}
+
+bool
+VisitorManager::scheduleVisitor(
+ const std::shared_ptr<api::CreateVisitorCommand>& cmd, bool skipQueue,
+ vespalib::MonitorGuard& visitorLock)
+{
+ api::VisitorId id;
+ typedef std::map<std::string, api::VisitorId> NameToIdMap;
+ typedef std::pair<std::string, api::VisitorId> NameIdPair;
+ std::pair<NameToIdMap::iterator, bool> newEntry;
+ {
+ uint32_t totCount;
+ uint32_t minLoadCount = getLeastLoadedThread(_visitorThread, totCount);
+ if (!skipQueue) {
+ if (_enforceQueueUse || totCount >= maximumConcurrent(*cmd)) {
+ api::CreateVisitorCommand::SP failCommand;
+
+ if (cmd->getQueueTimeout() != 0 && _maxVisitorQueueSize > 0) {
+ if (_visitorQueue.size() < _maxVisitorQueueSize) {
+ // Still room in the queue
+ _visitorQueue.add(cmd);
+ visitorLock.signal();
+ } else {
+ // If tail of priority queue has a lower priority than
+ // the new visitor, evict it and insert the new one. If
+ // not, immediately return with a busy reply
+ std::shared_ptr<api::CreateVisitorCommand> tail(
+ _visitorQueue.peekLowestPriorityCommand());
+ // Lower int ==> higher pri
+ if (cmd->getPriority() < tail->getPriority()) {
+ std::pair<api::CreateVisitorCommand::SP,
+ time_t> evictCommand(
+ _visitorQueue.releaseLowestPriorityCommand());
+ assert(tail == evictCommand.first);
+ _visitorQueue.add(cmd);
+ visitorLock.signal();
+ framework::MicroSecTime t(
+ _component.getClock().getTimeInMicros());
+ _metrics->queueEvictedWaitTime.addValue(
+ t.getTime() - evictCommand.second);
+ failCommand = evictCommand.first;
+ } else {
+ failCommand = cmd;
+ _metrics->queueFull.inc();
+ }
+ }
+ } else {
+ // No queueing allowed; must return busy for new command
+ failCommand = cmd;
+ }
+ visitorLock.unlock();
+
+ if (failCommand.get() != 0) {
+ std::shared_ptr<api::CreateVisitorReply> reply(
+ new api::CreateVisitorReply(*failCommand));
+ std::ostringstream ost;
+ if (cmd->getQueueTimeout() == 0) {
+ ost << "Already running the maximum amount ("
+ << maximumConcurrent(*failCommand)
+ << ") of visitors for this priority ("
+ << static_cast<uint32_t>(failCommand->getPriority())
+ << "), and queue timeout is 0.";
+ } else if (_maxVisitorQueueSize == 0) {
+ ost << "Already running the maximum amount ("
+ << maximumConcurrent(*failCommand)
+ << ") of visitors for this priority ("
+ << static_cast<uint32_t>(failCommand->getPriority())
+ << "), and maximum queue size is 0.";
+ } else {
+ ost << "Queue is full and a higher priority visitor was received, "
+ "taking precedence.";
+ }
+ reply->setResult(api::ReturnCode(api::ReturnCode::BUSY,
+ ost.str()));
+ send(reply);
+ }
+ return false;
+ } else {
+ _metrics->queueSkips.inc();
+ }
+ }
+ while (true) {
+ id = ++_visitorCounter;
+ std::map<api::VisitorId, std::string>& usedIds(
+ _visitorThread[id % _visitorThread.size()].second);
+ if (usedIds.size() == minLoadCount &&
+ usedIds.find(id) == usedIds.end())
+ {
+ newEntry = _nameToId.insert(NameIdPair(cmd->getInstanceId(),
+ id));
+ if (newEntry.second) {
+ usedIds[id] = cmd->getInstanceId();
+ }
+ break;
+ }
+ }
+ }
+ visitorLock.unlock();
+ if (!newEntry.second) {
+ std::shared_ptr<api::CreateVisitorReply> reply(
+ new api::CreateVisitorReply(*cmd));
+ std::ostringstream ost;
+ ost << "Already running a visitor named " << cmd->getInstanceId()
+ << ". Not creating visitor.";
+ reply->setResult(api::ReturnCode(api::ReturnCode::EXISTS,
+ ost.str()));
+ send(reply);
+ return false;
+ }
+ cmd->setVisitorId(id);
+ _visitorThread[id % _visitorThread.size()].first->processMessage(id, cmd);
+ return true;
+}
+
+bool
+VisitorManager::onCreateVisitor(
+ const std::shared_ptr<api::CreateVisitorCommand>& cmd)
+{
+ vespalib::MonitorGuard sync(_visitorLock);
+ scheduleVisitor(cmd, false, sync);
+ return true;
+}
+
+bool
+VisitorManager::onDown(const std::shared_ptr<api::StorageMessage>& r)
+{
+ std::shared_ptr<api::StorageReply> reply(
+ std::dynamic_pointer_cast<api::StorageReply>(r));
+
+ if (reply.get() != 0 && processReply(reply)) {
+ return true;
+ } else {
+ return StorageLink::onDown(r);
+ }
+}
+
+bool
+VisitorManager::onInternalReply(const std::shared_ptr<api::InternalReply>& r)
+{
+ switch(r->getType()) {
+ case RequestStatusPageReply::ID:
+ {
+ std::shared_ptr<RequestStatusPageReply> reply(
+ std::dynamic_pointer_cast<RequestStatusPageReply>(r));
+ assert(reply.get());
+ vespalib::MonitorGuard waiter(_statusMonitor);
+ _statusRequest.push_back(reply);
+ waiter.signal();
+ return true;
+ }
+ case PropagateVisitorConfigReply::ID:
+ {
+ return true; // Ignore replies if any.
+ }
+ default:
+ return processReply(r);
+ }
+}
+
+bool
+VisitorManager::processReply(const std::shared_ptr<api::StorageReply>& reply)
+{
+ api::VisitorId id;
+ {
+ vespalib::MonitorGuard sync(_visitorLock);
+ std::map<api::StorageMessage::Id, MessageInfo>::iterator it
+ = _visitorMessages.find(reply->getMsgId());
+ if (it == _visitorMessages.end()) return false;
+ id = it->second.id;
+ _visitorMessages.erase(it);
+ }
+ _visitorThread[id % _visitorThread.size()].first->processMessage(id, reply);
+ return true;
+}
+
+void
+VisitorManager::send(const std::shared_ptr<api::StorageCommand>& cmd,
+ Visitor& visitor)
+{
+ assert(cmd->getType() == api::MessageType::INTERNAL);
+ // Only add to internal state if not destroy iterator command, as
+ // these are considered special-cased fire-and-forget commands
+ // that don't have replies.
+ if (static_cast<const api::InternalCommand&>(*cmd).getType()
+ != DestroyIteratorCommand::ID)
+ {
+ MessageInfo inf;
+ inf.id = visitor.getVisitorId();
+ inf.timestamp = _component.getClock().getTimeInSeconds().getTime();
+ inf.timeout = cmd->getTimeout();
+
+ if (cmd->getAddress()) {
+ inf.destination = cmd->getAddress()->toString();
+ }
+
+ vespalib::MonitorGuard sync(_visitorLock);
+ _visitorMessages[cmd->getMsgId()] = inf;
+ }
+ mbus::Trace & trace = cmd->getTrace();
+ MBUS_TRACE(trace, 6, "Requesting data from persistence layer: " + cmd->toString());
+ LOG(spam, "Sending visitor command %s down.", cmd->getType().getName().c_str());
+ sendDown(cmd);
+}
+
+void
+VisitorManager::send(const std::shared_ptr<api::StorageReply>& reply)
+{
+ if (reply->getType() == api::MessageType::INTERNAL_REPLY) {
+ LOG(spam, "Received an internal reply");
+ std::shared_ptr<api::InternalReply> rep(
+ std::dynamic_pointer_cast<api::InternalReply>(reply));
+ assert(rep.get());
+ if (onInternalReply(rep)) return;
+ }
+ LOG(spam, "Sending visitor reply %s up.",
+ reply->getType().getName().c_str());
+ sendUp(reply);
+}
+
+// Attempt to schedule a new visitor. visitorLock must be held at
+// the time of the call and will be unlocked if scheduling takes
+// place. Returns true if a visitor was scheduled, false otherwise.
+bool
+VisitorManager::attemptScheduleQueuedVisitor(vespalib::MonitorGuard& visitorLock)
+{
+ if (_visitorQueue.empty()) return false;
+
+ uint32_t totCount;
+ getLeastLoadedThread(_visitorThread, totCount);
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ _visitorQueue.peekNextCommand());
+ assert(cmd.get());
+ if (totCount < maximumConcurrent(*cmd)) {
+ std::pair<api::CreateVisitorCommand::SP, time_t> cmd2(
+ _visitorQueue.releaseNextCommand());
+ assert(cmd == cmd2.first);
+ scheduleVisitor(cmd, true, visitorLock);
+ framework::MicroSecTime time(_component.getClock().getTimeInMicros());
+ _metrics->queueWaitTime.addValue(time.getTime() - cmd2.second);
+ // visitorLock is unlocked at this point
+ return true;
+ }
+ return false;
+}
+
+void
+VisitorManager::closed(api::VisitorId id)
+{
+ vespalib::MonitorGuard sync(_visitorLock);
+ std::map<api::VisitorId, std::string>& usedIds(
+ _visitorThread[id % _visitorThread.size()].second);
+
+ std::map<api::VisitorId, std::string>::iterator it = usedIds.find(id);
+ if (it == usedIds.end()) {
+ LOG(warning, "VisitorManager::closed() called multiple times for the "
+ "same visitor. This was not intended.");
+ return;
+ }
+ framework::MicroSecTime time(_component.getClock().getTimeInMicros());
+ _recentlyDeletedVisitors.push_back(
+ std::make_pair(it->second, time));
+ _nameToId.erase(it->second);
+ usedIds.erase(it);
+ while (_recentlyDeletedVisitors.front().second + _recentlyDeletedMaxTime
+ < time)
+ {
+ _recentlyDeletedVisitors.pop_front();
+ }
+
+ // Schedule as many visitors as we are allowed to for the highest
+ // prioritized queued commands
+ bool scheduled = attemptScheduleQueuedVisitor(sync);
+ while (scheduled) {
+ // At this point, sync is unlocked, so we have to re-acquire
+ // the lock
+ vespalib::MonitorGuard resync(_visitorLock);
+ scheduled = attemptScheduleQueuedVisitor(resync);
+ }
+}
+
+/**
+ * The string in page is just searched through using string::find. Terms found
+ * are printed.. Known terms:
+ *
+ * visitor - Print info on visitor given
+ * allvisitors - Print all info on all visitors
+ *
+ * verbose - If set, print extra details.
+ */
+void
+VisitorManager::reportHtmlStatus(std::ostream& out,
+ const framework::HttpUrlPath& path) const
+{
+ bool showStatus = !path.hasAttribute("visitor");
+ bool verbose = path.hasAttribute("verbose");
+ bool showAll = path.hasAttribute("allvisitors");
+
+ // Print menu
+ out << "<font size=\"-1\">[ <a href=\"/\">Back to top</a>"
+ << " | <a href=\"?" << (verbose ? "verbose" : "")
+ << "\">Main visitor manager status page</a>"
+ << " | <a href=\"?allvisitors" << (verbose ? "&verbose" : "")
+ << "\">Show all visitors</a>"
+ << " | <a href=\"?" << (verbose ? "notverbose" : "verbose");
+ if (!showStatus) out << "&visitor=" << path.get("visitor", std::string(""));
+ if (showAll) out << "&allvisitors";
+ out << "\">" << (verbose ? "Less verbose" : "More verbose") << "</a>\n"
+ << " ]</font><br><br>\n";
+
+ uint32_t visitorCount = 0;
+ if (showStatus) {
+ vespalib::MonitorGuard sync(_visitorLock);
+ if (verbose) {
+ out << "<h3>Currently running visitors</h3>\n";
+ for (uint32_t i=0; i<_visitorThread.size(); ++i) {
+ visitorCount += _visitorThread[i].second.size();
+ out << "Thread " << i << ":";
+ if (_visitorThread[i].second.size() == 0) {
+ out << " none";
+ } else {
+ for (std::map<api::VisitorId,std::string>::const_iterator it
+ = _visitorThread[i].second.begin();
+ it != _visitorThread[i].second.end(); it++)
+ {
+ out << " " << it->second << " (" << it->first << ")";
+ }
+ }
+ out << "<br>\n";
+ }
+ out << "<h3>Queued visitors</h3>\n<ul>\n";
+
+ framework::MicroSecTime time(
+ _component.getClock().getTimeInMicros());
+ for (CommandQueue<api::CreateVisitorCommand>::const_iterator it
+ = _visitorQueue.begin(); it != _visitorQueue.end(); ++it)
+ {
+ std::shared_ptr<api::CreateVisitorCommand> cmd(
+ it->_command);
+ assert(cmd.get());
+ out << "<li>" << cmd->getInstanceId() << " - "
+ << cmd->getQueueTimeout() << ", remaining timeout "
+ << (it->_time - time.getTime()) / 1000000 << " ms\n";
+ }
+ if (_visitorQueue.empty()) {
+ out << "None\n";
+ }
+ out << "</ul>\n";
+ if (_visitorMessages.size() > 0) {
+ out << "<h3>Waiting for the following visitor replies</h3>"
+ << "\n<table><tr>"
+ << "<th>Storage API message id</th>"
+ << "<th>Visitor id</th>"
+ << "<th>Timestamp</th>"
+ << "<th>Timeout</th>"
+ << "<th>Destination</th>"
+ << "</tr>\n";
+ for (std::map<api::StorageMessage::Id,
+ MessageInfo>::const_iterator it
+ = _visitorMessages.begin();
+ it != _visitorMessages.end(); ++it)
+ {
+ out << "<tr>"
+ << "<td>" << it->first << "</td>"
+ << "<td>" << it->second.id << "</td>"
+ << "<td>" << it->second.timestamp << "</td>"
+ << "<td>" << it->second.timeout << "</td>"
+ << "<td>" << it->second.destination << "</td>"
+ << "</tr>\n";
+ }
+ out << "</table>\n";
+ } else {
+ out << "<h3>Not waiting for any visitor replies</h3>\n";
+ }
+ }
+ out << "\n<p>Running " << visitorCount << " visitors. Max concurrent "
+ << "visitors: fixed = " << _maxFixedConcurrentVisitors
+ << ", variable = " << _maxVariableConcurrentVisitors
+ << ", waiting visitors " << _visitorQueue.size() << "<br>\n";
+ }
+ // Only one can access status at a time as _statusRequest only holds
+ // answers from one request at a time
+ vespalib::LockGuard sync(_statusLock);
+ vespalib::MonitorGuard waiter(_statusMonitor);
+ // Send all subrequests
+ uint32_t parts = _visitorThread.size();
+ for (uint32_t i=0; i<parts; ++i) {
+ std::shared_ptr<RequestStatusPage> cmd(new RequestStatusPage(path));
+ std::ostringstream token;
+ token << "Visitor thread " << i;
+ cmd->setSortToken(token.str());
+ _visitorThread[i].first->processMessage(0, cmd);
+ }
+ // Wait for all replies to come back
+ while (_statusRequest.size() < parts) {
+ waiter.wait();
+ }
+ std::sort(_statusRequest.begin(), _statusRequest.end(), StatusReqSorter());
+
+ // Create output
+ for (uint32_t i=0; i<_statusRequest.size(); ++i) {
+ out << "<h2>" << _statusRequest[i]->getSortToken()
+ << "</h2>\n" << _statusRequest[i]->getStatus() << "\n";
+ }
+ _statusRequest.clear();
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/visiting/visitormanager.h b/storage/src/vespa/storage/visiting/visitormanager.h
new file mode 100644
index 00000000000..556a49e6227
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/visitormanager.h
@@ -0,0 +1,184 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::VisitorManager
+ * @ingroup storageserver
+ *
+ * @brief Storage module for handling visitors.
+ *
+ * This module will dispatch iterator commands to the persistence layer, and
+ * feed the results to the correct Visitor modules. As long as there are
+ * active visitors, an iterator is running on the persistence layer. New
+ * visitors hook into this stream and remember their starting position. The
+ * iterator will loop round the database and visitors receive EOF when they are
+ * back at their starting position
+ *
+ * @author Fledsbo
+ * @date 2004-3-30
+ * @version $Id$
+ */
+
+#pragma once
+
+#include <vespa/vespalib/util/document_runnable.h>
+#include <vespa/storageapi/message/datagram.h>
+#include <vespa/storageapi/message/internal.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <vespa/storage/common/storagelink.h>
+#include <vespa/storage/visiting/commandqueue.h>
+#include <vespa/storage/visiting/config-stor-visitor.h>
+#include <vespa/storage/visiting/visitor.h>
+#include <vespa/storage/visiting/visitormetrics.h>
+#include <vespa/storage/visiting/visitorthread.h>
+#include <vespa/storageframework/storageframework.h>
+#include <vespa/storageframework/storageframework.h>
+
+namespace storage {
+namespace api {
+ class BucketTimeInterval;
+}
+class RequestStatusPageReply;
+
+class VisitorManager : public framework::Runnable,
+ public StorageLink,
+ public framework::HtmlStatusReporter,
+ private VisitorMessageHandler,
+ private config::IFetcherCallback<vespa::config::content::core::StorVisitorConfig>,
+ private framework::MetricUpdateHook
+{
+private:
+ StorageComponentRegister& _componentRegister;
+ VisitorMessageSessionFactory& _messageSessionFactory;
+ std::vector<std::pair<std::shared_ptr<VisitorThread>,
+ std::map<api::VisitorId, std::string>
+ > > _visitorThread;
+
+ struct MessageInfo {
+ api::VisitorId id;
+ time_t timestamp;
+ uint64_t timeout;
+ std::string destination;
+ };
+
+ std::map<api::StorageMessage::Id, MessageInfo> _visitorMessages;
+ vespalib::Monitor _visitorLock;
+ uint64_t _visitorCounter;
+ config::ConfigFetcher _configFetcher;
+ std::shared_ptr<VisitorMetrics> _metrics;
+ uint32_t _maxFixedConcurrentVisitors;
+ uint32_t _maxVariableConcurrentVisitors;
+ uint32_t _maxVisitorQueueSize;
+ std::map<std::string, api::VisitorId> _nameToId;
+ StorageComponent _component;
+ framework::Thread::UP _thread;
+ CommandQueue<api::CreateVisitorCommand> _visitorQueue;
+ std::deque<std::pair<std::string,
+ framework::MicroSecTime> > _recentlyDeletedVisitors;
+ framework::MicroSecTime _recentlyDeletedMaxTime;
+
+ mutable vespalib::Lock _statusLock; // Only one can get status at a time
+ mutable vespalib::Monitor _statusMonitor; // Notify when done
+ mutable std::vector<std::shared_ptr<RequestStatusPageReply> >
+ _statusRequest;
+ bool _enforceQueueUse;
+ VisitorFactory::Map _visitorFactories;
+
+ VisitorManager(const VisitorManager &);
+ VisitorManager& operator=(const VisitorManager &);
+
+public:
+ VisitorManager(const config::ConfigUri & configUri, StorageComponentRegister&,
+ VisitorMessageSessionFactory&,
+ const VisitorFactory::Map& external = VisitorFactory::Map());
+ virtual ~VisitorManager();
+
+ virtual void onClose();
+
+ virtual void print(std::ostream& out, bool verbose,
+ const std::string& indent) const;
+
+ uint32_t getActiveVisitorCount() const;
+
+ void setTimeBetweenTicks(uint32_t time);
+
+ void setMaxConcurrentVisitors(uint32_t count) { // Used in unit testing
+ _maxFixedConcurrentVisitors = count;
+ _maxVariableConcurrentVisitors = 0;
+ }
+
+ // Used in unit testing
+ void setMaxConcurrentVisitors(uint32_t fixed, uint32_t variable) {
+ _maxFixedConcurrentVisitors = fixed;
+ _maxVariableConcurrentVisitors = variable;
+ }
+
+ void setMaxVisitorQueueSize(uint32_t count) { // Used in unit testing
+ _maxVisitorQueueSize = count;
+ }
+
+ /** For unit testing */
+ VisitorThread& getThread(uint32_t index) {
+ return *_visitorThread[index].first;
+ }
+ /** For unit testing */
+ bool hasPendingMessageState() const;
+
+ void enforceQueueUsage() { _enforceQueueUse = true; }
+
+private:
+ void configure(std::unique_ptr<vespa::config::content::core::StorVisitorConfig>);
+ virtual void run(framework::ThreadHandle&);
+
+ /**
+ * Schedules a visitor for running. onCreateVisitor will typically call
+ * this with skipQueue = false, and closed(id) will typically call it with
+ * skipQueue = true to schedule next visitor in queue.
+ *
+ * @return True if successful, false if failed and reply is sent.
+ */
+ bool scheduleVisitor(const std::shared_ptr<api::CreateVisitorCommand>&,
+ bool skipQueue, vespalib::MonitorGuard& visitorLock);
+
+ bool onCreateVisitor(const std::shared_ptr<api::CreateVisitorCommand>&);
+
+ bool onDown(const std::shared_ptr<api::StorageMessage>& r);
+ bool onInternalReply(const std::shared_ptr<api::InternalReply>& r);
+ bool processReply(const std::shared_ptr<api::StorageReply>&);
+
+ /**
+ * Internal function that is used for scheduling the highest
+ * priority visitor--if any--for running. Called automatically
+ * by closed(id). visitorLock must be held at the time of the call,
+ * and will in the case of a successful scheduling be unlocked, as
+ * scheduleVisitor() is called internally. If more* visitors are
+ * to be attempted scheduled, the lock must first be re-acquired.
+ *
+ * @return true if a visitor was removed from the queue and scheduled,
+ * false otherwise.
+ */
+ bool attemptScheduleQueuedVisitor(vespalib::MonitorGuard& visitorLock);
+
+ // VisitorMessageHandler implementation
+ void send(const std::shared_ptr<api::StorageCommand>& cmd,
+ Visitor& visitor);
+ void send(const std::shared_ptr<api::StorageReply>& reply);
+ void closed(api::VisitorId id);
+
+ // Status::Reporter implementation
+ virtual void reportHtmlStatus(std::ostream&,
+ const framework::HttpUrlPath&) const;
+
+ /**
+ * The maximum amount of concurrent visitors for a priority is given
+ * by the formula: fixed + variable * ((255 - priority) / 255)
+ */
+ uint32_t maximumConcurrent(const api::CreateVisitorCommand& cmd) const {
+ return _maxFixedConcurrentVisitors + static_cast<uint32_t>(
+ _maxVariableConcurrentVisitors
+ * ((255.0 - cmd.getPriority()) / 255.0));
+ }
+
+ void updateMetrics(const MetricLockGuard &) override;
+};
+
+}
+
diff --git a/storage/src/vespa/storage/visiting/visitormessagesession.h b/storage/src/vespa/storage/visiting/visitormessagesession.h
new file mode 100644
index 00000000000..d61cde818d5
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/visitormessagesession.h
@@ -0,0 +1,28 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * \class storage::VisitorMessageSession
+ */
+#pragma once
+
+#include <vespa/messagebus/result.h>
+
+namespace documentapi {
+ class DocumentMessage;
+}
+
+namespace storage {
+
+struct VisitorMessageSession {
+ typedef std::unique_ptr<VisitorMessageSession> UP;
+
+ virtual ~VisitorMessageSession() {}
+
+ virtual mbus::Result send(std::unique_ptr<documentapi::DocumentMessage>) = 0;
+
+ /** @return Returns the number of pending messages this session has. */
+ virtual uint32_t pending() = 0;
+
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/visiting/visitormessagesessionfactory.h b/storage/src/vespa/storage/visiting/visitormessagesessionfactory.h
new file mode 100644
index 00000000000..3cc6015a926
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/visitormessagesessionfactory.h
@@ -0,0 +1,25 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/storage/visiting/visitormessagesession.h>
+
+namespace storage {
+
+class Visitor;
+class VisitorThread;
+
+struct VisitorMessageSessionFactory {
+ typedef std::unique_ptr<VisitorMessageSessionFactory> UP;
+
+ virtual ~VisitorMessageSessionFactory() {}
+
+ virtual VisitorMessageSession::UP createSession(Visitor&,
+ VisitorThread&) = 0;
+
+ virtual documentapi::Priority::Value toDocumentPriority(
+ uint8_t storagePriority) const = 0;
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/visiting/visitormetrics.h b/storage/src/vespa/storage/visiting/visitormetrics.h
new file mode 100644
index 00000000000..100c2e70896
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/visitormetrics.h
@@ -0,0 +1,76 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::VisitorMetrics
+ * @ingroup visiting
+ *
+ * @brief Metrics for visiting.
+ *
+ * @version $Id$
+ */
+#pragma once
+
+#include <vespa/metrics/metrics.h>
+#include <vespa/storage/visiting/visitorthreadmetrics.h>
+
+namespace storage {
+
+struct VisitorMetrics : public metrics::MetricSet
+{
+ metrics::LongAverageMetric queueSize;
+ metrics::LongCountMetric queueSkips;
+ metrics::LongCountMetric queueFull;
+ metrics::LongAverageMetric queueWaitTime;
+ metrics::LongAverageMetric queueTimeoutWaitTime;
+ metrics::LongAverageMetric queueEvictedWaitTime;
+ std::vector<std::shared_ptr<VisitorThreadMetrics> > threads;
+ metrics::SumMetric<MetricSet> sum;
+
+ VisitorMetrics()
+ : metrics::MetricSet("visitor", "visitor", ""),
+ queueSize("cv_queuesize", "", "Size of create visitor queue", this),
+ queueSkips("cv_skipqueue", "",
+ "Number of times we could skip queue as we had free visitor "
+ "spots", this),
+ queueFull("cv_queuefull", "",
+ "Number of create visitor messages failed as queue is full",
+ this),
+ queueWaitTime("cv_queuewaittime", "",
+ "Milliseconds waiting in create visitor queue, for visitors "
+ "that was added to visitor queue but scheduled later", this),
+ queueTimeoutWaitTime("cv_queuetimeoutwaittime", "",
+ "Milliseconds waiting in create visitor queue, for visitors "
+ "that timed out while in the visitor quueue", this),
+ queueEvictedWaitTime("cv_queueevictedwaittime", "",
+ "Milliseconds waiting in create visitor queue, for visitors "
+ "that was evicted from queue due to higher priority visitors "
+ "coming", this),
+ threads(),
+ sum("allthreads", "sum", "", this)
+ {
+ queueSize.unsetOnZeroValue();
+ }
+
+ void initThreads(uint16_t threadCount,
+ const metrics::LoadTypeSet& loadTypes)
+ {
+ if (!threads.empty()) {
+ throw vespalib::IllegalStateException(
+ "Cannot initialize visitor metrics twice", VESPA_STRLOC);
+ }
+ threads.clear();
+ threads.resize(threadCount);
+ for (uint32_t i=0; i<threads.size(); ++i) {
+ std::ostringstream ost;
+ ost << "visitor_thread_" << i;
+ threads[i].reset(new VisitorThreadMetrics(
+ ost.str(),
+ ost.str(),
+ loadTypes));
+ registerMetric(*threads[i]);
+ sum.addMetricToSum(*threads[i]);
+ }
+ }
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/visiting/visitorthread.cpp b/storage/src/vespa/storage/visiting/visitorthread.cpp
new file mode 100644
index 00000000000..e202e259cc9
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/visitorthread.cpp
@@ -0,0 +1,818 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/fastos/fastos.h>
+#include <vespa/storage/visiting/visitorthread.h>
+
+#include <vespa/document/repo/documenttyperepo.h>
+#include <vespa/document/select/bodyfielddetector.h>
+#include <vespa/document/select/orderingselector.h>
+#include <vespa/document/select/parser.h>
+#include <vespa/log/log.h>
+#include <vespa/messagebus/rpcmessagebus.h>
+#include <vespa/storage/common/statusmessages.h>
+#include <vespa/storage/config/config-stor-server.h>
+#include <vespa/storage/visiting/messages.h>
+#include <vespa/storageapi/message/datagram.h>
+#include <vespa/storageapi/message/visitor.h>
+#include <algorithm>
+#include <locale>
+#include <iostream>
+#include <string>
+
+LOG_SETUP(".visitor.thread");
+
+using storage::api::ReturnCode;
+
+namespace storage {
+
+VisitorThread::Event::Event(Event&& other)
+ : _visitorId(other._visitorId),
+ _message(other._message),
+ _mbusReply(std::move(other._mbusReply)),
+ _timer(other._timer),
+ _type(other._type)
+{
+}
+
+VisitorThread::Event&
+VisitorThread::Event::operator= (Event&& other)
+{
+ _visitorId = other._visitorId;
+ _message = other._message;
+ _mbusReply = std::move(other._mbusReply);
+ _timer = other._timer;
+ _type = other._type;
+ return *this;
+}
+
+VisitorThread::Event::Event(
+ api::VisitorId visitor,
+ const std::shared_ptr<api::StorageMessage>& msg)
+ : _visitorId(visitor),
+ _message(msg),
+ _timer(),
+ _type(PERSISTENCE)
+{
+}
+
+VisitorThread::Event::Event(
+ api::VisitorId visitor,
+ mbus::Reply::UP reply)
+ : _visitorId(visitor),
+ _mbusReply(std::move(reply)),
+ _timer(),
+ _type(MBUS)
+{
+}
+
+namespace {
+ vespalib::stringref getThreadName(uint32_t i) {
+ vespalib::asciistream name;
+ name << "Visitor thread " << i;
+ return name.str();
+ }
+}
+
+VisitorThread::VisitorThread(uint32_t threadIndex,
+ StorageComponentRegister& componentRegister,
+ VisitorMessageSessionFactory& messageSessionFac,
+ VisitorFactory::Map& visitorFactories,
+ VisitorThreadMetrics& metrics,
+ VisitorMessageHandler& sender)
+ : _visitors(),
+ _recentlyCompleted(),
+ _queue(),
+ _queueMonitor(),
+ _currentlyRunningVisitor(_visitors.end()),
+ _messageSender(sender),
+ _metrics(metrics),
+ _threadIndex(threadIndex),
+ _disconnectedVisitorTimeout(0), // Need config to set values
+ _ignoreNonExistingVisitorTimeLimit(0),
+ _defaultParallelIterators(0),
+ _iteratorsPerBucket(1),
+ _defaultPendingMessages(0),
+ _defaultDocBlockSize(0),
+ _visitorMemoryUsageLimit(UINT32_MAX),
+ _defaultDocBlockTimeout(180000),
+ _timeBetweenTicks(1000),
+ _component(componentRegister, getThreadName(threadIndex)),
+ _messageSessionFactory(messageSessionFac),
+ _visitorFactories(visitorFactories),
+ _memoryBufferAlloc(
+ _component.getMemoryManager().getAllocationType("VISITOR_BUFFER"))
+{
+ framework::MilliSecTime maxProcessingTime(30 * 1000);
+ framework::MilliSecTime waitTime(1000);
+ _thread = _component.startThread(*this, maxProcessingTime, waitTime);
+ _component.registerMetricUpdateHook(*this, framework::SecondTime(5));
+}
+
+VisitorThread::~VisitorThread()
+{
+ if (_thread.get() != 0) {
+ _thread->interruptAndJoin(&_queueMonitor);
+ }
+}
+
+void
+VisitorThread::updateMetrics(const MetricLockGuard &) {
+ vespalib::MonitorGuard sync(_queueMonitor);
+ _metrics.queueSize.addValue(_queue.size());
+}
+
+void
+VisitorThread::shutdown()
+{
+ // Stop event thread
+ if (_thread.get() != 0) {
+ _thread->interruptAndJoin(&_queueMonitor);
+ _thread.reset(0);
+ }
+
+ // Answer all queued up commands and clear queue
+ {
+ vespalib::MonitorGuard sync(_queueMonitor);
+ for (std::deque<Event>::iterator it = _queue.begin();
+ it != _queue.end(); ++it)
+ {
+ if (it->_message.get()) {
+ if (!it->_message->getType().isReply()
+ && (it->_message->getType() != api::MessageType::INTERNAL
+ || static_cast<const api::InternalCommand&>(*it->_message)
+ .getType() != PropagateVisitorConfig::ID))
+ {
+ std::shared_ptr<api::StorageReply> reply(
+ static_cast<api::StorageCommand&>(*it->_message)
+ .makeReply().release());
+ reply->setResult(api::ReturnCode(api::ReturnCode::ABORTED,
+ "Shutting down storage node."));
+ _messageSender.send(reply);
+ }
+ }
+ }
+ _queue.clear();
+ }
+ // Close all visitors. Send create visitor replies
+ for (VisitorMap::iterator it = _visitors.begin();
+ it != _visitors.end();)
+ {
+ LOG(debug, "Force-closing visitor %s as we're shutting down.",
+ it->second->getVisitorName().c_str());
+ _currentlyRunningVisitor = it++;
+ _currentlyRunningVisitor->second->forceClose();
+ close();
+ }
+}
+
+void
+VisitorThread::processMessage(api::VisitorId id,
+ const std::shared_ptr<api::StorageMessage>& msg)
+{
+ Event m(id, msg);
+ vespalib::MonitorGuard sync(_queueMonitor);
+ _queue.push_back(Event(id, msg));
+ sync.signal();
+}
+
+VisitorThread::Event
+VisitorThread::popNextQueuedEventIfAvailable()
+{
+ vespalib::MonitorGuard guard(_queueMonitor);
+ if (!_queue.empty()) {
+ Event e(std::move(_queue.front()));
+ _queue.pop_front();
+ return e;
+ }
+ return {};
+}
+
+void
+VisitorThread::run(framework::ThreadHandle& thread)
+{
+ LOG(debug, "Started visitor thread with pid %d.", getpid());
+ // Loop forever. Process the visiting input message queue and periodicly
+ // give visitors something to trigger of.
+ Event entry;
+ while (!thread.interrupted()) {
+ thread.registerTick(framework::PROCESS_CYCLE);
+
+ // Get next message from input queue
+ entry = popNextQueuedEventIfAvailable();
+ if (entry.empty()) {
+ // If none, give visitors something to trigger of.
+ tick();
+ vespalib::MonitorGuard guard(_queueMonitor);
+ if (_queue.empty()) {
+ guard.wait(_timeBetweenTicks);
+ thread.registerTick(framework::WAIT_CYCLE);
+ }
+ continue;
+ } else {
+ // Don't count propagate visitor commands as actual visitor
+ // commands. (Not counting it makes metric be unused and
+ // disappear when no visiting is done)
+ if (entry._message.get() &&
+ (entry._message->getType() != api::MessageType::INTERNAL
+ || static_cast<api::InternalCommand&>(*entry._message).getType() != PropagateVisitorConfig::ID))
+ {
+ entry._timer.stop(_metrics.averageQueueWaitingTime[entry._message->getLoadType()]);
+ }
+ }
+
+ bool handled = false;
+ ReturnCode result(ReturnCode::OK);
+ try{
+ _currentlyRunningVisitor = _visitors.find(entry._visitorId);
+
+ if (entry._message.get()) {
+ // If visitor doesn't exist, log failure only if it wasn't
+ // recently deleted
+ if (_currentlyRunningVisitor == _visitors.end() &&
+ entry._message->getType() != api::MessageType::VISITOR_CREATE &&
+ entry._message->getType() != api::MessageType::INTERNAL)
+ {
+ handleNonExistingVisitorCall(entry, result);
+ } else {
+ handled = entry._message->callHandler(*this, entry._message);
+ }
+ } else {
+ if (_currentlyRunningVisitor == _visitors.end()) {
+ handleNonExistingVisitorCall(entry, result);
+ } else {
+ _currentlyRunningVisitor->second->handleDocumentApiReply(
+ std::move(entry._mbusReply), _metrics);
+ if (_currentlyRunningVisitor->second->isCompleted()) {
+ close();
+ }
+ handled = true;
+ }
+ }
+
+ if (!handled) {
+ result = ReturnCode(ReturnCode::IGNORED, "Unwanted");
+ }
+ } catch (std::exception& e) {
+ vespalib::asciistream ost;
+ ost << "Failed to handle visitor message:" << e.what();
+ LOG(warning, "Failed handling visitor message: %s", ost.str().c_str());
+ result = ReturnCode(ReturnCode::INTERNAL_FAILURE, ost.str());
+ if (entry._message.get() && entry._message->getType() == api::MessageType::VISITOR_CREATE) {
+ _messageSender.closed(entry._visitorId);
+ _metrics.failedVisitors[entry._message->getLoadType()].inc(1);
+ }
+ }
+ _currentlyRunningVisitor = _visitors.end();
+
+ if (!handled && entry._message.get() &&
+ !entry._message->getType().isReply())
+ {
+ api::StorageCommand& cmd(
+ dynamic_cast<api::StorageCommand&>(*entry._message));
+ std::shared_ptr<api::StorageReply> reply(
+ cmd.makeReply().release());
+ reply->setResult(result);
+ _messageSender.send(reply);
+ }
+ }
+}
+
+void
+VisitorThread::tick()
+{
+ // Give all visitors an event
+ for (VisitorMap::iterator it = _visitors.begin(); it != _visitors.end();)
+ {
+ LOG(spam, "Giving tick to visitor %s.",
+ it->second->getVisitorName().c_str());
+ it->second->continueVisitor();
+ if (it->second->isCompleted()) {
+ LOG(debug, "Closing visitor %s. Visitor marked as completed",
+ it->second->getVisitorName().c_str());
+ _currentlyRunningVisitor = it++;
+ close();
+ } else {
+ ++it;
+ }
+ }
+}
+
+void
+VisitorThread::close()
+{
+ framework::MicroSecTime closeTime(_component.getClock().getTimeInMicros());
+
+ Visitor& v = *_currentlyRunningVisitor->second;
+
+ documentapi::LoadType loadType(v.getLoadType());
+
+ _metrics.averageVisitorLifeTime[loadType].addValue(
+ (closeTime - v.getStartTime()).getMillis().getTime());
+ v.finalize();
+ _messageSender.closed(_currentlyRunningVisitor->first);
+ if (v.failed()) {
+ _metrics.abortedVisitors[loadType].inc(1);
+ } else {
+ _metrics.completedVisitors[loadType].inc(1);
+ }
+ framework::SecondTime currentTime(
+ _component.getClock().getTimeInSeconds());
+ trimRecentlyCompletedList(currentTime);
+ _recentlyCompleted.push_back(std::make_pair(
+ _currentlyRunningVisitor->first, currentTime));
+ _visitors.erase(_currentlyRunningVisitor);
+ _currentlyRunningVisitor = _visitors.end();
+}
+
+void
+VisitorThread::trimRecentlyCompletedList(framework::SecondTime currentTime)
+{
+ framework::SecondTime recentLimit(
+ currentTime - framework::SecondTime(30));
+ // Dump all elements that aren't recent anymore
+ while (!_recentlyCompleted.empty()
+ && _recentlyCompleted.front().second < recentLimit)
+ {
+ _recentlyCompleted.pop_front();
+ }
+}
+
+void
+VisitorThread::handleNonExistingVisitorCall(const Event& entry,
+ ReturnCode& code)
+{
+ // Get current time. Set the time that is the oldest still recent.
+ framework::SecondTime currentTime(
+ _component.getClock().getTimeInSeconds());;
+ trimRecentlyCompletedList(currentTime);
+
+ // Go through all recent visitors. Ignore request if recent
+ for (std::deque<std::pair<api::VisitorId, framework::SecondTime> >
+ ::iterator it = _recentlyCompleted.begin();
+ it != _recentlyCompleted.end(); ++it)
+ {
+ if (it->first == entry._visitorId) {
+ code = ReturnCode(ReturnCode::ILLEGAL_PARAMETERS,
+ "Visitor recently completed/failed/aborted.");
+ return;
+ }
+ }
+
+ vespalib::asciistream ost;
+ ost << "Visitor " << entry._visitorId << " no longer exist";
+ code = ReturnCode(ReturnCode::ILLEGAL_PARAMETERS, ost.str());
+}
+
+/**
+ * Utility function to get a visitor instance from a given library.
+ */
+std::shared_ptr<Visitor>
+VisitorThread::createVisitor(const vespalib::stringref & libName,
+ const vdslib::Parameters& params,
+ vespalib::asciistream & error)
+{
+ vespalib::string str = libName;
+ std::transform(str.begin(), str.end(), str.begin(), tolower);
+
+ VisitorFactory::Map::iterator it(_visitorFactories.find(str));
+ if (it == _visitorFactories.end()) {
+ error << "Visitor library " << str << " not found.";
+ return std::shared_ptr<Visitor>();
+ }
+
+ LibMap::iterator libIter = _libs.find(str);
+ if (libIter == _libs.end()) {
+ _libs[str] = std::shared_ptr<VisitorEnvironment>(
+ it->second->makeVisitorEnvironment(_component).release());
+ libIter = _libs.find(str);
+ }
+
+ try{
+ std::shared_ptr<Visitor> visitor(it->second->makeVisitor(
+ _component, *libIter->second, params));
+ if (!visitor.get()) {
+ error << "Factory function in '" << str << "' failed.";
+ }
+ return visitor;
+ } catch (std::exception& e) {
+ error << "Failed to create visitor instance of type " << libName
+ << ": " << e.what();
+ return std::shared_ptr<Visitor>();
+ }
+}
+
+namespace {
+ std::unique_ptr<api::StorageMessageAddress>
+ getDataAddress(const api::CreateVisitorCommand& cmd)
+ {
+ return std::unique_ptr<api::StorageMessageAddress>(
+ new api::StorageMessageAddress(
+ mbus::Route::parse(cmd.getDataDestination())));
+ }
+
+ std::unique_ptr<api::StorageMessageAddress>
+ getControlAddress(const api::CreateVisitorCommand& cmd)
+ {
+ return std::unique_ptr<api::StorageMessageAddress>(
+ new api::StorageMessageAddress(
+ mbus::Route::parse(cmd.getControlDestination())));
+ }
+
+void
+validateDocumentSelection(const document::DocumentTypeRepo& repo,
+ const document::select::Node& selection)
+{
+ // Force building a field path for all field references since field path
+ // correctness is not checked during regular document selection parsing.
+ // This is not in any way speed optimal, but is far less intrusive and
+ // risky than trying to rewrite the logic of Visitor/VisitorThread
+ // to handle exceptions thrown during attach()/continueVisitor().
+ try {
+ document::select::BodyFieldDetector detector(repo);
+ selection.visit(detector);
+ } catch (vespalib::IllegalArgumentException& e) {
+ throw document::select::ParsingFailedException(e.getMessage());
+ }
+}
+
+}
+
+bool
+VisitorThread::onCreateVisitor(
+ const std::shared_ptr<api::CreateVisitorCommand>& cmd)
+{
+ metrics::MetricTimer visitorTimer;
+ assert(_defaultDocBlockSize); // Ensure we've gotten a config
+ assert(_currentlyRunningVisitor == _visitors.end());
+ ReturnCode result(ReturnCode::OK);
+ std::unique_ptr<document::select::Node> docSelection;
+ std::unique_ptr<api::StorageMessageAddress> controlAddress;
+ std::unique_ptr<api::StorageMessageAddress> dataAddress;
+ std::shared_ptr<Visitor> visitor;
+ do {
+ // If no buckets are specified, fail command
+ if (cmd->getBuckets().size() == 0) {
+ result = ReturnCode(ReturnCode::ILLEGAL_PARAMETERS,
+ "No buckets specified");
+ LOG(warning, "CreateVisitor(%s): No buckets specified. Aborting.",
+ cmd->getInstanceId().c_str());
+ break;
+ }
+ // Get the source address
+ controlAddress = getControlAddress(*cmd);
+ dataAddress = getDataAddress(*cmd);
+ // Attempt to load library containing visitor
+ vespalib::asciistream errors;
+ visitor = createVisitor(cmd->getLibraryName(), cmd->getParameters(),
+ errors);
+ if (visitor.get() == 0) {
+ result = ReturnCode(ReturnCode::ILLEGAL_PARAMETERS, errors.str());
+ LOG(warning, "CreateVisitor(%s): Failed to create visitor: %s",
+ cmd->getInstanceId().c_str(), errors.str().c_str());
+ break;
+ }
+ visitor->setAllocationType(_memoryBufferAlloc);
+ visitor->setMemoryManager(_component.getMemoryManager());
+ // Set visitor parameters
+ if (cmd->getMaximumPendingReplyCount() != 0) {
+ visitor->setMaxPending(cmd->getMaximumPendingReplyCount());
+ } else {
+ visitor->setMaxPending(_defaultPendingMessages);
+ }
+
+ visitor->setFieldSet(cmd->getFieldSet());
+
+ if (cmd->visitRemoves()) {
+ visitor->visitRemoves();
+ }
+
+ visitor->setMaxParallel(_defaultParallelIterators);
+ visitor->setMaxParallelPerBucket(_iteratorsPerBucket);
+
+ visitor->setDocBlockSize(_defaultDocBlockSize);
+ visitor->setMemoryUsageLimit(_visitorMemoryUsageLimit);
+
+ visitor->setDocBlockTimeout(_defaultDocBlockTimeout);
+ visitor->setVisitorInfoTimeout(_defaultVisitorInfoTimeout);
+ visitor->setOwnNodeIndex(_component.getIndex());
+
+ // Parse document selection
+ try{
+ if (cmd->getDocumentSelection() != "") {
+ document::DocumentTypeRepo::SP repo(
+ _component.getTypeRepo());
+ const document::BucketIdFactory& idFactory(
+ _component.getBucketIdFactory());
+ document::select::Parser parser(*repo, idFactory);
+ docSelection = parser.parse(cmd->getDocumentSelection());
+ validateDocumentSelection(*repo, *docSelection);
+ }
+ } catch (document::DocumentTypeNotFoundException& e) {
+ vespalib::asciistream ost;
+ ost << "Failed to parse document select string '"
+ << cmd->getDocumentSelection() << "': " << e.getMessage();
+ result = ReturnCode(ReturnCode::ILLEGAL_PARAMETERS, ost.str());
+ LOG(warning, "CreateVisitor(%s): %s",
+ cmd->getInstanceId().c_str(), ost.str().c_str());
+ break;
+ } catch (document::select::ParsingFailedException& e) {
+ vespalib::asciistream ost;
+ ost << "Failed to parse document select string '"
+ << cmd->getDocumentSelection() << "': " << e.getMessage();
+ result = ReturnCode(ReturnCode::ILLEGAL_PARAMETERS, ost.str());
+ LOG(warning, "CreateVisitor(%s): %s",
+ cmd->getInstanceId().c_str(), ost.str().c_str());
+ break;
+ }
+ LOG(debug, "CreateVisitor(%s): Successfully created visitor",
+ cmd->getInstanceId().c_str());
+ // Insert visitor prior to creating successful reply.
+ } while (false);
+ // Start the visitor last, as to ensure client will receive
+ // visitor create reply first, and that all errors we could detect
+ // resulted in proper error code in reply..
+ if (result.success()) {
+ _visitors[cmd->getVisitorId()] = visitor;
+ try{
+ std::unique_ptr<document::OrderingSpecification> order;
+ if (docSelection.get()) {
+ document::OrderingSelector selector;
+ order = selector.select(*docSelection,
+ cmd->getVisitorOrdering());
+ }
+ VisitorMessageSession::UP messageSession(
+ _messageSessionFactory.createSession(*visitor, *this));
+ documentapi::Priority::Value documentPriority =
+ _messageSessionFactory.toDocumentPriority(cmd->getPriority());
+ visitor->start(cmd->getVisitorId(),
+ cmd->getVisitorCmdId(),
+ cmd->getInstanceId(),
+ cmd->getBuckets(),
+ framework::MicroSecTime(cmd->getFromTime()),
+ framework::MicroSecTime(cmd->getToTime()),
+ std::move(docSelection),
+ cmd->getDocumentSelection(),
+ std::move(order),
+ _messageSender,
+ std::move(messageSession),
+ documentPriority);
+ visitor->attach(cmd, *controlAddress, *dataAddress,
+ framework::MilliSecTime(cmd->getTimeout()));
+ } catch (std::exception& e) {
+ // We don't handle exceptions from this code, as we've
+ // added visitor to internal structs we'll end up calling
+ // close() twice.
+ LOG(error, "Got exception we can't handle: %s", e.what());
+ assert(false);
+ }
+ _metrics.createdVisitors[visitor->getLoadType()].inc(1);
+ visitorTimer.stop(_metrics.averageVisitorCreationTime[visitor->getLoadType()]);
+ } else {
+ // Send reply
+ std::shared_ptr<api::CreateVisitorReply> reply(
+ new api::CreateVisitorReply(*cmd));
+ reply->setResult(result);
+ _messageSender.closed(cmd->getVisitorId());
+ _messageSender.send(reply);
+ }
+ return true;
+}
+
+void
+VisitorThread::handleMessageBusReply(mbus::Reply::UP reply,
+ Visitor& visitor)
+{
+ vespalib::MonitorGuard sync(_queueMonitor);
+ _queue.push_back(Event(visitor.getVisitorId(), std::move(reply)));
+ sync.broadcast();
+}
+
+bool
+VisitorThread::onInternal(const std::shared_ptr<api::InternalCommand>& cmd)
+{
+ switch (cmd->getType()) {
+ case PropagateVisitorConfig::ID:
+ {
+ PropagateVisitorConfig& pcmd(
+ dynamic_cast<PropagateVisitorConfig&>(*cmd));
+ const vespa::config::content::core::StorVisitorConfig& config(pcmd.getConfig());
+ if (_defaultDocBlockSize != 0) { // Live update
+ LOG(config, "Updating visitor thread configuration in visitor "
+ "thread %u: "
+ "Current config(disconnectedVisitorTimeout %u,"
+ " ignoreNonExistingVisitorTimeLimit %u,"
+ " defaultParallelIterators %u,"
+ " iteratorsPerBucket %u,"
+ " defaultPendingMessages %u,"
+ " defaultDocBlockSize %u,"
+ " visitorMemoryUsageLimit %u,"
+ " defaultDocBlockTimeout %" PRIu64 ","
+ " defaultVisitorInfoTimeout %" PRIu64 ") "
+ "New config(disconnectedVisitorTimeout %u,"
+ " ignoreNonExistingVisitorTimeLimit %u,"
+ " defaultParallelIterators %u,"
+ " defaultPendingMessages %u,"
+ " defaultDocBlockSize %u,"
+ " visitorMemoryUsageLimit %u,"
+ " defaultDocBlockTimeout %u,"
+ " defaultVisitorInfoTimeout %u) ",
+ _threadIndex,
+ _disconnectedVisitorTimeout,
+ _ignoreNonExistingVisitorTimeLimit,
+ _defaultParallelIterators,
+ _iteratorsPerBucket,
+ _defaultPendingMessages,
+ _defaultDocBlockSize,
+ _visitorMemoryUsageLimit,
+ _defaultDocBlockTimeout.getTime(),
+ _defaultVisitorInfoTimeout.getTime(),
+ config.disconnectedvisitortimeout,
+ config.ignorenonexistingvisitortimelimit,
+ config.defaultparalleliterators,
+ config.defaultpendingmessages,
+ config.defaultdocblocksize,
+ config.visitorMemoryUsageLimit,
+ config.defaultdocblocktimeout,
+ config.defaultinfotimeout
+ );
+ }
+ _disconnectedVisitorTimeout = config.disconnectedvisitortimeout;
+ _ignoreNonExistingVisitorTimeLimit
+ = config.ignorenonexistingvisitortimelimit;
+ _defaultParallelIterators = config.defaultparalleliterators;
+ _iteratorsPerBucket = config.iteratorsPerBucket;
+ _defaultPendingMessages = config.defaultpendingmessages;
+ _defaultDocBlockSize = config.defaultdocblocksize;
+ _visitorMemoryUsageLimit = config.visitorMemoryUsageLimit;
+ _defaultDocBlockTimeout.setTime(config.defaultdocblocktimeout);
+ _defaultVisitorInfoTimeout.setTime(config.defaultinfotimeout);
+ if (_defaultParallelIterators < 1) {
+ LOG(config, "Cannot use value of defaultParallelIterators < 1");
+ _defaultParallelIterators = 1;
+ }
+ if (_iteratorsPerBucket < 1 && _iteratorsPerBucket > 10) {
+ if (_iteratorsPerBucket < 1) _iteratorsPerBucket = 1;
+ else _iteratorsPerBucket = 10;
+ LOG(config, "Invalid value of iterators per bucket %u using %u",
+ config.iteratorsPerBucket, _iteratorsPerBucket);
+ }
+ if (_defaultPendingMessages < 1) {
+ LOG(config, "Cannot use value of defaultPendingMessages < 1");
+ _defaultPendingMessages = 1;
+ }
+ if (_defaultDocBlockSize < 1024) {
+ LOG(config, "Refusing to use default block size less than 1k");
+ _defaultDocBlockSize = 1024;
+ }
+ if (_defaultDocBlockTimeout.getTime() < 1) {
+ LOG(config, "Cannot use value of defaultDocBlockTimeout < 1");
+ _defaultDocBlockTimeout.setTime(1);
+ }
+ break;
+ }
+ case RequestStatusPage::ID:
+ {
+ LOG(spam, "Got RequestStatusPage request");
+ RequestStatusPage& rsp(dynamic_cast<RequestStatusPage&>(*cmd));
+ vespalib::asciistream ost;
+ getStatus(ost, rsp.getPath());
+ std::shared_ptr<RequestStatusPageReply> reply(
+ new RequestStatusPageReply(rsp, ost.str()));
+ _messageSender.send(reply);
+ break;
+ }
+ default:
+ {
+ LOG(error, "Got unknown internal message type %u: %s",
+ cmd->getType(), cmd->toString().c_str());
+ return false;
+ }
+ }
+ return true;
+}
+
+bool
+VisitorThread::onInternalReply(const std::shared_ptr<api::InternalReply>& r)
+{
+ switch (r->getType()) {
+ case GetIterReply::ID:
+ {
+ std::shared_ptr<GetIterReply> reply(
+ std::dynamic_pointer_cast<GetIterReply>(r));
+ assert(reply.get());
+ _currentlyRunningVisitor->second->onGetIterReply(
+ reply, _metrics);
+ if (_currentlyRunningVisitor->second->isCompleted()) {
+ LOG(debug, "onGetIterReply(%s): Visitor completed.",
+ _currentlyRunningVisitor->second->getVisitorName().c_str());
+ close();
+ }
+ break;
+ }
+ case CreateIteratorReply::ID:
+ {
+ std::shared_ptr<CreateIteratorReply> reply(
+ std::dynamic_pointer_cast<CreateIteratorReply>(r));
+ assert(reply.get());
+ _currentlyRunningVisitor->second->onCreateIteratorReply(
+ reply, _metrics);
+ break;
+ }
+ default:
+ {
+ LOG(error, "Got unknown internal message type %u: %s",
+ r->getType(), r->toString().c_str());
+ return false;
+ }
+ }
+ return true;
+}
+
+void
+VisitorThread::getStatus(vespalib::asciistream& out,
+ const framework::HttpUrlPath& path) const
+{
+ bool showAll(path.hasAttribute("allvisitors"));
+ bool verbose(path.hasAttribute("verbose"));
+ uint32_t visitor(path.get("visitor", 0u));
+ bool status(!path.hasAttribute("visitor"));
+
+ if (status && verbose) {
+ out << "<h3>Visitor libraries loaded</h3>\n<ul>\n";
+ if (_libs.size() == 0) {
+ out << "None\n";
+ }
+ for (LibMap::const_iterator it = _libs.begin(); it != _libs.end(); ++it)
+ {
+ out << "<li>" << it->first << "\n";
+ }
+ out << "</ul>\n";
+
+ out << "<h3>Recently completed/failed/aborted visitors</h3>\n<ul>\n";
+ if (_recentlyCompleted.size() == 0) {
+ out << "None\n";
+ }
+ for (std::deque<std::pair<api::VisitorId, framework::SecondTime> >
+ ::const_iterator it = _recentlyCompleted.begin();
+ it != _recentlyCompleted.end(); ++it)
+ {
+ out << "<li> Visitor " << it->first << " done at "
+ << it->second.getTime() << "\n";
+ }
+ out << "</ul>\n";
+ out << "<h3>Current queue size: " << _queue.size() << "</h3>\n";
+ out << "<h3>Config:</h3>\n"
+ << "<table border=\"1\"><tr><td>Parameter</td><td>Value</td></tr>\n"
+ << "<tr><td>Disconnected visitor timeout</td><td>"
+ << _disconnectedVisitorTimeout << "</td></tr>\n"
+ << "<tr><td>Ignore non-existing visitor timelimit</td><td>"
+ << _ignoreNonExistingVisitorTimeLimit << "</td></tr>\n"
+ << "<tr><td>Default parallel iterators</td><td>"
+ << _defaultParallelIterators << "</td></tr>\n"
+ << "<tr><td>Iterators per bucket</td><td>"
+ << _iteratorsPerBucket << "</td></tr>\n"
+ << "<tr><td>Default pending messages</td><td>"
+ << _defaultPendingMessages << "</td></tr>\n"
+ << "<tr><td>Default DocBlock size</td><td>"
+ << _defaultDocBlockSize << "</td></tr>\n"
+ << "<tr><td>Default DocBlock timeout (ms)</td><td>"
+ << _defaultDocBlockTimeout.getTime() << "</td></tr>\n"
+ << "<tr><td>Visitor memory usage limit</td><td>"
+ << _visitorMemoryUsageLimit << "</td></tr>\n"
+ << "</table>\n";
+ }
+ if (showAll) {
+ for (VisitorMap::const_iterator it = _visitors.begin();
+ it != _visitors.end(); ++it)
+ {
+ out << "<h3>Visitor " << it->first << "</h3>\n";
+ std::ostringstream tmp;
+ it->second->getStatus(tmp, verbose);
+ out << tmp.str();
+ }
+ } else if (path.hasAttribute("visitor")) {
+ out << "<h3>Visitor " << visitor << "</h3>\n";
+ VisitorMap::const_iterator it = _visitors.find(visitor);
+ if (it == _visitors.end()) {
+ out << "Not found\n";
+ } else {
+ std::ostringstream tmp;
+ it->second->getStatus(tmp, verbose);
+ out << tmp.str();
+ }
+ } else { // List visitors
+ out << "<h3>Active visitors</h3>\n";
+ if (_visitors.size() == 0) {
+ out << "None\n";
+ }
+ for (VisitorMap::const_iterator it = _visitors.begin();
+ it != _visitors.end(); ++it)
+ {
+ out << "<a href=\"?visitor=" << it->first
+ << (verbose ? "&verbose" : "") << "\">Visitor "
+ << it->first << "</a><br>\n";
+ }
+ }
+}
+
+} // storage
diff --git a/storage/src/vespa/storage/visiting/visitorthread.h b/storage/src/vespa/storage/visiting/visitorthread.h
new file mode 100644
index 00000000000..545ffc6421f
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/visitorthread.h
@@ -0,0 +1,152 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class VisitorThread
+ * @ingroup visiting
+ *
+ * @brief Thread running visitors.
+ *
+ * This thread ensures that everything concerning one visitor runs in a
+ * single thread. This simplifies the visitors as they don't have to
+ * worry about locking, and it is a lot easier to abort visitors when you
+ * know other threads isn't using the visitors.
+ */
+
+#pragma once
+
+#include <deque>
+#include <vespa/vespalib/util/document_runnable.h>
+#include <vespa/metrics/metrictimer.h>
+#include <vespa/storageapi/messageapi/messagehandler.h>
+#include <vespa/vespalib/util/sync.h>
+#include <vespa/storage/common/storagecomponent.h>
+#include <vespa/storage/persistence/messages.h>
+#include <vespa/storage/visiting/visitor.h>
+#include <vespa/storage/visiting/visitormetrics.h>
+#include <vespa/storage/visiting/visitormessagesessionfactory.h>
+#include <vespa/storageframework/storageframework.h>
+
+namespace storage {
+
+class VisitorThread : public framework::Runnable,
+ private api::MessageHandler,
+ private framework::MetricUpdateHook
+{
+ typedef std::map<std::string, std::shared_ptr<VisitorEnvironment> > LibMap;
+ LibMap _libs;
+
+ typedef std::map<api::VisitorId, std::shared_ptr<Visitor> > VisitorMap;
+ VisitorMap _visitors;
+ std::deque<std::pair<api::VisitorId,
+ framework::SecondTime> > _recentlyCompleted;
+
+ struct Event {
+ enum Type {
+ MBUS,
+ PERSISTENCE,
+ NONE
+ };
+
+ api::VisitorId _visitorId;
+ std::shared_ptr<api::StorageMessage> _message;
+ mbus::Reply::UP _mbusReply;
+
+ metrics::MetricTimer _timer;
+ Type _type;
+
+ Event() : _visitorId(0), _message(), _timer(), _type(NONE) {}
+ Event(Event&& other);
+ Event& operator= (Event&& other);
+ Event(const Event& other) = delete;
+ Event& operator= (const Event& other) = delete;
+ Event(api::VisitorId visitor, mbus::Reply::UP reply);
+ Event(api::VisitorId visitor,
+ const std::shared_ptr<api::StorageMessage>& msg);
+
+ bool empty() const noexcept {
+ return (_type == NONE);
+ }
+ };
+
+ std::deque<Event> _queue;
+ vespalib::Monitor _queueMonitor;
+
+ VisitorMap::iterator _currentlyRunningVisitor;
+ VisitorMessageHandler& _messageSender;
+ VisitorThreadMetrics& _metrics;
+ uint32_t _threadIndex;
+ uint32_t _disconnectedVisitorTimeout;
+ uint32_t _ignoreNonExistingVisitorTimeLimit;
+ uint32_t _defaultParallelIterators;
+ uint32_t _iteratorsPerBucket;
+ uint32_t _defaultPendingMessages;
+ uint32_t _defaultDocBlockSize;
+ uint32_t _visitorMemoryUsageLimit;
+ framework::MilliSecTime _defaultDocBlockTimeout;
+ framework::MilliSecTime _defaultVisitorInfoTimeout;
+ uint32_t _timeBetweenTicks;
+ StorageComponent _component;
+ framework::Thread::UP _thread;
+ VisitorMessageSessionFactory& _messageSessionFactory;
+ VisitorFactory::Map& _visitorFactories;
+ const framework::MemoryAllocationType& _memoryBufferAlloc;
+
+public:
+ VisitorThread(uint32_t threadIndex,
+ StorageComponentRegister&,
+ VisitorMessageSessionFactory&,
+ VisitorFactory::Map&,
+ VisitorThreadMetrics& metrics,
+ VisitorMessageHandler& sender);
+ ~VisitorThread();
+
+ void processMessage(api::VisitorId visitorId,
+ const std::shared_ptr<api::StorageMessage>& msg);
+
+ void shutdown();
+
+ void setTimeBetweenTicks(uint32_t time) { _timeBetweenTicks = time; }
+
+ void handleMessageBusReply(std::unique_ptr<mbus::Reply> reply, Visitor& visitor);
+
+ /** For unit tests needing to pause thread. */
+ vespalib::Monitor& getQueueMonitor() { return _queueMonitor; }
+
+ const VisitorThreadMetrics& getMetrics() const noexcept {
+ return _metrics;
+ }
+
+private:
+ virtual void run(framework::ThreadHandle&);
+ /**
+ * Attempt to fetch an event from the visitor thread's queue. If an event
+ * was available, pop it from the queue and return it. If not, return
+ * an empty event. This may be checked with the .empty() method on
+ * the returned event object.
+ */
+ Event popNextQueuedEventIfAvailable();
+ void tick();
+ void trimRecentlyCompletedList(framework::SecondTime currentTime);
+ void handleNonExistingVisitorCall(const Event& entry,
+ api::ReturnCode& code);
+
+ std::shared_ptr<Visitor> createVisitor(const vespalib::stringref & libName,
+ const vdslib::Parameters& params,
+ vespalib::asciistream & error);
+
+ bool onCreateVisitor(const std::shared_ptr<api::CreateVisitorCommand>&);
+
+ bool onVisitorReply(const std::shared_ptr<api::StorageReply>& reply);
+ bool onInternal(const std::shared_ptr<api::InternalCommand>&);
+ bool onInternalReply(const std::shared_ptr<api::InternalReply>&);
+
+ /** Deletes a visitor instance. */
+ void close();
+ void getStatus(vespalib::asciistream & out,
+ const framework::HttpUrlPath& path) const;
+
+ void updateMetrics(const MetricLockGuard &) override;
+
+};
+
+} // storage
+
diff --git a/storage/src/vespa/storage/visiting/visitorthreadmetrics.h b/storage/src/vespa/storage/visiting/visitorthreadmetrics.h
new file mode 100644
index 00000000000..8fa1c77eebd
--- /dev/null
+++ b/storage/src/vespa/storage/visiting/visitorthreadmetrics.h
@@ -0,0 +1,108 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+/**
+ * @class storage::VisitorThreadMetrics
+ * @ingroup visiting
+ *
+ * @brief Metrics for the visitor threads.
+ *
+ * @version $Id$
+ */
+#pragma once
+
+#include <vespa/metrics/metrics.h>
+
+namespace storage {
+
+struct VisitorThreadMetrics : public metrics::MetricSet
+{
+ typedef metrics::DoubleAverageMetric DOUBLE;
+ typedef metrics::LongAverageMetric COUNT;
+
+ metrics::LongAverageMetric queueSize;
+ metrics::LoadMetric<DOUBLE> averageQueueWaitingTime;
+ metrics::LoadMetric<DOUBLE> averageVisitorLifeTime;
+ metrics::LoadMetric<DOUBLE> averageVisitorCreationTime;
+ metrics::LoadMetric<DOUBLE> averageMessageSendTime;
+ metrics::LoadMetric<DOUBLE> averageProcessingTime;
+ metrics::LoadMetric<COUNT> createdVisitors;
+ metrics::LoadMetric<COUNT> abortedVisitors;
+ metrics::LoadMetric<COUNT> completedVisitors;
+ metrics::LoadMetric<COUNT> failedVisitors;
+ metrics::LoadMetric<COUNT> visitorDestinationFailureReplies;
+
+ VisitorThreadMetrics(const std::string& name,
+ const std::string& desc,
+ const metrics::LoadTypeSet& loadTypes)
+ : metrics::MetricSet(name, "visitor partofsum thread", desc),
+ queueSize("queuesize", "",
+ "Size of input message queue.", this),
+ averageQueueWaitingTime(
+ loadTypes,
+ DOUBLE("averagequeuewait",
+ "",
+ "Average time an operation spends in input queue."),
+ this),
+ averageVisitorLifeTime(
+ loadTypes,
+ DOUBLE("averagevisitorlifetime",
+ "",
+ "Average lifetime of a visitor"),
+ this),
+ averageVisitorCreationTime(
+ loadTypes,
+ DOUBLE("averagevisitorcreationtime",
+ "",
+ "Average time spent creating a visitor instance"),
+ this),
+ averageMessageSendTime(
+ loadTypes,
+ DOUBLE("averagemessagesendtime",
+ "",
+ "Average time it takes for messages to be sent to "
+ "their target (and be replied to)"),
+ this),
+ averageProcessingTime(
+ loadTypes,
+ DOUBLE("averageprocessingtime",
+ "",
+ "Average time visitor uses in handleDocuments() call"),
+ this),
+ createdVisitors(
+ loadTypes,
+ COUNT("created",
+ "",
+ "Number of visitors created."),
+ this),
+ abortedVisitors(
+ loadTypes,
+ COUNT("aborted",
+ "",
+ "Number of visitors aborted."),
+ this),
+ completedVisitors(
+ loadTypes,
+ COUNT("completed",
+ "",
+ "Number of visitors completed"),
+ this),
+ failedVisitors(
+ loadTypes,
+ COUNT("failed",
+ "",
+ "Number of visitors failed"),
+ this),
+ visitorDestinationFailureReplies(
+ loadTypes,
+ COUNT("destination_failure_replies",
+ "",
+ "Number of failure replies received from "
+ "the visitor destination"),
+ this)
+ {
+ queueSize.unsetOnZeroValue();
+ }
+
+};
+
+}
+