diff options
author | Geir Storli <geirstorli@yahoo.no> | 2017-02-03 15:41:13 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-02-03 15:41:13 +0100 |
commit | 929ff321f5a687e2f1b64cc12c49a79931a2ad0f (patch) | |
tree | 34d9db465ea070ea0728e8b175c8aab96444153c /searchlib | |
parent | 0cf5991a9f248f852d84ff0db34b5ac1fcac0361 (diff) | |
parent | 486efb8d160cb7bbb09f7b10743fd2ea76e2b099 (diff) |
Merge pull request #1669 from yahoo/toregge/add-reference-attribute
Add reference attribute.
Diffstat (limited to 'searchlib')
17 files changed, 964 insertions, 4 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index 57beefe47a2..9daf335306a 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -85,6 +85,7 @@ vespa_define_module( src/tests/attribute/multi_value_mapping src/tests/attribute/postinglist src/tests/attribute/postinglistattribute + src/tests/attribute/reference_attribute src/tests/attribute/searchable src/tests/attribute/searchcontext src/tests/attribute/sourceselector diff --git a/searchlib/src/tests/attribute/reference_attribute/CMakeLists.txt b/searchlib/src/tests/attribute/reference_attribute/CMakeLists.txt new file mode 100644 index 00000000000..1ee25da88a5 --- /dev/null +++ b/searchlib/src/tests/attribute/reference_attribute/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_reference_attribute_test_app TEST + SOURCES + reference_attribute_test.cpp + DEPENDS + searchlib +) +vespa_add_test(NAME searchlib_reference_attribute_test_app COMMAND searchlib_reference_attribute_test_app) diff --git a/searchlib/src/tests/attribute/reference_attribute/FILES b/searchlib/src/tests/attribute/reference_attribute/FILES new file mode 100644 index 00000000000..97203f7b4b0 --- /dev/null +++ b/searchlib/src/tests/attribute/reference_attribute/FILES @@ -0,0 +1 @@ +reference_attribute_test.cpp diff --git a/searchlib/src/tests/attribute/reference_attribute/reference_attribute_test.cpp b/searchlib/src/tests/attribute/reference_attribute/reference_attribute_test.cpp new file mode 100644 index 00000000000..6de22b0b99b --- /dev/null +++ b/searchlib/src/tests/attribute/reference_attribute/reference_attribute_test.cpp @@ -0,0 +1,215 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP("reference_attribute_test"); +#include <vespa/vespalib/testkit/testapp.h> +#include <vespa/vespalib/test/insertion_operators.h> +#include <vespa/vespalib/util/traits.h> +#include <vespa/vespalib/io/fileutil.h> +#include <vespa/searchlib/attribute/attributeguard.h> +#include <vespa/searchlib/attribute/reference_attribute.h> +#include <vespa/document/base/documentid.h> + +using search::MemoryUsage; +using vespalib::ArrayRef; +using generation_t = vespalib::GenerationHandler::generation_t; +using search::attribute::ReferenceAttribute; +using search::attribute::Config; +using search::attribute::BasicType; +using search::AttributeVector; +using search::AttributeGuard; +using document::GlobalId; +using document::DocumentId; + +namespace { + +GlobalId toGid(vespalib::stringref docId) { + return DocumentId(docId).getGlobalId(); +} + +vespalib::string doc1("id:test:music::1"); +vespalib::string doc2("id:test:music::2"); + +} + + +struct Fixture +{ + std::shared_ptr<ReferenceAttribute> _attr; + + Fixture() + : _attr() + { + resetAttr(); + } + + AttributeVector &attr() { + return *_attr; + } + + void resetAttr() { + _attr.reset(); + _attr = std::make_shared<ReferenceAttribute>("test", + Config(BasicType::REFERENCE)); + } + + void ensureDocIdLimit(uint32_t docIdLimit) { + while (attr().getNumDocs() < docIdLimit) { + uint32_t newDocId = 0u; + _attr->addDoc(newDocId); + _attr->commit(); + } + } + + search::attribute::Status getStatus() { + attr().commit(true); + return attr().getStatus(); + } + + const GlobalId *get(uint32_t doc) { + return _attr->getReference(doc); + } + + void set(uint32_t doc, const GlobalId &gid) { + _attr->update(doc, gid); + } + + void clear(uint32_t doc) { + _attr->clearDoc(doc); + } + + void commit() { attr().commit(); } + + void assertNoRef(uint32_t doc) + { + EXPECT_TRUE(get(doc) == nullptr); + } + + void assertRef(vespalib::stringref str, uint32_t doc) { + const GlobalId *gid = get(doc); + EXPECT_TRUE(gid != nullptr); + EXPECT_EQUAL(toGid(str), *gid); + } + + void save() { + attr().save(); + } + + void load() { + resetAttr(); + attr().load(); + } + + void triggerCompaction(uint64_t iterLimit) { + search::attribute::Status oldStatus = getStatus(); + search::attribute::Status newStatus = oldStatus; + uint64_t iter = 0; + for (; iter < iterLimit; ++iter) { + clear(2); + set(2, toGid(doc2)); + newStatus = getStatus(); + if (newStatus.getUsed() < oldStatus.getUsed()) { + break; + } + oldStatus = newStatus; + } + EXPECT_GREATER(iterLimit, iter); + LOG(info, + "iter = %" PRIu64 ", memory usage %" PRIu64 ", -> %" PRIu64, + iter, oldStatus.getUsed(), newStatus.getUsed()); + } +}; + +TEST_F("require that we can instantiate reference attribute", Fixture) +{ + f.ensureDocIdLimit(5); + f.set(1, toGid(doc1)); + f.set(2, toGid(doc2)); + f.commit(); + + TEST_DO(f.assertNoRef(3)); + TEST_DO(f.assertRef(doc1, 1)); + TEST_DO(f.assertRef(doc2, 2)); +} + +TEST_F("require that we can set new reference for a document", Fixture) +{ + f.ensureDocIdLimit(5); + f.set(1, toGid(doc1)); + f.set(2, toGid(doc2)); + f.set(3, toGid(doc2)); + f.commit(); + TEST_DO(f.assertNoRef(4)); + TEST_DO(f.assertRef(doc1, 1)); + TEST_DO(f.assertRef(doc2, 2)); + TEST_DO(f.assertRef(doc2, 3)); + f.set(2, toGid(doc1)); + f.commit(); + TEST_DO(f.assertNoRef(4)); + TEST_DO(f.assertRef(doc1, 1)); + TEST_DO(f.assertRef(doc1, 2)); + TEST_DO(f.assertRef(doc2, 3)); +} + +TEST_F("require that we can clear reference for a document", Fixture) +{ + f.ensureDocIdLimit(5); + f.set(2, toGid(doc2)); + f.commit(); + TEST_DO(f.assertRef(doc2, 2)); + f.clear(2); + f.commit(); + TEST_DO(f.assertNoRef(2)); + f.clear(2); + f.commit(); + TEST_DO(f.assertNoRef(2)); +} + +TEST_F("require that read guard protects reference", Fixture) +{ + f.ensureDocIdLimit(5); + f.set(2, toGid(doc2)); + f.commit(); + const GlobalId *gid = f.get(2); + EXPECT_TRUE(gid != nullptr); + EXPECT_EQUAL(toGid(doc2), *gid); + { + AttributeGuard guard(f._attr); + f.clear(2); + f.commit(); + EXPECT_EQUAL(toGid(doc2), *gid); + } + f.commit(); + EXPECT_NOT_EQUAL(toGid(doc2), *gid); +} + +TEST_F("require that we can compact attribute", Fixture) +{ + f.ensureDocIdLimit(5); + f.set(1, toGid(doc1)); + f.set(2, toGid(doc2)); + f.commit(); + TEST_DO(f.triggerCompaction(100000)); + TEST_DO(f.assertNoRef(3)); + TEST_DO(f.assertRef(doc1, 1)); + TEST_DO(f.assertRef(doc2, 2)); +} + +TEST_F("require that we can save and load attribute", Fixture) +{ + f.ensureDocIdLimit(5); + f.set(1, toGid(doc1)); + f.set(2, toGid(doc2)); + f.set(4, toGid(doc1)); + f.commit(); + f.save(); + f.load(); + TEST_DO(f.assertNoRef(3)); + TEST_DO(f.assertRef(doc1, 1)); + TEST_DO(f.assertRef(doc2, 2)); + TEST_DO(f.assertRef(doc1, 4)); + EXPECT_TRUE(vespalib::unlink("test.dat")); + EXPECT_TRUE(vespalib::unlink("test.udat")); +} + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/tests/datastore/unique_store/unique_store_test.cpp b/searchlib/src/tests/datastore/unique_store/unique_store_test.cpp index aefbf9a9835..a0ea48ee74a 100644 --- a/searchlib/src/tests/datastore/unique_store/unique_store_test.cpp +++ b/searchlib/src/tests/datastore/unique_store/unique_store_test.cpp @@ -111,6 +111,8 @@ struct Fixture refStore = compactedRefStore; } size_t entrySize() const { return sizeof(EntryT); } + auto getBuilder(uint32_t uniqueValuesHint) { return store.getBuilder(uniqueValuesHint); } + auto getSaver() { return store.getSaver(); } }; using NumberFixture = Fixture<uint32_t>; @@ -205,4 +207,46 @@ TEST_F("require that compaction works", NumberFixture) TEST_DO(f.assertStoreContent()); } +TEST_F("require that builder works", NumberFixture) +{ + auto builder = f.getBuilder(2); + builder.add(10); + builder.add(20); + builder.setupRefCounts(); + EntryRef val10Ref = builder.mapEnumValueToEntryRef(1); + EntryRef val20Ref = builder.mapEnumValueToEntryRef(2); + TEST_DO(f.assertBufferState(val10Ref, MemStats().used(3).dead(1))); // Note: First element is reserved + EXPECT_TRUE(val10Ref.valid()); + EXPECT_TRUE(val20Ref.valid()); + EXPECT_NOT_EQUAL(val10Ref.ref(), val20Ref.ref()); + f.assertGet(val10Ref, 10); + f.assertGet(val20Ref, 20); + builder.makeDictionary(); + EXPECT_EQUAL(val10Ref.ref(), f.add(10).ref()); + EXPECT_EQUAL(val20Ref.ref(), f.add(20).ref()); +} + +TEST_F("require that saver works", NumberFixture) +{ + EntryRef val10Ref = f.add(10); + EntryRef val20Ref = f.add(20); + f.remove(f.add(40)); + f.trimHoldLists(); + + auto saver = f.getSaver(); + std::vector<uint32_t> refs; + saver.foreach_key([&](EntryRef ref) { refs.push_back(ref.ref()); }); + std::vector<uint32_t> expRefs; + expRefs.push_back(val10Ref.ref()); + expRefs.push_back(val20Ref.ref()); + EXPECT_EQUAL(expRefs, refs); + saver.enumerateValues(); + uint32_t invalidEnum = saver.mapEntryRefToEnumValue(EntryRef()); + uint32_t enumValue10 = saver.mapEntryRefToEnumValue(val10Ref); + uint32_t enumValue20 = saver.mapEntryRefToEnumValue(val20Ref); + EXPECT_EQUAL(0u, invalidEnum); + EXPECT_EQUAL(1u, enumValue10); + EXPECT_EQUAL(2u, enumValue20); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt b/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt index 09f42fa6fe5..1f710f1a544 100644 --- a/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/attribute/CMakeLists.txt @@ -72,6 +72,8 @@ vespa_add_library(searchlib_attribute OBJECT postingstore.cpp predicate_attribute.cpp readerbase.cpp + reference_attribute.cpp + reference_attribute_saver.cpp singleenumattribute.cpp singleenumattributesaver.cpp singlenumericattribute.cpp diff --git a/searchlib/src/vespa/searchlib/attribute/reference_attribute.cpp b/searchlib/src/vespa/searchlib/attribute/reference_attribute.cpp new file mode 100644 index 00000000000..2e82c057d62 --- /dev/null +++ b/searchlib/src/vespa/searchlib/attribute/reference_attribute.cpp @@ -0,0 +1,231 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/fastos/fastos.h> +#include "reference_attribute.h" +#include "attributesaver.h" +#include <vespa/vespalib/data/fileheader.h> +#include "readerbase.h" +#include <vespa/searchlib/datastore/unique_store_builder.h> +#include <vespa/searchlib/datastore/datastore.hpp> +#include "reference_attribute_saver.h" + +namespace search { +namespace attribute { + +namespace { + +// minimum dead bytes in unique store before consider compaction +constexpr size_t DEAD_BYTES_SLACK = 0x10000u; + +const vespalib::string uniqueValueCountTag = "uniqueValueCount"; + +uint64_t +extractUniqueValueCount(const vespalib::GenericHeader &header) +{ + return (header.hasTag(uniqueValueCountTag)) ? header.getTag(uniqueValueCountTag).asInteger() : 0u; +} + +} + +ReferenceAttribute::ReferenceAttribute(const vespalib::stringref baseFileName, + const Config & cfg) + : NotImplementedAttribute(baseFileName, cfg), + _store(), + _indices(getGenerationHolder()) +{ + setEnum(true); + enableEnumeratedSave(true); +} + +ReferenceAttribute::~ReferenceAttribute() +{ +} + +void +ReferenceAttribute::onAddDocs(DocId limit) +{ + _indices.reserve(limit); +} + +bool +ReferenceAttribute::addDoc(DocId &doc) +{ + bool incGen = _indices.isFull(); + doc = _indices.size(); + _indices.push_back(EntryRef()); + incNumDocs(); + updateUncommittedDocIdLimit(doc); + if (incGen) { + incGeneration(); + } else { + removeAllOldGenerations(); + } + return true; +} + +uint32_t +ReferenceAttribute::clearDoc(DocId doc) +{ + updateUncommittedDocIdLimit(doc); + assert(doc < _indices.size()); + EntryRef oldRef = _indices[doc]; + if (oldRef.valid()) { + _indices[doc] = EntryRef(); + _store.remove(oldRef); + return 1u; + } else { + return 0u; + } +} + +void +ReferenceAttribute::removeOldGenerations(generation_t firstUsed) +{ + _store.trimHoldLists(firstUsed); + getGenerationHolder().trimHoldLists(firstUsed); +} + +void +ReferenceAttribute::onGenerationChange(generation_t generation) +{ + _store.freeze(); + _store.transferHoldLists(generation - 1); + getGenerationHolder().transferHoldLists(generation - 1); +} + +void +ReferenceAttribute::onCommit() +{ + // Note: Cost can be reduced if unneeded generation increments are dropped + incGeneration(); + if (considerCompact(getConfig().getCompactionStrategy())) { + incGeneration(); + updateStat(true); + } +} + +void +ReferenceAttribute::onUpdateStat() +{ + MemoryUsage total = _store.getMemoryUsage(); + _cachedUniqueStoreMemoryUsage = total; + total.merge(_indices.getMemoryUsage()); + updateStatistics(getTotalValueCount(), getUniqueValueCount(), + total.allocatedBytes(), + total.usedBytes(), total.deadBytes(), total.allocatedBytesOnHold()); +} + +std::unique_ptr<AttributeSaver> +ReferenceAttribute::onInitSave() +{ + vespalib::GenerationHandler::Guard guard(this->getGenerationHandler(). + takeGuard()); + return std::make_unique<ReferenceAttributeSaver> + (std::move(guard), + createSaveTargetConfig(), + getIndicesCopy(getCommittedDocIdLimit()), + _store); +} + +bool +ReferenceAttribute::onLoad() +{ + ReaderBase attrReader(*this); + bool ok(attrReader.getHasLoadData()); + if (!ok) { + return false; + } + setCreateSerialNum(attrReader.getCreateSerialNum()); + assert(attrReader.getEnumerated()); + assert(!attrReader.hasIdx()); + size_t numDocs(0); + uint64_t numValues(0); + numValues = attrReader.getEnumCount(); + numDocs = numValues; + fileutil::LoadedBuffer::UP udatBuffer(loadUDAT()); + const GenericHeader &header = udatBuffer->getHeader(); + uint32_t uniqueValueCount = extractUniqueValueCount(header); + assert(uniqueValueCount * sizeof(GlobalId) == udatBuffer->size()); + vespalib::ConstArrayRef<GlobalId> uniques(static_cast<const GlobalId *>(udatBuffer->buffer()), uniqueValueCount); + + auto builder = _store.getBuilder(uniqueValueCount); + for (const auto &value : uniques) { + builder.add(value); + } + builder.setupRefCounts(); + _indices.clear(); + _indices.unsafe_reserve(numDocs); + for (uint32_t doc = 0; doc < numDocs; ++doc) { + uint32_t enumValue = attrReader.getNextEnum(); + _indices.push_back(builder.mapEnumValueToEntryRef(enumValue)); + } + builder.makeDictionary(); + incGeneration(); + return true; +} + +void +ReferenceAttribute::update(DocId doc, const GlobalId &gid) +{ + updateUncommittedDocIdLimit(doc); + assert(doc < _indices.size()); + EntryRef oldRef = _indices[doc]; + EntryRef newRef = _store.add(gid); + std::atomic_thread_fence(std::memory_order_release); + _indices[doc] = newRef; + if (oldRef.valid()) { + _store.remove(oldRef); + } +} + +const ReferenceAttribute::GlobalId * +ReferenceAttribute::getReference(DocId doc) +{ + assert(doc < _indices.size()); + EntryRef oldRef = _indices[doc]; + if (!oldRef.valid()) { + return nullptr; + } else { + return &_store.get(oldRef); + } +} + +bool +ReferenceAttribute::considerCompact(const CompactionStrategy &compactionStrategy) +{ + size_t usedBytes = _cachedUniqueStoreMemoryUsage.usedBytes(); + size_t deadBytes = _cachedUniqueStoreMemoryUsage.deadBytes(); + bool compactMemory = ((deadBytes >= DEAD_BYTES_SLACK) && + (usedBytes * compactionStrategy.getMaxDeadBytesRatio() < deadBytes)); + if (compactMemory) { + compactWorst(); + return true; + } + return false; +} + +void +ReferenceAttribute::compactWorst() +{ + datastore::ICompactionContext::UP compactionContext(_store.compactWorst()); + if (compactionContext) { + compactionContext->compact(vespalib::ArrayRef<EntryRef>(&_indices[0], + _indices.size())); + } +} + +uint64_t +ReferenceAttribute::getUniqueValueCount() const +{ + return _store.getNumUniques(); +} + +ReferenceAttribute::IndicesCopyVector +ReferenceAttribute::getIndicesCopy(uint32_t size) const +{ + assert(size <= _indices.size()); + return IndicesCopyVector(&_indices[0], &_indices[0] + size); +} + +} +} diff --git a/searchlib/src/vespa/searchlib/attribute/reference_attribute.h b/searchlib/src/vespa/searchlib/attribute/reference_attribute.h new file mode 100644 index 00000000000..8b515f0757f --- /dev/null +++ b/searchlib/src/vespa/searchlib/attribute/reference_attribute.h @@ -0,0 +1,54 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "not_implemented_attribute.h" +#include <vespa/document/base/globalid.h> +#include <vespa/searchlib/datastore/unique_store.h> +#include <vespa/searchlib/common/rcuvector.h> + +namespace search { +namespace attribute { + +/* + * Attribute vector mapping from local document ids to global ids + * referencing external documents. + */ +class ReferenceAttribute : public NotImplementedAttribute +{ +public: + using EntryRef = search::datastore::EntryRef; + using GlobalId = document::GlobalId; + using Store = datastore::UniqueStore<GlobalId>; + using IndicesCopyVector = vespalib::Array<EntryRef>; + +private: + Store _store; + RcuVectorBase<EntryRef> _indices; + MemoryUsage _cachedUniqueStoreMemoryUsage; + + virtual void onAddDocs(DocId docIdLimit) override; + virtual void removeOldGenerations(generation_t firstUsed) override; + virtual void onGenerationChange(generation_t generation) override; + virtual void onCommit() override; + virtual void onUpdateStat() override; + virtual std::unique_ptr<AttributeSaver> onInitSave() override; + virtual bool onLoad() override; + virtual uint64_t getUniqueValueCount() const override; + + bool considerCompact(const CompactionStrategy &compactionStrategy); + void compactWorst(); + IndicesCopyVector getIndicesCopy(uint32_t size) const; + +public: + ReferenceAttribute(const vespalib::stringref baseFileName, + const Config & cfg); + virtual ~ReferenceAttribute(); + virtual bool addDoc(DocId &doc) override; + virtual uint32_t clearDoc(DocId doc) override; + void update(DocId doc, const GlobalId &gid); + const GlobalId *getReference(DocId doc); +}; + +} +} diff --git a/searchlib/src/vespa/searchlib/attribute/reference_attribute_saver.cpp b/searchlib/src/vespa/searchlib/attribute/reference_attribute_saver.cpp new file mode 100644 index 00000000000..6dd2db7d754 --- /dev/null +++ b/searchlib/src/vespa/searchlib/attribute/reference_attribute_saver.cpp @@ -0,0 +1,80 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "reference_attribute_saver.h" +#include <vespa/searchlib/util/bufferwriter.h> +#include <vespa/vespalib/util/array.hpp> + + +using vespalib::GenerationHandler; +using document::GlobalId; +using search::datastore::EntryRef; + +namespace search { +namespace attribute { + +ReferenceAttributeSaver:: +ReferenceAttributeSaver(GenerationHandler::Guard &&guard, + const IAttributeSaveTarget::Config &cfg, + IndicesCopyVector &&indices, + const Store &store) + : AttributeSaver(std::move(guard), cfg), + _indices(std::move(indices)), + _store(store), + _saver(store.getSaver()) +{ +} + + +ReferenceAttributeSaver::~ReferenceAttributeSaver() +{ +} + +namespace { + +template <class Store> +class ValueWriter +{ + const Store &_store; + BufferWriter &_writer; +public: + ValueWriter(const Store &store, BufferWriter &writer) + : _store(store), + _writer(writer) + { + } + void operator()(EntryRef ref) { + const GlobalId &gid = _store.get(ref); + _writer.write(&gid, sizeof(GlobalId));; + } +}; + +template <class Store, class Saver> +void +writeUdat(IAttributeSaveTarget &saveTarget, const Store &store, const Saver &saver) +{ + std::unique_ptr<BufferWriter> + udatWriter(saveTarget.udatWriter().allocBufferWriter()); + saver.foreach_key(ValueWriter<Store>(store, *udatWriter)); + udatWriter->flush(); +} + +} + +bool +ReferenceAttributeSaver::onSave(IAttributeSaveTarget &saveTarget) +{ + writeUdat(saveTarget, _store, _saver); + std::unique_ptr<search::BufferWriter> datWriter(saveTarget.datWriter(). + allocBufferWriter()); + + _saver.enumerateValues(); + for (const auto &ref : _indices) { + uint32_t enumValue = _saver.mapEntryRefToEnumValue(ref); + datWriter->write(&enumValue, sizeof(uint32_t)); + } + datWriter->flush(); + return true; +} + +} // namespace search::attribute +} // namespace search diff --git a/searchlib/src/vespa/searchlib/attribute/reference_attribute_saver.h b/searchlib/src/vespa/searchlib/attribute/reference_attribute_saver.h new file mode 100644 index 00000000000..c372cc2a167 --- /dev/null +++ b/searchlib/src/vespa/searchlib/attribute/reference_attribute_saver.h @@ -0,0 +1,51 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "attributesaver.h" +#include <vespa/document/base/globalid.h> +#include <vespa/searchlib/datastore/unique_store.h> +#include <vespa/searchlib/datastore/unique_store_saver.h> +#include <vespa/searchlib/common/rcuvector.h> +#include "iattributesavetarget.h" +#include "reference_attribute.h" + +namespace search { +namespace attribute { + +/* + * Class for saving a reference attribute to disk or memory buffers. + * + * .udat file contains sorted unique values after generic header, in + * host byte order. + * + * .dat file contains enum values after generic header, in host byte order. + * + * enum value 0 means value not set. + * enum value 1 means the first unique value. + * enum value n means the nth unique value. + */ +class ReferenceAttributeSaver : public AttributeSaver +{ +private: + using EntryRef = search::datastore::EntryRef; + using GlobalId = document::GlobalId; + using IndicesCopyVector = ReferenceAttribute::IndicesCopyVector; + using Store = ReferenceAttribute::Store; + using Saver = Store::Saver; + IndicesCopyVector _indices; + const Store &_store; + Saver _saver; + + virtual bool onSave(IAttributeSaveTarget &saveTarget) override; +public: + ReferenceAttributeSaver(vespalib::GenerationHandler::Guard &&guard, + const IAttributeSaveTarget::Config &cfg, + IndicesCopyVector &&indices, + const Store &store); + + virtual ~ReferenceAttributeSaver(); +}; + +} // namespace search::attribute +} // namespace search diff --git a/searchlib/src/vespa/searchlib/datastore/unique_store.cpp b/searchlib/src/vespa/searchlib/datastore/unique_store.cpp index bcbe1be0360..17eb563947d 100644 --- a/searchlib/src/vespa/searchlib/datastore/unique_store.cpp +++ b/searchlib/src/vespa/searchlib/datastore/unique_store.cpp @@ -7,6 +7,8 @@ namespace search { namespace datastore { template class UniqueStore<document::GlobalId, EntryRefT<22>>; +template class UniqueStoreBuilder<document::GlobalId, EntryRefT<22>>; +template class UniqueStoreSaver<document::GlobalId, EntryRefT<22>>; } // namespace datastore } // namespace search diff --git a/searchlib/src/vespa/searchlib/datastore/unique_store.h b/searchlib/src/vespa/searchlib/datastore/unique_store.h index 60381f38889..9846d40deb2 100644 --- a/searchlib/src/vespa/searchlib/datastore/unique_store.h +++ b/searchlib/src/vespa/searchlib/datastore/unique_store.h @@ -1,4 +1,4 @@ -// Copyright 2017 Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once @@ -13,6 +13,12 @@ namespace search { namespace datastore { +template <typename EntryT, typename RefT> +class UniqueStoreBuilder; + +template <typename EntryT, typename RefT> +class UniqueStoreSaver; + /** * Datastore for unique values of type EntryT that is accessed via a * 32-bit EntryRef. @@ -24,6 +30,8 @@ public: using DataStoreType = DataStoreT<RefT>; using EntryType = EntryT; using RefType = RefT; + using Saver = UniqueStoreSaver<EntryT, RefT>; + using Builder = UniqueStoreBuilder<EntryT, RefT>; /* * Compare two values in data store based on reference. Invalid * reference is mapped to local value reference to support @@ -81,11 +89,15 @@ public: MemoryUsage getMemoryUsage() const; // Pass on hold list management to underlying store - void transferHoldLists(generation_t generation) { _dict.getAllocator().transferHoldLists(generation); _store.transferHoldLists(generation); } - void trimHoldLists(generation_t firstUsed) { _dict.getAllocator().trimHoldLists(firstUsed); _store.trimHoldLists(firstUsed); } + void transferHoldLists(generation_t generation); + void trimHoldLists(generation_t firstUsed); vespalib::GenerationHolder &getGenerationHolder(void) { return _store.getGenerationHolder(); } void setInitializing(bool initializing) { _store.setInitializing(initializing); } - void freeze() { _dict.getAllocator().freeze(); } + void freeze(); + uint32_t getNumUniques() const; + + Builder getBuilder(uint32_t uniqueValuesHint); + Saver getSaver() const; // Should only be used for unit testing const BufferState &bufferState(EntryRef ref) const; diff --git a/searchlib/src/vespa/searchlib/datastore/unique_store.hpp b/searchlib/src/vespa/searchlib/datastore/unique_store.hpp index 85ff503625f..13ec9953b9d 100644 --- a/searchlib/src/vespa/searchlib/datastore/unique_store.hpp +++ b/searchlib/src/vespa/searchlib/datastore/unique_store.hpp @@ -5,10 +5,14 @@ #include "unique_store.h" #include "datastore.hpp" #include <vespa/searchlib/btree/btree.hpp> +#include <vespa/searchlib/btree/btreebuilder.hpp> #include <vespa/searchlib/btree/btreeroot.hpp> #include <vespa/searchlib/btree/btreenodeallocator.hpp> #include <vespa/searchlib/btree/btreeiterator.hpp> #include <vespa/searchlib/btree/btreenode.hpp> +#include <vespa/searchlib/util/bufferwriter.h> +#include "unique_store_builder.hpp" +#include "unique_store_saver.hpp" #include <atomic> namespace search { @@ -187,5 +191,50 @@ UniqueStore<EntryT, RefT>::bufferState(EntryRef ref) const return _store.getBufferState(internalRef.bufferId()); } + +template <typename EntryT, typename RefT> +void +UniqueStore<EntryT, RefT>::transferHoldLists(generation_t generation) +{ + _dict.getAllocator().transferHoldLists(generation); + _store.transferHoldLists(generation); +} + +template <typename EntryT, typename RefT> +void +UniqueStore<EntryT, RefT>::trimHoldLists(generation_t firstUsed) +{ + _dict.getAllocator().trimHoldLists(firstUsed); + _store.trimHoldLists(firstUsed); +} + +template <typename EntryT, typename RefT> +void +UniqueStore<EntryT, RefT>::freeze() +{ + _dict.getAllocator().freeze(); +} + +template <typename EntryT, typename RefT> +typename UniqueStore<EntryT, RefT>::Builder +UniqueStore<EntryT, RefT>::getBuilder(uint32_t uniqueValuesHint) +{ + return Builder(_store, _typeId, _dict, uniqueValuesHint); +} + +template <typename EntryT, typename RefT> +typename UniqueStore<EntryT, RefT>::Saver +UniqueStore<EntryT, RefT>::getSaver() const +{ + return Saver(_dict, _store); +} + +template <typename EntryT, typename RefT> +uint32_t +UniqueStore<EntryT, RefT>::getNumUniques() const +{ + return _dict.getFrozenView().size(); +} + } } diff --git a/searchlib/src/vespa/searchlib/datastore/unique_store_builder.h b/searchlib/src/vespa/searchlib/datastore/unique_store_builder.h new file mode 100644 index 00000000000..85637df0bec --- /dev/null +++ b/searchlib/src/vespa/searchlib/datastore/unique_store_builder.h @@ -0,0 +1,48 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "unique_store.h" + +namespace search { +namespace datastore { + +/** + * Builder for related UniqueStore class. + * + * Contains utility method for adding new unique values and mapping + * from enum value to EntryRef value. New unique values must be added + * in sorted order. + */ +template <typename EntryT, typename RefT> +class UniqueStoreBuilder { + using UniqueStoreType = UniqueStore<EntryT, RefT>; + using DataStoreType = typename UniqueStoreType::DataStoreType; + using Dictionary = typename UniqueStoreType::Dictionary; + using EntryType = EntryT; + using RefType = RefT; + + DataStoreType &_store; + uint32_t _typeId; + Dictionary &_dict; + std::vector<EntryRef> _refs; + std::vector<uint32_t> _refCounts; +public: + UniqueStoreBuilder(DataStoreType &store, uint32_t typeId, + Dictionary &dict, uint32_t uniqueValuesHint); + ~UniqueStoreBuilder(); + void setupRefCounts(); + void makeDictionary(); + void add(const EntryType &value) { + EntryRef newRef = _store.template allocator<EntryType>(_typeId).alloc(value).ref; + _refs.push_back(newRef); + } + EntryRef mapEnumValueToEntryRef(uint32_t enumValue) { + assert(enumValue < _refs.size()); + ++_refCounts[enumValue]; + return _refs[enumValue]; + } +}; + +} +} diff --git a/searchlib/src/vespa/searchlib/datastore/unique_store_builder.hpp b/searchlib/src/vespa/searchlib/datastore/unique_store_builder.hpp new file mode 100644 index 00000000000..1fb95a7fed7 --- /dev/null +++ b/searchlib/src/vespa/searchlib/datastore/unique_store_builder.hpp @@ -0,0 +1,60 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "unique_store_builder.h" +#include "datastore.hpp" +#include <vespa/searchlib/btree/btree.hpp> +#include <vespa/searchlib/btree/btreebuilder.hpp> +#include <vespa/searchlib/btree/btreeroot.hpp> +#include <vespa/searchlib/btree/btreenodeallocator.hpp> +#include <vespa/searchlib/btree/btreeiterator.hpp> +#include <vespa/searchlib/btree/btreenode.hpp> + +namespace search { +namespace datastore { + +template <typename EntryT, typename RefT> +UniqueStoreBuilder<EntryT, RefT>::UniqueStoreBuilder(DataStoreType &store, uint32_t typeId, Dictionary &dict, uint32_t uniqueValuesHint) + : _store(store), + _typeId(typeId), + _dict(dict), + _refs(), + _refCounts() +{ + _refs.reserve(uniqueValuesHint); + _refs.push_back(EntryRef()); +} + +template <typename EntryT, typename RefT> +UniqueStoreBuilder<EntryT, RefT>::~UniqueStoreBuilder() +{ +} + +template <typename EntryT, typename RefT> +void +UniqueStoreBuilder<EntryT, RefT>::setupRefCounts() +{ + _refCounts.resize(_refs.size()); +} + + +template <typename EntryT, typename RefT> +void +UniqueStoreBuilder<EntryT, RefT>::makeDictionary() +{ + assert(_refs.size() == _refCounts.size()); + assert(!_refs.empty()); + typename Dictionary::Builder builder(_dict.getAllocator()); + for (size_t i = 1; i < _refs.size(); ++i) { + if (_refCounts[i] != 0u) { + builder.insert(_refs[i], _refCounts[i]); + } else { + _store.holdElem(_refs[i], 1); + } + } + _dict.assign(builder); +} + +} +} diff --git a/searchlib/src/vespa/searchlib/datastore/unique_store_saver.h b/searchlib/src/vespa/searchlib/datastore/unique_store_saver.h new file mode 100644 index 00000000000..530d36bc9d8 --- /dev/null +++ b/searchlib/src/vespa/searchlib/datastore/unique_store_saver.h @@ -0,0 +1,53 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "unique_store.h" + +namespace search { +namespace datastore { + +/** + * Saver for related UniqueStore class. + * + * Contains utility methods for traversing all unique values (as + * EntryRef value) and mapping from EntryRef value to enum value. + */ +template <typename EntryT, typename RefT> +class UniqueStoreSaver { + using UniqueStoreType = UniqueStore<EntryT, RefT>; + using Dictionary = typename UniqueStoreType::Dictionary; + using ConstIterator = typename Dictionary::ConstIterator; + using EntryType = EntryT; + using RefType = RefT; + + ConstIterator _itr; + const DataStoreBase &_store; + std::vector<std::vector<uint32_t>> _enumValues; +public: + UniqueStoreSaver(const Dictionary &dict, const DataStoreBase &store); + ~UniqueStoreSaver(); + void enumerateValues(); + + template <typename Function> + void + foreach_key(Function &&func) const + { + _itr.foreach_key(func); + } + + uint32_t mapEntryRefToEnumValue(EntryRef ref) const { + if (ref.valid()) { + RefType iRef(ref); + assert(iRef.offset() < _enumValues[iRef.bufferId()].size()); + uint32_t enumValue = _enumValues[iRef.bufferId()][iRef.offset()]; + assert(enumValue != 0); + return enumValue; + } else { + return 0u; + } + } +}; + +} +} diff --git a/searchlib/src/vespa/searchlib/datastore/unique_store_saver.hpp b/searchlib/src/vespa/searchlib/datastore/unique_store_saver.hpp new file mode 100644 index 00000000000..64b8338f88b --- /dev/null +++ b/searchlib/src/vespa/searchlib/datastore/unique_store_saver.hpp @@ -0,0 +1,49 @@ +// Copyright 2017 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "unique_store_saver.h" + +namespace search { +namespace datastore { + +template <typename EntryT, typename RefT> +UniqueStoreSaver<EntryT, RefT>::UniqueStoreSaver(const Dictionary &dict, const DataStoreBase &store) + : _itr(), + _store(store) +{ + _itr = dict.getFrozenView().begin(); +} + +template <typename EntryT, typename RefT> +UniqueStoreSaver<EntryT, RefT>::~UniqueStoreSaver() +{ +} + +template <typename EntryT, typename RefT> +void +UniqueStoreSaver<EntryT, RefT>::enumerateValues() +{ + _enumValues.resize(RefType::numBuffers()); + for (uint32_t bufferId = 0; bufferId < RefType::numBuffers(); ++bufferId) { + const BufferState &state = _store.getBufferState(bufferId); + if (state.isActive()) { + _enumValues[bufferId].resize(state.size()); + } + } + ConstIterator it = _itr; + uint32_t nextEnumVal = 1; + while (it.valid()) { + RefType ref(it.getKey()); + assert(ref.valid()); + assert(ref.offset() < _enumValues[ref.bufferId()].size()); + uint32_t &enumVal = _enumValues[ref.bufferId()][ref.offset()]; + assert(enumVal == 0u); + enumVal = nextEnumVal; + ++it; + ++nextEnumVal; + } +} + +} +} |