diff options
21 files changed, 289 insertions, 1297 deletions
diff --git a/searchcore/src/vespa/searchcore/proton/attribute/attribute_vector_explorer.cpp b/searchcore/src/vespa/searchcore/proton/attribute/attribute_vector_explorer.cpp index a1d5f72bc9d..5a199c529b6 100644 --- a/searchcore/src/vespa/searchcore/proton/attribute/attribute_vector_explorer.cpp +++ b/searchcore/src/vespa/searchcore/proton/attribute/attribute_vector_explorer.cpp @@ -77,8 +77,8 @@ void convertEnumStoreToSlime(const IEnumStore &enumStore, Cursor &object) { object.setLong("numUniques", enumStore.getNumUniques()); - convertMemoryUsageToSlime(enumStore.getMemoryUsage(), object.setObject("memoryUsage")); - convertMemoryUsageToSlime(enumStore.getTreeMemoryUsage(), object.setObject("treeMemoryUsage")); + convertMemoryUsageToSlime(enumStore.getValuesMemoryUsage(), object.setObject("valuesMemoryUsage")); + convertMemoryUsageToSlime(enumStore.getDictionaryMemoryUsage(), object.setObject("dictionaryMemoryUsage")); } void diff --git a/searchlib/src/tests/attribute/attribute_test.cpp b/searchlib/src/tests/attribute/attribute_test.cpp index 4e520e86707..98caf39dace 100644 --- a/searchlib/src/tests/attribute/attribute_test.cpp +++ b/searchlib/src/tests/attribute/attribute_test.cpp @@ -2036,11 +2036,11 @@ AttributeTest::requireThatAddressSpaceUsageIsReported(const Config &config, bool AddressSpaceUsage after = attrPtr->getAddressSpaceUsage(); if (attrPtr->hasEnum()) { LOG(info, "requireThatAddressSpaceUsageIsReported(%s): Has enum", attrName.c_str()); - EXPECT_EQUAL(before.enumStoreUsage().used(), 16u); - EXPECT_EQUAL(before.enumStoreUsage().dead(), 16u); + EXPECT_EQUAL(before.enumStoreUsage().used(), 1u); + EXPECT_EQUAL(before.enumStoreUsage().dead(), 1u); EXPECT_GREATER(after.enumStoreUsage().used(), before.enumStoreUsage().used()); - EXPECT_EQUAL(after.enumStoreUsage().limit(), before.enumStoreUsage().limit()); - EXPECT_EQUAL(34359738368u, after.enumStoreUsage().limit()); // EnumStoreBase::DataStoreType::RefType::offsetSize() + EXPECT_GREATER_EQUAL(after.enumStoreUsage().limit(), before.enumStoreUsage().limit()); + EXPECT_GREATER(after.enumStoreUsage().limit(), 4200000000u); } else { LOG(info, "requireThatAddressSpaceUsageIsReported(%s): NOT enum", attrName.c_str()); EXPECT_EQUAL(before.enumStoreUsage().used(), 0u); diff --git a/searchlib/src/tests/attribute/comparator/comparator_test.cpp b/searchlib/src/tests/attribute/comparator/comparator_test.cpp index a2000c48423..7bd6f3ca013 100644 --- a/searchlib/src/tests/attribute/comparator/comparator_test.cpp +++ b/searchlib/src/tests/attribute/comparator/comparator_test.cpp @@ -47,7 +47,7 @@ public: void Test::requireThatNumericComparatorIsWorking() { - NumericEnumStore es(1024, false); + NumericEnumStore es(false); EnumIndex e1, e2; es.addEnum(10, e1); es.addEnum(30, e2); @@ -63,7 +63,7 @@ Test::requireThatNumericComparatorIsWorking() void Test::requireThatFloatComparatorIsWorking() { - FloatEnumStore es(1024, false); + FloatEnumStore es(false); EnumIndex e1, e2, e3; es.addEnum(10.5, e1); es.addEnum(30.5, e2); @@ -83,7 +83,7 @@ Test::requireThatFloatComparatorIsWorking() void Test::requireThatStringComparatorIsWorking() { - StringEnumStore es(1024, false); + StringEnumStore es(false); EnumIndex e1, e2, e3; es.addEnum("Aa", e1); es.addEnum("aa", e2); @@ -102,7 +102,7 @@ Test::requireThatStringComparatorIsWorking() void Test::requireThatComparatorWithTreeIsWorking() { - NumericEnumStore es(2048, false); + NumericEnumStore es(false); vespalib::GenerationHandler g; TreeType t; NodeAllocator m; @@ -129,7 +129,7 @@ Test::requireThatComparatorWithTreeIsWorking() void Test::requireThatFoldedComparatorIsWorking() { - StringEnumStore es(1024, false); + StringEnumStore es(false); EnumIndex e1, e2, e3, e4; es.addEnum("Aa", e1); es.addEnum("aa", e2); diff --git a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp index c4ba8eecf43..f61211283a4 100644 --- a/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp +++ b/searchlib/src/tests/attribute/enumstore/enumstore_test.cpp @@ -17,7 +17,8 @@ size_t enumStoreAlign(size_t size) // IEnumStore::Index(0,0) is reserved thus 16 bytes are reserved in buffer 0 const uint32_t RESERVED_BYTES = 16u; -typedef EnumStoreT<NumericEntryType<uint32_t> > NumericEnumStore; +using NumericEnumStore = EnumStoreT<NumericEntryType<uint32_t> >; +using generation_t = vespalib::GenerationHandler::generation_t; class EnumStoreTest : public vespalib::TestApp { @@ -27,15 +28,6 @@ private: typedef EnumStoreT<NumericEntryType<double> > DoubleEnumStore; typedef IEnumStore::Index EnumIndex; - typedef vespalib::GenerationHandler::generation_t generation_t; - - void testIndex(); - void fillDataBuffer(char * data, uint32_t refCount, - const std::string & string); - void fillDataBuffer(char * data, uint32_t refCount, - uint32_t value); - void testStringEntry(); - void testNumericEntry(); template <typename EnumStoreType, typename T> void testFloatEnumStore(EnumStoreType & es); @@ -51,27 +43,11 @@ private: testUniques(const EnumStoreType &ses, const std::vector<std::string> &unique); - - void testCompaction(); - template <typename EnumStoreType> - void testCompaction(bool hasPostings); - - void testReset(); - template <typename EnumStoreType> - void testReset(bool hasPostings); - void testHoldListAndGeneration(); - void testMemoryUsage(); void requireThatAddressSpaceUsageIsReported(); - void testBufferLimit(); // helper methods typedef std::vector<std::string> StringVector; - template <typename T> - T random(T low, T high); - std::string getRandomString(uint32_t minLen, uint32_t maxLen); - StringVector fillRandomStrings(uint32_t numStrings, uint32_t minLen, uint32_t maxLen); - StringVector sortRandomStrings(StringVector & strings); struct StringEntry { StringEntry(uint32_t r, const std::string & s) : @@ -107,123 +83,6 @@ EnumStoreTest::Reader::Reader(uint32_t generation, const IndexVector & indices, {} EnumStoreTest::Reader::~Reader() { } -void -EnumStoreTest::testIndex() -{ - { - StringEnumStore::Index idx; - EXPECT_TRUE( ! idx.valid()); - EXPECT_EQUAL(idx.offset(), 0u); - EXPECT_TRUE(idx.bufferId() == 0); - } - { - StringEnumStore::Index idx(enumStoreAlign(1000), 0); - EXPECT_TRUE(idx.offset() == enumStoreAlign(1000)); - EXPECT_TRUE(idx.bufferId() == 0); - } - { - StringEnumStore::Index idx((UINT64_C(1) << 31)- RESERVED_BYTES, 1); - EXPECT_TRUE(idx.offset() == (UINT64_C(1) << 31) - RESERVED_BYTES); - EXPECT_TRUE(idx.bufferId() == 1); - } - { - StringEnumStore::Index idx((UINT64_C(1) << 33) - RESERVED_BYTES, 1); - EXPECT_TRUE(idx.offset() == (UINT64_C(1) << 33) - RESERVED_BYTES); - EXPECT_TRUE(idx.bufferId() == 1); - } - { - StringEnumStore::Index idx((UINT64_C(1) << 35) - RESERVED_BYTES, 1); - EXPECT_TRUE(idx.offset() == (UINT64_C(1) << 35) - RESERVED_BYTES); - EXPECT_TRUE(idx.bufferId() == 1); - } - { - // Change offsets when alignment changes. - StringEnumStore::Index idx1(48, 0); - StringEnumStore::Index idx2(80, 0); - StringEnumStore::Index idx3(48, 0); - EXPECT_TRUE(!(idx1 == idx2)); - EXPECT_TRUE(idx1 == idx3); - } - { - EXPECT_TRUE(StringEnumStore::Index::numBuffers() == 2); - } -} - -void -EnumStoreTest::fillDataBuffer(char * data, uint32_t refCount, - const std::string & string) -{ - StringEnumStore::insertEntry(data, refCount, string.c_str()); -} - -void -EnumStoreTest::fillDataBuffer(char * data, uint32_t refCount, - uint32_t value) -{ - NumericEnumStore::insertEntry(data, refCount, value); -} - -void -EnumStoreTest::testStringEntry() -{ - { - char data[9]; - fillDataBuffer(data, 0, ""); - StringEnumStore::Entry e(data); - EXPECT_TRUE(StringEnumStore::getEntrySize("") == - StringEnumStore::alignEntrySize(8 + 1)); - - EXPECT_TRUE(e.getRefCount() == 0); - EXPECT_TRUE(strcmp(e.getValue(), "") == 0); - - e.incRefCount(); - EXPECT_TRUE(e.getRefCount() == 1); - EXPECT_TRUE(strcmp(e.getValue(), "") == 0); - e.decRefCount(); - EXPECT_TRUE(e.getRefCount() == 0); - EXPECT_TRUE(strcmp(e.getValue(), "") == 0); - } - { - char data[18]; - fillDataBuffer(data, 5, "enumstore"); - StringEnumStore::Entry e(data); - EXPECT_TRUE(StringEnumStore::getEntrySize("enumstore") == - StringEnumStore::alignEntrySize(8 + 1 + 9)); - - EXPECT_TRUE(e.getRefCount() == 5); - EXPECT_TRUE(strcmp(e.getValue(), "enumstore") == 0); - - e.incRefCount(); - EXPECT_TRUE(e.getRefCount() == 6); - EXPECT_TRUE(strcmp(e.getValue(), "enumstore") == 0); - e.decRefCount(); - EXPECT_TRUE(e.getRefCount() == 5); - EXPECT_TRUE(strcmp(e.getValue(), "enumstore") == 0); - } -} - -void -EnumStoreTest::testNumericEntry() -{ - { - char data[12]; - fillDataBuffer(data, 20, 30); - NumericEnumStore::Entry e(data); - EXPECT_TRUE(NumericEnumStore::getEntrySize(30) == - NumericEnumStore::alignEntrySize(8 + 4)); - - EXPECT_TRUE(e.getRefCount() == 20); - EXPECT_TRUE(e.getValue() == 30); - - e.incRefCount(); - EXPECT_TRUE(e.getRefCount() == 21); - EXPECT_TRUE(e.getValue() == 30); - e.decRefCount(); - EXPECT_TRUE(e.getRefCount() == 20); - EXPECT_TRUE(e.getValue() == 30); - } -} - template <typename EnumStoreType, typename T> void EnumStoreTest::testFloatEnumStore(EnumStoreType & es) @@ -256,11 +115,11 @@ void EnumStoreTest::testFloatEnumStore() { { - FloatEnumStore fes(1000, false); + FloatEnumStore fes(false); testFloatEnumStore<FloatEnumStore, float>(fes); } { - DoubleEnumStore des(1000, false); + DoubleEnumStore des(false); testFloatEnumStore<DoubleEnumStore, double>(des); } } @@ -268,7 +127,7 @@ EnumStoreTest::testFloatEnumStore() void EnumStoreTest::testFindFolded() { - StringEnumStore ses(100, false); + StringEnumStore ses(false); std::vector<EnumIndex> indices; std::vector<std::string> unique({"", "one", "two", "TWO", "Two", "three"}); for (std::string &str : unique) { @@ -308,15 +167,10 @@ template <typename EnumStoreType> void EnumStoreTest::testAddEnum(bool hasPostings) { - EnumStoreType ses(100, hasPostings); - EXPECT_EQUAL(enumStoreAlign(100u) + RESERVED_BYTES, - ses.getBuffer(0).capacity()); - EXPECT_EQUAL(RESERVED_BYTES, ses.getBuffer(0).size()); - EXPECT_EQUAL(enumStoreAlign(100u), ses.getBuffer(0).remaining()); - EXPECT_EQUAL(RESERVED_BYTES, ses.getBuffer(0).getDeadElems()); + // TODO: Rewrite test to use BatchUpdater + EnumStoreType ses(hasPostings); EnumIndex idx; - uint64_t offset = ses.getBuffer(0).size(); std::vector<EnumIndex> indices; std::vector<std::string> unique; unique.push_back(""); @@ -326,12 +180,9 @@ EnumStoreTest::testAddEnum(bool hasPostings) for (uint32_t i = 0; i < unique.size(); ++i) { ses.addEnum(unique[i].c_str(), idx); - EXPECT_EQUAL(offset, idx.offset()); - EXPECT_EQUAL(0u, idx.bufferId()); ses.incRefCount(idx); EXPECT_EQUAL(1u, ses.getRefCount(idx)); indices.push_back(idx); - offset += EnumStoreType::alignEntrySize(unique[i].size() + 1 + 8); EXPECT_TRUE(ses.findIndex(unique[i].c_str(), idx)); } ses.freezeTree(); @@ -374,197 +225,11 @@ EnumStoreTest::testUniques EXPECT_EQUAL(static_cast<uint32_t>(unique.size()), i); } - -void -EnumStoreTest::testCompaction() -{ - testCompaction<StringEnumStore>(false); - testCompaction<StringEnumStore>(true); -} - -template <typename EnumStoreType> -void -EnumStoreTest::testCompaction(bool hasPostings) -{ - // entrySize = 15 before alignment - uint32_t entrySize = EnumStoreType::alignEntrySize(15); - uint32_t initBufferSize = entrySize * 5; - EnumStoreType ses(initBufferSize, hasPostings); - // Note: Sizes of underlying data store buffers are power of 2. - uint32_t adjustedBufferSize = vespalib::roundUp2inN(initBufferSize) - RESERVED_BYTES; - EnumIndex idx; - std::vector<EnumIndex> indices; - typename EnumStoreType::Type t = "foo"; - std::vector<std::string> uniques; - uniques.push_back("enum00"); - uniques.push_back("enum01"); - uniques.push_back("enum02"); - uniques.push_back("enum03"); - uniques.push_back("enum04"); - - // fill with unique values - for (uint32_t i = 0; i < 5; ++i) { - size_t expRemaining = adjustedBufferSize - i * entrySize; - EXPECT_EQUAL(expRemaining, ses.getRemaining()); - ses.addEnum(uniques[i].c_str(), idx); - ses.incRefCount(idx); - EXPECT_TRUE(ses.getRefCount(idx)); - indices.push_back(idx); - } - EXPECT_EQUAL(32u, ses.getRemaining()); - EXPECT_EQUAL(32u, ses.getBuffer(0).remaining()); - EXPECT_EQUAL(entrySize * 5 + RESERVED_BYTES, ses.getBuffer(0).size()); - EXPECT_EQUAL(RESERVED_BYTES, ses.getBuffer(0).getDeadElems()); - uint32_t failEntrySize = ses.getEntrySize("enum05"); - EXPECT_EQUAL(16u, failEntrySize); - - // change from enum00 -> enum01 - ses.decRefCount(indices[0]); - ses.incRefCount(indices[1]); - indices[0] = indices[1]; - - // check correct refcount - for (uint32_t i = 0; i < 5; ++i) { - EXPECT_TRUE(ses.findIndex(uniques[i].c_str(), idx)); - uint32_t refCount = ses.getRefCount(idx); - if (i == 0) { - EXPECT_TRUE(refCount == 0); - } else if (i == 1) { - EXPECT_TRUE(refCount == 2); - } else { - EXPECT_TRUE(refCount == 1); - } - } - - // free unused enums - ses.freeUnusedEnums(true); - EXPECT_TRUE(!ses.findIndex("enum00", idx)); - EXPECT_EQUAL(entrySize + RESERVED_BYTES, ses.getBuffer(0).getDeadElems()); - - auto &data_store_base = ses.get_data_store_base(); - auto old_compaction_count = data_store_base.get_compaction_count(); - - // perform compaction - IEnumStore::EnumIndexMap old2New; - EXPECT_TRUE(ses.performCompaction(3 * entrySize, old2New)); - EXPECT_TRUE(ses.getRemaining() >= 3 * entrySize); - EXPECT_TRUE(ses.getBuffer(1).remaining() >= 3 * entrySize); - EXPECT_TRUE(ses.getBuffer(1).size() == entrySize * 4); - EXPECT_TRUE(ses.getBuffer(1).getDeadElems() == 0); - - EXPECT_NOT_EQUAL(old_compaction_count, data_store_base.get_compaction_count()); - - // add new unique strings - ses.addEnum("enum05", idx); - ses.addEnum("enum06", idx); - ses.addEnum("enum00", idx); - - // compare old and new indices - for (uint32_t i = 0; i < indices.size(); ++i) { - idx = old2New[indices[i]]; - EXPECT_TRUE(indices[i].bufferId() == 0); - EXPECT_TRUE(idx.bufferId() == 1); - EXPECT_TRUE(ses.getValue(indices[i], t)); - typename EnumStoreType::Type s = "bar"; - EXPECT_TRUE(ses.getValue(idx, s)); - EXPECT_TRUE(strcmp(t, s) == 0); - } - // EnumIndex(0,0) is reserved so we have 4 bytes extra at the start of buffer 0 - idx = old2New[indices[0]]; - EXPECT_EQUAL(entrySize + RESERVED_BYTES, indices[0].offset()); - EXPECT_EQUAL(0u, idx.offset()); - idx = old2New[indices[1]]; - EXPECT_EQUAL(entrySize + RESERVED_BYTES, indices[1].offset()); - EXPECT_EQUAL(0u, idx.offset()); - idx = old2New[indices[2]]; - EXPECT_EQUAL(2 * entrySize + RESERVED_BYTES, indices[2].offset()); - EXPECT_EQUAL(entrySize, idx.offset()); - idx = old2New[indices[3]]; - EXPECT_EQUAL(3 * entrySize + RESERVED_BYTES, indices[3].offset()); - EXPECT_EQUAL(2 * entrySize, idx.offset()); - idx = old2New[indices[4]]; - EXPECT_EQUAL(4 * entrySize + RESERVED_BYTES, indices[4].offset()); - EXPECT_EQUAL(3 * entrySize, idx.offset()); -} - -void -EnumStoreTest::testReset() -{ - testReset<StringEnumStore>(false); - - testReset<StringEnumStore>(true); -} - -template <typename EnumStoreType> -void -EnumStoreTest::testReset(bool hasPostings) -{ - uint32_t numUniques = 10000; - srand(123456789); - StringVector rndStrings = fillRandomStrings(numUniques, 10, 15); - EXPECT_EQUAL(rndStrings.size(), size_t(numUniques)); - StringVector uniques = sortRandomStrings(rndStrings); - EXPECT_EQUAL(uniques.size(), size_t(numUniques)); - // max entrySize = 25 before alignment - uint32_t maxEntrySize = EnumStoreType::alignEntrySize(8 + 1 + 16); - EnumStoreType ses(numUniques * maxEntrySize, hasPostings); - EnumIndex idx; - - uint32_t cnt = 0; - // add new unique strings - for (StringVector::reverse_iterator iter = uniques.rbegin(); iter != uniques.rend(); ++iter) { - ses.addEnum(iter->c_str(), idx); - EXPECT_EQUAL(ses.getNumUniques(), ++cnt); - } - - // check for unique strings - for (StringVector::iterator iter = uniques.begin(); iter != uniques.end(); ++iter) { - EXPECT_TRUE(ses.findIndex(iter->c_str(), idx)); - } - - EXPECT_EQUAL(ses.getNumUniques(), numUniques); - if (hasPostings) { - testUniques<EnumStoreType, EnumPostingTree>(ses, uniques); - } else { - testUniques<EnumStoreType, EnumTree>(ses, uniques); - } - - rndStrings = fillRandomStrings(numUniques, 15, 20); - StringVector newUniques = sortRandomStrings(rndStrings); - - typename EnumStoreType::Builder builder; - for (StringVector::iterator iter = newUniques.begin(); iter != newUniques.end(); ++iter) { - builder.insert(iter->c_str()); - } - - ses.reset(builder); - // Note: Sizes of underlying data store buffers are power of 2. - EXPECT_EQUAL(524288u, ses.getCapacity()); - EXPECT_EQUAL(204272u, ses.getRemaining()); - - // check for old unique strings - for (StringVector::iterator iter = uniques.begin(); iter != uniques.end(); ++iter) { - EXPECT_TRUE(!ses.findIndex(iter->c_str(), idx)); - } - - // check for new unique strings - for (StringVector::iterator iter = newUniques.begin(); iter != newUniques.end(); ++iter) { - EXPECT_TRUE(ses.findIndex(iter->c_str(), idx)); - } - - EXPECT_EQUAL(ses.getNumUniques(), numUniques); - if (hasPostings) { - testUniques<EnumStoreType, EnumPostingTree>(ses, newUniques); - } else { - testUniques<EnumStoreType, EnumTree>(ses, newUniques); - } -} - void EnumStoreTest::testHoldListAndGeneration() { - uint32_t entrySize = StringEnumStore::alignEntrySize(8 + 1 + 6); - StringEnumStore ses(100 * entrySize, false); + // TODO: Rewrite test to use BatchUpdater + StringEnumStore ses(false); StringEnumStore::Index idx; StringVector uniques; generation_t sesGen = 0u; @@ -597,11 +262,11 @@ EnumStoreTest::testHoldListAndGeneration() for (uint32_t j = i - 9; j <= i; ++j) { EXPECT_TRUE(ses.findIndex(uniques[j].c_str(), idx)); indices.push_back(idx); - StringEnumStore::Entry entry = ses.getEntry(idx); - EXPECT_TRUE(entry.getRefCount() == 1); - EXPECT_TRUE(strcmp(entry.getValue(), uniques[j].c_str()) == 0); - expected.push_back(StringEntry(entry.getRefCount(), - std::string(entry.getValue()))); + uint32_t ref_count = ses.getRefCount(idx); + std::string value(ses.getValue(idx)); + EXPECT_EQUAL(1u, ref_count); + EXPECT_EQUAL(uniques[j], value); + expected.emplace_back(ref_count, value); } EXPECT_TRUE(indices.size() == 10); EXPECT_TRUE(expected.size() == 10); @@ -611,10 +276,6 @@ EnumStoreTest::testHoldListAndGeneration() } } - // Note: Sizes of underlying data store buffers are power of 2. - EXPECT_EQUAL(432u, ses.getRemaining()); - EXPECT_EQUAL(RESERVED_BYTES, ses.getBuffer(0).getDeadElems()); - // remove all uniques for (uint32_t i = 0; i < 100; ++i) { EXPECT_TRUE(ses.findIndex(uniques[i].c_str(), idx)); @@ -622,117 +283,12 @@ EnumStoreTest::testHoldListAndGeneration() EXPECT_EQUAL(0u, ses.getRefCount(idx)); } ses.freeUnusedEnums(true); - EXPECT_EQUAL(100 * entrySize + RESERVED_BYTES, ses.getBuffer(0).getDeadElems()); - - // perform compaction - uint32_t newEntrySize = StringEnumStore::alignEntrySize(8 + 1 + 8); - IEnumStore::EnumIndexMap old2New; - EXPECT_TRUE(ses.performCompaction(5 * newEntrySize, old2New)); // check readers again checkReaders(ses, sesGen, readers); - // fill up buffer - uint32_t i = 0; - while (ses.getRemaining() >= newEntrySize) { - //LOG(info, "fill: %s", newUniques[i].c_str()); - ses.addEnum(newUniques[i++].c_str(), idx); - ses.incRefCount(idx); - EXPECT_TRUE(ses.getRefCount(idx)); - } - EXPECT_LESS(ses.getRemaining(), newEntrySize); - // buffer on hold list - old2New.clear(); - EXPECT_TRUE(!ses.performCompaction(5 * newEntrySize, old2New)); - - checkReaders(ses, sesGen, readers); - ses.transferHoldLists(sesGen); - ses.trimHoldLists(sesGen + 1); - - // buffer no longer on hold list - EXPECT_LESS(ses.getRemaining(), newEntrySize); - old2New.clear(); - EXPECT_TRUE(ses.performCompaction(5 * newEntrySize, old2New)); - EXPECT_TRUE(ses.getRemaining() >= 5 * newEntrySize); -} - -void -EnumStoreTest::testMemoryUsage() -{ - StringEnumStore ses(200, false); - StringEnumStore::Index idx; - uint32_t num = 8; - std::vector<StringEnumStore::Index> indices; - std::vector<std::string> uniques; - for (uint32_t i = 0; i < num; ++i) { - std::stringstream ss; - ss << "enum" << i; - uniques.push_back(ss.str()); - } - generation_t sesGen = 0u; - uint32_t entrySize = StringEnumStore::alignEntrySize(8 + 1 + 5); // enum(4) + refcount(4) + 1(\0) + strlen("enumx") - - // usage before inserting enums - vespalib::MemoryUsage usage = ses.getMemoryUsage(); - EXPECT_EQUAL(ses.getNumUniques(), uint32_t(0)); - // Note: Sizes of underlying data store buffers are power of 2. - EXPECT_EQUAL(vespalib::roundUp2inN(enumStoreAlign(200u) + RESERVED_BYTES), usage.allocatedBytes()); - EXPECT_EQUAL(RESERVED_BYTES, usage.usedBytes()); - EXPECT_EQUAL(RESERVED_BYTES, usage.deadBytes()); - EXPECT_EQUAL(0u, usage.allocatedBytesOnHold()); - - for (uint32_t i = 0; i < num; ++i) { - ses.addEnum(uniques[i].c_str(), idx); - indices.push_back(idx); - ses.incRefCount(idx); - EXPECT_TRUE(ses.getRefCount(idx)); - } - - // usage after inserting enums - usage = ses.getMemoryUsage(); - EXPECT_EQUAL(ses.getNumUniques(), num); - // Note: Sizes of underlying data store buffers are power of 2. - EXPECT_EQUAL(vespalib::roundUp2inN(enumStoreAlign(200u) + RESERVED_BYTES), usage.allocatedBytes()); - EXPECT_EQUAL(num * entrySize + RESERVED_BYTES, usage.usedBytes()); - EXPECT_EQUAL(RESERVED_BYTES, usage.deadBytes()); - EXPECT_EQUAL(0u, usage.allocatedBytesOnHold()); - - // assign new enum for num / 2 of indices - for (uint32_t i = 0; i < num / 2; ++i) { - ses.decRefCount(indices[i]); - EXPECT_TRUE(ses.findIndex(uniques.back().c_str(), idx)); - ses.incRefCount(idx); - indices[i] = idx; - } - ses.freeUnusedEnums(true); - - // usage after removing enums - usage = ses.getMemoryUsage(); - EXPECT_EQUAL(ses.getNumUniques(), num / 2); - // Note: Sizes of underlying data store buffers are power of 2. - EXPECT_EQUAL(vespalib::roundUp2inN(enumStoreAlign(200u) + RESERVED_BYTES), usage.allocatedBytes()); - EXPECT_EQUAL(num * entrySize + RESERVED_BYTES, usage.usedBytes()); - EXPECT_EQUAL((num / 2) * entrySize + RESERVED_BYTES, usage.deadBytes()); - EXPECT_EQUAL(0u, usage.allocatedBytesOnHold()); - - IEnumStore::EnumIndexMap old2New; - ses.performCompaction(400, old2New); - - // usage after compaction - vespalib::MemoryUsage usage2 = ses.getMemoryUsage(); - EXPECT_EQUAL(ses.getNumUniques(), num / 2); - EXPECT_EQUAL(usage.usedBytes() + (num / 2) * entrySize, usage2.usedBytes()); - EXPECT_EQUAL(usage.deadBytes(), usage2.deadBytes()); - EXPECT_EQUAL(usage.usedBytes() - usage.deadBytes(), usage2.allocatedBytesOnHold()); - ses.transferHoldLists(sesGen); ses.trimHoldLists(sesGen + 1); - - // usage after hold list trimming - vespalib::MemoryUsage usage3 = ses.getMemoryUsage(); - EXPECT_EQUAL((num / 2) * entrySize, usage3.usedBytes()); - EXPECT_EQUAL(0u, usage3.deadBytes()); - EXPECT_EQUAL(0u, usage3.allocatedBytesOnHold()); } namespace { @@ -747,10 +303,13 @@ addEnum(NumericEnumStore &store, uint32_t value) } void -decRefCount(NumericEnumStore &store, NumericEnumStore::Index idx) +decRefCount(NumericEnumStore& store, NumericEnumStore::Index idx) { store.decRefCount(idx); store.freeUnusedEnums(false); + generation_t gen = 5; + store.transferHoldLists(gen); + store.trimHoldLists(gen + 1); } } @@ -758,106 +317,21 @@ decRefCount(NumericEnumStore &store, NumericEnumStore::Index idx) void EnumStoreTest::requireThatAddressSpaceUsageIsReported() { - const size_t ADDRESS_LIMIT = 34359738368; // NumericEnumStore::DataStoreType::RefType::offsetSize() - NumericEnumStore store(200, false); + // TODO: Rewrite test to use BatchUpdater + const size_t ADDRESS_LIMIT = 4290772994; // Max allocated elements in un-allocated buffers + allocated elements in allocated buffers. + NumericEnumStore store(false); using vespalib::AddressSpace; - EXPECT_EQUAL(AddressSpace(16, 16, ADDRESS_LIMIT), store.getAddressSpaceUsage()); + EXPECT_EQUAL(AddressSpace(1, 1, ADDRESS_LIMIT), store.getAddressSpaceUsage()); NumericEnumStore::Index idx1 = addEnum(store, 10); - EXPECT_EQUAL(AddressSpace(32, 16, ADDRESS_LIMIT), store.getAddressSpaceUsage()); + EXPECT_EQUAL(AddressSpace(2, 1, ADDRESS_LIMIT), store.getAddressSpaceUsage()); NumericEnumStore::Index idx2 = addEnum(store, 20); - EXPECT_EQUAL(AddressSpace(48, 16, ADDRESS_LIMIT), store.getAddressSpaceUsage()); + // Address limit increases because buffer is re-sized. + EXPECT_EQUAL(AddressSpace(3, 1, ADDRESS_LIMIT + 2), store.getAddressSpaceUsage()); decRefCount(store, idx1); - EXPECT_EQUAL(AddressSpace(48, 32, ADDRESS_LIMIT), store.getAddressSpaceUsage()); + EXPECT_EQUAL(AddressSpace(3, 2, ADDRESS_LIMIT + 2), store.getAddressSpaceUsage()); decRefCount(store, idx2); - EXPECT_EQUAL(AddressSpace(48, 48, ADDRESS_LIMIT), store.getAddressSpaceUsage()); -} - -size_t -digits(size_t num) -{ - size_t digits = 1; - while (num / 10 > 0) { - num /= 10; - digits++; - } - return digits; -} - -void -EnumStoreTest::testBufferLimit() -{ - size_t enumSize = StringEnumStore::Index::offsetSize(); - StringEnumStore es(enumSize, false); - - size_t strLen = 65536; - char str[strLen + 1]; - for (size_t i = 0; i < strLen; ++i) { - str[i] = 'X'; - } - str[strLen] = 0; - - size_t entrySize = StringEnumStore::getEntrySize(str); - size_t numUniques = enumSize / entrySize; - size_t uniqDigits = digits(numUniques); - - EnumIndex idx; - EnumIndex lastIdx; - for (size_t i = 0; i < numUniques; ++i) { - sprintf(str, "%0*zu", (int)uniqDigits, i); - str[uniqDigits] = 'X'; - es.addEnum(str, idx); - if (i % (numUniques / 32) == 1) { - EXPECT_TRUE(idx.offset() > lastIdx.offset()); - EXPECT_EQUAL(i + 1, es.getNumUniques()); - std::cout << "idx.offset(" << idx.offset() << "), str(" << std::string(str, uniqDigits) << ")" << std::endl; - } - lastIdx = idx; - } - EXPECT_EQUAL(idx.offset(), lastIdx.offset()); - EXPECT_EQUAL(numUniques, es.getNumUniques()); - std::cout << "idx.offset(" << idx.offset() << "), str(" << std::string(str, uniqDigits) << ")" << std::endl; -} - -template <typename T> -T -EnumStoreTest::random(T low, T high) -{ - return (rand() % (high - low)) + low; -} - -std::string -EnumStoreTest::getRandomString(uint32_t minLen, uint32_t maxLen) -{ - uint32_t len = random(minLen, maxLen); - std::string retval; - for (uint32_t i = 0; i < len; ++i) { - char c = random('a', 'z'); - retval.push_back(c); - } - return retval; -} - -EnumStoreTest::StringVector -EnumStoreTest::fillRandomStrings(uint32_t numStrings, uint32_t minLen, uint32_t maxLen) -{ - StringVector retval; - retval.reserve(numStrings); - for (uint32_t i = 0; i < numStrings; ++i) { - retval.push_back(getRandomString(minLen, maxLen)); - } - return retval; -} - -EnumStoreTest::StringVector -EnumStoreTest::sortRandomStrings(StringVector & strings) -{ - std::sort(strings.begin(), strings.end()); - std::vector<std::string> retval; - retval.reserve(strings.size()); - std::vector<std::string>::iterator pos = std::unique(strings.begin(), strings.end()); - std::copy(strings.begin(), pos, std::back_inserter(retval)); - return retval; + EXPECT_EQUAL(AddressSpace(3, 3, ADDRESS_LIMIT + 2), store.getAddressSpaceUsage()); } void @@ -867,7 +341,7 @@ EnumStoreTest::checkReaders(const StringEnumStore & ses, { (void) sesGen; //uint32_t refCount = 1000; - StringEnumStore::Type t = ""; + StringEnumStore::DataType t = ""; for (uint32_t i = 0; i < readers.size(); ++i) { const Reader & r = readers[i]; for (uint32_t j = 0; j < r._indices.size(); ++j) { @@ -883,20 +357,11 @@ EnumStoreTest::Main() { TEST_INIT("enumstore_test"); - testIndex(); - testStringEntry(); - testNumericEntry(); testFloatEnumStore(); testFindFolded(); testAddEnum(); - testCompaction(); - testReset(); testHoldListAndGeneration(); - testMemoryUsage(); TEST_DO(requireThatAddressSpaceUsageIsReported()); - if (_argc > 1) { - testBufferLimit(); // large test with 8 GB buffer - } TEST_DONE(); } diff --git a/searchlib/src/vespa/searchlib/attribute/attributevector.cpp b/searchlib/src/vespa/searchlib/attribute/attributevector.cpp index 3e949384d4a..5f9ebd1bf44 100644 --- a/searchlib/src/vespa/searchlib/attribute/attributevector.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attributevector.cpp @@ -218,6 +218,12 @@ AttributeVector::updateStatistics(uint64_t numValues, uint64_t numUniqueValue, u _status.updateStatistics(numValues, numUniqueValue, allocated, used, dead, onHold); } +vespalib::MemoryUsage +AttributeVector::getEnumStoreValuesMemoryUsage() const +{ + return vespalib::MemoryUsage(); +} + vespalib::AddressSpace AttributeVector::getEnumStoreAddressSpaceUsage() const { @@ -715,7 +721,7 @@ AttributeVector::getEstimatedSaveByteSize() const uint64_t idxFileSize = 0; uint64_t udatFileSize = 0; size_t fixedWidth = getFixedWidth(); - vespalib::AddressSpace enumAddressSpace(getEnumStoreAddressSpaceUsage()); + vespalib::MemoryUsage values_mem_usage = getEnumStoreValuesMemoryUsage(); if (hasMultiValue()) { idxFileSize = headerSize + sizeof(uint32_t) * (docIdLimit + 1); @@ -723,13 +729,15 @@ AttributeVector::getEstimatedSaveByteSize() const if (hasWeightedSetType()) { weightFileSize = headerSize + sizeof(int32_t) * totalValueCount; } - if (hasEnum() && getEnumeratedSave()) { - datFileSize = headerSize + 4 * totalValueCount; + if (hasEnum()) { + datFileSize = headerSize + sizeof(uint32_t) * totalValueCount; if (fixedWidth != 0) { udatFileSize = headerSize + fixedWidth * uniqueValueCount; } else { - udatFileSize = headerSize + enumAddressSpace.used() - - 8 * uniqueValueCount; + size_t unique_values_bytes = values_mem_usage.usedBytes() - + (values_mem_usage.deadBytes() + values_mem_usage.allocatedBytesOnHold()); + size_t ref_count_mem_usage = sizeof(uint32_t) * uniqueValueCount; + udatFileSize = headerSize + unique_values_bytes - ref_count_mem_usage; } } else { BasicType::Type basicType(getBasicType()); @@ -744,12 +752,7 @@ AttributeVector::getEstimatedSaveByteSize() const datFileSize = headerSize + memorySize; break; case BasicType::Type::STRING: - assert(hasEnum()); - datFileSize = headerSize; - if (uniqueValueCount > 0) { - double avgEntrySize = (static_cast<double>(enumAddressSpace.used()) / uniqueValueCount) - 8; - datFileSize += avgEntrySize * totalValueCount; - } + abort(); break; default: datFileSize = headerSize + fixedWidth * totalValueCount; diff --git a/searchlib/src/vespa/searchlib/attribute/attributevector.h b/searchlib/src/vespa/searchlib/attribute/attributevector.h index 52e63385c7d..b5474fda9c9 100644 --- a/searchlib/src/vespa/searchlib/attribute/attributevector.h +++ b/searchlib/src/vespa/searchlib/attribute/attributevector.h @@ -378,6 +378,7 @@ protected: return value; } + virtual vespalib::MemoryUsage getEnumStoreValuesMemoryUsage() const; virtual vespalib::AddressSpace getEnumStoreAddressSpaceUsage() const; virtual vespalib::AddressSpace getMultiValueAddressSpaceUsage() const; void logEnumStoreEvent(const char *reason, const char *stage); diff --git a/searchlib/src/vespa/searchlib/attribute/enumattribute.h b/searchlib/src/vespa/searchlib/attribute/enumattribute.h index 55af5a874f9..db8952d4f71 100644 --- a/searchlib/src/vespa/searchlib/attribute/enumattribute.h +++ b/searchlib/src/vespa/searchlib/attribute/enumattribute.h @@ -78,6 +78,7 @@ protected: void insertNewUniqueValues(EnumStoreBatchUpdater& updater); virtual void considerAttributeChange(const Change & c, UniqueSet & newUniques) = 0; virtual void reEnumerate(const EnumIndexMap &) = 0; + vespalib::MemoryUsage getEnumStoreValuesMemoryUsage() const override; vespalib::AddressSpace getEnumStoreAddressSpaceUsage() const override; public: EnumAttribute(const vespalib::string & baseFileName, const AttributeVector::Config & cfg); diff --git a/searchlib/src/vespa/searchlib/attribute/enumattribute.hpp b/searchlib/src/vespa/searchlib/attribute/enumattribute.hpp index a5ba60cad4d..57cb33b1b70 100644 --- a/searchlib/src/vespa/searchlib/attribute/enumattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/enumattribute.hpp @@ -13,7 +13,7 @@ EnumAttribute<B>:: EnumAttribute(const vespalib::string &baseFileName, const AttributeVector::Config &cfg) : B(baseFileName, cfg), - _enumStore(0, cfg.fastSearch()) + _enumStore(cfg.fastSearch()) { this->setEnum(true); } @@ -27,7 +27,7 @@ template <typename B> void EnumAttribute<B>::fillEnum(LoadedVector & loaded) { if constexpr(!std::is_same_v<LoadedVector, NoLoadedVector>) { - typename EnumStore::Builder builder; + auto builder = _enumStore.make_builder(); if (!loaded.empty()) { auto value = loaded.read(); LoadedValueType prev = value.getValue(); @@ -36,7 +36,7 @@ void EnumAttribute<B>::fillEnum(LoadedVector & loaded) for (size_t i(0), m(loaded.size()); i < m; ++i, loaded.next()) { value = loaded.read(); if (EnumStore::ComparatorType::compare(prev, value.getValue()) != 0) { - builder.updateRefCount(prevRefCount); + builder.set_ref_count_for_last_value(prevRefCount); index = builder.insert(value.getValue(), value._pidx.ref()); prev = value.getValue(); prevRefCount = 1; @@ -46,9 +46,9 @@ void EnumAttribute<B>::fillEnum(LoadedVector & loaded) value.setEidx(index); loaded.write(value); } - builder.updateRefCount(prevRefCount); + builder.set_ref_count_for_last_value(prevRefCount); } - _enumStore.reset(builder); + builder.build(); } } @@ -93,48 +93,18 @@ EnumAttribute<B>::insertNewUniqueValues(EnumStoreBatchUpdater& updater) considerAttributeChange(data, newUniques); } - uint64_t extraBytesNeeded = 0; - for (const auto & data : newUniques) { - extraBytesNeeded += _enumStore.getEntrySize(data.raw()); - } - - do { - // perform compaction on EnumStore if necessary - if (extraBytesNeeded > this->_enumStore.getRemaining() || - this->_enumStore.getPendingCompact()) - { - this->logEnumStoreEvent("enumstorecompact", "reserve"); - this->removeAllOldGenerations(); - this->_enumStore.clearPendingCompact(); - EnumIndexMap old2New(this->_enumStore.getNumUniques()*3); - this->logEnumStoreEvent("enumstorecompact", "start"); - if (!this->_enumStore.performCompaction(extraBytesNeeded, old2New)) { - this->logEnumStoreEvent("enumstorecompact", "failed_compact"); - // fallback to resize strategy - this->_enumStore.fallbackResize(extraBytesNeeded); - this->logEnumStoreEvent("enumstorecompact", "fallbackresize_complete"); - if (extraBytesNeeded > this->_enumStore.getRemaining()) { - HDR_ABORT("Cannot fallbackResize enumStore"); - } - break; // fallback resize performed instead of compaction. - } - - // update underlying structure with new EnumIndex values. - reEnumerate(old2New); - // Clear scratch enumeration - for (auto & data : this->_changes) { - data._enumScratchPad = ChangeBase::UNSET_ENUM; - } - this->logEnumStoreEvent("enumstorecompact", "complete"); - } - } while (0); - // insert new unique values in EnumStore for (const auto & data : newUniques) { updater.add(data.raw()); } } +template <typename B> +vespalib::MemoryUsage +EnumAttribute<B>::getEnumStoreValuesMemoryUsage() const +{ + return _enumStore.getValuesMemoryUsage(); +} template <typename B> vespalib::AddressSpace diff --git a/searchlib/src/vespa/searchlib/attribute/enumstore.cpp b/searchlib/src/vespa/searchlib/attribute/enumstore.cpp index 4cf5ea9c766..7ce65193c40 100644 --- a/searchlib/src/vespa/searchlib/attribute/enumstore.cpp +++ b/searchlib/src/vespa/searchlib/attribute/enumstore.cpp @@ -10,78 +10,50 @@ LOG_SETUP(".searchlib.attribute.enum_store"); namespace search { -template <> -void -EnumStoreT<StringEntryType>:: -insertEntryValue(char * dst, Type value) -{ - strcpy(dst, value); -} template <> void -EnumStoreT<StringEntryType>::writeValues(BufferWriter &writer, - const Index *idxs, +EnumStoreT<StringEntryType>::writeValues(BufferWriter& writer, + const Index* idxs, size_t count) const { - for (uint32_t i = 0; i < count; ++i) { + for (size_t i = 0; i < count; ++i) { Index idx = idxs[i]; - const char *src(_store.getEntry<char>(idx) + - EntryBase::size()); + const char* src = _store.get(idx); size_t sz = strlen(src) + 1; writer.write(src, sz); } } - template <> ssize_t -EnumStoreT<StringEntryType>::deserialize(const void *src, - size_t available, - size_t &initSpace) +EnumStoreT<StringEntryType>::deserialize(const void* src, + size_t available, + Index& idx) { - size_t slen = strlen(static_cast<const char *>(src)); - size_t sz(StringEntryType::fixedSize() + slen); - if (available < sz) + const char* value = static_cast<const char*>(src); + size_t slen = strlen(value); + size_t sz = slen + 1; + if (available < sz) { return -1; - uint32_t entrySize(alignEntrySize(EntryBase::size() + sz)); - initSpace += entrySize; + } + Index prev_idx = idx; + idx = _store.get_allocator().allocate(value); + + if (prev_idx.valid()) { + assert(ComparatorType::compare(getValue(prev_idx), value) < 0); + } return sz; } - -template <> -ssize_t -EnumStoreT<StringEntryType>::deserialize(const void *src, - size_t available, - Index &idx) +std::unique_ptr<datastore::IUniqueStoreDictionary> +make_enum_store_dictionary(IEnumStore &store, bool has_postings) { - size_t slen = strlen(static_cast<const char *>(src)); - size_t sz(StringEntryType::fixedSize() + slen); - if (available < sz) - return -1; - uint32_t activeBufferId = _store.getActiveBufferId(TYPE_ID); - datastore::BufferState & buffer = _store.getBufferState(activeBufferId); - uint32_t entrySize(alignEntrySize(EntryBase::size() + sz)); - if (buffer.remaining() < entrySize) { - LOG_ABORT("Out of enumstore bufferspace"); - } - uint64_t offset = buffer.size(); - Index newIdx(offset, activeBufferId); - char *dst(_store.getEntry<char>(newIdx)); - memcpy(dst, &dummy_enum_value, sizeof(uint32_t)); - uint32_t pos = sizeof(uint32_t); - uint32_t refCount(0); - memcpy(dst + pos, &refCount, sizeof(uint32_t)); - pos += sizeof(uint32_t); - memcpy(dst + pos, src, sz); - buffer.pushed_back(entrySize); - - if (idx.valid()) { - assert(ComparatorType::compare(getValue(idx), Entry(dst).getValue()) < 0); + if (has_postings) { + return std::make_unique<EnumStoreDictionary<EnumPostingTree>>(store); + } else { + return std::make_unique<EnumStoreDictionary<EnumTree>>(store); } - idx = newIdx; - return sz; } vespalib::asciistream & operator << (vespalib::asciistream & os, const IEnumStore::Index & idx) { diff --git a/searchlib/src/vespa/searchlib/attribute/enumstore.h b/searchlib/src/vespa/searchlib/attribute/enumstore.h index fa5e9611c55..032acfc0ee2 100644 --- a/searchlib/src/vespa/searchlib/attribute/enumstore.h +++ b/searchlib/src/vespa/searchlib/attribute/enumstore.h @@ -10,6 +10,8 @@ #include <vespa/vespalib/btree/btree.h> #include <vespa/vespalib/btree/btreebuilder.h> #include <vespa/vespalib/datastore/entryref.h> +#include <vespa/vespalib/datastore/unique_store.h> +#include <vespa/vespalib/datastore/unique_store_string_allocator.h> #include <vespa/vespalib/util/buffer.h> #include <vespa/vespalib/util/array.h> #include <vespa/vespalib/util/stringfmt.h> @@ -78,202 +80,123 @@ class EnumStoreT : public IEnumStore { friend class EnumStoreTest; public: - using Type = typename EntryType::Type; + using DataType = typename EntryType::Type; using ComparatorType = EnumStoreComparatorT<EntryType>; + using AllocatorType = std::conditional_t<std::is_same_v<DataType, const char *>, + datastore::UniqueStoreStringAllocator<Index>, + datastore::UniqueStoreAllocator<DataType, Index>>; + + using UniqueStoreType = datastore::UniqueStore<DataType, Index, ComparatorType, AllocatorType>; using FoldedComparatorType = EnumStoreFoldedComparatorT<EntryType>; using EnumStoreType = EnumStoreT<EntryType>; - using DataStoreType = datastore::DataStoreT<Index>; + using EntryRef = datastore::EntryRef; using generation_t = vespalib::GenerationHandler::generation_t; - class EntryBase { - protected: - char * _data; - public: - EntryBase(void * data) : _data(static_cast<char *>(data)) {} - uint32_t getRefCount() const { - return *(reinterpret_cast<uint32_t *>(_data) + 1); - } - void incRefCount() { - uint32_t *dst = reinterpret_cast<uint32_t *>(_data) + 1; - ++(*dst); - } - void decRefCount() { - uint32_t *dst = reinterpret_cast<uint32_t *>(_data) + 1; - --(*dst); - } - void setRefCount(uint32_t refCount) { - uint32_t *dst = reinterpret_cast<uint32_t *>(_data) + 1; - *dst = refCount; - } - static uint32_t size() { return 2*sizeof(uint32_t); } - }; - - class Entry : public EntryBase { - public: - Entry(void * data) : EntryBase(data) {} - Type getValue() const; - static uint32_t fixedSize() { return EntryBase::size() + EntryType::fixedSize(); } - }; - - class EnumBufferType : public datastore::BufferType<char> { - private: - size_t _minSizeNeeded; // lower cap for sizeNeeded - size_t _deadElems; // dead elements in active buffer - bool _pendingCompact; - bool _wantCompact; - public: - EnumBufferType(); - size_t calcArraysToAlloc(uint32_t bufferId, size_t sizeNeeded, bool resizing) const override; - void setSizeNeededAndDead(size_t sizeNeeded, size_t deadElems) { - _minSizeNeeded = sizeNeeded; - _deadElems = deadElems; - } - void onFree(size_t usedElems) override { - datastore::BufferType<char>::onFree(usedElems); - _pendingCompact = _wantCompact; - _wantCompact = false; - } - void setWantCompact() { _wantCompact = true; } - bool getPendingCompact() const { return _pendingCompact; } - void clearPendingCompact() { _pendingCompact = false; } - }; - - static void insertEntry(char * dst, uint32_t refCount, Type value); private: - IEnumStoreDictionary *_enumDict; - DataStoreType _store; - EnumBufferType _type; - std::vector<uint32_t> _toHoldBuffers; // used during compaction - - static const uint32_t TYPE_ID = 0; + UniqueStoreType _store; + IEnumStoreDictionary& _dict; EnumStoreT(const EnumStoreT & rhs) = delete; EnumStoreT & operator=(const EnumStoreT & rhs) = delete; - static void insertEntryValue(char * dst, Type value) { - memcpy(dst, &value, sizeof(Type)); - } + void freeUnusedEnum(Index idx, IndexSet& unused) override; - EntryBase getEntryBase(Index idx) const { - return EntryBase(const_cast<DataStoreType &>(_store).getEntry<char>(idx)); + const datastore::UniqueStoreEntryBase& get_entry_base(Index idx) const { + return _store.get_allocator().get_wrapped(idx); } - datastore::BufferState & getBuffer(uint32_t bufferIdx) { - return _store.getBufferState(bufferIdx); - } - const datastore::BufferState & getBuffer(uint32_t bufferIdx) const { - return _store.getBufferState(bufferIdx); - } - bool validIndex(Index idx) const { - return (idx.valid() && idx.offset() < _store.getBufferState(idx.bufferId()).size()); - } - uint32_t getBufferIndex(datastore::BufferState::State status); - void postCompact(); - bool preCompact(uint64_t bytesNeeded); - - Entry getEntry(Index idx) const { - return Entry(const_cast<DataStoreType &>(_store).getEntry<char>(idx)); - } - - void freeUnusedEnum(Index idx, IndexSet & unused) override; public: - EnumStoreT(uint64_t initBufferSize, bool hasPostings); + EnumStoreT(bool hasPostings); virtual ~EnumStoreT(); - void reset(uint64_t initBufferSize); - - uint32_t getRefCount(Index idx) const { return getEntryBase(idx).getRefCount(); } - void incRefCount(Index idx) { getEntryBase(idx).incRefCount(); } - void decRefCount(Index idx) { getEntryBase(idx).decRefCount(); } + uint32_t getRefCount(Index idx) const { return get_entry_base(idx).get_ref_count(); } + // TODO: Remove from public API + void incRefCount(Index idx) { return get_entry_base(idx).inc_ref_count(); } + void decRefCount(Index idx) { return get_entry_base(idx).dec_ref_count(); } // Only use when reading from enumerated attribute save files + // TODO: Instead create an API that is used for loading/initializing. void fixupRefCount(Index idx, uint32_t refCount) override { - getEntryBase(idx).setRefCount(refCount); + get_entry_base(idx).set_ref_count(refCount); } - uint32_t getNumUniques() const override { return _enumDict->getNumUniques(); } + uint32_t getNumUniques() const override { return _dict.getNumUniques(); } - uint32_t getRemaining() const { - return _store.getBufferState(_store.getActiveBufferId(TYPE_ID)).remaining(); - } - uint32_t getCapacity() const { - return _store.getBufferState(_store.getActiveBufferId(TYPE_ID)).capacity(); - } - vespalib::MemoryUsage getMemoryUsage() const override { return _store.getMemoryUsage(); } - vespalib::MemoryUsage getTreeMemoryUsage() const override { return _enumDict->get_memory_usage(); } + vespalib::MemoryUsage getValuesMemoryUsage() const override { return _store.get_allocator().get_data_store().getMemoryUsage(); } + vespalib::MemoryUsage getDictionaryMemoryUsage() const override { return _dict.get_memory_usage(); } vespalib::AddressSpace getAddressSpaceUsage() const; void transferHoldLists(generation_t generation); void trimHoldLists(generation_t firstUsed); - static void failNewSize(uint64_t minNewSize, uint64_t maxSize); - - // Align buffers and entries to 4 bytes boundary. - static uint64_t alignBufferSize(uint64_t val) { return Index::align(val); } - static uint32_t alignEntrySize(uint32_t val) { return Index::align(val); } - - void fallbackResize(uint64_t bytesNeeded); - bool getPendingCompact() const { return _type.getPendingCompact(); } - void clearPendingCompact() { _type.clearPendingCompact(); } - ssize_t deserialize0(const void *src, size_t available, IndexVector &idx) override; ssize_t deserialize(const void *src, size_t available, IndexVector &idx) { - return _enumDict->deserialize(src, available, idx); + return _dict.deserialize(src, available, idx); } - void fixupRefCounts(const EnumVector &hist) { _enumDict->fixupRefCounts(hist); } - void freezeTree() { _enumDict->freeze(); } + void fixupRefCounts(const EnumVector &hist) { _dict.fixupRefCounts(hist); } + void freezeTree() { _store.freeze(); } - IEnumStoreDictionary &getEnumStoreDict() override { return *_enumDict; } - const IEnumStoreDictionary &getEnumStoreDict() const override { return *_enumDict; } - EnumPostingTree &getPostingDictionary() { return _enumDict->getPostingDictionary(); } + IEnumStoreDictionary &getEnumStoreDict() override { return _dict; } + const IEnumStoreDictionary &getEnumStoreDict() const override { return _dict; } + EnumPostingTree &getPostingDictionary() { return _dict.getPostingDictionary(); } const EnumPostingTree &getPostingDictionary() const { - return _enumDict->getPostingDictionary(); + return _dict.getPostingDictionary(); } - const datastore::DataStoreBase &get_data_store_base() const override { return _store; } + // TODO: Add API for getting compaction count instead. + const datastore::DataStoreBase &get_data_store_base() const override { return _store.get_allocator().get_data_store(); } - bool getValue(Index idx, Type & value) const; - Type getValue(uint32_t idx) const { return getValue(Index(datastore::EntryRef(idx))); } - Type getValue(Index idx) const { return getEntry(idx).getValue(); } - static uint32_t getEntrySize(Type value) { - return alignEntrySize(EntryBase::size() + EntryType::size(value)); - } + bool getValue(Index idx, DataType& value) const; + DataType getValue(uint32_t idx) const { return getValue(Index(EntryRef(idx))); } + DataType getValue(Index idx) const { return _store.get(idx); } + // TODO: Implement helper class to populate enum store when loading from enumerated save files. + + /** + * Used when building enum store from non-enumerated save files. + * TODO: Find better name. + */ class Builder { - public: - struct UniqueEntry { - UniqueEntry(const Type & val, size_t sz, uint32_t pidx = 0) : _value(val), _sz(sz), _pidx(pidx), _refCount(1) { } - Type _value; - size_t _sz; - size_t _pidx; - uint32_t _refCount; - }; - - typedef vespalib::Array<UniqueEntry> Uniques; private: - Uniques _uniques; - uint64_t _bufferSize; + AllocatorType& _allocator; + datastore::IUniqueStoreDictionary& _dict; + std::vector<EntryRef> _refs; + std::vector<uint32_t> _payloads; + public: - Builder(); + Builder(AllocatorType& allocator, datastore::IUniqueStoreDictionary& dict) + : _allocator(allocator), + _dict(dict), + _refs(), + _payloads() + { + } ~Builder(); - Index insert(Type value, uint32_t pidx = 0) { - uint32_t entrySize = getEntrySize(value); - _uniques.push_back(UniqueEntry(value, entrySize, pidx)); - Index index(_bufferSize, 0); // bufferId 0 should be used when resetting with a builder - _bufferSize += entrySize; - return index; + Index insert(const DataType& value, uint32_t posting_idx = 0) { + EntryRef new_ref = _allocator.allocate(value); + _refs.emplace_back(new_ref); + _payloads.emplace_back(posting_idx); + return new_ref; + } + void set_ref_count_for_last_value(uint32_t ref_count) { + assert(!_refs.empty()); + _allocator.get_wrapped(_refs.back()).set_ref_count(ref_count); + } + void build() { + _dict.build_with_payload(_refs, _payloads); } - void updateRefCount(uint32_t refCount) { _uniques.rbegin()->_refCount = refCount; } - const Uniques & getUniques() const { return _uniques; } - uint64_t getBufferSize() const { return _bufferSize; } }; + Builder make_builder() { + return Builder(_store.get_allocator(), _dict); + } + class BatchUpdater { private: EnumStoreType& _store; @@ -284,17 +207,19 @@ public: : _store(store), _possibly_unused() {} - void add(Type value) { - Index new_idx; - _store.addEnum(value, new_idx); - _possibly_unused.insert(new_idx); + // TODO: Rename to insert() + void add(DataType value) { + Index idx; + _store.addEnum(value, idx); + _possibly_unused.insert(idx); } void inc_ref_count(Index idx) { - _store.incRefCount(idx); + _store.get_entry_base(idx).inc_ref_count(); } void dec_ref_count(Index idx) { - _store.decRefCount(idx); - if (_store.getRefCount(idx) == 0) { + auto& entry = _store.get_entry_base(idx); + entry.dec_ref_count(); + if (entry.get_ref_count() == 0) { _possibly_unused.insert(idx); } } @@ -307,81 +232,44 @@ public: return BatchUpdater(*this); } + // TODO: Change to sending enum indexes as const array ref. void writeValues(BufferWriter &writer, const Index *idxs, size_t count) const override; ssize_t deserialize(const void *src, size_t available, size_t &initSpace); ssize_t deserialize(const void *src, size_t available, Index &idx); bool foldedChange(const Index &idx1, const Index &idx2) override; - virtual bool findEnum(Type value, IEnumStore::EnumHandle &e) const; - virtual std::vector<IEnumStore::EnumHandle> findFoldedEnums(Type value) const; - void addEnum(Type value, Index &newIdx); - virtual bool findIndex(Type value, Index &idx) const; + bool findEnum(DataType value, IEnumStore::EnumHandle &e) const; + std::vector<IEnumStore::EnumHandle> findFoldedEnums(DataType value) const; + void addEnum(DataType value, Index &newIdx); + bool findIndex(DataType value, Index &idx) const; void freeUnusedEnums(bool movePostingidx) override; void freeUnusedEnums(const IndexSet& toRemove); - void reset(Builder &builder); - bool performCompaction(uint64_t bytesNeeded, EnumIndexMap & old2New); private: template <typename Dictionary> - void reset(Builder &builder, Dictionary &dict); - - template <typename Dictionary> - void addEnum(Type value, Index &newIdx, Dictionary &dict); + void addEnum(DataType value, Index& newIdx, Dictionary& dict); - template <typename Dictionary> - void performCompaction(Dictionary &dict, EnumIndexMap & old2New); }; +std::unique_ptr<datastore::IUniqueStoreDictionary> +make_enum_store_dictionary(IEnumStore &store, bool has_postings); + vespalib::asciistream & operator << (vespalib::asciistream & os, const IEnumStore::Index & idx); extern template class datastore::DataStoreT<IEnumStore::Index>; -template <typename EntryType> -inline typename EntryType::Type -EnumStoreT<EntryType>::Entry::getValue() const // implementation for numeric -{ - Type dst; - const char * src = this->_data + EntryBase::size(); - memcpy(&dst, src, sizeof(Type)); - return dst; -} - -template <> -inline StringEntryType::Type -EnumStoreT<StringEntryType>::Entry::getValue() const -{ - return (_data + EntryBase::size()); -} - - template <> void -EnumStoreT<StringEntryType>::writeValues(BufferWriter &writer, - const Index *idxs, +EnumStoreT<StringEntryType>::writeValues(BufferWriter& writer, + const IEnumStore::Index* idxs, size_t count) const; template <> ssize_t -EnumStoreT<StringEntryType>::deserialize(const void *src, - size_t available, - size_t &initSpace); - -template <> -ssize_t -EnumStoreT<StringEntryType>::deserialize(const void *src, - size_t available, - Index &idx); - - -//----------------------------------------------------------------------------- -// EnumStore -//----------------------------------------------------------------------------- - -template <> -void -EnumStoreT<StringEntryType>:: -insertEntryValue(char * dst, Type value); +EnumStoreT<StringEntryType>::deserialize(const void* src, + size_t available, + Index& idx); extern template diff --git a/searchlib/src/vespa/searchlib/attribute/enumstore.hpp b/searchlib/src/vespa/searchlib/attribute/enumstore.hpp index 428875e00db..254f517ada2 100644 --- a/searchlib/src/vespa/searchlib/attribute/enumstore.hpp +++ b/searchlib/src/vespa/searchlib/attribute/enumstore.hpp @@ -14,174 +14,44 @@ #include <vespa/vespalib/btree/btreeroot.hpp> #include <vespa/vespalib/btree/btreebuilder.hpp> #include <vespa/vespalib/btree/btree.hpp> +#include <vespa/vespalib/datastore/unique_store.hpp> +#include <vespa/vespalib/datastore/unique_store_string_allocator.hpp> #include <vespa/vespalib/util/array.hpp> #include <vespa/vespalib/util/bufferwriter.h> namespace search { -namespace { - -const uint32_t dummy_enum_value = 0; - -} - -template <typename EntryType> -EnumStoreT<EntryType>::EnumBufferType::EnumBufferType() - : datastore::BufferType<char>(Index::align(1), - Index::offsetSize() / Index::align(1), - Index::offsetSize() / Index::align(1)), - _minSizeNeeded(0), - _deadElems(0), - _pendingCompact(false), - _wantCompact(false) -{ -} - -template <typename EntryType> -size_t -EnumStoreT<EntryType>::EnumBufferType::calcArraysToAlloc(uint32_t bufferId, size_t sizeNeeded, bool resizing) const -{ - (void) resizing; - size_t reservedElements = getReservedElements(bufferId); - sizeNeeded = std::max(sizeNeeded, _minSizeNeeded); - size_t usedElems = _activeUsedElems; - if (_lastUsedElems != nullptr) { - usedElems += *_lastUsedElems; - } - assert((usedElems % _arraySize) == 0); - double growRatio = 1.5f; - uint64_t maxSize = static_cast<uint64_t>(_maxArrays) * _arraySize; - uint64_t newSize = usedElems - _deadElems + sizeNeeded; - if (usedElems != 0) { - newSize *= growRatio; - } - newSize += reservedElements; - newSize = alignBufferSize(newSize); - assert((newSize % _arraySize) == 0); - if (newSize <= maxSize) { - return newSize / _arraySize; - } - newSize = usedElems - _deadElems + sizeNeeded + reservedElements + 1000000; - newSize = alignBufferSize(newSize); - assert((newSize % _arraySize) == 0); - if (newSize <= maxSize) { - return _maxArrays; - } - failNewSize(newSize, maxSize); - return 0; -} - -template <typename EntryType> -void -EnumStoreT<EntryType>:: -insertEntry(char * dst, uint32_t refCount, Type value) -{ - memcpy(dst, &dummy_enum_value, sizeof(uint32_t)); - uint32_t pos = sizeof(uint32_t); - memcpy(dst + pos, &refCount, sizeof(uint32_t)); - pos += sizeof(uint32_t); - insertEntryValue(dst + pos, value); -} - -template <> -void -EnumStoreT<StringEntryType>:: -insertEntryValue(char * dst, Type value); - template <typename EntryType> -uint32_t -EnumStoreT<EntryType>::getBufferIndex(datastore::BufferState::State status) +void EnumStoreT<EntryType>::freeUnusedEnum(Index idx, IndexSet& unused) { - for (uint32_t i = 0; i < _store.getNumBuffers(); ++i) { - if (_store.getBufferState(i).getState() == status) { - return i; - } + const auto& entry = get_entry_base(idx); + if (entry.get_ref_count() == 0) { + unused.insert(idx); + _store.get_allocator().hold(idx); } - return Index::numBuffers(); } template <typename EntryType> -void -EnumStoreT<EntryType>::postCompact() +EnumStoreT<EntryType>::EnumStoreT(bool has_postings) + : _store(make_enum_store_dictionary(*this, has_postings)), + _dict(static_cast<IEnumStoreDictionary&>(_store.get_dictionary())) { - _store.finishCompact(_toHoldBuffers); } template <typename EntryType> -bool -EnumStoreT<EntryType>::preCompact(uint64_t bytesNeeded) -{ - if (getBufferIndex(datastore::BufferState::FREE) == Index::numBuffers()) { - return false; - } - uint32_t activeBufId = _store.getActiveBufferId(TYPE_ID); - datastore::BufferState & activeBuf = _store.getBufferState(activeBufId); - _type.setSizeNeededAndDead(bytesNeeded, activeBuf.getDeadElems()); - _toHoldBuffers = _store.startCompact(TYPE_ID); - return true; -} - -template <typename EntryType> -void EnumStoreT<EntryType>::freeUnusedEnum(Index idx, IndexSet & unused) -{ - Entry e = getEntry(idx); - if (e.getRefCount() == 0) { - Type value = e.getValue(); - if (unused.insert(idx).second) { - _store.incDead(idx, getEntrySize(value)); - } - } -} - -template <typename EntryType> -EnumStoreT<EntryType>::EnumStoreT(uint64_t initBufferSize, bool hasPostings) - : _enumDict(nullptr), - _store(), - _type(), - _toHoldBuffers() -{ - if (hasPostings) { - _enumDict = new EnumStoreDictionary<EnumPostingTree>(*this); - } else { - _enumDict = new EnumStoreDictionary<EnumTree>(*this); - } - _store.addType(&_type); - _type.setSizeNeededAndDead(initBufferSize, 0); - _store.initActiveBuffers(); -} - -template <typename EntryType> -EnumStoreT<EntryType>::~EnumStoreT() -{ - _store.clearHoldLists(); - _store.dropBuffers(); - delete _enumDict; -} - -template <typename EntryType> -void -EnumStoreT<EntryType>::reset(uint64_t initBufferSize) -{ - _store.clearHoldLists(); - _store.dropBuffers(); - _type.setSizeNeededAndDead(initBufferSize, 0); - _store.initActiveBuffers(); - _enumDict->onReset(); -} +EnumStoreT<EntryType>::~EnumStoreT() = default; template <typename EntryType> vespalib::AddressSpace EnumStoreT<EntryType>::getAddressSpaceUsage() const { - const datastore::BufferState &activeState = _store.getBufferState(_store.getActiveBufferId(TYPE_ID)); - return vespalib::AddressSpace(activeState.size(), activeState.getDeadElems(), DataStoreType::RefType::offsetSize()); + return _store.get_address_space_usage(); } template <typename EntryType> void EnumStoreT<EntryType>::transferHoldLists(generation_t generation) { - _enumDict->transfer_hold_lists(generation); _store.transferHoldLists(generation); } @@ -190,52 +60,24 @@ void EnumStoreT<EntryType>::trimHoldLists(generation_t firstUsed) { // remove generations in the range [0, firstUsed> - _enumDict->trim_hold_lists(firstUsed); _store.trimHoldLists(firstUsed); } -template <typename EntryType> -void -EnumStoreT<EntryType>::failNewSize(uint64_t minNewSize, uint64_t maxSize) -{ - throw vespalib::IllegalStateException(vespalib::make_string("EnumStoreT::failNewSize: Minimum new size (%" PRIu64 ") exceeds max size (%" PRIu64 ")", minNewSize, maxSize)); -} - -template <typename EntryType> -void -EnumStoreT<EntryType>::fallbackResize(uint64_t bytesNeeded) -{ - uint32_t activeBufId = _store.getActiveBufferId(TYPE_ID); - size_t reservedElements = _type.getReservedElements(activeBufId); - _type.setSizeNeededAndDead(bytesNeeded, reservedElements); - _type.setWantCompact(); - _store.fallbackResize(activeBufId, bytesNeeded); -} template <typename EntryType> ssize_t -EnumStoreT<EntryType>::deserialize0(const void *src, +EnumStoreT<EntryType>::deserialize0(const void* src, size_t available, - IndexVector &idx) + IndexVector& idx) { size_t left = available; - size_t initSpace = Index::align(1); - const char * p = static_cast<const char *>(src); - while (left > 0) { - ssize_t sz = deserialize(p, left, initSpace); - if (sz < 0) - return sz; - p += sz; - left -= sz; - } - reset(initSpace); - left = available; - p = static_cast<const char *>(src); + const char* p = static_cast<const char*>(src); Index idx1; while (left > 0) { ssize_t sz = deserialize(p, left, idx1); - if (sz < 0) + if (sz < 0) { return sz; + } p += sz; left -= sz; idx.push_back(idx1); @@ -245,81 +87,45 @@ EnumStoreT<EntryType>::deserialize0(const void *src, template <typename EntryType> bool -EnumStoreT<EntryType>::getValue(Index idx, Type & value) const +EnumStoreT<EntryType>::getValue(Index idx, DataType& value) const { - if (!validIndex(idx)) { + if (!idx.valid()) { return false; } - value = getEntry(idx).getValue(); + value = _store.get(idx); return true; } template <typename EntryType> -EnumStoreT<EntryType>::Builder::Builder() - : _uniques(), - _bufferSize(Index::align(1)) -{ } - -template <typename EntryType> -EnumStoreT<EntryType>::Builder::~Builder() { } +EnumStoreT<EntryType>::Builder::~Builder() = default; template <class EntryType> void -EnumStoreT<EntryType>::writeValues(BufferWriter &writer, const Index *idxs, size_t count) const +EnumStoreT<EntryType>::writeValues(BufferWriter& writer, const Index* idxs, size_t count) const { - size_t sz(EntryType::fixedSize()); - for (uint32_t i = 0; i < count; ++i) { + for (size_t i = 0; i < count; ++i) { Index idx = idxs[i]; - const char *src(_store.getEntry<char>(idx) + EntryBase::size()); - writer.write(src, sz); + writer.write(&_store.get(idx), sizeof(DataType)); } } template <class EntryType> ssize_t -EnumStoreT<EntryType>::deserialize(const void *src, size_t available, size_t &initSpace) +EnumStoreT<EntryType>::deserialize(const void* src, size_t available, Index& idx) { - (void) src; - size_t sz(EntryType::fixedSize()); - if (available < sz) + if (available < sizeof(DataType)) { return -1; - uint32_t entrySize(alignEntrySize(EntryBase::size() + sz)); - initSpace += entrySize; - return sz; -} - -template <class EntryType> -ssize_t -EnumStoreT<EntryType>::deserialize(const void *src, size_t available, Index &idx) -{ - size_t sz(EntryType::fixedSize()); - if (available < sz) - return -1; - uint32_t activeBufferId = _store.getActiveBufferId(TYPE_ID); - datastore::BufferState & buffer = _store.getBufferState(activeBufferId); - uint32_t entrySize(alignEntrySize(EntryBase::size() + sz)); - if (buffer.remaining() < entrySize) { - HDR_ABORT("not enough space"); } - uint64_t offset = buffer.size(); - Index newIdx(offset, activeBufferId); - char *dst(_store.getEntry<char>(newIdx)); - memcpy(dst, &dummy_enum_value, sizeof(uint32_t)); - uint32_t pos = sizeof(uint32_t); - uint32_t refCount(0); - memcpy(dst + pos, &refCount, sizeof(uint32_t)); - pos += sizeof(uint32_t); - memcpy(dst + pos, src, sz); - buffer.pushed_back(entrySize); + const auto* value = static_cast<const DataType*>(src); + Index prev_idx = idx; + idx = _store.get_allocator().allocate(*value); - if (idx.valid()) { - assert(ComparatorType::compare(getValue(idx), Entry(dst).getValue()) < 0); + if (prev_idx.valid()) { + assert(ComparatorType::compare(getValue(prev_idx), *value) < 0); } - idx = newIdx; - return sz; + return sizeof(DataType); } - template <class EntryType> bool EnumStoreT<EntryType>::foldedChange(const Index &idx1, const Index &idx2) @@ -329,14 +135,13 @@ EnumStoreT<EntryType>::foldedChange(const Index &idx1, const Index &idx2) return cmpres < 0; } - template <typename EntryType> bool -EnumStoreT<EntryType>::findEnum(Type value, IEnumStore::EnumHandle &e) const +EnumStoreT<EntryType>::findEnum(DataType value, IEnumStore::EnumHandle &e) const { ComparatorType cmp(*this, value); Index idx; - if (_enumDict->findFrozenIndex(cmp, idx)) { + if (_dict.findFrozenIndex(cmp, idx)) { e = idx.ref(); return true; } @@ -345,22 +150,20 @@ EnumStoreT<EntryType>::findEnum(Type value, IEnumStore::EnumHandle &e) const template <typename EntryType> std::vector<IEnumStore::EnumHandle> -EnumStoreT<EntryType>::findFoldedEnums(Type value) const +EnumStoreT<EntryType>::findFoldedEnums(DataType value) const { FoldedComparatorType cmp(*this, value); - return _enumDict->findMatchingEnums(cmp); + return _dict.findMatchingEnums(cmp); } - template <typename EntryType> bool -EnumStoreT<EntryType>::findIndex(Type value, Index &idx) const +EnumStoreT<EntryType>::findIndex(DataType value, Index &idx) const { ComparatorType cmp(*this, value); - return _enumDict->findIndex(cmp, idx); + return _dict.findIndex(cmp, idx); } - template <typename EntryType> void EnumStoreT<EntryType>::freeUnusedEnums(bool movePostingIdx) @@ -368,13 +171,12 @@ EnumStoreT<EntryType>::freeUnusedEnums(bool movePostingIdx) ComparatorType cmp(*this); if (EntryType::hasFold() && movePostingIdx) { FoldedComparatorType fcmp(*this); - _enumDict->freeUnusedEnums(cmp, &fcmp); + _dict.freeUnusedEnums(cmp, &fcmp); } else { - _enumDict->freeUnusedEnums(cmp, nullptr); + _dict.freeUnusedEnums(cmp, nullptr); } } - template <typename EntryType> void EnumStoreT<EntryType>::freeUnusedEnums(const IndexSet& toRemove) @@ -382,34 +184,18 @@ EnumStoreT<EntryType>::freeUnusedEnums(const IndexSet& toRemove) ComparatorType cmp(*this); if (EntryType::hasFold()) { FoldedComparatorType fcmp(*this); - _enumDict->freeUnusedEnums(toRemove, cmp, &fcmp); + _dict.freeUnusedEnums(toRemove, cmp, &fcmp); } else { - _enumDict->freeUnusedEnums(toRemove, cmp, nullptr); + _dict.freeUnusedEnums(toRemove, cmp, nullptr); } } - template <typename EntryType> template <typename Dictionary> void -EnumStoreT<EntryType>::addEnum(Type value, Index &newIdx, Dictionary &dict) +EnumStoreT<EntryType>::addEnum(DataType value, Index& newIdx, Dictionary& dict) { typedef typename Dictionary::Iterator DictionaryIterator; - uint32_t entrySize = this->getEntrySize(value); - uint32_t activeBufferId = _store.getActiveBufferId(TYPE_ID); - datastore::BufferState & buffer = _store.getBufferState(activeBufferId); -#ifdef LOG_ENUM_STORE - LOG(info, - "addEnum(): buffer[%u]: capacity = %" PRIu64 - ", size = %" PRIu64 ", remaining = %" PRIu64 - ", dead = %" PRIu64 ", entrySize = %u", - activeBufferId, buffer.capacity(), - buffer.size(), buffer.remaining(), - buffer._deadElems, entrySize); -#endif - if (buffer.remaining() < entrySize) { - HDR_ABORT("not enough space"); - } // check if already present ComparatorType cmp(*this, value); @@ -420,33 +206,31 @@ EnumStoreT<EntryType>::addEnum(Type value, Index &newIdx, Dictionary &dict) return; } - uint64_t offset = buffer.size(); - newIdx = Index(offset, activeBufferId); - char * dst = _store.template getEntry<char>(newIdx); - this->insertEntry(dst, 0, value); - buffer.pushed_back(entrySize); - assert(Index::pad(offset) == 0); + newIdx = _store.get_allocator().allocate(value); + // TODO: Move this logic to "add/insert" on the dictionary // update tree with new index dict.insert(it, newIdx, typename Dictionary::DataType()); - // Copy posting list idx from next entry if same - // folded value. + // Copy posting list idx from next entry if same folded value. // Only for string posting list attributes, i.e. dictionary has // data and entry type has folded compare. if (DictionaryIterator::hasData() && EntryType::hasFold()) { FoldedComparatorType foldCmp(*this); ++it; - if (!it.valid() || foldCmp(newIdx, it.getKey())) + if (!it.valid() || foldCmp(newIdx, it.getKey())) { return; // Next entry does not use same posting list + } --it; --it; - if (it.valid() && !foldCmp(it.getKey(), newIdx)) + if (it.valid() && !foldCmp(it.getKey(), newIdx)) { return; // Previous entry uses same posting list - if (it.valid()) + } + if (it.valid()) { ++it; - else + } else { it.begin(); + } assert(it.valid() && it.getKey() == newIdx); ++it; typename Dictionary::DataType pidx(it.getData()); @@ -458,148 +242,15 @@ EnumStoreT<EntryType>::addEnum(Type value, Index &newIdx, Dictionary &dict) } } - template <typename EntryType> void -EnumStoreT<EntryType>::addEnum(Type value, Index & newIdx) +EnumStoreT<EntryType>::addEnum(DataType value, Index& newIdx) { - if (_enumDict->hasData()) { - addEnum(value, newIdx, static_cast<EnumStoreDictionary<EnumPostingTree> *>(_enumDict)->getDictionary()); + if (_dict.hasData()) { + addEnum(value, newIdx, static_cast<EnumStoreDictionary<EnumPostingTree> &>(_dict).getDictionary()); } else { - addEnum(value, newIdx, static_cast<EnumStoreDictionary<EnumTree> *>(_enumDict)->getDictionary()); + addEnum(value, newIdx, static_cast<EnumStoreDictionary<EnumTree> &>(_dict).getDictionary()); } } -template <typename DictionaryType> -struct TreeBuilderInserter { - static void insert(typename DictionaryType::Builder & builder, - IEnumStore::Index enumIdx, - datastore::EntryRef postingIdx) - { - (void) postingIdx; - builder.insert(enumIdx, typename DictionaryType::DataType()); - } -}; - -template <> -struct TreeBuilderInserter<EnumPostingTree> { - static void insert(EnumPostingTree::Builder & builder, - IEnumStore::Index enumIdx, - datastore::EntryRef postingIdx) - { - builder.insert(enumIdx, postingIdx); - } -}; - - -template <typename EntryType> -template <typename Dictionary> -void -EnumStoreT<EntryType>::reset(Builder &builder, Dictionary &dict) -{ - typedef typename Dictionary::Builder DictionaryBuilder; - reset(builder.getBufferSize()); - - DictionaryBuilder treeBuilder(dict.getAllocator()); - uint32_t activeBufferId = _store.getActiveBufferId(TYPE_ID); - datastore::BufferState & state = _store.getBufferState(activeBufferId); - - // insert entries and update DictionaryBuilder - const typename Builder::Uniques & uniques = builder.getUniques(); - for (typename Builder::Uniques::const_iterator iter = uniques.begin(); - iter != uniques.end(); ++iter) - { - uint64_t offset = state.size(); - Index idx(offset, activeBufferId); - char * dst = _store.template getEntry<char>(idx); - this->insertEntry(dst, iter->_refCount, iter->_value); - state.pushed_back(iter->_sz); - - // update DictionaryBuilder with enum index and posting index - TreeBuilderInserter<Dictionary>::insert(treeBuilder, idx, datastore::EntryRef(iter->_pidx)); - } - - // reset Dictionary - dict.assign(treeBuilder); // destructive copy of treeBuilder } - - -template <typename EntryType> -void -EnumStoreT<EntryType>::reset(Builder &builder) -{ - if (_enumDict->hasData()) { - reset(builder, static_cast<EnumStoreDictionary<EnumPostingTree> *>(_enumDict)->getDictionary()); - } else { - reset(builder, static_cast<EnumStoreDictionary<EnumTree> *>(_enumDict)->getDictionary()); - } -} - - -template <typename EntryType> -template <typename Dictionary> -void -EnumStoreT<EntryType>::performCompaction(Dictionary &dict, EnumIndexMap & old2New) -{ - typedef typename Dictionary::Iterator DictionaryIterator; - uint32_t freeBufferIdx = _store.getActiveBufferId(TYPE_ID); - datastore::BufferState & freeBuf = _store.getBufferState(freeBufferIdx); - // copy entries from active buffer to free buffer - for (DictionaryIterator iter = dict.begin(); iter.valid(); ++iter) { - Index activeIdx = iter.getKey(); - - Entry e = this->getEntry(activeIdx); - - // At this point the tree shall never reference any empty stuff. - assert(e.getRefCount() > 0); -#ifdef LOG_ENUM_STORE - LOG(info, "performCompaction(): copy entry: enum = %u, refCount = %u, value = %s", - e.getEnum(), e.getRefCount(), e.getValue()); -#endif - Type value = e.getValue(); - uint32_t refCount = e.getRefCount(); - uint32_t entrySize = this->getEntrySize(value); - - uint64_t offset = freeBuf.size(); - Index newIdx = Index(offset, freeBufferIdx); - char * dst = _store.template getEntry<char>(newIdx); - // insert entry into free buffer - this->insertEntry(dst, refCount, value); -#ifdef LOG_ENUM_STORE - LOG(info, "performCompaction(): new entry: refCount = %u, value = %s", 0, value); -#endif - freeBuf.pushed_back(entrySize); - assert(Index::pad(offset) == 0); -#ifdef LOG_ENUM_STORE - LOG(info, - "performCompaction(): new index: offset = %" PRIu64 - ", bufferIdx = %u", - offset, freeBufferIdx); -#endif - - // update tree with new index - std::atomic_thread_fence(std::memory_order_release); - iter.writeKey(newIdx); - - old2New[activeIdx] = newIdx; - } - this->postCompact(); -} - - -template <typename EntryType> -bool -EnumStoreT<EntryType>::performCompaction(uint64_t bytesNeeded, EnumIndexMap & old2New) -{ - if ( ! this->preCompact(bytesNeeded) ) { - return false; - } - if (_enumDict->hasData()) { - performCompaction(static_cast<EnumStoreDictionary<EnumPostingTree> *>(_enumDict)->getDictionary(), old2New); - } else { - performCompaction(static_cast<EnumStoreDictionary<EnumTree> *>(_enumDict)->getDictionary(), old2New); - } - return true; -} - -} // namespace search diff --git a/searchlib/src/vespa/searchlib/attribute/i_enum_store.h b/searchlib/src/vespa/searchlib/attribute/i_enum_store.h index 0963e0ff67d..f79098a67df 100644 --- a/searchlib/src/vespa/searchlib/attribute/i_enum_store.h +++ b/searchlib/src/vespa/searchlib/attribute/i_enum_store.h @@ -22,7 +22,7 @@ class IEnumStoreDictionary; */ class IEnumStore { public: - using Index = datastore::AlignedEntryRefT<31, 4>; + using Index = datastore::EntryRefT<22>; using IndexVector = vespalib::Array<Index>; using EnumHandle = attribute::IAttributeVector::EnumHandle; using EnumVector = vespalib::Array<uint32_t>; @@ -52,8 +52,8 @@ public: virtual const IEnumStoreDictionary& getEnumStoreDict() const = 0; virtual const datastore::DataStoreBase& get_data_store_base() const = 0; virtual uint32_t getNumUniques() const = 0; - virtual vespalib::MemoryUsage getMemoryUsage() const = 0; - virtual vespalib::MemoryUsage getTreeMemoryUsage() const = 0; + virtual vespalib::MemoryUsage getValuesMemoryUsage() const = 0; + virtual vespalib::MemoryUsage getDictionaryMemoryUsage() const = 0; template <typename TreeT> diff --git a/searchlib/src/vespa/searchlib/attribute/multienumattribute.hpp b/searchlib/src/vespa/searchlib/attribute/multienumattribute.hpp index 5352dc492fd..9bdc36e805b 100644 --- a/searchlib/src/vespa/searchlib/attribute/multienumattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/multienumattribute.hpp @@ -183,8 +183,8 @@ MultiValueEnumAttribute<B, M>::onUpdateStat() { // update statistics vespalib::MemoryUsage total; - total.merge(this->_enumStore.getMemoryUsage()); - total.merge(this->_enumStore.getTreeMemoryUsage()); + total.merge(this->_enumStore.getValuesMemoryUsage()); + total.merge(this->_enumStore.getDictionaryMemoryUsage()); total.merge(this->_mvMapping.updateStat()); total.merge(this->getChangeVectorMemoryUsage()); mergeMemoryStats(total); diff --git a/searchlib/src/vespa/searchlib/attribute/singleenumattribute.hpp b/searchlib/src/vespa/searchlib/attribute/singleenumattribute.hpp index 08095b6bf13..7f4f7503eff 100644 --- a/searchlib/src/vespa/searchlib/attribute/singleenumattribute.hpp +++ b/searchlib/src/vespa/searchlib/attribute/singleenumattribute.hpp @@ -102,8 +102,8 @@ SingleValueEnumAttribute<B>::onUpdateStat() // update statistics vespalib::MemoryUsage total = _enumIndices.getMemoryUsage(); total.mergeGenerationHeldBytes(getGenerationHolder().getHeldBytes()); - total.merge(this->_enumStore.getMemoryUsage()); - total.merge(this->_enumStore.getTreeMemoryUsage()); + total.merge(this->_enumStore.getValuesMemoryUsage()); + total.merge(this->_enumStore.getDictionaryMemoryUsage()); total.merge(this->getChangeVectorMemoryUsage()); mergeMemoryStats(total); this->updateStatistics(_enumIndices.size(), this->_enumStore.getNumUniques(), total.allocatedBytes(), diff --git a/vespalib/src/vespa/vespalib/datastore/i_unique_store_dictionary.h b/vespalib/src/vespa/vespalib/datastore/i_unique_store_dictionary.h index cda62884318..a780cb4fe98 100644 --- a/vespalib/src/vespa/vespalib/datastore/i_unique_store_dictionary.h +++ b/vespalib/src/vespa/vespalib/datastore/i_unique_store_dictionary.h @@ -45,6 +45,7 @@ public: virtual uint32_t get_num_uniques() const = 0; virtual vespalib::MemoryUsage get_memory_usage() const = 0; virtual void build(const std::vector<EntryRef> &refs, const std::vector<uint32_t> &ref_counts, std::function<void(EntryRef)> hold) = 0; + virtual void build_with_payload(const std::vector<EntryRef>& refs, const std::vector<uint32_t>& payloads) = 0; virtual std::unique_ptr<ReadSnapshot> get_read_snapshot() const = 0; virtual EntryRef get_frozen_root() const = 0; }; diff --git a/vespalib/src/vespa/vespalib/datastore/unique_store.h b/vespalib/src/vespa/vespalib/datastore/unique_store.h index bf7808e9325..6b85e79d3eb 100644 --- a/vespalib/src/vespa/vespalib/datastore/unique_store.h +++ b/vespalib/src/vespa/vespalib/datastore/unique_store.h @@ -44,6 +44,7 @@ private: public: UniqueStore(); + UniqueStore(std::unique_ptr<IUniqueStoreDictionary> dict); ~UniqueStore(); UniqueStoreAddResult add(EntryConstRefType value); EntryRef find(EntryConstRefType value); @@ -51,6 +52,12 @@ public: void remove(EntryRef ref); ICompactionContext::UP compactWorst(); vespalib::MemoryUsage getMemoryUsage() const; + vespalib::AddressSpace get_address_space_usage() const; + + // TODO: Consider exposing only the needed functions from allocator + Allocator& get_allocator() { return _allocator; } + const Allocator& get_allocator() const { return _allocator; } + IUniqueStoreDictionary& get_dictionary() { return *_dict; } // Pass on hold list management to underlying store void transferHoldLists(generation_t generation); diff --git a/vespalib/src/vespa/vespalib/datastore/unique_store.hpp b/vespalib/src/vespa/vespalib/datastore/unique_store.hpp index f1b60845403..ebd81010612 100644 --- a/vespalib/src/vespa/vespalib/datastore/unique_store.hpp +++ b/vespalib/src/vespa/vespalib/datastore/unique_store.hpp @@ -28,9 +28,15 @@ using DefaultUniqueStoreDictionary = UniqueStoreDictionary<DefaultDictionary>; template <typename EntryT, typename RefT, typename Compare, typename Allocator> UniqueStore<EntryT, RefT, Compare, Allocator>::UniqueStore() + : UniqueStore<EntryT, RefT, Compare, Allocator>(std::make_unique<uniquestore::DefaultUniqueStoreDictionary>()) +{ +} + +template <typename EntryT, typename RefT, typename Compare, typename Allocator> +UniqueStore<EntryT, RefT, Compare, Allocator>::UniqueStore(std::unique_ptr<IUniqueStoreDictionary> dict) : _allocator(), _store(_allocator.get_data_store()), - _dict(std::make_unique<uniquestore::DefaultUniqueStoreDictionary>()) + _dict(std::move(dict)) { } @@ -178,6 +184,13 @@ UniqueStore<EntryT, RefT, Compare, Allocator>::getMemoryUsage() const } template <typename EntryT, typename RefT, typename Compare, typename Allocator> +vespalib::AddressSpace +UniqueStore<EntryT, RefT, Compare, Allocator>::get_address_space_usage() const +{ + return _allocator.get_data_store().getAddressSpaceUsage(); +} + +template <typename EntryT, typename RefT, typename Compare, typename Allocator> const BufferState & UniqueStore<EntryT, RefT, Compare, Allocator>::bufferState(EntryRef ref) const { diff --git a/vespalib/src/vespa/vespalib/datastore/unique_store_allocator.h b/vespalib/src/vespa/vespalib/datastore/unique_store_allocator.h index 1981a190cc6..a4443742e33 100644 --- a/vespalib/src/vespa/vespalib/datastore/unique_store_allocator.h +++ b/vespalib/src/vespa/vespalib/datastore/unique_store_allocator.h @@ -42,6 +42,7 @@ public: return get_wrapped(ref).value(); } DataStoreType& get_data_store() { return _store; } + const DataStoreType& get_data_store() const { return _store; } }; } diff --git a/vespalib/src/vespa/vespalib/datastore/unique_store_builder.h b/vespalib/src/vespa/vespalib/datastore/unique_store_builder.h index a0e9f3d63a7..7f5162d97ff 100644 --- a/vespalib/src/vespa/vespalib/datastore/unique_store_builder.h +++ b/vespalib/src/vespa/vespalib/datastore/unique_store_builder.h @@ -23,6 +23,7 @@ class UniqueStoreBuilder { IUniqueStoreDictionary& _dict; std::vector<EntryRef> _refs; std::vector<uint32_t> _refCounts; + public: UniqueStoreBuilder(Allocator& allocator, IUniqueStoreDictionary& dict, uint32_t uniqueValuesHint); ~UniqueStoreBuilder(); diff --git a/vespalib/src/vespa/vespalib/datastore/unique_store_dictionary.h b/vespalib/src/vespa/vespalib/datastore/unique_store_dictionary.h index 4ae32c45dea..15b947e283b 100644 --- a/vespalib/src/vespa/vespalib/datastore/unique_store_dictionary.h +++ b/vespalib/src/vespa/vespalib/datastore/unique_store_dictionary.h @@ -48,6 +48,7 @@ public: uint32_t get_num_uniques() const override; vespalib::MemoryUsage get_memory_usage() const override; void build(const std::vector<EntryRef> &refs, const std::vector<uint32_t> &ref_counts, std::function<void(EntryRef)> hold) override; + void build_with_payload(const std::vector<EntryRef>& refs, const std::vector<uint32_t>& payloads) override; std::unique_ptr<ReadSnapshot> get_read_snapshot() const override; EntryRef get_frozen_root() const override; }; diff --git a/vespalib/src/vespa/vespalib/datastore/unique_store_dictionary.hpp b/vespalib/src/vespa/vespalib/datastore/unique_store_dictionary.hpp index f3087bc5610..3784b903ad6 100644 --- a/vespalib/src/vespa/vespalib/datastore/unique_store_dictionary.hpp +++ b/vespalib/src/vespa/vespalib/datastore/unique_store_dictionary.hpp @@ -176,6 +176,23 @@ UniqueStoreDictionary<DictionaryT, ParentT>::build(const std::vector<EntryRef> & } template <typename DictionaryT, typename ParentT> +void +UniqueStoreDictionary<DictionaryT, ParentT>::build_with_payload(const std::vector<EntryRef>& refs, + const std::vector<uint32_t>& payloads) +{ + assert(refs.size() == payloads.size()); + typename DictionaryType::Builder builder(_dict.getAllocator()); + for (size_t i = 0; i < refs.size(); ++i) { + if constexpr (std::is_same_v<DataType, uint32_t>) { + builder.insert(refs[i], payloads[i]); + } else { + builder.insert(refs[i], DataType()); + } + } + _dict.assign(builder); +} + +template <typename DictionaryT, typename ParentT> std::unique_ptr<typename ParentT::ReadSnapshot> UniqueStoreDictionary<DictionaryT, ParentT>::get_read_snapshot() const { |