diff options
author | Håvard Pettersen <havardpe@oath.com> | 2021-01-12 14:45:24 +0000 |
---|---|---|
committer | Håvard Pettersen <havardpe@oath.com> | 2021-01-13 15:10:48 +0000 |
commit | 45ecd1ac250ecf42875bcac1a436ad58aa95fad5 (patch) | |
tree | 4a6b7076a5da2ecd83955873ca5d0e2da4657e47 /vespalib | |
parent | d9db41186c2732dbb3eec3d0259a6020adc75b48 (diff) |
use direct string id for small numbers
add stat for min_free (free address space in most filled part)
fail with assert when address space is exhausted, but not before
Diffstat (limited to 'vespalib')
3 files changed, 192 insertions, 32 deletions
diff --git a/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp b/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp index aaf4a5e3e56..ddbde3b0485 100644 --- a/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp +++ b/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp @@ -30,6 +30,15 @@ std::vector<vespalib::string> make_strings(size_t cnt) { return strings; } +std::vector<vespalib::string> make_direct_strings(size_t cnt) { + std::vector<vespalib::string> strings; + strings.reserve(cnt); + for (size_t i = 0; i < cnt; ++i) { + strings.push_back(fmt("%zu", (i % 100000))); + } + return strings; +} + std::vector<vespalib::string> copy_strings(const std::vector<vespalib::string> &strings) { return strings; } @@ -148,10 +157,11 @@ struct Fixture { Avg avg; Vote vote; std::vector<vespalib::string> work; + std::vector<vespalib::string> direct_work; steady_time start_time; std::map<vespalib::string,double> time_ms; Fixture(size_t num_threads) - : avg(num_threads), vote(num_threads), work(make_strings(work_size)), start_time(steady_clock::now()) {} + : avg(num_threads), vote(num_threads), work(make_strings(work_size)), direct_work(make_direct_strings(work_size)), start_time(steady_clock::now()) {} ~Fixture() { if (verbose) { fprintf(stderr, "benchmark results for %zu threads:\n", vote.num_threads()); @@ -183,9 +193,11 @@ struct Fixture { std::vector<std::pair<vespalib::string,uint64_t>> copy_and_hash_result; std::vector<uint32_t> local_enum_result; std::vector<Handle> resolve_result; + std::vector<Handle> resolve_direct_result; std::vector<Handle> copy_handles_result; std::vector<Handle> resolve_again_result; std::vector<vespalib::string> get_result; + std::vector<vespalib::string> get_direct_result; std::unique_ptr<SharedStringRepo::Handles> strong; std::unique_ptr<SharedStringRepo::Handles> strong_copy; std::unique_ptr<std::vector<string_id>> weak; @@ -193,9 +205,11 @@ struct Fixture { auto copy_and_hash_task = [&](){ copy_and_hash_result = copy_and_hash(work); }; auto local_enum_task = [&](){ local_enum_result = local_enum(work); }; auto resolve_task = [&](){ resolve_result = resolve_strings(work); }; + auto resolve_direct_task = [&](){ resolve_direct_result = resolve_strings(direct_work); }; auto copy_handles_task = [&](){ copy_handles_result = resolve_result; }; auto resolve_again_task = [&](){ resolve_again_result = resolve_strings(work); }; auto get_task = [&](){ get_result = get_strings(resolve_result); }; + auto get_direct_task = [&](){ get_direct_result = get_strings(resolve_direct_result); }; auto reclaim_task = [&]() { resolve_again_result.clear(); }; auto reclaim_last_task = [&]() { resolve_result.clear(); }; auto make_strong_task = [&]() { strong = make_strong_handles(work); }; @@ -208,20 +222,23 @@ struct Fixture { measure_task("[02] copy and hash", is_master, copy_and_hash_task); measure_task("[03] local enum", is_master, local_enum_task); measure_task("[04] resolve", is_master, resolve_task); - measure_task("[05] copy handles", is_master, copy_handles_task); - measure_task("[06] resolve again", is_master, resolve_again_task); + measure_task("[05] resolve direct", is_master, resolve_direct_task); + measure_task("[06] copy handles", is_master, copy_handles_task); + measure_task("[07] resolve again", is_master, resolve_again_task); verify_equal(resolve_result, resolve_again_result); - measure_task("[07] as_string", is_master, get_task); + measure_task("[08] as_string", is_master, get_task); + measure_task("[09] as_string direct", is_master, get_direct_task); verify_equal(get_result, work); - measure_task("[08] reclaim", is_master, reclaim_task); + verify_equal(get_direct_result, direct_work); + measure_task("[10] reclaim", is_master, reclaim_task); copy_handles_result.clear(); - measure_task("[09] reclaim last", is_master, reclaim_last_task); - measure_task("[10] make strong handles", is_master, make_strong_task); - measure_task("[11] copy strong handles", is_master, copy_strong_task); - measure_task("[12] make weak handles", is_master, make_weak_task); - measure_task("[13] free weak handles", is_master, free_weak_task); - measure_task("[14] free strong handles copy", is_master, free_strong_copy_task); - measure_task("[15] free strong handles", is_master, free_strong_task); + measure_task("[11] reclaim last", is_master, reclaim_last_task); + measure_task("[12] make strong handles", is_master, make_strong_task); + measure_task("[13] copy strong handles", is_master, copy_strong_task); + measure_task("[14] make weak handles", is_master, make_weak_task); + measure_task("[15] free weak handles", is_master, free_weak_task); + measure_task("[16] free strong handles copy", is_master, free_strong_copy_task); + measure_task("[17] free strong handles", is_master, free_strong_task); } } }; @@ -250,11 +267,52 @@ void verify_not_eq(const Handle &a, const Handle &b) { //----------------------------------------------------------------------------- +TEST("require that empty stats object have expected values") { + size_t part_size = (uint32_t(-1) - 100001) / 64; + SharedStringRepo::Stats empty; + EXPECT_EQUAL(empty.active_entries, 0u); + EXPECT_EQUAL(empty.total_entries, 0u); + EXPECT_EQUAL(empty.min_free, part_size); + if (verbose) { + fprintf(stderr, "max entries per part: %zu\n", empty.min_free); + } +} + +TEST("require that stats can be merged") { + SharedStringRepo::Stats a; + SharedStringRepo::Stats b; + a.active_entries = 1; + a.total_entries = 10; + a.min_free = 100; + b.active_entries = 3; + b.total_entries = 20; + b.min_free = 50; + a.merge(b); + EXPECT_EQUAL(a.active_entries, 4u); + EXPECT_EQUAL(a.total_entries, 30u); + EXPECT_EQUAL(a.min_free, 50u); +} + +TEST("require that id_space_usage is sane") { + SharedStringRepo::Stats empty; + SharedStringRepo::Stats stats; + stats.min_free = empty.min_free; + EXPECT_EQUAL(stats.id_space_usage(), 0.0); + stats.min_free = empty.min_free / 2; + EXPECT_EQUAL(stats.id_space_usage(), 0.5); + stats.min_free = empty.min_free / 4; + EXPECT_EQUAL(stats.id_space_usage(), 0.75); + stats.min_free = 0; + EXPECT_EQUAL(stats.id_space_usage(), 1.0); +} + +//----------------------------------------------------------------------------- + TEST("require that basic handle usage works") { Handle empty; Handle foo("foo"); Handle bar("bar"); - Handle empty2; + Handle empty2(""); Handle foo2("foo"); Handle bar2("bar"); @@ -315,6 +373,39 @@ TEST("require that handle can be self-assigned") { //----------------------------------------------------------------------------- +void verify_direct(const vespalib::string &str) { + size_t before = SharedStringRepo::stats().active_entries; + Handle handle(str); + EXPECT_EQUAL(SharedStringRepo::stats().active_entries, before); + EXPECT_EQUAL(handle.as_string(), str); +} + +void verify_not_direct(const vespalib::string &str) { + size_t before = SharedStringRepo::stats().active_entries; + Handle handle(str); + EXPECT_EQUAL(SharedStringRepo::stats().active_entries, before + 1); + EXPECT_EQUAL(handle.as_string(), str); +} + +TEST("require that direct handles work as expected") { + TEST_DO(verify_direct("")); + for (size_t i = 0; i < 100000; ++i) { + verify_direct(fmt("%zu", i)); + } + TEST_DO(verify_not_direct(" ")); + TEST_DO(verify_not_direct(" 5")); + TEST_DO(verify_not_direct("5 ")); + TEST_DO(verify_not_direct("100000")); + TEST_DO(verify_not_direct("00")); + TEST_DO(verify_not_direct("01")); + TEST_DO(verify_not_direct("001")); + TEST_DO(verify_not_direct("-0")); + TEST_DO(verify_not_direct("-1")); + TEST_DO(verify_not_direct("a1")); +} + +//----------------------------------------------------------------------------- + TEST("require that basic multi-handle usage works") { Handles a; a.reserve(4); @@ -336,6 +427,26 @@ TEST("require that basic multi-handle usage works") { //----------------------------------------------------------------------------- +#if 0 +// needs a lot of memory or tweaking of PART_LIMIT +TEST("allocate handles until we run out") { + size_t cnt = 0; + std::vector<Handle> handles; + for (;;) { + auto stats = SharedStringRepo::stats(); + fprintf(stderr, "cnt: %zu, used: %zu/%zu, min free: %zu, usage: %g\n", + cnt, stats.active_entries, stats.total_entries, stats.min_free, + stats.id_space_usage()); + size_t n = std::max(size_t(1), stats.min_free); + for (size_t i = 0; i < n; ++i) { + handles.emplace_back(fmt("my_id_%zu", cnt++)); + } + } +} +#endif + +//----------------------------------------------------------------------------- + TEST_MT_F("test shared string repo operations with 1 threads", 1, Fixture(num_threads)) { f1.benchmark(thread_id == 0); } @@ -368,7 +479,6 @@ TEST_MT_F("test shared string repo operations with 64 threads", 64, Fixture(num_ #if 0 // verify leak-detection and reporting - TEST("leak some handles on purpose") { new Handle("leaked string"); new Handle("also leaked"); diff --git a/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp b/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp index 187f586d344..c39b79f7899 100644 --- a/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp +++ b/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp @@ -9,7 +9,8 @@ namespace vespalib { SharedStringRepo::Stats::Stats() : active_entries(0), - total_entries(0) + total_entries(0), + min_free(PART_LIMIT) { } @@ -18,6 +19,13 @@ SharedStringRepo::Stats::merge(const Stats &s) { active_entries += s.active_entries; total_entries += s.total_entries; + min_free = std::min(min_free, s.min_free); +} + +double +SharedStringRepo::Stats::id_space_usage() const +{ + return (1.0 - (double(min_free) / double(PART_LIMIT))); } SharedStringRepo::Partition::~Partition() = default; @@ -41,6 +49,7 @@ SharedStringRepo::Partition::stats() const std::lock_guard guard(_lock); stats.active_entries = _hash.size(); stats.total_entries = _entries.size(); + stats.min_free = (PART_LIMIT - _hash.size()); return stats; } @@ -49,7 +58,9 @@ SharedStringRepo::Partition::make_entries(size_t hint) { hint = std::max(hint, _entries.size() + 1); size_t want_mem = roundUp2inN(hint * sizeof(Entry)); - size_t want_entries = want_mem / sizeof(Entry); + size_t want_entries = want_mem / sizeof(Entry); + want_entries = std::min(want_entries, PART_LIMIT); + assert(want_entries > _entries.size()); _entries.reserve(want_entries); while (_entries.size() < _entries.capacity()) { _entries.emplace_back(_free); diff --git a/vespalib/src/vespa/vespalib/util/shared_string_repo.h b/vespalib/src/vespa/vespalib/util/shared_string_repo.h index 7e9f5d41f96..518d3874b7d 100644 --- a/vespalib/src/vespa/vespalib/util/shared_string_repo.h +++ b/vespalib/src/vespa/vespalib/util/shared_string_repo.h @@ -13,6 +13,9 @@ #include <vector> #include <array> #include <cassert> +#include <ctype.h> +#include <limits> +#include <charconv> namespace vespalib { @@ -29,14 +32,20 @@ public: struct Stats { size_t active_entries; size_t total_entries; + size_t min_free; Stats(); void merge(const Stats &s); + double id_space_usage() const; }; private: - static constexpr int NUM_PARTS = 64; - static constexpr int PART_BITS = 6; - static constexpr int PART_MASK = 0x3f; + static constexpr uint32_t PART_BITS = 6; + static constexpr uint32_t NUM_PARTS = 1 << PART_BITS; + static constexpr uint32_t PART_MASK = NUM_PARTS - 1; + static constexpr uint32_t FAST_DIGITS = 5; + static constexpr uint32_t FAST_ID_MAX = 99999; + static constexpr uint32_t ID_BIAS = (FAST_ID_MAX + 2); + static constexpr size_t PART_LIMIT = (std::numeric_limits<uint32_t>::max() - ID_BIAS) / NUM_PARTS; struct AltKey { vespalib::stringref str; @@ -166,41 +175,71 @@ private: SharedStringRepo(); ~SharedStringRepo(); + static uint32_t try_make_direct_id(vespalib::stringref str) noexcept { + if ((str.size() > FAST_DIGITS) || ((str.size() > 1) && (str[0] == '0'))) { + return ID_BIAS; + } else if (str.size() == 0) { + return 0; + } else { + uint32_t value = 0; + for (size_t i = 0; i < str.size(); ++i) { + char c = str[i]; + if (!isdigit(c)) { + return ID_BIAS; + } else { + value = ((value * 10) + (c - '0')); + } + } + return (value + 1); + } + } + + static vespalib::string string_from_direct_id(uint32_t id) { + if (id == 0) { + return {}; + } else { + char tmp[16]; + auto res = std::to_chars(tmp, tmp + sizeof(tmp), (id - 1), 10); + return vespalib::string(tmp, res.ptr - tmp); + } + } + string_id resolve(vespalib::stringref str) { - if (!str.empty()) { + uint32_t direct_id = try_make_direct_id(str); + if (direct_id >= ID_BIAS) { uint64_t full_hash = XXH3_64bits(str.data(), str.size()); uint32_t part = full_hash & PART_MASK; uint32_t local_hash = full_hash >> PART_BITS; uint32_t local_idx = _partitions[part].resolve(AltKey{str, local_hash}); - return string_id(((local_idx << PART_BITS) | part) + 1); + return string_id(((local_idx << PART_BITS) | part) + ID_BIAS); } else { - return {}; + return string_id(direct_id); } } vespalib::string as_string(string_id id) { - if (id._id != 0) { - uint32_t part = (id._id - 1) & PART_MASK; - uint32_t local_idx = (id._id - 1) >> PART_BITS; + if (id._id >= ID_BIAS) { + uint32_t part = (id._id - ID_BIAS) & PART_MASK; + uint32_t local_idx = (id._id - ID_BIAS) >> PART_BITS; return _partitions[part].as_string(local_idx); } else { - return {}; + return string_from_direct_id(id._id); } } string_id copy(string_id id) { - if (id._id != 0) { - uint32_t part = (id._id - 1) & PART_MASK; - uint32_t local_idx = (id._id - 1) >> PART_BITS; + if (id._id >= ID_BIAS) { + uint32_t part = (id._id - ID_BIAS) & PART_MASK; + uint32_t local_idx = (id._id - ID_BIAS) >> PART_BITS; _partitions[part].copy(local_idx); } return id; } void reclaim(string_id id) { - if (id._id != 0) { - uint32_t part = (id._id - 1) & PART_MASK; - uint32_t local_idx = (id._id - 1) >> PART_BITS; + if (id._id >= ID_BIAS) { + uint32_t part = (id._id - ID_BIAS) & PART_MASK; + uint32_t local_idx = (id._id - ID_BIAS) >> PART_BITS; _partitions[part].reclaim(local_idx); } } |