diff options
author | Håvard Pettersen <havardpe@oath.com> | 2020-11-18 09:45:19 +0000 |
---|---|---|
committer | Håvard Pettersen <havardpe@oath.com> | 2020-11-25 16:55:18 +0000 |
commit | a57880315976dc49be7f32f1736571845659ed0a (patch) | |
tree | 7285188be36987559870cfd9d8b2ac2d4271a939 /vespalib | |
parent | e1584673531bc771fa94731da337ce311b4ff7d1 (diff) |
shared string repo -- WIP
Diffstat (limited to 'vespalib')
-rw-r--r-- | vespalib/CMakeLists.txt | 1 | ||||
-rw-r--r-- | vespalib/src/tests/shared_string_repo/CMakeLists.txt | 8 | ||||
-rw-r--r-- | vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp | 219 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/util/CMakeLists.txt | 3 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/util/shared_string_repo.cpp | 11 | ||||
-rw-r--r-- | vespalib/src/vespa/vespalib/util/shared_string_repo.h | 193 |
6 files changed, 434 insertions, 1 deletions
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt index dbcf2beadd8..a5e6b24abb2 100644 --- a/vespalib/CMakeLists.txt +++ b/vespalib/CMakeLists.txt @@ -94,6 +94,7 @@ vespa_define_module( src/tests/rendezvous src/tests/runnable_pair src/tests/sha1 + src/tests/shared_string_repo src/tests/sharedptr src/tests/signalhandler src/tests/simple_thread_bundle diff --git a/vespalib/src/tests/shared_string_repo/CMakeLists.txt b/vespalib/src/tests/shared_string_repo/CMakeLists.txt new file mode 100644 index 00000000000..3911deb9e91 --- /dev/null +++ b/vespalib/src/tests/shared_string_repo/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vespalib_shared_string_repo_test_app TEST + SOURCES + shared_string_repo_test.cpp + DEPENDS + vespalib +) +vespa_add_test(NAME vespalib_shared_string_repo_test_app COMMAND vespalib_shared_string_repo_test_app) diff --git a/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp b/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp new file mode 100644 index 00000000000..57faa76061a --- /dev/null +++ b/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp @@ -0,0 +1,219 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/util/shared_string_repo.h> +#include <vespa/vespalib/util/rendezvous.h> +#include <vespa/vespalib/util/time.h> +#include <vespa/vespalib/util/stringfmt.h> +#include <vespa/vespalib/testkit/test_kit.h> +#include <vector> +#include <map> + +using namespace vespalib; +using make_string_short::fmt; + +bool verbose = false; +double budget = 0.10; +size_t work_size = 4096; + +//----------------------------------------------------------------------------- + +std::vector<vespalib::string> make_strings(size_t cnt) { + std::vector<vespalib::string> strings; + strings.reserve(cnt); + for (size_t i = 0; i < cnt; ++i) { + strings.push_back(fmt("str_%zu", i)); + } + return strings; +} + +std::vector<vespalib::string> copy_strings(const std::vector<vespalib::string> &strings) { + std::vector<vespalib::string> copy; + copy.reserve(strings.size()); + for (size_t i = 0; i < strings.size(); ++i) { + copy.push_back(strings[i]); + } + return copy; +} + +std::vector<SharedStringRepo::Handle> resolve_strings(const std::vector<vespalib::string> &strings) { + std::vector<SharedStringRepo::Handle> handles; + handles.reserve(strings.size()); + for (size_t i = 0; i < strings.size(); ++i) { + handles.emplace_back(strings[i]); + } + return handles; +} + +std::vector<SharedStringRepo::Handle> copy_handles(const std::vector<SharedStringRepo::Handle> &handles) { + std::vector<SharedStringRepo::Handle> copy; + copy.reserve(handles.size()); + for (size_t i = 0; i < handles.size(); ++i) { + copy.push_back(handles[i]); + } + return copy; +} + +std::vector<vespalib::string> get_strings(const std::vector<SharedStringRepo::Handle> &handles) { + std::vector<vespalib::string> strings; + strings.reserve(handles.size()); + for (size_t i = 0; i < handles.size(); ++i) { + strings.push_back(handles[i].as_string()); + } + return strings; +} + +void reclaim_handles(std::vector<SharedStringRepo::Handle>) {} + +//----------------------------------------------------------------------------- + +struct Avg : Rendezvous<double, double> { + Avg(size_t n) : Rendezvous<double, double>(n) {} + void mingle() override { + double sum = 0; + for (size_t i = 0; i < size(); ++i) { + sum += in(i); + } + double result = sum / size(); + for (size_t i = 0; i < size(); ++i) { + out(i) = result; + } + } + double operator()(double value) { return rendezvous(value); } +}; + +struct Vote : Rendezvous<bool, bool> { + Vote(size_t n) : Rendezvous<bool, bool>(n) {} + void mingle() override { + size_t true_cnt = 0; + size_t false_cnt = 0; + for (size_t i = 0; i < size(); ++i) { + if (in(i)) { + ++true_cnt; + } else { + ++false_cnt; + } + } + bool result = (true_cnt > false_cnt); + for (size_t i = 0; i < size(); ++i) { + out(i) = result; + } + } + size_t num_threads() const { return size(); } + bool operator()(bool flag) { return rendezvous(flag); } +}; + +//----------------------------------------------------------------------------- + +template <typename T> +void verify_equal(const std::vector<T> &a, const std::vector<T> &b) { + ASSERT_EQUAL(a.size(), b.size()); + for (size_t i = 0; i < a.size(); ++i) { + EXPECT_TRUE(a[i] == b[i]); + } +} + +//----------------------------------------------------------------------------- + +struct Fixture { + Avg avg; + Vote vote; + std::vector<vespalib::string> work; + steady_time start_time; + std::map<vespalib::string,double> time_ms; + Fixture(size_t num_threads) + : avg(num_threads), vote(num_threads), work(make_strings(work_size)), start_time(steady_clock::now()) {} + ~Fixture() { + if (verbose) { + fprintf(stderr, "benchmark results for %zu threads:\n", vote.num_threads()); + for (const auto &[tag, ms_cost]: time_ms) { + fprintf(stderr, " %s: %g ms\n", tag.c_str(), ms_cost); + } + } + } + bool has_budget() { + return to_s(steady_clock::now() - start_time) < budget; + } + template <typename F> + void measure_task(const vespalib::string &tag, bool is_master, F &&task) { + auto before = steady_clock::now(); + task(); + double ms_cost = to_s(steady_clock::now() - before) * 1000.0; + double avg_ms = avg(ms_cost); + if (is_master) { + if (time_ms.count(tag) > 0) { + time_ms[tag] = std::min(time_ms[tag], avg_ms); + } else { + time_ms[tag] = avg_ms; + } + } + } + void benchmark(bool is_master) { + for (bool once_more = true; vote(once_more); once_more = has_budget()) { + std::vector<vespalib::string> copy_strings_result; + std::vector<SharedStringRepo::Handle> resolve_result; + std::vector<SharedStringRepo::Handle> copy_handles_result; + std::vector<SharedStringRepo::Handle> resolve_again_result; + std::vector<vespalib::string> get_result; + auto copy_strings_task = [&](){ copy_strings_result = copy_strings(work); }; + auto resolve_task = [&](){ resolve_result = resolve_strings(work); }; + auto copy_handles_task = [&](){ copy_handles_result = copy_handles(resolve_result); }; + auto resolve_again_task = [&](){ resolve_again_result = resolve_strings(work); }; + auto get_task = [&](){ get_result = get_strings(resolve_result); }; + auto reclaim_task = [&]() { resolve_again_result.clear(); }; + auto reclaim_last_task = [&]() { resolve_result.clear(); }; + measure_task("[0] copy strings", is_master, copy_strings_task); + measure_task("[1] resolve", is_master, resolve_task); + measure_task("[2] copy handles", is_master, copy_handles_task); + measure_task("[3] resolve again", is_master, resolve_again_task); + verify_equal(resolve_result, resolve_again_result); + measure_task("[4] as_string", is_master, get_task); + verify_equal(get_result, work); + measure_task("[5] reclaim", is_master, reclaim_task); + copy_handles_result.clear(); + measure_task("[6] reclaim last", is_master, reclaim_last_task); + } + } +}; + +//----------------------------------------------------------------------------- + +TEST_MT_F("test shared string repo operations with 1 threads", 1, Fixture(num_threads)) { + f1.benchmark(thread_id == 0); +} + +TEST_MT_F("test shared string repo operations with 2 threads", 2, Fixture(num_threads)) { + f1.benchmark(thread_id == 0); +} + +TEST_MT_F("test shared string repo operations with 4 threads", 4, Fixture(num_threads)) { + f1.benchmark(thread_id == 0); +} + +TEST_MT_F("test shared string repo operations with 8 threads", 8, Fixture(num_threads)) { + f1.benchmark(thread_id == 0); +} + +TEST_MT_F("test shared string repo operations with 16 threads", 16, Fixture(num_threads)) { + f1.benchmark(thread_id == 0); +} + +TEST_MT_F("test shared string repo operations with 32 threads", 32, Fixture(num_threads)) { + f1.benchmark(thread_id == 0); +} + +TEST_MT_F("test shared string repo operations with 64 threads", 64, Fixture(num_threads)) { + f1.benchmark(thread_id == 0); +} + +//----------------------------------------------------------------------------- + +int main(int argc, char **argv) { + TEST_MASTER.init(__FILE__); + if ((argc == 2) && (argv[1] == std::string("verbose"))) { + verbose = true; + budget = 10.0; + work_size = 128000; + } + TEST_RUN_ALL(); + return (TEST_MASTER.fini() ? 0 : 1); +} diff --git a/vespalib/src/vespa/vespalib/util/CMakeLists.txt b/vespalib/src/vespa/vespalib/util/CMakeLists.txt index 610310fd43c..5a82fb200e0 100644 --- a/vespalib/src/vespa/vespalib/util/CMakeLists.txt +++ b/vespalib/src/vespa/vespalib/util/CMakeLists.txt @@ -12,6 +12,7 @@ vespa_add_library(vespalib_vespalib_util OBJECT benchmark_timer.cpp blockingthreadstackexecutor.cpp box.cpp + child_process.cpp classname.cpp closuretask.cpp compress.cpp @@ -42,10 +43,10 @@ vespa_add_library(vespalib_vespalib_util OBJECT runnable_pair.cpp sequence.cpp sha1.cpp + shared_string_repo.cpp sig_catch.cpp signalhandler.cpp simple_thread_bundle.cpp - child_process.cpp stash.cpp string_hash.cpp stringfmt.cpp diff --git a/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp b/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp new file mode 100644 index 00000000000..fb1a18d46ee --- /dev/null +++ b/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp @@ -0,0 +1,11 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "shared_string_repo.h" + +namespace vespalib { + +SharedStringRepo::Partition::~Partition() = default; + +std::array<SharedStringRepo::Partition, SharedStringRepo::NUM_PARTS> SharedStringRepo::_partitions{}; + +} diff --git a/vespalib/src/vespa/vespalib/util/shared_string_repo.h b/vespalib/src/vespa/vespalib/util/shared_string_repo.h new file mode 100644 index 00000000000..5f89b0fe7db --- /dev/null +++ b/vespalib/src/vespa/vespalib/util/shared_string_repo.h @@ -0,0 +1,193 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "spin_lock.h" +#include <vespa/vespalib/stllike/string.h> +#include <vespa/vespalib/stllike/hash_set.hpp> +#include <xxhash.h> +#include <mutex> +#include <vector> + +namespace vespalib { + +/** + * This class implements application-wide in-memory string + * interning. Each string stored in the repo will be assigned a unique + * 32-bit id that can be used by the application to check for + * equality. The repo can never be shrunk in size, but ids can be + * re-used when the corresponding strings are evicted from the + * repo. Strong handles are used to track which strings are in + * use. Weak handles are used as a cheap alternative to strong handles + * when you also have a strong handle that ensures the string will not + * be evicted. For example, an efficient sparse tensor attribute could + * have a pool of strong handles to all strings used + * internally. Individual tensor indexes could then contain only weak + * handles, making string comparison as cheap as possible. + **/ +class SharedStringRepo { +private: + static constexpr int NUM_PARTS = 64; + static constexpr int PART_BITS = 6; + static constexpr int PART_MASK = 0x3f; + + struct AltKey { + vespalib::stringref str; + uint32_t hash; + }; + + class Partition { + public: + struct Entry { + uint32_t hash; + uint32_t ref_cnt; + vespalib::string str; + Entry(const AltKey &key) noexcept : hash(key.hash), ref_cnt(1), str(key.str) {} + void reset() { + str.reset(); + } + void reuse(const AltKey &key) { + hash = key.hash; + ref_cnt = 1; + str = key.str; + } + }; + struct Key { + uint32_t idx; + uint32_t hash; + }; + struct Hash { + uint32_t operator()(const Key &key) const { return key.hash; } + uint32_t operator()(const AltKey &key) const { return key.hash; } + }; + struct Equal { + const std::vector<Entry> &entries; + Equal(const std::vector<Entry> &entries_in) : entries(entries_in) {} + Equal(const Equal &rhs) = default; + bool operator()(const Key &a, const Key &b) const { return (a.idx == b.idx); } + bool operator()(const Key &a, const AltKey &b) const { return ((a.hash == b.hash) && (entries[a.idx].str == b.str)); } + }; + using HashType = vespalib::hash_set<Key,Hash,Equal>; + + private: + SpinLock _lock; + std::vector<Entry> _entries; + std::vector<uint32_t> _free; + HashType _hash; + + uint32_t make_entry(const AltKey &alt_key) { + if (_free.empty()) { + uint32_t idx = _entries.size(); + _entries.emplace_back(alt_key); + return idx; + } else { + uint32_t idx = _free.back(); + _free.pop_back(); + _entries[idx].reuse(alt_key); + return idx; + } + } + + public: + Partition() + : _lock(), _entries(), _free(), _hash(0, Hash(), Equal(_entries)) {} + ~Partition(); + + uint32_t resolve(const AltKey &alt_key) { + std::lock_guard guard(_lock); + auto pos = _hash.find(alt_key); + if (pos != _hash.end()) { + ++_entries[pos->idx].ref_cnt; + return pos->idx; + } else { + uint32_t idx = make_entry(alt_key); + _hash.insert(Key{idx, alt_key.hash}); + return idx; + } + } + + vespalib::string get(uint32_t idx) { + std::lock_guard guard(_lock); + return _entries[idx].str; + } + + void copy(uint32_t idx) { + std::lock_guard guard(_lock); + ++_entries[idx].ref_cnt; + } + + void reclaim(uint32_t idx) { + std::lock_guard guard(_lock); + Entry &entry = _entries[idx]; + if (--entry.ref_cnt == 0) { + _hash.erase(Key{idx, entry.hash}); + entry.reset(); + _free.push_back(idx); + } + } + }; + + static std::array<Partition,NUM_PARTS> _partitions; + + static uint32_t resolve(vespalib::stringref str) { + if (!str.empty()) { + uint64_t full_hash = XXH3_64bits(str.data(), str.size()); + uint32_t part = full_hash & PART_MASK; + uint32_t local_hash = full_hash >> PART_BITS; + uint32_t local_idx = _partitions[part].resolve(AltKey{str, local_hash}); + return (((local_idx << PART_BITS) | part) + 1); + } else { + return 0; + } + } + + static vespalib::string get(uint32_t id) { + uint32_t part = (id - 1) & PART_MASK; + uint32_t local_idx = (id - 1) >> PART_BITS; + return _partitions[part].get(local_idx); + } + + static uint32_t copy(uint32_t id) { + uint32_t part = (id - 1) & PART_MASK; + uint32_t local_idx = (id - 1) >> PART_BITS; + _partitions[part].copy(local_idx); + return id; + } + + static void reclaim(uint32_t id) { + if (id != 0) { + uint32_t part = (id - 1) & PART_MASK; + uint32_t local_idx = (id - 1) >> PART_BITS; + _partitions[part].reclaim(local_idx); + } + } + +public: + class Handle { + private: + uint32_t _id; + public: + Handle() : _id(0) {} + Handle(vespalib::stringref str) : _id(resolve(str)) {} + Handle(const Handle &rhs) : _id(copy(rhs._id)) {} + Handle &operator=(const Handle &rhs) { + reclaim(_id); + _id = copy(rhs._id); + return *this; + } + Handle(Handle &&rhs) noexcept : _id(rhs._id) { + rhs._id = 0; + } + Handle &operator=(Handle &&rhs) { + reclaim(_id); + _id = rhs._id; + rhs._id = 0; + return *this; + } + bool operator==(const Handle &rhs) const { return (_id == rhs._id); } + vespalib::string as_string() const { return SharedStringRepo::get(_id); } + ~Handle() { reclaim(_id); } + }; +}; + +} |