summaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
authorHåvard Pettersen <havardpe@oath.com>2020-11-18 09:45:19 +0000
committerHåvard Pettersen <havardpe@oath.com>2020-11-25 16:55:18 +0000
commita57880315976dc49be7f32f1736571845659ed0a (patch)
tree7285188be36987559870cfd9d8b2ac2d4271a939 /vespalib
parente1584673531bc771fa94731da337ce311b4ff7d1 (diff)
shared string repo -- WIP
Diffstat (limited to 'vespalib')
-rw-r--r--vespalib/CMakeLists.txt1
-rw-r--r--vespalib/src/tests/shared_string_repo/CMakeLists.txt8
-rw-r--r--vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp219
-rw-r--r--vespalib/src/vespa/vespalib/util/CMakeLists.txt3
-rw-r--r--vespalib/src/vespa/vespalib/util/shared_string_repo.cpp11
-rw-r--r--vespalib/src/vespa/vespalib/util/shared_string_repo.h193
6 files changed, 434 insertions, 1 deletions
diff --git a/vespalib/CMakeLists.txt b/vespalib/CMakeLists.txt
index dbcf2beadd8..a5e6b24abb2 100644
--- a/vespalib/CMakeLists.txt
+++ b/vespalib/CMakeLists.txt
@@ -94,6 +94,7 @@ vespa_define_module(
src/tests/rendezvous
src/tests/runnable_pair
src/tests/sha1
+ src/tests/shared_string_repo
src/tests/sharedptr
src/tests/signalhandler
src/tests/simple_thread_bundle
diff --git a/vespalib/src/tests/shared_string_repo/CMakeLists.txt b/vespalib/src/tests/shared_string_repo/CMakeLists.txt
new file mode 100644
index 00000000000..3911deb9e91
--- /dev/null
+++ b/vespalib/src/tests/shared_string_repo/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(vespalib_shared_string_repo_test_app TEST
+ SOURCES
+ shared_string_repo_test.cpp
+ DEPENDS
+ vespalib
+)
+vespa_add_test(NAME vespalib_shared_string_repo_test_app COMMAND vespalib_shared_string_repo_test_app)
diff --git a/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp b/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp
new file mode 100644
index 00000000000..57faa76061a
--- /dev/null
+++ b/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp
@@ -0,0 +1,219 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/util/shared_string_repo.h>
+#include <vespa/vespalib/util/rendezvous.h>
+#include <vespa/vespalib/util/time.h>
+#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/vespalib/testkit/test_kit.h>
+#include <vector>
+#include <map>
+
+using namespace vespalib;
+using make_string_short::fmt;
+
+bool verbose = false;
+double budget = 0.10;
+size_t work_size = 4096;
+
+//-----------------------------------------------------------------------------
+
+std::vector<vespalib::string> make_strings(size_t cnt) {
+ std::vector<vespalib::string> strings;
+ strings.reserve(cnt);
+ for (size_t i = 0; i < cnt; ++i) {
+ strings.push_back(fmt("str_%zu", i));
+ }
+ return strings;
+}
+
+std::vector<vespalib::string> copy_strings(const std::vector<vespalib::string> &strings) {
+ std::vector<vespalib::string> copy;
+ copy.reserve(strings.size());
+ for (size_t i = 0; i < strings.size(); ++i) {
+ copy.push_back(strings[i]);
+ }
+ return copy;
+}
+
+std::vector<SharedStringRepo::Handle> resolve_strings(const std::vector<vespalib::string> &strings) {
+ std::vector<SharedStringRepo::Handle> handles;
+ handles.reserve(strings.size());
+ for (size_t i = 0; i < strings.size(); ++i) {
+ handles.emplace_back(strings[i]);
+ }
+ return handles;
+}
+
+std::vector<SharedStringRepo::Handle> copy_handles(const std::vector<SharedStringRepo::Handle> &handles) {
+ std::vector<SharedStringRepo::Handle> copy;
+ copy.reserve(handles.size());
+ for (size_t i = 0; i < handles.size(); ++i) {
+ copy.push_back(handles[i]);
+ }
+ return copy;
+}
+
+std::vector<vespalib::string> get_strings(const std::vector<SharedStringRepo::Handle> &handles) {
+ std::vector<vespalib::string> strings;
+ strings.reserve(handles.size());
+ for (size_t i = 0; i < handles.size(); ++i) {
+ strings.push_back(handles[i].as_string());
+ }
+ return strings;
+}
+
+void reclaim_handles(std::vector<SharedStringRepo::Handle>) {}
+
+//-----------------------------------------------------------------------------
+
+struct Avg : Rendezvous<double, double> {
+ Avg(size_t n) : Rendezvous<double, double>(n) {}
+ void mingle() override {
+ double sum = 0;
+ for (size_t i = 0; i < size(); ++i) {
+ sum += in(i);
+ }
+ double result = sum / size();
+ for (size_t i = 0; i < size(); ++i) {
+ out(i) = result;
+ }
+ }
+ double operator()(double value) { return rendezvous(value); }
+};
+
+struct Vote : Rendezvous<bool, bool> {
+ Vote(size_t n) : Rendezvous<bool, bool>(n) {}
+ void mingle() override {
+ size_t true_cnt = 0;
+ size_t false_cnt = 0;
+ for (size_t i = 0; i < size(); ++i) {
+ if (in(i)) {
+ ++true_cnt;
+ } else {
+ ++false_cnt;
+ }
+ }
+ bool result = (true_cnt > false_cnt);
+ for (size_t i = 0; i < size(); ++i) {
+ out(i) = result;
+ }
+ }
+ size_t num_threads() const { return size(); }
+ bool operator()(bool flag) { return rendezvous(flag); }
+};
+
+//-----------------------------------------------------------------------------
+
+template <typename T>
+void verify_equal(const std::vector<T> &a, const std::vector<T> &b) {
+ ASSERT_EQUAL(a.size(), b.size());
+ for (size_t i = 0; i < a.size(); ++i) {
+ EXPECT_TRUE(a[i] == b[i]);
+ }
+}
+
+//-----------------------------------------------------------------------------
+
+struct Fixture {
+ Avg avg;
+ Vote vote;
+ std::vector<vespalib::string> work;
+ steady_time start_time;
+ std::map<vespalib::string,double> time_ms;
+ Fixture(size_t num_threads)
+ : avg(num_threads), vote(num_threads), work(make_strings(work_size)), start_time(steady_clock::now()) {}
+ ~Fixture() {
+ if (verbose) {
+ fprintf(stderr, "benchmark results for %zu threads:\n", vote.num_threads());
+ for (const auto &[tag, ms_cost]: time_ms) {
+ fprintf(stderr, " %s: %g ms\n", tag.c_str(), ms_cost);
+ }
+ }
+ }
+ bool has_budget() {
+ return to_s(steady_clock::now() - start_time) < budget;
+ }
+ template <typename F>
+ void measure_task(const vespalib::string &tag, bool is_master, F &&task) {
+ auto before = steady_clock::now();
+ task();
+ double ms_cost = to_s(steady_clock::now() - before) * 1000.0;
+ double avg_ms = avg(ms_cost);
+ if (is_master) {
+ if (time_ms.count(tag) > 0) {
+ time_ms[tag] = std::min(time_ms[tag], avg_ms);
+ } else {
+ time_ms[tag] = avg_ms;
+ }
+ }
+ }
+ void benchmark(bool is_master) {
+ for (bool once_more = true; vote(once_more); once_more = has_budget()) {
+ std::vector<vespalib::string> copy_strings_result;
+ std::vector<SharedStringRepo::Handle> resolve_result;
+ std::vector<SharedStringRepo::Handle> copy_handles_result;
+ std::vector<SharedStringRepo::Handle> resolve_again_result;
+ std::vector<vespalib::string> get_result;
+ auto copy_strings_task = [&](){ copy_strings_result = copy_strings(work); };
+ auto resolve_task = [&](){ resolve_result = resolve_strings(work); };
+ auto copy_handles_task = [&](){ copy_handles_result = copy_handles(resolve_result); };
+ auto resolve_again_task = [&](){ resolve_again_result = resolve_strings(work); };
+ auto get_task = [&](){ get_result = get_strings(resolve_result); };
+ auto reclaim_task = [&]() { resolve_again_result.clear(); };
+ auto reclaim_last_task = [&]() { resolve_result.clear(); };
+ measure_task("[0] copy strings", is_master, copy_strings_task);
+ measure_task("[1] resolve", is_master, resolve_task);
+ measure_task("[2] copy handles", is_master, copy_handles_task);
+ measure_task("[3] resolve again", is_master, resolve_again_task);
+ verify_equal(resolve_result, resolve_again_result);
+ measure_task("[4] as_string", is_master, get_task);
+ verify_equal(get_result, work);
+ measure_task("[5] reclaim", is_master, reclaim_task);
+ copy_handles_result.clear();
+ measure_task("[6] reclaim last", is_master, reclaim_last_task);
+ }
+ }
+};
+
+//-----------------------------------------------------------------------------
+
+TEST_MT_F("test shared string repo operations with 1 threads", 1, Fixture(num_threads)) {
+ f1.benchmark(thread_id == 0);
+}
+
+TEST_MT_F("test shared string repo operations with 2 threads", 2, Fixture(num_threads)) {
+ f1.benchmark(thread_id == 0);
+}
+
+TEST_MT_F("test shared string repo operations with 4 threads", 4, Fixture(num_threads)) {
+ f1.benchmark(thread_id == 0);
+}
+
+TEST_MT_F("test shared string repo operations with 8 threads", 8, Fixture(num_threads)) {
+ f1.benchmark(thread_id == 0);
+}
+
+TEST_MT_F("test shared string repo operations with 16 threads", 16, Fixture(num_threads)) {
+ f1.benchmark(thread_id == 0);
+}
+
+TEST_MT_F("test shared string repo operations with 32 threads", 32, Fixture(num_threads)) {
+ f1.benchmark(thread_id == 0);
+}
+
+TEST_MT_F("test shared string repo operations with 64 threads", 64, Fixture(num_threads)) {
+ f1.benchmark(thread_id == 0);
+}
+
+//-----------------------------------------------------------------------------
+
+int main(int argc, char **argv) {
+ TEST_MASTER.init(__FILE__);
+ if ((argc == 2) && (argv[1] == std::string("verbose"))) {
+ verbose = true;
+ budget = 10.0;
+ work_size = 128000;
+ }
+ TEST_RUN_ALL();
+ return (TEST_MASTER.fini() ? 0 : 1);
+}
diff --git a/vespalib/src/vespa/vespalib/util/CMakeLists.txt b/vespalib/src/vespa/vespalib/util/CMakeLists.txt
index 610310fd43c..5a82fb200e0 100644
--- a/vespalib/src/vespa/vespalib/util/CMakeLists.txt
+++ b/vespalib/src/vespa/vespalib/util/CMakeLists.txt
@@ -12,6 +12,7 @@ vespa_add_library(vespalib_vespalib_util OBJECT
benchmark_timer.cpp
blockingthreadstackexecutor.cpp
box.cpp
+ child_process.cpp
classname.cpp
closuretask.cpp
compress.cpp
@@ -42,10 +43,10 @@ vespa_add_library(vespalib_vespalib_util OBJECT
runnable_pair.cpp
sequence.cpp
sha1.cpp
+ shared_string_repo.cpp
sig_catch.cpp
signalhandler.cpp
simple_thread_bundle.cpp
- child_process.cpp
stash.cpp
string_hash.cpp
stringfmt.cpp
diff --git a/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp b/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp
new file mode 100644
index 00000000000..fb1a18d46ee
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp
@@ -0,0 +1,11 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "shared_string_repo.h"
+
+namespace vespalib {
+
+SharedStringRepo::Partition::~Partition() = default;
+
+std::array<SharedStringRepo::Partition, SharedStringRepo::NUM_PARTS> SharedStringRepo::_partitions{};
+
+}
diff --git a/vespalib/src/vespa/vespalib/util/shared_string_repo.h b/vespalib/src/vespa/vespalib/util/shared_string_repo.h
new file mode 100644
index 00000000000..5f89b0fe7db
--- /dev/null
+++ b/vespalib/src/vespa/vespalib/util/shared_string_repo.h
@@ -0,0 +1,193 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "spin_lock.h"
+#include <vespa/vespalib/stllike/string.h>
+#include <vespa/vespalib/stllike/hash_set.hpp>
+#include <xxhash.h>
+#include <mutex>
+#include <vector>
+
+namespace vespalib {
+
+/**
+ * This class implements application-wide in-memory string
+ * interning. Each string stored in the repo will be assigned a unique
+ * 32-bit id that can be used by the application to check for
+ * equality. The repo can never be shrunk in size, but ids can be
+ * re-used when the corresponding strings are evicted from the
+ * repo. Strong handles are used to track which strings are in
+ * use. Weak handles are used as a cheap alternative to strong handles
+ * when you also have a strong handle that ensures the string will not
+ * be evicted. For example, an efficient sparse tensor attribute could
+ * have a pool of strong handles to all strings used
+ * internally. Individual tensor indexes could then contain only weak
+ * handles, making string comparison as cheap as possible.
+ **/
+class SharedStringRepo {
+private:
+ static constexpr int NUM_PARTS = 64;
+ static constexpr int PART_BITS = 6;
+ static constexpr int PART_MASK = 0x3f;
+
+ struct AltKey {
+ vespalib::stringref str;
+ uint32_t hash;
+ };
+
+ class Partition {
+ public:
+ struct Entry {
+ uint32_t hash;
+ uint32_t ref_cnt;
+ vespalib::string str;
+ Entry(const AltKey &key) noexcept : hash(key.hash), ref_cnt(1), str(key.str) {}
+ void reset() {
+ str.reset();
+ }
+ void reuse(const AltKey &key) {
+ hash = key.hash;
+ ref_cnt = 1;
+ str = key.str;
+ }
+ };
+ struct Key {
+ uint32_t idx;
+ uint32_t hash;
+ };
+ struct Hash {
+ uint32_t operator()(const Key &key) const { return key.hash; }
+ uint32_t operator()(const AltKey &key) const { return key.hash; }
+ };
+ struct Equal {
+ const std::vector<Entry> &entries;
+ Equal(const std::vector<Entry> &entries_in) : entries(entries_in) {}
+ Equal(const Equal &rhs) = default;
+ bool operator()(const Key &a, const Key &b) const { return (a.idx == b.idx); }
+ bool operator()(const Key &a, const AltKey &b) const { return ((a.hash == b.hash) && (entries[a.idx].str == b.str)); }
+ };
+ using HashType = vespalib::hash_set<Key,Hash,Equal>;
+
+ private:
+ SpinLock _lock;
+ std::vector<Entry> _entries;
+ std::vector<uint32_t> _free;
+ HashType _hash;
+
+ uint32_t make_entry(const AltKey &alt_key) {
+ if (_free.empty()) {
+ uint32_t idx = _entries.size();
+ _entries.emplace_back(alt_key);
+ return idx;
+ } else {
+ uint32_t idx = _free.back();
+ _free.pop_back();
+ _entries[idx].reuse(alt_key);
+ return idx;
+ }
+ }
+
+ public:
+ Partition()
+ : _lock(), _entries(), _free(), _hash(0, Hash(), Equal(_entries)) {}
+ ~Partition();
+
+ uint32_t resolve(const AltKey &alt_key) {
+ std::lock_guard guard(_lock);
+ auto pos = _hash.find(alt_key);
+ if (pos != _hash.end()) {
+ ++_entries[pos->idx].ref_cnt;
+ return pos->idx;
+ } else {
+ uint32_t idx = make_entry(alt_key);
+ _hash.insert(Key{idx, alt_key.hash});
+ return idx;
+ }
+ }
+
+ vespalib::string get(uint32_t idx) {
+ std::lock_guard guard(_lock);
+ return _entries[idx].str;
+ }
+
+ void copy(uint32_t idx) {
+ std::lock_guard guard(_lock);
+ ++_entries[idx].ref_cnt;
+ }
+
+ void reclaim(uint32_t idx) {
+ std::lock_guard guard(_lock);
+ Entry &entry = _entries[idx];
+ if (--entry.ref_cnt == 0) {
+ _hash.erase(Key{idx, entry.hash});
+ entry.reset();
+ _free.push_back(idx);
+ }
+ }
+ };
+
+ static std::array<Partition,NUM_PARTS> _partitions;
+
+ static uint32_t resolve(vespalib::stringref str) {
+ if (!str.empty()) {
+ uint64_t full_hash = XXH3_64bits(str.data(), str.size());
+ uint32_t part = full_hash & PART_MASK;
+ uint32_t local_hash = full_hash >> PART_BITS;
+ uint32_t local_idx = _partitions[part].resolve(AltKey{str, local_hash});
+ return (((local_idx << PART_BITS) | part) + 1);
+ } else {
+ return 0;
+ }
+ }
+
+ static vespalib::string get(uint32_t id) {
+ uint32_t part = (id - 1) & PART_MASK;
+ uint32_t local_idx = (id - 1) >> PART_BITS;
+ return _partitions[part].get(local_idx);
+ }
+
+ static uint32_t copy(uint32_t id) {
+ uint32_t part = (id - 1) & PART_MASK;
+ uint32_t local_idx = (id - 1) >> PART_BITS;
+ _partitions[part].copy(local_idx);
+ return id;
+ }
+
+ static void reclaim(uint32_t id) {
+ if (id != 0) {
+ uint32_t part = (id - 1) & PART_MASK;
+ uint32_t local_idx = (id - 1) >> PART_BITS;
+ _partitions[part].reclaim(local_idx);
+ }
+ }
+
+public:
+ class Handle {
+ private:
+ uint32_t _id;
+ public:
+ Handle() : _id(0) {}
+ Handle(vespalib::stringref str) : _id(resolve(str)) {}
+ Handle(const Handle &rhs) : _id(copy(rhs._id)) {}
+ Handle &operator=(const Handle &rhs) {
+ reclaim(_id);
+ _id = copy(rhs._id);
+ return *this;
+ }
+ Handle(Handle &&rhs) noexcept : _id(rhs._id) {
+ rhs._id = 0;
+ }
+ Handle &operator=(Handle &&rhs) {
+ reclaim(_id);
+ _id = rhs._id;
+ rhs._id = 0;
+ return *this;
+ }
+ bool operator==(const Handle &rhs) const { return (_id == rhs._id); }
+ vespalib::string as_string() const { return SharedStringRepo::get(_id); }
+ ~Handle() { reclaim(_id); }
+ };
+};
+
+}