summaryrefslogtreecommitdiffstats
path: root/vespalib
diff options
context:
space:
mode:
authorHåvard Pettersen <havardpe@oath.com>2021-01-12 14:45:24 +0000
committerHåvard Pettersen <havardpe@oath.com>2021-01-13 15:10:48 +0000
commit45ecd1ac250ecf42875bcac1a436ad58aa95fad5 (patch)
tree4a6b7076a5da2ecd83955873ca5d0e2da4657e47 /vespalib
parentd9db41186c2732dbb3eec3d0259a6020adc75b48 (diff)
use direct string id for small numbers
add stat for min_free (free address space in most filled part) fail with assert when address space is exhausted, but not before
Diffstat (limited to 'vespalib')
-rw-r--r--vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp138
-rw-r--r--vespalib/src/vespa/vespalib/util/shared_string_repo.cpp15
-rw-r--r--vespalib/src/vespa/vespalib/util/shared_string_repo.h71
3 files changed, 192 insertions, 32 deletions
diff --git a/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp b/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp
index aaf4a5e3e56..ddbde3b0485 100644
--- a/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp
+++ b/vespalib/src/tests/shared_string_repo/shared_string_repo_test.cpp
@@ -30,6 +30,15 @@ std::vector<vespalib::string> make_strings(size_t cnt) {
return strings;
}
+std::vector<vespalib::string> make_direct_strings(size_t cnt) {
+ std::vector<vespalib::string> strings;
+ strings.reserve(cnt);
+ for (size_t i = 0; i < cnt; ++i) {
+ strings.push_back(fmt("%zu", (i % 100000)));
+ }
+ return strings;
+}
+
std::vector<vespalib::string> copy_strings(const std::vector<vespalib::string> &strings) {
return strings;
}
@@ -148,10 +157,11 @@ struct Fixture {
Avg avg;
Vote vote;
std::vector<vespalib::string> work;
+ std::vector<vespalib::string> direct_work;
steady_time start_time;
std::map<vespalib::string,double> time_ms;
Fixture(size_t num_threads)
- : avg(num_threads), vote(num_threads), work(make_strings(work_size)), start_time(steady_clock::now()) {}
+ : avg(num_threads), vote(num_threads), work(make_strings(work_size)), direct_work(make_direct_strings(work_size)), start_time(steady_clock::now()) {}
~Fixture() {
if (verbose) {
fprintf(stderr, "benchmark results for %zu threads:\n", vote.num_threads());
@@ -183,9 +193,11 @@ struct Fixture {
std::vector<std::pair<vespalib::string,uint64_t>> copy_and_hash_result;
std::vector<uint32_t> local_enum_result;
std::vector<Handle> resolve_result;
+ std::vector<Handle> resolve_direct_result;
std::vector<Handle> copy_handles_result;
std::vector<Handle> resolve_again_result;
std::vector<vespalib::string> get_result;
+ std::vector<vespalib::string> get_direct_result;
std::unique_ptr<SharedStringRepo::Handles> strong;
std::unique_ptr<SharedStringRepo::Handles> strong_copy;
std::unique_ptr<std::vector<string_id>> weak;
@@ -193,9 +205,11 @@ struct Fixture {
auto copy_and_hash_task = [&](){ copy_and_hash_result = copy_and_hash(work); };
auto local_enum_task = [&](){ local_enum_result = local_enum(work); };
auto resolve_task = [&](){ resolve_result = resolve_strings(work); };
+ auto resolve_direct_task = [&](){ resolve_direct_result = resolve_strings(direct_work); };
auto copy_handles_task = [&](){ copy_handles_result = resolve_result; };
auto resolve_again_task = [&](){ resolve_again_result = resolve_strings(work); };
auto get_task = [&](){ get_result = get_strings(resolve_result); };
+ auto get_direct_task = [&](){ get_direct_result = get_strings(resolve_direct_result); };
auto reclaim_task = [&]() { resolve_again_result.clear(); };
auto reclaim_last_task = [&]() { resolve_result.clear(); };
auto make_strong_task = [&]() { strong = make_strong_handles(work); };
@@ -208,20 +222,23 @@ struct Fixture {
measure_task("[02] copy and hash", is_master, copy_and_hash_task);
measure_task("[03] local enum", is_master, local_enum_task);
measure_task("[04] resolve", is_master, resolve_task);
- measure_task("[05] copy handles", is_master, copy_handles_task);
- measure_task("[06] resolve again", is_master, resolve_again_task);
+ measure_task("[05] resolve direct", is_master, resolve_direct_task);
+ measure_task("[06] copy handles", is_master, copy_handles_task);
+ measure_task("[07] resolve again", is_master, resolve_again_task);
verify_equal(resolve_result, resolve_again_result);
- measure_task("[07] as_string", is_master, get_task);
+ measure_task("[08] as_string", is_master, get_task);
+ measure_task("[09] as_string direct", is_master, get_direct_task);
verify_equal(get_result, work);
- measure_task("[08] reclaim", is_master, reclaim_task);
+ verify_equal(get_direct_result, direct_work);
+ measure_task("[10] reclaim", is_master, reclaim_task);
copy_handles_result.clear();
- measure_task("[09] reclaim last", is_master, reclaim_last_task);
- measure_task("[10] make strong handles", is_master, make_strong_task);
- measure_task("[11] copy strong handles", is_master, copy_strong_task);
- measure_task("[12] make weak handles", is_master, make_weak_task);
- measure_task("[13] free weak handles", is_master, free_weak_task);
- measure_task("[14] free strong handles copy", is_master, free_strong_copy_task);
- measure_task("[15] free strong handles", is_master, free_strong_task);
+ measure_task("[11] reclaim last", is_master, reclaim_last_task);
+ measure_task("[12] make strong handles", is_master, make_strong_task);
+ measure_task("[13] copy strong handles", is_master, copy_strong_task);
+ measure_task("[14] make weak handles", is_master, make_weak_task);
+ measure_task("[15] free weak handles", is_master, free_weak_task);
+ measure_task("[16] free strong handles copy", is_master, free_strong_copy_task);
+ measure_task("[17] free strong handles", is_master, free_strong_task);
}
}
};
@@ -250,11 +267,52 @@ void verify_not_eq(const Handle &a, const Handle &b) {
//-----------------------------------------------------------------------------
+TEST("require that empty stats object have expected values") {
+ size_t part_size = (uint32_t(-1) - 100001) / 64;
+ SharedStringRepo::Stats empty;
+ EXPECT_EQUAL(empty.active_entries, 0u);
+ EXPECT_EQUAL(empty.total_entries, 0u);
+ EXPECT_EQUAL(empty.min_free, part_size);
+ if (verbose) {
+ fprintf(stderr, "max entries per part: %zu\n", empty.min_free);
+ }
+}
+
+TEST("require that stats can be merged") {
+ SharedStringRepo::Stats a;
+ SharedStringRepo::Stats b;
+ a.active_entries = 1;
+ a.total_entries = 10;
+ a.min_free = 100;
+ b.active_entries = 3;
+ b.total_entries = 20;
+ b.min_free = 50;
+ a.merge(b);
+ EXPECT_EQUAL(a.active_entries, 4u);
+ EXPECT_EQUAL(a.total_entries, 30u);
+ EXPECT_EQUAL(a.min_free, 50u);
+}
+
+TEST("require that id_space_usage is sane") {
+ SharedStringRepo::Stats empty;
+ SharedStringRepo::Stats stats;
+ stats.min_free = empty.min_free;
+ EXPECT_EQUAL(stats.id_space_usage(), 0.0);
+ stats.min_free = empty.min_free / 2;
+ EXPECT_EQUAL(stats.id_space_usage(), 0.5);
+ stats.min_free = empty.min_free / 4;
+ EXPECT_EQUAL(stats.id_space_usage(), 0.75);
+ stats.min_free = 0;
+ EXPECT_EQUAL(stats.id_space_usage(), 1.0);
+}
+
+//-----------------------------------------------------------------------------
+
TEST("require that basic handle usage works") {
Handle empty;
Handle foo("foo");
Handle bar("bar");
- Handle empty2;
+ Handle empty2("");
Handle foo2("foo");
Handle bar2("bar");
@@ -315,6 +373,39 @@ TEST("require that handle can be self-assigned") {
//-----------------------------------------------------------------------------
+void verify_direct(const vespalib::string &str) {
+ size_t before = SharedStringRepo::stats().active_entries;
+ Handle handle(str);
+ EXPECT_EQUAL(SharedStringRepo::stats().active_entries, before);
+ EXPECT_EQUAL(handle.as_string(), str);
+}
+
+void verify_not_direct(const vespalib::string &str) {
+ size_t before = SharedStringRepo::stats().active_entries;
+ Handle handle(str);
+ EXPECT_EQUAL(SharedStringRepo::stats().active_entries, before + 1);
+ EXPECT_EQUAL(handle.as_string(), str);
+}
+
+TEST("require that direct handles work as expected") {
+ TEST_DO(verify_direct(""));
+ for (size_t i = 0; i < 100000; ++i) {
+ verify_direct(fmt("%zu", i));
+ }
+ TEST_DO(verify_not_direct(" "));
+ TEST_DO(verify_not_direct(" 5"));
+ TEST_DO(verify_not_direct("5 "));
+ TEST_DO(verify_not_direct("100000"));
+ TEST_DO(verify_not_direct("00"));
+ TEST_DO(verify_not_direct("01"));
+ TEST_DO(verify_not_direct("001"));
+ TEST_DO(verify_not_direct("-0"));
+ TEST_DO(verify_not_direct("-1"));
+ TEST_DO(verify_not_direct("a1"));
+}
+
+//-----------------------------------------------------------------------------
+
TEST("require that basic multi-handle usage works") {
Handles a;
a.reserve(4);
@@ -336,6 +427,26 @@ TEST("require that basic multi-handle usage works") {
//-----------------------------------------------------------------------------
+#if 0
+// needs a lot of memory or tweaking of PART_LIMIT
+TEST("allocate handles until we run out") {
+ size_t cnt = 0;
+ std::vector<Handle> handles;
+ for (;;) {
+ auto stats = SharedStringRepo::stats();
+ fprintf(stderr, "cnt: %zu, used: %zu/%zu, min free: %zu, usage: %g\n",
+ cnt, stats.active_entries, stats.total_entries, stats.min_free,
+ stats.id_space_usage());
+ size_t n = std::max(size_t(1), stats.min_free);
+ for (size_t i = 0; i < n; ++i) {
+ handles.emplace_back(fmt("my_id_%zu", cnt++));
+ }
+ }
+}
+#endif
+
+//-----------------------------------------------------------------------------
+
TEST_MT_F("test shared string repo operations with 1 threads", 1, Fixture(num_threads)) {
f1.benchmark(thread_id == 0);
}
@@ -368,7 +479,6 @@ TEST_MT_F("test shared string repo operations with 64 threads", 64, Fixture(num_
#if 0
// verify leak-detection and reporting
-
TEST("leak some handles on purpose") {
new Handle("leaked string");
new Handle("also leaked");
diff --git a/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp b/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp
index 187f586d344..c39b79f7899 100644
--- a/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp
+++ b/vespalib/src/vespa/vespalib/util/shared_string_repo.cpp
@@ -9,7 +9,8 @@ namespace vespalib {
SharedStringRepo::Stats::Stats()
: active_entries(0),
- total_entries(0)
+ total_entries(0),
+ min_free(PART_LIMIT)
{
}
@@ -18,6 +19,13 @@ SharedStringRepo::Stats::merge(const Stats &s)
{
active_entries += s.active_entries;
total_entries += s.total_entries;
+ min_free = std::min(min_free, s.min_free);
+}
+
+double
+SharedStringRepo::Stats::id_space_usage() const
+{
+ return (1.0 - (double(min_free) / double(PART_LIMIT)));
}
SharedStringRepo::Partition::~Partition() = default;
@@ -41,6 +49,7 @@ SharedStringRepo::Partition::stats() const
std::lock_guard guard(_lock);
stats.active_entries = _hash.size();
stats.total_entries = _entries.size();
+ stats.min_free = (PART_LIMIT - _hash.size());
return stats;
}
@@ -49,7 +58,9 @@ SharedStringRepo::Partition::make_entries(size_t hint)
{
hint = std::max(hint, _entries.size() + 1);
size_t want_mem = roundUp2inN(hint * sizeof(Entry));
- size_t want_entries = want_mem / sizeof(Entry);
+ size_t want_entries = want_mem / sizeof(Entry);
+ want_entries = std::min(want_entries, PART_LIMIT);
+ assert(want_entries > _entries.size());
_entries.reserve(want_entries);
while (_entries.size() < _entries.capacity()) {
_entries.emplace_back(_free);
diff --git a/vespalib/src/vespa/vespalib/util/shared_string_repo.h b/vespalib/src/vespa/vespalib/util/shared_string_repo.h
index 7e9f5d41f96..518d3874b7d 100644
--- a/vespalib/src/vespa/vespalib/util/shared_string_repo.h
+++ b/vespalib/src/vespa/vespalib/util/shared_string_repo.h
@@ -13,6 +13,9 @@
#include <vector>
#include <array>
#include <cassert>
+#include <ctype.h>
+#include <limits>
+#include <charconv>
namespace vespalib {
@@ -29,14 +32,20 @@ public:
struct Stats {
size_t active_entries;
size_t total_entries;
+ size_t min_free;
Stats();
void merge(const Stats &s);
+ double id_space_usage() const;
};
private:
- static constexpr int NUM_PARTS = 64;
- static constexpr int PART_BITS = 6;
- static constexpr int PART_MASK = 0x3f;
+ static constexpr uint32_t PART_BITS = 6;
+ static constexpr uint32_t NUM_PARTS = 1 << PART_BITS;
+ static constexpr uint32_t PART_MASK = NUM_PARTS - 1;
+ static constexpr uint32_t FAST_DIGITS = 5;
+ static constexpr uint32_t FAST_ID_MAX = 99999;
+ static constexpr uint32_t ID_BIAS = (FAST_ID_MAX + 2);
+ static constexpr size_t PART_LIMIT = (std::numeric_limits<uint32_t>::max() - ID_BIAS) / NUM_PARTS;
struct AltKey {
vespalib::stringref str;
@@ -166,41 +175,71 @@ private:
SharedStringRepo();
~SharedStringRepo();
+ static uint32_t try_make_direct_id(vespalib::stringref str) noexcept {
+ if ((str.size() > FAST_DIGITS) || ((str.size() > 1) && (str[0] == '0'))) {
+ return ID_BIAS;
+ } else if (str.size() == 0) {
+ return 0;
+ } else {
+ uint32_t value = 0;
+ for (size_t i = 0; i < str.size(); ++i) {
+ char c = str[i];
+ if (!isdigit(c)) {
+ return ID_BIAS;
+ } else {
+ value = ((value * 10) + (c - '0'));
+ }
+ }
+ return (value + 1);
+ }
+ }
+
+ static vespalib::string string_from_direct_id(uint32_t id) {
+ if (id == 0) {
+ return {};
+ } else {
+ char tmp[16];
+ auto res = std::to_chars(tmp, tmp + sizeof(tmp), (id - 1), 10);
+ return vespalib::string(tmp, res.ptr - tmp);
+ }
+ }
+
string_id resolve(vespalib::stringref str) {
- if (!str.empty()) {
+ uint32_t direct_id = try_make_direct_id(str);
+ if (direct_id >= ID_BIAS) {
uint64_t full_hash = XXH3_64bits(str.data(), str.size());
uint32_t part = full_hash & PART_MASK;
uint32_t local_hash = full_hash >> PART_BITS;
uint32_t local_idx = _partitions[part].resolve(AltKey{str, local_hash});
- return string_id(((local_idx << PART_BITS) | part) + 1);
+ return string_id(((local_idx << PART_BITS) | part) + ID_BIAS);
} else {
- return {};
+ return string_id(direct_id);
}
}
vespalib::string as_string(string_id id) {
- if (id._id != 0) {
- uint32_t part = (id._id - 1) & PART_MASK;
- uint32_t local_idx = (id._id - 1) >> PART_BITS;
+ if (id._id >= ID_BIAS) {
+ uint32_t part = (id._id - ID_BIAS) & PART_MASK;
+ uint32_t local_idx = (id._id - ID_BIAS) >> PART_BITS;
return _partitions[part].as_string(local_idx);
} else {
- return {};
+ return string_from_direct_id(id._id);
}
}
string_id copy(string_id id) {
- if (id._id != 0) {
- uint32_t part = (id._id - 1) & PART_MASK;
- uint32_t local_idx = (id._id - 1) >> PART_BITS;
+ if (id._id >= ID_BIAS) {
+ uint32_t part = (id._id - ID_BIAS) & PART_MASK;
+ uint32_t local_idx = (id._id - ID_BIAS) >> PART_BITS;
_partitions[part].copy(local_idx);
}
return id;
}
void reclaim(string_id id) {
- if (id._id != 0) {
- uint32_t part = (id._id - 1) & PART_MASK;
- uint32_t local_idx = (id._id - 1) >> PART_BITS;
+ if (id._id >= ID_BIAS) {
+ uint32_t part = (id._id - ID_BIAS) & PART_MASK;
+ uint32_t local_idx = (id._id - ID_BIAS) >> PART_BITS;
_partitions[part].reclaim(local_idx);
}
}