diff options
author | Håvard Pettersen <havardpe@oath.com> | 2020-12-10 15:31:35 +0000 |
---|---|---|
committer | Håvard Pettersen <havardpe@oath.com> | 2021-01-05 12:18:35 +0000 |
commit | 74665956e649cb28682ba25d4f2089a7fae5087e (patch) | |
tree | b89c61226566a11e13adb38430ba841d5a20ae0e /eval/src | |
parent | 6438e2f64460178525a42fcfdaf207e264a020ef (diff) |
use string ids as tensor labels
Diffstat (limited to 'eval/src')
32 files changed, 639 insertions, 307 deletions
diff --git a/eval/src/tests/eval/fast_value/fast_value_test.cpp b/eval/src/tests/eval/fast_value/fast_value_test.cpp index 03658d8351b..e809fb1bcda 100644 --- a/eval/src/tests/eval/fast_value/fast_value_test.cpp +++ b/eval/src/tests/eval/fast_value/fast_value_test.cpp @@ -8,6 +8,8 @@ using namespace vespalib; using namespace vespalib::eval; +using Handle = SharedStringRepo::Handle; + TEST(FastCellsTest, push_back_fast_works) { FastCells<float> cells(3); EXPECT_EQ(cells.capacity, 4); @@ -60,38 +62,37 @@ TEST(FastCellsTest, add_cells_works) { using SA = std::vector<vespalib::stringref>; -TEST(FastValueBuilderTest, dense_add_subspace_robustness) { +TEST(FastValueBuilderTest, scalar_add_subspace_robustness) { auto factory = FastValueBuilderFactory::get(); - ValueType type = ValueType::from_spec("tensor(x[2])"); + ValueType type = ValueType::from_spec("double"); auto builder = factory.create_value_builder<double>(type); - auto subspace = builder->add_subspace({}); + auto subspace = builder->add_subspace(); subspace[0] = 17.0; - subspace[1] = 666; - auto other = builder->add_subspace({}); - other[1] = 42.0; + auto other = builder->add_subspace(); + other[0] = 42.0; auto value = builder->build(std::move(builder)); + EXPECT_EQ(value->index().size(), 1); auto actual = spec_from_value(*value); - auto expected = TensorSpec("tensor(x[2])"). - add({{"x", 0}}, 17.0). - add({{"x", 1}}, 42.0); - EXPECT_EQ(actual, expected); + auto expected = TensorSpec("double"). 
+ add({}, 42.0); + EXPECT_EQ(actual, expected); } -TEST(FastValueBuilderTest, sparse_add_subspace_robustness) { +TEST(FastValueBuilderTest, dense_add_subspace_robustness) { auto factory = FastValueBuilderFactory::get(); - ValueType type = ValueType::from_spec("tensor(x{})"); + ValueType type = ValueType::from_spec("tensor(x[2])"); auto builder = factory.create_value_builder<double>(type); - auto subspace = builder->add_subspace(SA{"foo"}); + auto subspace = builder->add_subspace(); subspace[0] = 17.0; - subspace = builder->add_subspace(SA{"bar"}); - subspace[0] = 18.0; - auto other = builder->add_subspace(SA{"foo"}); - other[0] = 42.0; + subspace[1] = 666; + auto other = builder->add_subspace(); + other[1] = 42.0; auto value = builder->build(std::move(builder)); + EXPECT_EQ(value->index().size(), 1); auto actual = spec_from_value(*value); - auto expected = TensorSpec("tensor(x{})"). - add({{"x", "bar"}}, 18.0). - add({{"x", "foo"}}, 42.0); + auto expected = TensorSpec("tensor(x[2])"). + add({{"x", 0}}, 17.0). + add({{"x", 1}}, 42.0); EXPECT_EQ(actual, expected); } @@ -100,21 +101,43 @@ TEST(FastValueBuilderTest, mixed_add_subspace_robustness) { ValueType type = ValueType::from_spec("tensor(x{},y[2])"); auto builder = factory.create_value_builder<double>(type); auto subspace = builder->add_subspace(SA{"foo"}); - subspace[0] = 17.0; - subspace[1] = 666; + subspace[0] = 1.0; + subspace[1] = 5.0; subspace = builder->add_subspace(SA{"bar"}); - subspace[0] = 18.0; - subspace[1] = 19.0; + subspace[0] = 2.0; + subspace[1] = 10.0; auto other = builder->add_subspace(SA{"foo"}); - other[1] = 42.0; + other[0] = 3.0; + other[1] = 15.0; auto value = builder->build(std::move(builder)); - auto actual = spec_from_value(*value); - auto expected = TensorSpec("tensor(x{},y[2])"). - add({{"x", "foo"}, {"y", 0}}, 17.0). - add({{"x", "bar"}, {"y", 0}}, 18.0). - add({{"x", "bar"}, {"y", 1}}, 19.0). 
- add({{"x", "foo"}, {"y", 1}}, 42.0); - EXPECT_EQ(actual, expected); + EXPECT_EQ(value->index().size(), 3); + Handle foo("foo"); + Handle bar("bar"); + label_t label; + label_t *label_ptr = &label; + size_t subspace_idx; + auto get_subspace = [&]() { + auto cells = value->cells().typify<double>(); + return ConstArrayRef<double>(cells.begin() + subspace_idx * 2, 2); + }; + auto view = value->index().create_view({}); + view->lookup({}); + while (view->next_result({&label_ptr, 1}, subspace_idx)) { + if (label == bar.id()) { + auto values = get_subspace(); + EXPECT_EQ(values[0], 2.0); + EXPECT_EQ(values[1], 10.0); + } else { + EXPECT_EQ(label, foo.id()); + auto values = get_subspace(); + if (values[0] == 1) { + EXPECT_EQ(values[1], 5.0); + } else { + EXPECT_EQ(values[0], 3.0); + EXPECT_EQ(values[1], 15.0); + } + } + } } GTEST_MAIN_RUN_ALL_TESTS() diff --git a/eval/src/tests/eval/simple_value/simple_value_test.cpp b/eval/src/tests/eval/simple_value/simple_value_test.cpp index c05f9976e1a..1691d5c263c 100644 --- a/eval/src/tests/eval/simple_value/simple_value_test.cpp +++ b/eval/src/tests/eval/simple_value/simple_value_test.cpp @@ -16,8 +16,12 @@ using namespace vespalib::eval::test; using vespalib::make_string_short::fmt; -using PA = std::vector<vespalib::stringref *>; -using CPA = std::vector<const vespalib::stringref *>; +using PA = std::vector<label_t *>; +using CPA = std::vector<const label_t *>; + +using Handle = SharedStringRepo::Handle; + +vespalib::string as_str(label_t label) { return Handle::string_from_id(label); } std::vector<Layout> layouts = { {}, @@ -98,17 +102,18 @@ TEST(SimpleValueTest, simple_value_can_be_built_and_inspected) { std::unique_ptr<Value> value = builder->build(std::move(builder)); EXPECT_EQ(value->index().size(), 6); auto view = value->index().create_view({0}); - vespalib::stringref query = "b"; - vespalib::stringref label; + Handle query_handle("b"); + label_t query = query_handle.id(); + label_t label; size_t subspace; + 
std::map<vespalib::string,size_t> result; view->lookup(CPA{&query}); - EXPECT_TRUE(view->next_result(PA{&label}, subspace)); - EXPECT_EQ(label, "aa"); - EXPECT_EQ(subspace, 2); - EXPECT_TRUE(view->next_result(PA{&label}, subspace)); - EXPECT_EQ(label, "bb"); - EXPECT_EQ(subspace, 3); - EXPECT_FALSE(view->next_result(PA{&label}, subspace)); + while (view->next_result(PA{&label}, subspace)) { + result[as_str(label)] = subspace; + } + EXPECT_EQ(result.size(), 2); + EXPECT_EQ(result["aa"], 2); + EXPECT_EQ(result["bb"], 3); } TEST(SimpleValueTest, new_generic_join_works_for_simple_values) { diff --git a/eval/src/tests/streamed/value/streamed_value_test.cpp b/eval/src/tests/streamed/value/streamed_value_test.cpp index 05d6e20451c..5221c4eda64 100644 --- a/eval/src/tests/streamed/value/streamed_value_test.cpp +++ b/eval/src/tests/streamed/value/streamed_value_test.cpp @@ -16,8 +16,12 @@ using namespace vespalib::eval::test; using vespalib::make_string_short::fmt; -using PA = std::vector<vespalib::stringref *>; -using CPA = std::vector<const vespalib::stringref *>; +using PA = std::vector<label_t *>; +using CPA = std::vector<const label_t *>; + +using Handle = SharedStringRepo::Handle; + +vespalib::string as_str(label_t label) { return Handle::string_from_id(label); } std::vector<Layout> layouts = { {}, @@ -98,17 +102,18 @@ TEST(StreamedValueTest, streamed_value_can_be_built_and_inspected) { std::unique_ptr<Value> value = builder->build(std::move(builder)); EXPECT_EQ(value->index().size(), 6); auto view = value->index().create_view({0}); - vespalib::stringref query = "b"; - vespalib::stringref label; + Handle query_handle("b"); + label_t query = query_handle.id(); + label_t label; size_t subspace; + std::map<vespalib::string,size_t> result; view->lookup(CPA{&query}); - EXPECT_TRUE(view->next_result(PA{&label}, subspace)); - EXPECT_EQ(label, "aa"); - EXPECT_EQ(subspace, 2); - EXPECT_TRUE(view->next_result(PA{&label}, subspace)); - EXPECT_EQ(label, "bb"); - 
EXPECT_EQ(subspace, 3); - EXPECT_FALSE(view->next_result(PA{&label}, subspace)); + while (view->next_result(PA{&label}, subspace)) { + result[as_str(label)] = subspace; + } + EXPECT_EQ(result.size(), 2); + EXPECT_EQ(result["aa"], 2); + EXPECT_EQ(result["bb"], 3); } TEST(StreamedValueTest, new_generic_join_works_for_streamed_values) { diff --git a/eval/src/vespa/eval/eval/CMakeLists.txt b/eval/src/vespa/eval/eval/CMakeLists.txt index 01eeff49662..5f8dd478a7b 100644 --- a/eval/src/vespa/eval/eval/CMakeLists.txt +++ b/eval/src/vespa/eval/eval/CMakeLists.txt @@ -10,6 +10,7 @@ vespa_add_library(eval_eval OBJECT delete_node.cpp dense_cells_value.cpp double_value_builder.cpp + fast_addr_map.cpp fast_forest.cpp fast_sparse_map.cpp fast_value.cpp diff --git a/eval/src/vespa/eval/eval/fast_addr_map.cpp b/eval/src/vespa/eval/eval/fast_addr_map.cpp new file mode 100644 index 00000000000..73163f411e6 --- /dev/null +++ b/eval/src/vespa/eval/eval/fast_addr_map.cpp @@ -0,0 +1,9 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "fast_addr_map.h" + +namespace vespalib::eval { + +FastAddrMap::~FastAddrMap() = default; + +} diff --git a/eval/src/vespa/eval/eval/fast_addr_map.h b/eval/src/vespa/eval/eval/fast_addr_map.h new file mode 100644 index 00000000000..a8a82718a28 --- /dev/null +++ b/eval/src/vespa/eval/eval/fast_addr_map.h @@ -0,0 +1,152 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include "label.h" +#include "memory_usage_stuff.h" +#include <vespa/vespalib/util/arrayref.h> +#include <vespa/vespalib/stllike/identity.h> +#include <vespa/vespalib/stllike/hashtable.h> +#include <vespa/vespalib/util/shared_string_repo.h> +#include <vector> + +namespace vespalib::eval { + +/** + * A wrapper around vespalib::hashtable, using it to map a list of + * labels (a sparse address) to an integer value (dense subspace + * index). Labels are represented by string enum values stored and + * handled outside this class. + **/ +class FastAddrMap +{ +public: + // label hasing functions + static constexpr uint32_t hash_label(label_t label) { return label; } + static constexpr uint32_t hash_label(const label_t *label) { return *label; } + static constexpr uint32_t combine_label_hash(uint32_t full_hash, uint32_t next_hash) { + return ((full_hash * 31) + next_hash); + } + template <typename T> + static constexpr uint32_t hash_labels(ConstArrayRef<T> addr) { + uint32_t hash = 0; + for (const T &label: addr) { + hash = combine_label_hash(hash, hash_label(label)); + } + return hash; + } + + // typed uint32_t index used to identify sparse address/dense subspace + struct Tag { + uint32_t idx; + static constexpr uint32_t npos() { return uint32_t(-1); } + static constexpr Tag make_invalid() { return Tag{npos()}; } + constexpr bool valid() const { return (idx != npos()); } + }; + + // sparse hash set entry + struct Entry { + Tag tag; + uint32_t hash; + }; + + // alternative key(s) used for lookup in sparse hash set + template <typename T> struct AltKey { + ConstArrayRef<T> key; + uint32_t hash; + }; + + // view able to convert tags into sparse addresses + struct LabelView { + size_t addr_size; + const std::vector<label_t> &labels; + LabelView(size_t num_mapped_dims, SharedStringRepo::HandleView handle_view) + : addr_size(num_mapped_dims), labels(handle_view.handles()) {} + ConstArrayRef<label_t> get_addr(size_t idx) const { + return {&labels[idx * 
addr_size], addr_size}; + } + }; + + // hashing functor for sparse hash set + struct Hash { + template <typename T> + constexpr uint32_t operator()(const AltKey<T> &key) const { return key.hash; } + constexpr uint32_t operator()(const Entry &entry) const { return entry.hash; } + }; + + // equality functor for sparse hash set + struct Equal { + const LabelView &label_view; + Equal(const LabelView &label_view_in) : label_view(label_view_in) {} + static constexpr bool eq_labels(label_t a, label_t b) { return (a == b); } + static constexpr bool eq_labels(label_t a, const label_t *b) { return (a == *b); } + template <typename T> + bool operator()(const Entry &a, const AltKey<T> &b) const { + if ((a.hash != b.hash) || (b.key.size() != label_view.addr_size)) { + return false; + } + auto a_key = label_view.get_addr(a.tag.idx); + for (size_t i = 0; i < a_key.size(); ++i) { + if (!eq_labels(a_key[i], b.key[i])) { + return false; + } + } + return true; + } + }; + + using HashType = hashtable<Entry, Entry, Hash, Equal, Identity, hashtable_base::and_modulator>; + +private: + LabelView _labels; + HashType _map; + +public: + FastAddrMap(size_t num_mapped_dims, SharedStringRepo::HandleView handle_view, size_t expected_subspaces) + : _labels(num_mapped_dims, handle_view), + _map(expected_subspaces * 2, Hash(), Equal(_labels)) {} + ~FastAddrMap(); + FastAddrMap(const FastAddrMap &) = delete; + FastAddrMap &operator=(const FastAddrMap &) = delete; + FastAddrMap(FastAddrMap &&) = delete; + FastAddrMap &operator=(FastAddrMap &&) = delete; + static constexpr size_t npos() { return -1; } + ConstArrayRef<label_t> get_addr(size_t idx) const { return _labels.get_addr(idx); } + size_t size() const { return _map.size(); } + constexpr size_t addr_size() const { return _labels.addr_size; } + template <typename T> + size_t lookup(ConstArrayRef<T> addr, uint32_t hash) const { + AltKey<T> key{addr, hash}; + auto pos = _map.find(key); + return (pos == _map.end()) ? 
npos() : pos->tag.idx; + } + template <typename T> + size_t lookup(ConstArrayRef<T> addr) const { + return lookup(addr, hash_labels(addr)); + } + void add_mapping(uint32_t hash) { + uint32_t idx = _map.size(); + _map.force_insert(Entry{{idx}, hash}); + } + template <typename F> + void each_map_entry(F &&f) const { + _map.for_each([&](const auto &entry) + { + f(entry.tag.idx, entry.hash); + }); + } + MemoryUsage estimate_extra_memory_usage() const { + MemoryUsage extra_usage; + size_t map_self_size = sizeof(_map); + size_t map_used = _map.getMemoryUsed(); + size_t map_allocated = _map.getMemoryConsumption(); + // avoid double-counting the map itself + map_used = std::min(map_used, map_used - map_self_size); + map_allocated = std::min(map_allocated, map_allocated - map_self_size); + extra_usage.incUsedBytes(map_used); + extra_usage.incAllocatedBytes(map_allocated); + return extra_usage; + } +}; + +} diff --git a/eval/src/vespa/eval/eval/fast_value.cpp b/eval/src/vespa/eval/eval/fast_value.cpp index 116e561a868..96d0fa84149 100644 --- a/eval/src/vespa/eval/eval/fast_value.cpp +++ b/eval/src/vespa/eval/eval/fast_value.cpp @@ -11,7 +11,7 @@ namespace vespalib::eval { namespace { struct CreateFastValueBuilderBase { - template <typename T> static std::unique_ptr<ValueBuilderBase> invoke(const ValueType &type, + template <typename T, typename R2> static std::unique_ptr<ValueBuilderBase> invoke(const ValueType &type, size_t num_mapped_dims, size_t subspace_size, size_t expected_subspaces) { assert(check_cell_type<T>(type.cell_type())); @@ -20,7 +20,7 @@ struct CreateFastValueBuilderBase { } else if (num_mapped_dims == 0) { return std::make_unique<FastDenseValue<T>>(type, subspace_size); } else { - return std::make_unique<FastValue<T>>(type, num_mapped_dims, subspace_size, expected_subspaces); + return std::make_unique<FastValue<T,R2::value>>(type, num_mapped_dims, subspace_size, expected_subspaces); } } }; @@ -32,11 +32,11 @@ struct CreateFastValueBuilderBase { 
std::unique_ptr<Value::Index::View> FastValueIndex::create_view(const std::vector<size_t> &dims) const { - if (map.num_dims() == 0) { + if (map.addr_size() == 0) { return TrivialIndex::get().create_view(dims); } else if (dims.empty()) { return std::make_unique<FastIterateView>(map); - } else if (dims.size() == map.num_dims()) { + } else if (dims.size() == map.addr_size()) { return std::make_unique<FastLookupView>(map); } else { return std::make_unique<FastFilterView>(map, dims); @@ -49,10 +49,11 @@ FastValueBuilderFactory::FastValueBuilderFactory() = default; FastValueBuilderFactory FastValueBuilderFactory::_factory; std::unique_ptr<ValueBuilderBase> -FastValueBuilderFactory::create_value_builder_base(const ValueType &type, size_t num_mapped_dims, size_t subspace_size, - size_t expected_subspaces) const +FastValueBuilderFactory::create_value_builder_base(const ValueType &type, bool transient, size_t num_mapped_dims, size_t subspace_size, + size_t expected_subspaces) const { - return typify_invoke<1,TypifyCellType,CreateFastValueBuilderBase>(type.cell_type(), type, num_mapped_dims, subspace_size, expected_subspaces); + using MyTypify = TypifyValue<TypifyCellType,TypifyBool>; + return typify_invoke<2,MyTypify,CreateFastValueBuilderBase>(type.cell_type(), transient, type, num_mapped_dims, subspace_size, expected_subspaces); } //----------------------------------------------------------------------------- diff --git a/eval/src/vespa/eval/eval/fast_value.h b/eval/src/vespa/eval/eval/fast_value.h index ac924ecc6eb..c6280b492db 100644 --- a/eval/src/vespa/eval/eval/fast_value.h +++ b/eval/src/vespa/eval/eval/fast_value.h @@ -19,7 +19,7 @@ class FastValueBuilderFactory : public ValueBuilderFactory { private: FastValueBuilderFactory(); static FastValueBuilderFactory _factory; - std::unique_ptr<ValueBuilderBase> create_value_builder_base(const ValueType &type, + std::unique_ptr<ValueBuilderBase> create_value_builder_base(const ValueType &type, bool transient, size_t 
num_mapped_dims, size_t subspace_size, size_t expected_subspaces) const override; public: static const FastValueBuilderFactory &get() { return _factory; } diff --git a/eval/src/vespa/eval/eval/fast_value.hpp b/eval/src/vespa/eval/eval/fast_value.hpp index 9914378cc9e..972aa68b8bd 100644 --- a/eval/src/vespa/eval/eval/fast_value.hpp +++ b/eval/src/vespa/eval/eval/fast_value.hpp @@ -1,11 +1,10 @@ // Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "value.h" -#include "fast_sparse_map.h" +#include "fast_addr_map.h" #include "inline_operation.h" #include <vespa/eval/instruction/generic_join.h> -#include <vespa/vespalib/stllike/hash_map.hpp> -#include <vespa/vespalib/util/alloc.h> +#include <vespa/vespalib/stllike/hashtable.hpp> namespace vespalib::eval { @@ -18,22 +17,22 @@ namespace { // look up a full address in the map directly struct FastLookupView : public Value::Index::View { - const FastSparseMap &map; - size_t subspace; + const FastAddrMap &map; + size_t subspace; - FastLookupView(const FastSparseMap &map_in) - : map(map_in), subspace(FastSparseMap::npos()) {} + FastLookupView(const FastAddrMap &map_in) + : map(map_in), subspace(FastAddrMap::npos()) {} - void lookup(ConstArrayRef<const vespalib::stringref*> addr) override { + void lookup(ConstArrayRef<const label_t*> addr) override { subspace = map.lookup(addr); } - bool next_result(ConstArrayRef<vespalib::stringref*>, size_t &idx_out) override { - if (subspace == FastSparseMap::npos()) { + bool next_result(ConstArrayRef<label_t*>, size_t &idx_out) override { + if (subspace == FastAddrMap::npos()) { return false; } idx_out = subspace; - subspace = FastSparseMap::npos(); + subspace = FastAddrMap::npos(); return true; } }; @@ -43,30 +42,27 @@ struct FastLookupView : public Value::Index::View { // find matching mappings for a partial address with brute force filtering struct FastFilterView : public Value::Index::View { - using Label = 
FastSparseMap::HashedLabel; - - size_t num_mapped_dims; - const std::vector<Label> &labels; + const FastAddrMap &map; std::vector<size_t> match_dims; std::vector<size_t> extract_dims; - std::vector<Label> query; + std::vector<label_t> query; size_t pos; - bool is_match() const { + bool is_match(ConstArrayRef<label_t> addr) const { for (size_t i = 0; i < query.size(); ++i) { - if (query[i].hash != labels[pos + match_dims[i]].hash) { + if (query[i] != addr[match_dims[i]]) { return false; } } return true; } - FastFilterView(const FastSparseMap &map, const std::vector<size_t> &match_dims_in) - : num_mapped_dims(map.num_dims()), labels(map.labels()), match_dims(match_dims_in), - extract_dims(), query(match_dims.size(), Label()), pos(labels.size()) + FastFilterView(const FastAddrMap &map_in, const std::vector<size_t> &match_dims_in) + : map(map_in), match_dims(match_dims_in), + extract_dims(), query(match_dims.size()), pos(FastAddrMap::npos()) { auto my_pos = match_dims.begin(); - for (size_t i = 0; i < num_mapped_dims; ++i) { + for (size_t i = 0; i < map.addr_size(); ++i) { if ((my_pos == match_dims.end()) || (*my_pos != i)) { extract_dims.push_back(i); } else { @@ -74,29 +70,29 @@ struct FastFilterView : public Value::Index::View { } } assert(my_pos == match_dims.end()); - assert((match_dims.size() + extract_dims.size()) == num_mapped_dims); + assert((match_dims.size() + extract_dims.size()) == map.addr_size()); } - void lookup(ConstArrayRef<const vespalib::stringref*> addr) override { + void lookup(ConstArrayRef<const label_t*> addr) override { assert(addr.size() == query.size()); for (size_t i = 0; i < addr.size(); ++i) { - query[i] = Label(*addr[i]); + query[i] = *addr[i]; } pos = 0; } - bool next_result(ConstArrayRef<vespalib::stringref*> addr_out, size_t &idx_out) override { - while (pos < labels.size()) { - if (is_match()) { + bool next_result(ConstArrayRef<label_t*> addr_out, size_t &idx_out) override { + while (pos < map.size()) { + auto addr = map.get_addr(pos); + 
if (is_match(addr)) { assert(addr_out.size() == extract_dims.size()); for (size_t i = 0; i < extract_dims.size(); ++i) { - *addr_out[i] = labels[pos + extract_dims[i]].label; + *addr_out[i] = addr[extract_dims[i]]; } - idx_out = (pos / num_mapped_dims); // is this expensive? - pos += num_mapped_dims; + idx_out = pos++; return true; } - pos += num_mapped_dims; + ++pos; } return false; } @@ -107,29 +103,26 @@ struct FastFilterView : public Value::Index::View { // iterate all mappings struct FastIterateView : public Value::Index::View { - using Labels = std::vector<FastSparseMap::HashedLabel>; - - size_t num_mapped_dims; - const Labels &labels; - size_t pos; + const FastAddrMap &map; + size_t pos; - FastIterateView(const FastSparseMap &map) - : num_mapped_dims(map.num_dims()), labels(map.labels()), pos(labels.size()) {} + FastIterateView(const FastAddrMap &map_in) + : map(map_in), pos(FastAddrMap::npos()) {} - void lookup(ConstArrayRef<const vespalib::stringref*>) override { + void lookup(ConstArrayRef<const label_t*>) override { pos = 0; } - bool next_result(ConstArrayRef<vespalib::stringref*> addr_out, size_t &idx_out) override { - if (pos >= labels.size()) { + bool next_result(ConstArrayRef<label_t*> addr_out, size_t &idx_out) override { + if (pos >= map.size()) { return false; } - assert(addr_out.size() == num_mapped_dims); - for (size_t i = 0; i < num_mapped_dims; ++i) { - *addr_out[i] = labels[pos + i].label; + auto addr = map.get_addr(pos); + assert(addr.size() == addr_out.size()); + for (size_t i = 0; i < addr.size(); ++i) { + *addr_out[i] = addr[i]; } - idx_out = (pos / num_mapped_dims); // is this expensive? - pos += num_mapped_dims; + idx_out = pos++; return true; } }; @@ -145,9 +138,9 @@ using JoinAddrSource = instruction::SparseJoinPlan::Source; // operations by calling inline functions directly. 
struct FastValueIndex final : Value::Index { - FastSparseMap map; - FastValueIndex(size_t num_mapped_dims_in, size_t expected_subspaces_in) - : map(num_mapped_dims_in, expected_subspaces_in) {} + FastAddrMap map; + FastValueIndex(size_t num_mapped_dims_in, SharedStringRepo::HandleView handle_view, size_t expected_subspaces_in) + : map(num_mapped_dims_in, handle_view, expected_subspaces_in) {} template <typename LCT, typename RCT, typename OCT, typename Fun> static const Value &sparse_full_overlap_join(const ValueType &res_type, const Fun &fun, @@ -220,31 +213,64 @@ struct FastCells { //----------------------------------------------------------------------------- -template <typename T> +template <typename T, bool transient> struct FastValue final : Value, ValueBuilder<T> { + using Handles = std::conditional<transient, + SharedStringRepo::WeakHandles, + SharedStringRepo::StrongHandles>::type; + ValueType my_type; size_t my_subspace_size; + Handles my_handles; FastValueIndex my_index; FastCells<T> my_cells; FastValue(const ValueType &type_in, size_t num_mapped_dims_in, size_t subspace_size_in, size_t expected_subspaces_in) : my_type(type_in), my_subspace_size(subspace_size_in), - my_index(num_mapped_dims_in, expected_subspaces_in), + my_handles(expected_subspaces_in * num_mapped_dims_in), + my_index(num_mapped_dims_in, my_handles.view(), expected_subspaces_in), my_cells(subspace_size_in * expected_subspaces_in) {} ~FastValue() override; const ValueType &type() const override { return my_type; } const Value::Index &index() const override { return my_index; } TypedCells cells() const override { return TypedCells(my_cells.memory, get_cell_type<T>(), my_cells.size); } + void add_mapping(ConstArrayRef<vespalib::stringref> addr) { + if constexpr (transient) { + (void) addr; + abort(); // cannot use this for transient values + } else { + uint32_t hash = 0; + for (const auto &label: addr) { + hash = FastAddrMap::combine_label_hash(hash, 
FastAddrMap::hash_label(my_handles.add(label))); + } + my_index.map.add_mapping(hash); + } + } + void add_mapping(ConstArrayRef<label_t> addr) { + uint32_t hash = 0; + for (label_t label: addr) { + hash = FastAddrMap::combine_label_hash(hash, FastAddrMap::hash_label(label)); + my_handles.add(label); + } + my_index.map.add_mapping(hash); + } + void add_mapping(ConstArrayRef<label_t> addr, uint32_t hash) { + for (label_t label: addr) { + my_handles.add(label); + } + my_index.map.add_mapping(hash); + } ArrayRef<T> add_subspace(ConstArrayRef<vespalib::stringref> addr) override { - size_t idx = my_index.map.add_mapping(addr) * my_subspace_size; - if (__builtin_expect((idx == my_cells.size), true)) { - return my_cells.add_cells(my_subspace_size); - } - return ArrayRef<T>(my_cells.get(idx), my_subspace_size); + add_mapping(addr); + return my_cells.add_cells(my_subspace_size); + } + ArrayRef<T> add_subspace(ConstArrayRef<label_t> addr) override { + add_mapping(addr); + return my_cells.add_cells(my_subspace_size); } std::unique_ptr<Value> build(std::unique_ptr<ValueBuilder<T>> self) override { - if (my_index.map.num_dims() == 0) { + if (my_index.map.addr_size() == 0) { assert(my_index.map.size() == 1); } assert(my_cells.size == (my_index.map.size() * my_subspace_size)); @@ -254,13 +280,14 @@ struct FastValue final : Value, ValueBuilder<T> { return std::unique_ptr<Value>(this); } MemoryUsage get_memory_usage() const override { - MemoryUsage usage = self_memory_usage<FastValue<T>>(); + MemoryUsage usage = self_memory_usage<FastValue<T,transient>>(); + usage.merge(vector_extra_memory_usage(my_handles.view().handles())); usage.merge(my_index.map.estimate_extra_memory_usage()); usage.merge(my_cells.estimate_extra_memory_usage()); return usage; } }; -template <typename T> FastValue<T>::~FastValue() = default; +template <typename T,bool transient> FastValue<T,transient>::~FastValue() = default; //----------------------------------------------------------------------------- @@ 
-282,6 +309,9 @@ struct FastDenseValue final : Value, ValueBuilder<T> { ArrayRef<T> add_subspace(ConstArrayRef<vespalib::stringref>) override { return ArrayRef<T>(my_cells.get(0), my_cells.size); } + ArrayRef<T> add_subspace(ConstArrayRef<label_t>) override { + return ArrayRef<T>(my_cells.get(0), my_cells.size); + } std::unique_ptr<Value> build(std::unique_ptr<ValueBuilder<T>> self) override { ValueBuilder<T>* me = this; assert(me == self.get()); @@ -289,7 +319,7 @@ struct FastDenseValue final : Value, ValueBuilder<T> { return std::unique_ptr<Value>(this); } MemoryUsage get_memory_usage() const override { - MemoryUsage usage = self_memory_usage<FastValue<T>>(); + MemoryUsage usage = self_memory_usage<FastDenseValue<T>>(); usage.merge(my_cells.estimate_extra_memory_usage()); return usage; } @@ -302,6 +332,7 @@ template <typename T> struct FastScalarBuilder final : ValueBuilder<T> { T _value; ArrayRef<T> add_subspace(ConstArrayRef<vespalib::stringref>) final override { return ArrayRef<T>(&_value, 1); } + ArrayRef<T> add_subspace(ConstArrayRef<label_t>) final override { return ArrayRef<T>(&_value, 1); }; std::unique_ptr<Value> build(std::unique_ptr<ValueBuilder<T>>) final override { return std::make_unique<ScalarValue<T>>(_value); } }; @@ -313,19 +344,16 @@ FastValueIndex::sparse_full_overlap_join(const ValueType &res_type, const Fun &f const FastValueIndex &lhs, const FastValueIndex &rhs, ConstArrayRef<LCT> lhs_cells, ConstArrayRef<RCT> rhs_cells, Stash &stash) { - auto &result = stash.create<FastValue<OCT>>(res_type, lhs.map.num_dims(), 1, lhs.map.size()); - auto &result_map = result.my_index.map; - lhs.map.each_map_entry([&](auto lhs_subspace, auto hash) - { - auto rhs_subspace = rhs.map.lookup(hash); - if (rhs_subspace != FastSparseMap::npos()) { - auto idx = result_map.add_mapping(lhs.map.make_addr(lhs_subspace), hash); - if (__builtin_expect((idx == result.my_cells.size), true)) { - auto cell_value = fun(lhs_cells[lhs_subspace], rhs_cells[rhs_subspace]); - 
result.my_cells.push_back_fast(cell_value); - } - } - }); + auto &result = stash.create<FastValue<OCT,true>>(res_type, lhs.map.addr_size(), 1, lhs.map.size()); + lhs.map.each_map_entry([&](auto lhs_subspace, auto hash) { + auto lhs_addr = lhs.map.get_addr(lhs_subspace); + auto rhs_subspace = rhs.map.lookup(lhs_addr, hash); + if (rhs_subspace != FastAddrMap::npos()) { + result.add_mapping(lhs_addr, hash); + auto cell_value = fun(lhs_cells[lhs_subspace], rhs_cells[rhs_subspace]); + result.my_cells.push_back_fast(cell_value); + } + }); return result; } @@ -338,10 +366,9 @@ FastValueIndex::sparse_no_overlap_join(const ValueType &res_type, const Fun &fun const std::vector<JoinAddrSource> &addr_sources, ConstArrayRef<LCT> lhs_cells, ConstArrayRef<RCT> rhs_cells, Stash &stash) { - using HashedLabelRef = std::reference_wrapper<const FastSparseMap::HashedLabel>; size_t num_mapped_dims = addr_sources.size(); - auto &result = stash.create<FastValue<OCT>>(res_type, num_mapped_dims, 1, lhs.map.size()*rhs.map.size()); - std::vector<HashedLabelRef> output_addr(num_mapped_dims, FastSparseMap::empty_label); + auto &result = stash.create<FastValue<OCT,true>>(res_type, num_mapped_dims, 1, lhs.map.size()*rhs.map.size()); + std::vector<label_t> output_addr(num_mapped_dims); std::vector<size_t> store_lhs_idx; std::vector<size_t> store_rhs_idx; size_t out_idx = 0; @@ -359,24 +386,22 @@ FastValueIndex::sparse_no_overlap_join(const ValueType &res_type, const Fun &fun } assert(out_idx == output_addr.size()); for (size_t lhs_subspace = 0; lhs_subspace < lhs.map.size(); ++lhs_subspace) { - auto l_addr = lhs.map.make_addr(lhs_subspace); + auto l_addr = lhs.map.get_addr(lhs_subspace); assert(l_addr.size() == store_lhs_idx.size()); for (size_t i = 0; i < store_lhs_idx.size(); ++i) { size_t addr_idx = store_lhs_idx[i]; output_addr[addr_idx] = l_addr[i]; } for (size_t rhs_subspace = 0; rhs_subspace < rhs.map.size(); ++rhs_subspace) { - auto r_addr = rhs.map.make_addr(rhs_subspace); + auto r_addr = 
rhs.map.get_addr(rhs_subspace); assert(r_addr.size() == store_rhs_idx.size()); for (size_t i = 0; i < store_rhs_idx.size(); ++i) { size_t addr_idx = store_rhs_idx[i]; output_addr[addr_idx] = r_addr[i]; } - auto idx = result.my_index.map.add_mapping(ConstArrayRef(output_addr)); - if (__builtin_expect((idx == result.my_cells.size), true)) { - auto cell_value = fun(lhs_cells[lhs_subspace], rhs_cells[rhs_subspace]); - result.my_cells.push_back_fast(cell_value); - } + result.add_mapping(ConstArrayRef(output_addr)); + auto cell_value = fun(lhs_cells[lhs_subspace], rhs_cells[rhs_subspace]); + result.my_cells.push_back_fast(cell_value); } } return result; @@ -391,22 +416,22 @@ FastValueIndex::sparse_only_merge(const ValueType &res_type, const Fun &fun, ConstArrayRef<LCT> lhs_cells, ConstArrayRef<RCT> rhs_cells, Stash &stash) { size_t guess_size = lhs.map.size() + rhs.map.size(); - auto &result = stash.create<FastValue<OCT>>(res_type, lhs.map.num_dims(), 1, guess_size); - result.my_index = lhs; - for (auto val : lhs_cells) { - result.my_cells.push_back_fast(val); - } + auto &result = stash.create<FastValue<OCT,true>>(res_type, lhs.map.addr_size(), 1, guess_size); + lhs.map.each_map_entry([&](auto lhs_subspace, auto hash) + { + result.add_mapping(lhs.map.get_addr(lhs_subspace), hash); + result.my_cells.push_back_fast(lhs_cells[lhs_subspace]); + }); rhs.map.each_map_entry([&](auto rhs_subspace, auto hash) { - auto lhs_subspace = lhs.map.lookup(hash); - if (lhs_subspace == FastSparseMap::npos()) { - auto idx = result.my_index.map.add_mapping(rhs.map.make_addr(rhs_subspace), hash); - if (__builtin_expect((idx == result.my_cells.size), true)) { - result.my_cells.push_back_fast(rhs_cells[rhs_subspace]); - } + auto rhs_addr = rhs.map.get_addr(rhs_subspace); + auto result_subspace = result.my_index.map.lookup(rhs_addr, hash); + if (result_subspace == FastAddrMap::npos()) { + result.add_mapping(rhs_addr, hash); + result.my_cells.push_back_fast(rhs_cells[rhs_subspace]); } else { - 
auto cell_value = fun(lhs_cells[lhs_subspace], rhs_cells[rhs_subspace]); - *result.my_cells.get(lhs_subspace) = cell_value; + OCT &out_cell = *result.my_cells.get(result_subspace); + out_cell = fun(out_cell, rhs_cells[rhs_subspace]); } }); return result; diff --git a/eval/src/vespa/eval/eval/label.h b/eval/src/vespa/eval/eval/label.h new file mode 100644 index 00000000000..931f96a4f1a --- /dev/null +++ b/eval/src/vespa/eval/eval/label.h @@ -0,0 +1,15 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <cstdint> + +namespace vespalib::eval { + +// We use string ids from SharedStringRepo as labels. Note that +// label_t represents the lightweight reference type. Other structures +// (Handle/StrongHandles) are needed to keep the id valid. + +using label_t = uint32_t; + +} diff --git a/eval/src/vespa/eval/eval/simple_value.cpp b/eval/src/vespa/eval/eval/simple_value.cpp index 113f89f77fb..0cbbb29ecf1 100644 --- a/eval/src/vespa/eval/eval/simple_value.cpp +++ b/eval/src/vespa/eval/eval/simple_value.cpp @@ -30,7 +30,8 @@ struct CreateSimpleValueBuilderBase { // look up a full address in the map directly struct SimpleLookupView : public Value::Index::View { - using Labels = std::vector<vespalib::string>; + using Handle = SharedStringRepo::Handle; + using Labels = std::vector<Handle>; using Map = std::map<Labels, size_t>; const Map ↦ @@ -38,17 +39,17 @@ struct SimpleLookupView : public Value::Index::View { Map::const_iterator pos; SimpleLookupView(const Map &map_in, size_t num_dims) - : map(map_in), my_addr(num_dims, ""), pos(map.end()) {} + : map(map_in), my_addr(num_dims), pos(map.end()) {} - void lookup(ConstArrayRef<const vespalib::stringref*> addr) override { + void lookup(ConstArrayRef<const label_t*> addr) override { assert(addr.size() == my_addr.size()); for (size_t i = 0; i < my_addr.size(); ++i) { - my_addr[i] = *addr[i]; + my_addr[i] = Handle::handle_from_id(*addr[i]); } 
pos = map.find(my_addr); } - bool next_result(ConstArrayRef<vespalib::stringref*>, size_t &idx_out) override { + bool next_result(ConstArrayRef<label_t*>, size_t &idx_out) override { if (pos == map.end()) { return false; } @@ -63,13 +64,14 @@ struct SimpleLookupView : public Value::Index::View { // find matching mappings for a partial address with brute force filtering struct SimpleFilterView : public Value::Index::View { - using Labels = std::vector<vespalib::string>; + using Handle = SharedStringRepo::Handle; + using Labels = std::vector<Handle>; using Map = std::map<Labels, size_t>; const Map ↦ std::vector<size_t> match_dims; std::vector<size_t> extract_dims; - std::vector<vespalib::string> query; + std::vector<Handle> query; Map::const_iterator pos; bool is_match() const { @@ -82,7 +84,7 @@ struct SimpleFilterView : public Value::Index::View { } SimpleFilterView(const Map &map_in, const std::vector<size_t> &match_dims_in, size_t num_dims) - : map(map_in), match_dims(match_dims_in), extract_dims(), query(match_dims.size(), ""), pos(map.end()) + : map(map_in), match_dims(match_dims_in), extract_dims(), query(match_dims.size()), pos(map.end()) { auto my_pos = match_dims.begin(); for (size_t i = 0; i < num_dims; ++i) { @@ -96,20 +98,20 @@ struct SimpleFilterView : public Value::Index::View { assert((match_dims.size() + extract_dims.size()) == num_dims); } - void lookup(ConstArrayRef<const vespalib::stringref*> addr) override { + void lookup(ConstArrayRef<const label_t*> addr) override { assert(addr.size() == query.size()); for (size_t i = 0; i < addr.size(); ++i) { - query[i] = *addr[i]; + query[i] = Handle::handle_from_id(*addr[i]); } pos = map.begin(); } - bool next_result(ConstArrayRef<vespalib::stringref*> addr_out, size_t &idx_out) override { + bool next_result(ConstArrayRef<label_t*> addr_out, size_t &idx_out) override { while (pos != map.end()) { if (is_match()) { assert(addr_out.size() == extract_dims.size()); for (size_t i = 0; i < extract_dims.size(); 
++i) { - *addr_out[i] = pos->first[extract_dims[i]]; + *addr_out[i] = pos->first[extract_dims[i]].id(); } idx_out = pos->second; ++pos; @@ -126,7 +128,8 @@ struct SimpleFilterView : public Value::Index::View { // iterate all mappings struct SimpleIterateView : public Value::Index::View { - using Labels = std::vector<vespalib::string>; + using Handle = SharedStringRepo::Handle; + using Labels = std::vector<Handle>; using Map = std::map<Labels, size_t>; const Map ↦ @@ -135,17 +138,17 @@ struct SimpleIterateView : public Value::Index::View { SimpleIterateView(const Map &map_in) : map(map_in), pos(map.end()) {} - void lookup(ConstArrayRef<const vespalib::stringref*>) override { + void lookup(ConstArrayRef<const label_t*>) override { pos = map.begin(); } - bool next_result(ConstArrayRef<vespalib::stringref*> addr_out, size_t &idx_out) override { + bool next_result(ConstArrayRef<label_t*> addr_out, size_t &idx_out) override { if (pos == map.end()) { return false; } assert(addr_out.size() == pos->first.size()); for (size_t i = 0; i < addr_out.size(); ++i) { - *addr_out[i] = pos->first[i]; + *addr_out[i] = pos->first[i].id(); } idx_out = pos->second; ++pos; @@ -182,6 +185,17 @@ SimpleValue::add_mapping(ConstArrayRef<vespalib::stringref> addr) assert(was_inserted); } +void +SimpleValue::add_mapping(ConstArrayRef<label_t> addr) +{ + Labels my_addr; + for(label_t label: addr) { + my_addr.emplace_back(Handle::handle_from_id(label)); + } + auto [ignore, was_inserted] = _index.emplace(my_addr, _index.size()); + assert(was_inserted); +} + MemoryUsage SimpleValue::estimate_extra_memory_usage() const { @@ -246,15 +260,26 @@ SimpleValueT<T>::add_subspace(ConstArrayRef<vespalib::stringref> addr) return ArrayRef<T>(&_cells[old_size], subspace_size()); } +template <typename T> +ArrayRef<T> +SimpleValueT<T>::add_subspace(ConstArrayRef<label_t> addr) +{ + size_t old_size = _cells.size(); + add_mapping(addr); + _cells.resize(old_size + subspace_size(), 
std::numeric_limits<T>::quiet_NaN()); + return ArrayRef<T>(&_cells[old_size], subspace_size()); +} + //----------------------------------------------------------------------------- SimpleValueBuilderFactory::SimpleValueBuilderFactory() = default; SimpleValueBuilderFactory SimpleValueBuilderFactory::_factory; std::unique_ptr<ValueBuilderBase> -SimpleValueBuilderFactory::create_value_builder_base(const ValueType &type, size_t num_mapped_dims, size_t subspace_size, +SimpleValueBuilderFactory::create_value_builder_base(const ValueType &type, bool transient, size_t num_mapped_dims, size_t subspace_size, size_t expected_subspaces) const { + (void) transient; return typify_invoke<1,TypifyCellType,CreateSimpleValueBuilderBase>(type.cell_type(), type, num_mapped_dims, subspace_size, expected_subspaces); } diff --git a/eval/src/vespa/eval/eval/simple_value.h b/eval/src/vespa/eval/eval/simple_value.h index 590c0b4ef16..1fd645b704c 100644 --- a/eval/src/vespa/eval/eval/simple_value.h +++ b/eval/src/vespa/eval/eval/simple_value.h @@ -3,7 +3,7 @@ #pragma once #include "value.h" -#include <vespa/vespalib/stllike/string.h> +#include <vespa/vespalib/util/shared_string_repo.h> #include <vector> #include <map> @@ -26,7 +26,8 @@ class TensorSpec; class SimpleValue : public Value, public Value::Index { private: - using Labels = std::vector<vespalib::string>; + using Handle = SharedStringRepo::Handle; + using Labels = std::vector<Handle>; ValueType _type; size_t _num_mapped_dims; @@ -36,6 +37,7 @@ protected: size_t num_mapped_dims() const { return _num_mapped_dims; } size_t subspace_size() const { return _subspace_size; } void add_mapping(ConstArrayRef<vespalib::stringref> addr); + void add_mapping(ConstArrayRef<label_t> addr); MemoryUsage estimate_extra_memory_usage() const; public: SimpleValue(const ValueType &type, size_t num_mapped_dims_in, size_t subspace_size_in); @@ -62,6 +64,7 @@ public: ~SimpleValueT() override; TypedCells cells() const override { return 
TypedCells(ConstArrayRef<T>(_cells)); } ArrayRef<T> add_subspace(ConstArrayRef<vespalib::stringref> addr) override; + ArrayRef<T> add_subspace(ConstArrayRef<label_t> addr) override; std::unique_ptr<Value> build(std::unique_ptr<ValueBuilder<T>> self) override { if (num_mapped_dims() == 0) { assert(size() == 1); @@ -87,7 +90,7 @@ class SimpleValueBuilderFactory : public ValueBuilderFactory { private: SimpleValueBuilderFactory(); static SimpleValueBuilderFactory _factory; - std::unique_ptr<ValueBuilderBase> create_value_builder_base(const ValueType &type, + std::unique_ptr<ValueBuilderBase> create_value_builder_base(const ValueType &type, bool transient, size_t num_mapped_dims, size_t subspace_size, size_t expected_subspaces) const override; public: static const SimpleValueBuilderFactory &get() { return _factory; } diff --git a/eval/src/vespa/eval/eval/value.cpp b/eval/src/vespa/eval/eval/value.cpp index 7abc8d568cb..73c7c40636c 100644 --- a/eval/src/vespa/eval/eval/value.cpp +++ b/eval/src/vespa/eval/eval/value.cpp @@ -12,8 +12,8 @@ namespace { struct TrivialView : Value::Index::View { bool first = false; - void lookup(ConstArrayRef<const vespalib::stringref*> ) override { first = true; } - bool next_result(ConstArrayRef<vespalib::stringref*> , size_t &idx_out) override { + void lookup(ConstArrayRef<const label_t*> ) override { first = true; } + bool next_result(ConstArrayRef<label_t*> , size_t &idx_out) override { if (first) { idx_out = 0; first = false; diff --git a/eval/src/vespa/eval/eval/value.h b/eval/src/vespa/eval/eval/value.h index 186c3698dcd..2efb7d7c1e4 100644 --- a/eval/src/vespa/eval/eval/value.h +++ b/eval/src/vespa/eval/eval/value.h @@ -2,6 +2,7 @@ #pragma once +#include "label.h" #include "memory_usage_stuff.h" #include "value_type.h" #include "typed_cells.h" @@ -36,13 +37,13 @@ struct Value { // partial address for the dimensions given to // create_view. Results from the lookup is extracted using // the next_result function. 
- virtual void lookup(ConstArrayRef<const vespalib::stringref*> addr) = 0; + virtual void lookup(ConstArrayRef<const label_t*> addr) = 0; // Extract the next result (if any) from the previous // lookup into the given partial address and index. Only // the labels for the dimensions NOT specified in // create_view will be extracted here. - virtual bool next_result(ConstArrayRef<vespalib::stringref*> addr_out, size_t &idx_out) = 0; + virtual bool next_result(ConstArrayRef<label_t*> addr_out, size_t &idx_out) = 0; virtual ~View() {} }; @@ -163,6 +164,14 @@ struct ValueBuilder : ValueBuilderBase { // is not allowed. virtual ArrayRef<T> add_subspace(ConstArrayRef<vespalib::stringref> addr) = 0; + // add a dense subspace for the given address where labels are + // specified by shared string repo ids. Note that the caller is + // responsible for making sure the ids are valid 'long enough'. + virtual ArrayRef<T> add_subspace(ConstArrayRef<label_t> addr) = 0; + + // convenience function to add a subspace with an empty address + ArrayRef<T> add_subspace() { return add_subspace(ConstArrayRef<label_t>()); } + // Given the ownership of the builder itself, produce the newly // created value. This means that builders can only be used once, // it also means values can build themselves. @@ -179,26 +188,40 @@ struct ValueBuilder : ValueBuilderBase { * builder. With interoperability between all values. 
**/ struct ValueBuilderFactory { +private: template <typename T> - std::unique_ptr<ValueBuilder<T>> create_value_builder(const ValueType &type, + std::unique_ptr<ValueBuilder<T>> create_value_builder(const ValueType &type, bool transient, size_t num_mapped_dims_in, size_t subspace_size_in, size_t expected_subspaces) const { assert(check_cell_type<T>(type.cell_type())); - auto base = create_value_builder_base(type, num_mapped_dims_in, subspace_size_in, expected_subspaces); + auto base = create_value_builder_base(type, transient, num_mapped_dims_in, subspace_size_in, expected_subspaces); ValueBuilder<T> *builder = dynamic_cast<ValueBuilder<T>*>(base.get()); assert(builder); base.release(); return std::unique_ptr<ValueBuilder<T>>(builder); } +public: + template <typename T> + std::unique_ptr<ValueBuilder<T>> create_value_builder(const ValueType &type, + size_t num_mapped_dims_in, size_t subspace_size_in, size_t expected_subspaces) const + { + return create_value_builder<T>(type, false, num_mapped_dims_in, subspace_size_in, expected_subspaces); + } + template <typename T> + std::unique_ptr<ValueBuilder<T>> create_transient_value_builder(const ValueType &type, + size_t num_mapped_dims_in, size_t subspace_size_in, size_t expected_subspaces) const + { + return create_value_builder<T>(type, true, num_mapped_dims_in, subspace_size_in, expected_subspaces); + } template <typename T> std::unique_ptr<ValueBuilder<T>> create_value_builder(const ValueType &type) const { - return create_value_builder<T>(type, type.count_mapped_dimensions(), type.dense_subspace_size(), 1); + return create_value_builder<T>(type, false, type.count_mapped_dimensions(), type.dense_subspace_size(), 1); } std::unique_ptr<Value> copy(const Value &value) const; virtual ~ValueBuilderFactory() {} protected: - virtual std::unique_ptr<ValueBuilderBase> create_value_builder_base(const ValueType &type, + virtual std::unique_ptr<ValueBuilderBase> create_value_builder_base(const ValueType &type, bool transient, 
size_t num_mapped_dims_in, size_t subspace_size_in, size_t expected_subspaces) const = 0; }; diff --git a/eval/src/vespa/eval/eval/value_codec.cpp b/eval/src/vespa/eval/eval/value_codec.cpp index 923d3f29cd3..53131da86d8 100644 --- a/eval/src/vespa/eval/eval/value_codec.cpp +++ b/eval/src/vespa/eval/eval/value_codec.cpp @@ -7,6 +7,7 @@ #include <vespa/vespalib/util/exceptions.h> #include <vespa/vespalib/util/typify.h> #include <vespa/vespalib/util/stringfmt.h> +#include <vespa/vespalib/util/shared_string_repo.h> using vespalib::make_string_short::fmt; @@ -128,9 +129,10 @@ size_t maybe_decode_num_blocks(nbostream &input, bool has_mapped_dims, const For return 1; } -void encode_mapped_labels(nbostream &output, size_t num_mapped_dims, const std::vector<vespalib::stringref> &addr) { +void encode_mapped_labels(nbostream &output, size_t num_mapped_dims, const std::vector<label_t> &addr) { for (size_t i = 0; i < num_mapped_dims; ++i) { - output.writeSmallString(addr[i]); + vespalib::string str = SharedStringRepo::Handle::string_from_id(addr[i]); + output.writeSmallString(str); } } @@ -175,7 +177,7 @@ struct ContentDecoder { } // add implicit empty subspace if ((state.num_mapped_dims == 0) && (state.num_blocks == 0)) { - for (T &cell: builder->add_subspace({})) { + for (T &cell: builder->add_subspace()) { cell = T{}; } } @@ -229,8 +231,8 @@ struct CreateTensorSpecFromValue { TensorSpec spec(value.type().to_spec()); size_t subspace_id = 0; size_t subspace_size = value.type().dense_subspace_size(); - std::vector<vespalib::stringref> labels(value.type().count_mapped_dimensions()); - std::vector<vespalib::stringref*> label_refs; + std::vector<label_t> labels(value.type().count_mapped_dimensions()); + std::vector<label_t*> label_refs; for (auto &label: labels) { label_refs.push_back(&label); } @@ -241,7 +243,7 @@ struct CreateTensorSpecFromValue { TensorSpec::Address addr; for (const auto &dim: value.type().dimensions()) { if (dim.is_mapped()) { - addr.emplace(dim.name, 
labels[label_idx++]); + addr.emplace(dim.name, SharedStringRepo::Handle::string_from_id(labels[label_idx++])); } } for (size_t i = 0; i < subspace_size; ++i) { @@ -270,8 +272,8 @@ struct EncodeState { struct ContentEncoder { template<typename T> static void invoke(const Value &value, const EncodeState &state, nbostream &output) { - std::vector<vespalib::stringref> address(state.num_mapped_dims); - std::vector<vespalib::stringref*> a_refs(state.num_mapped_dims);; + std::vector<label_t> address(state.num_mapped_dims); + std::vector<label_t*> a_refs(state.num_mapped_dims);; for (size_t i = 0; i < state.num_mapped_dims; ++i) { a_refs[i] = &address[i]; } diff --git a/eval/src/vespa/eval/instruction/generic_concat.cpp b/eval/src/vespa/eval/instruction/generic_concat.cpp index fa9d2192b99..5d8ab7187c0 100644 --- a/eval/src/vespa/eval/instruction/generic_concat.cpp +++ b/eval/src/vespa/eval/instruction/generic_concat.cpp @@ -47,10 +47,10 @@ generic_concat(const Value &a, const Value &b, auto a_cells = a.cells().typify<LCT>(); auto b_cells = b.cells().typify<RCT>(); SparseJoinState sparse(sparse_plan, a.index(), b.index()); - auto builder = factory.create_value_builder<OCT>(res_type, - sparse_plan.sources.size(), - dense_plan.output_size, - sparse.first_index.size()); + auto builder = factory.create_transient_value_builder<OCT>(res_type, + sparse_plan.sources.size(), + dense_plan.output_size, + sparse.first_index.size()); auto outer = sparse.first_index.create_view({}); auto inner = sparse.second_index.create_view(sparse.second_view_dims); outer->lookup({}); diff --git a/eval/src/vespa/eval/instruction/generic_create.cpp b/eval/src/vespa/eval/instruction/generic_create.cpp index 02c89e0b43f..6e30da846e7 100644 --- a/eval/src/vespa/eval/instruction/generic_create.cpp +++ b/eval/src/vespa/eval/instruction/generic_create.cpp @@ -5,6 +5,7 @@ #include <vespa/eval/eval/array_array_map.h> #include <vespa/vespalib/util/stash.h> #include <vespa/vespalib/util/typify.h> +#include 
<vespa/vespalib/util/shared_string_repo.h> #include <cassert> using namespace vespalib::eval::tensor_function; @@ -13,6 +14,7 @@ namespace vespalib::eval::instruction { using State = InterpretedFunction::State; using Instruction = InterpretedFunction::Instruction; +using Handle = SharedStringRepo::Handle; namespace { @@ -21,12 +23,12 @@ struct CreateParam { size_t num_mapped_dims; size_t dense_subspace_size; size_t num_children; - ArrayArrayMap<vespalib::string,size_t> my_spec; + ArrayArrayMap<Handle,size_t> my_spec; const ValueBuilderFactory &factory; static constexpr size_t npos = -1; - ArrayRef<size_t> indexes(ConstArrayRef<vespalib::string> key) { + ArrayRef<size_t> indexes(ConstArrayRef<Handle> key) { auto [tag, first_time] = my_spec.lookup_or_add_entry(key); auto rv = my_spec.get_values(tag); if (first_time) { @@ -49,7 +51,7 @@ struct CreateParam { { size_t last_child = num_children - 1; for (const auto & entry : spec_in) { - std::vector<vespalib::string> sparse_key; + std::vector<Handle> sparse_key; size_t dense_key = 0; auto dim = res_type.dimensions().begin(); auto binding = entry.first.begin(); @@ -58,7 +60,7 @@ struct CreateParam { assert(dim->name == binding->first); assert(dim->is_mapped() == binding->second.is_mapped()); if (dim->is_mapped()) { - sparse_key.push_back(binding->second.name); + sparse_key.push_back(Handle(binding->second.name)); } else { assert(binding->second.index < dim->size); dense_key = (dense_key * dim->size) + binding->second.index; @@ -76,16 +78,16 @@ struct CreateParam { template <typename T> void my_generic_create_op(State &state, uint64_t param_in) { const auto ¶m = unwrap_param<CreateParam>(param_in); - auto builder = param.factory.create_value_builder<T>(param.res_type, - param.num_mapped_dims, - param.dense_subspace_size, - param.my_spec.size()); - std::vector<vespalib::stringref> sparse_addr; + auto builder = param.factory.create_transient_value_builder<T>(param.res_type, + param.num_mapped_dims, + 
param.dense_subspace_size, + param.my_spec.size()); + std::vector<label_t> sparse_addr; param.my_spec.each_entry([&](const auto &key, const auto &values) { sparse_addr.clear(); for (const auto & label : key) { - sparse_addr.push_back(label); + sparse_addr.push_back(label.id()); } T *dst = builder->add_subspace(sparse_addr).begin(); for (size_t stack_idx : values) { diff --git a/eval/src/vespa/eval/instruction/generic_join.cpp b/eval/src/vespa/eval/instruction/generic_join.cpp index 026df5aa993..e0dc0feea28 100644 --- a/eval/src/vespa/eval/instruction/generic_join.cpp +++ b/eval/src/vespa/eval/instruction/generic_join.cpp @@ -41,7 +41,7 @@ generic_mixed_join(const Value &lhs, const Value &rhs, const JoinParam ¶m) if (param.sparse_plan.lhs_overlap.empty() && param.sparse_plan.rhs_overlap.empty()) { expected_subspaces = sparse.first_index.size() * sparse.second_index.size(); } - auto builder = param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.sources.size(), param.dense_plan.out_size, expected_subspaces); + auto builder = param.factory.create_transient_value_builder<OCT>(param.res_type, param.sparse_plan.sources.size(), param.dense_plan.out_size, expected_subspaces); auto outer = sparse.first_index.create_view({}); auto inner = sparse.second_index.create_view(sparse.second_view_dims); outer->lookup({}); @@ -92,7 +92,7 @@ void my_sparse_no_overlap_join_op(State &state, uint64_t param_in) { SparseJoinState sparse(param.sparse_plan, lhs.index(), rhs.index()); auto guess = lhs.index().size() * rhs.index().size(); assert(param.dense_plan.out_size == 1); - auto builder = param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.sources.size(), 1, guess); + auto builder = param.factory.create_transient_value_builder<OCT>(param.res_type, param.sparse_plan.sources.size(), 1, guess); auto outer = sparse.first_index.create_view({}); assert(sparse.second_view_dims.empty()); auto inner = sparse.second_index.create_view({}); @@ -131,7 +131,7 
@@ void my_sparse_full_overlap_join_op(State &state, uint64_t param_in) { } Fun fun(param.function); SparseJoinState sparse(param.sparse_plan, lhs_index, rhs_index); - auto builder = param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.sources.size(), param.dense_plan.out_size, sparse.first_index.size()); + auto builder = param.factory.create_transient_value_builder<OCT>(param.res_type, param.sparse_plan.sources.size(), param.dense_plan.out_size, sparse.first_index.size()); auto outer = sparse.first_index.create_view({}); auto inner = sparse.second_index.create_view(sparse.second_view_dims); outer->lookup({}); diff --git a/eval/src/vespa/eval/instruction/generic_join.h b/eval/src/vespa/eval/instruction/generic_join.h index 988286be980..217f3195dec 100644 --- a/eval/src/vespa/eval/instruction/generic_join.h +++ b/eval/src/vespa/eval/instruction/generic_join.h @@ -68,10 +68,10 @@ struct SparseJoinState { const Value::Index &first_index; const Value::Index &second_index; const std::vector<size_t> &second_view_dims; - std::vector<vespalib::stringref> full_address; - std::vector<vespalib::stringref*> first_address; - std::vector<const vespalib::stringref*> address_overlap; - std::vector<vespalib::stringref*> second_only_address; + std::vector<label_t> full_address; + std::vector<label_t*> first_address; + std::vector<const label_t*> address_overlap; + std::vector<label_t*> second_only_address; size_t lhs_subspace; size_t rhs_subspace; size_t &first_subspace; diff --git a/eval/src/vespa/eval/instruction/generic_merge.cpp b/eval/src/vespa/eval/instruction/generic_merge.cpp index 02749a04eb9..107cb805d74 100644 --- a/eval/src/vespa/eval/instruction/generic_merge.cpp +++ b/eval/src/vespa/eval/instruction/generic_merge.cpp @@ -63,10 +63,10 @@ generic_mixed_merge(const Value &a, const Value &b, const size_t num_mapped = params.num_mapped_dimensions; const size_t subspace_size = params.dense_subspace_size; size_t guess_subspaces = 
std::max(a.index().size(), b.index().size()); - auto builder = params.factory.create_value_builder<OCT>(params.res_type, num_mapped, subspace_size, guess_subspaces); - std::vector<vespalib::stringref> address(num_mapped); - std::vector<const vespalib::stringref *> addr_cref; - std::vector<vespalib::stringref *> addr_ref; + auto builder = params.factory.create_transient_value_builder<OCT>(params.res_type, num_mapped, subspace_size, guess_subspaces); + std::vector<label_t> address(num_mapped); + std::vector<const label_t *> addr_cref; + std::vector<label_t *> addr_ref; for (auto & ref : address) { addr_cref.push_back(&ref); addr_ref.push_back(&ref); diff --git a/eval/src/vespa/eval/instruction/generic_peek.cpp b/eval/src/vespa/eval/instruction/generic_peek.cpp index 66538911890..d94742ae15c 100644 --- a/eval/src/vespa/eval/instruction/generic_peek.cpp +++ b/eval/src/vespa/eval/instruction/generic_peek.cpp @@ -7,6 +7,7 @@ #include <vespa/vespalib/util/stash.h> #include <vespa/vespalib/util/typify.h> #include <vespa/vespalib/util/visit_ranges.h> +#include <vespa/vespalib/util/shared_string_repo.h> #include <cassert> using namespace vespalib::eval::tensor_function; @@ -16,6 +17,8 @@ namespace vespalib::eval::instruction { using State = InterpretedFunction::State; using Instruction = InterpretedFunction::Instruction; +using Handle = SharedStringRepo::Handle; + namespace { static constexpr size_t npos = -1; @@ -35,28 +38,43 @@ size_t count_children(const Spec &spec) } struct DimSpec { - vespalib::stringref name; - GenericPeek::SpecMap::mapped_type child_or_label; + enum class DimType { CHILD_IDX, LABEL_IDX, LABEL_STR }; + vespalib::string name; + DimType dim_type; + size_t idx; + Handle str; + static DimSpec from_child(const vespalib::string &name_in, size_t child_idx) { + return {name_in, DimType::CHILD_IDX, child_idx, Handle()}; + } + static DimSpec from_label(const vespalib::string &name_in, const TensorSpec::Label &label) { + if (label.is_mapped()) { + return 
{name_in, DimType::LABEL_STR, 0, Handle(label.name)}; + } else { + assert(label.is_indexed()); + return {name_in, DimType::LABEL_IDX, label.index, Handle()}; + } + } + ~DimSpec(); bool has_child() const { - return std::holds_alternative<size_t>(child_or_label); + return (dim_type == DimType::CHILD_IDX); } bool has_label() const { - return std::holds_alternative<TensorSpec::Label>(child_or_label); + return (dim_type != DimType::CHILD_IDX); } size_t get_child_idx() const { - return std::get<size_t>(child_or_label); + assert(dim_type == DimType::CHILD_IDX); + return idx; } - vespalib::stringref get_label_name() const { - auto & label = std::get<TensorSpec::Label>(child_or_label); - assert(label.is_mapped()); - return label.name; + label_t get_label_name() const { + assert(dim_type == DimType::LABEL_STR); + return str.id(); } size_t get_label_index() const { - auto & label = std::get<TensorSpec::Label>(child_or_label); - assert(label.is_indexed()); - return label.index; + assert(dim_type == DimType::LABEL_IDX); + return idx; } }; +DimSpec::~DimSpec() = default; struct ExtractedSpecs { using Dimension = ValueType::Dimension; @@ -85,7 +103,11 @@ struct ExtractedSpecs { dimensions.push_back(a); const auto & [spec_dim_name, child_or_label] = b; assert(a.name == spec_dim_name); - specs.emplace_back(DimSpec{a.name, child_or_label}); + if (std::holds_alternative<size_t>(child_or_label)) { + specs.push_back(DimSpec::from_child(a.name, std::get<size_t>(child_or_label))); + } else { + specs.push_back(DimSpec::from_label(a.name, std::get<TensorSpec::Label>(child_or_label))); + } } } }; @@ -181,22 +203,21 @@ struct DensePlan { }; struct SparseState { - std::vector<vespalib::string> view_addr; - std::vector<vespalib::stringref> view_refs; - std::vector<const vespalib::stringref *> lookup_refs; - std::vector<vespalib::stringref> output_addr; - std::vector<vespalib::stringref *> fetch_addr; - - SparseState(std::vector<vespalib::string> view_addr_in, size_t out_dims) - : 
view_addr(std::move(view_addr_in)), - view_refs(view_addr.size()), + std::vector<Handle> handles; + std::vector<label_t> view_addr; + std::vector<const label_t *> lookup_refs; + std::vector<label_t> output_addr; + std::vector<label_t *> fetch_addr; + + SparseState(std::vector<Handle> handles_in, std::vector<label_t> view_addr_in, size_t out_dims) + : handles(std::move(handles_in)), + view_addr(std::move(view_addr_in)), lookup_refs(view_addr.size()), output_addr(out_dims), fetch_addr(out_dims) { for (size_t i = 0; i < view_addr.size(); ++i) { - view_refs[i] = view_addr[i]; - lookup_refs[i] = &view_refs[i]; + lookup_refs[i] = &view_addr[i]; } for (size_t i = 0; i < out_dims; ++i) { fetch_addr[i] = &output_addr[i]; @@ -236,17 +257,19 @@ struct SparsePlan { template <typename Getter> SparseState make_state(const Getter &get_child_value) const { - std::vector<vespalib::string> view_addr; + std::vector<Handle> handles; + std::vector<label_t> view_addr; for (const auto & dim : lookup_specs) { if (dim.has_child()) { int64_t child_value = get_child_value(dim.get_child_idx()); - view_addr.push_back(vespalib::make_string("%" PRId64, child_value)); + handles.emplace_back(vespalib::make_string("%" PRId64, child_value)); + view_addr.push_back(handles.back().id()); } else { view_addr.push_back(dim.get_label_name()); } } assert(view_addr.size() == view_dims.size()); - return SparseState(std::move(view_addr), out_mapped_dims); + return SparseState(std::move(handles), std::move(view_addr), out_mapped_dims); } }; SparsePlan::~SparsePlan() = default; @@ -284,10 +307,10 @@ generic_mixed_peek(const ValueType &res_type, { auto input_cells = input_value.cells().typify<ICT>(); size_t bad_guess = 1; - auto builder = factory.create_value_builder<OCT>(res_type, - sparse_plan.out_mapped_dims, - dense_plan.out_dense_size, - bad_guess); + auto builder = factory.create_transient_value_builder<OCT>(res_type, + sparse_plan.out_mapped_dims, + dense_plan.out_dense_size, + bad_guess); size_t 
filled_subspaces = 0; size_t dense_offset = dense_plan.get_offset(get_child_value); if (dense_offset != npos) { @@ -304,7 +327,7 @@ generic_mixed_peek(const ValueType &res_type, } } if ((sparse_plan.out_mapped_dims == 0) && (filled_subspaces == 0)) { - for (auto & v : builder->add_subspace({})) { + for (auto & v : builder->add_subspace()) { v = OCT{}; } } diff --git a/eval/src/vespa/eval/instruction/generic_reduce.cpp b/eval/src/vespa/eval/instruction/generic_reduce.cpp index afc46e8ee7d..d30186d3dd8 100644 --- a/eval/src/vespa/eval/instruction/generic_reduce.cpp +++ b/eval/src/vespa/eval/instruction/generic_reduce.cpp @@ -45,10 +45,10 @@ ReduceParam::~ReduceParam() = default; //----------------------------------------------------------------------------- struct SparseReduceState { - std::vector<vespalib::stringref> full_address; - std::vector<vespalib::stringref*> fetch_address; - std::vector<vespalib::stringref*> keep_address; - size_t subspace; + std::vector<label_t> full_address; + std::vector<label_t*> fetch_address; + std::vector<label_t*> keep_address; + size_t subspace; SparseReduceState(const SparseReducePlan &plan) : full_address(plan.keep_dims.size() + plan.num_reduce_dims), @@ -71,20 +71,20 @@ template <typename ICT, typename OCT, typename AGGR> Value::UP generic_reduce(const Value &value, const ReduceParam ¶m) { auto cells = value.cells().typify<ICT>(); - ArrayArrayMap<vespalib::stringref,AGGR> map(param.sparse_plan.keep_dims.size(), - param.dense_plan.out_size, - value.index().size()); + ArrayArrayMap<label_t,AGGR> map(param.sparse_plan.keep_dims.size(), + param.dense_plan.out_size, + value.index().size()); SparseReduceState sparse(param.sparse_plan); auto full_view = value.index().create_view({}); full_view->lookup({}); - ConstArrayRef<vespalib::stringref*> keep_addr(sparse.keep_address); + ConstArrayRef<label_t*> keep_addr(sparse.keep_address); while (full_view->next_result(sparse.fetch_address, sparse.subspace)) { auto [tag, ignore] = 
map.lookup_or_add_entry(keep_addr); AGGR *dst = map.get_values(tag).begin(); auto sample = [&](size_t src_idx, size_t dst_idx) { dst[dst_idx].sample(cells[src_idx]); }; param.dense_plan.execute(sparse.subspace * param.dense_plan.in_size, sample); } - auto builder = param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.keep_dims.size(), param.dense_plan.out_size, map.size()); + auto builder = param.factory.create_transient_value_builder<OCT>(param.res_type, param.sparse_plan.keep_dims.size(), param.dense_plan.out_size, map.size()); map.each_entry([&](const auto &keys, const auto &values) { OCT *dst = builder->add_subspace(keys).begin(); @@ -93,7 +93,7 @@ generic_reduce(const Value &value, const ReduceParam ¶m) { } }); if ((map.size() == 0) && param.sparse_plan.keep_dims.empty()) { - auto zero = builder->add_subspace({}); + auto zero = builder->add_subspace(); for (size_t i = 0; i < zero.size(); ++i) { zero[i] = OCT{}; } diff --git a/eval/src/vespa/eval/instruction/generic_rename.cpp b/eval/src/vespa/eval/instruction/generic_rename.cpp index 1ce18597ec2..894ef37b678 100644 --- a/eval/src/vespa/eval/instruction/generic_rename.cpp +++ b/eval/src/vespa/eval/instruction/generic_rename.cpp @@ -69,15 +69,15 @@ generic_rename(const Value &a, const ValueType &res_type, const ValueBuilderFactory &factory) { auto cells = a.cells().typify<CT>(); - std::vector<vespalib::stringref> output_address(sparse_plan.mapped_dims); - std::vector<vespalib::stringref*> input_address; + std::vector<label_t> output_address(sparse_plan.mapped_dims); + std::vector<label_t*> input_address; for (size_t maps_to : sparse_plan.output_dimensions) { input_address.push_back(&output_address[maps_to]); } - auto builder = factory.create_value_builder<CT>(res_type, - sparse_plan.mapped_dims, - dense_plan.subspace_size, - a.index().size()); + auto builder = factory.create_transient_value_builder<CT>(res_type, + sparse_plan.mapped_dims, + dense_plan.subspace_size, + a.index().size()); auto 
view = a.index().create_view({}); view->lookup({}); size_t subspace; diff --git a/eval/src/vespa/eval/streamed/streamed_value.cpp b/eval/src/vespa/eval/streamed/streamed_value.cpp index bdfe5fd4e27..06162b2200d 100644 --- a/eval/src/vespa/eval/streamed/streamed_value.cpp +++ b/eval/src/vespa/eval/streamed/streamed_value.cpp @@ -16,8 +16,7 @@ StreamedValue<T>::get_memory_usage() const { MemoryUsage usage = self_memory_usage<StreamedValue<T>>(); usage.merge(vector_extra_memory_usage(_my_cells)); - usage.incUsedBytes(_label_buf.byteSize()); - usage.incAllocatedBytes(_label_buf.byteCapacity()); + usage.merge(vector_extra_memory_usage(_my_labels.view().handles())); return usage; } diff --git a/eval/src/vespa/eval/streamed/streamed_value.h b/eval/src/vespa/eval/streamed/streamed_value.h index 258802a53e8..b7ace4191c3 100644 --- a/eval/src/vespa/eval/streamed/streamed_value.h +++ b/eval/src/vespa/eval/streamed/streamed_value.h @@ -4,6 +4,7 @@ #include <vespa/eval/eval/value_type.h> #include <vespa/eval/eval/value.h> +#include <vespa/vespalib/util/shared_string_repo.h> #include "streamed_value_index.h" #include <cassert> @@ -19,20 +20,22 @@ template <typename T> class StreamedValue : public Value { private: + using StrongHandles = SharedStringRepo::StrongHandles; + ValueType _type; std::vector<T> _my_cells; - Array<char> _label_buf; + StrongHandles _my_labels; StreamedValueIndex _my_index; public: StreamedValue(ValueType type, size_t num_mapped_dimensions, - std::vector<T> cells, size_t num_subspaces, Array<char> && label_buf) + std::vector<T> cells, size_t num_subspaces, StrongHandles && handles) : _type(std::move(type)), _my_cells(std::move(cells)), - _label_buf(std::move(label_buf)), + _my_labels(std::move(handles)), _my_index(num_mapped_dimensions, num_subspaces, - ConstArrayRef<char>(_label_buf.begin(), _label_buf.size())) + _my_labels.view().handles()) { assert(num_subspaces * _type.dense_subspace_size() == _my_cells.size()); } diff --git 
a/eval/src/vespa/eval/streamed/streamed_value_builder.h b/eval/src/vespa/eval/streamed/streamed_value_builder.h index 5698c805756..48a01f893de 100644 --- a/eval/src/vespa/eval/streamed/streamed_value_builder.h +++ b/eval/src/vespa/eval/streamed/streamed_value_builder.h @@ -3,7 +3,7 @@ #pragma once #include "streamed_value.h" -#include <vespa/vespalib/objects/nbostream.h> +#include <vespa/vespalib/util/shared_string_repo.h> namespace vespalib::eval { @@ -14,12 +14,14 @@ template <typename T> class StreamedValueBuilder : public ValueBuilder<T> { private: + using StrongHandles = SharedStringRepo::StrongHandles; + ValueType _type; size_t _num_mapped_dimensions; size_t _dense_subspace_size; std::vector<T> _cells; size_t _num_subspaces; - nbostream _labels; + StrongHandles _labels; public: StreamedValueBuilder(const ValueType &type, size_t num_mapped_in, @@ -30,18 +32,26 @@ public: _dense_subspace_size(subspace_size_in), _cells(), _num_subspaces(0), - _labels() + _labels(num_mapped_in * expected_subspaces) { _cells.reserve(subspace_size_in * expected_subspaces); - // assume small sized label strings: - _labels.reserve(num_mapped_in * expected_subspaces * 3); }; ~StreamedValueBuilder(); ArrayRef<T> add_subspace(ConstArrayRef<vespalib::stringref> addr) override { for (auto label : addr) { - _labels.writeSmallString(label); + _labels.add(label); + } + size_t old_sz = _cells.size(); + _cells.resize(old_sz + _dense_subspace_size); + _num_subspaces++; + return ArrayRef<T>(&_cells[old_sz], _dense_subspace_size); + } + + ArrayRef<T> add_subspace(ConstArrayRef<label_t> addr) override { + for (auto label : addr) { + _labels.add(label); } size_t old_sz = _cells.size(); _cells.resize(old_sz + _dense_subspace_size); @@ -58,7 +68,7 @@ public: _num_mapped_dimensions, std::move(_cells), _num_subspaces, - _labels.extract_buffer()); + std::move(_labels)); } }; diff --git a/eval/src/vespa/eval/streamed/streamed_value_builder_factory.cpp 
b/eval/src/vespa/eval/streamed/streamed_value_builder_factory.cpp index aa6347a2c51..5111ba8a71e 100644 --- a/eval/src/vespa/eval/streamed/streamed_value_builder_factory.cpp +++ b/eval/src/vespa/eval/streamed/streamed_value_builder_factory.cpp @@ -19,10 +19,12 @@ struct SelectStreamedValueBuilder { std::unique_ptr<ValueBuilderBase> StreamedValueBuilderFactory::create_value_builder_base(const ValueType &type, + bool transient, size_t num_mapped, size_t subspace_size, size_t expected_subspaces) const { + (void) transient; return typify_invoke<1,TypifyCellType,SelectStreamedValueBuilder>( type.cell_type(), type, num_mapped, subspace_size, expected_subspaces); diff --git a/eval/src/vespa/eval/streamed/streamed_value_builder_factory.h b/eval/src/vespa/eval/streamed/streamed_value_builder_factory.h index 3f81981f429..58072aa31dc 100644 --- a/eval/src/vespa/eval/streamed/streamed_value_builder_factory.h +++ b/eval/src/vespa/eval/streamed/streamed_value_builder_factory.h @@ -14,7 +14,7 @@ private: StreamedValueBuilderFactory() {} static StreamedValueBuilderFactory _factory; std::unique_ptr<ValueBuilderBase> create_value_builder_base( - const ValueType &type, size_t num_mapped_in, + const ValueType &type, bool transient, size_t num_mapped_in, size_t subspace_size_in, size_t expected_subspaces) const override; public: static const StreamedValueBuilderFactory &get() { return _factory; } diff --git a/eval/src/vespa/eval/streamed/streamed_value_index.cpp b/eval/src/vespa/eval/streamed/streamed_value_index.cpp index 17cf7316554..0adaa35fc84 100644 --- a/eval/src/vespa/eval/streamed/streamed_value_index.cpp +++ b/eval/src/vespa/eval/streamed/streamed_value_index.cpp @@ -18,7 +18,7 @@ struct StreamedFilterView : Value::Index::View { LabelBlockStream label_blocks; std::vector<size_t> view_dims; - std::vector<vespalib::stringref> to_match; + std::vector<label_t> to_match; StreamedFilterView(LabelBlockStream labels, std::vector<size_t> view_dims_in) : label_blocks(std::move(labels)), 
@@ -28,7 +28,7 @@ struct StreamedFilterView : Value::Index::View to_match.reserve(view_dims.size()); } - void lookup(ConstArrayRef<const vespalib::stringref*> addr) override { + void lookup(ConstArrayRef<const label_t*> addr) override { label_blocks.reset(); to_match.clear(); for (auto ptr : addr) { @@ -37,7 +37,7 @@ struct StreamedFilterView : Value::Index::View assert(view_dims.size() == to_match.size()); } - bool next_result(ConstArrayRef<vespalib::stringref*> addr_out, size_t &idx_out) override { + bool next_result(ConstArrayRef<label_t*> addr_out, size_t &idx_out) override { while (const auto block = label_blocks.next_block()) { idx_out = block.subspace_index; bool matches = true; @@ -66,12 +66,12 @@ struct StreamedIterationView : Value::Index::View : label_blocks(std::move(labels)) {} - void lookup(ConstArrayRef<const vespalib::stringref*> addr) override { + void lookup(ConstArrayRef<const label_t*> addr) override { label_blocks.reset(); assert(addr.size() == 0); } - bool next_result(ConstArrayRef<vespalib::stringref*> addr_out, size_t &idx_out) override { + bool next_result(ConstArrayRef<label_t*> addr_out, size_t &idx_out) override { if (auto block = label_blocks.next_block()) { idx_out = block.subspace_index; size_t i = 0; @@ -90,7 +90,7 @@ struct StreamedIterationView : Value::Index::View std::unique_ptr<Value::Index::View> StreamedValueIndex::create_view(const std::vector<size_t> &dims) const { - LabelBlockStream label_stream(_data.num_subspaces, _data.labels_buffer, _data.num_mapped_dims); + LabelBlockStream label_stream(_data.num_subspaces, _data.labels, _data.num_mapped_dims); if (dims.empty()) { return std::make_unique<StreamedIterationView>(std::move(label_stream)); } diff --git a/eval/src/vespa/eval/streamed/streamed_value_index.h b/eval/src/vespa/eval/streamed/streamed_value_index.h index 8fd561200c3..fb3d48d8176 100644 --- a/eval/src/vespa/eval/streamed/streamed_value_index.h +++ b/eval/src/vespa/eval/streamed/streamed_value_index.h @@ -3,6 +3,7 
@@ #pragma once #include <vespa/eval/eval/value.h> +#include <vespa/vespalib/util/shared_string_repo.h> namespace vespalib::eval { @@ -13,19 +14,23 @@ namespace vespalib::eval { class StreamedValueIndex : public Value::Index { public: + struct SerializedDataRef { uint32_t num_mapped_dims; uint32_t num_subspaces; - ConstArrayRef<char> labels_buffer; + const std::vector<label_t> labels; }; - StreamedValueIndex(uint32_t num_mapped_dims, uint32_t num_subspaces, ConstArrayRef<char> labels_buf) - : _data{num_mapped_dims, num_subspaces, labels_buf} + StreamedValueIndex(uint32_t num_mapped_dims, uint32_t num_subspaces, const std::vector<label_t> &labels_in) + : _data{num_mapped_dims, num_subspaces, labels_in} {} // index API: size_t size() const override { return _data.num_subspaces; } std::unique_ptr<View> create_view(const std::vector<size_t> &dims) const override; + // NB NOTE WARNING XXX: simply serializing the handle view and + // discarding the backing streamed value will result in dangling + // string enum value usage when the value is later deserialized. SerializedDataRef get_data_reference() const { return _data; } private: diff --git a/eval/src/vespa/eval/streamed/streamed_value_utils.h b/eval/src/vespa/eval/streamed/streamed_value_utils.h index b88d4df8581..6b44e052f0c 100644 --- a/eval/src/vespa/eval/streamed/streamed_value_utils.h +++ b/eval/src/vespa/eval/streamed/streamed_value_utils.h @@ -4,24 +4,23 @@ #include <vespa/eval/eval/value.h> #include <vespa/vespalib/objects/nbostream.h> +#include <cassert> namespace vespalib::eval { /** * Reads a stream of serialized labels. - * Reading more labels than available will - * throw an exception. + * Reading more labels than available will trigger an assert. 
**/ struct LabelStream { - nbostream source; - LabelStream(ConstArrayRef<char> data) : source(data.begin(), data.size()) {} - vespalib::stringref next_label() { - size_t str_size = source.getInt1_4Bytes(); - vespalib::stringref label(source.peek(), str_size); - source.adjustReadPos(str_size); - return label; + const std::vector<label_t> &source; + size_t pos; + LabelStream(const std::vector<label_t> &data) : source(data), pos(0) {} + label_t next_label() { + assert(pos < source.size()); + return source[pos++]; } - void reset() { source.rp(0); } + void reset() { pos = 0; } }; /** @@ -30,7 +29,7 @@ struct LabelStream { struct LabelBlock { static constexpr size_t npos = -1; size_t subspace_index; - ConstArrayRef<vespalib::stringref> address; + ConstArrayRef<label_t> address; operator bool() const { return subspace_index != npos; } }; @@ -43,7 +42,7 @@ private: size_t _num_subspaces; LabelStream _labels; size_t _subspace_index; - std::vector<vespalib::stringref> _current_address; + std::vector<label_t> _current_address; public: LabelBlock next_block() { if (_subspace_index < _num_subspaces) { @@ -62,10 +61,10 @@ public: } LabelBlockStream(uint32_t num_subspaces, - ConstArrayRef<char> label_buf, + const std::vector<label_t> &labels, uint32_t num_mapped_dims) : _num_subspaces(num_subspaces), - _labels(label_buf), + _labels(labels), _subspace_index(num_subspaces), _current_address(num_mapped_dims) {} diff --git a/eval/src/vespa/eval/streamed/streamed_value_view.h b/eval/src/vespa/eval/streamed/streamed_value_view.h index e37f442dd9a..060961f5e16 100644 --- a/eval/src/vespa/eval/streamed/streamed_value_view.h +++ b/eval/src/vespa/eval/streamed/streamed_value_view.h @@ -24,10 +24,10 @@ private: public: StreamedValueView(const ValueType &type, size_t num_mapped_dimensions, TypedCells cells, size_t num_subspaces, - ConstArrayRef<char> labels_buf) + const std::vector<label_t> &labels) : _type(type), _cells_ref(cells), - _my_index(num_mapped_dimensions, num_subspaces, 
labels_buf) + _my_index(num_mapped_dimensions, num_subspaces, labels) { assert(num_subspaces * _type.dense_subspace_size() == _cells_ref.size); } |