diff options
author | Håvard Pettersen <havardpe@oath.com> | 2020-10-20 10:23:24 +0000 |
---|---|---|
committer | Håvard Pettersen <havardpe@oath.com> | 2020-10-20 16:32:02 +0000 |
commit | 90633ae73e68208da7c99b766b03165dedce649c (patch) | |
tree | e0985e63cf4b534c83555f6e0c0261922c710e3d /eval | |
parent | 36f8d6761ef4bed884c884c7f673e6c7d847af7f (diff) |
make generic reduce faster
* use faster map implementation
* use multiple aggregators (invert dense loops)
* handle reducing all cells separately
* extend nested loop benchmarking
Diffstat (limited to 'eval')
-rw-r--r-- | eval/CMakeLists.txt | 1 | ||||
-rw-r--r-- | eval/src/tests/eval/array_array_map/CMakeLists.txt | 10 | ||||
-rw-r--r-- | eval/src/tests/eval/array_array_map/array_array_map_test.cpp | 70 | ||||
-rw-r--r-- | eval/src/tests/eval/nested_loop/nested_loop_bench.cpp | 155 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/CMakeLists.txt | 1 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/array_array_map.cpp | 7 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/array_array_map.h | 173 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_reduce.cpp | 112 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_reduce.h | 4 |
9 files changed, 470 insertions, 63 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt index eb94a9498cb..cd33b051846 100644 --- a/eval/CMakeLists.txt +++ b/eval/CMakeLists.txt @@ -12,6 +12,7 @@ vespa_define_module( TESTS src/tests/ann src/tests/eval/aggr + src/tests/eval/array_array_map src/tests/eval/compile_cache src/tests/eval/compiled_function src/tests/eval/engine_or_factory diff --git a/eval/src/tests/eval/array_array_map/CMakeLists.txt b/eval/src/tests/eval/array_array_map/CMakeLists.txt new file mode 100644 index 00000000000..aef54d0fcb0 --- /dev/null +++ b/eval/src/tests/eval/array_array_map/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +vespa_add_executable(eval_array_array_map_test_app TEST + SOURCES + array_array_map_test.cpp + DEPENDS + vespaeval + GTest::GTest +) +vespa_add_test(NAME eval_array_array_map_test_app COMMAND eval_array_array_map_test_app) diff --git a/eval/src/tests/eval/array_array_map/array_array_map_test.cpp b/eval/src/tests/eval/array_array_map/array_array_map_test.cpp new file mode 100644 index 00000000000..d6a97549b88 --- /dev/null +++ b/eval/src/tests/eval/array_array_map/array_array_map_test.cpp @@ -0,0 +1,70 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/eval/eval/array_array_map.h> +#include <vespa/vespalib/gtest/gtest.h> + +using namespace vespalib; +using namespace vespalib::eval; + +std::vector<int> ints(std::vector<int> vec) { return vec; } + +template <typename T> +ConstArrayRef<T> refs(const std::vector<T> &vec) { return ConstArrayRef<T>(vec); } + +//----------------------------------------------------------------------------- + +TEST(ArrayArrayMapTest, simple_map_can_be_created_and_used) { + // ctor params: 'keys_per_entry', 'values_per_entry', 'expected_entries' + ArrayArrayMap<int,int> map(2,3,5); + EXPECT_EQ(map.size(), 0); + EXPECT_FALSE(map.lookup(refs(ints({1, 2}))).valid()); + auto tag = map.add_entry(refs(ints({1, 2}))); + EXPECT_EQ(map.size(), 1); + auto values = map.get_values(tag); + ASSERT_EQ(values.size(), 3); + values[0] = 10; + values[1] = 20; + values[2] = 30; + EXPECT_FALSE(map.lookup(refs(ints({2, 1}))).valid()); + auto tag2 = map.lookup(refs(ints({1, 2}))); + ASSERT_TRUE(tag2.valid()); + EXPECT_EQ(map.get_values(tag2)[1], 20); +} + +TEST(ArrayArrayMapTest, lookup_or_add_entry_works) { + ArrayArrayMap<int,int> map(2,3,5); + auto res1 = map.lookup_or_add_entry(refs(ints({1, 2}))); + auto res2 = map.lookup_or_add_entry(refs(ints({1, 2}))); + EXPECT_TRUE(res1.second); + EXPECT_FALSE(res2.second); + EXPECT_EQ(map.get_values(res1.first).begin(), map.get_values(res2.first).begin()); + EXPECT_EQ(map.get_values(res1.first).size(), 3); +} + +TEST(ArrayArrayMapTest, each_entry_works) { + ArrayArrayMap<int,int> map(2,3,5); + auto res1 = map.add_entry(refs(ints({1, 2}))); + auto res2 = map.add_entry(refs(ints({2, 1}))); + map.get_values(res1)[0] = 10; + map.get_values(res2)[0] = 20; + EXPECT_EQ(map.size(), 2); + bool first = true; + // Note: insert order is guaranteed here + map.each_entry([&](const auto &keys, const auto &values) + { + if (first) { + EXPECT_EQ(keys[0], 1); + EXPECT_EQ(keys[1], 2); + EXPECT_EQ(values[0], 10); + first = false; + } else { + EXPECT_EQ(keys[0], 2); + EXPECT_EQ(keys[1], 1); + EXPECT_EQ(values[0], 20); + } + }); +} + +//----------------------------------------------------------------------------- + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp b/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp index c6e2f278cd4..3fbb80e07e5 100644 --- a/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp +++ b/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp @@ -10,7 +10,33 @@ using vespalib::eval::run_nested_loop; using LIST = std::vector<size_t>; using call_t = void (*)(const LIST &loop, const LIST &stride); -void perform_direct(const LIST &loop, const LIST &stride) { +void perform_direct_1(const LIST &loop, const LIST &stride) { + assert((loop.size() == 1) && (stride.size() == 1)); + size_t idx1 = 0; + size_t expect = 0; + for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) { + assert(idx1 == expect); + ++expect; + } + assert(expect == 4096); +} + +void perform_direct_2(const LIST &loop, const LIST &stride) { + assert((loop.size() == 2) && (stride.size() == 2)); + size_t idx1 = 0; + size_t expect = 0; + for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) { + size_t idx2 = idx1; + for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) { + assert(idx2 == expect); + ++expect; + } + } + assert(expect == 4096); +} + +void perform_direct_3(const LIST &loop, const LIST &stride) { + assert((loop.size() == 3) && (stride.size() == 3)); size_t idx1 = 0; size_t expect = 0; for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) { @@ -26,7 +52,61 @@ void perform_direct(const LIST &loop, const LIST &stride) { assert(expect == 4096); } -void perform_direct_lambda(const LIST &loop, const LIST &stride) { +void perform_direct_4(const LIST &loop, const LIST &stride) { + assert((loop.size() == 4) && (stride.size() == 4)); + size_t idx1 = 0; + size_t expect = 0; + for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) { + size_t idx2 = idx1; + for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) { + size_t idx3 = idx2; + for (size_t k = 0; k < loop[2]; ++k, idx3 += stride[2]) { + size_t idx4 = idx3; + for (size_t l = 0; l < loop[3]; ++l, idx4 += stride[3]) { + assert(idx4 == expect); + ++expect; + } + } + } + } + assert(expect == 4096); +} + +void perform_direct_lambda_1(const LIST &loop, const LIST &stride) { + assert((loop.size() == 1) && (stride.size() == 1)); + size_t expect = 0; + auto fun = [&](size_t idx) { + (void) idx; + assert(idx == expect); + ++expect; + }; + size_t idx1 = 0; + for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) { + fun(idx1); + } + assert(expect == 4096); +} + +void perform_direct_lambda_2(const LIST &loop, const LIST &stride) { + assert((loop.size() == 2) && (stride.size() == 2)); + size_t expect = 0; + auto fun = [&](size_t idx) { + (void) idx; + assert(idx == expect); + ++expect; + }; + size_t idx1 = 0; + for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) { + size_t idx2 = idx1; + for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) { + fun(idx2); + } + } + assert(expect == 4096); +} + +void perform_direct_lambda_3(const LIST &loop, const LIST &stride) { + assert((loop.size() == 3) && (stride.size() == 3)); size_t expect = 0; auto fun = [&](size_t idx) { (void) idx; @@ -46,6 +126,30 @@ void perform_direct_lambda(const LIST &loop, const LIST &stride) { assert(expect == 4096); } +void perform_direct_lambda_4(const LIST &loop, const LIST &stride) { + assert((loop.size() == 4) && (stride.size() == 4)); + size_t expect = 0; + auto fun = [&](size_t idx) { + (void) idx; + assert(idx == expect); + ++expect; + }; + size_t idx1 = 0; + for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) { + size_t idx2 = idx1; + for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) { + size_t idx3 = idx2; + for (size_t k = 0; k < loop[2]; ++k, idx3 += stride[2]) { + size_t idx4 = idx3; + for (size_t l = 0; l < loop[3]; ++l, idx4 += stride[3]) { + fun(idx4); + } + } + } + } + assert(expect == 4096); +} + void perform_generic(const LIST &loop, const LIST &stride) { size_t expect = 0; auto fun = [&](size_t idx) { @@ -70,19 +174,54 @@ void perform_generic_isolate_first(const LIST &loop, const LIST &stride) { void nop() {} -double estimate_cost_us(call_t perform_fun) { +double estimate_cost_1_us(call_t perform_fun) { + LIST loop({4096}); + LIST stride({1}); + return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 10000, 5.0) * 1000.0 * 1000.0; +} + +double estimate_cost_2_us(call_t perform_fun) { + LIST loop({64,64}); + LIST stride({64,1}); + return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 10000, 5.0) * 1000.0 * 1000.0; +} + +double estimate_cost_3_us(call_t perform_fun) { LIST loop({16,16,16}); LIST stride({256,16,1}); - return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 100000, 5.0) * 1000.0 * 1000.0; + return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 10000, 5.0) * 1000.0 * 1000.0; +} + +double estimate_cost_4_us(call_t perform_fun) { + LIST loop({8,8,8,8}); + LIST stride({512,64,8,1}); + return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 10000, 5.0) * 1000.0 * 1000.0; } //----------------------------------------------------------------------------- TEST(NestedLoopBenchmark, single_loop) { - fprintf(stderr, "manual direct single loop: %g us\n", estimate_cost_us(perform_direct)); - fprintf(stderr, "manual call lambda single loop: %g us\n", estimate_cost_us(perform_direct_lambda)); - fprintf(stderr, "generic single loop: %g us\n", estimate_cost_us(perform_generic)); - fprintf(stderr, "generic single loop (isolate first): %g us\n", estimate_cost_us(perform_generic_isolate_first)); + fprintf(stderr, "---------------------------------------------------------------\n"); + fprintf(stderr, "manual direct single loop (1 layer): %g us\n", estimate_cost_1_us(perform_direct_1)); + fprintf(stderr, "manual call lambda single loop (1 layer): %g us\n", estimate_cost_1_us(perform_direct_lambda_1)); + fprintf(stderr, "generic single loop (1 layer): %g us\n", estimate_cost_1_us(perform_generic)); + fprintf(stderr, "generic single loop (1 layer, isolate first): %g us\n", estimate_cost_1_us(perform_generic_isolate_first)); + fprintf(stderr, "---------------------------------------------------------------\n"); + fprintf(stderr, "manual direct single loop (2 layers): %g us\n", estimate_cost_2_us(perform_direct_2)); + fprintf(stderr, "manual call lambda single loop (2 layers): %g us\n", estimate_cost_2_us(perform_direct_lambda_2)); + fprintf(stderr, "generic single loop (2 layers): %g us\n", estimate_cost_2_us(perform_generic)); + fprintf(stderr, "generic single loop (2 layers, isolate first): %g us\n", estimate_cost_2_us(perform_generic_isolate_first)); + fprintf(stderr, "---------------------------------------------------------------\n"); + fprintf(stderr, "manual direct single loop (3 layers): %g us\n", estimate_cost_3_us(perform_direct_3)); + fprintf(stderr, "manual call lambda single loop (3 layers): %g us\n", estimate_cost_3_us(perform_direct_lambda_3)); + fprintf(stderr, "generic single loop (3 layers): %g us\n", estimate_cost_3_us(perform_generic)); + fprintf(stderr, "generic single loop (3 layers, isolate first): %g us\n", estimate_cost_3_us(perform_generic_isolate_first)); + fprintf(stderr, "---------------------------------------------------------------\n"); + fprintf(stderr, "manual direct single loop (4 layers): %g us\n", estimate_cost_4_us(perform_direct_4)); + fprintf(stderr, "manual call lambda single loop (4 layers): %g us\n", estimate_cost_4_us(perform_direct_lambda_4)); + fprintf(stderr, "generic single loop (4 layers): %g us\n", estimate_cost_4_us(perform_generic)); + fprintf(stderr, "generic single loop (4 layers, isolate first): %g us\n", estimate_cost_4_us(perform_generic_isolate_first)); + fprintf(stderr, "---------------------------------------------------------------\n"); } //----------------------------------------------------------------------------- diff --git a/eval/src/vespa/eval/eval/CMakeLists.txt b/eval/src/vespa/eval/eval/CMakeLists.txt index 5a60adfb58b..6c1f99265a7 100644 --- a/eval/src/vespa/eval/eval/CMakeLists.txt +++ b/eval/src/vespa/eval/eval/CMakeLists.txt @@ -2,6 +2,7 @@ vespa_add_library(eval_eval OBJECT SOURCES aggr.cpp + array_array_map.cpp basic_nodes.cpp call_nodes.cpp compile_tensor_function.cpp diff --git a/eval/src/vespa/eval/eval/array_array_map.cpp b/eval/src/vespa/eval/eval/array_array_map.cpp new file mode 100644 index 00000000000..93e17eaddc6 --- /dev/null +++ b/eval/src/vespa/eval/eval/array_array_map.cpp @@ -0,0 +1,7 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "array_array_map.h" + +namespace vespalib::eval { + +} diff --git a/eval/src/vespa/eval/eval/array_array_map.h b/eval/src/vespa/eval/eval/array_array_map.h new file mode 100644 index 00000000000..8ccc3492428 --- /dev/null +++ b/eval/src/vespa/eval/eval/array_array_map.h @@ -0,0 +1,173 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/util/arrayref.h> +#include <vespa/vespalib/stllike/hash_set.h> +#include <vespa/vespalib/stllike/hash_set.hpp> +#include <vector> +#include <cassert> +#include <type_traits> + +namespace vespalib::eval { + +/** + * A map where both keys and values are arrays of some type (K and V + * respectively). All map entries have exactly the same number of keys + * and exactly the same number of values. Keys and values are stored + * in separate vectors external to the map itself in order to reduce + * memory fragmentation both by co-locating the keys and values + * themselves and also by reducing the internal hash node size. Once + * entries are added they cannot be removed. Keys cannot be + * overwritten, but values can. + **/ +template <typename K, typename V, typename H = vespalib::hash<K>, typename EQ = std::equal_to<> > +class ArrayArrayMap +{ +private: + size_t _keys_per_entry; + size_t _values_per_entry; + std::vector<K> _keys; + std::vector<V> _values; + +public: + size_t keys_per_entry() const { return _keys_per_entry; } + size_t values_per_entry() const { return _values_per_entry; } + + struct Tag { + uint32_t id; + static constexpr uint32_t npos() { return uint32_t(-1); } + static Tag make_invalid() { return Tag{npos()}; } + bool valid() const { return (id != npos()); } + }; + + ConstArrayRef<K> get_keys(Tag tag) const { return {&_keys[tag.id * _keys_per_entry], _keys_per_entry}; } + ArrayRef<V> get_values(Tag tag) { return {&_values[tag.id * _values_per_entry], _values_per_entry}; } + ConstArrayRef<V> get_values(Tag tag) const { return {&_values[tag.id * _values_per_entry], _values_per_entry}; } + + struct MyKey { + Tag tag; + uint32_t hash; + }; + + template <typename T> struct AltKey { + ConstArrayRef<T> key; + uint32_t hash; + }; + + struct Hash { + H hash_fun; + template <typename T> uint32_t operator()(ConstArrayRef<T> key) const { + uint32_t h = 0; + for (const T &k: key) { + if constexpr (std::is_pointer_v<T>) { + h = h * 31 + hash_fun(*k); + } else { + h = h * 31 + hash_fun(k); + } + } + return h; + } + uint32_t operator()(const MyKey &key) const { return key.hash; } + template <typename T> uint32_t operator()(const AltKey<T> &key) const { return key.hash; } + }; + + struct Equal { + const ArrayArrayMap &parent; + EQ eq_fun; + Equal(const ArrayArrayMap &parent_in) : parent(parent_in), eq_fun() {} + template <typename T> + bool operator()(const MyKey &a, const AltKey<T> &b) const { + if ((a.hash != b.hash) || (b.key.size() != parent.keys_per_entry())) { + return false; + } + auto a_key = parent.get_keys(a.tag); + for (size_t i = 0; i < a_key.size(); ++i) { + if constexpr (std::is_pointer_v<T>) { + if (!eq_fun(a_key[i], *b.key[i])) { + return false; + } + } else { + if (!eq_fun(a_key[i], b.key[i])) { + return false; + } + } + } + return true; + } + bool operator()(const MyKey &a, const MyKey &b) const { + return operator()(a, AltKey<K>{parent.get_keys(b.tag), b.hash}); + } + }; + + using MapType = vespalib::hash_set<MyKey,Hash,Equal>; + +private: + MapType _map; + Hash _hasher; + + template <typename T> + Tag add_entry(ConstArrayRef<T> key, uint32_t hash) { + uint32_t tag_id = _map.size(); + for (const auto &k: key) { + if constexpr (std::is_pointer_v<T>) { + _keys.push_back(*k); + } else { + _keys.push_back(k); + } + } + _values.resize(_values.size() + _values_per_entry, V{}); + auto [pos, was_inserted] = _map.insert(MyKey{tag_id,hash}); + assert(was_inserted); + return Tag{tag_id}; + } + +public: + ArrayArrayMap(size_t keys_per_entry_in, size_t values_per_entry_in, size_t expected_entries) + : _keys_per_entry(keys_per_entry_in), _values_per_entry(values_per_entry_in), _keys(), _values(), + _map(expected_entries * 2, Hash(), Equal(*this)), _hasher() + { + _keys.reserve(_keys_per_entry * expected_entries); + _values.reserve(_values_per_entry * expected_entries); + static_assert(!std::is_pointer_v<K>, "keys cannot be pointers due to auto-deref of alt keys"); + } + ~ArrayArrayMap(); + + size_t size() const { return _map.size(); } + + template <typename T> + Tag lookup(ConstArrayRef<T> key) const { + auto pos = _map.find(AltKey<T>{key, _hasher(key)}); + if (pos == _map.end()) { + return Tag::make_invalid(); + } + return pos->tag; + } + + template <typename T> + Tag add_entry(ConstArrayRef<T> key) { + return add_entry(key, _hasher(key)); + } + + template <typename T> + std::pair<Tag,bool> lookup_or_add_entry(ConstArrayRef<T> key) { + uint32_t hash = _hasher(key); + auto pos = _map.find(AltKey<T>{key, hash}); + if (pos == _map.end()) { + return {add_entry(key, hash), true}; + } + return {pos->tag, false}; + } + + template <typename F> + void each_entry(F &&f) const { + for (uint32_t i = 0; i < size(); ++i) { + f(get_keys(Tag{i}), get_values(Tag{i})); + } + } +}; + +template <typename K, typename V, typename H, typename EQ> +ArrayArrayMap<K,V,H,EQ>::~ArrayArrayMap() = default; + +} diff --git a/eval/src/vespa/eval/instruction/generic_reduce.cpp b/eval/src/vespa/eval/instruction/generic_reduce.cpp index 305f51ea445..c1cb22fad41 100644 --- a/eval/src/vespa/eval/instruction/generic_reduce.cpp +++ b/eval/src/vespa/eval/instruction/generic_reduce.cpp @@ -3,6 +3,7 @@ #include "generic_reduce.h" #include <vespa/eval/eval/value.h> #include <vespa/eval/eval/wrap_param.h> +#include <vespa/eval/eval/array_array_map.h> #include <vespa/vespalib/util/stash.h> #include <vespa/vespalib/util/typify.h> #include <cassert> @@ -41,17 +42,6 @@ ReduceParam::~ReduceParam() = default; //----------------------------------------------------------------------------- struct SparseReduceState { - - // TODO(havardpe): Optimize using vespalib::hash_map with - // reference to slice of external vector of vespalib::stringref as - // key. Track matching subspaces as linked lists contained in a - // single shared vector. - - using MapKey = std::vector<vespalib::string>; - using MapValue = std::vector<size_t>; - using Map = std::map<MapKey,MapValue>; - - Map map; std::vector<vespalib::stringref> full_address; std::vector<vespalib::stringref*> fetch_address; std::vector<vespalib::stringref*> keep_address; @@ -60,7 +50,8 @@ struct SparseReduceState { SparseReduceState(const SparseReducePlan &plan) : full_address(plan.keep_dims.size() + plan.num_reduce_dims), fetch_address(full_address.size(), nullptr), - keep_address(plan.keep_dims.size(), nullptr) + keep_address(plan.keep_dims.size(), nullptr), + subspace() { for (size_t i = 0; i < keep_address.size(); ++i) { keep_address[i] = &full_address[plan.keep_dims[i]]; @@ -69,54 +60,49 @@ struct SparseReduceState { fetch_address[i] = &full_address[i]; } } + ConstArrayRef<vespalib::stringref*> keep_addr() const { return {keep_address}; } ~SparseReduceState(); - void populate_map(std::unique_ptr<Value::Index::View> full_view) { - full_view->lookup({}); - while (full_view->next_result(fetch_address, subspace)) { - std::vector<vespalib::string> key; - key.reserve(keep_address.size()); - for (const vespalib::stringref *label: keep_address) { - key.emplace_back(*label); - } - map[key].push_back(subspace); - } - } - template <typename T> - std::vector<vespalib::stringref> make_addr(const T &map_entry) { - std::vector<vespalib::stringref> addr; - addr.reserve(map_entry.first.size()); - for (const vespalib::string &label: map_entry.first) { - addr.push_back(label); - } - return addr; - } }; SparseReduceState::~SparseReduceState() = default; template <typename ICT, typename OCT, typename AGGR> Value::UP generic_reduce(const Value &value, const ReduceParam ¶m) { - SparseReduceState sparse(param.sparse_plan); - sparse.populate_map(value.index().create_view({})); auto cells = value.cells().typify<ICT>(); - AGGR aggr; - auto first = [&](size_t idx) { aggr.first(cells[idx]); }; - auto next = [&](size_t idx) { aggr.next(cells[idx]); }; - auto builder = param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.keep_dims.size(), param.dense_plan.out_size, sparse.map.size()); - for (const auto &entry: sparse.map) { - OCT *dst = builder->add_subspace(sparse.make_addr(entry)).begin(); - auto reduce_cells = [&](size_t rel_idx) - { - auto pos = entry.second.begin(); - param.dense_plan.execute_reduce((*pos * param.dense_plan.in_size) + rel_idx, first, next); - for (++pos; pos != entry.second.end(); ++pos) { - param.dense_plan.execute_reduce((*pos * param.dense_plan.in_size) + rel_idx, next); - } - *dst++ = aggr.result(); - }; - param.dense_plan.execute_keep(reduce_cells); + ArrayArrayMap<vespalib::stringref,AGGR> map(param.sparse_plan.keep_dims.size(), + param.dense_plan.out_size, + value.index().size()); + SparseReduceState sparse(param.sparse_plan); + auto full_view = value.index().create_view({}); + full_view->lookup({}); + while (full_view->next_result(sparse.fetch_address, sparse.subspace)) { + auto res = map.lookup_or_add_entry(sparse.keep_addr()); + AGGR *dst = nullptr; + auto first = [&](size_t idx) { (dst++)->first(cells[idx]); }; + auto next = [&](size_t idx) { (dst++)->next(cells[idx]); }; + auto init_aggrs = [&](size_t idx) { + dst = map.get_values(res.first).begin(); + param.dense_plan.execute_keep(idx, first); + }; + auto fill_aggrs = [&](size_t idx) { + dst = map.get_values(res.first).begin(); + param.dense_plan.execute_keep(idx, next); + }; + if (res.second) { + param.dense_plan.execute_reduce((sparse.subspace * param.dense_plan.in_size), init_aggrs, fill_aggrs); + } else { + param.dense_plan.execute_reduce((sparse.subspace * param.dense_plan.in_size), fill_aggrs); + } } - if (sparse.map.empty() && param.sparse_plan.keep_dims.empty()) { + auto builder = param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.keep_dims.size(), param.dense_plan.out_size, map.size()); + map.each_entry([&](const auto &keys, const auto &values) + { + OCT *dst = builder->add_subspace(keys).begin(); + for (const AGGR &aggr: values) { + *dst++ = aggr.result(); + } + }); + if ((map.size() == 0) && param.sparse_plan.keep_dims.empty()) { auto zero = builder->add_subspace({}); for (size_t i = 0; i < zero.size(); ++i) { zero[i] = OCT{}; @@ -135,8 +121,28 @@ void my_generic_reduce_op(State &state, uint64_t param_in) { state.pop_push(result_ref); }; +template <typename ICT, typename AGGR> +void my_full_reduce_op(State &state, uint64_t) { + auto cells = state.peek(0).cells().typify<ICT>(); + if (cells.size() > 0) { + AGGR aggr; + aggr.first(cells[0]); + for (size_t i = 1; i < cells.size(); ++i) { + aggr.next(cells[i]); + } + state.pop_push(state.stash.create<DoubleValue>(aggr.result())); + } else { + state.pop_push(state.stash.create<DoubleValue>(0.0)); + } +}; + struct SelectGenericReduceOp { - template <typename ICT, typename OCT, typename AGGR> static auto invoke() { + template <typename ICT, typename OCT, typename AGGR> static auto invoke(const ReduceParam ¶m) { + if (param.res_type.is_double()) { + bool output_is_double = std::is_same_v<OCT, double>; + assert(output_is_double); + return my_full_reduce_op<ICT, typename AGGR::template templ<ICT>>; + } return my_generic_reduce_op<ICT, OCT, typename AGGR::template templ<ICT>>; } }; @@ -223,7 +229,7 @@ GenericReduce::make_instruction(const ValueType &type, Aggr aggr, const std::vec const ValueBuilderFactory &factory, Stash &stash) { auto ¶m = stash.create<ReduceParam>(type, dimensions, factory); - auto fun = typify_invoke<3,ReduceTypify,SelectGenericReduceOp>(type.cell_type(), param.res_type.cell_type(), aggr); + auto fun = typify_invoke<3,ReduceTypify,SelectGenericReduceOp>(type.cell_type(), param.res_type.cell_type(), aggr, param); return Instruction(fun, wrap_param<ReduceParam>(param)); } diff --git a/eval/src/vespa/eval/instruction/generic_reduce.h b/eval/src/vespa/eval/instruction/generic_reduce.h index 856cfe362ae..47a73820601 100644 --- a/eval/src/vespa/eval/instruction/generic_reduce.h +++ b/eval/src/vespa/eval/instruction/generic_reduce.h @@ -23,8 +23,8 @@ struct DenseReducePlan { std::vector<size_t> reduce_stride; DenseReducePlan(const ValueType &type, const ValueType &res_type); ~DenseReducePlan(); - template <typename F> void execute_keep(const F &f) const { - run_nested_loop(0, keep_loop, keep_stride, f); + template <typename F> void execute_keep(size_t offset, const F &f) const { + run_nested_loop(offset, keep_loop, keep_stride, f); } template <typename F> void execute_reduce(size_t offset, const F &f) const { run_nested_loop(offset, reduce_loop, reduce_stride, f); |