author    Håvard Pettersen <havardpe@oath.com>  2020-10-20 10:23:24 +0000
committer Håvard Pettersen <havardpe@oath.com>  2020-10-20 16:32:02 +0000
commit    90633ae73e68208da7c99b766b03165dedce649c (patch)
tree      e0985e63cf4b534c83555f6e0c0261922c710e3d /eval
parent    36f8d6761ef4bed884c884c7f673e6c7d847af7f (diff)
make generic reduce faster

* use faster map implementation
* use multiple aggregators (invert dense loops)
* handle reducing all cells separately
* extend nested loop benchmarking
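The second bullet is the core change: instead of running the dense reduce loop once per output cell (one aggregator at a time, revisiting the input for every output), the reduce loop becomes the outer loop and one aggregator per output cell is filled as each input position streams by. A minimal standalone sketch of the two loop orders, with hypothetical sizes (keep=4, reduce=1000, keep stride 1) and a plain sum standing in for the real aggregators:

    #include <cstddef>
    #include <vector>

    // Old order: one aggregator, the reduce loop restarts for every output
    // cell, striding through memory (stride 4 here).
    std::vector<double> reduce_per_output(const std::vector<double> &cells) {
        std::vector<double> out(4, 0.0);
        for (size_t k = 0; k < 4; ++k) {         // keep loop (outer)
            double aggr = 0.0;
            for (size_t r = 0; r < 1000; ++r) {  // reduce loop, re-run per output
                aggr += cells[r * 4 + k];
            }
            out[k] = aggr;
        }
        return out;
    }

    // New order (inverted): one aggregator per output cell, every input cell
    // visited exactly once, in memory order.
    std::vector<double> reduce_inverted(const std::vector<double> &cells) {
        std::vector<double> aggrs(4, 0.0);
        for (size_t r = 0; r < 1000; ++r) {      // reduce loop (outer)
            for (size_t k = 0; k < 4; ++k) {     // keep loop fills all aggregators
                aggrs[k] += cells[r * 4 + k];
            }
        }
        return aggrs;
    }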
Diffstat (limited to 'eval')
-rw-r--r--  eval/CMakeLists.txt                                             1
-rw-r--r--  eval/src/tests/eval/array_array_map/CMakeLists.txt             10
-rw-r--r--  eval/src/tests/eval/array_array_map/array_array_map_test.cpp   70
-rw-r--r--  eval/src/tests/eval/nested_loop/nested_loop_bench.cpp         155
-rw-r--r--  eval/src/vespa/eval/eval/CMakeLists.txt                         1
-rw-r--r--  eval/src/vespa/eval/eval/array_array_map.cpp                    7
-rw-r--r--  eval/src/vespa/eval/eval/array_array_map.h                    173
-rw-r--r--  eval/src/vespa/eval/instruction/generic_reduce.cpp            112
-rw-r--r--  eval/src/vespa/eval/instruction/generic_reduce.h                4
9 files changed, 470 insertions, 63 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt
index eb94a9498cb..cd33b051846 100644
--- a/eval/CMakeLists.txt
+++ b/eval/CMakeLists.txt
@@ -12,6 +12,7 @@ vespa_define_module(
TESTS
src/tests/ann
src/tests/eval/aggr
+ src/tests/eval/array_array_map
src/tests/eval/compile_cache
src/tests/eval/compiled_function
src/tests/eval/engine_or_factory
diff --git a/eval/src/tests/eval/array_array_map/CMakeLists.txt b/eval/src/tests/eval/array_array_map/CMakeLists.txt
new file mode 100644
index 00000000000..aef54d0fcb0
--- /dev/null
+++ b/eval/src/tests/eval/array_array_map/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+vespa_add_executable(eval_array_array_map_test_app TEST
+ SOURCES
+ array_array_map_test.cpp
+ DEPENDS
+ vespaeval
+ GTest::GTest
+)
+vespa_add_test(NAME eval_array_array_map_test_app COMMAND eval_array_array_map_test_app)
diff --git a/eval/src/tests/eval/array_array_map/array_array_map_test.cpp b/eval/src/tests/eval/array_array_map/array_array_map_test.cpp
new file mode 100644
index 00000000000..d6a97549b88
--- /dev/null
+++ b/eval/src/tests/eval/array_array_map/array_array_map_test.cpp
@@ -0,0 +1,70 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/eval/eval/array_array_map.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using namespace vespalib;
+using namespace vespalib::eval;
+
+std::vector<int> ints(std::vector<int> vec) { return vec; }
+
+template <typename T>
+ConstArrayRef<T> refs(const std::vector<T> &vec) { return ConstArrayRef<T>(vec); }
+
+//-----------------------------------------------------------------------------
+
+TEST(ArrayArrayMapTest, simple_map_can_be_created_and_used) {
+ // ctor params: 'keys_per_entry', 'values_per_entry', 'expected_entries'
+ ArrayArrayMap<int,int> map(2,3,5);
+ EXPECT_EQ(map.size(), 0);
+ EXPECT_FALSE(map.lookup(refs(ints({1, 2}))).valid());
+ auto tag = map.add_entry(refs(ints({1, 2})));
+ EXPECT_EQ(map.size(), 1);
+ auto values = map.get_values(tag);
+ ASSERT_EQ(values.size(), 3);
+ values[0] = 10;
+ values[1] = 20;
+ values[2] = 30;
+ EXPECT_FALSE(map.lookup(refs(ints({2, 1}))).valid());
+ auto tag2 = map.lookup(refs(ints({1, 2})));
+ ASSERT_TRUE(tag2.valid());
+ EXPECT_EQ(map.get_values(tag2)[1], 20);
+}
+
+TEST(ArrayArrayMapTest, lookup_or_add_entry_works) {
+ ArrayArrayMap<int,int> map(2,3,5);
+ auto res1 = map.lookup_or_add_entry(refs(ints({1, 2})));
+ auto res2 = map.lookup_or_add_entry(refs(ints({1, 2})));
+ EXPECT_TRUE(res1.second);
+ EXPECT_FALSE(res2.second);
+ EXPECT_EQ(map.get_values(res1.first).begin(), map.get_values(res2.first).begin());
+ EXPECT_EQ(map.get_values(res1.first).size(), 3);
+}
+
+TEST(ArrayArrayMapTest, each_entry_works) {
+ ArrayArrayMap<int,int> map(2,3,5);
+ auto res1 = map.add_entry(refs(ints({1, 2})));
+ auto res2 = map.add_entry(refs(ints({2, 1})));
+ map.get_values(res1)[0] = 10;
+ map.get_values(res2)[0] = 20;
+ EXPECT_EQ(map.size(), 2);
+ bool first = true;
+ // Note: insert order is guaranteed here
+ map.each_entry([&](const auto &keys, const auto &values)
+ {
+ if (first) {
+ EXPECT_EQ(keys[0], 1);
+ EXPECT_EQ(keys[1], 2);
+ EXPECT_EQ(values[0], 10);
+ first = false;
+ } else {
+ EXPECT_EQ(keys[0], 2);
+ EXPECT_EQ(keys[1], 1);
+ EXPECT_EQ(values[0], 20);
+ }
+ });
+}
+
+//-----------------------------------------------------------------------------
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp b/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp
index c6e2f278cd4..3fbb80e07e5 100644
--- a/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp
+++ b/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp
@@ -10,7 +10,33 @@ using vespalib::eval::run_nested_loop;
using LIST = std::vector<size_t>;
using call_t = void (*)(const LIST &loop, const LIST &stride);
-void perform_direct(const LIST &loop, const LIST &stride) {
+void perform_direct_1(const LIST &loop, const LIST &stride) {
+ assert((loop.size() == 1) && (stride.size() == 1));
+ size_t idx1 = 0;
+ size_t expect = 0;
+ for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+ assert(idx1 == expect);
+ ++expect;
+ }
+ assert(expect == 4096);
+}
+
+void perform_direct_2(const LIST &loop, const LIST &stride) {
+ assert((loop.size() == 2) && (stride.size() == 2));
+ size_t idx1 = 0;
+ size_t expect = 0;
+ for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+ size_t idx2 = idx1;
+ for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) {
+ assert(idx2 == expect);
+ ++expect;
+ }
+ }
+ assert(expect == 4096);
+}
+
+void perform_direct_3(const LIST &loop, const LIST &stride) {
+ assert((loop.size() == 3) && (stride.size() == 3));
size_t idx1 = 0;
size_t expect = 0;
for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
@@ -26,7 +52,61 @@ void perform_direct(const LIST &loop, const LIST &stride) {
assert(expect == 4096);
}
-void perform_direct_lambda(const LIST &loop, const LIST &stride) {
+void perform_direct_4(const LIST &loop, const LIST &stride) {
+ assert((loop.size() == 4) && (stride.size() == 4));
+ size_t idx1 = 0;
+ size_t expect = 0;
+ for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+ size_t idx2 = idx1;
+ for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) {
+ size_t idx3 = idx2;
+ for (size_t k = 0; k < loop[2]; ++k, idx3 += stride[2]) {
+ size_t idx4 = idx3;
+ for (size_t l = 0; l < loop[3]; ++l, idx4 += stride[3]) {
+ assert(idx4 == expect);
+ ++expect;
+ }
+ }
+ }
+ }
+ assert(expect == 4096);
+}
+
+void perform_direct_lambda_1(const LIST &loop, const LIST &stride) {
+ assert((loop.size() == 1) && (stride.size() == 1));
+ size_t expect = 0;
+ auto fun = [&](size_t idx) {
+ (void) idx;
+ assert(idx == expect);
+ ++expect;
+ };
+ size_t idx1 = 0;
+ for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+ fun(idx1);
+ }
+ assert(expect == 4096);
+}
+
+void perform_direct_lambda_2(const LIST &loop, const LIST &stride) {
+ assert((loop.size() == 2) && (stride.size() == 2));
+ size_t expect = 0;
+ auto fun = [&](size_t idx) {
+ (void) idx;
+ assert(idx == expect);
+ ++expect;
+ };
+ size_t idx1 = 0;
+ for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+ size_t idx2 = idx1;
+ for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) {
+ fun(idx2);
+ }
+ }
+ assert(expect == 4096);
+}
+
+void perform_direct_lambda_3(const LIST &loop, const LIST &stride) {
+ assert((loop.size() == 3) && (stride.size() == 3));
size_t expect = 0;
auto fun = [&](size_t idx) {
(void) idx;
@@ -46,6 +126,30 @@ void perform_direct_lambda(const LIST &loop, const LIST &stride) {
assert(expect == 4096);
}
+void perform_direct_lambda_4(const LIST &loop, const LIST &stride) {
+ assert((loop.size() == 4) && (stride.size() == 4));
+ size_t expect = 0;
+ auto fun = [&](size_t idx) {
+ (void) idx;
+ assert(idx == expect);
+ ++expect;
+ };
+ size_t idx1 = 0;
+ for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+ size_t idx2 = idx1;
+ for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) {
+ size_t idx3 = idx2;
+ for (size_t k = 0; k < loop[2]; ++k, idx3 += stride[2]) {
+ size_t idx4 = idx3;
+ for (size_t l = 0; l < loop[3]; ++l, idx4 += stride[3]) {
+ fun(idx4);
+ }
+ }
+ }
+ }
+ assert(expect == 4096);
+}
+
void perform_generic(const LIST &loop, const LIST &stride) {
size_t expect = 0;
auto fun = [&](size_t idx) {
@@ -70,19 +174,54 @@ void perform_generic_isolate_first(const LIST &loop, const LIST &stride) {
void nop() {}
-double estimate_cost_us(call_t perform_fun) {
+double estimate_cost_1_us(call_t perform_fun) {
+ LIST loop({4096});
+ LIST stride({1});
+ return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 10000, 5.0) * 1000.0 * 1000.0;
+}
+
+double estimate_cost_2_us(call_t perform_fun) {
+ LIST loop({64,64});
+ LIST stride({64,1});
+ return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 10000, 5.0) * 1000.0 * 1000.0;
+}
+
+double estimate_cost_3_us(call_t perform_fun) {
LIST loop({16,16,16});
LIST stride({256,16,1});
- return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 100000, 5.0) * 1000.0 * 1000.0;
+ return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 10000, 5.0) * 1000.0 * 1000.0;
+}
+
+double estimate_cost_4_us(call_t perform_fun) {
+ LIST loop({8,8,8,8});
+ LIST stride({512,64,8,1});
+ return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 10000, 5.0) * 1000.0 * 1000.0;
}
//-----------------------------------------------------------------------------
TEST(NestedLoopBenchmark, single_loop) {
- fprintf(stderr, "manual direct single loop: %g us\n", estimate_cost_us(perform_direct));
- fprintf(stderr, "manual call lambda single loop: %g us\n", estimate_cost_us(perform_direct_lambda));
- fprintf(stderr, "generic single loop: %g us\n", estimate_cost_us(perform_generic));
- fprintf(stderr, "generic single loop (isolate first): %g us\n", estimate_cost_us(perform_generic_isolate_first));
+ fprintf(stderr, "---------------------------------------------------------------\n");
+ fprintf(stderr, "manual direct single loop (1 layer): %g us\n", estimate_cost_1_us(perform_direct_1));
+ fprintf(stderr, "manual call lambda single loop (1 layer): %g us\n", estimate_cost_1_us(perform_direct_lambda_1));
+ fprintf(stderr, "generic single loop (1 layer): %g us\n", estimate_cost_1_us(perform_generic));
+ fprintf(stderr, "generic single loop (1 layer, isolate first): %g us\n", estimate_cost_1_us(perform_generic_isolate_first));
+ fprintf(stderr, "---------------------------------------------------------------\n");
+ fprintf(stderr, "manual direct single loop (2 layers): %g us\n", estimate_cost_2_us(perform_direct_2));
+ fprintf(stderr, "manual call lambda single loop (2 layers): %g us\n", estimate_cost_2_us(perform_direct_lambda_2));
+ fprintf(stderr, "generic single loop (2 layers): %g us\n", estimate_cost_2_us(perform_generic));
+ fprintf(stderr, "generic single loop (2 layers, isolate first): %g us\n", estimate_cost_2_us(perform_generic_isolate_first));
+ fprintf(stderr, "---------------------------------------------------------------\n");
+ fprintf(stderr, "manual direct single loop (3 layers): %g us\n", estimate_cost_3_us(perform_direct_3));
+ fprintf(stderr, "manual call lambda single loop (3 layers): %g us\n", estimate_cost_3_us(perform_direct_lambda_3));
+ fprintf(stderr, "generic single loop (3 layers): %g us\n", estimate_cost_3_us(perform_generic));
+ fprintf(stderr, "generic single loop (3 layers, isolate first): %g us\n", estimate_cost_3_us(perform_generic_isolate_first));
+ fprintf(stderr, "---------------------------------------------------------------\n");
+ fprintf(stderr, "manual direct single loop (4 layers): %g us\n", estimate_cost_4_us(perform_direct_4));
+ fprintf(stderr, "manual call lambda single loop (4 layers): %g us\n", estimate_cost_4_us(perform_direct_lambda_4));
+ fprintf(stderr, "generic single loop (4 layers): %g us\n", estimate_cost_4_us(perform_generic));
+ fprintf(stderr, "generic single loop (4 layers, isolate first): %g us\n", estimate_cost_4_us(perform_generic_isolate_first));
+ fprintf(stderr, "---------------------------------------------------------------\n");
}
//-----------------------------------------------------------------------------
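The benchmark compares hand-rolled loops of fixed depth against the generic driver. The call shape being measured, run_nested_loop(offset, loop, stride, f), invokes f(idx) once per innermost iteration, with idx = offset plus the sum over dimensions of index times stride. A recursive sketch of that contract (illustration only; the actual vespalib implementation is what the benchmark measures, and its "isolate first" variant special-cases the outermost layer):

    #include <cstddef>
    #include <vector>

    // Sketch of a generic N-deep strided loop matching the benchmarked contract.
    template <typename F>
    void run_nested_loop_sketch(size_t offset,
                                const std::vector<size_t> &loop,
                                const std::vector<size_t> &stride,
                                const F &f, size_t level = 0)
    {
        if (level == loop.size()) {
            f(offset);  // one call per innermost iteration
            return;
        }
        for (size_t i = 0; i < loop[level]; ++i, offset += stride[level]) {
            run_nested_loop_sketch(offset, loop, stride, f, level + 1);
        }
    }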
diff --git a/eval/src/vespa/eval/eval/CMakeLists.txt b/eval/src/vespa/eval/eval/CMakeLists.txt
index 5a60adfb58b..6c1f99265a7 100644
--- a/eval/src/vespa/eval/eval/CMakeLists.txt
+++ b/eval/src/vespa/eval/eval/CMakeLists.txt
@@ -2,6 +2,7 @@
vespa_add_library(eval_eval OBJECT
SOURCES
aggr.cpp
+ array_array_map.cpp
basic_nodes.cpp
call_nodes.cpp
compile_tensor_function.cpp
diff --git a/eval/src/vespa/eval/eval/array_array_map.cpp b/eval/src/vespa/eval/eval/array_array_map.cpp
new file mode 100644
index 00000000000..93e17eaddc6
--- /dev/null
+++ b/eval/src/vespa/eval/eval/array_array_map.cpp
@@ -0,0 +1,7 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "array_array_map.h"
+
+namespace vespalib::eval {
+
+}
diff --git a/eval/src/vespa/eval/eval/array_array_map.h b/eval/src/vespa/eval/eval/array_array_map.h
new file mode 100644
index 00000000000..8ccc3492428
--- /dev/null
+++ b/eval/src/vespa/eval/eval/array_array_map.h
@@ -0,0 +1,173 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/util/arrayref.h>
+#include <vespa/vespalib/stllike/hash_set.h>
+#include <vespa/vespalib/stllike/hash_set.hpp>
+#include <vector>
+#include <cassert>
+#include <type_traits>
+
+namespace vespalib::eval {
+
+/**
+ * A map where both keys and values are arrays of some type (K and V
+ * respectively). All map entries have exactly the same number of keys
+ * and exactly the same number of values. Keys and values are stored
+ * in separate vectors external to the map itself in order to reduce
+ * memory fragmentation both by co-locating the keys and values
+ * themselves and also by reducing the internal hash node size. Once
+ * entries are added they cannot be removed. Keys cannot be
+ * overwritten, but values can.
+ **/
+template <typename K, typename V, typename H = vespalib::hash<K>, typename EQ = std::equal_to<> >
+class ArrayArrayMap
+{
+private:
+ size_t _keys_per_entry;
+ size_t _values_per_entry;
+ std::vector<K> _keys;
+ std::vector<V> _values;
+
+public:
+ size_t keys_per_entry() const { return _keys_per_entry; }
+ size_t values_per_entry() const { return _values_per_entry; }
+
+ struct Tag {
+ uint32_t id;
+ static constexpr uint32_t npos() { return uint32_t(-1); }
+ static Tag make_invalid() { return Tag{npos()}; }
+ bool valid() const { return (id != npos()); }
+ };
+
+ ConstArrayRef<K> get_keys(Tag tag) const { return {&_keys[tag.id * _keys_per_entry], _keys_per_entry}; }
+ ArrayRef<V> get_values(Tag tag) { return {&_values[tag.id * _values_per_entry], _values_per_entry}; }
+ ConstArrayRef<V> get_values(Tag tag) const { return {&_values[tag.id * _values_per_entry], _values_per_entry}; }
+
+ struct MyKey {
+ Tag tag;
+ uint32_t hash;
+ };
+
+ template <typename T> struct AltKey {
+ ConstArrayRef<T> key;
+ uint32_t hash;
+ };
+
+ struct Hash {
+ H hash_fun;
+ template <typename T> uint32_t operator()(ConstArrayRef<T> key) const {
+ uint32_t h = 0;
+ for (const T &k: key) {
+ if constexpr (std::is_pointer_v<T>) {
+ h = h * 31 + hash_fun(*k);
+ } else {
+ h = h * 31 + hash_fun(k);
+ }
+ }
+ return h;
+ }
+ uint32_t operator()(const MyKey &key) const { return key.hash; }
+ template <typename T> uint32_t operator()(const AltKey<T> &key) const { return key.hash; }
+ };
+
+ struct Equal {
+ const ArrayArrayMap &parent;
+ EQ eq_fun;
+ Equal(const ArrayArrayMap &parent_in) : parent(parent_in), eq_fun() {}
+ template <typename T>
+ bool operator()(const MyKey &a, const AltKey<T> &b) const {
+ if ((a.hash != b.hash) || (b.key.size() != parent.keys_per_entry())) {
+ return false;
+ }
+ auto a_key = parent.get_keys(a.tag);
+ for (size_t i = 0; i < a_key.size(); ++i) {
+ if constexpr (std::is_pointer_v<T>) {
+ if (!eq_fun(a_key[i], *b.key[i])) {
+ return false;
+ }
+ } else {
+ if (!eq_fun(a_key[i], b.key[i])) {
+ return false;
+ }
+ }
+ }
+ return true;
+ }
+ bool operator()(const MyKey &a, const MyKey &b) const {
+ return operator()(a, AltKey<K>{parent.get_keys(b.tag), b.hash});
+ }
+ };
+
+ using MapType = vespalib::hash_set<MyKey,Hash,Equal>;
+
+private:
+ MapType _map;
+ Hash _hasher;
+
+ template <typename T>
+ Tag add_entry(ConstArrayRef<T> key, uint32_t hash) {
+ uint32_t tag_id = _map.size();
+ for (const auto &k: key) {
+ if constexpr (std::is_pointer_v<T>) {
+ _keys.push_back(*k);
+ } else {
+ _keys.push_back(k);
+ }
+ }
+ _values.resize(_values.size() + _values_per_entry, V{});
+ auto [pos, was_inserted] = _map.insert(MyKey{tag_id,hash});
+ assert(was_inserted);
+ return Tag{tag_id};
+ }
+
+public:
+ ArrayArrayMap(size_t keys_per_entry_in, size_t values_per_entry_in, size_t expected_entries)
+ : _keys_per_entry(keys_per_entry_in), _values_per_entry(values_per_entry_in), _keys(), _values(),
+ _map(expected_entries * 2, Hash(), Equal(*this)), _hasher()
+ {
+ _keys.reserve(_keys_per_entry * expected_entries);
+ _values.reserve(_values_per_entry * expected_entries);
+ static_assert(!std::is_pointer_v<K>, "keys cannot be pointers due to auto-deref of alt keys");
+ }
+ ~ArrayArrayMap();
+
+ size_t size() const { return _map.size(); }
+
+ template <typename T>
+ Tag lookup(ConstArrayRef<T> key) const {
+ auto pos = _map.find(AltKey<T>{key, _hasher(key)});
+ if (pos == _map.end()) {
+ return Tag::make_invalid();
+ }
+ return pos->tag;
+ }
+
+ template <typename T>
+ Tag add_entry(ConstArrayRef<T> key) {
+ return add_entry(key, _hasher(key));
+ }
+
+ template <typename T>
+ std::pair<Tag,bool> lookup_or_add_entry(ConstArrayRef<T> key) {
+ uint32_t hash = _hasher(key);
+ auto pos = _map.find(AltKey<T>{key, hash});
+ if (pos == _map.end()) {
+ return {add_entry(key, hash), true};
+ }
+ return {pos->tag, false};
+ }
+
+ template <typename F>
+ void each_entry(F &&f) const {
+ for (uint32_t i = 0; i < size(); ++i) {
+ f(get_keys(Tag{i}), get_values(Tag{i}));
+ }
+ }
+};
+
+template <typename K, typename V, typename H, typename EQ>
+ArrayArrayMap<K,V,H,EQ>::~ArrayArrayMap() = default;
+
+}
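Putting the pieces together, the intended usage pattern (mirrored by the tests above) is: upsert with lookup_or_add_entry, mutate the zero-initialized value slots in place via get_values, then walk entries in insertion order with each_entry. A minimal group-by-sum sketch against the API defined above (the input layout, two ints of key and one double of payload per row, is invented for illustration):

    #include <vespa/eval/eval/array_array_map.h>

    using vespalib::ConstArrayRef;
    using vespalib::eval::ArrayArrayMap;

    // Group rows by a 2-int key and sum one value column per group.
    void group_by_sum(ConstArrayRef<int> flat_keys,    // 2 ints per row
                      ConstArrayRef<double> row_vals)  // 1 double per row
    {
        size_t rows = row_vals.size();
        ArrayArrayMap<int,double> map(2, 1, rows); // keys/entry, values/entry, expected entries
        for (size_t row = 0; row < rows; ++row) {
            ConstArrayRef<int> key(&flat_keys[row * 2], 2);
            auto [tag, inserted] = map.lookup_or_add_entry(key);
            (void) inserted;
            map.get_values(tag)[0] += row_vals[row]; // value slots start out as V{}
        }
        map.each_entry([](const auto &keys, const auto &values) {
            // keys[0], keys[1] identify the group; values[0] is its sum;
            // entries arrive in insertion order.
            (void) keys; (void) values;
        });
    }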
diff --git a/eval/src/vespa/eval/instruction/generic_reduce.cpp b/eval/src/vespa/eval/instruction/generic_reduce.cpp
index 305f51ea445..c1cb22fad41 100644
--- a/eval/src/vespa/eval/instruction/generic_reduce.cpp
+++ b/eval/src/vespa/eval/instruction/generic_reduce.cpp
@@ -3,6 +3,7 @@
#include "generic_reduce.h"
#include <vespa/eval/eval/value.h>
#include <vespa/eval/eval/wrap_param.h>
+#include <vespa/eval/eval/array_array_map.h>
#include <vespa/vespalib/util/stash.h>
#include <vespa/vespalib/util/typify.h>
#include <cassert>
@@ -41,17 +42,6 @@ ReduceParam::~ReduceParam() = default;
//-----------------------------------------------------------------------------
struct SparseReduceState {
-
- // TODO(havardpe): Optimize using vespalib::hash_map with
- // reference to slice of external vector of vespalib::stringref as
- // key. Track matching subspaces as linked lists contained in a
- // single shared vector.
-
- using MapKey = std::vector<vespalib::string>;
- using MapValue = std::vector<size_t>;
- using Map = std::map<MapKey,MapValue>;
-
- Map map;
std::vector<vespalib::stringref> full_address;
std::vector<vespalib::stringref*> fetch_address;
std::vector<vespalib::stringref*> keep_address;
@@ -60,7 +50,8 @@ struct SparseReduceState {
SparseReduceState(const SparseReducePlan &plan)
: full_address(plan.keep_dims.size() + plan.num_reduce_dims),
fetch_address(full_address.size(), nullptr),
- keep_address(plan.keep_dims.size(), nullptr)
+ keep_address(plan.keep_dims.size(), nullptr),
+ subspace()
{
for (size_t i = 0; i < keep_address.size(); ++i) {
keep_address[i] = &full_address[plan.keep_dims[i]];
@@ -69,54 +60,49 @@ struct SparseReduceState {
fetch_address[i] = &full_address[i];
}
}
+ ConstArrayRef<vespalib::stringref*> keep_addr() const { return {keep_address}; }
~SparseReduceState();
- void populate_map(std::unique_ptr<Value::Index::View> full_view) {
- full_view->lookup({});
- while (full_view->next_result(fetch_address, subspace)) {
- std::vector<vespalib::string> key;
- key.reserve(keep_address.size());
- for (const vespalib::stringref *label: keep_address) {
- key.emplace_back(*label);
- }
- map[key].push_back(subspace);
- }
- }
- template <typename T>
- std::vector<vespalib::stringref> make_addr(const T &map_entry) {
- std::vector<vespalib::stringref> addr;
- addr.reserve(map_entry.first.size());
- for (const vespalib::string &label: map_entry.first) {
- addr.push_back(label);
- }
- return addr;
- }
};
SparseReduceState::~SparseReduceState() = default;
template <typename ICT, typename OCT, typename AGGR>
Value::UP
generic_reduce(const Value &value, const ReduceParam &param) {
- SparseReduceState sparse(param.sparse_plan);
- sparse.populate_map(value.index().create_view({}));
auto cells = value.cells().typify<ICT>();
- AGGR aggr;
- auto first = [&](size_t idx) { aggr.first(cells[idx]); };
- auto next = [&](size_t idx) { aggr.next(cells[idx]); };
- auto builder = param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.keep_dims.size(), param.dense_plan.out_size, sparse.map.size());
- for (const auto &entry: sparse.map) {
- OCT *dst = builder->add_subspace(sparse.make_addr(entry)).begin();
- auto reduce_cells = [&](size_t rel_idx)
- {
- auto pos = entry.second.begin();
- param.dense_plan.execute_reduce((*pos * param.dense_plan.in_size) + rel_idx, first, next);
- for (++pos; pos != entry.second.end(); ++pos) {
- param.dense_plan.execute_reduce((*pos * param.dense_plan.in_size) + rel_idx, next);
- }
- *dst++ = aggr.result();
- };
- param.dense_plan.execute_keep(reduce_cells);
+ ArrayArrayMap<vespalib::stringref,AGGR> map(param.sparse_plan.keep_dims.size(),
+ param.dense_plan.out_size,
+ value.index().size());
+ SparseReduceState sparse(param.sparse_plan);
+ auto full_view = value.index().create_view({});
+ full_view->lookup({});
+ while (full_view->next_result(sparse.fetch_address, sparse.subspace)) {
+ auto res = map.lookup_or_add_entry(sparse.keep_addr());
+ AGGR *dst = nullptr;
+ auto first = [&](size_t idx) { (dst++)->first(cells[idx]); };
+ auto next = [&](size_t idx) { (dst++)->next(cells[idx]); };
+ auto init_aggrs = [&](size_t idx) {
+ dst = map.get_values(res.first).begin();
+ param.dense_plan.execute_keep(idx, first);
+ };
+ auto fill_aggrs = [&](size_t idx) {
+ dst = map.get_values(res.first).begin();
+ param.dense_plan.execute_keep(idx, next);
+ };
+ if (res.second) {
+ param.dense_plan.execute_reduce((sparse.subspace * param.dense_plan.in_size), init_aggrs, fill_aggrs);
+ } else {
+ param.dense_plan.execute_reduce((sparse.subspace * param.dense_plan.in_size), fill_aggrs);
+ }
}
- if (sparse.map.empty() && param.sparse_plan.keep_dims.empty()) {
+ auto builder = param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.keep_dims.size(), param.dense_plan.out_size, map.size());
+ map.each_entry([&](const auto &keys, const auto &values)
+ {
+ OCT *dst = builder->add_subspace(keys).begin();
+ for (const AGGR &aggr: values) {
+ *dst++ = aggr.result();
+ }
+ });
+ if ((map.size() == 0) && param.sparse_plan.keep_dims.empty()) {
auto zero = builder->add_subspace({});
for (size_t i = 0; i < zero.size(); ++i) {
zero[i] = OCT{};
@@ -135,8 +121,28 @@ void my_generic_reduce_op(State &state, uint64_t param_in) {
state.pop_push(result_ref);
};
+template <typename ICT, typename AGGR>
+void my_full_reduce_op(State &state, uint64_t) {
+ auto cells = state.peek(0).cells().typify<ICT>();
+ if (cells.size() > 0) {
+ AGGR aggr;
+ aggr.first(cells[0]);
+ for (size_t i = 1; i < cells.size(); ++i) {
+ aggr.next(cells[i]);
+ }
+ state.pop_push(state.stash.create<DoubleValue>(aggr.result()));
+ } else {
+ state.pop_push(state.stash.create<DoubleValue>(0.0));
+ }
+};
+
struct SelectGenericReduceOp {
- template <typename ICT, typename OCT, typename AGGR> static auto invoke() {
+ template <typename ICT, typename OCT, typename AGGR> static auto invoke(const ReduceParam &param) {
+ if (param.res_type.is_double()) {
+ bool output_is_double = std::is_same_v<OCT, double>;
+ assert(output_is_double);
+ return my_full_reduce_op<ICT, typename AGGR::template templ<ICT>>;
+ }
return my_generic_reduce_op<ICT, OCT, typename AGGR::template templ<ICT>>;
}
};
@@ -223,7 +229,7 @@ GenericReduce::make_instruction(const ValueType &type, Aggr aggr, const std::vec
const ValueBuilderFactory &factory, Stash &stash)
{
auto &param = stash.create<ReduceParam>(type, dimensions, factory);
- auto fun = typify_invoke<3,ReduceTypify,SelectGenericReduceOp>(type.cell_type(), param.res_type.cell_type(), aggr);
+ auto fun = typify_invoke<3,ReduceTypify,SelectGenericReduceOp>(type.cell_type(), param.res_type.cell_type(), aggr, param);
return Instruction(fun, wrap_param<ReduceParam>(param));
}
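Both the inverted sparse/dense path and the new my_full_reduce_op fast path (the "reducing all cells separately" bullet) drive aggregators through the same three-call protocol: first() seeds with the first cell, next() folds in each remaining cell, result() reads the answer; typename AGGR::template templ<ICT> instantiates that protocol for the input cell type. A hypothetical sum aggregator written against the protocol, to make the calls above concrete (not the shipped aggregator from eval/aggr.h):

    // Minimal aggregator matching the first/next/result protocol used above.
    template <typename T>
    struct SumSketch {
        T value{};
        void first(T cell) { value = cell; }   // seed with the first cell
        void next(T cell)  { value += cell; }  // fold in remaining cells
        T result() const   { return value; }
    };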
diff --git a/eval/src/vespa/eval/instruction/generic_reduce.h b/eval/src/vespa/eval/instruction/generic_reduce.h
index 856cfe362ae..47a73820601 100644
--- a/eval/src/vespa/eval/instruction/generic_reduce.h
+++ b/eval/src/vespa/eval/instruction/generic_reduce.h
@@ -23,8 +23,8 @@ struct DenseReducePlan {
std::vector<size_t> reduce_stride;
DenseReducePlan(const ValueType &type, const ValueType &res_type);
~DenseReducePlan();
- template <typename F> void execute_keep(const F &f) const {
- run_nested_loop(0, keep_loop, keep_stride, f);
+ template <typename F> void execute_keep(size_t offset, const F &f) const {
+ run_nested_loop(offset, keep_loop, keep_stride, f);
}
template <typename F> void execute_reduce(size_t offset, const F &f) const {
run_nested_loop(offset, reduce_loop, reduce_stride, f);
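The only interface change here is threading a base offset through execute_keep, which is what lets generic_reduce.cpp anchor the keep loop at each position of the (now outer) reduce loop. Schematically, for a newly seen sparse subspace (a sketch with the two plan loops flattened to plain offset lists; the real code composes execute_reduce and execute_keep instead):

    #include <cstddef>
    #include <vector>

    // First reduce position seeds every aggregator via first(); the remaining
    // positions fold in via next(), mirroring init_aggrs/fill_aggrs above.
    template <typename AGGR, typename CT>
    void reduce_subspace_sketch(const CT *cells, size_t base,
                                const std::vector<size_t> &reduce_offsets, // one per reduce position
                                const std::vector<size_t> &keep_offsets,   // one per output cell
                                std::vector<AGGR> &aggrs)
    {
        for (size_t r = 0; r < reduce_offsets.size(); ++r) {
            AGGR *dst = aggrs.data();
            for (size_t k : keep_offsets) {
                size_t idx = base + reduce_offsets[r] + k;
                if (r == 0) { (dst++)->first(cells[idx]); }
                else        { (dst++)->next(cells[idx]); }
            }
        }
    }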