make generic reduce faster

* use faster map implementation * use multiple aggregators (invert dense loops) * handle reducing all cells separately * extend nested loop benchmarking
author: Håvard Pettersen <havardpe@oath.com> 2020-10-20 10:23:24 +0000
committer: Håvard Pettersen <havardpe@oath.com> 2020-10-20 16:32:02 +0000
commit: 90633ae73e68208da7c99b766b03165dedce649c (patch)
tree: e0985e63cf4b534c83555f6e0c0261922c710e3d /eval
parent: 36f8d6761ef4bed884c884c7f673e6c7d847af7f (diff)
9 files changed, 470 insertions, 63 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt
index eb94a9498cb..cd33b051846 100644
--- a/eval/CMakeLists.txt
+++ b/eval/CMakeLists.txt
@@ -12,6 +12,7 @@ vespa_define_module(
     TESTS
     src/tests/ann
     src/tests/eval/aggr
+    src/tests/eval/array_array_map
     src/tests/eval/compile_cache
     src/tests/eval/compiled_function
     src/tests/eval/engine_or_factory
diff --git a/eval/src/tests/eval/array_array_map/CMakeLists.txt b/eval/src/tests/eval/array_array_map/CMakeLists.txt
new file mode 100644
index 00000000000..aef54d0fcb0
--- /dev/null
+++ b/eval/src/tests/eval/array_array_map/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+vespa_add_executable(eval_array_array_map_test_app TEST
+    SOURCES
+    array_array_map_test.cpp
+    DEPENDS
+    vespaeval
+    GTest::GTest
+)
+vespa_add_test(NAME eval_array_array_map_test_app COMMAND eval_array_array_map_test_app)
diff --git a/eval/src/tests/eval/array_array_map/array_array_map_test.cpp b/eval/src/tests/eval/array_array_map/array_array_map_test.cpp
new file mode 100644
index 00000000000..d6a97549b88
--- /dev/null
+++ b/eval/src/tests/eval/array_array_map/array_array_map_test.cpp
@@ -0,0 +1,70 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/eval/eval/array_array_map.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using namespace vespalib;
+using namespace vespalib::eval;
+
+std::vector<int> ints(std::vector<int> vec) { return vec; }
+
+template <typename T>
+ConstArrayRef<T> refs(const std::vector<T> &vec) { return ConstArrayRef<T>(vec); }
+
+//-----------------------------------------------------------------------------
+
+TEST(ArrayArrayMapTest, simple_map_can_be_created_and_used) {
+    // ctor params: 'keys_per_entry', 'values_per_entry', 'expected_entries'
+    ArrayArrayMap<int,int> map(2,3,5);
+    EXPECT_EQ(map.size(), 0);
+    EXPECT_FALSE(map.lookup(refs(ints({1, 2}))).valid());
+    auto tag = map.add_entry(refs(ints({1, 2})));
+    EXPECT_EQ(map.size(), 1);
+    auto values = map.get_values(tag);
+    ASSERT_EQ(values.size(), 3);
+    values[0] = 10;
+    values[1] = 20;
+    values[2] = 30;
+    EXPECT_FALSE(map.lookup(refs(ints({2, 1}))).valid());
+    auto tag2 = map.lookup(refs(ints({1, 2})));
+    ASSERT_TRUE(tag2.valid());
+    EXPECT_EQ(map.get_values(tag2)[1], 20);
+}
+
+TEST(ArrayArrayMapTest, lookup_or_add_entry_works) {
+    ArrayArrayMap<int,int> map(2,3,5);
+    auto res1 = map.lookup_or_add_entry(refs(ints({1, 2})));
+    auto res2 = map.lookup_or_add_entry(refs(ints({1, 2})));
+    EXPECT_TRUE(res1.second);
+    EXPECT_FALSE(res2.second);
+    EXPECT_EQ(map.get_values(res1.first).begin(), map.get_values(res2.first).begin());
+    EXPECT_EQ(map.get_values(res1.first).size(), 3);
+}
+
+TEST(ArrayArrayMapTest, each_entry_works) {
+    ArrayArrayMap<int,int> map(2,3,5);
+    auto res1 = map.add_entry(refs(ints({1, 2})));
+    auto res2 = map.add_entry(refs(ints({2, 1})));
+    map.get_values(res1)[0] = 10;
+    map.get_values(res2)[0] = 20;
+    EXPECT_EQ(map.size(), 2);
+    bool first = true;
+    // Note: insert order is guaranteed here
+    map.each_entry([&](const auto &keys, const auto &values)
+                   {
+                       if (first) {
+                           EXPECT_EQ(keys[0], 1);
+                           EXPECT_EQ(keys[1], 2);
+                           EXPECT_EQ(values[0], 10);
+                           first = false;
+                       } else {
+                           EXPECT_EQ(keys[0], 2);
+                           EXPECT_EQ(keys[1], 1);
+                           EXPECT_EQ(values[0], 20);
+                       } 
+                   });
+}
+
+//-----------------------------------------------------------------------------
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp b/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp
index c6e2f278cd4..3fbb80e07e5 100644
--- a/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp
+++ b/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp
@@ -10,7 +10,33 @@ using vespalib::eval::run_nested_loop;
 using LIST = std::vector<size_t>;
 using call_t = void (*)(const LIST &loop, const LIST &stride);
 
-void perform_direct(const LIST &loop, const LIST &stride) {
+void perform_direct_1(const LIST &loop, const LIST &stride) {
+    assert((loop.size() == 1) && (stride.size() == 1));
+    size_t idx1 = 0;
+    size_t expect = 0;
+    for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+        assert(idx1 == expect);
+        ++expect;
+    }
+    assert(expect == 4096);
+}
+
+void perform_direct_2(const LIST &loop, const LIST &stride) {
+    assert((loop.size() == 2) && (stride.size() == 2));
+    size_t idx1 = 0;
+    size_t expect = 0;
+    for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+        size_t idx2 = idx1;
+        for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) {
+            assert(idx2 == expect);
+            ++expect;
+        }
+    }
+    assert(expect == 4096);
+}
+
+void perform_direct_3(const LIST &loop, const LIST &stride) {
+    assert((loop.size() == 3) && (stride.size() == 3));
     size_t idx1 = 0;
     size_t expect = 0;
     for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
@@ -26,7 +52,61 @@ void perform_direct(const LIST &loop, const LIST &stride) {
     assert(expect == 4096);
 }
 
-void perform_direct_lambda(const LIST &loop, const LIST &stride) {
+void perform_direct_4(const LIST &loop, const LIST &stride) {
+    assert((loop.size() == 4) && (stride.size() == 4));
+    size_t idx1 = 0;
+    size_t expect = 0;
+    for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+        size_t idx2 = idx1;
+        for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) {
+            size_t idx3 = idx2;
+            for (size_t k = 0; k < loop[2]; ++k, idx3 += stride[2]) {
+                size_t idx4 = idx3;
+                for (size_t l = 0; l < loop[3]; ++l, idx4 += stride[3]) {
+                    assert(idx4 == expect);
+                    ++expect;
+                }
+            }
+        }
+    }
+    assert(expect == 4096);
+}
+
+void perform_direct_lambda_1(const LIST &loop, const LIST &stride) {
+    assert((loop.size() == 1) && (stride.size() == 1));
+    size_t expect = 0;
+    auto fun = [&](size_t idx) {
+        (void) idx;
+        assert(idx == expect);
+        ++expect;
+    };
+    size_t idx1 = 0;
+    for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+        fun(idx1);
+    }
+    assert(expect == 4096);
+}
+
+void perform_direct_lambda_2(const LIST &loop, const LIST &stride) {
+    assert((loop.size() == 2) && (stride.size() == 2));
+    size_t expect = 0;
+    auto fun = [&](size_t idx) {
+        (void) idx;
+        assert(idx == expect);
+        ++expect;
+    };
+    size_t idx1 = 0;
+    for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+        size_t idx2 = idx1;
+        for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) {
+            fun(idx2);
+        }
+    }
+    assert(expect == 4096);
+}
+
+void perform_direct_lambda_3(const LIST &loop, const LIST &stride) {
+    assert((loop.size() == 3) && (stride.size() == 3));
     size_t expect = 0;
     auto fun = [&](size_t idx) {
         (void) idx;
@@ -46,6 +126,30 @@ void perform_direct_lambda(const LIST &loop, const LIST &stride) {
     assert(expect == 4096);
 }
 
+void perform_direct_lambda_4(const LIST &loop, const LIST &stride) {
+    assert((loop.size() == 4) && (stride.size() == 4));
+    size_t expect = 0;
+    auto fun = [&](size_t idx) {
+        (void) idx;
+        assert(idx == expect);
+        ++expect;
+    };
+    size_t idx1 = 0;
+    for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+        size_t idx2 = idx1;
+        for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) {
+            size_t idx3 = idx2;
+            for (size_t k = 0; k < loop[2]; ++k, idx3 += stride[2]) {
+                size_t idx4 = idx3;
+                for (size_t l = 0; l < loop[3]; ++l, idx4 += stride[3]) {
+                    fun(idx4);
+                }
+            }
+        }
+    }
+    assert(expect == 4096);
+}
+
 void perform_generic(const LIST &loop, const LIST &stride) {
     size_t expect = 0;
     auto fun = [&](size_t idx) {
@@ -70,19 +174,54 @@ void perform_generic_isolate_first(const LIST &loop, const LIST &stride) {
 
 void nop() {}
 
-double estimate_cost_us(call_t perform_fun) {
+double estimate_cost_1_us(call_t perform_fun) {
+    LIST loop({4096});
+    LIST stride({1});
+    return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 10000, 5.0) * 1000.0 * 1000.0;
+}
+
+double estimate_cost_2_us(call_t perform_fun) {
+    LIST loop({64,64});
+    LIST stride({64,1});
+    return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 10000, 5.0) * 1000.0 * 1000.0;
+}
+
+double estimate_cost_3_us(call_t perform_fun) {
     LIST loop({16,16,16});
     LIST stride({256,16,1});
-    return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 100000, 5.0) * 1000.0 * 1000.0;
+    return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 10000, 5.0) * 1000.0 * 1000.0;
+}
+
+double estimate_cost_4_us(call_t perform_fun) {
+    LIST loop({8,8,8,8});
+    LIST stride({512,64,8,1});
+    return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 10000, 5.0) * 1000.0 * 1000.0;
 }
 
 //-----------------------------------------------------------------------------
 
 TEST(NestedLoopBenchmark, single_loop) {
-    fprintf(stderr, "manual direct single loop: %g us\n", estimate_cost_us(perform_direct));
-    fprintf(stderr, "manual call lambda single loop: %g us\n", estimate_cost_us(perform_direct_lambda));
-    fprintf(stderr, "generic single loop: %g us\n", estimate_cost_us(perform_generic));
-    fprintf(stderr, "generic single loop (isolate first): %g us\n", estimate_cost_us(perform_generic_isolate_first));
+    fprintf(stderr, "---------------------------------------------------------------\n");
+    fprintf(stderr, "manual direct single loop (1 layer): %g us\n", estimate_cost_1_us(perform_direct_1));
+    fprintf(stderr, "manual call lambda single loop (1 layer): %g us\n", estimate_cost_1_us(perform_direct_lambda_1));
+    fprintf(stderr, "generic single loop (1 layer): %g us\n", estimate_cost_1_us(perform_generic));
+    fprintf(stderr, "generic single loop (1 layer, isolate first): %g us\n", estimate_cost_1_us(perform_generic_isolate_first));
+    fprintf(stderr, "---------------------------------------------------------------\n");
+    fprintf(stderr, "manual direct single loop (2 layers): %g us\n", estimate_cost_2_us(perform_direct_2));
+    fprintf(stderr, "manual call lambda single loop (2 layers): %g us\n", estimate_cost_2_us(perform_direct_lambda_2));
+    fprintf(stderr, "generic single loop (2 layers): %g us\n", estimate_cost_2_us(perform_generic));
+    fprintf(stderr, "generic single loop (2 layers, isolate first): %g us\n", estimate_cost_2_us(perform_generic_isolate_first));
+    fprintf(stderr, "---------------------------------------------------------------\n");
+    fprintf(stderr, "manual direct single loop (3 layers): %g us\n", estimate_cost_3_us(perform_direct_3));
+    fprintf(stderr, "manual call lambda single loop (3 layers): %g us\n", estimate_cost_3_us(perform_direct_lambda_3));
+    fprintf(stderr, "generic single loop (3 layers): %g us\n", estimate_cost_3_us(perform_generic));
+    fprintf(stderr, "generic single loop (3 layers, isolate first): %g us\n", estimate_cost_3_us(perform_generic_isolate_first));
+    fprintf(stderr, "---------------------------------------------------------------\n");
+    fprintf(stderr, "manual direct single loop (4 layers): %g us\n", estimate_cost_4_us(perform_direct_4));
+    fprintf(stderr, "manual call lambda single loop (4 layers): %g us\n", estimate_cost_4_us(perform_direct_lambda_4));
+    fprintf(stderr, "generic single loop (4 layers): %g us\n", estimate_cost_4_us(perform_generic));
+    fprintf(stderr, "generic single loop (4 layers, isolate first): %g us\n", estimate_cost_4_us(perform_generic_isolate_first));
+    fprintf(stderr, "---------------------------------------------------------------\n");
 }
 
 //-----------------------------------------------------------------------------
diff --git a/eval/src/vespa/eval/eval/CMakeLists.txt b/eval/src/vespa/eval/eval/CMakeLists.txt
index 5a60adfb58b..6c1f99265a7 100644
--- a/eval/src/vespa/eval/eval/CMakeLists.txt
+++ b/eval/src/vespa/eval/eval/CMakeLists.txt
@@ -2,6 +2,7 @@
 vespa_add_library(eval_eval OBJECT
     SOURCES
     aggr.cpp
+    array_array_map.cpp
     basic_nodes.cpp
     call_nodes.cpp
     compile_tensor_function.cpp
diff --git a/eval/src/vespa/eval/eval/array_array_map.cpp b/eval/src/vespa/eval/eval/array_array_map.cpp
new file mode 100644
index 00000000000..93e17eaddc6
--- /dev/null
+++ b/eval/src/vespa/eval/eval/array_array_map.cpp
@@ -0,0 +1,7 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "array_array_map.h"
+
+namespace vespalib::eval {
+
+}
diff --git a/eval/src/vespa/eval/eval/array_array_map.h b/eval/src/vespa/eval/eval/array_array_map.h
new file mode 100644
index 00000000000..8ccc3492428
--- /dev/null
+++ b/eval/src/vespa/eval/eval/array_array_map.h
@@ -0,0 +1,173 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/util/arrayref.h>
+#include <vespa/vespalib/stllike/hash_set.h>
+#include <vespa/vespalib/stllike/hash_set.hpp>
+#include <vector>
+#include <cassert>
+#include <type_traits>
+
+namespace vespalib::eval {
+
+/**
+ * A map where both keys and values are arrays of some type (K and V
+ * respectively). All map entries have exactly the same number of keys
+ * and exactly the same number of values. Keys and values are stored
+ * in separate vectors external to the map itself in order to reduce
+ * memory fragmentation both by co-locating the keys and values
+ * themselves and also by reducing the internal hash node size. Once
+ * entries are added they cannot be removed. Keys cannot be
+ * overwritten, but values can.
+ **/
+template <typename K, typename V, typename H = vespalib::hash<K>, typename EQ = std::equal_to<> >
+class ArrayArrayMap
+{
+private:
+    size_t _keys_per_entry;
+    size_t _values_per_entry;
+    std::vector<K> _keys;
+    std::vector<V> _values;
+
+public:
+    size_t keys_per_entry() const { return _keys_per_entry; }
+    size_t values_per_entry() const { return _values_per_entry; }
+
+    struct Tag {
+        uint32_t id;
+        static constexpr uint32_t npos() { return uint32_t(-1); }
+        static Tag make_invalid() { return Tag{npos()}; }
+        bool valid() const { return (id != npos()); }
+    };
+
+    ConstArrayRef<K> get_keys(Tag tag) const { return {&_keys[tag.id * _keys_per_entry], _keys_per_entry}; }
+    ArrayRef<V> get_values(Tag tag) { return {&_values[tag.id * _values_per_entry], _values_per_entry}; }
+    ConstArrayRef<V> get_values(Tag tag) const { return {&_values[tag.id * _values_per_entry], _values_per_entry}; }
+
+    struct MyKey {
+        Tag tag;
+        uint32_t hash;
+    };
+
+    template <typename T> struct AltKey {
+        ConstArrayRef<T> key;
+        uint32_t hash;
+    };
+
+    struct Hash {
+        H hash_fun;
+        template <typename T> uint32_t operator()(ConstArrayRef<T> key) const {
+            uint32_t h = 0;
+            for (const T &k: key) {
+                if constexpr (std::is_pointer_v<T>) {
+                    h = h * 31 + hash_fun(*k);
+                } else {
+                    h = h * 31 + hash_fun(k);
+                }
+            }
+            return h;
+        }
+        uint32_t operator()(const MyKey &key) const { return key.hash; }
+        template <typename T> uint32_t operator()(const AltKey<T> &key) const { return key.hash; }
+    };
+
+    struct Equal {
+        const ArrayArrayMap &parent;
+        EQ eq_fun;
+        Equal(const ArrayArrayMap &parent_in) : parent(parent_in), eq_fun() {}
+        template <typename T>
+        bool operator()(const MyKey &a, const AltKey<T> &b) const {
+            if ((a.hash != b.hash) || (b.key.size() != parent.keys_per_entry())) {
+                return false;
+            }
+            auto a_key = parent.get_keys(a.tag);
+            for (size_t i = 0; i < a_key.size(); ++i) {
+                if constexpr (std::is_pointer_v<T>) {
+                    if (!eq_fun(a_key[i], *b.key[i])) {
+                        return false;
+                    }
+                } else {
+                    if (!eq_fun(a_key[i], b.key[i])) {
+                        return false;
+                    }
+                }
+            }
+            return true;
+        }
+        bool operator()(const MyKey &a, const MyKey &b) const {
+            return operator()(a, AltKey<K>{parent.get_keys(b.tag), b.hash});
+        }
+    };
+
+    using MapType = vespalib::hash_set<MyKey,Hash,Equal>;
+
+private:
+    MapType _map;
+    Hash _hasher;
+
+    template <typename T>
+    Tag add_entry(ConstArrayRef<T> key, uint32_t hash) {
+        uint32_t tag_id = _map.size();
+        for (const auto &k: key) {
+            if constexpr (std::is_pointer_v<T>) {
+                _keys.push_back(*k);
+            } else {
+                _keys.push_back(k);
+            }
+        }
+        _values.resize(_values.size() + _values_per_entry, V{});
+        auto [pos, was_inserted] = _map.insert(MyKey{tag_id,hash});
+        assert(was_inserted);
+        return Tag{tag_id};
+    }
+
+public:
+    ArrayArrayMap(size_t keys_per_entry_in, size_t values_per_entry_in, size_t expected_entries)
+        : _keys_per_entry(keys_per_entry_in), _values_per_entry(values_per_entry_in), _keys(), _values(),
+          _map(expected_entries * 2, Hash(), Equal(*this)), _hasher()
+    {
+        _keys.reserve(_keys_per_entry * expected_entries);
+        _values.reserve(_values_per_entry * expected_entries);
+        static_assert(!std::is_pointer_v<K>, "keys cannot be pointers due to auto-deref of alt keys");
+    }
+    ~ArrayArrayMap();
+
+    size_t size() const { return _map.size(); }
+
+    template <typename T>
+    Tag lookup(ConstArrayRef<T> key) const {
+        auto pos = _map.find(AltKey<T>{key, _hasher(key)});
+        if (pos == _map.end()) {
+            return Tag::make_invalid();
+        }
+        return pos->tag;
+    }
+
+    template <typename T>
+    Tag add_entry(ConstArrayRef<T> key) {
+        return add_entry(key, _hasher(key));
+    }
+
+    template <typename T>
+    std::pair<Tag,bool> lookup_or_add_entry(ConstArrayRef<T> key) {
+        uint32_t hash = _hasher(key);
+        auto pos = _map.find(AltKey<T>{key, hash});
+        if (pos == _map.end()) {
+            return {add_entry(key, hash), true};
+        }
+        return {pos->tag, false};
+    }
+
+    template <typename F>
+    void each_entry(F &&f) const {
+        for (uint32_t i = 0; i < size(); ++i) {
+            f(get_keys(Tag{i}), get_values(Tag{i}));
+        }
+    }
+};
+
+template <typename K, typename V, typename H, typename EQ>
+ArrayArrayMap<K,V,H,EQ>::~ArrayArrayMap() = default;
+
+}
diff --git a/eval/src/vespa/eval/instruction/generic_reduce.cpp b/eval/src/vespa/eval/instruction/generic_reduce.cpp
index 305f51ea445..c1cb22fad41 100644
--- a/eval/src/vespa/eval/instruction/generic_reduce.cpp
+++ b/eval/src/vespa/eval/instruction/generic_reduce.cpp
@@ -3,6 +3,7 @@
 #include "generic_reduce.h"
 #include <vespa/eval/eval/value.h>
 #include <vespa/eval/eval/wrap_param.h>
+#include <vespa/eval/eval/array_array_map.h>
 #include <vespa/vespalib/util/stash.h>
 #include <vespa/vespalib/util/typify.h>
 #include <cassert>
@@ -41,17 +42,6 @@ ReduceParam::~ReduceParam() = default;
 //-----------------------------------------------------------------------------
 
 struct SparseReduceState {
-
-    // TODO(havardpe): Optimize using vespalib::hash_map with
-    // reference to slice of external vector of vespalib::stringref as
-    // key. Track matching subspaces as linked lists contained in a
-    // single shared vector.
-
-    using MapKey = std::vector<vespalib::string>;
-    using MapValue = std::vector<size_t>;
-    using Map = std::map<MapKey,MapValue>;
-
-    Map                               map;
     std::vector<vespalib::stringref>  full_address;
     std::vector<vespalib::stringref*> fetch_address;
     std::vector<vespalib::stringref*> keep_address;
@@ -60,7 +50,8 @@ struct SparseReduceState {
     SparseReduceState(const SparseReducePlan &plan)
         : full_address(plan.keep_dims.size() + plan.num_reduce_dims),
           fetch_address(full_address.size(), nullptr),
-          keep_address(plan.keep_dims.size(), nullptr)
+          keep_address(plan.keep_dims.size(), nullptr),
+          subspace()
     {
         for (size_t i = 0; i < keep_address.size(); ++i) {
             keep_address[i] = &full_address[plan.keep_dims[i]];
@@ -69,54 +60,49 @@ struct SparseReduceState {
             fetch_address[i] = &full_address[i];
         }
     }
+    ConstArrayRef<vespalib::stringref*> keep_addr() const { return {keep_address}; }
     ~SparseReduceState();
-    void populate_map(std::unique_ptr<Value::Index::View> full_view) {
-        full_view->lookup({});
-        while (full_view->next_result(fetch_address, subspace)) {
-            std::vector<vespalib::string> key;
-            key.reserve(keep_address.size());
-            for (const vespalib::stringref *label: keep_address) {
-                key.emplace_back(*label);
-            }
-            map[key].push_back(subspace);
-        }
-    }
-    template <typename T>
-    std::vector<vespalib::stringref> make_addr(const T &map_entry) {
-        std::vector<vespalib::stringref> addr;
-        addr.reserve(map_entry.first.size());
-        for (const vespalib::string &label: map_entry.first) {
-            addr.push_back(label);
-        }
-        return addr;
-    }
 };
 SparseReduceState::~SparseReduceState() = default;
 
 template <typename ICT, typename OCT, typename AGGR>
 Value::UP
 generic_reduce(const Value &value, const ReduceParam &param) {
-    SparseReduceState sparse(param.sparse_plan);
-    sparse.populate_map(value.index().create_view({}));
     auto cells = value.cells().typify<ICT>();
-    AGGR aggr;
-    auto first = [&](size_t idx) { aggr.first(cells[idx]); };
-    auto next = [&](size_t idx) { aggr.next(cells[idx]); };
-    auto builder = param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.keep_dims.size(), param.dense_plan.out_size, sparse.map.size());
-    for (const auto &entry: sparse.map) {
-        OCT *dst = builder->add_subspace(sparse.make_addr(entry)).begin();
-        auto reduce_cells = [&](size_t rel_idx)
-                            {
-                                auto pos = entry.second.begin();
-                                param.dense_plan.execute_reduce((*pos * param.dense_plan.in_size) + rel_idx, first, next);
-                                for (++pos; pos != entry.second.end(); ++pos) {
-                                    param.dense_plan.execute_reduce((*pos * param.dense_plan.in_size) + rel_idx, next);
-                                }
-                                *dst++ = aggr.result();
-                            };
-        param.dense_plan.execute_keep(reduce_cells);
+    ArrayArrayMap<vespalib::stringref,AGGR> map(param.sparse_plan.keep_dims.size(),
+                                                param.dense_plan.out_size,
+                                                value.index().size());
+    SparseReduceState sparse(param.sparse_plan);
+    auto full_view = value.index().create_view({});
+    full_view->lookup({});
+    while (full_view->next_result(sparse.fetch_address, sparse.subspace)) {
+        auto res = map.lookup_or_add_entry(sparse.keep_addr());
+        AGGR *dst = nullptr;
+        auto first = [&](size_t idx) { (dst++)->first(cells[idx]); };
+        auto next = [&](size_t idx) { (dst++)->next(cells[idx]); };
+        auto init_aggrs = [&](size_t idx) {
+            dst = map.get_values(res.first).begin();
+            param.dense_plan.execute_keep(idx, first);
+        };
+        auto fill_aggrs = [&](size_t idx) {
+            dst = map.get_values(res.first).begin();
+            param.dense_plan.execute_keep(idx, next);
+        };
+        if (res.second) {
+            param.dense_plan.execute_reduce((sparse.subspace * param.dense_plan.in_size), init_aggrs, fill_aggrs);
+        } else {
+            param.dense_plan.execute_reduce((sparse.subspace * param.dense_plan.in_size), fill_aggrs);
+        }
     }
-    if (sparse.map.empty() && param.sparse_plan.keep_dims.empty()) {
+    auto builder = param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.keep_dims.size(), param.dense_plan.out_size, map.size());
+    map.each_entry([&](const auto &keys, const auto &values)
+                   {
+                       OCT *dst = builder->add_subspace(keys).begin();
+                       for (const AGGR &aggr: values) {
+                           *dst++ = aggr.result();
+                       }
+                   });
+    if ((map.size() == 0) && param.sparse_plan.keep_dims.empty()) {
         auto zero = builder->add_subspace({});
         for (size_t i = 0; i < zero.size(); ++i) {
             zero[i] = OCT{};
@@ -135,8 +121,28 @@ void my_generic_reduce_op(State &state, uint64_t param_in) {
     state.pop_push(result_ref);
 };
 
+template <typename ICT, typename AGGR>
+void my_full_reduce_op(State &state, uint64_t) {
+    auto cells = state.peek(0).cells().typify<ICT>();
+    if (cells.size() > 0) {
+        AGGR aggr;
+        aggr.first(cells[0]);
+        for (size_t i = 1; i < cells.size(); ++i) {
+            aggr.next(cells[i]);
+        }
+        state.pop_push(state.stash.create<DoubleValue>(aggr.result()));
+    } else {
+        state.pop_push(state.stash.create<DoubleValue>(0.0));
+    }
+};
+
 struct SelectGenericReduceOp {
-    template <typename ICT, typename OCT, typename AGGR> static auto invoke() {
+    template <typename ICT, typename OCT, typename AGGR> static auto invoke(const ReduceParam &param) {
+        if (param.res_type.is_double()) {
+            bool output_is_double = std::is_same_v<OCT, double>;
+            assert(output_is_double);
+            return my_full_reduce_op<ICT, typename AGGR::template templ<ICT>>;
+        }
         return my_generic_reduce_op<ICT, OCT, typename AGGR::template templ<ICT>>;
     }
 };
@@ -223,7 +229,7 @@ GenericReduce::make_instruction(const ValueType &type, Aggr aggr, const std::vec
                                 const ValueBuilderFactory &factory, Stash &stash)
 {
     auto &param = stash.create<ReduceParam>(type, dimensions, factory);
-    auto fun = typify_invoke<3,ReduceTypify,SelectGenericReduceOp>(type.cell_type(), param.res_type.cell_type(), aggr);
+    auto fun = typify_invoke<3,ReduceTypify,SelectGenericReduceOp>(type.cell_type(), param.res_type.cell_type(), aggr, param);
     return Instruction(fun, wrap_param<ReduceParam>(param));
 }
 
diff --git a/eval/src/vespa/eval/instruction/generic_reduce.h b/eval/src/vespa/eval/instruction/generic_reduce.h
index 856cfe362ae..47a73820601 100644
--- a/eval/src/vespa/eval/instruction/generic_reduce.h
+++ b/eval/src/vespa/eval/instruction/generic_reduce.h
@@ -23,8 +23,8 @@ struct DenseReducePlan {
     std::vector<size_t> reduce_stride;
     DenseReducePlan(const ValueType &type, const ValueType &res_type);
     ~DenseReducePlan();
-    template <typename F> void execute_keep(const F &f) const {
-        run_nested_loop(0, keep_loop, keep_stride, f);
+    template <typename F> void execute_keep(size_t offset, const F &f) const {
+        run_nested_loop(offset, keep_loop, keep_stride, f);
     }
     template <typename F> void execute_reduce(size_t offset, const F &f) const {
         run_nested_loop(offset, reduce_loop, reduce_stride, f);
author	Håvard Pettersen <havardpe@oath.com>	2020-10-20 10:23:24 +0000
committer	Håvard Pettersen <havardpe@oath.com>	2020-10-20 16:32:02 +0000
commit	90633ae73e68208da7c99b766b03165dedce649c (patch)
tree	e0985e63cf4b534c83555f6e0c0261922c710e3d /eval
parent	36f8d6761ef4bed884c884c7f673e6c7d847af7f (diff)