diff options
author | Håvard Pettersen <3535158+havardpe@users.noreply.github.com> | 2020-10-02 15:14:40 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-10-02 15:14:40 +0200 |
commit | 46f99f3f700f3cae1b60209a01645f259e5a3f74 (patch) | |
tree | e84e06c8d3ac29cc829511bbf3dc93576d4596df | |
parent | c67283d48af378f25cf1bd3ed8e578d0e529813f (diff) | |
parent | 8c8c529e6fea1d0e8869dfb74881ca15ca9a89a9 (diff) |
Merge pull request #14681 from vespa-engine/havardpe/generic-reduce
generic reduce
-rw-r--r-- | eval/CMakeLists.txt | 2 | ||||
-rw-r--r-- | eval/src/tests/eval/nested_loop/.gitignore | 1 | ||||
-rw-r--r-- | eval/src/tests/eval/nested_loop/CMakeLists.txt | 16 | ||||
-rw-r--r-- | eval/src/tests/eval/nested_loop/nested_loop_bench.cpp | 90 | ||||
-rw-r--r-- | eval/src/tests/eval/nested_loop/nested_loop_test.cpp | 88 | ||||
-rw-r--r-- | eval/src/tests/instruction/generic_reduce/CMakeLists.txt | 9 | ||||
-rw-r--r-- | eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp | 109 | ||||
-rw-r--r-- | eval/src/tests/instruction/generic_rename/generic_rename_test.cpp | 8 | ||||
-rw-r--r-- | eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp | 187 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/nested_loop.h | 143 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/CMakeLists.txt | 1 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_join.h | 31 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_reduce.cpp | 219 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_reduce.h | 54 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_rename.cpp | 14 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_rename.h | 36 |
16 files changed, 918 insertions, 90 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt index 1872ee1410a..b8cc77b8290 100644 --- a/eval/CMakeLists.txt +++ b/eval/CMakeLists.txt @@ -20,6 +20,7 @@ vespa_define_module( src/tests/eval/inline_operation src/tests/eval/interpreted_function src/tests/eval/multiply_add + src/tests/eval/nested_loop src/tests/eval/node_tools src/tests/eval/node_types src/tests/eval/param_usage @@ -33,6 +34,7 @@ vespa_define_module( src/tests/eval/value_type src/tests/gp/ponder_nov2017 src/tests/instruction/generic_join + src/tests/instruction/generic_reduce src/tests/instruction/generic_merge src/tests/instruction/generic_rename src/tests/tensor/default_value_builder_factory diff --git a/eval/src/tests/eval/nested_loop/.gitignore b/eval/src/tests/eval/nested_loop/.gitignore new file mode 100644 index 00000000000..a3c003c3a2d --- /dev/null +++ b/eval/src/tests/eval/nested_loop/.gitignore @@ -0,0 +1 @@ +/eval_nested_loop_bench_app diff --git a/eval/src/tests/eval/nested_loop/CMakeLists.txt b/eval/src/tests/eval/nested_loop/CMakeLists.txt new file mode 100644 index 00000000000..ccd751143b1 --- /dev/null +++ b/eval/src/tests/eval/nested_loop/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(eval_nested_loop_test_app TEST + SOURCES + nested_loop_test.cpp + DEPENDS + vespaeval + GTest::GTest +) +vespa_add_test(NAME eval_nested_loop_test_app COMMAND eval_nested_loop_test_app) +vespa_add_executable(eval_nested_loop_bench_app TEST + SOURCES + nested_loop_bench.cpp + DEPENDS + vespaeval + GTest::GTest +) diff --git a/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp b/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp new file mode 100644 index 00000000000..c6e2f278cd4 --- /dev/null +++ b/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp @@ -0,0 +1,90 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root. + +#include <vespa/eval/eval/nested_loop.h> +#include <vespa/vespalib/util/benchmark_timer.h> +#include <vespa/vespalib/gtest/gtest.h> + +using vespalib::BenchmarkTimer; +using vespalib::eval::run_nested_loop; + +using LIST = std::vector<size_t>; +using call_t = void (*)(const LIST &loop, const LIST &stride); + +void perform_direct(const LIST &loop, const LIST &stride) { + size_t idx1 = 0; + size_t expect = 0; + for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) { + size_t idx2 = idx1; + for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) { + size_t idx3 = idx2; + for (size_t k = 0; k < loop[2]; ++k, idx3 += stride[2]) { + assert(idx3 == expect); + ++expect; + } + } + } + assert(expect == 4096); +} + +void perform_direct_lambda(const LIST &loop, const LIST &stride) { + size_t expect = 0; + auto fun = [&](size_t idx) { + (void) idx; + assert(idx == expect); + ++expect; + }; + size_t idx1 = 0; + for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) { + size_t idx2 = idx1; + for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) { + size_t idx3 = idx2; + for (size_t k = 0; k < loop[2]; ++k, idx3 += stride[2]) { + fun(idx3); + } + } + } + assert(expect == 4096); +} + +void perform_generic(const LIST &loop, const LIST &stride) { + size_t expect = 0; + auto fun = [&](size_t idx) { + (void) idx; + assert(idx == expect); + ++expect; + }; + run_nested_loop(0, loop, stride, fun); + assert(expect == 4096); +} + +void perform_generic_isolate_first(const LIST &loop, const LIST &stride) { + size_t expect = 0; + auto fun = [&](size_t idx) { + (void) idx; + assert(idx == expect); + ++expect; + }; + run_nested_loop(0, loop, stride, fun, fun); + assert(expect == 4096); +} + +void nop() {} + +double estimate_cost_us(call_t perform_fun) { + LIST loop({16,16,16}); + LIST stride({256,16,1}); + return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 100000, 5.0) * 1000.0 * 1000.0; +} + 
+//----------------------------------------------------------------------------- + +TEST(NestedLoopBenchmark, single_loop) { + fprintf(stderr, "manual direct single loop: %g us\n", estimate_cost_us(perform_direct)); + fprintf(stderr, "manual call lambda single loop: %g us\n", estimate_cost_us(perform_direct_lambda)); + fprintf(stderr, "generic single loop: %g us\n", estimate_cost_us(perform_generic)); + fprintf(stderr, "generic single loop (isolate first): %g us\n", estimate_cost_us(perform_generic_isolate_first)); +} + +//----------------------------------------------------------------------------- + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/eval/src/tests/eval/nested_loop/nested_loop_test.cpp b/eval/src/tests/eval/nested_loop/nested_loop_test.cpp new file mode 100644 index 00000000000..a72c5ed9cca --- /dev/null +++ b/eval/src/tests/eval/nested_loop/nested_loop_test.cpp @@ -0,0 +1,88 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/eval/eval/nested_loop.h> +#include <vespa/vespalib/gtest/gtest.h> + +using namespace vespalib::eval; + +std::vector<size_t> run_single(size_t idx_in, const std::vector<size_t> &loop, const std::vector<size_t> &stride) { + std::vector<size_t> result; + auto capture = [&](size_t idx_out) { result.push_back(idx_out); }; + assert(loop.size() == stride.size()); + run_nested_loop(idx_in, loop, stride, capture); + return result; +} + +std::pair<std::vector<size_t>,std::vector<size_t>> run_single_isolated(size_t idx_in, const std::vector<size_t> &loop, const std::vector<size_t> &stride) { + std::vector<size_t> res1; + std::vector<size_t> res2; + auto capture1 = [&](size_t idx_out) { res1.push_back(idx_out); }; + auto capture2 = [&](size_t idx_out) { res2.push_back(idx_out); }; + assert(loop.size() == stride.size()); + run_nested_loop(idx_in, loop, stride, capture1, capture2); + return std::make_pair(res1, res2); +} + +std::vector<std::pair<size_t,size_t>> run_double(size_t idx1_in, size_t idx2_in, const std::vector<size_t> &loop, + const std::vector<size_t> &stride1, const std::vector<size_t> &stride2) +{ + std::vector<std::pair<size_t,size_t>> result; + auto capture = [&](size_t idx1_out, size_t idx2_out) { result.emplace_back(idx1_out, idx2_out); }; + assert(loop.size() == stride1.size()); + assert(loop.size() == stride2.size()); + run_nested_loop(idx1_in, idx2_in, loop, stride1, stride2, capture); + return result; +} + +void verify_isolated(size_t idx_in, const std::vector<size_t> &loop, const std::vector<size_t> &stride) { + auto full = run_single(idx_in, loop, stride); + auto actual = run_single_isolated(idx_in, loop, stride); + ASSERT_EQ(actual.first.size(), 1); + ASSERT_EQ(actual.second.size(), full.size() - 1); + EXPECT_EQ(actual.first[0], full[0]); + full.erase(full.begin()); + EXPECT_EQ(actual.second, full); +} + +void verify_double(size_t idx1_in, size_t idx2_in, const std::vector<size_t> &loop, + const std::vector<size_t> &stride1, const 
std::vector<size_t> &stride2) +{ + auto res1 = run_single(idx1_in, loop, stride1); + auto res2 = run_single(idx2_in, loop, stride2); + ASSERT_EQ(res1.size(), res2.size()); + std::vector<std::pair<size_t,size_t>> expect; + for (size_t i = 0; i < res1.size(); ++i) { + expect.emplace_back(res1[i], res2[i]); + } + auto actual = run_double(idx1_in, idx2_in, loop, stride1, stride2); + EXPECT_EQ(actual, expect); +} + +std::vector<size_t> v(std::vector<size_t> vec) { return vec; } + +TEST(NestedLoopTest, single_nested_loop_can_be_executed) { + EXPECT_EQ(v({123}), run_single(123, {}, {})); + EXPECT_EQ(v({10,11}), run_single(10, {2}, {1})); + EXPECT_EQ(v({100,110,101,111}), run_single(100, {2,2}, {1,10})); + EXPECT_EQ(v({100,110,100,110,101,111,101,111}), run_single(100, {2,2,2}, {1,0,10})); + EXPECT_EQ(v({100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115}), + run_single(100, {2,2,2,2}, {8,4,2,1})); +} + +TEST(NestedLoopTest, single_nested_loop_with_first_entry_isolated_can_be_executed) { + verify_isolated(10, {}, {}); + verify_isolated(10, {3}, {5}); + verify_isolated(10, {3,3}, {2,3}); + verify_isolated(10, {3,3,2}, {2,0,3}); + verify_isolated(10, {2,3,2,3}, {7,2,1,3}); +} + +TEST(NestedLoopTest, double_nested_loop_can_be_executed) { + verify_double(10, 20, {}, {}, {}); + verify_double(10, 20, {3}, {5}, {7}); + verify_double(10, 20, {3,3}, {2,3}, {7,5}); + verify_double(10, 20, {3,3,2}, {2,0,3}, {0,7,5}); + verify_double(10, 20, {2,3,2,3}, {7,2,1,3}, {3,7,5,1}); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/eval/src/tests/instruction/generic_reduce/CMakeLists.txt b/eval/src/tests/instruction/generic_reduce/CMakeLists.txt new file mode 100644 index 00000000000..f04664ae64c --- /dev/null +++ b/eval/src/tests/instruction/generic_reduce/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_executable(eval_generic_reduce_test_app TEST + SOURCES + generic_reduce_test.cpp + DEPENDS + vespaeval + GTest::GTest +) +vespa_add_test(NAME eval_generic_reduce_test_app COMMAND eval_generic_reduce_test_app) diff --git a/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp b/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp new file mode 100644 index 00000000000..7e9a059967a --- /dev/null +++ b/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp @@ -0,0 +1,109 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/eval/eval/simple_value.h> +#include <vespa/eval/eval/value_codec.h> +#include <vespa/eval/instruction/generic_reduce.h> +#include <vespa/eval/eval/interpreted_function.h> +#include <vespa/eval/eval/test/tensor_model.hpp> +#include <vespa/vespalib/util/stringfmt.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <optional> + +using namespace vespalib; +using namespace vespalib::eval; +using namespace vespalib::eval::instruction; +using namespace vespalib::eval::test; + +using vespalib::make_string_short::fmt; + +std::vector<Layout> layouts = { + {}, + {x(3)}, + {x(3),y(5)}, + {x(3),y(5),z(7)}, + float_cells({x(3),y(5),z(7)}), + {x({"a","b","c"})}, + {x({"a","b","c"}),y({"foo","bar"})}, + {x({"a","b","c"}),y({"foo","bar"}),z({"i","j","k","l"})}, + float_cells({x({"a","b","c"}),y({"foo","bar"}),z({"i","j","k","l"})}), + {x(3),y({"foo", "bar"}),z(7)}, + {x({"a","b","c"}),y(5),z({"i","j","k","l"})}, + float_cells({x({"a","b","c"}),y(5),z({"i","j","k","l"})}) +}; + +TensorSpec reference_reduce(const TensorSpec &a, const std::vector<vespalib::string> &dims, Aggr aggr) { + Stash stash; + ValueType res_type = ValueType::from_spec(a.type()).reduce(dims); + EXPECT_FALSE(res_type.is_error()); + std::map<TensorSpec::Address,std::optional<Aggregator*>> my_map; + for (const auto &cell: a.cells()) { + TensorSpec::Address addr; + for 
(const auto &dim: cell.first) { + if (res_type.dimension_index(dim.first) != ValueType::Dimension::npos) { + addr.insert_or_assign(dim.first, dim.second); + } + } + auto [pos, is_empty] = my_map.emplace(addr, std::nullopt); + if (is_empty) { + pos->second = &Aggregator::create(aggr, stash); + pos->second.value()->first(cell.second); + } else { + pos->second.value()->next(cell.second); + } + } + TensorSpec result(res_type.to_spec()); + for (const auto &my_entry: my_map) { + result.add(my_entry.first, my_entry.second.value()->result()); + } + return result; +} + +TensorSpec perform_generic_reduce(const TensorSpec &a, const std::vector<vespalib::string> &dims, Aggr aggr) { + Stash stash; + const auto &factory = SimpleValueBuilderFactory::get(); + auto lhs = value_from_spec(a, factory); + auto my_op = GenericReduce::make_instruction(lhs->type(), aggr, dims, factory, stash); + InterpretedFunction::EvalSingle single(my_op); + return spec_from_value(single.eval(std::vector<Value::CREF>({*lhs}))); +} + +TEST(GenericReduceTest, dense_reduce_plan_can_be_created) { + auto type = ValueType::from_spec("tensor(a[2],aa{},b[2],bb[1],c[2],cc{},d[2],dd[1],e[2],ee{},f[2])"); + auto plan = DenseReducePlan(type, type.reduce({"a", "d", "e"})); + std::vector<size_t> expect_keep_loop = {4,2}; + std::vector<size_t> expect_keep_stride = {8,1}; + std::vector<size_t> expect_reduce_loop = {2,4}; + std::vector<size_t> expect_reduce_stride = {32,2}; + EXPECT_EQ(plan.in_size, 64); + EXPECT_EQ(plan.out_size, 8); + EXPECT_EQ(plan.keep_loop, expect_keep_loop); + EXPECT_EQ(plan.keep_stride, expect_keep_stride); + EXPECT_EQ(plan.reduce_loop, expect_reduce_loop); + EXPECT_EQ(plan.reduce_stride, expect_reduce_stride); +} + +TEST(GenericReduceTest, sparse_reduce_plan_can_be_created) { + auto type = ValueType::from_spec("tensor(a{},aa[10],b{},c{},cc[5],d{},e{},ee[1],f{})"); + auto plan = SparseReducePlan(type, type.reduce({"a", "d", "e"})); + std::vector<size_t> expect_keep_dims = {1,2,5}; + 
EXPECT_EQ(plan.num_reduce_dims, 3); + EXPECT_EQ(plan.keep_dims, expect_keep_dims); +} + +TEST(GenericReduceTest, generic_reduce_works_for_simple_values) { + for (const Layout &layout: layouts) { + TensorSpec input = spec(layout, Div16(N())); + for (Aggr aggr: {Aggr::SUM, Aggr::AVG, Aggr::MIN, Aggr::MAX}) { + for (const Domain &domain: layout) { + auto expect = reference_reduce(input, {domain.dimension}, aggr); + auto actual = perform_generic_reduce(input, {domain.dimension}, aggr); + EXPECT_EQ(actual, expect); + } + auto expect = reference_reduce(input, {}, aggr); + auto actual = perform_generic_reduce(input, {}, aggr); + EXPECT_EQ(actual, expect); + } + } +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/eval/src/tests/instruction/generic_rename/generic_rename_test.cpp b/eval/src/tests/instruction/generic_rename/generic_rename_test.cpp index f61899e4dda..d5c7f58fc1f 100644 --- a/eval/src/tests/instruction/generic_rename/generic_rename_test.cpp +++ b/eval/src/tests/instruction/generic_rename/generic_rename_test.cpp @@ -111,12 +111,12 @@ TensorSpec reference_rename(const TensorSpec &a, const FromTo &ft) { return result; } -TensorSpec perform_generic_rename(const TensorSpec &a, const ValueType &res_type, - const FromTo &ft, const ValueBuilderFactory &factory) +TensorSpec perform_generic_rename(const TensorSpec &a, + const FromTo &ft, const ValueBuilderFactory &factory) { Stash stash; auto lhs = value_from_spec(a, factory); - auto my_op = GenericRename::make_instruction(lhs->type(), res_type, ft.from, ft.to, factory, stash); + auto my_op = GenericRename::make_instruction(lhs->type(), ft.from, ft.to, factory, stash); InterpretedFunction::EvalSingle single(my_op); return spec_from_value(single.eval(std::vector<Value::CREF>({*lhs}))); } @@ -132,7 +132,7 @@ void test_generic_rename(const ValueBuilderFactory &factory) { // printf("type %s -> %s\n", lhs_type.to_spec().c_str(), renamed_type.to_spec().c_str()); SCOPED_TRACE(fmt("\n===\nLHS: %s\n===\n", lhs.to_string().c_str())); 
auto expect = reference_rename(lhs, from_to); - auto actual = perform_generic_rename(lhs, renamed_type, from_to, factory); + auto actual = perform_generic_rename(lhs, from_to, factory); EXPECT_EQ(actual, expect); } } diff --git a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp index 4771034902b..2e126ea941f 100644 --- a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp +++ b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp @@ -14,15 +14,17 @@ // improvement in generic dense join and possibly also in sparse join // with partial dimensional overlap. Benchmarks are done using float // cells since this is what gives best overall performance in -// production. Also, we use the multiply operation since it is the -// most optimized operations across all implementations. When -// benchmarking different implementations against each other, a smoke -// test is performed by verifying that all implementations produce the -// same result. +// production. Also, we use the multiply operation for join and sum +// operation for reduce since those are the most optimized operations +// across all implementations. When benchmarking different +// implementations against each other, a smoke test is performed by +// verifying that all implementations produce the same result. 
#include <vespa/eval/eval/simple_value.h> #include <vespa/eval/eval/interpreted_function.h> #include <vespa/eval/instruction/generic_join.h> +#include <vespa/eval/instruction/generic_reduce.h> +#include <vespa/eval/instruction/generic_rename.h> #include <vespa/eval/eval/simple_tensor_engine.h> #include <vespa/eval/eval/tensor_spec.h> #include <vespa/eval/eval/value_codec.h> @@ -60,6 +62,8 @@ struct Impl { virtual Value::UP create_value(const TensorSpec &spec) const = 0; virtual TensorSpec create_spec(const Value &value) const = 0; virtual Instruction create_join(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const = 0; + virtual Instruction create_reduce(const ValueType &lhs, Aggr aggr, const std::vector<vespalib::string> &dims, Stash &stash) const = 0; + virtual Instruction create_rename(const ValueType &lhs, const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to, Stash &stash) const = 0; virtual const TensorEngine &engine() const { return SimpleTensorEngine::ref(); } // engine used by EvalSingle virtual ~Impl() {} }; @@ -73,6 +77,12 @@ struct ValueImpl : Impl { Instruction create_join(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const override { return GenericJoin::make_instruction(lhs, rhs, function, my_factory, stash); } + Instruction create_reduce(const ValueType &lhs, Aggr aggr, const std::vector<vespalib::string> &dims, Stash &stash) const override { + return GenericReduce::make_instruction(lhs, aggr, dims, my_factory, stash); + } + Instruction create_rename(const ValueType &lhs, const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to, Stash &stash) const override { + return GenericRename::make_instruction(lhs, from, to, my_factory, stash); + } }; struct EngineImpl : Impl { @@ -82,12 +92,24 @@ struct EngineImpl : Impl { Value::UP create_value(const TensorSpec &spec) const override { return my_engine.from_spec(spec); } 
TensorSpec create_spec(const Value &value) const override { return my_engine.to_spec(value); } Instruction create_join(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const override { - // create a complete tensor function joining two parameters, but only compile the join instruction itself + // create a complete tensor function, but only compile the relevant instruction const auto &lhs_node = tensor_function::inject(lhs, 0, stash); const auto &rhs_node = tensor_function::inject(rhs, 1, stash); const auto &join_node = tensor_function::join(lhs_node, rhs_node, function, stash); return join_node.compile_self(my_engine, stash); } + Instruction create_reduce(const ValueType &lhs, Aggr aggr, const std::vector<vespalib::string> &dims, Stash &stash) const override { + // create a complete tensor function, but only compile the relevant instruction + const auto &lhs_node = tensor_function::inject(lhs, 0, stash); + const auto &reduce_node = tensor_function::reduce(lhs_node, aggr, dims, stash); + return reduce_node.compile_self(my_engine, stash); + } + Instruction create_rename(const ValueType &lhs, const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to, Stash &stash) const override { + // create a complete tensor function, but only compile the relevant instruction + const auto &lhs_node = tensor_function::inject(lhs, 0, stash); + const auto &rename_node = tensor_function::rename(lhs_node, from, to, stash); + return rename_node.compile_self(my_engine, stash); + } const TensorEngine &engine() const override { return my_engine; } }; @@ -100,7 +122,11 @@ ValueImpl packed_mixed_tensor_impl(2, " PackedMixedTensor", " Packed", Pack ValueImpl default_tensor_value_impl(0, " DefaultValue", "NEW PROD", DefaultValueBuilderFactory::get()); vespalib::string short_header("--------"); -double budget = 5.0; +constexpr double budget = 5.0; +constexpr double best_limit = 0.95; // everything within 95% of best performance gets a 
star +constexpr double bad_limit = 0.90; // BAD: new prod has performance lower than 90% of old prod +constexpr double good_limit = 1.10; // GOOD: new prod has performance higher than 110% of old prod + std::vector<CREF<Impl>> impl_list = {simple_tensor_engine_impl, default_tensor_engine_impl, simple_value_impl, @@ -117,25 +143,25 @@ struct BenchmarkHeader { short_names[impl.order] = impl.short_name; } } + void print_header(const vespalib::string &desc) const { + for (const auto &name: short_names) { + fprintf(stderr, "|%s", name.c_str()); + } + fprintf(stderr, "| %s Benchmark cases\n", desc.c_str()); + } void print_trailer() const { for (size_t i = 0; i < short_names.size(); ++i) { fprintf(stderr, "+%s", short_header.c_str()); } fprintf(stderr, "+------------------------------------------------\n"); } - void print() const { - for (const auto &name: short_names) { - fprintf(stderr, "|%s", name.c_str()); - } - fprintf(stderr, "| Benchmark description\n"); - print_trailer(); - } }; struct BenchmarkResult { vespalib::string desc; std::optional<double> ref_time; std::vector<double> relative_perf; + double star_rating; BenchmarkResult(const vespalib::string &desc_in, size_t num_values) : desc(desc_in), ref_time(std::nullopt), relative_perf(num_values, 0.0) {} ~BenchmarkResult(); @@ -150,13 +176,20 @@ struct BenchmarkResult { } } void normalize() { + star_rating = 0.0; for (double &perf: relative_perf) { perf = ref_time.value() / perf; + star_rating = std::max(star_rating, perf); } + star_rating *= best_limit; } void print() const { for (double perf: relative_perf) { - fprintf(stderr, "|%8.2f", perf); + if (perf > star_rating) { + fprintf(stderr, "|*%7.2f", perf); + } else { + fprintf(stderr, "| %7.2f", perf); + } } fprintf(stderr, "| %s\n", desc.c_str()); } @@ -239,6 +272,45 @@ void benchmark_join(const vespalib::string &desc, const TensorSpec &lhs, //----------------------------------------------------------------------------- +void benchmark_reduce(const 
vespalib::string &desc, const TensorSpec &lhs, + Aggr aggr, const std::vector<vespalib::string> &dims) +{ + Stash stash; + ValueType lhs_type = ValueType::from_spec(lhs.type()); + ValueType res_type = lhs_type.reduce(dims); + ASSERT_FALSE(lhs_type.is_error()); + ASSERT_FALSE(res_type.is_error()); + std::vector<EvalOp::UP> list; + for (const Impl &impl: impl_list) { + auto op = impl.create_reduce(lhs_type, aggr, dims, stash); + std::vector<CREF<TensorSpec>> stack_spec({lhs}); + list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl)); + } + benchmark(desc, list); +} + +//----------------------------------------------------------------------------- + +void benchmark_rename(const vespalib::string &desc, const TensorSpec &lhs, + const std::vector<vespalib::string> &from, + const std::vector<vespalib::string> &to) +{ + Stash stash; + ValueType lhs_type = ValueType::from_spec(lhs.type()); + ValueType res_type = lhs_type.rename(from, to); + ASSERT_FALSE(lhs_type.is_error()); + ASSERT_FALSE(res_type.is_error()); + std::vector<EvalOp::UP> list; + for (const Impl &impl: impl_list) { + auto op = impl.create_rename(lhs_type, from, to, stash); + std::vector<CREF<TensorSpec>> stack_spec({lhs}); + list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl)); + } + benchmark(desc, list); +} + +//----------------------------------------------------------------------------- + struct D { vespalib::string name; bool mapped; @@ -281,6 +353,7 @@ template <typename ...Ds> TensorSpec make_spec(double seq, const Ds &...ds) { } TensorSpec make_vector(const D &d1, double seq) { return make_spec(seq, d1); } +TensorSpec make_matrix(const D &d1, const D &d2, double seq) { return make_spec(seq, d1, d2); } TensorSpec make_cube(const D &d1, const D &d2, const D &d3, double seq) { return make_spec(seq, d1, d2, d3); } //----------------------------------------------------------------------------- @@ -392,15 +465,91 @@ TEST(MixedJoin, no_overlap) { 
//----------------------------------------------------------------------------- -TEST(PrintResults, print_results) { +TEST(ReduceBench, dense_reduce) { + auto lhs = make_cube(D::idx("a", 16), D::idx("b", 16), D::idx("c", 16), 1.0); + benchmark_reduce("dense reduce inner", lhs, Aggr::SUM, {"c"}); + benchmark_reduce("dense reduce middle", lhs, Aggr::SUM, {"b"}); + benchmark_reduce("dense reduce outer", lhs, Aggr::SUM, {"a"}); + benchmark_reduce("dense multi-reduce inner", lhs, Aggr::SUM, {"b", "c"}); + benchmark_reduce("dense multi-reduce outer", lhs, Aggr::SUM, {"a", "b"}); + benchmark_reduce("dense multi-reduce outer-inner", lhs, Aggr::SUM, {"a", "c"}); + benchmark_reduce("dense reduce all", lhs, Aggr::SUM, {}); +} + +TEST(ReduceBench, sparse_reduce) { + auto lhs = make_cube(D::map("a", 16, 1), D::map("b", 16, 1), D::map("c", 16, 1), 1.0); + benchmark_reduce("sparse reduce inner", lhs, Aggr::SUM, {"c"}); + benchmark_reduce("sparse reduce middle", lhs, Aggr::SUM, {"b"}); + benchmark_reduce("sparse reduce outer", lhs, Aggr::SUM, {"a"}); + benchmark_reduce("sparse multi-reduce inner", lhs, Aggr::SUM, {"b", "c"}); + benchmark_reduce("sparse multi-reduce outer", lhs, Aggr::SUM, {"a", "b"}); + benchmark_reduce("sparse multi-reduce outer-inner", lhs, Aggr::SUM, {"a", "c"}); + benchmark_reduce("sparse reduce all", lhs, Aggr::SUM, {}); +} + +TEST(ReduceBench, mixed_reduce) { + auto lhs = make_spec(1.0, D::map("a", 4, 1), D::map("b", 4, 1), D::map("c", 4, 1), + D::idx("d", 4), D::idx("e", 4), D::idx("f", 4)); + benchmark_reduce("mixed reduce middle dense", lhs, Aggr::SUM, {"e"}); + benchmark_reduce("mixed reduce middle sparse", lhs, Aggr::SUM, {"b"}); + benchmark_reduce("mixed reduce middle sparse/dense", lhs, Aggr::SUM, {"b", "e"}); + benchmark_reduce("mixed reduce all dense", lhs, Aggr::SUM, {"d", "e", "f"}); + benchmark_reduce("mixed reduce all sparse", lhs, Aggr::SUM, {"a", "b", "c"}); + benchmark_reduce("mixed reduce all", lhs, Aggr::SUM, {}); +} + 
+//----------------------------------------------------------------------------- + +TEST(RenameBench, dense_rename) { + auto lhs = make_matrix(D::idx("a", 16), D::idx("b", 16), 1.0); + benchmark_rename("dense transpose", lhs, {"a", "b"}, {"b", "a"}); +} + +TEST(RenameBench, sparse_rename) { + auto lhs = make_matrix(D::map("a", 16, 1), D::map("b", 16, 1), 1.0); + benchmark_rename("sparse transpose", lhs, {"a", "b"}, {"b", "a"}); +} + +TEST(RenameBench, mixed_rename) { + auto lhs = make_spec(1.0, D::map("a", 8, 1), D::map("b", 8, 1), D::idx("c", 8), D::idx("d", 8)); + benchmark_rename("mixed multi-transpose", lhs, {"a", "b", "c", "d"}, {"b", "a", "d", "c"}); +} + +//----------------------------------------------------------------------------- + +void print_results(const vespalib::string &desc, const std::vector<BenchmarkResult> &results) { + if (results.empty()) { + return; + } BenchmarkHeader header; + header.print_trailer(); + header.print_header(desc); + header.print_trailer(); + for (const auto &result: results) { + result.print(); + } + header.print_trailer(); +} + +TEST(PrintResults, print_results) { + std::vector<BenchmarkResult> bad_results; + std::vector<BenchmarkResult> neutral_results; + std::vector<BenchmarkResult> good_results; std::sort(benchmark_results.begin(), benchmark_results.end(), [](const auto &a, const auto &b){ return (a.relative_perf[0] < b.relative_perf[0]); }); - header.print(); for (const auto &result: benchmark_results) { - result.print(); + double perf = result.relative_perf[0]; + if (perf < bad_limit) { + bad_results.push_back(result); + } else if (perf > good_limit) { + good_results.push_back(result); + } else { + neutral_results.push_back(result); + } } - header.print_trailer(); + print_results("BAD", bad_results); + print_results("NEUTRAL", neutral_results); + print_results("GOOD", good_results); } GTEST_MAIN_RUN_ALL_TESTS() diff --git a/eval/src/vespa/eval/eval/nested_loop.h b/eval/src/vespa/eval/eval/nested_loop.h new file mode 
100644 index 00000000000..2fe98fb1e9b --- /dev/null +++ b/eval/src/vespa/eval/eval/nested_loop.h @@ -0,0 +1,143 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vector> + +namespace vespalib::eval { + +// This file contains implementations for the generic nested loops +// used by DenseJoinPlan, DenseReducePlan and similar. The loops act +// like arbitrarily nested for-loops that are index-based where each +// loop-level has a different stride that modifies the overall +// index. An initial index is passed to the top-level function, which +// is then modified by each loop-layer and finally passed back to a +// callable for each iteration of the inner loop. There are different +// implementations for traversing different numbers of index spaces in +// parallel. Note that all loop layers must have at least 1 iteration. + +namespace nested_loop { + +//----------------------------------------------------------------------------- + +template <typename F, size_t N> void execute_few(size_t idx, const size_t *loop, const size_t *stride, const F &f) { + if constexpr (N == 0) { + f(idx); + } else { + for (size_t i = 0; i < *loop; ++i, idx += *stride) { + execute_few<F, N - 1>(idx, loop + 1, stride + 1, f); + } + } +} + +template <typename F> void execute_many(size_t idx, const size_t *loop, const size_t *stride, size_t levels, const F &f) { + for (size_t i = 0; i < *loop; ++i, idx += *stride) { + if ((levels - 1) == 3) { + execute_few<F, 3>(idx, loop + 1, stride + 1, f); + } else { + execute_many<F>(idx, loop + 1, stride + 1, levels - 1, f); + } + } +} + +//----------------------------------------------------------------------------- + +template <typename FIRST, typename NEXT, size_t N> +void execute_few(size_t idx, const size_t *loop, const size_t *stride, const FIRST &first, const NEXT &next) { + if constexpr (N == 0) { + first(idx); + } else { + execute_few<FIRST, NEXT, N - 
1>(idx, loop + 1, stride + 1, first, next); + idx += *stride; + for (size_t i = 1; i < *loop; ++i, idx += *stride) { + execute_few<NEXT, N - 1>(idx, loop + 1, stride + 1, next); + } + } +} + +template <typename FIRST, typename NEXT> +void execute_many(size_t idx, const size_t *loop, const size_t *stride, size_t levels, const FIRST &first, const NEXT &next) { + if ((levels - 1) == 3) { + execute_few<FIRST, NEXT, 3>(idx, loop + 1, stride + 1, first, next); + } else { + execute_many<FIRST, NEXT>(idx, loop + 1, stride + 1, levels - 1, first, next); + } + idx += *stride; + for (size_t i = 1; i < *loop; ++i, idx += *stride) { + if ((levels - 1) == 3) { + execute_few<NEXT, 3>(idx, loop + 1, stride + 1, next); + } else { + execute_many<NEXT>(idx, loop + 1, stride + 1, levels - 1, next); + } + } +} + +//----------------------------------------------------------------------------- + +template <typename F, size_t N> void execute_few(size_t idx1, size_t idx2, const size_t *loop, const size_t *stride1, const size_t *stride2, const F &f) { + if constexpr (N == 0) { + f(idx1, idx2); + } else { + for (size_t i = 0; i < *loop; ++i, idx1 += *stride1, idx2 += *stride2) { + execute_few<F, N - 1>(idx1, idx2, loop + 1, stride1 + 1, stride2 + 1, f); + } + } +} + +template <typename F> void execute_many(size_t idx1, size_t idx2, const size_t *loop, const size_t *stride1, const size_t *stride2, size_t levels, const F &f) { + for (size_t i = 0; i < *loop; ++i, idx1 += *stride1, idx2 += *stride2) { + if ((levels - 1) == 3) { + execute_few<F, 3>(idx1, idx2, loop + 1, stride1 + 1, stride2 + 1, f); + } else { + execute_many<F>(idx1, idx2, loop + 1, stride1 + 1, stride2 + 1, levels - 1, f); + } + } +} + +//----------------------------------------------------------------------------- + +} // implementation details + +// Run a nested loop and pass indexes to 'f' +template <typename F> +void run_nested_loop(size_t idx, const std::vector<size_t> &loop, const std::vector<size_t> &stride, const F &f) 
{ + size_t levels = loop.size(); + switch(levels) { + case 0: return f(idx); + case 1: return nested_loop::execute_few<F, 1>(idx, &loop[0], &stride[0], f); + case 2: return nested_loop::execute_few<F, 2>(idx, &loop[0], &stride[0], f); + case 3: return nested_loop::execute_few<F, 3>(idx, &loop[0], &stride[0], f); + default: return nested_loop::execute_many<F>(idx, &loop[0], &stride[0], levels, f); + } +} + +// Run a nested loop and pass the first index to 'first' and all +// subsequent indexes to 'next' +template <typename FIRST, typename NEXT> +void run_nested_loop(size_t idx, const std::vector<size_t> &loop, const std::vector<size_t> &stride, const FIRST &first, const NEXT &next) { + size_t levels = loop.size(); + switch(levels) { + case 0: return first(idx); + case 1: return nested_loop::execute_few<FIRST, NEXT, 1>(idx, &loop[0], &stride[0], first, next); + case 2: return nested_loop::execute_few<FIRST, NEXT, 2>(idx, &loop[0], &stride[0], first, next); + case 3: return nested_loop::execute_few<FIRST, NEXT, 3>(idx, &loop[0], &stride[0], first, next); + default: return nested_loop::execute_many<FIRST, NEXT>(idx, &loop[0], &stride[0], levels, first, next); + } +} + +// Run two nested loops in parallel and pass both indexes to 'f'. Note +// that 'loop' is shared, which means that only individual strides may +// differ between the two loops. 
+template <typename F> +void run_nested_loop(size_t idx1, size_t idx2, const std::vector<size_t> &loop, const std::vector<size_t> &stride1, const std::vector<size_t> &stride2, const F &f) { + size_t levels = loop.size(); + switch(levels) { + case 0: return f(idx1, idx2); + case 1: return nested_loop::execute_few<F, 1>(idx1, idx2, &loop[0], &stride1[0], &stride2[0], f); + case 2: return nested_loop::execute_few<F, 2>(idx1, idx2, &loop[0], &stride1[0], &stride2[0], f); + case 3: return nested_loop::execute_few<F, 3>(idx1, idx2, &loop[0], &stride1[0], &stride2[0], f); + default: return nested_loop::execute_many<F>(idx1, idx2, &loop[0], &stride1[0], &stride2[0], levels, f); + } +} + +} // namespace diff --git a/eval/src/vespa/eval/instruction/CMakeLists.txt b/eval/src/vespa/eval/instruction/CMakeLists.txt index 71d08f601dd..91ff4fd63dc 100644 --- a/eval/src/vespa/eval/instruction/CMakeLists.txt +++ b/eval/src/vespa/eval/instruction/CMakeLists.txt @@ -3,6 +3,7 @@ vespa_add_library(eval_instruction OBJECT SOURCES generic_join + generic_reduce generic_merge generic_rename ) diff --git a/eval/src/vespa/eval/instruction/generic_join.h b/eval/src/vespa/eval/instruction/generic_join.h index 25647452dff..0f121104d5e 100644 --- a/eval/src/vespa/eval/instruction/generic_join.h +++ b/eval/src/vespa/eval/instruction/generic_join.h @@ -2,6 +2,7 @@ #pragma once +#include <vespa/eval/eval/nested_loop.h> #include <vespa/eval/eval/value_type.h> #include <vespa/eval/eval/interpreted_function.h> @@ -37,34 +38,8 @@ struct DenseJoinPlan { std::vector<size_t> rhs_stride; DenseJoinPlan(const ValueType &lhs_type, const ValueType &rhs_type); ~DenseJoinPlan(); - template <typename F> void execute(size_t lhs, size_t rhs, F &&f) const { - switch(loops_left(0)) { - case 0: return execute_few<F, 0>(0, lhs, rhs, std::forward<F>(f)); - case 1: return execute_few<F, 1>(0, lhs, rhs, std::forward<F>(f)); - case 2: return execute_few<F, 2>(0, lhs, rhs, std::forward<F>(f)); - case 3: return execute_few<F, 
3>(0, lhs, rhs, std::forward<F>(f)); - default: return execute_many<F>(0, lhs, rhs, std::forward<F>(f)); - } - } -private: - size_t loops_left(size_t idx) const { return (loop_cnt.size() - idx); } - template <typename F, size_t N> void execute_few(size_t idx, size_t lhs, size_t rhs, F &&f) const { - if constexpr (N == 0) { - f(lhs, rhs); - } else { - for (size_t i = 0; i < loop_cnt[idx]; ++i, lhs += lhs_stride[idx], rhs += rhs_stride[idx]) { - execute_few<F, N - 1>(idx + 1, lhs, rhs, std::forward<F>(f)); - } - } - } - template <typename F> void execute_many(size_t idx, size_t lhs, size_t rhs, F &&f) const { - for (size_t i = 0; i < loop_cnt[idx]; ++i, lhs += lhs_stride[idx], rhs += rhs_stride[idx]) { - if (loops_left(idx + 1) == 3) { - execute_few<F, 3>(idx + 1, lhs, rhs, std::forward<F>(f)); - } else { - execute_many<F>(idx + 1, lhs, rhs, std::forward<F>(f)); - } - } + template <typename F> void execute(size_t lhs, size_t rhs, const F &f) const { + run_nested_loop(lhs, rhs, loop_cnt, lhs_stride, rhs_stride, f); } }; diff --git a/eval/src/vespa/eval/instruction/generic_reduce.cpp b/eval/src/vespa/eval/instruction/generic_reduce.cpp new file mode 100644 index 00000000000..d294b478210 --- /dev/null +++ b/eval/src/vespa/eval/instruction/generic_reduce.cpp @@ -0,0 +1,219 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "generic_reduce.h" +#include <vespa/eval/eval/value.h> +#include <vespa/vespalib/util/stash.h> +#include <vespa/vespalib/util/typify.h> +#include <cassert> + +namespace vespalib::eval::instruction { + +using State = InterpretedFunction::State; +using Instruction = InterpretedFunction::Instruction; + +namespace { + +//----------------------------------------------------------------------------- + +template <typename T, typename IN> uint64_t wrap_param(const IN &value_in) { + const T &value = value_in; + static_assert(sizeof(uint64_t) == sizeof(&value)); + return (uint64_t)&value; +} + +template <typename T> const T &unwrap_param(uint64_t param) { + return *((const T *)param); +} + +//----------------------------------------------------------------------------- + +struct ReduceParam { + ValueType res_type; + SparseReducePlan sparse_plan; + DenseReducePlan dense_plan; + const ValueBuilderFactory &factory; + ReduceParam(const ValueType &type, const std::vector<vespalib::string> &dimensions, + const ValueBuilderFactory &factory_in) + : res_type(type.reduce(dimensions)), + sparse_plan(type, res_type), + dense_plan(type, res_type), + factory(factory_in) + { + assert(!res_type.is_error()); + assert(dense_plan.in_size == type.dense_subspace_size()); + assert(dense_plan.out_size == res_type.dense_subspace_size()); + } + ~ReduceParam(); +}; +ReduceParam::~ReduceParam() = default; + +//----------------------------------------------------------------------------- + +struct SparseReduceState { + + // TODO(havardpe): Optimize using vespalib::hash_map with + // reference to slice of external vector of vespalib::stringref as + // key. Track matching subspaces as linked lists contained in a + // single shared vector. 
+ + using MapKey = std::vector<vespalib::string>; + using MapValue = std::vector<size_t>; + using Map = std::map<MapKey,MapValue>; + + Map map; + std::vector<vespalib::stringref> full_address; + std::vector<vespalib::stringref*> fetch_address; + std::vector<vespalib::stringref*> keep_address; + size_t subspace; + + SparseReduceState(const SparseReducePlan &plan) + : full_address(plan.keep_dims.size() + plan.num_reduce_dims), + fetch_address(full_address.size(), nullptr), + keep_address(plan.keep_dims.size(), nullptr) + { + for (size_t i = 0; i < keep_address.size(); ++i) { + keep_address[i] = &full_address[plan.keep_dims[i]]; + } + for (size_t i = 0; i < full_address.size(); ++i) { + fetch_address[i] = &full_address[i]; + } + } + ~SparseReduceState(); + void populate_map(std::unique_ptr<Value::Index::View> full_view) { + full_view->lookup({}); + while (full_view->next_result(fetch_address, subspace)) { + std::vector<vespalib::string> key; + key.reserve(keep_address.size()); + for (const vespalib::stringref *label: keep_address) { + key.emplace_back(*label); + } + map[key].push_back(subspace); + } + } + template <typename T> + std::vector<vespalib::stringref> make_addr(const T &map_entry) { + std::vector<vespalib::stringref> addr; + addr.reserve(map_entry.first.size()); + for (const vespalib::string &label: map_entry.first) { + addr.emplace_back(label); + } + return addr; + } +}; +SparseReduceState::~SparseReduceState() = default; + +template <typename ICT, typename OCT, typename AGGR> +void my_generic_reduce_op(State &state, uint64_t param_in) { + const auto ¶m = unwrap_param<ReduceParam>(param_in); + SparseReduceState sparse(param.sparse_plan); + const Value &value = state.peek(0); + sparse.populate_map(value.index().create_view({})); + auto cells = value.cells().typify<ICT>(); + AGGR aggr; + auto first = [&](size_t idx) { aggr.first(cells[idx]); }; + auto next = [&](size_t idx) { aggr.next(cells[idx]); }; + auto builder = 
param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.keep_dims.size(), param.dense_plan.out_size, sparse.map.size()); + for (const auto &entry: sparse.map) { + OCT *dst = builder->add_subspace(sparse.make_addr(entry)).begin(); + auto reduce_cells = [&](size_t rel_idx) + { + auto pos = entry.second.begin(); + param.dense_plan.execute_reduce((*pos * param.dense_plan.in_size) + rel_idx, first, next); + for (++pos; pos != entry.second.end(); ++pos) { + param.dense_plan.execute_reduce((*pos * param.dense_plan.in_size) + rel_idx, next); + } + *dst++ = aggr.result(); + }; + param.dense_plan.execute_keep(reduce_cells); + } + auto &result = state.stash.create<std::unique_ptr<Value>>(builder->build(std::move(builder))); + const Value &result_ref = *(result.get()); + state.pop_push(result_ref); +}; + +struct SelectGenericReduceOp { + template <typename ICT, typename OCT, typename AGGR> static auto invoke() { + return my_generic_reduce_op<ICT, OCT, typename AGGR::template templ<ICT>>; + } +}; + +//----------------------------------------------------------------------------- + +} // namespace <unnamed> + +//----------------------------------------------------------------------------- + +DenseReducePlan::DenseReducePlan(const ValueType &type, const ValueType &res_type) + : in_size(1), + out_size(1), + keep_loop(), + keep_stride(), + reduce_loop(), + reduce_stride() +{ + std::vector<bool> keep; + std::vector<size_t> size; + for (const auto &dim: type.nontrivial_indexed_dimensions()) { + keep.push_back(res_type.dimension_index(dim.name) != ValueType::Dimension::npos); + size.push_back(dim.size); + } + std::vector<size_t> stride(size.size(), 0); + for (size_t i = stride.size(); i-- > 0; ) { + stride[i] = in_size; + in_size *= size[i]; + if (keep[i]) { + out_size *= size[i]; + } + } + int prev_case = 2; + for (size_t i = 0; i < size.size(); ++i) { + int my_case = keep[i] ? 1 : 0; + auto &my_loop = keep[i] ? keep_loop : reduce_loop; + auto &my_stride = keep[i] ? 
keep_stride : reduce_stride; + if (my_case == prev_case) { + assert(!my_loop.empty()); + my_loop.back() *= size[i]; + my_stride.back() = stride[i]; + } else { + my_loop.push_back(size[i]); + my_stride.push_back(stride[i]); + } + prev_case = my_case; + } +} + +DenseReducePlan::~DenseReducePlan() = default; + +//----------------------------------------------------------------------------- + +SparseReducePlan::SparseReducePlan(const ValueType &type, const ValueType &res_type) + : num_reduce_dims(0), + keep_dims() +{ + auto dims = type.mapped_dimensions(); + for (size_t i = 0; i < dims.size(); ++i) { + bool keep = (res_type.dimension_index(dims[i].name) != ValueType::Dimension::npos); + if (keep) { + keep_dims.push_back(i); + } else { + ++num_reduce_dims; + } + } +} + +SparseReducePlan::~SparseReducePlan() = default; + +//----------------------------------------------------------------------------- + +using ReduceTypify = TypifyValue<TypifyCellType,TypifyAggr>; + +Instruction +GenericReduce::make_instruction(const ValueType &type, Aggr aggr, const std::vector<vespalib::string> &dimensions, + const ValueBuilderFactory &factory, Stash &stash) +{ + auto ¶m = stash.create<ReduceParam>(type, dimensions, factory); + auto fun = typify_invoke<3,ReduceTypify,SelectGenericReduceOp>(type.cell_type(), param.res_type.cell_type(), aggr); + return Instruction(fun, wrap_param<ReduceParam>(param)); +} + +} // namespace diff --git a/eval/src/vespa/eval/instruction/generic_reduce.h b/eval/src/vespa/eval/instruction/generic_reduce.h new file mode 100644 index 00000000000..e412f8795ac --- /dev/null +++ b/eval/src/vespa/eval/instruction/generic_reduce.h @@ -0,0 +1,54 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <vespa/eval/eval/value_type.h> +#include <vespa/eval/eval/aggr.h> +#include <vespa/eval/eval/interpreted_function.h> +#include <vespa/eval/eval/nested_loop.h> + +namespace vespalib { class Stash; } +namespace vespalib::eval { struct ValueBuilderFactory; } + +namespace vespalib::eval::instruction { + +//----------------------------------------------------------------------------- + +struct GenericReduce { + static InterpretedFunction::Instruction + make_instruction(const ValueType &type, Aggr aggr, const std::vector<vespalib::string> &dimensions, + const ValueBuilderFactory &factory, Stash &stash); +}; + +//----------------------------------------------------------------------------- + +struct DenseReducePlan { + size_t in_size; + size_t out_size; + std::vector<size_t> keep_loop; + std::vector<size_t> keep_stride; + std::vector<size_t> reduce_loop; + std::vector<size_t> reduce_stride; + DenseReducePlan(const ValueType &type, const ValueType &res_type); + ~DenseReducePlan(); + template <typename F> void execute_keep(const F &f) const { + run_nested_loop(0, keep_loop, keep_stride, f); + } + template <typename F> void execute_reduce(size_t offset, const F &f) const { + run_nested_loop(offset, reduce_loop, reduce_stride, f); + } + template <typename FIRST, typename NEXT> void execute_reduce(size_t offset, const FIRST &first, const NEXT &next) const { + run_nested_loop(offset, reduce_loop, reduce_stride, first, next); + } +}; + +struct SparseReducePlan { + size_t num_reduce_dims; + std::vector<size_t> keep_dims; + SparseReducePlan(const ValueType &type, const ValueType &res_type); + ~SparseReducePlan(); +}; + +//----------------------------------------------------------------------------- + +} // namespace diff --git a/eval/src/vespa/eval/instruction/generic_rename.cpp b/eval/src/vespa/eval/instruction/generic_rename.cpp index cda0aa6e89a..1f8398555b4 100644 --- a/eval/src/vespa/eval/instruction/generic_rename.cpp +++ 
b/eval/src/vespa/eval/instruction/generic_rename.cpp @@ -53,13 +53,13 @@ struct RenameParam { SparseRenamePlan sparse_plan; DenseRenamePlan dense_plan; const ValueBuilderFactory &factory; - RenameParam(const ValueType &lhs_type, const ValueType &output_type, + RenameParam(const ValueType &lhs_type, const std::vector<vespalib::string> &rename_dimension_from, const std::vector<vespalib::string> &rename_dimension_to, const ValueBuilderFactory &factory_in) - : res_type(output_type), - sparse_plan(lhs_type, output_type, rename_dimension_from, rename_dimension_to), - dense_plan(lhs_type, output_type, rename_dimension_from, rename_dimension_to), + : res_type(lhs_type.rename(rename_dimension_from, rename_dimension_to)), + sparse_plan(lhs_type, res_type, rename_dimension_from, rename_dimension_to), + dense_plan(lhs_type, res_type, rename_dimension_from, rename_dimension_to), factory(factory_in) { assert(!res_type.is_error()); @@ -181,15 +181,15 @@ DenseRenamePlan::DenseRenamePlan(const ValueType &lhs_type, DenseRenamePlan::~DenseRenamePlan() = default; InterpretedFunction::Instruction -GenericRename::make_instruction(const ValueType &lhs_type, const ValueType &output_type, +GenericRename::make_instruction(const ValueType &lhs_type, const std::vector<vespalib::string> &rename_dimension_from, const std::vector<vespalib::string> &rename_dimension_to, const ValueBuilderFactory &factory, Stash &stash) { - auto ¶m = stash.create<RenameParam>(lhs_type, output_type, + auto ¶m = stash.create<RenameParam>(lhs_type, rename_dimension_from, rename_dimension_to, factory); - auto fun = typify_invoke<1,TypifyCellType,SelectGenericRenameOp>(output_type.cell_type()); + auto fun = typify_invoke<1,TypifyCellType,SelectGenericRenameOp>(param.res_type.cell_type()); return Instruction(fun, wrap_param<RenameParam>(param)); } diff --git a/eval/src/vespa/eval/instruction/generic_rename.h b/eval/src/vespa/eval/instruction/generic_rename.h index ca9f45bd341..6c94ff02b24 100644 --- 
a/eval/src/vespa/eval/instruction/generic_rename.h +++ b/eval/src/vespa/eval/instruction/generic_rename.h @@ -2,6 +2,7 @@ #pragma once +#include <vespa/eval/eval/nested_loop.h> #include <vespa/eval/eval/value_type.h> #include <vespa/eval/eval/interpreted_function.h> #include <vespa/vespalib/stllike/string.h> @@ -20,37 +21,8 @@ struct DenseRenamePlan { const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to); ~DenseRenamePlan(); - template <typename F> void execute(size_t offset, F &&f) const { - switch(loops_left(0)) { - case 0: return execute_few<F, 0>(0, offset, std::forward<F>(f)); - case 1: return execute_few<F, 1>(0, offset, std::forward<F>(f)); - case 2: return execute_few<F, 2>(0, offset, std::forward<F>(f)); - case 3: return execute_few<F, 3>(0, offset, std::forward<F>(f)); - default: return execute_many<F>(0, offset, std::forward<F>(f)); - } - } -private: - size_t loops_left(size_t idx) const { return (loop_cnt.size() - idx); } - - template <typename F, size_t N> void execute_few(size_t idx, size_t offset, F &&f) const { - if constexpr (N == 0) { - f(offset); - } else { - for (size_t i = 0; i < loop_cnt[idx]; ++i) { - execute_few<F, N - 1>(idx + 1, offset, std::forward<F>(f)); - offset += stride[idx]; - } - } - } - template <typename F> void execute_many(size_t idx, size_t offset, F &&f) const { - for (size_t i = 0; i < loop_cnt[idx]; ++i) { - if (loops_left(idx + 1) == 3) { - execute_few<F, 3>(idx + 1, offset, std::forward<F>(f)); - } else { - execute_many<F>(idx + 1, offset, std::forward<F>(f)); - } - offset += stride[idx]; - } + template <typename F> void execute(size_t offset, const F &f) const { + run_nested_loop(offset, loop_cnt, stride, f); } }; @@ -68,7 +40,7 @@ struct SparseRenamePlan { struct GenericRename { static InterpretedFunction::Instruction - make_instruction(const ValueType &lhs_type, const ValueType &output_type, + make_instruction(const ValueType &lhs_type, const std::vector<vespalib::string> 
&rename_dimension_from, const std::vector<vespalib::string> &rename_dimension_to, const ValueBuilderFactory &factory, Stash &stash); |