author     Håvard Pettersen <3535158+havardpe@users.noreply.github.com>  2020-10-02 15:14:40 +0200
committer  GitHub <noreply@github.com>  2020-10-02 15:14:40 +0200
commit     46f99f3f700f3cae1b60209a01645f259e5a3f74 (patch)
tree       e84e06c8d3ac29cc829511bbf3dc93576d4596df
parent     c67283d48af378f25cf1bd3ed8e578d0e529813f (diff)
parent     8c8c529e6fea1d0e8869dfb74881ca15ca9a89a9 (diff)
Merge pull request #14681 from vespa-engine/havardpe/generic-reduce
generic reduce
-rw-r--r--  eval/CMakeLists.txt                                                      2
-rw-r--r--  eval/src/tests/eval/nested_loop/.gitignore                               1
-rw-r--r--  eval/src/tests/eval/nested_loop/CMakeLists.txt                          16
-rw-r--r--  eval/src/tests/eval/nested_loop/nested_loop_bench.cpp                   90
-rw-r--r--  eval/src/tests/eval/nested_loop/nested_loop_test.cpp                    88
-rw-r--r--  eval/src/tests/instruction/generic_reduce/CMakeLists.txt                 9
-rw-r--r--  eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp      109
-rw-r--r--  eval/src/tests/instruction/generic_rename/generic_rename_test.cpp        8
-rw-r--r--  eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp  187
-rw-r--r--  eval/src/vespa/eval/eval/nested_loop.h                                  143
-rw-r--r--  eval/src/vespa/eval/instruction/CMakeLists.txt                            1
-rw-r--r--  eval/src/vespa/eval/instruction/generic_join.h                           31
-rw-r--r--  eval/src/vespa/eval/instruction/generic_reduce.cpp                      219
-rw-r--r--  eval/src/vespa/eval/instruction/generic_reduce.h                         54
-rw-r--r--  eval/src/vespa/eval/instruction/generic_rename.cpp                       14
-rw-r--r--  eval/src/vespa/eval/instruction/generic_rename.h                         36
16 files changed, 918 insertions, 90 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt
index 1872ee1410a..b8cc77b8290 100644
--- a/eval/CMakeLists.txt
+++ b/eval/CMakeLists.txt
@@ -20,6 +20,7 @@ vespa_define_module(
src/tests/eval/inline_operation
src/tests/eval/interpreted_function
src/tests/eval/multiply_add
+ src/tests/eval/nested_loop
src/tests/eval/node_tools
src/tests/eval/node_types
src/tests/eval/param_usage
@@ -33,6 +34,7 @@ vespa_define_module(
src/tests/eval/value_type
src/tests/gp/ponder_nov2017
src/tests/instruction/generic_join
+ src/tests/instruction/generic_reduce
src/tests/instruction/generic_merge
src/tests/instruction/generic_rename
src/tests/tensor/default_value_builder_factory
diff --git a/eval/src/tests/eval/nested_loop/.gitignore b/eval/src/tests/eval/nested_loop/.gitignore
new file mode 100644
index 00000000000..a3c003c3a2d
--- /dev/null
+++ b/eval/src/tests/eval/nested_loop/.gitignore
@@ -0,0 +1 @@
+/eval_nested_loop_bench_app
diff --git a/eval/src/tests/eval/nested_loop/CMakeLists.txt b/eval/src/tests/eval/nested_loop/CMakeLists.txt
new file mode 100644
index 00000000000..ccd751143b1
--- /dev/null
+++ b/eval/src/tests/eval/nested_loop/CMakeLists.txt
@@ -0,0 +1,16 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(eval_nested_loop_test_app TEST
+ SOURCES
+ nested_loop_test.cpp
+ DEPENDS
+ vespaeval
+ GTest::GTest
+)
+vespa_add_test(NAME eval_nested_loop_test_app COMMAND eval_nested_loop_test_app)
+vespa_add_executable(eval_nested_loop_bench_app TEST
+ SOURCES
+ nested_loop_bench.cpp
+ DEPENDS
+ vespaeval
+ GTest::GTest
+)
diff --git a/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp b/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp
new file mode 100644
index 00000000000..c6e2f278cd4
--- /dev/null
+++ b/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp
@@ -0,0 +1,90 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/eval/eval/nested_loop.h>
+#include <vespa/vespalib/util/benchmark_timer.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using vespalib::BenchmarkTimer;
+using vespalib::eval::run_nested_loop;
+
+using LIST = std::vector<size_t>;
+using call_t = void (*)(const LIST &loop, const LIST &stride);
+
+void perform_direct(const LIST &loop, const LIST &stride) {
+ size_t idx1 = 0;
+ size_t expect = 0;
+ for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+ size_t idx2 = idx1;
+ for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) {
+ size_t idx3 = idx2;
+ for (size_t k = 0; k < loop[2]; ++k, idx3 += stride[2]) {
+ assert(idx3 == expect);
+ ++expect;
+ }
+ }
+ }
+ assert(expect == 4096);
+}
+
+void perform_direct_lambda(const LIST &loop, const LIST &stride) {
+ size_t expect = 0;
+ auto fun = [&](size_t idx) {
+ (void) idx;
+ assert(idx == expect);
+ ++expect;
+ };
+ size_t idx1 = 0;
+ for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) {
+ size_t idx2 = idx1;
+ for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) {
+ size_t idx3 = idx2;
+ for (size_t k = 0; k < loop[2]; ++k, idx3 += stride[2]) {
+ fun(idx3);
+ }
+ }
+ }
+ assert(expect == 4096);
+}
+
+void perform_generic(const LIST &loop, const LIST &stride) {
+ size_t expect = 0;
+ auto fun = [&](size_t idx) {
+ (void) idx;
+ assert(idx == expect);
+ ++expect;
+ };
+ run_nested_loop(0, loop, stride, fun);
+ assert(expect == 4096);
+}
+
+void perform_generic_isolate_first(const LIST &loop, const LIST &stride) {
+ size_t expect = 0;
+ auto fun = [&](size_t idx) {
+ (void) idx;
+ assert(idx == expect);
+ ++expect;
+ };
+ run_nested_loop(0, loop, stride, fun, fun);
+ assert(expect == 4096);
+}
+
+void nop() {}
+
+double estimate_cost_us(call_t perform_fun) {
+ LIST loop({16,16,16});
+ LIST stride({256,16,1});
+ return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 100000, 5.0) * 1000.0 * 1000.0;
+}
+
+//-----------------------------------------------------------------------------
+
+TEST(NestedLoopBenchmark, single_loop) {
+ fprintf(stderr, "manual direct single loop: %g us\n", estimate_cost_us(perform_direct));
+ fprintf(stderr, "manual call lambda single loop: %g us\n", estimate_cost_us(perform_direct_lambda));
+ fprintf(stderr, "generic single loop: %g us\n", estimate_cost_us(perform_generic));
+ fprintf(stderr, "generic single loop (isolate first): %g us\n", estimate_cost_us(perform_generic_isolate_first));
+}
+
+//-----------------------------------------------------------------------------
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/eval/src/tests/eval/nested_loop/nested_loop_test.cpp b/eval/src/tests/eval/nested_loop/nested_loop_test.cpp
new file mode 100644
index 00000000000..a72c5ed9cca
--- /dev/null
+++ b/eval/src/tests/eval/nested_loop/nested_loop_test.cpp
@@ -0,0 +1,88 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/eval/eval/nested_loop.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using namespace vespalib::eval;
+
+std::vector<size_t> run_single(size_t idx_in, const std::vector<size_t> &loop, const std::vector<size_t> &stride) {
+ std::vector<size_t> result;
+ auto capture = [&](size_t idx_out) { result.push_back(idx_out); };
+ assert(loop.size() == stride.size());
+ run_nested_loop(idx_in, loop, stride, capture);
+ return result;
+}
+
+std::pair<std::vector<size_t>,std::vector<size_t>> run_single_isolated(size_t idx_in, const std::vector<size_t> &loop, const std::vector<size_t> &stride) {
+ std::vector<size_t> res1;
+ std::vector<size_t> res2;
+ auto capture1 = [&](size_t idx_out) { res1.push_back(idx_out); };
+ auto capture2 = [&](size_t idx_out) { res2.push_back(idx_out); };
+ assert(loop.size() == stride.size());
+ run_nested_loop(idx_in, loop, stride, capture1, capture2);
+ return std::make_pair(res1, res2);
+}
+
+std::vector<std::pair<size_t,size_t>> run_double(size_t idx1_in, size_t idx2_in, const std::vector<size_t> &loop,
+ const std::vector<size_t> &stride1, const std::vector<size_t> &stride2)
+{
+ std::vector<std::pair<size_t,size_t>> result;
+ auto capture = [&](size_t idx1_out, size_t idx2_out) { result.emplace_back(idx1_out, idx2_out); };
+ assert(loop.size() == stride1.size());
+ assert(loop.size() == stride2.size());
+ run_nested_loop(idx1_in, idx2_in, loop, stride1, stride2, capture);
+ return result;
+}
+
+void verify_isolated(size_t idx_in, const std::vector<size_t> &loop, const std::vector<size_t> &stride) {
+ auto full = run_single(idx_in, loop, stride);
+ auto actual = run_single_isolated(idx_in, loop, stride);
+ ASSERT_EQ(actual.first.size(), 1);
+ ASSERT_EQ(actual.second.size(), full.size() - 1);
+ EXPECT_EQ(actual.first[0], full[0]);
+ full.erase(full.begin());
+ EXPECT_EQ(actual.second, full);
+}
+
+void verify_double(size_t idx1_in, size_t idx2_in, const std::vector<size_t> &loop,
+ const std::vector<size_t> &stride1, const std::vector<size_t> &stride2)
+{
+ auto res1 = run_single(idx1_in, loop, stride1);
+ auto res2 = run_single(idx2_in, loop, stride2);
+ ASSERT_EQ(res1.size(), res2.size());
+ std::vector<std::pair<size_t,size_t>> expect;
+ for (size_t i = 0; i < res1.size(); ++i) {
+ expect.emplace_back(res1[i], res2[i]);
+ }
+ auto actual = run_double(idx1_in, idx2_in, loop, stride1, stride2);
+ EXPECT_EQ(actual, expect);
+}
+
+std::vector<size_t> v(std::vector<size_t> vec) { return vec; }
+
+TEST(NestedLoopTest, single_nested_loop_can_be_executed) {
+ EXPECT_EQ(v({123}), run_single(123, {}, {}));
+ EXPECT_EQ(v({10,11}), run_single(10, {2}, {1}));
+ EXPECT_EQ(v({100,110,101,111}), run_single(100, {2,2}, {1,10}));
+ EXPECT_EQ(v({100,110,100,110,101,111,101,111}), run_single(100, {2,2,2}, {1,0,10}));
+ EXPECT_EQ(v({100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115}),
+ run_single(100, {2,2,2,2}, {8,4,2,1}));
+}
+
+TEST(NestedLoopTest, single_nested_loop_with_first_entry_isolated_can_be_executed) {
+ verify_isolated(10, {}, {});
+ verify_isolated(10, {3}, {5});
+ verify_isolated(10, {3,3}, {2,3});
+ verify_isolated(10, {3,3,2}, {2,0,3});
+ verify_isolated(10, {2,3,2,3}, {7,2,1,3});
+}
+
+TEST(NestedLoopTest, double_nested_loop_can_be_executed) {
+ verify_double(10, 20, {}, {}, {});
+ verify_double(10, 20, {3}, {5}, {7});
+ verify_double(10, 20, {3,3}, {2,3}, {7,5});
+ verify_double(10, 20, {3,3,2}, {2,0,3}, {0,7,5});
+ verify_double(10, 20, {2,3,2,3}, {7,2,1,3}, {3,7,5,1});
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/eval/src/tests/instruction/generic_reduce/CMakeLists.txt b/eval/src/tests/instruction/generic_reduce/CMakeLists.txt
new file mode 100644
index 00000000000..f04664ae64c
--- /dev/null
+++ b/eval/src/tests/instruction/generic_reduce/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(eval_generic_reduce_test_app TEST
+ SOURCES
+ generic_reduce_test.cpp
+ DEPENDS
+ vespaeval
+ GTest::GTest
+)
+vespa_add_test(NAME eval_generic_reduce_test_app COMMAND eval_generic_reduce_test_app)
diff --git a/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp b/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp
new file mode 100644
index 00000000000..7e9a059967a
--- /dev/null
+++ b/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp
@@ -0,0 +1,109 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/eval/eval/simple_value.h>
+#include <vespa/eval/eval/value_codec.h>
+#include <vespa/eval/instruction/generic_reduce.h>
+#include <vespa/eval/eval/interpreted_function.h>
+#include <vespa/eval/eval/test/tensor_model.hpp>
+#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/vespalib/gtest/gtest.h>
+#include <optional>
+
+using namespace vespalib;
+using namespace vespalib::eval;
+using namespace vespalib::eval::instruction;
+using namespace vespalib::eval::test;
+
+using vespalib::make_string_short::fmt;
+
+std::vector<Layout> layouts = {
+ {},
+ {x(3)},
+ {x(3),y(5)},
+ {x(3),y(5),z(7)},
+ float_cells({x(3),y(5),z(7)}),
+ {x({"a","b","c"})},
+ {x({"a","b","c"}),y({"foo","bar"})},
+ {x({"a","b","c"}),y({"foo","bar"}),z({"i","j","k","l"})},
+ float_cells({x({"a","b","c"}),y({"foo","bar"}),z({"i","j","k","l"})}),
+ {x(3),y({"foo", "bar"}),z(7)},
+ {x({"a","b","c"}),y(5),z({"i","j","k","l"})},
+ float_cells({x({"a","b","c"}),y(5),z({"i","j","k","l"})})
+};
+
+TensorSpec reference_reduce(const TensorSpec &a, const std::vector<vespalib::string> &dims, Aggr aggr) {
+ Stash stash;
+ ValueType res_type = ValueType::from_spec(a.type()).reduce(dims);
+ EXPECT_FALSE(res_type.is_error());
+ std::map<TensorSpec::Address,std::optional<Aggregator*>> my_map;
+ for (const auto &cell: a.cells()) {
+ TensorSpec::Address addr;
+ for (const auto &dim: cell.first) {
+ if (res_type.dimension_index(dim.first) != ValueType::Dimension::npos) {
+ addr.insert_or_assign(dim.first, dim.second);
+ }
+ }
+ auto [pos, is_empty] = my_map.emplace(addr, std::nullopt);
+ if (is_empty) {
+ pos->second = &Aggregator::create(aggr, stash);
+ pos->second.value()->first(cell.second);
+ } else {
+ pos->second.value()->next(cell.second);
+ }
+ }
+ TensorSpec result(res_type.to_spec());
+ for (const auto &my_entry: my_map) {
+ result.add(my_entry.first, my_entry.second.value()->result());
+ }
+ return result;
+}
+
+TensorSpec perform_generic_reduce(const TensorSpec &a, const std::vector<vespalib::string> &dims, Aggr aggr) {
+ Stash stash;
+ const auto &factory = SimpleValueBuilderFactory::get();
+ auto lhs = value_from_spec(a, factory);
+ auto my_op = GenericReduce::make_instruction(lhs->type(), aggr, dims, factory, stash);
+ InterpretedFunction::EvalSingle single(my_op);
+ return spec_from_value(single.eval(std::vector<Value::CREF>({*lhs})));
+}
+
+TEST(GenericReduceTest, dense_reduce_plan_can_be_created) {
+ auto type = ValueType::from_spec("tensor(a[2],aa{},b[2],bb[1],c[2],cc{},d[2],dd[1],e[2],ee{},f[2])");
+ auto plan = DenseReducePlan(type, type.reduce({"a", "d", "e"}));
+ std::vector<size_t> expect_keep_loop = {4,2};
+ std::vector<size_t> expect_keep_stride = {8,1};
+ std::vector<size_t> expect_reduce_loop = {2,4};
+ std::vector<size_t> expect_reduce_stride = {32,2};
+ EXPECT_EQ(plan.in_size, 64);
+ EXPECT_EQ(plan.out_size, 8);
+ EXPECT_EQ(plan.keep_loop, expect_keep_loop);
+ EXPECT_EQ(plan.keep_stride, expect_keep_stride);
+ EXPECT_EQ(plan.reduce_loop, expect_reduce_loop);
+ EXPECT_EQ(plan.reduce_stride, expect_reduce_stride);
+}
+
+TEST(GenericReduceTest, sparse_reduce_plan_can_be_created) {
+ auto type = ValueType::from_spec("tensor(a{},aa[10],b{},c{},cc[5],d{},e{},ee[1],f{})");
+ auto plan = SparseReducePlan(type, type.reduce({"a", "d", "e"}));
+ std::vector<size_t> expect_keep_dims = {1,2,5};
+ EXPECT_EQ(plan.num_reduce_dims, 3);
+ EXPECT_EQ(plan.keep_dims, expect_keep_dims);
+}
+
+TEST(GenericReduceTest, generic_reduce_works_for_simple_values) {
+ for (const Layout &layout: layouts) {
+ TensorSpec input = spec(layout, Div16(N()));
+ for (Aggr aggr: {Aggr::SUM, Aggr::AVG, Aggr::MIN, Aggr::MAX}) {
+ for (const Domain &domain: layout) {
+ auto expect = reference_reduce(input, {domain.dimension}, aggr);
+ auto actual = perform_generic_reduce(input, {domain.dimension}, aggr);
+ EXPECT_EQ(actual, expect);
+ }
+ auto expect = reference_reduce(input, {}, aggr);
+ auto actual = perform_generic_reduce(input, {}, aggr);
+ EXPECT_EQ(actual, expect);
+ }
+ }
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/eval/src/tests/instruction/generic_rename/generic_rename_test.cpp b/eval/src/tests/instruction/generic_rename/generic_rename_test.cpp
index f61899e4dda..d5c7f58fc1f 100644
--- a/eval/src/tests/instruction/generic_rename/generic_rename_test.cpp
+++ b/eval/src/tests/instruction/generic_rename/generic_rename_test.cpp
@@ -111,12 +111,12 @@ TensorSpec reference_rename(const TensorSpec &a, const FromTo &ft) {
return result;
}
-TensorSpec perform_generic_rename(const TensorSpec &a, const ValueType &res_type,
- const FromTo &ft, const ValueBuilderFactory &factory)
+TensorSpec perform_generic_rename(const TensorSpec &a,
+ const FromTo &ft, const ValueBuilderFactory &factory)
{
Stash stash;
auto lhs = value_from_spec(a, factory);
- auto my_op = GenericRename::make_instruction(lhs->type(), res_type, ft.from, ft.to, factory, stash);
+ auto my_op = GenericRename::make_instruction(lhs->type(), ft.from, ft.to, factory, stash);
InterpretedFunction::EvalSingle single(my_op);
return spec_from_value(single.eval(std::vector<Value::CREF>({*lhs})));
}
@@ -132,7 +132,7 @@ void test_generic_rename(const ValueBuilderFactory &factory) {
// printf("type %s -> %s\n", lhs_type.to_spec().c_str(), renamed_type.to_spec().c_str());
SCOPED_TRACE(fmt("\n===\nLHS: %s\n===\n", lhs.to_string().c_str()));
auto expect = reference_rename(lhs, from_to);
- auto actual = perform_generic_rename(lhs, renamed_type, from_to, factory);
+ auto actual = perform_generic_rename(lhs, from_to, factory);
EXPECT_EQ(actual, expect);
}
}
diff --git a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
index 4771034902b..2e126ea941f 100644
--- a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
+++ b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp
@@ -14,15 +14,17 @@
// improvement in generic dense join and possibly also in sparse join
// with partial dimensional overlap. Benchmarks are done using float
// cells since this is what gives best overall performance in
-// production. Also, we use the multiply operation since it is the
-// most optimized operations across all implementations. When
-// benchmarking different implementations against each other, a smoke
-// test is performed by verifying that all implementations produce the
-// same result.
+// production. Also, we use the multiply operation for join and the
+// sum operation for reduce since those are the most optimized
+// operations across all implementations. When benchmarking different
+// implementations against each other, a smoke test is performed by
+// verifying that all implementations produce the same result.
#include <vespa/eval/eval/simple_value.h>
#include <vespa/eval/eval/interpreted_function.h>
#include <vespa/eval/instruction/generic_join.h>
+#include <vespa/eval/instruction/generic_reduce.h>
+#include <vespa/eval/instruction/generic_rename.h>
#include <vespa/eval/eval/simple_tensor_engine.h>
#include <vespa/eval/eval/tensor_spec.h>
#include <vespa/eval/eval/value_codec.h>
@@ -60,6 +62,8 @@ struct Impl {
virtual Value::UP create_value(const TensorSpec &spec) const = 0;
virtual TensorSpec create_spec(const Value &value) const = 0;
virtual Instruction create_join(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const = 0;
+ virtual Instruction create_reduce(const ValueType &lhs, Aggr aggr, const std::vector<vespalib::string> &dims, Stash &stash) const = 0;
+ virtual Instruction create_rename(const ValueType &lhs, const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to, Stash &stash) const = 0;
virtual const TensorEngine &engine() const { return SimpleTensorEngine::ref(); } // engine used by EvalSingle
virtual ~Impl() {}
};
@@ -73,6 +77,12 @@ struct ValueImpl : Impl {
Instruction create_join(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const override {
return GenericJoin::make_instruction(lhs, rhs, function, my_factory, stash);
}
+ Instruction create_reduce(const ValueType &lhs, Aggr aggr, const std::vector<vespalib::string> &dims, Stash &stash) const override {
+ return GenericReduce::make_instruction(lhs, aggr, dims, my_factory, stash);
+ }
+ Instruction create_rename(const ValueType &lhs, const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to, Stash &stash) const override {
+ return GenericRename::make_instruction(lhs, from, to, my_factory, stash);
+ }
};
struct EngineImpl : Impl {
@@ -82,12 +92,24 @@ struct EngineImpl : Impl {
Value::UP create_value(const TensorSpec &spec) const override { return my_engine.from_spec(spec); }
TensorSpec create_spec(const Value &value) const override { return my_engine.to_spec(value); }
Instruction create_join(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const override {
- // create a complete tensor function joining two parameters, but only compile the join instruction itself
+ // create a complete tensor function, but only compile the relevant instruction
const auto &lhs_node = tensor_function::inject(lhs, 0, stash);
const auto &rhs_node = tensor_function::inject(rhs, 1, stash);
const auto &join_node = tensor_function::join(lhs_node, rhs_node, function, stash);
return join_node.compile_self(my_engine, stash);
}
+ Instruction create_reduce(const ValueType &lhs, Aggr aggr, const std::vector<vespalib::string> &dims, Stash &stash) const override {
+ // create a complete tensor function, but only compile the relevant instruction
+ const auto &lhs_node = tensor_function::inject(lhs, 0, stash);
+ const auto &reduce_node = tensor_function::reduce(lhs_node, aggr, dims, stash);
+ return reduce_node.compile_self(my_engine, stash);
+ }
+ Instruction create_rename(const ValueType &lhs, const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to, Stash &stash) const override {
+ // create a complete tensor function, but only compile the relevant instruction
+ const auto &lhs_node = tensor_function::inject(lhs, 0, stash);
+ const auto &rename_node = tensor_function::rename(lhs_node, from, to, stash);
+ return rename_node.compile_self(my_engine, stash);
+ }
const TensorEngine &engine() const override { return my_engine; }
};
@@ -100,7 +122,11 @@ ValueImpl packed_mixed_tensor_impl(2, " PackedMixedTensor", " Packed", Pack
ValueImpl default_tensor_value_impl(0, " DefaultValue", "NEW PROD", DefaultValueBuilderFactory::get());
vespalib::string short_header("--------");
-double budget = 5.0;
+constexpr double budget = 5.0;
+constexpr double best_limit = 0.95; // everything within 95% of best performance gets a star
+constexpr double bad_limit = 0.90; // BAD: new prod has performance lower than 90% of old prod
+constexpr double good_limit = 1.10; // GOOD: new prod has performance higher than 110% of old prod
+
std::vector<CREF<Impl>> impl_list = {simple_tensor_engine_impl,
default_tensor_engine_impl,
simple_value_impl,
@@ -117,25 +143,25 @@ struct BenchmarkHeader {
short_names[impl.order] = impl.short_name;
}
}
+ void print_header(const vespalib::string &desc) const {
+ for (const auto &name: short_names) {
+ fprintf(stderr, "|%s", name.c_str());
+ }
+ fprintf(stderr, "| %s Benchmark cases\n", desc.c_str());
+ }
void print_trailer() const {
for (size_t i = 0; i < short_names.size(); ++i) {
fprintf(stderr, "+%s", short_header.c_str());
}
fprintf(stderr, "+------------------------------------------------\n");
}
- void print() const {
- for (const auto &name: short_names) {
- fprintf(stderr, "|%s", name.c_str());
- }
- fprintf(stderr, "| Benchmark description\n");
- print_trailer();
- }
};
struct BenchmarkResult {
vespalib::string desc;
std::optional<double> ref_time;
std::vector<double> relative_perf;
+ double star_rating;
BenchmarkResult(const vespalib::string &desc_in, size_t num_values)
: desc(desc_in), ref_time(std::nullopt), relative_perf(num_values, 0.0) {}
~BenchmarkResult();
@@ -150,13 +176,20 @@ struct BenchmarkResult {
}
}
void normalize() {
+ star_rating = 0.0;
for (double &perf: relative_perf) {
perf = ref_time.value() / perf;
+ star_rating = std::max(star_rating, perf);
}
+ star_rating *= best_limit;
}
void print() const {
for (double perf: relative_perf) {
- fprintf(stderr, "|%8.2f", perf);
+ if (perf > star_rating) {
+ fprintf(stderr, "|*%7.2f", perf);
+ } else {
+ fprintf(stderr, "| %7.2f", perf);
+ }
}
fprintf(stderr, "| %s\n", desc.c_str());
}
@@ -239,6 +272,45 @@ void benchmark_join(const vespalib::string &desc, const TensorSpec &lhs,
//-----------------------------------------------------------------------------
+void benchmark_reduce(const vespalib::string &desc, const TensorSpec &lhs,
+ Aggr aggr, const std::vector<vespalib::string> &dims)
+{
+ Stash stash;
+ ValueType lhs_type = ValueType::from_spec(lhs.type());
+ ValueType res_type = lhs_type.reduce(dims);
+ ASSERT_FALSE(lhs_type.is_error());
+ ASSERT_FALSE(res_type.is_error());
+ std::vector<EvalOp::UP> list;
+ for (const Impl &impl: impl_list) {
+ auto op = impl.create_reduce(lhs_type, aggr, dims, stash);
+ std::vector<CREF<TensorSpec>> stack_spec({lhs});
+ list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl));
+ }
+ benchmark(desc, list);
+}
+
+//-----------------------------------------------------------------------------
+
+void benchmark_rename(const vespalib::string &desc, const TensorSpec &lhs,
+ const std::vector<vespalib::string> &from,
+ const std::vector<vespalib::string> &to)
+{
+ Stash stash;
+ ValueType lhs_type = ValueType::from_spec(lhs.type());
+ ValueType res_type = lhs_type.rename(from, to);
+ ASSERT_FALSE(lhs_type.is_error());
+ ASSERT_FALSE(res_type.is_error());
+ std::vector<EvalOp::UP> list;
+ for (const Impl &impl: impl_list) {
+ auto op = impl.create_rename(lhs_type, from, to, stash);
+ std::vector<CREF<TensorSpec>> stack_spec({lhs});
+ list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl));
+ }
+ benchmark(desc, list);
+}
+
+//-----------------------------------------------------------------------------
+
struct D {
vespalib::string name;
bool mapped;
@@ -281,6 +353,7 @@ template <typename ...Ds> TensorSpec make_spec(double seq, const Ds &...ds) {
}
TensorSpec make_vector(const D &d1, double seq) { return make_spec(seq, d1); }
+TensorSpec make_matrix(const D &d1, const D &d2, double seq) { return make_spec(seq, d1, d2); }
TensorSpec make_cube(const D &d1, const D &d2, const D &d3, double seq) { return make_spec(seq, d1, d2, d3); }
//-----------------------------------------------------------------------------
@@ -392,15 +465,91 @@ TEST(MixedJoin, no_overlap) {
//-----------------------------------------------------------------------------
-TEST(PrintResults, print_results) {
+TEST(ReduceBench, dense_reduce) {
+ auto lhs = make_cube(D::idx("a", 16), D::idx("b", 16), D::idx("c", 16), 1.0);
+ benchmark_reduce("dense reduce inner", lhs, Aggr::SUM, {"c"});
+ benchmark_reduce("dense reduce middle", lhs, Aggr::SUM, {"b"});
+ benchmark_reduce("dense reduce outer", lhs, Aggr::SUM, {"a"});
+ benchmark_reduce("dense multi-reduce inner", lhs, Aggr::SUM, {"b", "c"});
+ benchmark_reduce("dense multi-reduce outer", lhs, Aggr::SUM, {"a", "b"});
+ benchmark_reduce("dense multi-reduce outer-inner", lhs, Aggr::SUM, {"a", "c"});
+ benchmark_reduce("dense reduce all", lhs, Aggr::SUM, {});
+}
+
+TEST(ReduceBench, sparse_reduce) {
+ auto lhs = make_cube(D::map("a", 16, 1), D::map("b", 16, 1), D::map("c", 16, 1), 1.0);
+ benchmark_reduce("sparse reduce inner", lhs, Aggr::SUM, {"c"});
+ benchmark_reduce("sparse reduce middle", lhs, Aggr::SUM, {"b"});
+ benchmark_reduce("sparse reduce outer", lhs, Aggr::SUM, {"a"});
+ benchmark_reduce("sparse multi-reduce inner", lhs, Aggr::SUM, {"b", "c"});
+ benchmark_reduce("sparse multi-reduce outer", lhs, Aggr::SUM, {"a", "b"});
+ benchmark_reduce("sparse multi-reduce outer-inner", lhs, Aggr::SUM, {"a", "c"});
+ benchmark_reduce("sparse reduce all", lhs, Aggr::SUM, {});
+}
+
+TEST(ReduceBench, mixed_reduce) {
+ auto lhs = make_spec(1.0, D::map("a", 4, 1), D::map("b", 4, 1), D::map("c", 4, 1),
+ D::idx("d", 4), D::idx("e", 4), D::idx("f", 4));
+ benchmark_reduce("mixed reduce middle dense", lhs, Aggr::SUM, {"e"});
+ benchmark_reduce("mixed reduce middle sparse", lhs, Aggr::SUM, {"b"});
+ benchmark_reduce("mixed reduce middle sparse/dense", lhs, Aggr::SUM, {"b", "e"});
+ benchmark_reduce("mixed reduce all dense", lhs, Aggr::SUM, {"d", "e", "f"});
+ benchmark_reduce("mixed reduce all sparse", lhs, Aggr::SUM, {"a", "b", "c"});
+ benchmark_reduce("mixed reduce all", lhs, Aggr::SUM, {});
+}
+
+//-----------------------------------------------------------------------------
+
+TEST(RenameBench, dense_rename) {
+ auto lhs = make_matrix(D::idx("a", 16), D::idx("b", 16), 1.0);
+ benchmark_rename("dense transpose", lhs, {"a", "b"}, {"b", "a"});
+}
+
+TEST(RenameBench, sparse_rename) {
+ auto lhs = make_matrix(D::map("a", 16, 1), D::map("b", 16, 1), 1.0);
+ benchmark_rename("sparse transpose", lhs, {"a", "b"}, {"b", "a"});
+}
+
+TEST(RenameBench, mixed_rename) {
+ auto lhs = make_spec(1.0, D::map("a", 8, 1), D::map("b", 8, 1), D::idx("c", 8), D::idx("d", 8));
+ benchmark_rename("mixed multi-transpose", lhs, {"a", "b", "c", "d"}, {"b", "a", "d", "c"});
+}
+
+//-----------------------------------------------------------------------------
+
+void print_results(const vespalib::string &desc, const std::vector<BenchmarkResult> &results) {
+ if (results.empty()) {
+ return;
+ }
BenchmarkHeader header;
+ header.print_trailer();
+ header.print_header(desc);
+ header.print_trailer();
+ for (const auto &result: results) {
+ result.print();
+ }
+ header.print_trailer();
+}
+
+TEST(PrintResults, print_results) {
+ std::vector<BenchmarkResult> bad_results;
+ std::vector<BenchmarkResult> neutral_results;
+ std::vector<BenchmarkResult> good_results;
std::sort(benchmark_results.begin(), benchmark_results.end(),
[](const auto &a, const auto &b){ return (a.relative_perf[0] < b.relative_perf[0]); });
- header.print();
for (const auto &result: benchmark_results) {
- result.print();
+ double perf = result.relative_perf[0];
+ if (perf < bad_limit) {
+ bad_results.push_back(result);
+ } else if (perf > good_limit) {
+ good_results.push_back(result);
+ } else {
+ neutral_results.push_back(result);
+ }
}
- header.print_trailer();
+ print_results("BAD", bad_results);
+ print_results("NEUTRAL", neutral_results);
+ print_results("GOOD", good_results);
}
GTEST_MAIN_RUN_ALL_TESTS()
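
As an aside, the relative-performance scoring introduced above (normalize() plus the BAD/NEUTRAL/GOOD split) can be illustrated with a small standalone sketch; the numbers below are invented for illustration and are not part of this patch:

// sketch of the scoring scheme used by the benchmark (illustrative only)
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    double ref_time = 10.0;                        // time used by the reference impl
    std::vector<double> times = {10.0, 8.0, 12.5}; // measured time per impl
    std::vector<double> perf;
    double best = 0.0;
    for (double t: times) {
        perf.push_back(ref_time / t);              // > 1.0 means faster than reference
        best = std::max(best, perf.back());
    }
    double star_rating = best * 0.95;              // within 95% of best gets a star
    for (double p: perf) {
        fprintf(stderr, "|%s%7.2f", (p > star_rating) ? "*" : " ", p);
    }
    fprintf(stderr, "|\n");
    // with perf[0] tracking new prod: below 0.90 => BAD, above 1.10 => GOOD,
    // anything in between => NEUTRAL
    return 0;
}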
diff --git a/eval/src/vespa/eval/eval/nested_loop.h b/eval/src/vespa/eval/eval/nested_loop.h
new file mode 100644
index 00000000000..2fe98fb1e9b
--- /dev/null
+++ b/eval/src/vespa/eval/eval/nested_loop.h
@@ -0,0 +1,143 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vector>
+
+namespace vespalib::eval {
+
+// This file contains implementations for the generic nested loops
+// used by DenseJoinPlan, DenseReducePlan and similar. The loops act
+// like arbitrarily nested index-based for-loops, where each loop
+// level has its own stride that modifies the overall index. An
+// initial index is passed to the top-level function, is then
+// modified by each loop layer, and is finally passed back to a
+// callable for each iteration of the innermost loop. There are different
+// implementations for traversing different numbers of index spaces in
+// parallel. Note that all loop layers must have at least 1 iteration.
+
+namespace nested_loop {
+
+//-----------------------------------------------------------------------------
+
+template <typename F, size_t N> void execute_few(size_t idx, const size_t *loop, const size_t *stride, const F &f) {
+ if constexpr (N == 0) {
+ f(idx);
+ } else {
+ for (size_t i = 0; i < *loop; ++i, idx += *stride) {
+ execute_few<F, N - 1>(idx, loop + 1, stride + 1, f);
+ }
+ }
+}
+
+template <typename F> void execute_many(size_t idx, const size_t *loop, const size_t *stride, size_t levels, const F &f) {
+ for (size_t i = 0; i < *loop; ++i, idx += *stride) {
+ if ((levels - 1) == 3) {
+ execute_few<F, 3>(idx, loop + 1, stride + 1, f);
+ } else {
+ execute_many<F>(idx, loop + 1, stride + 1, levels - 1, f);
+ }
+ }
+}
+
+//-----------------------------------------------------------------------------
+
+template <typename FIRST, typename NEXT, size_t N>
+void execute_few(size_t idx, const size_t *loop, const size_t *stride, const FIRST &first, const NEXT &next) {
+ if constexpr (N == 0) {
+ first(idx);
+ } else {
+ execute_few<FIRST, NEXT, N - 1>(idx, loop + 1, stride + 1, first, next);
+ idx += *stride;
+ for (size_t i = 1; i < *loop; ++i, idx += *stride) {
+ execute_few<NEXT, N - 1>(idx, loop + 1, stride + 1, next);
+ }
+ }
+}
+
+template <typename FIRST, typename NEXT>
+void execute_many(size_t idx, const size_t *loop, const size_t *stride, size_t levels, const FIRST &first, const NEXT &next) {
+ if ((levels - 1) == 3) {
+ execute_few<FIRST, NEXT, 3>(idx, loop + 1, stride + 1, first, next);
+ } else {
+ execute_many<FIRST, NEXT>(idx, loop + 1, stride + 1, levels - 1, first, next);
+ }
+ idx += *stride;
+ for (size_t i = 1; i < *loop; ++i, idx += *stride) {
+ if ((levels - 1) == 3) {
+ execute_few<NEXT, 3>(idx, loop + 1, stride + 1, next);
+ } else {
+ execute_many<NEXT>(idx, loop + 1, stride + 1, levels - 1, next);
+ }
+ }
+}
+
+//-----------------------------------------------------------------------------
+
+template <typename F, size_t N> void execute_few(size_t idx1, size_t idx2, const size_t *loop, const size_t *stride1, const size_t *stride2, const F &f) {
+ if constexpr (N == 0) {
+ f(idx1, idx2);
+ } else {
+ for (size_t i = 0; i < *loop; ++i, idx1 += *stride1, idx2 += *stride2) {
+ execute_few<F, N - 1>(idx1, idx2, loop + 1, stride1 + 1, stride2 + 1, f);
+ }
+ }
+}
+
+template <typename F> void execute_many(size_t idx1, size_t idx2, const size_t *loop, const size_t *stride1, const size_t *stride2, size_t levels, const F &f) {
+ for (size_t i = 0; i < *loop; ++i, idx1 += *stride1, idx2 += *stride2) {
+ if ((levels - 1) == 3) {
+ execute_few<F, 3>(idx1, idx2, loop + 1, stride1 + 1, stride2 + 1, f);
+ } else {
+ execute_many<F>(idx1, idx2, loop + 1, stride1 + 1, stride2 + 1, levels - 1, f);
+ }
+ }
+}
+
+//-----------------------------------------------------------------------------
+
+} // implementation details
+
+// Run a nested loop and pass indexes to 'f'
+template <typename F>
+void run_nested_loop(size_t idx, const std::vector<size_t> &loop, const std::vector<size_t> &stride, const F &f) {
+ size_t levels = loop.size();
+ switch(levels) {
+ case 0: return f(idx);
+ case 1: return nested_loop::execute_few<F, 1>(idx, &loop[0], &stride[0], f);
+ case 2: return nested_loop::execute_few<F, 2>(idx, &loop[0], &stride[0], f);
+ case 3: return nested_loop::execute_few<F, 3>(idx, &loop[0], &stride[0], f);
+ default: return nested_loop::execute_many<F>(idx, &loop[0], &stride[0], levels, f);
+ }
+}
+
+// Run a nested loop and pass the first index to 'first' and all
+// subsequent indexes to 'next'
+template <typename FIRST, typename NEXT>
+void run_nested_loop(size_t idx, const std::vector<size_t> &loop, const std::vector<size_t> &stride, const FIRST &first, const NEXT &next) {
+ size_t levels = loop.size();
+ switch(levels) {
+ case 0: return first(idx);
+ case 1: return nested_loop::execute_few<FIRST, NEXT, 1>(idx, &loop[0], &stride[0], first, next);
+ case 2: return nested_loop::execute_few<FIRST, NEXT, 2>(idx, &loop[0], &stride[0], first, next);
+ case 3: return nested_loop::execute_few<FIRST, NEXT, 3>(idx, &loop[0], &stride[0], first, next);
+ default: return nested_loop::execute_many<FIRST, NEXT>(idx, &loop[0], &stride[0], levels, first, next);
+ }
+}
+
+// Run two nested loops in parallel and pass both indexes to 'f'. Note
+// that 'loop' is shared; the two traversals may differ only in their
+// strides.
+template <typename F>
+void run_nested_loop(size_t idx1, size_t idx2, const std::vector<size_t> &loop, const std::vector<size_t> &stride1, const std::vector<size_t> &stride2, const F &f) {
+ size_t levels = loop.size();
+ switch(levels) {
+ case 0: return f(idx1, idx2);
+ case 1: return nested_loop::execute_few<F, 1>(idx1, idx2, &loop[0], &stride1[0], &stride2[0], f);
+ case 2: return nested_loop::execute_few<F, 2>(idx1, idx2, &loop[0], &stride1[0], &stride2[0], f);
+ case 3: return nested_loop::execute_few<F, 3>(idx1, idx2, &loop[0], &stride1[0], &stride2[0], f);
+ default: return nested_loop::execute_many<F>(idx1, idx2, &loop[0], &stride1[0], &stride2[0], levels, f);
+ }
+}
+
+} // namespace
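
To make the new API concrete, here is a minimal usage sketch (assuming the header above is on the include path); it exercises both the single-index and the parallel two-index variants over a 2x3 loop layout, and is not part of the patch itself:

// minimal usage sketch for run_nested_loop
#include <vespa/eval/eval/nested_loop.h>
#include <cstdio>
#include <vector>

int main() {
    std::vector<size_t> loop({2, 3});
    std::vector<size_t> stride({3, 1});
    // single index space: visits 0,1,2,3,4,5
    vespalib::eval::run_nested_loop(0, loop, stride,
                                    [](size_t idx) { printf("%zu ", idx); });
    printf("\n");
    // two index spaces sharing the loop layout (as in a dense join); with
    // strides {1,2} the second index visits 0,2,4,1,3,5 in step with the first
    std::vector<size_t> stride2({1, 2});
    vespalib::eval::run_nested_loop(0, 0, loop, stride, stride2,
                                    [](size_t i1, size_t i2) { printf("(%zu,%zu) ", i1, i2); });
    printf("\n");
    return 0;
}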
diff --git a/eval/src/vespa/eval/instruction/CMakeLists.txt b/eval/src/vespa/eval/instruction/CMakeLists.txt
index 71d08f601dd..91ff4fd63dc 100644
--- a/eval/src/vespa/eval/instruction/CMakeLists.txt
+++ b/eval/src/vespa/eval/instruction/CMakeLists.txt
@@ -3,6 +3,7 @@
vespa_add_library(eval_instruction OBJECT
SOURCES
generic_join
+ generic_reduce
generic_merge
generic_rename
)
diff --git a/eval/src/vespa/eval/instruction/generic_join.h b/eval/src/vespa/eval/instruction/generic_join.h
index 25647452dff..0f121104d5e 100644
--- a/eval/src/vespa/eval/instruction/generic_join.h
+++ b/eval/src/vespa/eval/instruction/generic_join.h
@@ -2,6 +2,7 @@
#pragma once
+#include <vespa/eval/eval/nested_loop.h>
#include <vespa/eval/eval/value_type.h>
#include <vespa/eval/eval/interpreted_function.h>
@@ -37,34 +38,8 @@ struct DenseJoinPlan {
std::vector<size_t> rhs_stride;
DenseJoinPlan(const ValueType &lhs_type, const ValueType &rhs_type);
~DenseJoinPlan();
- template <typename F> void execute(size_t lhs, size_t rhs, F &&f) const {
- switch(loops_left(0)) {
- case 0: return execute_few<F, 0>(0, lhs, rhs, std::forward<F>(f));
- case 1: return execute_few<F, 1>(0, lhs, rhs, std::forward<F>(f));
- case 2: return execute_few<F, 2>(0, lhs, rhs, std::forward<F>(f));
- case 3: return execute_few<F, 3>(0, lhs, rhs, std::forward<F>(f));
- default: return execute_many<F>(0, lhs, rhs, std::forward<F>(f));
- }
- }
-private:
- size_t loops_left(size_t idx) const { return (loop_cnt.size() - idx); }
- template <typename F, size_t N> void execute_few(size_t idx, size_t lhs, size_t rhs, F &&f) const {
- if constexpr (N == 0) {
- f(lhs, rhs);
- } else {
- for (size_t i = 0; i < loop_cnt[idx]; ++i, lhs += lhs_stride[idx], rhs += rhs_stride[idx]) {
- execute_few<F, N - 1>(idx + 1, lhs, rhs, std::forward<F>(f));
- }
- }
- }
- template <typename F> void execute_many(size_t idx, size_t lhs, size_t rhs, F &&f) const {
- for (size_t i = 0; i < loop_cnt[idx]; ++i, lhs += lhs_stride[idx], rhs += rhs_stride[idx]) {
- if (loops_left(idx + 1) == 3) {
- execute_few<F, 3>(idx + 1, lhs, rhs, std::forward<F>(f));
- } else {
- execute_many<F>(idx + 1, lhs, rhs, std::forward<F>(f));
- }
- }
+ template <typename F> void execute(size_t lhs, size_t rhs, const F &f) const {
+ run_nested_loop(lhs, rhs, loop_cnt, lhs_stride, rhs_stride, f);
}
};
diff --git a/eval/src/vespa/eval/instruction/generic_reduce.cpp b/eval/src/vespa/eval/instruction/generic_reduce.cpp
new file mode 100644
index 00000000000..d294b478210
--- /dev/null
+++ b/eval/src/vespa/eval/instruction/generic_reduce.cpp
@@ -0,0 +1,219 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "generic_reduce.h"
+#include <vespa/eval/eval/value.h>
+#include <vespa/vespalib/util/stash.h>
+#include <vespa/vespalib/util/typify.h>
+#include <cassert>
+
+namespace vespalib::eval::instruction {
+
+using State = InterpretedFunction::State;
+using Instruction = InterpretedFunction::Instruction;
+
+namespace {
+
+//-----------------------------------------------------------------------------
+
+template <typename T, typename IN> uint64_t wrap_param(const IN &value_in) {
+ const T &value = value_in;
+ static_assert(sizeof(uint64_t) == sizeof(&value));
+ return (uint64_t)&value;
+}
+
+template <typename T> const T &unwrap_param(uint64_t param) {
+ return *((const T *)param);
+}
+
+//-----------------------------------------------------------------------------
+
+struct ReduceParam {
+ ValueType res_type;
+ SparseReducePlan sparse_plan;
+ DenseReducePlan dense_plan;
+ const ValueBuilderFactory &factory;
+ ReduceParam(const ValueType &type, const std::vector<vespalib::string> &dimensions,
+ const ValueBuilderFactory &factory_in)
+ : res_type(type.reduce(dimensions)),
+ sparse_plan(type, res_type),
+ dense_plan(type, res_type),
+ factory(factory_in)
+ {
+ assert(!res_type.is_error());
+ assert(dense_plan.in_size == type.dense_subspace_size());
+ assert(dense_plan.out_size == res_type.dense_subspace_size());
+ }
+ ~ReduceParam();
+};
+ReduceParam::~ReduceParam() = default;
+
+//-----------------------------------------------------------------------------
+
+struct SparseReduceState {
+
+ // TODO(havardpe): Optimize using vespalib::hash_map with
+ // reference to slice of external vector of vespalib::stringref as
+ // key. Track matching subspaces as linked lists contained in a
+ // single shared vector.
+
+ using MapKey = std::vector<vespalib::string>;
+ using MapValue = std::vector<size_t>;
+ using Map = std::map<MapKey,MapValue>;
+
+ Map map;
+ std::vector<vespalib::stringref> full_address;
+ std::vector<vespalib::stringref*> fetch_address;
+ std::vector<vespalib::stringref*> keep_address;
+ size_t subspace;
+
+ SparseReduceState(const SparseReducePlan &plan)
+ : full_address(plan.keep_dims.size() + plan.num_reduce_dims),
+ fetch_address(full_address.size(), nullptr),
+ keep_address(plan.keep_dims.size(), nullptr)
+ {
+ for (size_t i = 0; i < keep_address.size(); ++i) {
+ keep_address[i] = &full_address[plan.keep_dims[i]];
+ }
+ for (size_t i = 0; i < full_address.size(); ++i) {
+ fetch_address[i] = &full_address[i];
+ }
+ }
+ ~SparseReduceState();
+ void populate_map(std::unique_ptr<Value::Index::View> full_view) {
+ full_view->lookup({});
+ while (full_view->next_result(fetch_address, subspace)) {
+ std::vector<vespalib::string> key;
+ key.reserve(keep_address.size());
+ for (const vespalib::stringref *label: keep_address) {
+ key.emplace_back(*label);
+ }
+ map[key].push_back(subspace);
+ }
+ }
+ template <typename T>
+ std::vector<vespalib::stringref> make_addr(const T &map_entry) {
+ std::vector<vespalib::stringref> addr;
+ addr.reserve(map_entry.first.size());
+ for (const vespalib::string &label: map_entry.first) {
+ addr.emplace_back(label);
+ }
+ return addr;
+ }
+};
+SparseReduceState::~SparseReduceState() = default;
+
+template <typename ICT, typename OCT, typename AGGR>
+void my_generic_reduce_op(State &state, uint64_t param_in) {
+ const auto &param = unwrap_param<ReduceParam>(param_in);
+ SparseReduceState sparse(param.sparse_plan);
+ const Value &value = state.peek(0);
+ sparse.populate_map(value.index().create_view({}));
+ auto cells = value.cells().typify<ICT>();
+ AGGR aggr;
+ auto first = [&](size_t idx) { aggr.first(cells[idx]); };
+ auto next = [&](size_t idx) { aggr.next(cells[idx]); };
+ auto builder = param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.keep_dims.size(), param.dense_plan.out_size, sparse.map.size());
+ for (const auto &entry: sparse.map) {
+ OCT *dst = builder->add_subspace(sparse.make_addr(entry)).begin();
+ auto reduce_cells = [&](size_t rel_idx)
+ {
+ auto pos = entry.second.begin();
+ param.dense_plan.execute_reduce((*pos * param.dense_plan.in_size) + rel_idx, first, next);
+ for (++pos; pos != entry.second.end(); ++pos) {
+ param.dense_plan.execute_reduce((*pos * param.dense_plan.in_size) + rel_idx, next);
+ }
+ *dst++ = aggr.result();
+ };
+ param.dense_plan.execute_keep(reduce_cells);
+ }
+ auto &result = state.stash.create<std::unique_ptr<Value>>(builder->build(std::move(builder)));
+ const Value &result_ref = *(result.get());
+ state.pop_push(result_ref);
+};
+
+struct SelectGenericReduceOp {
+ template <typename ICT, typename OCT, typename AGGR> static auto invoke() {
+ return my_generic_reduce_op<ICT, OCT, typename AGGR::template templ<ICT>>;
+ }
+};
+
+//-----------------------------------------------------------------------------
+
+} // namespace <unnamed>
+
+//-----------------------------------------------------------------------------
+
+DenseReducePlan::DenseReducePlan(const ValueType &type, const ValueType &res_type)
+ : in_size(1),
+ out_size(1),
+ keep_loop(),
+ keep_stride(),
+ reduce_loop(),
+ reduce_stride()
+{
+ std::vector<bool> keep;
+ std::vector<size_t> size;
+ for (const auto &dim: type.nontrivial_indexed_dimensions()) {
+ keep.push_back(res_type.dimension_index(dim.name) != ValueType::Dimension::npos);
+ size.push_back(dim.size);
+ }
+ std::vector<size_t> stride(size.size(), 0);
+ for (size_t i = stride.size(); i-- > 0; ) {
+ stride[i] = in_size;
+ in_size *= size[i];
+ if (keep[i]) {
+ out_size *= size[i];
+ }
+ }
+ int prev_case = 2;
+ for (size_t i = 0; i < size.size(); ++i) {
+ int my_case = keep[i] ? 1 : 0;
+ auto &my_loop = keep[i] ? keep_loop : reduce_loop;
+ auto &my_stride = keep[i] ? keep_stride : reduce_stride;
+ if (my_case == prev_case) {
+ assert(!my_loop.empty());
+ my_loop.back() *= size[i];
+ my_stride.back() = stride[i];
+ } else {
+ my_loop.push_back(size[i]);
+ my_stride.push_back(stride[i]);
+ }
+ prev_case = my_case;
+ }
+}
+
+DenseReducePlan::~DenseReducePlan() = default;
+
+//-----------------------------------------------------------------------------
+
+SparseReducePlan::SparseReducePlan(const ValueType &type, const ValueType &res_type)
+ : num_reduce_dims(0),
+ keep_dims()
+{
+ auto dims = type.mapped_dimensions();
+ for (size_t i = 0; i < dims.size(); ++i) {
+ bool keep = (res_type.dimension_index(dims[i].name) != ValueType::Dimension::npos);
+ if (keep) {
+ keep_dims.push_back(i);
+ } else {
+ ++num_reduce_dims;
+ }
+ }
+}
+
+SparseReducePlan::~SparseReducePlan() = default;
+
+//-----------------------------------------------------------------------------
+
+using ReduceTypify = TypifyValue<TypifyCellType,TypifyAggr>;
+
+Instruction
+GenericReduce::make_instruction(const ValueType &type, Aggr aggr, const std::vector<vespalib::string> &dimensions,
+ const ValueBuilderFactory &factory, Stash &stash)
+{
+ auto &param = stash.create<ReduceParam>(type, dimensions, factory);
+ auto fun = typify_invoke<3,ReduceTypify,SelectGenericReduceOp>(type.cell_type(), param.res_type.cell_type(), aggr);
+ return Instruction(fun, wrap_param<ReduceParam>(param));
+}
+
+} // namespace
diff --git a/eval/src/vespa/eval/instruction/generic_reduce.h b/eval/src/vespa/eval/instruction/generic_reduce.h
new file mode 100644
index 00000000000..e412f8795ac
--- /dev/null
+++ b/eval/src/vespa/eval/instruction/generic_reduce.h
@@ -0,0 +1,54 @@
+// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/eval/eval/value_type.h>
+#include <vespa/eval/eval/aggr.h>
+#include <vespa/eval/eval/interpreted_function.h>
+#include <vespa/eval/eval/nested_loop.h>
+
+namespace vespalib { class Stash; }
+namespace vespalib::eval { struct ValueBuilderFactory; }
+
+namespace vespalib::eval::instruction {
+
+//-----------------------------------------------------------------------------
+
+struct GenericReduce {
+ static InterpretedFunction::Instruction
+ make_instruction(const ValueType &type, Aggr aggr, const std::vector<vespalib::string> &dimensions,
+ const ValueBuilderFactory &factory, Stash &stash);
+};
+
+//-----------------------------------------------------------------------------
+
+struct DenseReducePlan {
+ size_t in_size;
+ size_t out_size;
+ std::vector<size_t> keep_loop;
+ std::vector<size_t> keep_stride;
+ std::vector<size_t> reduce_loop;
+ std::vector<size_t> reduce_stride;
+ DenseReducePlan(const ValueType &type, const ValueType &res_type);
+ ~DenseReducePlan();
+ template <typename F> void execute_keep(const F &f) const {
+ run_nested_loop(0, keep_loop, keep_stride, f);
+ }
+ template <typename F> void execute_reduce(size_t offset, const F &f) const {
+ run_nested_loop(offset, reduce_loop, reduce_stride, f);
+ }
+ template <typename FIRST, typename NEXT> void execute_reduce(size_t offset, const FIRST &first, const NEXT &next) const {
+ run_nested_loop(offset, reduce_loop, reduce_stride, first, next);
+ }
+};
+
+struct SparseReducePlan {
+ size_t num_reduce_dims;
+ std::vector<size_t> keep_dims;
+ SparseReducePlan(const ValueType &type, const ValueType &res_type);
+ ~SparseReducePlan();
+};
+
+//-----------------------------------------------------------------------------
+
+} // namespace
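
For intuition, the following standalone sketch replays the dimension folding performed by the DenseReducePlan constructor on the nontrivial dense dimensions from the unit test above (tensor(a[2],b[2],c[2],d[2],e[2],f[2]) with {a,d,e} reduced); it mirrors the logic in generic_reduce.cpp but is illustrative only, not the production code:

// standalone sketch mirroring DenseReducePlan's dimension folding
#include <cstdio>
#include <vector>

int main() {
    // dims a,b,c,d,e,f of size 2; reducing {a,d,e} keeps b,c,f
    std::vector<size_t> size = {2, 2, 2, 2, 2, 2};
    std::vector<bool> keep = {false, true, true, false, false, true};
    std::vector<size_t> stride(size.size(), 0);
    size_t in_size = 1;
    for (size_t i = stride.size(); i-- > 0; ) {
        stride[i] = in_size; // innermost dimension has stride 1
        in_size *= size[i];
    }
    std::vector<size_t> keep_loop, keep_stride, reduce_loop, reduce_stride;
    int prev_case = 2; // neither keep (1) nor reduce (0)
    for (size_t i = 0; i < size.size(); ++i) {
        int my_case = keep[i] ? 1 : 0;
        auto &my_loop = keep[i] ? keep_loop : reduce_loop;
        auto &my_stride = keep[i] ? keep_stride : reduce_stride;
        if (my_case == prev_case) { // fold adjacent dims of the same kind
            my_loop.back() *= size[i];
            my_stride.back() = stride[i];
        } else {
            my_loop.push_back(size[i]);
            my_stride.push_back(stride[i]);
        }
        prev_case = my_case;
    }
    // prints keep_loop: 4 2, keep_stride: 8 1, reduce_loop: 2 4,
    // reduce_stride: 32 2 -- matching the expectations in the unit test
    auto dump = [](const char *name, const std::vector<size_t> &v) {
        fprintf(stderr, "%s:", name);
        for (size_t x: v) { fprintf(stderr, " %zu", x); }
        fprintf(stderr, "\n");
    };
    dump("keep_loop", keep_loop);
    dump("keep_stride", keep_stride);
    dump("reduce_loop", reduce_loop);
    dump("reduce_stride", reduce_stride);
    return 0;
}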
diff --git a/eval/src/vespa/eval/instruction/generic_rename.cpp b/eval/src/vespa/eval/instruction/generic_rename.cpp
index cda0aa6e89a..1f8398555b4 100644
--- a/eval/src/vespa/eval/instruction/generic_rename.cpp
+++ b/eval/src/vespa/eval/instruction/generic_rename.cpp
@@ -53,13 +53,13 @@ struct RenameParam {
SparseRenamePlan sparse_plan;
DenseRenamePlan dense_plan;
const ValueBuilderFactory &factory;
- RenameParam(const ValueType &lhs_type, const ValueType &output_type,
+ RenameParam(const ValueType &lhs_type,
const std::vector<vespalib::string> &rename_dimension_from,
const std::vector<vespalib::string> &rename_dimension_to,
const ValueBuilderFactory &factory_in)
- : res_type(output_type),
- sparse_plan(lhs_type, output_type, rename_dimension_from, rename_dimension_to),
- dense_plan(lhs_type, output_type, rename_dimension_from, rename_dimension_to),
+ : res_type(lhs_type.rename(rename_dimension_from, rename_dimension_to)),
+ sparse_plan(lhs_type, res_type, rename_dimension_from, rename_dimension_to),
+ dense_plan(lhs_type, res_type, rename_dimension_from, rename_dimension_to),
factory(factory_in)
{
assert(!res_type.is_error());
@@ -181,15 +181,15 @@ DenseRenamePlan::DenseRenamePlan(const ValueType &lhs_type,
DenseRenamePlan::~DenseRenamePlan() = default;
InterpretedFunction::Instruction
-GenericRename::make_instruction(const ValueType &lhs_type, const ValueType &output_type,
+GenericRename::make_instruction(const ValueType &lhs_type,
const std::vector<vespalib::string> &rename_dimension_from,
const std::vector<vespalib::string> &rename_dimension_to,
const ValueBuilderFactory &factory, Stash &stash)
{
- auto &param = stash.create<RenameParam>(lhs_type, output_type,
+ auto &param = stash.create<RenameParam>(lhs_type,
rename_dimension_from, rename_dimension_to,
factory);
- auto fun = typify_invoke<1,TypifyCellType,SelectGenericRenameOp>(output_type.cell_type());
+ auto fun = typify_invoke<1,TypifyCellType,SelectGenericRenameOp>(param.res_type.cell_type());
return Instruction(fun, wrap_param<RenameParam>(param));
}
diff --git a/eval/src/vespa/eval/instruction/generic_rename.h b/eval/src/vespa/eval/instruction/generic_rename.h
index ca9f45bd341..6c94ff02b24 100644
--- a/eval/src/vespa/eval/instruction/generic_rename.h
+++ b/eval/src/vespa/eval/instruction/generic_rename.h
@@ -2,6 +2,7 @@
#pragma once
+#include <vespa/eval/eval/nested_loop.h>
#include <vespa/eval/eval/value_type.h>
#include <vespa/eval/eval/interpreted_function.h>
#include <vespa/vespalib/stllike/string.h>
@@ -20,37 +21,8 @@ struct DenseRenamePlan {
const std::vector<vespalib::string> &from,
const std::vector<vespalib::string> &to);
~DenseRenamePlan();
- template <typename F> void execute(size_t offset, F &&f) const {
- switch(loops_left(0)) {
- case 0: return execute_few<F, 0>(0, offset, std::forward<F>(f));
- case 1: return execute_few<F, 1>(0, offset, std::forward<F>(f));
- case 2: return execute_few<F, 2>(0, offset, std::forward<F>(f));
- case 3: return execute_few<F, 3>(0, offset, std::forward<F>(f));
- default: return execute_many<F>(0, offset, std::forward<F>(f));
- }
- }
-private:
- size_t loops_left(size_t idx) const { return (loop_cnt.size() - idx); }
-
- template <typename F, size_t N> void execute_few(size_t idx, size_t offset, F &&f) const {
- if constexpr (N == 0) {
- f(offset);
- } else {
- for (size_t i = 0; i < loop_cnt[idx]; ++i) {
- execute_few<F, N - 1>(idx + 1, offset, std::forward<F>(f));
- offset += stride[idx];
- }
- }
- }
- template <typename F> void execute_many(size_t idx, size_t offset, F &&f) const {
- for (size_t i = 0; i < loop_cnt[idx]; ++i) {
- if (loops_left(idx + 1) == 3) {
- execute_few<F, 3>(idx + 1, offset, std::forward<F>(f));
- } else {
- execute_many<F>(idx + 1, offset, std::forward<F>(f));
- }
- offset += stride[idx];
- }
+ template <typename F> void execute(size_t offset, const F &f) const {
+ run_nested_loop(offset, loop_cnt, stride, f);
}
};
@@ -68,7 +40,7 @@ struct SparseRenamePlan {
struct GenericRename {
static InterpretedFunction::Instruction
- make_instruction(const ValueType &lhs_type, const ValueType &output_type,
+ make_instruction(const ValueType &lhs_type,
const std::vector<vespalib::string> &rename_dimension_from,
const std::vector<vespalib::string> &rename_dimension_to,
const ValueBuilderFactory &factory, Stash &stash);