diff options
author | Håvard Pettersen <3535158+havardpe@users.noreply.github.com> | 2020-10-02 15:14:40 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-10-02 15:14:40 +0200 |
commit | 46f99f3f700f3cae1b60209a01645f259e5a3f74 (patch) | |
tree | e84e06c8d3ac29cc829511bbf3dc93576d4596df | |
parent | c67283d48af378f25cf1bd3ed8e578d0e529813f (diff) | |
parent | 8c8c529e6fea1d0e8869dfb74881ca15ca9a89a9 (diff) |
Merge pull request #14681 from vespa-engine/havardpe/generic-reduce
generic reduce
-rw-r--r-- | eval/CMakeLists.txt | 2 | ||||
-rw-r--r-- | eval/src/tests/eval/nested_loop/.gitignore | 1 | ||||
-rw-r--r-- | eval/src/tests/eval/nested_loop/CMakeLists.txt | 16 | ||||
-rw-r--r-- | eval/src/tests/eval/nested_loop/nested_loop_bench.cpp | 90 | ||||
-rw-r--r-- | eval/src/tests/eval/nested_loop/nested_loop_test.cpp | 88 | ||||
-rw-r--r-- | eval/src/tests/instruction/generic_reduce/CMakeLists.txt | 9 | ||||
-rw-r--r-- | eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp | 109 | ||||
-rw-r--r-- | eval/src/tests/instruction/generic_rename/generic_rename_test.cpp | 8 | ||||
-rw-r--r-- | eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp | 187 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/nested_loop.h | 143 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/CMakeLists.txt | 1 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_join.h | 31 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_reduce.cpp | 219 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_reduce.h | 54 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_rename.cpp | 14 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_rename.h | 36 |
16 files changed, 918 insertions, 90 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt index 1872ee1410a..b8cc77b8290 100644 --- a/eval/CMakeLists.txt +++ b/eval/CMakeLists.txt @@ -20,6 +20,7 @@ vespa_define_module( src/tests/eval/inline_operation src/tests/eval/interpreted_function src/tests/eval/multiply_add + src/tests/eval/nested_loop src/tests/eval/node_tools src/tests/eval/node_types src/tests/eval/param_usage @@ -33,6 +34,7 @@ vespa_define_module( src/tests/eval/value_type src/tests/gp/ponder_nov2017 src/tests/instruction/generic_join + src/tests/instruction/generic_reduce src/tests/instruction/generic_merge src/tests/instruction/generic_rename src/tests/tensor/default_value_builder_factory diff --git a/eval/src/tests/eval/nested_loop/.gitignore b/eval/src/tests/eval/nested_loop/.gitignore new file mode 100644 index 00000000000..a3c003c3a2d --- /dev/null +++ b/eval/src/tests/eval/nested_loop/.gitignore @@ -0,0 +1 @@ +/eval_nested_loop_bench_app diff --git a/eval/src/tests/eval/nested_loop/CMakeLists.txt b/eval/src/tests/eval/nested_loop/CMakeLists.txt new file mode 100644 index 00000000000..ccd751143b1 --- /dev/null +++ b/eval/src/tests/eval/nested_loop/CMakeLists.txt @@ -0,0 +1,16 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(eval_nested_loop_test_app TEST + SOURCES + nested_loop_test.cpp + DEPENDS + vespaeval + GTest::GTest +) +vespa_add_test(NAME eval_nested_loop_test_app COMMAND eval_nested_loop_test_app) +vespa_add_executable(eval_nested_loop_bench_app TEST + SOURCES + nested_loop_bench.cpp + DEPENDS + vespaeval + GTest::GTest +) diff --git a/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp b/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp new file mode 100644 index 00000000000..c6e2f278cd4 --- /dev/null +++ b/eval/src/tests/eval/nested_loop/nested_loop_bench.cpp @@ -0,0 +1,90 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. 
See LICENSE in the project root. + +#include <vespa/eval/eval/nested_loop.h> +#include <vespa/vespalib/util/benchmark_timer.h> +#include <vespa/vespalib/gtest/gtest.h> + +using vespalib::BenchmarkTimer; +using vespalib::eval::run_nested_loop; + +using LIST = std::vector<size_t>; +using call_t = void (*)(const LIST &loop, const LIST &stride); + +void perform_direct(const LIST &loop, const LIST &stride) { + size_t idx1 = 0; + size_t expect = 0; + for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) { + size_t idx2 = idx1; + for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) { + size_t idx3 = idx2; + for (size_t k = 0; k < loop[2]; ++k, idx3 += stride[2]) { + assert(idx3 == expect); + ++expect; + } + } + } + assert(expect == 4096); +} + +void perform_direct_lambda(const LIST &loop, const LIST &stride) { + size_t expect = 0; + auto fun = [&](size_t idx) { + (void) idx; + assert(idx == expect); + ++expect; + }; + size_t idx1 = 0; + for (size_t i = 0; i < loop[0]; ++i, idx1 += stride[0]) { + size_t idx2 = idx1; + for (size_t j = 0; j < loop[1]; ++j, idx2 += stride[1]) { + size_t idx3 = idx2; + for (size_t k = 0; k < loop[2]; ++k, idx3 += stride[2]) { + fun(idx3); + } + } + } + assert(expect == 4096); +} + +void perform_generic(const LIST &loop, const LIST &stride) { + size_t expect = 0; + auto fun = [&](size_t idx) { + (void) idx; + assert(idx == expect); + ++expect; + }; + run_nested_loop(0, loop, stride, fun); + assert(expect == 4096); +} + +void perform_generic_isolate_first(const LIST &loop, const LIST &stride) { + size_t expect = 0; + auto fun = [&](size_t idx) { + (void) idx; + assert(idx == expect); + ++expect; + }; + run_nested_loop(0, loop, stride, fun, fun); + assert(expect == 4096); +} + +void nop() {} + +double estimate_cost_us(call_t perform_fun) { + LIST loop({16,16,16}); + LIST stride({256,16,1}); + return BenchmarkTimer::benchmark([&](){ perform_fun(loop, stride); }, nop, 100000, 5.0) * 1000.0 * 1000.0; +} + 
+//----------------------------------------------------------------------------- + +TEST(NestedLoopBenchmark, single_loop) { + fprintf(stderr, "manual direct single loop: %g us\n", estimate_cost_us(perform_direct)); + fprintf(stderr, "manual call lambda single loop: %g us\n", estimate_cost_us(perform_direct_lambda)); + fprintf(stderr, "generic single loop: %g us\n", estimate_cost_us(perform_generic)); + fprintf(stderr, "generic single loop (isolate first): %g us\n", estimate_cost_us(perform_generic_isolate_first)); +} + +//----------------------------------------------------------------------------- + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/eval/src/tests/eval/nested_loop/nested_loop_test.cpp b/eval/src/tests/eval/nested_loop/nested_loop_test.cpp new file mode 100644 index 00000000000..a72c5ed9cca --- /dev/null +++ b/eval/src/tests/eval/nested_loop/nested_loop_test.cpp @@ -0,0 +1,88 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/eval/eval/nested_loop.h> +#include <vespa/vespalib/gtest/gtest.h> + +using namespace vespalib::eval; + +std::vector<size_t> run_single(size_t idx_in, const std::vector<size_t> &loop, const std::vector<size_t> &stride) { + std::vector<size_t> result; + auto capture = [&](size_t idx_out) { result.push_back(idx_out); }; + assert(loop.size() == stride.size()); + run_nested_loop(idx_in, loop, stride, capture); + return result; +} + +std::pair<std::vector<size_t>,std::vector<size_t>> run_single_isolated(size_t idx_in, const std::vector<size_t> &loop, const std::vector<size_t> &stride) { + std::vector<size_t> res1; + std::vector<size_t> res2; + auto capture1 = [&](size_t idx_out) { res1.push_back(idx_out); }; + auto capture2 = [&](size_t idx_out) { res2.push_back(idx_out); }; + assert(loop.size() == stride.size()); + run_nested_loop(idx_in, loop, stride, capture1, capture2); + return std::make_pair(res1, res2); +} + +std::vector<std::pair<size_t,size_t>> run_double(size_t idx1_in, size_t idx2_in, const std::vector<size_t> &loop, + const std::vector<size_t> &stride1, const std::vector<size_t> &stride2) +{ + std::vector<std::pair<size_t,size_t>> result; + auto capture = [&](size_t idx1_out, size_t idx2_out) { result.emplace_back(idx1_out, idx2_out); }; + assert(loop.size() == stride1.size()); + assert(loop.size() == stride2.size()); + run_nested_loop(idx1_in, idx2_in, loop, stride1, stride2, capture); + return result; +} + +void verify_isolated(size_t idx_in, const std::vector<size_t> &loop, const std::vector<size_t> &stride) { + auto full = run_single(idx_in, loop, stride); + auto actual = run_single_isolated(idx_in, loop, stride); + ASSERT_EQ(actual.first.size(), 1); + ASSERT_EQ(actual.second.size(), full.size() - 1); + EXPECT_EQ(actual.first[0], full[0]); + full.erase(full.begin()); + EXPECT_EQ(actual.second, full); +} + +void verify_double(size_t idx1_in, size_t idx2_in, const std::vector<size_t> &loop, + const std::vector<size_t> &stride1, const 
std::vector<size_t> &stride2) +{ + auto res1 = run_single(idx1_in, loop, stride1); + auto res2 = run_single(idx2_in, loop, stride2); + ASSERT_EQ(res1.size(), res2.size()); + std::vector<std::pair<size_t,size_t>> expect; + for (size_t i = 0; i < res1.size(); ++i) { + expect.emplace_back(res1[i], res2[i]); + } + auto actual = run_double(idx1_in, idx2_in, loop, stride1, stride2); + EXPECT_EQ(actual, expect); +} + +std::vector<size_t> v(std::vector<size_t> vec) { return vec; } + +TEST(NestedLoopTest, single_nested_loop_can_be_executed) { + EXPECT_EQ(v({123}), run_single(123, {}, {})); + EXPECT_EQ(v({10,11}), run_single(10, {2}, {1})); + EXPECT_EQ(v({100,110,101,111}), run_single(100, {2,2}, {1,10})); + EXPECT_EQ(v({100,110,100,110,101,111,101,111}), run_single(100, {2,2,2}, {1,0,10})); + EXPECT_EQ(v({100,101,102,103,104,105,106,107,108,109,110,111,112,113,114,115}), + run_single(100, {2,2,2,2}, {8,4,2,1})); +} + +TEST(NestedLoopTest, single_nested_loop_with_first_entry_isolated_can_be_executed) { + verify_isolated(10, {}, {}); + verify_isolated(10, {3}, {5}); + verify_isolated(10, {3,3}, {2,3}); + verify_isolated(10, {3,3,2}, {2,0,3}); + verify_isolated(10, {2,3,2,3}, {7,2,1,3}); +} + +TEST(NestedLoopTest, double_nested_loop_can_be_executed) { + verify_double(10, 20, {}, {}, {}); + verify_double(10, 20, {3}, {5}, {7}); + verify_double(10, 20, {3,3}, {2,3}, {7,5}); + verify_double(10, 20, {3,3,2}, {2,0,3}, {0,7,5}); + verify_double(10, 20, {2,3,2,3}, {7,2,1,3}, {3,7,5,1}); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/eval/src/tests/instruction/generic_reduce/CMakeLists.txt b/eval/src/tests/instruction/generic_reduce/CMakeLists.txt new file mode 100644 index 00000000000..f04664ae64c --- /dev/null +++ b/eval/src/tests/instruction/generic_reduce/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_executable(eval_generic_reduce_test_app TEST + SOURCES + generic_reduce_test.cpp + DEPENDS + vespaeval + GTest::GTest +) +vespa_add_test(NAME eval_generic_reduce_test_app COMMAND eval_generic_reduce_test_app) diff --git a/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp b/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp new file mode 100644 index 00000000000..7e9a059967a --- /dev/null +++ b/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp @@ -0,0 +1,109 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/eval/eval/simple_value.h> +#include <vespa/eval/eval/value_codec.h> +#include <vespa/eval/instruction/generic_reduce.h> +#include <vespa/eval/eval/interpreted_function.h> +#include <vespa/eval/eval/test/tensor_model.hpp> +#include <vespa/vespalib/util/stringfmt.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <optional> + +using namespace vespalib; +using namespace vespalib::eval; +using namespace vespalib::eval::instruction; +using namespace vespalib::eval::test; + +using vespalib::make_string_short::fmt; + +std::vector<Layout> layouts = { + {}, + {x(3)}, + {x(3),y(5)}, + {x(3),y(5),z(7)}, + float_cells({x(3),y(5),z(7)}), + {x({"a","b","c"})}, + {x({"a","b","c"}),y({"foo","bar"})}, + {x({"a","b","c"}),y({"foo","bar"}),z({"i","j","k","l"})}, + float_cells({x({"a","b","c"}),y({"foo","bar"}),z({"i","j","k","l"})}), + {x(3),y({"foo", "bar"}),z(7)}, + {x({"a","b","c"}),y(5),z({"i","j","k","l"})}, + float_cells({x({"a","b","c"}),y(5),z({"i","j","k","l"})}) +}; + +TensorSpec reference_reduce(const TensorSpec &a, const std::vector<vespalib::string> &dims, Aggr aggr) { + Stash stash; + ValueType res_type = ValueType::from_spec(a.type()).reduce(dims); + EXPECT_FALSE(res_type.is_error()); + std::map<TensorSpec::Address,std::optional<Aggregator*>> my_map; + for (const auto &cell: a.cells()) { + TensorSpec::Address addr; + for 
(const auto &dim: cell.first) { + if (res_type.dimension_index(dim.first) != ValueType::Dimension::npos) { + addr.insert_or_assign(dim.first, dim.second); + } + } + auto [pos, is_empty] = my_map.emplace(addr, std::nullopt); + if (is_empty) { + pos->second = &Aggregator::create(aggr, stash); + pos->second.value()->first(cell.second); + } else { + pos->second.value()->next(cell.second); + } + } + TensorSpec result(res_type.to_spec()); + for (const auto &my_entry: my_map) { + result.add(my_entry.first, my_entry.second.value()->result()); + } + return result; +} + +TensorSpec perform_generic_reduce(const TensorSpec &a, const std::vector<vespalib::string> &dims, Aggr aggr) { + Stash stash; + const auto &factory = SimpleValueBuilderFactory::get(); + auto lhs = value_from_spec(a, factory); + auto my_op = GenericReduce::make_instruction(lhs->type(), aggr, dims, factory, stash); + InterpretedFunction::EvalSingle single(my_op); + return spec_from_value(single.eval(std::vector<Value::CREF>({*lhs}))); +} + +TEST(GenericReduceTest, dense_reduce_plan_can_be_created) { + auto type = ValueType::from_spec("tensor(a[2],aa{},b[2],bb[1],c[2],cc{},d[2],dd[1],e[2],ee{},f[2])"); + auto plan = DenseReducePlan(type, type.reduce({"a", "d", "e"})); + std::vector<size_t> expect_keep_loop = {4,2}; + std::vector<size_t> expect_keep_stride = {8,1}; + std::vector<size_t> expect_reduce_loop = {2,4}; + std::vector<size_t> expect_reduce_stride = {32,2}; + EXPECT_EQ(plan.in_size, 64); + EXPECT_EQ(plan.out_size, 8); + EXPECT_EQ(plan.keep_loop, expect_keep_loop); + EXPECT_EQ(plan.keep_stride, expect_keep_stride); + EXPECT_EQ(plan.reduce_loop, expect_reduce_loop); + EXPECT_EQ(plan.reduce_stride, expect_reduce_stride); +} + +TEST(GenericReduceTest, sparse_reduce_plan_can_be_created) { + auto type = ValueType::from_spec("tensor(a{},aa[10],b{},c{},cc[5],d{},e{},ee[1],f{})"); + auto plan = SparseReducePlan(type, type.reduce({"a", "d", "e"})); + std::vector<size_t> expect_keep_dims = {1,2,5}; + 
EXPECT_EQ(plan.num_reduce_dims, 3); + EXPECT_EQ(plan.keep_dims, expect_keep_dims); +} + +TEST(GenericReduceTest, generic_reduce_works_for_simple_values) { + for (const Layout &layout: layouts) { + TensorSpec input = spec(layout, Div16(N())); + for (Aggr aggr: {Aggr::SUM, Aggr::AVG, Aggr::MIN, Aggr::MAX}) { + for (const Domain &domain: layout) { + auto expect = reference_reduce(input, {domain.dimension}, aggr); + auto actual = perform_generic_reduce(input, {domain.dimension}, aggr); + EXPECT_EQ(actual, expect); + } + auto expect = reference_reduce(input, {}, aggr); + auto actual = perform_generic_reduce(input, {}, aggr); + EXPECT_EQ(actual, expect); + } + } +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/eval/src/tests/instruction/generic_rename/generic_rename_test.cpp b/eval/src/tests/instruction/generic_rename/generic_rename_test.cpp index f61899e4dda..d5c7f58fc1f 100644 --- a/eval/src/tests/instruction/generic_rename/generic_rename_test.cpp +++ b/eval/src/tests/instruction/generic_rename/generic_rename_test.cpp @@ -111,12 +111,12 @@ TensorSpec reference_rename(const TensorSpec &a, const FromTo &ft) { return result; } -TensorSpec perform_generic_rename(const TensorSpec &a, const ValueType &res_type, - const FromTo &ft, const ValueBuilderFactory &factory) +TensorSpec perform_generic_rename(const TensorSpec &a, + const FromTo &ft, const ValueBuilderFactory &factory) { Stash stash; auto lhs = value_from_spec(a, factory); - auto my_op = GenericRename::make_instruction(lhs->type(), res_type, ft.from, ft.to, factory, stash); + auto my_op = GenericRename::make_instruction(lhs->type(), ft.from, ft.to, factory, stash); InterpretedFunction::EvalSingle single(my_op); return spec_from_value(single.eval(std::vector<Value::CREF>({*lhs}))); } @@ -132,7 +132,7 @@ void test_generic_rename(const ValueBuilderFactory &factory) { // printf("type %s -> %s\n", lhs_type.to_spec().c_str(), renamed_type.to_spec().c_str()); SCOPED_TRACE(fmt("\n===\nLHS: %s\n===\n", lhs.to_string().c_str())); 
auto expect = reference_rename(lhs, from_to); - auto actual = perform_generic_rename(lhs, renamed_type, from_to, factory); + auto actual = perform_generic_rename(lhs, from_to, factory); EXPECT_EQ(actual, expect); } } diff --git a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp index 4771034902b..2e126ea941f 100644 --- a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp +++ b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp @@ -14,15 +14,17 @@ // improvement in generic dense join and possibly also in sparse join // with partial dimensional overlap. Benchmarks are done using float // cells since this is what gives best overall performance in -// production. Also, we use the multiply operation since it is the -// most optimized operations across all implementations. When -// benchmarking different implementations against each other, a smoke -// test is performed by verifying that all implementations produce the -// same result. +// production. Also, we use the multiply operation for join and sum +// operation for reduce since those are the most optimized operations +// across all implementations. When benchmarking different +// implementations against each other, a smoke test is performed by +// verifying that all implementations produce the same result. 
#include <vespa/eval/eval/simple_value.h> #include <vespa/eval/eval/interpreted_function.h> #include <vespa/eval/instruction/generic_join.h> +#include <vespa/eval/instruction/generic_reduce.h> +#include <vespa/eval/instruction/generic_rename.h> #include <vespa/eval/eval/simple_tensor_engine.h> #include <vespa/eval/eval/tensor_spec.h> #include <vespa/eval/eval/value_codec.h> @@ -60,6 +62,8 @@ struct Impl { virtual Value::UP create_value(const TensorSpec &spec) const = 0; virtual TensorSpec create_spec(const Value &value) const = 0; virtual Instruction create_join(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const = 0; + virtual Instruction create_reduce(const ValueType &lhs, Aggr aggr, const std::vector<vespalib::string> &dims, Stash &stash) const = 0; + virtual Instruction create_rename(const ValueType &lhs, const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to, Stash &stash) const = 0; virtual const TensorEngine &engine() const { return SimpleTensorEngine::ref(); } // engine used by EvalSingle virtual ~Impl() {} }; @@ -73,6 +77,12 @@ struct ValueImpl : Impl { Instruction create_join(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const override { return GenericJoin::make_instruction(lhs, rhs, function, my_factory, stash); } + Instruction create_reduce(const ValueType &lhs, Aggr aggr, const std::vector<vespalib::string> &dims, Stash &stash) const override { + return GenericReduce::make_instruction(lhs, aggr, dims, my_factory, stash); + } + Instruction create_rename(const ValueType &lhs, const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to, Stash &stash) const override { + return GenericRename::make_instruction(lhs, from, to, my_factory, stash); + } }; struct EngineImpl : Impl { @@ -82,12 +92,24 @@ struct EngineImpl : Impl { Value::UP create_value(const TensorSpec &spec) const override { return my_engine.from_spec(spec); } 
TensorSpec create_spec(const Value &value) const override { return my_engine.to_spec(value); } Instruction create_join(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const override { - // create a complete tensor function joining two parameters, but only compile the join instruction itself + // create a complete tensor function, but only compile the relevant instruction const auto &lhs_node = tensor_function::inject(lhs, 0, stash); const auto &rhs_node = tensor_function::inject(rhs, 1, stash); const auto &join_node = tensor_function::join(lhs_node, rhs_node, function, stash); return join_node.compile_self(my_engine, stash); } + Instruction create_reduce(const ValueType &lhs, Aggr aggr, const std::vector<vespalib::string> &dims, Stash &stash) const override { + // create a complete tensor function, but only compile the relevant instruction + const auto &lhs_node = tensor_function::inject(lhs, 0, stash); + const auto &reduce_node = tensor_function::reduce(lhs_node, aggr, dims, stash); + return reduce_node.compile_self(my_engine, stash); + } + Instruction create_rename(const ValueType &lhs, const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to, Stash &stash) const override { + // create a complete tensor function, but only compile the relevant instruction + const auto &lhs_node = tensor_function::inject(lhs, 0, stash); + const auto &rename_node = tensor_function::rename(lhs_node, from, to, stash); + return rename_node.compile_self(my_engine, stash); + } const TensorEngine &engine() const override { return my_engine; } }; @@ -100,7 +122,11 @@ ValueImpl packed_mixed_tensor_impl(2, " PackedMixedTensor", " Packed", Pack ValueImpl default_tensor_value_impl(0, " DefaultValue", "NEW PROD", DefaultValueBuilderFactory::get()); vespalib::string short_header("--------"); -double budget = 5.0; +constexpr double budget = 5.0; +constexpr double best_limit = 0.95; // everything within 95% of best performance gets a 
star +constexpr double bad_limit = 0.90; // BAD: new prod has performance lower than 90% of old prod +constexpr double good_limit = 1.10; // GOOD: new prod has performance higher than 110% of old prod + std::vector<CREF<Impl>> impl_list = {simple_tensor_engine_impl, default_tensor_engine_impl, simple_value_impl, @@ -117,25 +143,25 @@ struct BenchmarkHeader { short_names[impl.order] = impl.short_name; } } + void print_header(const vespalib::string &desc) const { + for (const auto &name: short_names) { + fprintf(stderr, "|%s", name.c_str()); + } + fprintf(stderr, "| %s Benchmark cases\n", desc.c_str()); + } void print_trailer() const { for (size_t i = 0; i < short_names.size(); ++i) { fprintf(stderr, "+%s", short_header.c_str()); } fprintf(stderr, "+------------------------------------------------\n"); } - void print() const { - for (const auto &name: short_names) { - fprintf(stderr, "|%s", name.c_str()); - } - fprintf(stderr, "| Benchmark description\n"); - print_trailer(); - } }; struct BenchmarkResult { vespalib::string desc; std::optional<double> ref_time; std::vector<double> relative_perf; + double star_rating; BenchmarkResult(const vespalib::string &desc_in, size_t num_values) : desc(desc_in), ref_time(std::nullopt), relative_perf(num_values, 0.0) {} ~BenchmarkResult(); @@ -150,13 +176,20 @@ struct BenchmarkResult { } } void normalize() { + star_rating = 0.0; for (double &perf: relative_perf) { perf = ref_time.value() / perf; + star_rating = std::max(star_rating, perf); } + star_rating *= best_limit; } void print() const { for (double perf: relative_perf) { - fprintf(stderr, "|%8.2f", perf); + if (perf > star_rating) { + fprintf(stderr, "|*%7.2f", perf); + } else { + fprintf(stderr, "| %7.2f", perf); + } } fprintf(stderr, "| %s\n", desc.c_str()); } @@ -239,6 +272,45 @@ void benchmark_join(const vespalib::string &desc, const TensorSpec &lhs, //----------------------------------------------------------------------------- +void benchmark_reduce(const 
vespalib::string &desc, const TensorSpec &lhs, + Aggr aggr, const std::vector<vespalib::string> &dims) +{ + Stash stash; + ValueType lhs_type = ValueType::from_spec(lhs.type()); + ValueType res_type = lhs_type.reduce(dims); + ASSERT_FALSE(lhs_type.is_error()); + ASSERT_FALSE(res_type.is_error()); + std::vector<EvalOp::UP> list; + for (const Impl &impl: impl_list) { + auto op = impl.create_reduce(lhs_type, aggr, dims, stash); + std::vector<CREF<TensorSpec>> stack_spec({lhs}); + list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl)); + } + benchmark(desc, list); +} + +//----------------------------------------------------------------------------- + +void benchmark_rename(const vespalib::string &desc, const TensorSpec &lhs, + const std::vector<vespalib::string> &from, + const std::vector<vespalib::string> &to) +{ + Stash stash; + ValueType lhs_type = ValueType::from_spec(lhs.type()); + ValueType res_type = lhs_type.rename(from, to); + ASSERT_FALSE(lhs_type.is_error()); + ASSERT_FALSE(res_type.is_error()); + std::vector<EvalOp::UP> list; + for (const Impl &impl: impl_list) { + auto op = impl.create_rename(lhs_type, from, to, stash); + std::vector<CREF<TensorSpec>> stack_spec({lhs}); + list.push_back(std::make_unique<EvalOp>(op, stack_spec, impl)); + } + benchmark(desc, list); +} + +//----------------------------------------------------------------------------- + struct D { vespalib::string name; bool mapped; @@ -281,6 +353,7 @@ template <typename ...Ds> TensorSpec make_spec(double seq, const Ds &...ds) { } TensorSpec make_vector(const D &d1, double seq) { return make_spec(seq, d1); } +TensorSpec make_matrix(const D &d1, const D &d2, double seq) { return make_spec(seq, d1, d2); } TensorSpec make_cube(const D &d1, const D &d2, const D &d3, double seq) { return make_spec(seq, d1, d2, d3); } //----------------------------------------------------------------------------- @@ -392,15 +465,91 @@ TEST(MixedJoin, no_overlap) { 
//----------------------------------------------------------------------------- -TEST(PrintResults, print_results) { +TEST(ReduceBench, dense_reduce) { + auto lhs = make_cube(D::idx("a", 16), D::idx("b", 16), D::idx("c", 16), 1.0); + benchmark_reduce("dense reduce inner", lhs, Aggr::SUM, {"c"}); + benchmark_reduce("dense reduce middle", lhs, Aggr::SUM, {"b"}); + benchmark_reduce("dense reduce outer", lhs, Aggr::SUM, {"a"}); + benchmark_reduce("dense multi-reduce inner", lhs, Aggr::SUM, {"b", "c"}); + benchmark_reduce("dense multi-reduce outer", lhs, Aggr::SUM, {"a", "b"}); + benchmark_reduce("dense multi-reduce outer-inner", lhs, Aggr::SUM, {"a", "c"}); + benchmark_reduce("dense reduce all", lhs, Aggr::SUM, {}); +} + +TEST(ReduceBench, sparse_reduce) { + auto lhs = make_cube(D::map("a", 16, 1), D::map("b", 16, 1), D::map("c", 16, 1), 1.0); + benchmark_reduce("sparse reduce inner", lhs, Aggr::SUM, {"c"}); + benchmark_reduce("sparse reduce middle", lhs, Aggr::SUM, {"b"}); + benchmark_reduce("sparse reduce outer", lhs, Aggr::SUM, {"a"}); + benchmark_reduce("sparse multi-reduce inner", lhs, Aggr::SUM, {"b", "c"}); + benchmark_reduce("sparse multi-reduce outer", lhs, Aggr::SUM, {"a", "b"}); + benchmark_reduce("sparse multi-reduce outer-inner", lhs, Aggr::SUM, {"a", "c"}); + benchmark_reduce("sparse reduce all", lhs, Aggr::SUM, {}); +} + +TEST(ReduceBench, mixed_reduce) { + auto lhs = make_spec(1.0, D::map("a", 4, 1), D::map("b", 4, 1), D::map("c", 4, 1), + D::idx("d", 4), D::idx("e", 4), D::idx("f", 4)); + benchmark_reduce("mixed reduce middle dense", lhs, Aggr::SUM, {"e"}); + benchmark_reduce("mixed reduce middle sparse", lhs, Aggr::SUM, {"b"}); + benchmark_reduce("mixed reduce middle sparse/dense", lhs, Aggr::SUM, {"b", "e"}); + benchmark_reduce("mixed reduce all dense", lhs, Aggr::SUM, {"d", "e", "f"}); + benchmark_reduce("mixed reduce all sparse", lhs, Aggr::SUM, {"a", "b", "c"}); + benchmark_reduce("mixed reduce all", lhs, Aggr::SUM, {}); +} + 
+//----------------------------------------------------------------------------- + +TEST(RenameBench, dense_rename) { + auto lhs = make_matrix(D::idx("a", 16), D::idx("b", 16), 1.0); + benchmark_rename("dense transpose", lhs, {"a", "b"}, {"b", "a"}); +} + +TEST(RenameBench, sparse_rename) { + auto lhs = make_matrix(D::map("a", 16, 1), D::map("b", 16, 1), 1.0); + benchmark_rename("sparse transpose", lhs, {"a", "b"}, {"b", "a"}); +} + +TEST(RenameBench, mixed_rename) { + auto lhs = make_spec(1.0, D::map("a", 8, 1), D::map("b", 8, 1), D::idx("c", 8), D::idx("d", 8)); + benchmark_rename("mixed multi-transpose", lhs, {"a", "b", "c", "d"}, {"b", "a", "d", "c"}); +} + +//----------------------------------------------------------------------------- + +void print_results(const vespalib::string &desc, const std::vector<BenchmarkResult> &results) { + if (results.empty()) { + return; + } BenchmarkHeader header; + header.print_trailer(); + header.print_header(desc); + header.print_trailer(); + for (const auto &result: results) { + result.print(); + } + header.print_trailer(); +} + +TEST(PrintResults, print_results) { + std::vector<BenchmarkResult> bad_results; + std::vector<BenchmarkResult> neutral_results; + std::vector<BenchmarkResult> good_results; std::sort(benchmark_results.begin(), benchmark_results.end(), [](const auto &a, const auto &b){ return (a.relative_perf[0] < b.relative_perf[0]); }); - header.print(); for (const auto &result: benchmark_results) { - result.print(); + double perf = result.relative_perf[0]; + if (perf < bad_limit) { + bad_results.push_back(result); + } else if (perf > good_limit) { + good_results.push_back(result); + } else { + neutral_results.push_back(result); + } } - header.print_trailer(); + print_results("BAD", bad_results); + print_results("NEUTRAL", neutral_results); + print_results("GOOD", good_results); } GTEST_MAIN_RUN_ALL_TESTS() diff --git a/eval/src/vespa/eval/eval/nested_loop.h b/eval/src/vespa/eval/eval/nested_loop.h new file mode 
100644 index 00000000000..2fe98fb1e9b --- /dev/null +++ b/eval/src/vespa/eval/eval/nested_loop.h @@ -0,0 +1,143 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vector> + +namespace vespalib::eval { + +// This file contains implementations for the generic nested loops +// used by DenseJoinPlan, DenseReducePlan and similar. The loops act +// like arbitrarily nested for-loops that are index-based where each +// loop-level has a different stride that modifies the overall +// index. An initial index is passed to the top-level function, which +// is then modified by each loop-layer and finally passed back to a +// callable for each iteration of the inner loop. There are different +// implementations for traversing different numbers of index spaces in +// parallel. Note that all loop layers must have at least 1 iteration. + +namespace nested_loop { + +//----------------------------------------------------------------------------- + +template <typename F, size_t N> void execute_few(size_t idx, const size_t *loop, const size_t *stride, const F &f) { + if constexpr (N == 0) { + f(idx); + } else { + for (size_t i = 0; i < *loop; ++i, idx += *stride) { + execute_few<F, N - 1>(idx, loop + 1, stride + 1, f); + } + } +} + +template <typename F> void execute_many(size_t idx, const size_t *loop, const size_t *stride, size_t levels, const F &f) { + for (size_t i = 0; i < *loop; ++i, idx += *stride) { + if ((levels - 1) == 3) { + execute_few<F, 3>(idx, loop + 1, stride + 1, f); + } else { + execute_many<F>(idx, loop + 1, stride + 1, levels - 1, f); + } + } +} + +//----------------------------------------------------------------------------- + +template <typename FIRST, typename NEXT, size_t N> +void execute_few(size_t idx, const size_t *loop, const size_t *stride, const FIRST &first, const NEXT &next) { + if constexpr (N == 0) { + first(idx); + } else { + execute_few<FIRST, NEXT, N - 
1>(idx, loop + 1, stride + 1, first, next); + idx += *stride; + for (size_t i = 1; i < *loop; ++i, idx += *stride) { + execute_few<NEXT, N - 1>(idx, loop + 1, stride + 1, next); + } + } +} + +template <typename FIRST, typename NEXT> +void execute_many(size_t idx, const size_t *loop, const size_t *stride, size_t levels, const FIRST &first, const NEXT &next) { + if ((levels - 1) == 3) { + execute_few<FIRST, NEXT, 3>(idx, loop + 1, stride + 1, first, next); + } else { + execute_many<FIRST, NEXT>(idx, loop + 1, stride + 1, levels - 1, first, next); + } + idx += *stride; + for (size_t i = 1; i < *loop; ++i, idx += *stride) { + if ((levels - 1) == 3) { + execute_few<NEXT, 3>(idx, loop + 1, stride + 1, next); + } else { + execute_many<NEXT>(idx, loop + 1, stride + 1, levels - 1, next); + } + } +} + +//----------------------------------------------------------------------------- + +template <typename F, size_t N> void execute_few(size_t idx1, size_t idx2, const size_t *loop, const size_t *stride1, const size_t *stride2, const F &f) { + if constexpr (N == 0) { + f(idx1, idx2); + } else { + for (size_t i = 0; i < *loop; ++i, idx1 += *stride1, idx2 += *stride2) { + execute_few<F, N - 1>(idx1, idx2, loop + 1, stride1 + 1, stride2 + 1, f); + } + } +} + +template <typename F> void execute_many(size_t idx1, size_t idx2, const size_t *loop, const size_t *stride1, const size_t *stride2, size_t levels, const F &f) { + for (size_t i = 0; i < *loop; ++i, idx1 += *stride1, idx2 += *stride2) { + if ((levels - 1) == 3) { + execute_few<F, 3>(idx1, idx2, loop + 1, stride1 + 1, stride2 + 1, f); + } else { + execute_many<F>(idx1, idx2, loop + 1, stride1 + 1, stride2 + 1, levels - 1, f); + } + } +} + +//----------------------------------------------------------------------------- + +} // implementation details + +// Run a nested loop and pass indexes to 'f' +template <typename F> +void run_nested_loop(size_t idx, const std::vector<size_t> &loop, const std::vector<size_t> &stride, const F &f) 
{ + size_t levels = loop.size(); + switch(levels) { + case 0: return f(idx); + case 1: return nested_loop::execute_few<F, 1>(idx, &loop[0], &stride[0], f); + case 2: return nested_loop::execute_few<F, 2>(idx, &loop[0], &stride[0], f); + case 3: return nested_loop::execute_few<F, 3>(idx, &loop[0], &stride[0], f); + default: return nested_loop::execute_many<F>(idx, &loop[0], &stride[0], levels, f); + } +} + +// Run a nested loop and pass the first index to 'first' and all +// subsequent indexes to 'next' +template <typename FIRST, typename NEXT> +void run_nested_loop(size_t idx, const std::vector<size_t> &loop, const std::vector<size_t> &stride, const FIRST &first, const NEXT &next) { + size_t levels = loop.size(); + switch(levels) { + case 0: return first(idx); + case 1: return nested_loop::execute_few<FIRST, NEXT, 1>(idx, &loop[0], &stride[0], first, next); + case 2: return nested_loop::execute_few<FIRST, NEXT, 2>(idx, &loop[0], &stride[0], first, next); + case 3: return nested_loop::execute_few<FIRST, NEXT, 3>(idx, &loop[0], &stride[0], first, next); + default: return nested_loop::execute_many<FIRST, NEXT>(idx, &loop[0], &stride[0], levels, first, next); + } +} + +// Run two nested loops in parallel and pass both indexes to 'f'. Note +// that 'loop' is shared, which means that only individual strides may +// differ between the two loops. 
+template <typename F> +void run_nested_loop(size_t idx1, size_t idx2, const std::vector<size_t> &loop, const std::vector<size_t> &stride1, const std::vector<size_t> &stride2, const F &f) { + size_t levels = loop.size(); + switch(levels) { + case 0: return f(idx1, idx2); + case 1: return nested_loop::execute_few<F, 1>(idx1, idx2, &loop[0], &stride1[0], &stride2[0], f); + case 2: return nested_loop::execute_few<F, 2>(idx1, idx2, &loop[0], &stride1[0], &stride2[0], f); + case 3: return nested_loop::execute_few<F, 3>(idx1, idx2, &loop[0], &stride1[0], &stride2[0], f); + default: return nested_loop::execute_many<F>(idx1, idx2, &loop[0], &stride1[0], &stride2[0], levels, f); + } +} + +} // namespace diff --git a/eval/src/vespa/eval/instruction/CMakeLists.txt b/eval/src/vespa/eval/instruction/CMakeLists.txt index 71d08f601dd..91ff4fd63dc 100644 --- a/eval/src/vespa/eval/instruction/CMakeLists.txt +++ b/eval/src/vespa/eval/instruction/CMakeLists.txt @@ -3,6 +3,7 @@ vespa_add_library(eval_instruction OBJECT SOURCES generic_join + generic_reduce generic_merge generic_rename ) diff --git a/eval/src/vespa/eval/instruction/generic_join.h b/eval/src/vespa/eval/instruction/generic_join.h index 25647452dff..0f121104d5e 100644 --- a/eval/src/vespa/eval/instruction/generic_join.h +++ b/eval/src/vespa/eval/instruction/generic_join.h @@ -2,6 +2,7 @@ #pragma once +#include <vespa/eval/eval/nested_loop.h> #include <vespa/eval/eval/value_type.h> #include <vespa/eval/eval/interpreted_function.h> @@ -37,34 +38,8 @@ struct DenseJoinPlan { std::vector<size_t> rhs_stride; DenseJoinPlan(const ValueType &lhs_type, const ValueType &rhs_type); ~DenseJoinPlan(); - template <typename F> void execute(size_t lhs, size_t rhs, F &&f) const { - switch(loops_left(0)) { - case 0: return execute_few<F, 0>(0, lhs, rhs, std::forward<F>(f)); - case 1: return execute_few<F, 1>(0, lhs, rhs, std::forward<F>(f)); - case 2: return execute_few<F, 2>(0, lhs, rhs, std::forward<F>(f)); - case 3: return execute_few<F, 
3>(0, lhs, rhs, std::forward<F>(f)); - default: return execute_many<F>(0, lhs, rhs, std::forward<F>(f)); - } - } -private: - size_t loops_left(size_t idx) const { return (loop_cnt.size() - idx); } - template <typename F, size_t N> void execute_few(size_t idx, size_t lhs, size_t rhs, F &&f) const { - if constexpr (N == 0) { - f(lhs, rhs); - } else { - for (size_t i = 0; i < loop_cnt[idx]; ++i, lhs += lhs_stride[idx], rhs += rhs_stride[idx]) { - execute_few<F, N - 1>(idx + 1, lhs, rhs, std::forward<F>(f)); - } - } - } - template <typename F> void execute_many(size_t idx, size_t lhs, size_t rhs, F &&f) const { - for (size_t i = 0; i < loop_cnt[idx]; ++i, lhs += lhs_stride[idx], rhs += rhs_stride[idx]) { - if (loops_left(idx + 1) == 3) { - execute_few<F, 3>(idx + 1, lhs, rhs, std::forward<F>(f)); - } else { - execute_many<F>(idx + 1, lhs, rhs, std::forward<F>(f)); - } - } + template <typename F> void execute(size_t lhs, size_t rhs, const F &f) const { + run_nested_loop(lhs, rhs, loop_cnt, lhs_stride, rhs_stride, f); } }; diff --git a/eval/src/vespa/eval/instruction/generic_reduce.cpp b/eval/src/vespa/eval/instruction/generic_reduce.cpp new file mode 100644 index 00000000000..d294b478210 --- /dev/null +++ b/eval/src/vespa/eval/instruction/generic_reduce.cpp @@ -0,0 +1,219 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "generic_reduce.h" +#include <vespa/eval/eval/value.h> +#include <vespa/vespalib/util/stash.h> +#include <vespa/vespalib/util/typify.h> +#include <cassert> + +namespace vespalib::eval::instruction { + +using State = InterpretedFunction::State; +using Instruction = InterpretedFunction::Instruction; + +namespace { + +//----------------------------------------------------------------------------- + +template <typename T, typename IN> uint64_t wrap_param(const IN &value_in) { + const T &value = value_in; + static_assert(sizeof(uint64_t) == sizeof(&value)); + return (uint64_t)&value; +} + +template <typename T> const T &unwrap_param(uint64_t param) { + return *((const T *)param); +} + +//----------------------------------------------------------------------------- + +struct ReduceParam { + ValueType res_type; + SparseReducePlan sparse_plan; + DenseReducePlan dense_plan; + const ValueBuilderFactory &factory; + ReduceParam(const ValueType &type, const std::vector<vespalib::string> &dimensions, + const ValueBuilderFactory &factory_in) + : res_type(type.reduce(dimensions)), + sparse_plan(type, res_type), + dense_plan(type, res_type), + factory(factory_in) + { + assert(!res_type.is_error()); + assert(dense_plan.in_size == type.dense_subspace_size()); + assert(dense_plan.out_size == res_type.dense_subspace_size()); + } + ~ReduceParam(); +}; +ReduceParam::~ReduceParam() = default; + +//----------------------------------------------------------------------------- + +struct SparseReduceState { + + // TODO(havardpe): Optimize using vespalib::hash_map with + // reference to slice of external vector of vespalib::stringref as + // key. Track matching subspaces as linked lists contained in a + // single shared vector. 
+ + using MapKey = std::vector<vespalib::string>; + using MapValue = std::vector<size_t>; + using Map = std::map<MapKey,MapValue>; + + Map map; + std::vector<vespalib::stringref> full_address; + std::vector<vespalib::stringref*> fetch_address; + std::vector<vespalib::stringref*> keep_address; + size_t subspace; + + SparseReduceState(const SparseReducePlan &plan) + : full_address(plan.keep_dims.size() + plan.num_reduce_dims), + fetch_address(full_address.size(), nullptr), + keep_address(plan.keep_dims.size(), nullptr) + { + for (size_t i = 0; i < keep_address.size(); ++i) { + keep_address[i] = &full_address[plan.keep_dims[i]]; + } + for (size_t i = 0; i < full_address.size(); ++i) { + fetch_address[i] = &full_address[i]; + } + } + ~SparseReduceState(); + void populate_map(std::unique_ptr<Value::Index::View> full_view) { + full_view->lookup({}); + while (full_view->next_result(fetch_address, subspace)) { + std::vector<vespalib::string> key; + key.reserve(keep_address.size()); + for (const vespalib::stringref *label: keep_address) { + key.emplace_back(*label); + } + map[key].push_back(subspace); + } + } + template <typename T> + std::vector<vespalib::stringref> make_addr(const T &map_entry) { + std::vector<vespalib::stringref> addr; + addr.reserve(map_entry.first.size()); + for (const vespalib::string &label: map_entry.first) { + addr.emplace_back(label); + } + return addr; + } +}; +SparseReduceState::~SparseReduceState() = default; + +template <typename ICT, typename OCT, typename AGGR> +void my_generic_reduce_op(State &state, uint64_t param_in) { + const auto ¶m = unwrap_param<ReduceParam>(param_in); + SparseReduceState sparse(param.sparse_plan); + const Value &value = state.peek(0); + sparse.populate_map(value.index().create_view({})); + auto cells = value.cells().typify<ICT>(); + AGGR aggr; + auto first = [&](size_t idx) { aggr.first(cells[idx]); }; + auto next = [&](size_t idx) { aggr.next(cells[idx]); }; + auto builder = 
param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.keep_dims.size(), param.dense_plan.out_size, sparse.map.size()); + for (const auto &entry: sparse.map) { + OCT *dst = builder->add_subspace(sparse.make_addr(entry)).begin(); + auto reduce_cells = [&](size_t rel_idx) + { + auto pos = entry.second.begin(); + param.dense_plan.execute_reduce((*pos * param.dense_plan.in_size) + rel_idx, first, next); + for (++pos; pos != entry.second.end(); ++pos) { + param.dense_plan.execute_reduce((*pos * param.dense_plan.in_size) + rel_idx, next); + } + *dst++ = aggr.result(); + }; + param.dense_plan.execute_keep(reduce_cells); + } + auto &result = state.stash.create<std::unique_ptr<Value>>(builder->build(std::move(builder))); + const Value &result_ref = *(result.get()); + state.pop_push(result_ref); +}; + +struct SelectGenericReduceOp { + template <typename ICT, typename OCT, typename AGGR> static auto invoke() { + return my_generic_reduce_op<ICT, OCT, typename AGGR::template templ<ICT>>; + } +}; + +//----------------------------------------------------------------------------- + +} // namespace <unnamed> + +//----------------------------------------------------------------------------- + +DenseReducePlan::DenseReducePlan(const ValueType &type, const ValueType &res_type) + : in_size(1), + out_size(1), + keep_loop(), + keep_stride(), + reduce_loop(), + reduce_stride() +{ + std::vector<bool> keep; + std::vector<size_t> size; + for (const auto &dim: type.nontrivial_indexed_dimensions()) { + keep.push_back(res_type.dimension_index(dim.name) != ValueType::Dimension::npos); + size.push_back(dim.size); + } + std::vector<size_t> stride(size.size(), 0); + for (size_t i = stride.size(); i-- > 0; ) { + stride[i] = in_size; + in_size *= size[i]; + if (keep[i]) { + out_size *= size[i]; + } + } + int prev_case = 2; + for (size_t i = 0; i < size.size(); ++i) { + int my_case = keep[i] ? 1 : 0; + auto &my_loop = keep[i] ? keep_loop : reduce_loop; + auto &my_stride = keep[i] ? 
keep_stride : reduce_stride; + if (my_case == prev_case) { + assert(!my_loop.empty()); + my_loop.back() *= size[i]; + my_stride.back() = stride[i]; + } else { + my_loop.push_back(size[i]); + my_stride.push_back(stride[i]); + } + prev_case = my_case; + } +} + +DenseReducePlan::~DenseReducePlan() = default; + +//----------------------------------------------------------------------------- + +SparseReducePlan::SparseReducePlan(const ValueType &type, const ValueType &res_type) + : num_reduce_dims(0), + keep_dims() +{ + auto dims = type.mapped_dimensions(); + for (size_t i = 0; i < dims.size(); ++i) { + bool keep = (res_type.dimension_index(dims[i].name) != ValueType::Dimension::npos); + if (keep) { + keep_dims.push_back(i); + } else { + ++num_reduce_dims; + } + } +} + +SparseReducePlan::~SparseReducePlan() = default; + +//----------------------------------------------------------------------------- + +using ReduceTypify = TypifyValue<TypifyCellType,TypifyAggr>; + +Instruction +GenericReduce::make_instruction(const ValueType &type, Aggr aggr, const std::vector<vespalib::string> &dimensions, + const ValueBuilderFactory &factory, Stash &stash) +{ + auto ¶m = stash.create<ReduceParam>(type, dimensions, factory); + auto fun = typify_invoke<3,ReduceTypify,SelectGenericReduceOp>(type.cell_type(), param.res_type.cell_type(), aggr); + return Instruction(fun, wrap_param<ReduceParam>(param)); +} + +} // namespace diff --git a/eval/src/vespa/eval/instruction/generic_reduce.h b/eval/src/vespa/eval/instruction/generic_reduce.h new file mode 100644 index 00000000000..e412f8795ac --- /dev/null +++ b/eval/src/vespa/eval/instruction/generic_reduce.h @@ -0,0 +1,54 @@ +// Copyright Verizon Media. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <vespa/eval/eval/value_type.h> +#include <vespa/eval/eval/aggr.h> +#include <vespa/eval/eval/interpreted_function.h> +#include <vespa/eval/eval/nested_loop.h> + +namespace vespalib { class Stash; } +namespace vespalib::eval { struct ValueBuilderFactory; } + +namespace vespalib::eval::instruction { + +//----------------------------------------------------------------------------- + +struct GenericReduce { + static InterpretedFunction::Instruction + make_instruction(const ValueType &type, Aggr aggr, const std::vector<vespalib::string> &dimensions, + const ValueBuilderFactory &factory, Stash &stash); +}; + +//----------------------------------------------------------------------------- + +struct DenseReducePlan { + size_t in_size; + size_t out_size; + std::vector<size_t> keep_loop; + std::vector<size_t> keep_stride; + std::vector<size_t> reduce_loop; + std::vector<size_t> reduce_stride; + DenseReducePlan(const ValueType &type, const ValueType &res_type); + ~DenseReducePlan(); + template <typename F> void execute_keep(const F &f) const { + run_nested_loop(0, keep_loop, keep_stride, f); + } + template <typename F> void execute_reduce(size_t offset, const F &f) const { + run_nested_loop(offset, reduce_loop, reduce_stride, f); + } + template <typename FIRST, typename NEXT> void execute_reduce(size_t offset, const FIRST &first, const NEXT &next) const { + run_nested_loop(offset, reduce_loop, reduce_stride, first, next); + } +}; + +struct SparseReducePlan { + size_t num_reduce_dims; + std::vector<size_t> keep_dims; + SparseReducePlan(const ValueType &type, const ValueType &res_type); + ~SparseReducePlan(); +}; + +//----------------------------------------------------------------------------- + +} // namespace diff --git a/eval/src/vespa/eval/instruction/generic_rename.cpp b/eval/src/vespa/eval/instruction/generic_rename.cpp index cda0aa6e89a..1f8398555b4 100644 --- a/eval/src/vespa/eval/instruction/generic_rename.cpp +++ 
b/eval/src/vespa/eval/instruction/generic_rename.cpp @@ -53,13 +53,13 @@ struct RenameParam { SparseRenamePlan sparse_plan; DenseRenamePlan dense_plan; const ValueBuilderFactory &factory; - RenameParam(const ValueType &lhs_type, const ValueType &output_type, + RenameParam(const ValueType &lhs_type, const std::vector<vespalib::string> &rename_dimension_from, const std::vector<vespalib::string> &rename_dimension_to, const ValueBuilderFactory &factory_in) - : res_type(output_type), - sparse_plan(lhs_type, output_type, rename_dimension_from, rename_dimension_to), - dense_plan(lhs_type, output_type, rename_dimension_from, rename_dimension_to), + : res_type(lhs_type.rename(rename_dimension_from, rename_dimension_to)), + sparse_plan(lhs_type, res_type, rename_dimension_from, rename_dimension_to), + dense_plan(lhs_type, res_type, rename_dimension_from, rename_dimension_to), factory(factory_in) { assert(!res_type.is_error()); @@ -181,15 +181,15 @@ DenseRenamePlan::DenseRenamePlan(const ValueType &lhs_type, DenseRenamePlan::~DenseRenamePlan() = default; InterpretedFunction::Instruction -GenericRename::make_instruction(const ValueType &lhs_type, const ValueType &output_type, +GenericRename::make_instruction(const ValueType &lhs_type, const std::vector<vespalib::string> &rename_dimension_from, const std::vector<vespalib::string> &rename_dimension_to, const ValueBuilderFactory &factory, Stash &stash) { - auto ¶m = stash.create<RenameParam>(lhs_type, output_type, + auto ¶m = stash.create<RenameParam>(lhs_type, rename_dimension_from, rename_dimension_to, factory); - auto fun = typify_invoke<1,TypifyCellType,SelectGenericRenameOp>(output_type.cell_type()); + auto fun = typify_invoke<1,TypifyCellType,SelectGenericRenameOp>(param.res_type.cell_type()); return Instruction(fun, wrap_param<RenameParam>(param)); } diff --git a/eval/src/vespa/eval/instruction/generic_rename.h b/eval/src/vespa/eval/instruction/generic_rename.h index ca9f45bd341..6c94ff02b24 100644 --- 
a/eval/src/vespa/eval/instruction/generic_rename.h +++ b/eval/src/vespa/eval/instruction/generic_rename.h @@ -2,6 +2,7 @@ #pragma once +#include <vespa/eval/eval/nested_loop.h> #include <vespa/eval/eval/value_type.h> #include <vespa/eval/eval/interpreted_function.h> #include <vespa/vespalib/stllike/string.h> @@ -20,37 +21,8 @@ struct DenseRenamePlan { const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to); ~DenseRenamePlan(); - template <typename F> void execute(size_t offset, F &&f) const { - switch(loops_left(0)) { - case 0: return execute_few<F, 0>(0, offset, std::forward<F>(f)); - case 1: return execute_few<F, 1>(0, offset, std::forward<F>(f)); - case 2: return execute_few<F, 2>(0, offset, std::forward<F>(f)); - case 3: return execute_few<F, 3>(0, offset, std::forward<F>(f)); - default: return execute_many<F>(0, offset, std::forward<F>(f)); - } - } -private: - size_t loops_left(size_t idx) const { return (loop_cnt.size() - idx); } - - template <typename F, size_t N> void execute_few(size_t idx, size_t offset, F &&f) const { - if constexpr (N == 0) { - f(offset); - } else { - for (size_t i = 0; i < loop_cnt[idx]; ++i) { - execute_few<F, N - 1>(idx + 1, offset, std::forward<F>(f)); - offset += stride[idx]; - } - } - } - template <typename F> void execute_many(size_t idx, size_t offset, F &&f) const { - for (size_t i = 0; i < loop_cnt[idx]; ++i) { - if (loops_left(idx + 1) == 3) { - execute_few<F, 3>(idx + 1, offset, std::forward<F>(f)); - } else { - execute_many<F>(idx + 1, offset, std::forward<F>(f)); - } - offset += stride[idx]; - } + template <typename F> void execute(size_t offset, const F &f) const { + run_nested_loop(offset, loop_cnt, stride, f); } }; @@ -68,7 +40,7 @@ struct SparseRenamePlan { struct GenericRename { static InterpretedFunction::Instruction - make_instruction(const ValueType &lhs_type, const ValueType &output_type, + make_instruction(const ValueType &lhs_type, const std::vector<vespalib::string> 
&rename_dimension_from, const std::vector<vespalib::string> &rename_dimension_to, const ValueBuilderFactory &factory, Stash &stash); |