diff options
author | Arne H Juul <arnej27959@users.noreply.github.com> | 2020-10-22 22:22:24 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-10-22 22:22:24 +0200 |
commit | b6e7689c75e280e27cd397f41a949fd98635d3a3 (patch) | |
tree | 9a702f7c82c815c5017c9dd1afac6f3a80ef4c8b /eval | |
parent | d01edb5dff3a2c63a47109d387a079abd7e0bfe7 (diff) | |
parent | db9120c61f1f5974aed00f06a8f6fa56809e0e68 (diff) |
Merge pull request #15012 from vespa-engine/havardpe/better-dense-plan-for-generic-reduce
improve generic dense reduce with more robust cell ordering
Diffstat (limited to 'eval')
-rw-r--r-- | eval/src/tests/eval/aggr/aggr_test.cpp | 28 | ||||
-rw-r--r-- | eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp | 14 | ||||
-rw-r--r-- | eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp | 56 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/aggr.cpp | 4 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/aggr.h | 68 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_reduce.cpp | 98 | ||||
-rw-r--r-- | eval/src/vespa/eval/instruction/generic_reduce.h | 14 | ||||
-rw-r--r-- | eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp | 9 |
8 files changed, 178 insertions, 113 deletions
diff --git a/eval/src/tests/eval/aggr/aggr_test.cpp b/eval/src/tests/eval/aggr/aggr_test.cpp index b1da9704bfc..603fdb508e2 100644 --- a/eval/src/tests/eval/aggr/aggr_test.cpp +++ b/eval/src/tests/eval/aggr/aggr_test.cpp @@ -5,6 +5,7 @@ using vespalib::Stash; using namespace vespalib::eval; +using namespace vespalib::eval::aggr; TEST("require that aggregator list returns appropriate entries") { auto list = Aggregator::list(); @@ -83,4 +84,31 @@ TEST("require that MIN aggregator works as expected") { aggr.next(200.0), EXPECT_EQUAL(aggr.result(), 100.0); } +template <template <typename T> typename A> +float aggr_merge(const std::vector<float> &a, const std::vector<float> &b) { + A<float> aggr0; + A<float> aggr1; + A<float> aggr2; + A<float> aggr3; + for (float v: a) { + aggr1.sample(v); + } + for (float v: b) { + aggr2.sample(v); + } + aggr0.merge(aggr1); + aggr2.merge(aggr3); + aggr0.merge(aggr2); + return aggr0.result(); +} + +TEST("require that aggregator merge works") { + EXPECT_EQUAL(aggr_merge<Avg>({1,2},{3,4}), 2.5); + EXPECT_EQUAL(aggr_merge<Count>({1,2},{3,4}), 4.0); + EXPECT_EQUAL(aggr_merge<Prod>({1,2},{3,4}), 24.0); + EXPECT_EQUAL(aggr_merge<Sum>({1,2},{3,4}), 10.0); + EXPECT_EQUAL(aggr_merge<Max>({1,2},{3,4}), 4.0); + EXPECT_EQUAL(aggr_merge<Min>({1,2},{3,4}), 1.0); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp b/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp index cc36c139c99..d894d273f02 100644 --- a/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp +++ b/eval/src/tests/instruction/generic_reduce/generic_reduce_test.cpp @@ -76,16 +76,14 @@ TensorSpec perform_generic_reduce(const TensorSpec &a, const std::vector<vespali TEST(GenericReduceTest, dense_reduce_plan_can_be_created) { auto type = ValueType::from_spec("tensor(a[2],aa{},b[2],bb[1],c[2],cc{},d[2],dd[1],e[2],ee{},f[2])"); auto plan = DenseReducePlan(type, type.reduce({"a", "d", "e"})); - 
std::vector<size_t> expect_keep_loop = {4,2}; - std::vector<size_t> expect_keep_stride = {8,1}; - std::vector<size_t> expect_reduce_loop = {2,4}; - std::vector<size_t> expect_reduce_stride = {32,2}; + std::vector<size_t> expect_loop_cnt = {2,4,4,2}; + std::vector<size_t> expect_in_stride = {32,2,8,1}; + std::vector<size_t> expect_out_stride = {0,0,2,1}; EXPECT_EQ(plan.in_size, 64); EXPECT_EQ(plan.out_size, 8); - EXPECT_EQ(plan.keep_loop, expect_keep_loop); - EXPECT_EQ(plan.keep_stride, expect_keep_stride); - EXPECT_EQ(plan.reduce_loop, expect_reduce_loop); - EXPECT_EQ(plan.reduce_stride, expect_reduce_stride); + EXPECT_EQ(plan.loop_cnt, expect_loop_cnt); + EXPECT_EQ(plan.in_stride, expect_in_stride); + EXPECT_EQ(plan.out_stride, expect_out_stride); } TEST(GenericReduceTest, sparse_reduce_plan_can_be_created) { diff --git a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp index 1c892a0a3a5..0299dc3ebba 100644 --- a/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp +++ b/eval/src/tests/tensor/instruction_benchmark/instruction_benchmark.cpp @@ -127,35 +127,40 @@ struct Impl { vespalib::string name; vespalib::string short_name; EngineOrFactory engine; - Impl(size_t order_in, const vespalib::string &name_in, const vespalib::string &short_name_in, EngineOrFactory engine_in) - : order(order_in), name(name_in), short_name(short_name_in), engine(engine_in) {} + bool optimize; + Impl(size_t order_in, const vespalib::string &name_in, const vespalib::string &short_name_in, EngineOrFactory engine_in, bool optimize_in) + : order(order_in), name(name_in), short_name(short_name_in), engine(engine_in), optimize(optimize_in) {} Value::UP create_value(const TensorSpec &spec) const { return engine.from_spec(spec); } TensorSpec create_spec(const Value &value) const { return engine.to_spec(value); } Instruction create_join(const ValueType &lhs, const ValueType &rhs, 
operation::op2_t function, Stash &stash) const { // create a complete tensor function, but only compile the relevant instruction const auto &lhs_node = tensor_function::inject(lhs, 0, stash); const auto &rhs_node = tensor_function::inject(rhs, 1, stash); - const auto &join_node = tensor_function::join(lhs_node, rhs_node, function, stash); - return join_node.compile_self(engine, stash); + const auto &join_node = tensor_function::join(lhs_node, rhs_node, function, stash); + const auto &node = optimize ? engine.optimize(join_node, stash) : join_node; + return node.compile_self(engine, stash); } Instruction create_reduce(const ValueType &lhs, Aggr aggr, const std::vector<vespalib::string> &dims, Stash &stash) const { // create a complete tensor function, but only compile the relevant instruction const auto &lhs_node = tensor_function::inject(lhs, 0, stash); - const auto &reduce_node = tensor_function::reduce(lhs_node, aggr, dims, stash); - return reduce_node.compile_self(engine, stash); + const auto &reduce_node = tensor_function::reduce(lhs_node, aggr, dims, stash); + const auto &node = optimize ? engine.optimize(reduce_node, stash) : reduce_node; + return node.compile_self(engine, stash); } Instruction create_rename(const ValueType &lhs, const std::vector<vespalib::string> &from, const std::vector<vespalib::string> &to, Stash &stash) const { // create a complete tensor function, but only compile the relevant instruction const auto &lhs_node = tensor_function::inject(lhs, 0, stash); const auto &rename_node = tensor_function::rename(lhs_node, from, to, stash); - return rename_node.compile_self(engine, stash); + const auto &node = optimize ? 
engine.optimize(rename_node, stash) : rename_node; + return node.compile_self(engine, stash); } Instruction create_merge(const ValueType &lhs, const ValueType &rhs, operation::op2_t function, Stash &stash) const { // create a complete tensor function, but only compile the relevant instruction const auto &lhs_node = tensor_function::inject(lhs, 0, stash); const auto &rhs_node = tensor_function::inject(rhs, 1, stash); const auto &merge_node = tensor_function::merge(lhs_node, rhs_node, function, stash); - return merge_node.compile_self(engine, stash); + const auto &node = optimize ? engine.optimize(merge_node, stash) : merge_node; + return node.compile_self(engine, stash); } Instruction create_concat(const ValueType &lhs, const ValueType &rhs, const std::string &dimension, Stash &stash) const { // create a complete tensor function, but only compile the relevant instruction @@ -163,12 +168,15 @@ struct Impl { const auto &rhs_node = tensor_function::inject(rhs, 1, stash); const auto &concat_node = tensor_function::concat(lhs_node, rhs_node, dimension, stash); return concat_node.compile_self(engine, stash); + const auto &node = optimize ? engine.optimize(concat_node, stash) : concat_node; + return node.compile_self(engine, stash); } Instruction create_map(const ValueType &lhs, operation::op1_t function, Stash &stash) const { // create a complete tensor function, but only compile the relevant instruction const auto &lhs_node = tensor_function::inject(lhs, 0, stash); const auto &map_node = tensor_function::map(lhs_node, function, stash); - return map_node.compile_self(engine, stash); + const auto &node = optimize ? 
engine.optimize(map_node, stash) : map_node; + return node.compile_self(engine, stash); } Instruction create_tensor_create(const ValueType &proto_type, const TensorSpec &proto, Stash &stash) const { // create a complete tensor function, but only compile the relevant instruction @@ -178,7 +186,8 @@ struct Impl { spec.emplace(cell.first, my_double); } const auto &create_tensor_node = tensor_function::create(proto_type, spec, stash); - return create_tensor_node.compile_self(engine, stash); + const auto &node = optimize ? engine.optimize(create_tensor_node, stash) : create_tensor_node; + return node.compile_self(engine, stash); } Instruction create_tensor_lambda(const ValueType &type, const Function &function, const ValueType &p0_type, Stash &stash) const { std::vector<ValueType> arg_types(type.dimensions().size(), ValueType::double_type()); @@ -186,7 +195,8 @@ struct Impl { NodeTypes types(function, arg_types); EXPECT_EQ(types.errors(), std::vector<vespalib::string>()); const auto &tensor_lambda_node = tensor_function::lambda(type, {0}, function, std::move(types), stash); - return tensor_lambda_node.compile_self(engine, stash); + const auto &node = optimize ? engine.optimize(tensor_lambda_node, stash) : tensor_lambda_node; + return node.compile_self(engine, stash); } Instruction create_tensor_peek(const ValueType &type, const MyPeekSpec &my_spec, Stash &stash) const { // create a complete tensor function, but only compile the relevant instruction @@ -209,18 +219,19 @@ struct Impl { } } const auto &peek_node = tensor_function::peek(my_param, spec, stash); - return peek_node.compile_self(engine, stash); + const auto &node = optimize ? 
engine.optimize(peek_node, stash) : peek_node; + return node.compile_self(engine, stash); } }; //----------------------------------------------------------------------------- -Impl simple_tensor_engine_impl(5, " SimpleTensorEngine", " SimpleT", SimpleTensorEngine::ref()); -Impl default_tensor_engine_impl(1, "DefaultTensorEngine", "OLD PROD", DefaultTensorEngine::ref()); -Impl simple_value_impl(2, " SimpleValue", " SimpleV", SimpleValueBuilderFactory::get()); -Impl fast_value_impl(0, " FastValue", "NEW PROD", FastValueBuilderFactory::get()); -Impl packed_mixed_tensor_impl(4, " PackedMixedTensor", " Packed", PackedMixedTensorBuilderFactory::get()); -Impl default_tensor_value_impl(3, " DefaultValue", "DefaultV", DefaultValueBuilderFactory::get()); +Impl default_tensor_engine_impl(1, "DefaultTensorEngine", "OLD PROD", DefaultTensorEngine::ref(), false); +Impl simple_value_impl(3, " SimpleValue", " SimpleV", SimpleValueBuilderFactory::get(), false); +Impl fast_value_impl(0, " FastValue", "NEW PROD", FastValueBuilderFactory::get(), false); +Impl optimized_fast_value_impl(2, "Optimized FastValue", "Optimize", FastValueBuilderFactory::get(), true); +Impl packed_mixed_tensor_impl(5, " PackedMixedTensor", " Packed", PackedMixedTensorBuilderFactory::get(), false); +Impl default_tensor_value_impl(4, " DefaultValue", "DefaultV", DefaultValueBuilderFactory::get(), false); vespalib::string short_header("--------"); constexpr double budget = 5.0; @@ -228,10 +239,10 @@ constexpr double best_limit = 0.95; // everything within 95% of best performance constexpr double bad_limit = 0.90; // BAD: new prod has performance lower than 90% of old prod constexpr double good_limit = 1.10; // GOOD: new prod has performance higher than 110% of old prod -std::vector<CREF<Impl>> impl_list = {simple_tensor_engine_impl, - default_tensor_engine_impl, +std::vector<CREF<Impl>> impl_list = {default_tensor_engine_impl, simple_value_impl, fast_value_impl, + optimized_fast_value_impl, 
packed_mixed_tensor_impl, default_tensor_value_impl}; @@ -756,6 +767,11 @@ TEST(MixedJoin, no_overlap) { //----------------------------------------------------------------------------- +TEST(ReduceBench, number_reduce) { + auto lhs = make_spec(1.0); + benchmark_reduce("number reduce", lhs, Aggr::SUM, {}); +} + TEST(ReduceBench, dense_reduce) { auto lhs = make_cube(D::idx("a", 16), D::idx("b", 16), D::idx("c", 16), 1.0); benchmark_reduce("dense reduce inner", lhs, Aggr::SUM, {"c"}); diff --git a/eval/src/vespa/eval/eval/aggr.cpp b/eval/src/vespa/eval/eval/aggr.cpp index 8efb0ec9fe7..e731c7a1f09 100644 --- a/eval/src/vespa/eval/eval/aggr.cpp +++ b/eval/src/vespa/eval/eval/aggr.cpp @@ -14,8 +14,8 @@ namespace { template <typename T> struct Wrapper : Aggregator { T aggr; - virtual void first(double value) final override { aggr.first(value); } - virtual void next(double value) final override { aggr.next(value); } + virtual void first(double value) final override { aggr = T{value}; } + virtual void next(double value) final override { aggr.sample(value); } virtual double result() const final override { return aggr.result(); } }; diff --git a/eval/src/vespa/eval/eval/aggr.h b/eval/src/vespa/eval/eval/aggr.h index a67d8f77a15..050287d183c 100644 --- a/eval/src/vespa/eval/eval/aggr.h +++ b/eval/src/vespa/eval/eval/aggr.h @@ -60,63 +60,75 @@ namespace aggr { template <typename T> class Avg { private: - T _sum = 0.0; - size_t _cnt = 0; + T _sum; + size_t _cnt; public: - void first(T value) { - _sum = value; - _cnt = 1; - } - void next(T value) { + constexpr Avg() : _sum{0}, _cnt{0} {} + constexpr Avg(T value) : _sum{value}, _cnt{1} {} + constexpr void sample(T value) { _sum += value; ++_cnt; } - T result() const { return (_sum / _cnt); } + constexpr void merge(const Avg &rhs) { + _sum += rhs._sum; + _cnt += rhs._cnt; + }; + constexpr T result() const { return (_sum / _cnt); } }; template <typename T> class Count { private: - size_t _cnt = 0; + size_t _cnt; public: - void 
first(T) { _cnt = 1; } - void next(T) { ++_cnt; } - T result() const { return _cnt; } + constexpr Count() : _cnt{0} {} + constexpr Count(T) : _cnt{1} {} + constexpr void sample(T) { ++_cnt; } + constexpr void merge(const Count &rhs) { _cnt += rhs._cnt; } + constexpr T result() const { return _cnt; } }; template <typename T> class Prod { private: - T _prod = 1.0; + T _prod; public: - void first(T value) { _prod = value; } - void next(T value) { _prod *= value; } - T result() const { return _prod; } + constexpr Prod() : _prod{1} {} + constexpr Prod(T value) : _prod{value} {} + constexpr void sample(T value) { _prod *= value; } + constexpr void merge(const Prod &rhs) { _prod *= rhs._prod; } + constexpr T result() const { return _prod; } }; template <typename T> class Sum { private: - T _sum = 0.0; + T _sum; public: - void first(T value) { _sum = value; } - void next(T value) { _sum += value; } - T result() const { return _sum; } + constexpr Sum() : _sum{0} {} + constexpr Sum(T value) : _sum{value} {} + constexpr void sample(T value) { _sum += value; } + constexpr void merge(const Sum &rhs) { _sum += rhs._sum; } + constexpr T result() const { return _sum; } }; template <typename T> class Max { private: - T _max = -std::numeric_limits<T>::infinity(); + T _max; public: - void first(T value) { _max = value; } - void next(T value) { _max = std::max(_max, value); } - T result() const { return _max; } + constexpr Max() : _max{-std::numeric_limits<T>::infinity()} {} + constexpr Max(T value) : _max{value} {} + constexpr void sample(T value) { _max = std::max(_max, value); } + constexpr void merge(const Max &rhs) { _max = std::max(_max, rhs._max); } + constexpr T result() const { return _max; } }; template <typename T> class Min { private: - T _min = std::numeric_limits<T>::infinity(); + T _min; public: - void first(T value) { _min = value; } - void next(T value) { _min = std::min(_min, value); } - T result() const { return _min; } + constexpr Min() : 
_min{std::numeric_limits<T>::infinity()} {} + constexpr Min(T value) : _min{value} {} + constexpr void sample(T value) { _min = std::min(_min, value); } + constexpr void merge(const Min &rhs) { _min = std::min(_min, rhs._min); } + constexpr T result() const { return _min; } }; } // namespave vespalib::eval::aggr diff --git a/eval/src/vespa/eval/instruction/generic_reduce.cpp b/eval/src/vespa/eval/instruction/generic_reduce.cpp index 43e11ced49b..ca9b9f0cd52 100644 --- a/eval/src/vespa/eval/instruction/generic_reduce.cpp +++ b/eval/src/vespa/eval/instruction/generic_reduce.cpp @@ -6,6 +6,8 @@ #include <vespa/eval/eval/array_array_map.h> #include <vespa/vespalib/util/stash.h> #include <vespa/vespalib/util/typify.h> +#include <vespa/vespalib/util/overload.h> +#include <vespa/vespalib/util/visit_ranges.h> #include <cassert> using namespace vespalib::eval::tensor_function; @@ -77,13 +79,9 @@ generic_reduce(const Value &value, const ReduceParam &param) { ConstArrayRef<vespalib::stringref*> keep_addr(sparse.keep_address); while (full_view->next_result(sparse.fetch_address, sparse.subspace)) { auto [tag, ignore] = map.lookup_or_add_entry(keep_addr); - AGGR *dst = nullptr; - auto next = [&](size_t idx) { (dst++)->next(cells[idx]); }; - auto fill_aggrs = [&,tag = tag](size_t idx) { - dst = map.get_values(tag).begin(); - param.dense_plan.execute_keep(idx, next); - }; - param.dense_plan.execute_reduce((sparse.subspace * param.dense_plan.in_size), fill_aggrs); + AGGR *dst = map.get_values(tag).begin(); + auto sample = [&](size_t src_idx, size_t dst_idx) { dst[dst_idx].sample(cells[src_idx]); }; + param.dense_plan.execute(sparse.subspace * param.dense_plan.in_size, sample); } auto builder = param.factory.create_value_builder<OCT>(param.res_type, param.sparse_plan.keep_dims.size(), param.dense_plan.out_size, map.size()); map.each_entry([&](const auto &keys, const auto &values) @@ -116,12 +114,21 @@ template <typename ICT, typename AGGR> void my_full_reduce_op(State &state, uint64_t) 
{ auto cells = state.peek(0).cells().typify<ICT>(); if (cells.size() > 0) { - AGGR aggr; - aggr.first(cells[0]); - for (size_t i = 1; i < cells.size(); ++i) { - aggr.next(cells[i]); + AGGR aggr[4]; + size_t i = 0; + for (; (i + 3) < cells.size(); i += 4) { + aggr[0].sample(cells[i+0]); + aggr[1].sample(cells[i+1]); + aggr[2].sample(cells[i+2]); + aggr[3].sample(cells[i+3]); } - state.pop_push(state.stash.create<DoubleValue>(aggr.result())); + for (; i < cells.size(); ++i) { + aggr[0].sample(cells[i]); + } + aggr[0].merge(aggr[1]); + aggr[0].merge(aggr[2]); + aggr[0].merge(aggr[3]); + state.pop_push(state.stash.create<DoubleValue>(aggr[0].result())); } else { state.pop_push(state.stash.create<DoubleValue>(0.0)); } @@ -154,39 +161,48 @@ struct PerformGenericReduce { DenseReducePlan::DenseReducePlan(const ValueType &type, const ValueType &res_type) : in_size(1), out_size(1), - keep_loop(), - keep_stride(), - reduce_loop(), - reduce_stride() + loop_cnt(), + in_stride(), + out_stride() { - std::vector<bool> keep; - std::vector<size_t> size; - for (const auto &dim: type.nontrivial_indexed_dimensions()) { - keep.push_back(res_type.dimension_index(dim.name) != ValueType::Dimension::npos); - size.push_back(dim.size); - } - std::vector<size_t> stride(size.size(), 0); - for (size_t i = stride.size(); i-- > 0; ) { - stride[i] = in_size; - in_size *= size[i]; - if (keep[i]) { - out_size *= size[i]; - } - } - int prev_case = 2; - for (size_t i = 0; i < size.size(); ++i) { - int my_case = keep[i] ? 1 : 0; - auto &my_loop = keep[i] ? keep_loop : reduce_loop; - auto &my_stride = keep[i] ? 
keep_stride : reduce_stride; + enum class Case { NONE, KEEP, REDUCE }; + Case prev_case = Case::NONE; + auto update_plan = [&](Case my_case, size_t my_size) { if (my_case == prev_case) { - assert(!my_loop.empty()); - my_loop.back() *= size[i]; - my_stride.back() = stride[i]; + assert(!loop_cnt.empty()); + loop_cnt.back() *= my_size; } else { - my_loop.push_back(size[i]); - my_stride.push_back(stride[i]); + loop_cnt.push_back(my_size); + in_stride.push_back(1); + out_stride.push_back((my_case == Case::KEEP) ? 1 : 0); + prev_case = my_case; + } + }; + auto visitor = overload + { + [&](visit_ranges_either, const auto &a) { update_plan(Case::REDUCE, a.size); }, + [&](visit_ranges_both, const auto &a, const auto &) { update_plan(Case::KEEP, a.size); } + }; + auto in_dims = type.nontrivial_indexed_dimensions(); + auto out_dims = res_type.nontrivial_indexed_dimensions(); + visit_ranges(visitor, in_dims.begin(), in_dims.end(), out_dims.begin(), out_dims.end(), + [](const auto &a, const auto &b){ return (a.name < b.name); }); + for (size_t i = loop_cnt.size(); i-- > 0; ) { + in_stride[i] = in_size; + in_size *= loop_cnt[i]; + if (out_stride[i] != 0) { + out_stride[i] = out_size; + out_size *= loop_cnt[i]; + } + } + for (size_t i = 1; i < loop_cnt.size(); ++i) { + for (size_t j = i; j > 0; --j) { + if ((out_stride[j] == 0) && (out_stride[j - 1] > 0)) { + std::swap(loop_cnt[j], loop_cnt[j - 1]); + std::swap(in_stride[j], in_stride[j - 1]); + std::swap(out_stride[j], out_stride[j - 1]); + } } - prev_case = my_case; } } diff --git a/eval/src/vespa/eval/instruction/generic_reduce.h b/eval/src/vespa/eval/instruction/generic_reduce.h index 2edceb291b1..cacc0f4cfd3 100644 --- a/eval/src/vespa/eval/instruction/generic_reduce.h +++ b/eval/src/vespa/eval/instruction/generic_reduce.h @@ -17,17 +17,13 @@ namespace vespalib::eval::instruction { struct DenseReducePlan { size_t in_size; size_t out_size; - std::vector<size_t> keep_loop; - std::vector<size_t> keep_stride; - 
std::vector<size_t> reduce_loop; - std::vector<size_t> reduce_stride; + std::vector<size_t> loop_cnt; + std::vector<size_t> in_stride; + std::vector<size_t> out_stride; DenseReducePlan(const ValueType &type, const ValueType &res_type); ~DenseReducePlan(); - template <typename F> void execute_keep(size_t offset, const F &f) const { - run_nested_loop(offset, keep_loop, keep_stride, f); - } - template <typename F> void execute_reduce(size_t offset, const F &f) const { - run_nested_loop(offset, reduce_loop, reduce_stride, f); + template <typename F> void execute(size_t in_idx, const F &f) const { + run_nested_loop(in_idx, 0, loop_cnt, in_stride, out_stride, f); } }; diff --git a/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp b/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp index c0bc3f26fe9..c0fc29d2800 100644 --- a/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp +++ b/eval/src/vespa/eval/tensor/dense/dense_single_reduce_function.cpp @@ -43,11 +43,11 @@ struct Params { }; template <typename CT, typename AGGR> -CT reduce_cells(const CT *src, size_t dim_size, size_t stride, AGGR &aggr) { - aggr.first(*src); +CT reduce_cells(const CT *src, size_t dim_size, size_t stride) { + AGGR aggr(*src); for (size_t i = 1; i < dim_size; ++i) { src += stride; - aggr.next(*src); + aggr.sample(*src); } return aggr.result(); } @@ -57,12 +57,11 @@ void my_single_reduce_op(InterpretedFunction::State &state, uint64_t param) { const auto &params = unwrap_param<Params>(param); const CT *src = state.peek(0).cells().typify<CT>().cbegin(); auto dst_cells = state.stash.create_array<CT>(params.outer_size * params.inner_size); - AGGR aggr; CT *dst = dst_cells.begin(); const size_t block_size = (params.dim_size * params.inner_size); for (size_t outer = 0; outer < params.outer_size; ++outer) { for (size_t inner = 0; inner < params.inner_size; ++inner) { - *dst++ = reduce_cells<CT, AGGR>(src + inner, params.dim_size, params.inner_size, aggr); + *dst++ 
= reduce_cells<CT, AGGR>(src + inner, params.dim_size, params.inner_size); } src += block_size; } |