diff options
author | Håvard Pettersen <havardpe@yahooinc.com> | 2023-08-17 13:23:42 +0000 |
---|---|---|
committer | Håvard Pettersen <havardpe@yahooinc.com> | 2023-08-23 14:32:34 +0000 |
commit | eed4b7ec6d3753bd9a1e27655d8c81716d47bd98 (patch) | |
tree | defe1e75ca959743037ab9a9bf6004d93a8655c5 /eval | |
parent | f37ad506b56dffd8002eb9334d0811e788dade01 (diff) |
sparse join reduce plan
Diffstat (limited to 'eval')
7 files changed, 437 insertions, 0 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt index 4e9fd1b27be..e149727d94f 100644 --- a/eval/CMakeLists.txt +++ b/eval/CMakeLists.txt @@ -83,6 +83,7 @@ vespa_define_module( src/tests/instruction/sparse_112_dot_product src/tests/instruction/sparse_dot_product_function src/tests/instruction/sparse_full_overlap_join_function + src/tests/instruction/sparse_join_reduce_plan src/tests/instruction/sparse_merge_function src/tests/instruction/sparse_no_overlap_join_function src/tests/instruction/sparse_singledim_lookup diff --git a/eval/src/tests/instruction/sparse_join_reduce_plan/CMakeLists.txt b/eval/src/tests/instruction/sparse_join_reduce_plan/CMakeLists.txt new file mode 100644 index 00000000000..a333a5e9638 --- /dev/null +++ b/eval/src/tests/instruction/sparse_join_reduce_plan/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(eval_sparse_join_reduce_plan_test_app TEST + SOURCES + sparse_join_reduce_plan_test.cpp + DEPENDS + vespaeval + GTest::GTest +) +vespa_add_test(NAME eval_sparse_join_reduce_plan_test_app COMMAND eval_sparse_join_reduce_plan_test_app) diff --git a/eval/src/tests/instruction/sparse_join_reduce_plan/sparse_join_reduce_plan_test.cpp b/eval/src/tests/instruction/sparse_join_reduce_plan/sparse_join_reduce_plan_test.cpp new file mode 100644 index 00000000000..e101487ff59 --- /dev/null +++ b/eval/src/tests/instruction/sparse_join_reduce_plan/sparse_join_reduce_plan_test.cpp @@ -0,0 +1,205 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/eval/eval/value.h> +#include <vespa/eval/eval/fast_value.h> +#include <vespa/eval/eval/value_codec.h> +#include <vespa/eval/eval/test/gen_spec.h> +#include <vespa/eval/instruction/sparse_join_reduce_plan.h> +#include <vespa/vespalib/util/shared_string_repo.h> +#include <vespa/vespalib/gtest/gtest.h> +#include <algorithm> + +using namespace vespalib; +using namespace vespalib::eval; +using namespace vespalib::eval::test; +using namespace vespalib::eval::instruction; + +using Handle = vespalib::SharedStringRepo::Handle; + +Value::UP val(const vespalib::string &value_desc) { + return value_from_spec(GenSpec::from_desc(value_desc), FastValueBuilderFactory::get()); +} + +Handle make_handle(string_id id) { + return Handle::handle_from_id(id); +} + +Handle make_handle(const vespalib::string &str) { + return Handle(str); +} + +struct Event { + size_t lhs_idx; + size_t rhs_idx; + std::vector<Handle> res_addr; + template <typename ADDR> + Event(size_t a, size_t b, ADDR addr) + : lhs_idx(a), rhs_idx(b), res_addr() + { + for (auto label: addr) { + res_addr.push_back(make_handle(label)); + } + } + auto operator<=>(const Event &rhs) const = default; +}; + +struct Trace { + size_t estimate; + std::vector<Event> events; + Trace(size_t estimate_in) + : estimate(estimate_in), events() {} + void add_raw(size_t lhs_idx, size_t rhs_idx, ConstArrayRef<string_id> res_addr) { + events.emplace_back(lhs_idx, rhs_idx, res_addr); + } + Trace &add(size_t lhs_idx, size_t rhs_idx, std::vector<vespalib::string> res_addr) { + events.emplace_back(lhs_idx, rhs_idx, res_addr); + return *this; + } + auto operator<=>(const Trace &rhs) const = default; +}; + +std::ostream & +operator<<(std::ostream &os, const Event &event) { + os << "{ lhs: " << event.lhs_idx << ", rhs: " << event.rhs_idx << ", addr: ["; + for (size_t i = 0; i < event.res_addr.size(); ++i) { + if (i > 0) { + os << ", "; + } + os << event.res_addr[i].as_string(); + } + os << "] }"; + return os; +} + +std::ostream & +operator<<(std::ostream &os, const Trace &trace) { + os << "estimate: " << trace.estimate << "\n"; + for (const Event &event: trace.events) { + os << " " << event << "\n"; + } + return os; +} + +Trace trace(size_t est) { return Trace(est); } + +Trace trace(const vespalib::string &a_desc, const vespalib::string &b_desc, + const std::vector<vespalib::string> &reduce_dims) +{ + auto a = val(a_desc); + auto b = val(b_desc); + auto res_type = ValueType::join(a->type(), b->type()); + if (!reduce_dims.empty()) { + res_type = res_type.reduce(reduce_dims); + } + SparseJoinReducePlan plan(a->type(), b->type(), res_type); + Trace trace(plan.estimate_result_size(a->index(), b->index())); + plan.execute(a->index(), b->index(), + [&trace](size_t lhs_idx, size_t rhs_idx, ConstArrayRef<string_id> res_addr) { + trace.add_raw(lhs_idx, rhs_idx, res_addr); + }); + return trace; +} + +//----------------------------------------------------------------------------- + +TEST(SparseJoinReducePlanTest, simple_dense) { + EXPECT_EQ(trace("x10", "x10", {}), trace(1).add(0, 0, {})); + EXPECT_EQ(trace("x10", "x10", {"x"}), trace(1).add(0, 0, {})); +} + +TEST(SparseJoinReducePlanTest, many_dimensions) { + EXPECT_EQ(trace("a1_1b1_2c1_3d1_4", "c1_3d1_4e1_5f1_6", {"b","d","f"}), trace(1).add(0, 0, {"1", "3", "5"})); + EXPECT_EQ(trace("c1_3d1_4e1_5f1_6", "a1_1b1_2c1_3d1_4", {"b","d","f"}), trace(1).add(0, 0, {"1", "3", "5"})); +} + +TEST(SparseJoinReducePlanTest, traverse_order_can_be_swapped) { + EXPECT_EQ(trace("x2_4", "y3_1", {}), trace(6).add(0, 0, {"4", "1"}).add(0, 1, {"4", "2"}).add(0, 2, {"4", "3"}) + .add(1, 0, {"8", "1"}).add(1, 1, {"8", "2"}).add(1, 2, {"8", "3"})); + EXPECT_EQ(trace("y3_1", "x2_4", {}), trace(6).add(0, 0, {"4", "1"}).add(1, 0, {"4", "2"}).add(2, 0, {"4", "3"}) + .add(0, 1, {"8", "1"}).add(1, 1, {"8", "2"}).add(2, 1, {"8", "3"})); +} + +//----------------------------------------------------------------------------- + +TEST(SparseJoinReducePlanTest, full_overlap_no_reduce) { + EXPECT_EQ(trace("x4_1", "x2_2", {}), trace(2).add(1, 0, {"2"}).add(3, 1, {"4"})); + EXPECT_EQ(trace("x1_1", "x0_0", {}), trace(0)); + EXPECT_EQ(trace("x0_0", "x1_1", {}), trace(0)); +} + +TEST(SparseJoinReducePlanTest, full_overlap_reduce_all) { + EXPECT_EQ(trace("x4_1", "x2_2", {"x"}), trace(1).add(1, 0, {}).add(3, 1, {})); + EXPECT_EQ(trace("x1_1", "x0_0", {"x"}), trace(1)); + EXPECT_EQ(trace("x0_0", "x1_1", {"x"}), trace(1)); +} + +//----------------------------------------------------------------------------- + +TEST(SparseJoinReducePlanTest, no_overlap_no_reduce) { + EXPECT_EQ(trace("x2_1", "y3_1", {}), trace(6).add(0, 0, {"1", "1"}).add(0, 1, {"1", "2"}).add(0, 2, {"1", "3"}) + .add(1, 0, {"2", "1"}).add(1, 1, {"2", "2"}).add(1, 2, {"2", "3"})); + EXPECT_EQ(trace("x1_1", "y0_0", {}), trace(0)); + EXPECT_EQ(trace("y0_0", "x1_1", {}), trace(0)); +} + +TEST(SparseJoinReducePlanTest, no_overlap_reduce_last) { + EXPECT_EQ(trace("x2_1", "y3_1", {"y"}), trace(2).add(0, 0, {"1"}).add(0, 1, {"1"}).add(0, 2, {"1"}) + .add(1, 0, {"2"}).add(1, 1, {"2"}).add(1, 2, {"2"})); + EXPECT_EQ(trace("x1_1", "y0_0", {"y"}), trace(0)); + EXPECT_EQ(trace("y0_0", "x1_1", {"y"}), trace(0)); +} + +TEST(SparseJoinReducePlanTest, no_overlap_reduce_first) { + EXPECT_EQ(trace("x2_1", "y3_1", {"x"}), trace(3).add(0, 0, {"1"}).add(0, 1, {"2"}).add(0, 2, {"3"}) + .add(1, 0, {"1"}).add(1, 1, {"2"}).add(1, 2, {"3"})); + EXPECT_EQ(trace("x0_0", "y1_1", {"x"}), trace(0)); + EXPECT_EQ(trace("y1_1", "x0_0", {"x"}), trace(0)); +} + +TEST(SparseJoinReducePlanTest, no_overlap_reduce_all) { + EXPECT_EQ(trace("x2_1", "y3_1", {"x", "y"}), trace(1).add(0, 0, {}).add(0, 1, {}).add(0, 2, {}) + .add(1, 0, {}).add(1, 1, {}).add(1, 2, {})); + EXPECT_EQ(trace("x0_0", "y1_1", {"x", "y"}), trace(1)); + EXPECT_EQ(trace("y1_1", "x0_0", {"x", "y"}), trace(1)); +} + +//----------------------------------------------------------------------------- + +TEST(SparseJoinReducePlanTest, partial_overlap_no_reduce) { + EXPECT_EQ(trace("x2_1y1_1", "y1_1z2_3", {}), trace(2).add(0, 0, {"1", "1", "3"}).add(0, 1, {"1", "1", "6"}) + .add(1, 0, {"2", "1", "3"}).add(1, 1, {"2", "1", "6"})); + EXPECT_EQ(trace("x2_1y1_1", "y1_2z3_1", {}), trace(2)); + EXPECT_EQ(trace("x2_1y1_1", "y0_0z2_3", {}), trace(0)); +} + +TEST(SparseJoinReducePlanTest, partial_overlap_reduce_first) { + EXPECT_EQ(trace("x2_1y1_1", "y1_1z2_3", {"x"}), trace(2).add(0, 0, {"1", "3"}).add(0, 1, {"1", "6"}) + .add(1, 0, {"1", "3"}).add(1, 1, {"1", "6"})); + EXPECT_EQ(trace("x2_1y1_1", "y1_2z3_1", {"x"}), trace(2)); + EXPECT_EQ(trace("x2_1y1_1", "y0_0z2_3", {"x"}), trace(0)); +} + +TEST(SparseJoinReducePlanTest, partial_overlap_reduce_middle) { + EXPECT_EQ(trace("x2_1y1_1", "y1_1z2_3", {"y"}), trace(2).add(0, 0, {"1", "3"}).add(0, 1, {"1", "6"}) + .add(1, 0, {"2", "3"}).add(1, 1, {"2", "6"})); + EXPECT_EQ(trace("x2_1y1_1", "y1_2z3_1", {"y"}), trace(2)); + EXPECT_EQ(trace("x2_1y1_1", "y0_0z2_3", {"y"}), trace(0)); +} + +TEST(SparseJoinReducePlanTest, partial_overlap_reduce_last) { + EXPECT_EQ(trace("x2_1y1_1", "y1_1z2_3", {"z"}), trace(2).add(0, 0, {"1", "1"}).add(0, 1, {"1", "1"}) + .add(1, 0, {"2", "1"}).add(1, 1, {"2", "1"})); + EXPECT_EQ(trace("x2_1y1_1", "y1_2z3_1", {"z"}), trace(2)); + EXPECT_EQ(trace("x2_1y1_1", "y0_0z2_3", {"z"}), trace(0)); +} + +TEST(SparseJoinReducePlanTest, partial_overlap_reduce_all) { + EXPECT_EQ(trace("x2_1y1_1", "y1_1z2_3", {"x", "y", "z"}), trace(1).add(0, 0, {}).add(0, 1, {}) + .add(1, 0, {}).add(1, 1, {})); + EXPECT_EQ(trace("x2_1y1_1", "y1_2z3_1", {"x", "y", "z"}), trace(1)); + EXPECT_EQ(trace("x2_1y1_1", "y0_0z2_3", {"x", "y", "z"}), trace(1)); +} + +//----------------------------------------------------------------------------- + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/eval/src/vespa/eval/eval/value_type.h b/eval/src/vespa/eval/eval/value_type.h index 5c0d9e3317d..ddbdcfb4a5f 100644 --- a/eval/src/vespa/eval/eval/value_type.h +++ b/eval/src/vespa/eval/eval/value_type.h @@ -69,6 +69,9 @@ public: std::vector<Dimension> indexed_dimensions() const; std::vector<Dimension> mapped_dimensions() const; size_t dimension_index(const vespalib::string &name) const; + bool has_dimension(const vespalib::string &name) const { + return (dimension_index(name) != Dimension::npos); + } std::vector<vespalib::string> dimension_names() const; bool operator==(const ValueType &rhs) const noexcept { return ((_error == rhs._error) && diff --git a/eval/src/vespa/eval/instruction/CMakeLists.txt b/eval/src/vespa/eval/instruction/CMakeLists.txt index 02bbfec5dd3..487539aeff1 100644 --- a/eval/src/vespa/eval/instruction/CMakeLists.txt +++ b/eval/src/vespa/eval/instruction/CMakeLists.txt @@ -43,6 +43,7 @@ vespa_add_library(eval_instruction OBJECT sparse_112_dot_product.cpp sparse_dot_product_function.cpp sparse_full_overlap_join_function.cpp + sparse_join_reduce_plan.cpp sparse_merge_function.cpp sparse_no_overlap_join_function.cpp sparse_singledim_lookup.cpp diff --git a/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.cpp b/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.cpp new file mode 100644 index 00000000000..806e24dd7df --- /dev/null +++ b/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.cpp @@ -0,0 +1,183 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "sparse_join_reduce_plan.h" +#include <vespa/vespalib/util/overload.h> +#include <vespa/vespalib/util/visit_ranges.h> +#include <cassert> + +namespace vespalib::eval::instruction { + +namespace { + +using Dim = ValueType::Dimension; +using Dims = std::vector<ValueType::Dimension>; + +void visit(auto &v, const Dims &a, const Dims &b) { + visit_ranges(v, a.begin(), a.end(), b.begin(), b.end(), + [](const auto &x, const auto &y){ return (x.name < y.name); }); +} + +Dims merge(const Dims &first, const Dims &second) { + Dims result; + auto visitor = overload { + [&result](visit_ranges_either, const Dim &dim) { result.push_back(dim); }, + [&result](visit_ranges_both, const Dim &dim, const Dim &) { result.push_back(dim); } + }; + visit(visitor, first, second); + return result; +} + +size_t count_only_in_second(const Dims &first, const Dims &second) { + size_t result = 0; + auto visitor = overload { + [](visit_ranges_first, const Dim &) {}, + [&result](visit_ranges_second, const Dim &) { ++result; }, + [](visit_ranges_both, const Dim &, const Dim &) {} + }; + visit(visitor, first, second); + return result; +} + +struct SparseJoinReduceState { + SmallVector<string_id,4> addr_space; + SmallVector<string_id*,4> a_addr; + SmallVector<const string_id*,4> overlap; + SmallVector<string_id*,4> b_only; + SmallVector<size_t,4> b_view; + size_t a_subspace; + size_t b_subspace; + uint32_t res_dims; + SparseJoinReduceState(const bool *in_a, const bool *in_b, const bool *in_res, size_t dims) + : addr_space(dims), a_addr(), overlap(), b_only(), b_view(), a_subspace(), b_subspace(), res_dims(0) + { + size_t b_idx = 0; + uint32_t dims_end = addr_space.size(); + for (size_t i = 0; i < dims; ++i) { + string_id *id = in_res[i] ? &addr_space[res_dims++] : &addr_space[--dims_end]; + if (in_a[i]) { + a_addr.push_back(id); + if (in_b[i]) { + overlap.push_back(id); + b_view.push_back(b_idx++); + } + } else if (in_b[i]) { + b_only.push_back(id); + ++b_idx; + } + } + // Kept dimensions are allocated from the start and dropped + // dimensions are allocated from the end. Make sure they + // combine to exactly cover the complete address space. + assert(res_dims == dims_end); + } + ~SparseJoinReduceState(); +}; +SparseJoinReduceState::~SparseJoinReduceState() = default; + +void execute_plan(const Value::Index &a, const Value::Index &b, + const bool *in_a, const bool *in_b, const bool *in_res, + size_t dims, auto &&f) +{ + SparseJoinReduceState state(in_a, in_b, in_res, dims); + auto outer = a.create_view({}); + auto inner = b.create_view(state.b_view); + outer->lookup({}); + while (outer->next_result(state.a_addr, state.a_subspace)) { + inner->lookup(state.overlap); + while (inner->next_result(state.b_only, state.b_subspace)) { + f(state.a_subspace, state.b_subspace, ConstArrayRef<string_id>{state.addr_space.begin(), state.res_dims}); + } + } +} + +using est_fun = SparseJoinReducePlan::est_fun_t; +using est_filter = std::function<bool(bool, bool, bool)>; + +struct Est { + est_filter filter; + est_fun estimate; + bool can_use; + Est(est_filter filter_in, est_fun estimate_in) + : filter(filter_in), estimate(estimate_in), can_use(true) {} + ~Est(); +}; +Est::~Est() = default; + +size_t est_1(size_t, size_t) noexcept { return 1; } +size_t est_a_or_0(size_t a, size_t b) noexcept { return (b == 0) ? 0 : a; } +size_t est_b_or_0(size_t a, size_t b) noexcept { return (a == 0) ? 0 : b; } +size_t est_min(size_t a, size_t b) noexcept { return std::min(a, b); } +size_t est_mul(size_t a, size_t b) noexcept { return (a * b); } + +bool no_dims(bool, bool, bool) noexcept { return false; } +bool reduce_all(bool, bool, bool keep) noexcept { return !keep; } +bool keep_a_reduce_b(bool a, bool b, bool keep) noexcept { + if (keep) { + return (a && !b); + } else { + return (!a && b); + } +} +bool keep_b_reduce_a(bool a, bool b, bool keep) noexcept { return keep_a_reduce_b(b, a, keep); } +bool full_overlap(bool a, bool b, bool) noexcept { return (a == b); } +bool no_overlap_keep_all(bool a, bool b, bool keep) noexcept { return keep && (a != b); } + +std::vector<Est> make_est_list() { + return { + { no_dims, est_1 }, + { reduce_all, est_1 }, + { keep_a_reduce_b, est_a_or_0 }, + { keep_b_reduce_a, est_b_or_0 }, + { full_overlap, est_min }, + { no_overlap_keep_all, est_mul } + }; +} + +void update_est_list(std::vector<Est> &est_list, bool in_lhs, bool in_rhs, bool in_res) { + for (Est &est: est_list) { + if (est.can_use && !est.filter(in_lhs, in_rhs, in_res)) { + est.can_use = false; + } + } +} + +est_fun select_estimate(const std::vector<Est> &est_list) { + for (const Est &est: est_list) { + if (est.can_use) { + return est.estimate; + } + } + return est_min; +} + +} // <unnamed> + +SparseJoinReducePlan::SparseJoinReducePlan(const ValueType &lhs, const ValueType &rhs, const ValueType &res) + : _in_lhs(), _in_rhs(), _in_res(), _estimate() +{ + auto dims = merge(lhs.mapped_dimensions(), rhs.mapped_dimensions()); + assert(count_only_in_second(dims, res.mapped_dimensions()) == 0); + auto est_list = make_est_list(); + for (const auto &dim: dims) { + _in_lhs.push_back(lhs.has_dimension(dim.name)); + _in_rhs.push_back(rhs.has_dimension(dim.name)); + _in_res.push_back(res.has_dimension(dim.name)); + update_est_list(est_list, _in_lhs.back(), _in_rhs.back(), _in_res.back()); + } + _estimate = select_estimate(est_list); + assert(bool(_estimate)); +} + +SparseJoinReducePlan::~SparseJoinReducePlan() = default; + +void +SparseJoinReducePlan::execute(const Value::Index &lhs, const Value::Index &rhs, F f) const { + if (rhs.size() < lhs.size()) { + auto swap = [&](auto a, auto b, auto addr) { f(b, a, addr); }; + execute_plan(rhs, lhs, _in_rhs.data(), _in_lhs.data(), _in_res.data(), _in_res.size(), swap); + } else { + execute_plan(lhs, rhs, _in_lhs.data(), _in_rhs.data(), _in_res.data(), _in_res.size(), f); + } +} + +} // namespace diff --git a/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.h b/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.h new file mode 100644 index 00000000000..8864c56887a --- /dev/null +++ b/eval/src/vespa/eval/instruction/sparse_join_reduce_plan.h @@ -0,0 +1,35 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/eval/eval/value.h> +#include <vespa/vespalib/util/small_vector.h> +#include <functional> + +namespace vespalib::eval::instruction { + +class SparseJoinReducePlan +{ +public: + friend class SparseJoinReducePlanTest; + + using BitList = SmallVector<bool,8>; + using est_fun_t = std::function<size_t(size_t lhs_size, size_t rhs_size)>; + using F = std::function<void(size_t lhs_subspace, size_t rhs_subspace, ConstArrayRef<string_id> res_addr)>; + +private: + BitList _in_lhs; + BitList _in_rhs; + BitList _in_res; + est_fun_t _estimate; + +public: + SparseJoinReducePlan(const ValueType &lhs, const ValueType &rhs, const ValueType &res); + ~SparseJoinReducePlan(); + size_t estimate_result_size(const Value::Index &lhs, const Value::Index &rhs) const { + return _estimate(lhs.size(), rhs.size()); + } + void execute(const Value::Index &lhs, const Value::Index &rhs, F f) const; +}; + +} // namespace |