sparse 112 dot product

author: Håvard Pettersen <havardpe@oath.com> 2021-12-20 15:05:22 +0000
committer: Håvard Pettersen <havardpe@oath.com> 2021-12-21 14:12:39 +0000
commit: bc219a3cb4c01ce449584284aa7ff03afb9e9dca (patch)
tree: c2e2b417f2e5f3ee3148637e5b91a12b105fca90 /eval
parent: 28ae61202ad963955cf92719bab9b9d97181d5dd (diff)
9 files changed, 377 insertions, 0 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt
index 2e0af3acfa7..eed4fa5ce66 100644
--- a/eval/CMakeLists.txt
+++ b/eval/CMakeLists.txt
@@ -75,6 +75,7 @@ vespa_define_module(
     src/tests/instruction/mixed_simple_join_function
     src/tests/instruction/pow_as_map_optimizer
     src/tests/instruction/remove_trivial_dimension_optimizer
+    src/tests/instruction/sparse_112_dot_product
     src/tests/instruction/sparse_dot_product_function
     src/tests/instruction/sparse_full_overlap_join_function
     src/tests/instruction/sparse_merge_function
diff --git a/eval/src/tests/instruction/sparse_112_dot_product/CMakeLists.txt b/eval/src/tests/instruction/sparse_112_dot_product/CMakeLists.txt
new file mode 100644
index 00000000000..af7f59f091b
--- /dev/null
+++ b/eval/src/tests/instruction/sparse_112_dot_product/CMakeLists.txt
@@ -0,0 +1,9 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(eval_sparse_112_dot_product_test_app TEST
+    SOURCES
+    sparse_112_dot_product_test.cpp
+    DEPENDS
+    vespaeval
+    GTest::GTest
+)
+vespa_add_test(NAME eval_sparse_112_dot_product_test_app COMMAND eval_sparse_112_dot_product_test_app)
diff --git a/eval/src/tests/instruction/sparse_112_dot_product/sparse_112_dot_product_test.cpp b/eval/src/tests/instruction/sparse_112_dot_product/sparse_112_dot_product_test.cpp
new file mode 100644
index 00000000000..7dcddc3bf80
--- /dev/null
+++ b/eval/src/tests/instruction/sparse_112_dot_product/sparse_112_dot_product_test.cpp
@@ -0,0 +1,88 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/eval/eval/fast_value.h>
+#include <vespa/eval/eval/simple_value.h>
+#include <vespa/eval/instruction/sparse_112_dot_product.h>
+#include <vespa/eval/eval/test/eval_fixture.h>
+#include <vespa/eval/eval/test/gen_spec.h>
+#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using namespace vespalib::eval;
+using namespace vespalib::eval::test;
+
+using vespalib::make_string_short::fmt;
+
+const ValueBuilderFactory &prod_factory = FastValueBuilderFactory::get();
+
+//-----------------------------------------------------------------------------
+
+struct FunInfo {
+    using LookFor = Sparse112DotProduct;
+    void verify(const LookFor &fun) const {
+        EXPECT_TRUE(fun.result_is_mutable());
+    }
+};
+
+void verify_optimized_cell_types(const vespalib::string &expr)
+{
+    CellTypeSpace types(CellTypeUtils::list_types(), 3);
+    EvalFixture::verify<FunInfo>(expr, {FunInfo()}, CellTypeSpace(types).same());
+    EvalFixture::verify<FunInfo>(expr, {}, CellTypeSpace(types).different());
+}
+
+void verify_optimized(const vespalib::string &expr, size_t num_params = 3)
+{
+    CellTypeSpace just_float({CellType::FLOAT}, num_params);
+    EvalFixture::verify<FunInfo>(expr, {FunInfo()}, just_float);
+}
+
+void verify_not_optimized(const vespalib::string &expr) {
+    CellTypeSpace just_double({CellType::DOUBLE}, 3);
+    EvalFixture::verify<FunInfo>(expr, {}, just_double);
+}
+
+//-----------------------------------------------------------------------------
+
+TEST(Sparse112DotProduct, expression_can_be_optimized)
+{
+    verify_optimized_cell_types("reduce(x5_2*y4_2*x5_1y4_1,sum)");
+}
+
+TEST(Sparse112DotProduct, different_input_placement_is_handeled)
+{
+    std::array<vespalib::string,3> params = {"x3_1", "y3_1", "x3_1y3_1"};
+    for (size_t p1 = 0; p1 < params.size(); ++p1) {
+        for (size_t p2 = 0; p2 < params.size(); ++p2) {
+            for (size_t p3 = 0; p3 < params.size(); ++p3) {
+                if ((p1 != p2) && (p1 != p3) && (p2 != p3)) {
+                    verify_optimized(fmt("reduce((%s*%s)*%s,sum)", params[p1].c_str(), params[p2].c_str(), params[p3].c_str()));
+                    verify_optimized(fmt("reduce(%s*(%s*%s),sum)", params[p1].c_str(), params[p2].c_str(), params[p3].c_str()));
+                }
+            }
+        }
+    }
+}
+
+TEST(Sparse112DotProduct, expression_can_be_optimized_with_extra_tensors)
+{
+    verify_optimized("reduce((x5_2*y4_2)*(x5_1y4_1*x3_1),sum)", 4);
+    verify_optimized("reduce((x5_2*x3_1)*(y4_2*x5_1y4_1),sum)", 4);
+}
+
+TEST(Sparse112DotProduct, similar_expressions_are_not_optimized)
+{
+    verify_not_optimized("reduce(x5_2*y4_2*x5_1y4_1,prod)");
+    verify_not_optimized("reduce(x5_2+y4_2*x5_1y4_1,sum)");
+    verify_not_optimized("reduce(x5_2*y4_2+x5_1y4_1,sum)");
+    verify_not_optimized("reduce(x5_2*z4_2*x5_1y4_1,sum)");
+    verify_not_optimized("reduce(x5_2*y4_2*x5_1z4_1,sum)");
+    verify_not_optimized("reduce(x5_2*x1_1y4_2*x5_1y4_1,sum)");
+    verify_not_optimized("reduce(x5_2*y4_2*x5_1,sum)");
+    verify_not_optimized("reduce(x5*y4*x5y4,sum)");
+    verify_not_optimized("reduce(x5*y4_1*x5y4_1,sum)");
+}
+
+//-----------------------------------------------------------------------------
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/eval/src/vespa/eval/eval/fast_value.hpp b/eval/src/vespa/eval/eval/fast_value.hpp
index 66b44dbbf49..8a07b049a69 100644
--- a/eval/src/vespa/eval/eval/fast_value.hpp
+++ b/eval/src/vespa/eval/eval/fast_value.hpp
@@ -153,6 +153,10 @@ inline bool are_fast(const Value::Index &a, const Value::Index &b) {
     return (is_fast(a) && is_fast(b));
 }
 
+inline bool are_fast(const Value::Index &a, const Value::Index &b, const Value::Index &c) {
+    return (is_fast(a) && is_fast(b) && is_fast(c));
+}
+
 constexpr const FastValueIndex &as_fast(const Value::Index &index) {
     return static_cast<const FastValueIndex &>(index);
 }
diff --git a/eval/src/vespa/eval/eval/interpreted_function.h b/eval/src/vespa/eval/eval/interpreted_function.h
index b5eaf3a8b9c..57d7f79caf4 100644
--- a/eval/src/vespa/eval/eval/interpreted_function.h
+++ b/eval/src/vespa/eval/eval/interpreted_function.h
@@ -50,6 +50,11 @@ public:
             stack.pop_back();
             stack.back() = value;
         }
+        void pop_pop_pop_push(const Value &value) {
+            stack.pop_back();
+            stack.pop_back();
+            stack.back() = value;
+        }
         void pop_n_push(size_t n, const Value &value) {
             stack.resize(stack.size() - (n - 1), value);
             stack.back() = value;
diff --git a/eval/src/vespa/eval/eval/optimize_tensor_function.cpp b/eval/src/vespa/eval/eval/optimize_tensor_function.cpp
index e1520d4deb2..f9d3b1c6f54 100644
--- a/eval/src/vespa/eval/eval/optimize_tensor_function.cpp
+++ b/eval/src/vespa/eval/eval/optimize_tensor_function.cpp
@@ -6,6 +6,7 @@
 
 #include <vespa/eval/instruction/dense_dot_product_function.h>
 #include <vespa/eval/instruction/sparse_dot_product_function.h>
+#include <vespa/eval/instruction/sparse_112_dot_product.h>
 #include <vespa/eval/instruction/sparse_merge_function.h>
 #include <vespa/eval/instruction/sparse_no_overlap_join_function.h>
 #include <vespa/eval/instruction/sparse_full_overlap_join_function.h>
@@ -65,6 +66,7 @@ const TensorFunction &optimize_for_factory(const ValueBuilderFactory &, const Te
                       });
     run_optimize_pass(root, [&stash](const Child &child)
                       {
+                          child.set(Sparse112DotProduct::optimize(child.get(), stash));
                           child.set(BestSimilarityFunction::optimize(child.get(), stash));
                           child.set(L2Distance::optimize(child.get(), stash));                          
                       });
diff --git a/eval/src/vespa/eval/instruction/CMakeLists.txt b/eval/src/vespa/eval/instruction/CMakeLists.txt
index 56184c113d4..b614606199c 100644
--- a/eval/src/vespa/eval/instruction/CMakeLists.txt
+++ b/eval/src/vespa/eval/instruction/CMakeLists.txt
@@ -36,6 +36,7 @@ vespa_add_library(eval_instruction OBJECT
     pow_as_map_optimizer.cpp
     remove_trivial_dimension_optimizer.cpp
     replace_type_function.cpp
+    sparse_112_dot_product.cpp
     sparse_dot_product_function.cpp
     sparse_full_overlap_join_function.cpp
     sparse_merge_function.cpp
diff --git a/eval/src/vespa/eval/instruction/sparse_112_dot_product.cpp b/eval/src/vespa/eval/instruction/sparse_112_dot_product.cpp
new file mode 100644
index 00000000000..080f51e384a
--- /dev/null
+++ b/eval/src/vespa/eval/instruction/sparse_112_dot_product.cpp
@@ -0,0 +1,236 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "sparse_112_dot_product.h"
+#include <vespa/eval/eval/fast_value.hpp>
+#include <vespa/vespalib/util/typify.h>
+#include <vespa/vespalib/util/require.h>
+#include <vespa/eval/eval/visit_stuff.h>
+#include <algorithm>
+
+namespace vespalib::eval {
+
+using namespace tensor_function;
+using namespace operation;
+using namespace instruction;
+
+namespace {
+
+template <typename T, size_t N>
+ConstArrayRef<T> as_car(std::array<T, N> &array) {
+    return {array.data(), array.size()};
+}
+
+template <typename T, size_t N>
+ConstArrayRef<const T *> as_ccar(std::array<T *, N> &array) {
+    return {array.data(), array.size()};
+}
+
+template <typename T>
+ConstArrayRef<T> as_car(T &value) {
+    return {&value, 1};
+}
+
+constexpr std::array<size_t, 2> both_dims = { 0, 1 };
+
+template <typename CT>
+double my_sparse_112_dot_product_fallback(const Value::Index &a_idx, const Value::Index &b_idx, const Value::Index &c_idx,
+                                          const CT *a_cells, const CT *b_cells, const CT *c_cells) __attribute__((noinline));
+template <typename CT>
+double my_sparse_112_dot_product_fallback(const Value::Index &a_idx, const Value::Index &b_idx, const Value::Index &c_idx,
+                                          const CT *a_cells, const CT *b_cells, const CT *c_cells)
+{
+    double result = 0.0;
+    size_t a_space = 0;
+    size_t b_space = 0;
+    size_t c_space = 0;
+    std::array<string_id, 2> c_addr;
+    std::array<string_id*, 2> c_addr_ref = {&c_addr[0], &c_addr[1]};
+    auto outer = a_idx.create_view({});
+    auto inner = b_idx.create_view({});
+    auto model = c_idx.create_view({&both_dims[0], 2});
+    outer->lookup({});
+    while (outer->next_result(as_car(c_addr_ref[0]), a_space)) {
+        inner->lookup({});
+        while (inner->next_result(as_car(c_addr_ref[1]), b_space)) {
+            model->lookup(as_ccar(c_addr_ref));
+            if (model->next_result({}, c_space)) {
+                result += (a_cells[a_space] * b_cells[b_space] * c_cells[c_space]);
+            }
+        }
+    }
+    return result;
+}
+
+template <typename CT>
+double my_fast_sparse_112_dot_product(const FastAddrMap *a_map, const FastAddrMap *b_map, const FastAddrMap *c_map,
+                                      const CT *a_cells, const CT *b_cells, const CT *c_cells)
+{
+    double result = 0.0;
+    std::array<string_id, 2> c_addr;
+    const auto &a_labels = a_map->labels();
+    for (size_t a_space = 0; a_space < a_labels.size(); ++a_space) {
+        if (a_cells[a_space] != 0.0) { // handle pseudo-sparse input
+            c_addr[0] = a_labels[a_space];
+            const auto &b_labels = b_map->labels();
+            for (size_t b_space = 0; b_space < b_labels.size(); ++b_space) {
+                if (b_cells[b_space] != 0.0) { // handle pseudo-sparse input
+                    c_addr[1] = b_labels[b_space];
+                    auto c_space = c_map->lookup(as_car(c_addr));
+                    if (c_space != FastAddrMap::npos()) {
+                        result += (a_cells[a_space] * b_cells[b_space] * c_cells[c_space]);
+                    }
+                }
+            }
+        }
+    }
+    return result;
+}
+
+template <typename CT>
+void my_sparse_112_dot_product_op(InterpretedFunction::State &state, uint64_t) {
+    const auto &a_idx = state.peek(2).index();
+    const auto &b_idx = state.peek(1).index();
+    const auto &c_idx = state.peek(0).index();
+    const CT *a_cells = state.peek(2).cells().unsafe_typify<CT>().cbegin();
+    const CT *b_cells = state.peek(1).cells().unsafe_typify<CT>().cbegin();
+    const CT *c_cells = state.peek(0).cells().unsafe_typify<CT>().cbegin();
+    double result = __builtin_expect(are_fast(a_idx, b_idx, c_idx), true)
+        ? my_fast_sparse_112_dot_product<CT>(&as_fast(a_idx).map, &as_fast(b_idx).map, &as_fast(c_idx).map,
+                                             a_cells, b_cells, c_cells)
+        : my_sparse_112_dot_product_fallback<CT>(a_idx, b_idx, c_idx, a_cells, b_cells, c_cells);
+    state.pop_pop_pop_push(state.stash.create<DoubleValue>(result));
+}
+
+struct MyGetFun {
+    template <typename CT>
+    static auto invoke() { return my_sparse_112_dot_product_op<CT>; }
+};
+
+using MyTypify = TypifyValue<TypifyCellType>;
+
+// Try to collect input nodes and organize them into a dot product
+// between (n sparse non-overlapping single-dimension tensors) and (a
+// sparse n-dimensional tensor) all having the same cell type.
+
+struct InputState {
+    std::vector<const TensorFunction *> single;
+    const TensorFunction *multi = nullptr;
+    bool collision = false;
+
+    void collect(const TensorFunction &node) {
+        const auto &type = node.result_type();
+        if (type.is_sparse()) {
+            if (type.dimensions().size() == 1) {
+                single.push_back(&node);
+            } else {
+                if (multi) {
+                    collision = true;
+                } else {
+                    multi = &node;
+                }
+            }
+        }
+    }
+    void finalize() {
+        std::sort(single.begin(), single.end(), [](const auto *a, const auto *b)
+                  { return (a->result_type().dimensions()[0].name < b->result_type().dimensions()[0].name); });
+    }
+    bool verify(size_t n) const {
+        if (collision || (single.size() != n) || (multi == nullptr) || (multi->result_type().dimensions().size() != n)) {
+            return false;
+        }
+        const auto &multi_type = multi->result_type();
+        for (size_t i = 0; i < n; ++i) {
+            const auto &single_type = single[i]->result_type();
+            if ((single_type.cell_type() != multi_type.cell_type()) ||
+                (single_type.dimensions()[0].name != multi_type.dimensions()[i].name))
+            {
+                return false;
+            }
+        }
+        return true;
+    }
+};
+
+// Try to find inputs that form a 112 dot product.
+
+struct FindInputs {
+    const TensorFunction *a = nullptr;
+    const TensorFunction *b = nullptr;
+    const TensorFunction *c = nullptr;
+
+    bool try_match(const TensorFunction &one, const TensorFunction &two) {
+        auto join = as<Join>(two);
+        if (join && (join->function() == Mul::f)) {
+            InputState state;
+            state.collect(one);
+            state.collect(join->lhs());
+            state.collect(join->rhs());
+            state.finalize();
+            if (state.verify(2)) {
+                a = state.single[0];
+                b = state.single[1];
+                c = state.multi;
+                return true;
+            }
+        }
+        return false;
+    }
+};
+
+} // namespace <unnamed>
+
+Sparse112DotProduct::Sparse112DotProduct(const TensorFunction &a_in,
+                                         const TensorFunction &b_in,
+                                         const TensorFunction &c_in)
+  : tensor_function::Node(DoubleValue::shared_type()),
+    _a(a_in),
+    _b(b_in),
+    _c(c_in)
+{
+}
+
+InterpretedFunction::Instruction
+Sparse112DotProduct::compile_self(const ValueBuilderFactory &, Stash &) const
+{
+    REQUIRE_EQ(_a.get().result_type().cell_type(), _b.get().result_type().cell_type());
+    REQUIRE_EQ(_a.get().result_type().cell_type(), _c.get().result_type().cell_type());
+    auto op = typify_invoke<1,MyTypify,MyGetFun>(_a.get().result_type().cell_type());
+    return InterpretedFunction::Instruction(op);
+}
+
+void
+Sparse112DotProduct::push_children(std::vector<Child::CREF> &children) const
+{
+    children.emplace_back(_a);
+    children.emplace_back(_b);
+    children.emplace_back(_c);
+}
+
+void
+Sparse112DotProduct::visit_children(vespalib::ObjectVisitor &visitor) const
+{
+    ::visit(visitor, "a", _a.get());
+    ::visit(visitor, "b", _b.get());
+    ::visit(visitor, "c", _c.get());
+}
+
+const TensorFunction &
+Sparse112DotProduct::optimize(const TensorFunction &expr, Stash &stash)
+{
+    auto reduce = as<Reduce>(expr);
+    if (reduce && (reduce->aggr() == Aggr::SUM) && expr.result_type().is_double()) {
+        auto join = as<Join>(reduce->child());
+        if (join && (join->function() == Mul::f)) {
+            FindInputs inputs;
+            if (inputs.try_match(join->lhs(), join->rhs()) ||
+                inputs.try_match(join->rhs(), join->lhs()))
+            {
+                return stash.create<Sparse112DotProduct>(*inputs.a, *inputs.b, *inputs.c);
+            }
+        }
+    }
+    return expr;
+}
+
+} // namespace
diff --git a/eval/src/vespa/eval/instruction/sparse_112_dot_product.h b/eval/src/vespa/eval/instruction/sparse_112_dot_product.h
new file mode 100644
index 00000000000..2344a5eee2d
--- /dev/null
+++ b/eval/src/vespa/eval/instruction/sparse_112_dot_product.h
@@ -0,0 +1,31 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/eval/eval/tensor_function.h>
+
+namespace vespalib::eval {
+
+/**
+ * Tensor function for the dot product between the expansion of two 1d
+ * sparse tensors and a 2d sparse tensor.
+ */
+class Sparse112DotProduct : public tensor_function::Node
+{
+private:
+    Child _a;
+    Child _b;
+    Child _c;
+
+public:
+    Sparse112DotProduct(const TensorFunction &a_in,
+                        const TensorFunction &b_in,
+                        const TensorFunction &c_in);
+    InterpretedFunction::Instruction compile_self(const ValueBuilderFactory &factory, Stash &stash) const override;
+    bool result_is_mutable() const override { return true; }
+    void push_children(std::vector<Child::CREF> &children) const final override;
+    void visit_children(vespalib::ObjectVisitor &visitor) const final override;
+    static const TensorFunction &optimize(const TensorFunction &expr, Stash &stash);
+};
+
+} // namespace
author	Håvard Pettersen <havardpe@oath.com>	2021-12-20 15:05:22 +0000
committer	Håvard Pettersen <havardpe@oath.com>	2021-12-21 14:12:39 +0000
commit	bc219a3cb4c01ce449584284aa7ff03afb9e9dca (patch)
tree	c2e2b417f2e5f3ee3148637e5b91a12b105fca90 /eval
parent	28ae61202ad963955cf92719bab9b9d97181d5dd (diff)