author     Arne H Juul <arnej27959@users.noreply.github.com>    2021-02-03 11:53:20 +0100
committer  GitHub <noreply@github.com>                          2021-02-03 11:53:20 +0100
commit     5bc79680f181742971c35eda70a136c81b3affba (patch)
tree       ed1b0ad2ed5ee4c92ed0f926f7a5703a6b99e70a /eval
parent     986dc62765dc2264ab07c2f4744d86d48e707193 (diff)
parent     cfe138a74ec1db72993ac4b1a6bca7ca63bbc781 (diff)
Merge pull request #16344 from vespa-engine/havardpe/faster-single-dim-matching
added some optimizations for single-dimension sparse matching
Diffstat (limited to 'eval')
4 files changed, 109 insertions, 64 deletions
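
Before the per-file hunks, the core idea: when a sparse tensor has exactly one mapped dimension, each address is a single label, so the dot product can iterate the smaller operand and probe the bigger one directly by label id, skipping generic address hashing. A minimal standalone sketch of that idea (function and parameter names are illustrative, not the Vespa code below):

    // Hedged sketch of the single-dimension fast path this commit adds;
    // all names here (single_dim_sparse_dot, big_index) are illustrative,
    // not part of the Vespa codebase.
    #include <cstdint>
    #include <unordered_map>
    #include <vector>

    double single_dim_sparse_dot(const std::vector<uint32_t> &small_labels,
                                 const std::vector<double> &small_cells,
                                 const std::unordered_map<uint32_t, size_t> &big_index,
                                 const std::vector<double> &big_cells)
    {
        double result = 0.0;
        // iterate the smaller operand; probe the bigger operand per label
        for (size_t i = 0; i < small_labels.size(); ++i) {
            auto pos = big_index.find(small_labels[i]);
            if (pos != big_index.end()) {
                result += small_cells[i] * big_cells[pos->second];
            }
        }
        return result;
    }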
diff --git a/eval/src/tests/instruction/sparse_dot_product_function/sparse_dot_product_function_test.cpp b/eval/src/tests/instruction/sparse_dot_product_function/sparse_dot_product_function_test.cpp
index 65eab2778aa..ac333f5224a 100644
--- a/eval/src/tests/instruction/sparse_dot_product_function/sparse_dot_product_function_test.cpp
+++ b/eval/src/tests/instruction/sparse_dot_product_function/sparse_dot_product_function_test.cpp
@@ -17,10 +17,8 @@ const ValueBuilderFactory &test_factory = SimpleValueBuilderFactory::get();
 
 EvalFixture::ParamRepo make_params() {
     return EvalFixture::ParamRepo()
-        .add("v1_x", GenSpec().map("x", 32, 1).seq_bias(3.0).gen())
-        .add("v1_x_f", GenSpec().map("x", 32, 1).seq_bias(3.0).cells_float().gen())
-        .add("v2_x", GenSpec().map("x", 16, 2).seq_bias(7.0).gen())
-        .add("v2_x_f", GenSpec().map("x", 16, 2).seq_bias(7.0).cells_float().gen())
+        .add_variants("v1_x", GenSpec().map("x", 32, 1).seq_bias(3.0))
+        .add_variants("v2_x", GenSpec().map("x", 16, 2).seq_bias(7.0))
         .add("v3_y", GenSpec().map("y", 10, 1).gen())
         .add("v4_xd", GenSpec().idx("x", 10).gen())
         .add("m1_xy", GenSpec().map("x", 32, 1).map("y", 16, 2).seq_bias(3.0).gen())
@@ -53,8 +51,6 @@ TEST(SparseDotProduct, expression_can_be_optimized)
 {
     assert_optimized("reduce(v1_x*v2_x,sum,x)");
     assert_optimized("reduce(v2_x*v1_x,sum)");
-    assert_optimized("reduce(v1_x*v2_x_f,sum)");
-    assert_optimized("reduce(v1_x_f*v2_x,sum)");
     assert_optimized("reduce(v1_x_f*v2_x_f,sum)");
 }
 
@@ -80,6 +76,12 @@ TEST(SparseDotProduct, similar_expressions_are_not_optimized)
     assert_not_optimized("reduce(m3_xym*m3_xym,sum)");
 }
 
+TEST(SparseDotProduct, mixed_cell_types_are_not_optimized)
+{
+    assert_not_optimized("reduce(v1_x*v2_x_f,sum)");
+    assert_not_optimized("reduce(v1_x_f*v2_x,sum)");
+}
+
 //-----------------------------------------------------------------------------
 
 GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/eval/src/vespa/eval/eval/fast_addr_map.h b/eval/src/vespa/eval/eval/fast_addr_map.h
index d8f68d2a37c..7df6bd7fafc 100644
--- a/eval/src/vespa/eval/eval/fast_addr_map.h
+++ b/eval/src/vespa/eval/eval/fast_addr_map.h
@@ -20,9 +20,13 @@ namespace vespalib::eval {
 class FastAddrMap {
 public:
-    // label hasing functions
-    static constexpr uint32_t hash_label(string_id label) { return label.hash(); }
-    static constexpr uint32_t hash_label(const string_id *label) { return label->hash(); }
+    // label extracting functions
+    static constexpr string_id self(string_id label) { return label; }
+    static constexpr string_id self(const string_id *label) { return *label; }
+
+    // label hashing functions
+    static constexpr uint32_t hash_label(string_id label) { return label.value(); }
+    static constexpr uint32_t hash_label(const string_id *label) { return label->value(); }
     static constexpr uint32_t combine_label_hash(uint32_t full_hash, uint32_t next_hash) {
         return ((full_hash * 31) + next_hash);
     }
@@ -71,27 +75,27 @@ public:
         template <typename T>
         constexpr uint32_t operator()(const AltKey<T> &key) const { return key.hash; }
         constexpr uint32_t operator()(const Entry &entry) const { return entry.hash; }
+        constexpr uint32_t operator()(string_id label) const { return label.value(); }
     };
 
     // equality functor for sparse hash set
     struct Equal {
         const LabelView &label_view;
         Equal(const LabelView &label_view_in) : label_view(label_view_in) {}
-        static constexpr bool eq_labels(string_id a, string_id b) { return (a == b); }
-        static constexpr bool eq_labels(string_id a, const string_id *b) { return (a == *b); }
         template <typename T>
         bool operator()(const Entry &a, const AltKey<T> &b) const {
-            if ((a.hash != b.hash) || (b.key.size() != label_view.addr_size)) {
+            if (a.hash != b.hash) {
                 return false;
             }
             auto a_key = label_view.get_addr(a.tag.idx);
             for (size_t i = 0; i < a_key.size(); ++i) {
-                if (!eq_labels(a_key[i], b.key[i])) {
+                if (a_key[i] != self(b.key[i])) {
                     return false;
                 }
             }
             return true;
         }
+        bool operator()(const Entry &a, string_id b) const { return (a.hash == b.value()); }
     };
 
     using HashType = hashtable<Entry, Entry, Hash, Equal, Identity, hashtable_base::and_modulator>;
 
@@ -101,8 +105,8 @@ private:
     HashType _map;
 
 public:
-    FastAddrMap(size_t num_mapped_dims, const std::vector<string_id> &labels, size_t expected_subspaces)
-        : _labels(num_mapped_dims, labels),
+    FastAddrMap(size_t num_mapped_dims, const std::vector<string_id> &labels_in, size_t expected_subspaces)
+        : _labels(num_mapped_dims, labels_in),
           _map(expected_subspaces * 2, Hash(), Equal(_labels)) {}
     ~FastAddrMap();
     FastAddrMap(const FastAddrMap &) = delete;
@@ -113,15 +117,24 @@ public:
     ConstArrayRef<string_id> get_addr(size_t idx) const { return _labels.get_addr(idx); }
     size_t size() const { return _map.size(); }
     constexpr size_t addr_size() const { return _labels.addr_size; }
+    const std::vector<string_id> &labels() const { return _labels.labels; }
     template <typename T>
     size_t lookup(ConstArrayRef<T> addr, uint32_t hash) const {
+        // assert(addr_size() == addr.size());
         AltKey<T> key{addr, hash};
         auto pos = _map.find(key);
         return (pos == _map.end()) ? npos() : pos->tag.idx;
     }
+    size_t lookup_singledim(string_id addr) const {
+        // assert(addr_size() == 1);
+        auto pos = _map.find(addr);
+        return (pos == _map.end()) ? npos() : pos->tag.idx;
+    }
     template <typename T>
     size_t lookup(ConstArrayRef<T> addr) const {
-        return lookup(addr, hash_labels(addr));
+        return (addr.size() == 1)
+            ? lookup_singledim(self(addr[0]))
+            : lookup(addr, hash_labels(addr));
     }
     void add_mapping(uint32_t hash) {
         uint32_t idx = _map.size();
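
The fast_addr_map.h hunks above let the hash table be probed with a bare string_id: hash_label() now returns the label's id value directly, and the extra Hash/Equal overloads accept a single label without building a full address key. The same pattern, sketched with C++20 transparent lookup on a standard container (Entry, EntryHash, EntryEq are hypothetical stand-ins, not vespalib's hashtable):

    // Hedged sketch of heterogeneous lookup: the set stores full entries,
    // but find() can be called with a bare label id (C++20 transparent
    // hashing). All type names here are hypothetical.
    #include <cstdint>
    #include <unordered_set>

    struct Entry {
        uint32_t label;    // single-dimension address == one label id
        size_t   subspace; // dense index of the cells for this label
    };

    struct EntryHash {
        using is_transparent = void;
        size_t operator()(const Entry &e) const { return e.label; }
        size_t operator()(uint32_t label) const { return label; }
    };

    struct EntryEq {
        using is_transparent = void;
        bool operator()(const Entry &a, const Entry &b) const { return a.label == b.label; }
        bool operator()(const Entry &a, uint32_t b) const { return a.label == b; }
        bool operator()(uint32_t a, const Entry &b) const { return a == b.label; }
    };

    using SingleDimIndex = std::unordered_set<Entry, EntryHash, EntryEq>;
    // usage: auto pos = index.find(label_id); // no temporary Entry is built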
diff --git a/eval/src/vespa/eval/eval/fast_value.hpp b/eval/src/vespa/eval/eval/fast_value.hpp
index 5657c93f53f..88319df7590 100644
--- a/eval/src/vespa/eval/eval/fast_value.hpp
+++ b/eval/src/vespa/eval/eval/fast_value.hpp
@@ -6,6 +6,7 @@
 #include <vespa/eval/instruction/generic_join.h>
 #include <vespa/vespalib/stllike/hashtable.hpp>
 #include <vespa/vespalib/util/shared_string_repo.h>
+#include <typeindex>
 
 namespace vespalib::eval {
 
@@ -135,9 +136,9 @@ struct FastIterateView : public Value::Index::View {
 //-----------------------------------------------------------------------------
 
 using JoinAddrSource = instruction::SparseJoinPlan::Source;
+
 // This is the class instructions will look for when optimizing sparse
 // operations by calling inline functions directly.
-
 struct FastValueIndex final : Value::Index {
     FastAddrMap map;
     FastValueIndex(size_t num_mapped_dims_in, const std::vector<string_id> &labels, size_t expected_subspaces_in)
@@ -164,6 +165,18 @@ struct FastValueIndex final : Value::Index {
     std::unique_ptr<View> create_view(const std::vector<size_t> &dims) const override;
 };
 
+inline bool is_fast(const Value::Index &index) {
+    return (std::type_index(typeid(index)) == std::type_index(typeid(FastValueIndex)));
+}
+
+inline bool are_fast(const Value::Index &a, const Value::Index &b) {
+    return (is_fast(a) && is_fast(b));
+}
+
+constexpr const FastValueIndex &as_fast(const Value::Index &index) {
+    return static_cast<const FastValueIndex &>(index);
+}
+
 //-----------------------------------------------------------------------------
 
 template <typename T>
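
The new is_fast() helper uses an exact-type check rather than dynamic_cast: since FastValueIndex is final, comparing typeid accepts exactly that type and nothing else. A minimal illustration of the pattern (Index/FastIndex are hypothetical stand-ins, not Vespa's types):

    // Hedged sketch of the exact-type test behind is_fast(); types here
    // are illustrative only.
    #include <typeindex>
    #include <typeinfo>

    struct Index { virtual ~Index() = default; };
    struct FastIndex final : Index {};
    struct OtherIndex final : Index {};

    bool is_exactly_fast(const Index &idx) {
        // typeid on a polymorphic reference inspects the dynamic type
        return std::type_index(typeid(idx)) == std::type_index(typeid(FastIndex));
    }
    // is_exactly_fast(FastIndex{})  -> true
    // is_exactly_fast(OtherIndex{}) -> false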
diff --git a/eval/src/vespa/eval/instruction/sparse_dot_product_function.cpp b/eval/src/vespa/eval/instruction/sparse_dot_product_function.cpp
index 93ae2856372..7cc4417bdbb 100644
--- a/eval/src/vespa/eval/instruction/sparse_dot_product_function.cpp
+++ b/eval/src/vespa/eval/instruction/sparse_dot_product_function.cpp
@@ -2,8 +2,8 @@
 
 #include "sparse_dot_product_function.h"
 #include "generic_join.h"
-#include "detect_type.h"
 #include <vespa/eval/eval/fast_value.hpp>
+#include <vespa/vespalib/util/typify.h>
 
 namespace vespalib::eval {
 
@@ -13,62 +13,76 @@ using namespace instruction;
 
 namespace {
 
-template <typename SCT, typename BCT>
-double my_fast_sparse_dot_product(const FastValueIndex &small_idx, const FastValueIndex &big_idx,
-                                  const SCT *small_cells, const BCT *big_cells)
+template <typename CT>
+double my_sparse_dot_product_fallback(const Value::Index &lhs_idx, const Value::Index &rhs_idx,
+                                      const CT *lhs_cells, const CT *rhs_cells, size_t num_mapped_dims) __attribute__((noinline));
+template <typename CT>
+double my_sparse_dot_product_fallback(const Value::Index &lhs_idx, const Value::Index &rhs_idx,
+                                      const CT *lhs_cells, const CT *rhs_cells, size_t num_mapped_dims)
 {
     double result = 0.0;
-    small_idx.map.each_map_entry([&](auto small_subspace, auto hash) {
-        auto small_addr = small_idx.map.get_addr(small_subspace);
-        auto big_subspace = big_idx.map.lookup(small_addr, hash);
-        if (big_subspace != FastAddrMap::npos()) {
-            result += (small_cells[small_subspace] * big_cells[big_subspace]);
-        }
-    });
+    SparseJoinPlan plan(num_mapped_dims);
+    SparseJoinState sparse(plan, lhs_idx, rhs_idx);
+    auto outer = sparse.first_index.create_view({});
+    auto inner = sparse.second_index.create_view(sparse.second_view_dims);
+    outer->lookup({});
+    while (outer->next_result(sparse.first_address, sparse.first_subspace)) {
+        inner->lookup(sparse.address_overlap);
+        if (inner->next_result(sparse.second_only_address, sparse.second_subspace)) {
+            result += (lhs_cells[sparse.lhs_subspace] * rhs_cells[sparse.rhs_subspace]);
+        }
+    }
     return result;
 }
 
-template <typename LCT, typename RCT>
-void my_sparse_dot_product_op(InterpretedFunction::State &state, uint64_t num_mapped_dims) {
-    const auto &lhs_idx = state.peek(1).index();
-    const auto &rhs_idx = state.peek(0).index();
-    const LCT *lhs_cells = state.peek(1).cells().typify<LCT>().cbegin();
-    const RCT *rhs_cells = state.peek(0).cells().typify<RCT>().cbegin();
-    if (auto indexes = detect_type<FastValueIndex>(lhs_idx, rhs_idx)) {
-#if __has_cpp_attribute(likely)
-        [[likely]];
-#endif
-        const auto &lhs_fast = indexes.get<0>();
-        const auto &rhs_fast = indexes.get<1>();
-        double result = (rhs_fast.map.size() < lhs_fast.map.size())
-            ? my_fast_sparse_dot_product(rhs_fast, lhs_fast, rhs_cells, lhs_cells)
-            : my_fast_sparse_dot_product(lhs_fast, rhs_fast, lhs_cells, rhs_cells);
-        state.pop_pop_push(state.stash.create<ScalarValue<double>>(result));
-    } else {
-#if __has_cpp_attribute(unlikely)
-        [[unlikely]];
-#endif
-        double result = 0.0;
-        SparseJoinPlan plan(num_mapped_dims);
-        SparseJoinState sparse(plan, lhs_idx, rhs_idx);
-        auto outer = sparse.first_index.create_view({});
-        auto inner = sparse.second_index.create_view(sparse.second_view_dims);
-        outer->lookup({});
-        while (outer->next_result(sparse.first_address, sparse.first_subspace)) {
-            inner->lookup(sparse.address_overlap);
-            if (inner->next_result(sparse.second_only_address, sparse.second_subspace)) {
-                result += (lhs_cells[sparse.lhs_subspace] * rhs_cells[sparse.rhs_subspace]);
+template <typename CT, bool single_dim>
+double my_fast_sparse_dot_product(const FastAddrMap *small_map, const FastAddrMap *big_map,
+                                  const CT *small_cells, const CT *big_cells)
+{
+    double result = 0.0;
+    if (big_map->size() < small_map->size()) {
+        std::swap(small_map, big_map);
+        std::swap(small_cells, big_cells);
+    }
+    if constexpr (single_dim) {
+        const auto &labels = small_map->labels();
+        for (size_t i = 0; i < labels.size(); ++i) {
+            auto big_subspace = big_map->lookup_singledim(labels[i]);
+            if (big_subspace != FastAddrMap::npos()) {
+                result += (small_cells[i] * big_cells[big_subspace]);
             }
         }
-        state.pop_pop_push(state.stash.create<ScalarValue<double>>(result));
+    } else {
+        small_map->each_map_entry([&](auto small_subspace, auto hash) {
+            auto small_addr = small_map->get_addr(small_subspace);
+            auto big_subspace = big_map->lookup(small_addr, hash);
+            if (big_subspace != FastAddrMap::npos()) {
+                result += (small_cells[small_subspace] * big_cells[big_subspace]);
+            }
+        });
     }
+    return result;
+}
+
+template <typename CT, bool single_dim>
+void my_sparse_dot_product_op(InterpretedFunction::State &state, uint64_t num_mapped_dims) {
+    const auto &lhs_idx = state.peek(1).index();
+    const auto &rhs_idx = state.peek(0).index();
+    const CT *lhs_cells = state.peek(1).cells().typify<CT>().cbegin();
+    const CT *rhs_cells = state.peek(0).cells().typify<CT>().cbegin();
+    double result = __builtin_expect(are_fast(lhs_idx, rhs_idx), true)
+        ? my_fast_sparse_dot_product<CT,single_dim>(&as_fast(lhs_idx).map, &as_fast(rhs_idx).map, lhs_cells, rhs_cells)
+        : my_sparse_dot_product_fallback<CT>(lhs_idx, rhs_idx, lhs_cells, rhs_cells, num_mapped_dims);
+    state.pop_pop_push(state.stash.create<ScalarValue<double>>(result));
 }
 
 struct MyGetFun {
-    template <typename LCT, typename RCT>
-    static auto invoke() { return my_sparse_dot_product_op<LCT,RCT>; }
+    template <typename CT, typename SINGLE_DIM>
+    static auto invoke() { return my_sparse_dot_product_op<CT,SINGLE_DIM::value>; }
 };
 
+using MyTypify = TypifyValue<TypifyCellType,TypifyBool>;
+
 } // namespace <unnamed>
 
 SparseDotProductFunction::SparseDotProductFunction(const TensorFunction &lhs_in,
@@ -80,15 +94,18 @@ SparseDotProductFunction::SparseDotProductFunction(const TensorFunction &lhs_in,
 InterpretedFunction::Instruction
 SparseDotProductFunction::compile_self(const ValueBuilderFactory &, Stash &) const
 {
-    auto op = typify_invoke<2,TypifyCellType,MyGetFun>(lhs().result_type().cell_type(), rhs().result_type().cell_type());
-    return InterpretedFunction::Instruction(op, lhs().result_type().count_mapped_dimensions());
+    size_t num_dims = lhs().result_type().count_mapped_dimensions();
+    auto op = typify_invoke<2,MyTypify,MyGetFun>(lhs().result_type().cell_type(),
+                                                 (num_dims == 1));
+    return InterpretedFunction::Instruction(op, num_dims);
 }
 
 bool
 SparseDotProductFunction::compatible_types(const ValueType &res, const ValueType &lhs, const ValueType &rhs)
 {
     return (res.is_scalar() && (res.cell_type() == CellType::DOUBLE) &&
-            lhs.is_sparse() && (rhs.dimensions() == lhs.dimensions()));
+            lhs.is_sparse() && (rhs.dimensions() == lhs.dimensions()) &&
+            lhs.cell_type() == rhs.cell_type());
 }
 
 const TensorFunction &
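
Finally, compile_self() above folds two runtime facts, the cell type and whether there is exactly one mapped dimension, into the choice of a pre-compiled template instantiation. A minimal stand-in for that typify-style dispatch (select/op are hypothetical names, not Vespa's actual TypifyValue machinery):

    // Hedged sketch of value-to-type dispatch: runtime flags pick one of
    // four template instantiations compiled ahead of time. Names are
    // illustrative only.
    #include <cstdio>

    template <typename CT, bool single_dim>
    void op() { std::printf("%zu-byte cells, single_dim=%d\n", sizeof(CT), int(single_dim)); }

    using op_fn = void (*)();

    op_fn select(bool is_float, bool single_dim) {
        if (is_float) {
            return single_dim ? op<float, true> : op<float, false>;
        }
        return single_dim ? op<double, true> : op<double, false>;
    }

    int main() {
        select(true, true)();   // dispatches to op<float, true>
        select(false, false)(); // dispatches to op<double, false>
    }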