author    | Håvard Pettersen <havardpe@oath.com> | 2021-06-29 10:30:38 +0000
committer | Håvard Pettersen <havardpe@oath.com> | 2021-06-29 10:55:10 +0000
commit    | 77b57d25819751b082cfcf746537d254b08ccdfa (patch)
tree      | 1a5de60e5dd354a1cc06319ae45618126f94261c /eval/src
parent    | e928ee61e47fe9c1cd15585df6dc553eaffa4370 (diff)
optimize additional variants
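This commit extends the existing unpack_bits optimization in two directions: the 'little' bitorder expression `bit(packed{x:(x/8)},x%8)` is now recognized in addition to the 'big' bitorder expression `bit(packed{x:(x/8)},7-x%8)`, and the result tensor may now have any cell type, not just int8. A minimal standalone sketch (plain C++, not Vespa code; the byte value is made up) of how the two bit orders differ:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    uint8_t byte = 0xB1; // 0b10110001, one packed int8 cell
    printf("big:    ");
    for (int n = 7; n >= 0; --n) {   // bit index 7-x%8 -> MSB first
        printf("%d", (byte >> n) & 1);
    }
    printf("\nlittle: ");
    for (int n = 0; n <= 7; ++n) {   // bit index x%8 -> LSB first
        printf("%d", (byte >> n) & 1);
    }
    printf("\n"); // prints: big:    10110001
                  //         little: 10001101
    return 0;
}
```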
Diffstat (limited to 'eval/src')
 eval/src/tests/instruction/unpack_bits_function/unpack_bits_function_test.cpp
 eval/src/vespa/eval/instruction/unpack_bits_function.cpp
 eval/src/vespa/eval/instruction/unpack_bits_function.h
3 files changed, 97 insertions, 56 deletions
diff --git a/eval/src/tests/instruction/unpack_bits_function/unpack_bits_function_test.cpp b/eval/src/tests/instruction/unpack_bits_function/unpack_bits_function_test.cpp
index 8250893225a..c0d7cdc43e7 100644
--- a/eval/src/tests/instruction/unpack_bits_function/unpack_bits_function_test.cpp
+++ b/eval/src/tests/instruction/unpack_bits_function/unpack_bits_function_test.cpp
@@ -46,40 +46,49 @@ void assert_not_optimized(const vespalib::string &expr) {
 
 //-----------------------------------------------------------------------------
 
-TEST(UnpackBitsTest, expression_can_be_optimized) {
+TEST(UnpackBitsTest, expression_can_be_optimized_with_big_bitorder) {
     assert_optimized("tensor<int8>(x[2048])(bit(full{x:(x/8)},7-x%8))");
     assert_optimized("tensor<int8>(x[64])(bit(vx8{x:(x/8)},7-x%8))");
 }
 
+TEST(UnpackBitsTest, expression_can_be_optimized_with_small_bitorder) {
+    assert_optimized("tensor<int8>(x[2048])(bit(full{x:(x/8)},x%8))");
+    assert_optimized("tensor<int8>(x[64])(bit(vx8{x:(x/8)},x%8))");
+}
+
 TEST(UnpackBitsTest, unpack_bits_can_rename_dimension) {
     assert_optimized("tensor<int8>(x[64])(bit(vy8{y:(x/8)},7-x%8))");
+    assert_optimized("tensor<int8>(x[64])(bit(vy8{y:(x/8)},x%8))");
 }
 
-//-----------------------------------------------------------------------------
+TEST(UnpackBitsTest, result_may_have_other_cell_types_than_int8) {
+    assert_optimized("tensor<bfloat16>(x[64])(bit(vx8{x:(x/8)},7-x%8))");
+    assert_optimized("tensor<float>(x[64])(bit(vx8{x:(x/8)},7-x%8))");
+    assert_optimized("tensor<double>(x[64])(bit(vx8{x:(x/8)},7-x%8))");
 
-TEST(UnpackBitsTest, dimension_sizes_must_be_appropriate) {
-    assert_not_optimized("tensor<int8>(x[60])(bit(vx8{x:(x/8)},7-x%8))");
-    assert_not_optimized("tensor<int8>(x[68])(bit(vx8{x:(x/8)},7-x%8))");
+    assert_optimized("tensor<bfloat16>(x[64])(bit(vx8{x:(x/8)},x%8))");
+    assert_optimized("tensor<float>(x[64])(bit(vx8{x:(x/8)},x%8))");
+    assert_optimized("tensor<double>(x[64])(bit(vx8{x:(x/8)},x%8))");
 }
 
+//-----------------------------------------------------------------------------
+
 TEST(UnpackBitsTest, source_must_be_int8) {
     assert_not_optimized("tensor<int8>(x[64])(bit(vxf{x:(x/8)},7-x%8))");
 }
 
-TEST(UnpackBitsTest, result_must_be_int8) {
-    assert_not_optimized("tensor<float>(x[64])(bit(vx8{x:(x/8)},7-x%8))");
+TEST(UnpackBitsTest, dimension_sizes_must_be_appropriate) {
+    assert_not_optimized("tensor<int8>(x[60])(bit(vx8{x:(x/8)},7-x%8))");
+    assert_not_optimized("tensor<int8>(x[68])(bit(vx8{x:(x/8)},7-x%8))");
 }
 
 TEST(UnpackBitsTest, similar_expressions_are_not_optimized) {
-    assert_not_optimized("tensor<int8>(x[64])(bit(vx8{x:(x/8)},7-x%7))");
-    assert_not_optimized("tensor<int8>(x[64])(bit(vx8{x:(x/8)},7-x%9))");
-    assert_not_optimized("tensor<int8>(x[64])(bit(vx8{x:(x/7)},7-x%8))");
+    assert_not_optimized("tensor<int8>(x[64])(bit(vx8{x:(x*8)},7-x%8))");
     assert_not_optimized("tensor<int8>(x[64])(bit(vx8{x:(x/9)},7-x%8))");
-    assert_not_optimized("tensor<int8>(x[64])(bit(vx8{x:(x/8)},x%8-7))");
-    assert_not_optimized("tensor<int8>(x[64])(bit(vx8{x:(8/x)},7-x%8))");
+    assert_not_optimized("tensor<int8>(x[64])(bit(vx8{x:(x/8)},8-x%8))");
     assert_not_optimized("tensor<int8>(x[64])(bit(vx8{x:(x/8)},7+x%8))");
-    assert_not_optimized("tensor<int8>(x[64])(bit(vx8{x:(x*8)},7-x%8))");
-    assert_not_optimized("tensor<int8>(x[64])(bit(vx8{x:(x/8)},(7-x)%8))");
+    assert_not_optimized("tensor<int8>(x[64])(bit(vx8{x:(x/8)},7-x/8))");
+    assert_not_optimized("tensor<int8>(x[64])(bit(vx8{x:(x/8)},7-x%9))");
 }
 
 //-----------------------------------------------------------------------------
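For reference, a hedged sketch of the semantics the tests above pin down: output cell x is taken from input byte x/8, selecting bit 7-x%8 in 'big' order or bit x%8 in 'little' order. The helper `unpack_bits_ref` is hypothetical, not part of the patch or the test suite:

```cpp
#include <cstdint>
#include <vector>

// Reference unpack: one output cell (0 or 1) per input bit.
std::vector<int8_t> unpack_bits_ref(const std::vector<int8_t> &packed, bool big) {
    std::vector<int8_t> out(packed.size() * 8);
    for (size_t x = 0; x < out.size(); ++x) {
        int bit = big ? 7 - int(x % 8) : int(x % 8);   // '7-x%8' vs 'x%8'
        out[x] = int8_t((uint8_t(packed[x / 8]) >> bit) & 1);
    }
    return out;
}
```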
diff --git a/eval/src/vespa/eval/instruction/unpack_bits_function.cpp b/eval/src/vespa/eval/instruction/unpack_bits_function.cpp
index d77ead79a37..330982aa4b7 100644
--- a/eval/src/vespa/eval/instruction/unpack_bits_function.cpp
+++ b/eval/src/vespa/eval/instruction/unpack_bits_function.cpp
@@ -20,49 +20,74 @@ using tensor_function::inject;
 
 namespace {
 
+//-----------------------------------------------------------------------------
+
+template <typename OCT, bool big>
 void my_unpack_bits_op(InterpretedFunction::State &state, uint64_t param) {
     const ValueType &res_type = unwrap_param<ValueType>(param);
     auto packed_cells = state.peek(0).cells().typify<Int8Float>();
-    auto unpacked_cells = state.stash.create_uninitialized_array<Int8Float>(packed_cells.size() * 8);
-    int8_t *dst = reinterpret_cast<int8_t*>(unpacked_cells.begin());
+    auto unpacked_cells = state.stash.create_uninitialized_array<OCT>(packed_cells.size() * 8);
+    OCT *dst = unpacked_cells.begin();
     for (Int8Float cell: packed_cells) {
-        for (int n = 7; n >= 0; --n) {
-            *dst++ = bool(cell.get_bits() & (1 << n));
+        if constexpr (big) {
+            for (int n = 7; n >= 0; --n) {
+                *dst++ = (OCT) bool(cell.get_bits() & (1 << n));
+            }
+        } else {
+            for (int n = 0; n <= 7; ++n) {
+                *dst++ = (OCT) bool(cell.get_bits() & (1 << n));
+            }
         }
     }
     Value &result_ref = state.stash.create<DenseValueView>(res_type, TypedCells(unpacked_cells));
     state.pop_push(result_ref);
 }
 
+//-----------------------------------------------------------------------------
+
+struct MyGetFun {
+    template <typename OCT, typename BIG> static auto invoke() {
+        return my_unpack_bits_op<OCT, BIG::value>;
+    }
+};
+
+using MyTypify = TypifyValue<TypifyCellType,TypifyBool>;
+
+//-----------------------------------------------------------------------------
+
 bool valid_lambda_params(const Lambda &lambda) {
     return ((lambda.lambda().num_params() == 2) &&
             (lambda.bindings().size() == 1));
 }
 
-bool valid_type(const ValueType &type) {
+bool valid_type(const ValueType &type, bool must_be_int8) {
     return ((type.is_dense()) &&
             (type.dimensions().size() == 1) &&
-            (type.cell_type() == CellType::INT8));
+            (!must_be_int8 || (type.cell_type() == CellType::INT8)));
 }
 
 bool compatible_types(const ValueType &packed, const ValueType &unpacked) {
-    return (valid_type(packed) && valid_type(unpacked) &&
+    return (valid_type(packed, true) && valid_type(unpacked, false) &&
            (unpacked.dimensions()[0].size == (packed.dimensions()[0].size * 8)));
 }
 
-bool is_bit_expr(const Node &node) {
+bool is_little_bit_expr(const Node &node) {
+    // 'x%8'
+    if (auto mod = as<Mod>(node)) {
+        if (auto param = as<Symbol>(mod->lhs())) {
+            if (auto eight = as<Number>(mod->rhs())) {
+                return ((param->id() == 0) && (eight->value() == 8.0));
+            }
+        }
+    }
+    return false;
+}
+
+bool is_big_bit_expr(const Node &node) {
     // '7-(x%8)'
     if (auto sub = as<Sub>(node)) {
         if (auto seven = as<Number>(sub->lhs())) {
-            if (auto mod = as<Mod>(sub->rhs())) {
-                if (auto param = as<Symbol>(mod->lhs())) {
-                    if (auto eight = as<Number>(mod->rhs())) {
-                        return ((seven->value() == 7.0) &&
-                                (eight->value() == 8.0) &&
-                                (param->id() == 0));
-                    }
-                }
-            }
+            return ((seven->value() == 7.0) && is_little_bit_expr(sub->rhs()));
         }
     }
     return false;
@@ -73,8 +98,7 @@ bool is_byte_expr(const Node &node) {
     if (auto div = as<Div>(node)) {
         if (auto param = as<Symbol>(div->lhs())) {
            if (auto eight = as<Number>(div->rhs())) {
-                return ((eight->value() == 8.0) &&
-                        (param->id() == 0));
+                return ((param->id() == 0) && (eight->value() == 8.0));
            }
        }
    }
@@ -93,37 +117,44 @@ bool is_byte_peek(const TensorPeek &peek) {
     return false;
 }
 
+//-----------------------------------------------------------------------------
+
 } // namespace <unnamed>
 
 UnpackBitsFunction::UnpackBitsFunction(const ValueType &res_type_in,
-                                       const TensorFunction &packed)
-  : Op1(res_type_in, packed)
+                                       const TensorFunction &packed,
+                                       bool big_bitorder)
+  : Op1(res_type_in, packed),
+    _big_bitorder(big_bitorder)
 {
 }
 
 InterpretedFunction::Instruction
 UnpackBitsFunction::compile_self(const ValueBuilderFactory &, Stash &) const
 {
-    return InterpretedFunction::Instruction(my_unpack_bits_op, wrap_param<ValueType>(result_type()));
+    const ValueType &res_type = result_type();
+    auto op = typify_invoke<2,MyTypify,MyGetFun>(res_type.cell_type(), _big_bitorder);
+    return InterpretedFunction::Instruction(op, wrap_param<ValueType>(res_type));
 }
 
 const TensorFunction &
 UnpackBitsFunction::optimize(const TensorFunction &expr, Stash &stash)
 {
     if (auto lambda = as<Lambda>(expr)) {
-        // 'tensor<int8>(x[64])(bit(packed{x:(x/8)},7-(x%8)))'
         const ValueType &dst_type = lambda->result_type();
         if (auto bit = as<Bit>(lambda->lambda().root())) {
             if (auto peek = as<TensorPeek>(bit->get_child(0))) {
                 const ValueType &src_type = lambda->types().get_type(peek->param());
-                if (valid_lambda_params(*lambda) &&
-                    compatible_types(src_type, dst_type) &&
-                    is_bit_expr(bit->get_child(1)) &&
+                if (compatible_types(src_type, dst_type) &&
+                    valid_lambda_params(*lambda) &&
                     is_byte_peek(*peek))
                {
                     size_t param_idx = lambda->bindings()[0];
-                    const auto &packed_param = inject(src_type, param_idx, stash);
-                    return stash.create<UnpackBitsFunction>(dst_type, packed_param);
+                    if (is_big_bit_expr(bit->get_child(1))) {
+                        return stash.create<UnpackBitsFunction>(dst_type, inject(src_type, param_idx, stash), true);
+                    } else if (is_little_bit_expr(bit->get_child(1))) {
+                        return stash.create<UnpackBitsFunction>(dst_type, inject(src_type, param_idx, stash), false);
+                    }
                }
            }
        }
diff --git a/eval/src/vespa/eval/instruction/unpack_bits_function.h b/eval/src/vespa/eval/instruction/unpack_bits_function.h
index 5e24746508d..5b0da84072f 100644
--- a/eval/src/vespa/eval/instruction/unpack_bits_function.h
+++ b/eval/src/vespa/eval/instruction/unpack_bits_function.h
@@ -9,26 +9,27 @@ namespace vespalib::eval {
 
 /**
  * Tensor function unpacking bits into separate values.
  *
- * Both the tensor containing the packed bits and the result tensor
- * must have cell type 'int8'. The bits must be unpacked in canonical
- * order; bytes are unpacked with increasing index, bits within a byte
- * are unpacked from most to least significant.
+ * The tensor containing the packed bits must be a vector (dense
+ * tensor with 1 dimension) with cell type 'int8'. Bytes must be
+ * processed with increasing index. Bits may be unpacked in either
+ * 'big' or 'little' order. The result must be a vector (dense tensor
+ * with 1 dimension) where the dimension is 8 times larger than the
+ * input (since there are 8 bits packed into each int8 value).
  *
- * The baseline expression looks like this:
+ * Baseline expression for 'big' bitorder (most significant bit first):
+ * (Note: this is the default order used by numpy unpack_bits)
+ * 'tensor<int8>(x[64])(bit(packed{x:(x/8)},7-(x%8)))'
  *
- * tensor<int8>(x[64])(bit(packed{x:(x/8)},7-(x%8)))
- *
- * in this case 'packed' must be a tensor with type
- * 'tensor<int8>(x[8])' (the inner result dimension is always 8 times
- * larger than the inner input dimension).
- *
- * Unpacking of bits from multi-dimensional tensors will currently not
- * be optimized.
+ * Baseline expression for 'little' bitorder (least significant bit first):
+ * (Note: make sure this is the actual order of your bits)
+ * 'tensor<int8>(x[64])(bit(packed{x:(x/8)},x%8))'
 **/
 class UnpackBitsFunction : public tensor_function::Op1
 {
+private:
+    bool _big_bitorder;
 public:
-    UnpackBitsFunction(const ValueType &res_type_in, const TensorFunction &packed);
+    UnpackBitsFunction(const ValueType &res_type_in, const TensorFunction &packed, bool big);
     InterpretedFunction::Instruction compile_self(const ValueBuilderFactory &factory, Stash &stash) const override;
     bool result_is_mutable() const override { return true; }
     static const TensorFunction &optimize(const TensorFunction &expr, Stash &stash);
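The compile_self change above selects one template instantiation of my_unpack_bits_op per (result cell type, bit order) combination at runtime via typify_invoke. A simplified standalone sketch of that dispatch pattern (illustrative names only, not the actual Vespa typify machinery):

```cpp
#include <cstdint>
#include <cstddef>

// One instantiation per (output cell type, bit order) combination.
template <typename OCT, bool big>
void unpack(const int8_t *src, OCT *dst, size_t bytes) {
    for (size_t i = 0; i < bytes; ++i) {
        if constexpr (big) {
            for (int n = 7; n >= 0; --n) { *dst++ = OCT((uint8_t(src[i]) >> n) & 1); }
        } else {
            for (int n = 0; n <= 7; ++n) { *dst++ = OCT((uint8_t(src[i]) >> n) & 1); }
        }
    }
}

using unpack_fn = void (*)(const int8_t *, float *, size_t);

// A runtime flag picks the compile-time specialized function pointer,
// mirroring how typify_invoke maps (_big_bitorder, cell_type) to an op.
unpack_fn select(bool big_bitorder) {
    return big_bitorder ? unpack<float, true> : unpack<float, false>;
}
```

The design point this illustrates: the inner bit loop is branch-free at runtime because the bit order is a template parameter, so each instruction compiled by compile_self runs a loop specialized for exactly one order and one output cell type.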