author    Håvard Pettersen <havardpe@oath.com>    2019-04-08 09:30:20 +0000
committer Håvard Pettersen <havardpe@oath.com>    2019-04-09 12:05:57 +0000
commit    f5bb9339c3dad56e4f9ace290f315bfe4fcfd07b (patch)
tree      fe5e4bb20628f11369aa749fbbc2a72490230f67 /eval
parent    543dd57a6fac386ba4f62f77c33d545ed0d29e97 (diff)
support binary formats with cell type in reference implementation
Diffstat (limited to 'eval')
-rw-r--r--  eval/src/tests/eval/simple_tensor/simple_tensor_test.cpp  208
-rw-r--r--  eval/src/vespa/eval/eval/simple_tensor.cpp                  25
-rw-r--r--  eval/src/vespa/eval/tensor/serialization/format.txt         22
3 files changed, 248 insertions(+), 7 deletions(-)
diff --git a/eval/src/tests/eval/simple_tensor/simple_tensor_test.cpp b/eval/src/tests/eval/simple_tensor/simple_tensor_test.cpp
index c3b42124155..aa4c3b8c021 100644
--- a/eval/src/tests/eval/simple_tensor/simple_tensor_test.cpp
+++ b/eval/src/tests/eval/simple_tensor/simple_tensor_test.cpp
@@ -4,6 +4,8 @@
#include <vespa/eval/eval/simple_tensor_engine.h>
#include <vespa/eval/eval/operation.h>
#include <vespa/vespalib/util/stash.h>
+#include <vespa/vespalib/data/memory.h>
+#include <vespa/vespalib/objects/nbostream.h>
#include <iostream>
using namespace vespalib::eval;
@@ -12,6 +14,8 @@ using Cell = SimpleTensor::Cell;
using Cells = SimpleTensor::Cells;
using Address = SimpleTensor::Address;
using Stash = vespalib::Stash;
+using vespalib::nbostream;
+using vespalib::Memory;
TensorSpec to_spec(const Value &a) { return SimpleTensorEngine::ref().to_spec(a); }
@@ -143,4 +147,208 @@ TEST("require that simple tensors support dimension reduction") {
EXPECT_NOT_EQUAL(to_spec(*result_sum_y), to_spec(*result_sum_x));
}
+//-----------------------------------------------------------------------------
+
+struct SparseTensorExample {
+ TensorSpec make_spec() const {
+ return TensorSpec("tensor(x{},y{})")
+ .add({{"x","a"},{"y","a"}}, 1)
+ .add({{"x","a"},{"y","b"}}, 2)
+ .add({{"x","b"},{"y","a"}}, 3);
+ }
+ std::unique_ptr<SimpleTensor> make_tensor() const {
+ return SimpleTensor::create(make_spec());
+ }
+ template <typename T>
+ void encode_inner(nbostream &dst) const {
+ dst.putInt1_4Bytes(2);
+ dst.writeSmallString("x");
+ dst.writeSmallString("y");
+ dst.putInt1_4Bytes(3);
+ dst.writeSmallString("a");
+ dst.writeSmallString("a");
+ dst << (T) 1;
+ dst.writeSmallString("a");
+ dst.writeSmallString("b");
+ dst << (T) 2;
+ dst.writeSmallString("b");
+ dst.writeSmallString("a");
+ dst << (T) 3;
+ }
+ void encode_default(nbostream &dst) const {
+ dst.putInt1_4Bytes(1);
+ encode_inner<double>(dst);
+ }
+ void encode_with_double(nbostream &dst) const {
+ dst.putInt1_4Bytes(5);
+ dst.putInt1_4Bytes(0);
+ encode_inner<double>(dst);
+ }
+ void encode_with_float(nbostream &dst) const {
+ dst.putInt1_4Bytes(5);
+ dst.putInt1_4Bytes(1);
+ encode_inner<float>(dst);
+ }
+};
+
+TEST_F("require that sparse tensors can be decoded", SparseTensorExample()) {
+ nbostream data1;
+ nbostream data2;
+ nbostream data3;
+ f1.encode_default(data1);
+ f1.encode_with_double(data2);
+ f1.encode_with_float(data3);
+ EXPECT_EQUAL(to_spec(*SimpleTensor::decode(data1)), f1.make_spec());
+ EXPECT_EQUAL(to_spec(*SimpleTensor::decode(data2)), f1.make_spec());
+ EXPECT_EQUAL(to_spec(*SimpleTensor::decode(data3)), f1.make_spec());
+}
+
+TEST_F("require that sparse tensors can be encoded", SparseTensorExample()) {
+ nbostream data;
+ nbostream expect;
+ SimpleTensor::encode(*f1.make_tensor(), data);
+ f1.encode_default(expect);
+ EXPECT_EQUAL(Memory(data.peek(), data.size()), Memory(expect.peek(), expect.size()));
+}
+
+//-----------------------------------------------------------------------------
+
+struct DenseTensorExample {
+ TensorSpec make_spec() const {
+ return TensorSpec("tensor(x[3],y[2])")
+ .add({{"x",0},{"y",0}}, 1)
+ .add({{"x",0},{"y",1}}, 2)
+ .add({{"x",1},{"y",0}}, 3)
+ .add({{"x",1},{"y",1}}, 4)
+ .add({{"x",2},{"y",0}}, 5)
+ .add({{"x",2},{"y",1}}, 6);
+ }
+ std::unique_ptr<SimpleTensor> make_tensor() const {
+ return SimpleTensor::create(make_spec());
+ }
+ template <typename T>
+ void encode_inner(nbostream &dst) const {
+ dst.putInt1_4Bytes(2);
+ dst.writeSmallString("x");
+ dst.putInt1_4Bytes(3);
+ dst.writeSmallString("y");
+ dst.putInt1_4Bytes(2);
+ dst << (T) 1;
+ dst << (T) 2;
+ dst << (T) 3;
+ dst << (T) 4;
+ dst << (T) 5;
+ dst << (T) 6;
+ }
+ void encode_default(nbostream &dst) const {
+ dst.putInt1_4Bytes(2);
+ encode_inner<double>(dst);
+ }
+ void encode_with_double(nbostream &dst) const {
+ dst.putInt1_4Bytes(6);
+ dst.putInt1_4Bytes(0);
+ encode_inner<double>(dst);
+ }
+ void encode_with_float(nbostream &dst) const {
+ dst.putInt1_4Bytes(6);
+ dst.putInt1_4Bytes(1);
+ encode_inner<float>(dst);
+ }
+};
+
+TEST_F("require that dense tensors can be decoded", DenseTensorExample()) {
+ nbostream data1;
+ nbostream data2;
+ nbostream data3;
+ f1.encode_default(data1);
+ f1.encode_with_double(data2);
+ f1.encode_with_float(data3);
+ EXPECT_EQUAL(to_spec(*SimpleTensor::decode(data1)), f1.make_spec());
+ EXPECT_EQUAL(to_spec(*SimpleTensor::decode(data2)), f1.make_spec());
+ EXPECT_EQUAL(to_spec(*SimpleTensor::decode(data3)), f1.make_spec());
+}
+
+TEST_F("require that dense tensors can be encoded", DenseTensorExample()) {
+ nbostream data;
+ nbostream expect;
+ SimpleTensor::encode(*f1.make_tensor(), data);
+ f1.encode_default(expect);
+ EXPECT_EQUAL(Memory(data.peek(), data.size()), Memory(expect.peek(), expect.size()));
+}
+
+//-----------------------------------------------------------------------------
+
+struct MixedTensorExample {
+ TensorSpec make_spec() const {
+ return TensorSpec("tensor(x{},y{},z[2])")
+ .add({{"x","a"},{"y","a"},{"z",0}}, 1)
+ .add({{"x","a"},{"y","a"},{"z",1}}, 2)
+ .add({{"x","a"},{"y","b"},{"z",0}}, 3)
+ .add({{"x","a"},{"y","b"},{"z",1}}, 4)
+ .add({{"x","b"},{"y","a"},{"z",0}}, 5)
+ .add({{"x","b"},{"y","a"},{"z",1}}, 6);
+ }
+ std::unique_ptr<SimpleTensor> make_tensor() const {
+ return SimpleTensor::create(make_spec());
+ }
+ template <typename T>
+ void encode_inner(nbostream &dst) const {
+ dst.putInt1_4Bytes(2);
+ dst.writeSmallString("x");
+ dst.writeSmallString("y");
+ dst.putInt1_4Bytes(1);
+ dst.writeSmallString("z");
+ dst.putInt1_4Bytes(2);
+ dst.putInt1_4Bytes(3);
+ dst.writeSmallString("a");
+ dst.writeSmallString("a");
+ dst << (T) 1;
+ dst << (T) 2;
+ dst.writeSmallString("a");
+ dst.writeSmallString("b");
+ dst << (T) 3;
+ dst << (T) 4;
+ dst.writeSmallString("b");
+ dst.writeSmallString("a");
+ dst << (T) 5;
+ dst << (T) 6;
+ }
+ void encode_default(nbostream &dst) const {
+ dst.putInt1_4Bytes(3);
+ encode_inner<double>(dst);
+ }
+ void encode_with_double(nbostream &dst) const {
+ dst.putInt1_4Bytes(7);
+ dst.putInt1_4Bytes(0);
+ encode_inner<double>(dst);
+ }
+ void encode_with_float(nbostream &dst) const {
+ dst.putInt1_4Bytes(7);
+ dst.putInt1_4Bytes(1);
+ encode_inner<float>(dst);
+ }
+};
+
+TEST_F("require that mixed tensors can be decoded", MixedTensorExample()) {
+ nbostream data1;
+ nbostream data2;
+ nbostream data3;
+ f1.encode_default(data1);
+ f1.encode_with_double(data2);
+ f1.encode_with_float(data3);
+ EXPECT_EQUAL(to_spec(*SimpleTensor::decode(data1)), f1.make_spec());
+ EXPECT_EQUAL(to_spec(*SimpleTensor::decode(data2)), f1.make_spec());
+ EXPECT_EQUAL(to_spec(*SimpleTensor::decode(data3)), f1.make_spec());
+}
+
+TEST_F("require that mixed tensors can be encoded", MixedTensorExample()) {
+ nbostream data;
+ nbostream expect;
+ SimpleTensor::encode(*f1.make_tensor(), data);
+ f1.encode_default(expect);
+ EXPECT_EQUAL(Memory(data.peek(), data.size()), Memory(expect.peek(), expect.size()));
+}
+
+//-----------------------------------------------------------------------------
+
TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/eval/src/vespa/eval/eval/simple_tensor.cpp b/eval/src/vespa/eval/eval/simple_tensor.cpp
index 1836f2088f3..1bce8983666 100644
--- a/eval/src/vespa/eval/eval/simple_tensor.cpp
+++ b/eval/src/vespa/eval/eval/simple_tensor.cpp
@@ -19,6 +19,9 @@ using CellRef = std::reference_wrapper<const Cell>;
namespace {
+constexpr uint32_t DOUBLE_CELL_TYPE = 0;
+constexpr uint32_t FLOAT_CELL_TYPE = 1;
+
void assert_type(const ValueType &type) {
(void) type;
assert(!type.is_abstract());
@@ -418,14 +421,17 @@ public:
struct Format {
bool is_sparse;
bool is_dense;
+ bool with_cell_type;
uint32_t tag;
explicit Format(const TypeMeta &meta)
: is_sparse(meta.mapped.size() > 0),
is_dense((meta.indexed.size() > 0) || !is_sparse),
+ with_cell_type(false),
tag((is_sparse ? 0x1 : 0) | (is_dense ? 0x2 : 0)) {}
explicit Format(uint32_t tag_in)
: is_sparse((tag_in & 0x1) != 0),
is_dense((tag_in & 0x2) != 0),
+ with_cell_type((tag_in & 0x4) != 0),
tag(tag_in) {}
~Format() {}
};
@@ -458,6 +464,13 @@ void encode_mapped_labels(nbostream &output, const TypeMeta &meta, const Address
}
}
+uint32_t maybe_decode_cell_type(nbostream &input, const Format &format) {
+ if (format.with_cell_type) {
+ return input.getInt1_4Bytes();
+ }
+ return DOUBLE_CELL_TYPE;
+}
+
ValueType decode_type(nbostream &input, const Format &format) {
std::vector<ValueType::Dimension> dim_list;
if (format.is_sparse) {
@@ -496,17 +509,20 @@ void decode_mapped_labels(nbostream &input, const TypeMeta &meta, Address &addr)
}
}
-void decode_cells(nbostream &input, const ValueType &type, const TypeMeta meta,
+void decode_cells(uint32_t cell_type, nbostream &input, const ValueType &type, const TypeMeta meta,
Address &address, size_t n, Builder &builder)
{
if (n < meta.indexed.size()) {
Label &label = address[meta.indexed[n]];
size_t size = type.dimensions()[meta.indexed[n]].size;
for (label.index = 0; label.index < size; ++label.index) {
- decode_cells(input, type, meta, address, n + 1, builder);
+ decode_cells(cell_type, input, type, meta, address, n + 1, builder);
}
} else {
- builder.set(address, input.readValue<double>());
+ double value = (cell_type == FLOAT_CELL_TYPE)
+ ? input.readValue<float>()
+ : input.readValue<double>();
+ builder.set(address, value);
}
}
@@ -693,6 +709,7 @@ std::unique_ptr<SimpleTensor>
SimpleTensor::decode(nbostream &input)
{
Format format(input.getInt1_4Bytes());
+ uint32_t cell_type = maybe_decode_cell_type(input, format);
ValueType type = decode_type(input, format);
TypeMeta meta(type);
Builder builder(type);
@@ -700,7 +717,7 @@ SimpleTensor::decode(nbostream &input)
Address address(type.dimensions().size(), Label(size_t(0)));
for (size_t i = 0; i < num_blocks; ++i) {
decode_mapped_labels(input, meta, address);
- decode_cells(input, type, meta, address, 0, builder);
+ decode_cells(cell_type, input, type, meta, address, 0, builder);
}
return builder.build();
}
diff --git a/eval/src/vespa/eval/tensor/serialization/format.txt b/eval/src/vespa/eval/tensor/serialization/format.txt
index 780f88af01a..1a454b0ccf8 100644
--- a/eval/src/vespa/eval/tensor/serialization/format.txt
+++ b/eval/src/vespa/eval/tensor/serialization/format.txt
@@ -4,15 +4,25 @@ interpreted as a single unified binary format. The description below
uses data types defined by document serialization (nbostream) combined
with some comments and python-inspired flow-control. The mixed[3]
binary format is defined in such a way that it overlays as
-effortlessly as possible with both existing formats.
+effortlessly as possible with the sparse[1] and dense[2] formats.
+
+All format archetypes can also be encoded with cell values other than
+double. Using a separate bit to signal that the format carries an
+explicit cell type gives rise to 3 new formats:
+sparse_with_cell_type[5], dense_with_cell_type[6] and
+mixed_with_cell_type[7].
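
As a rough illustration (not part of the committed spec), the tag values
above are simply the base type tags 1/2/3 with bit 2 added. A minimal C++
sketch of the decomposition, mirroring the Format struct added to
simple_tensor.cpp in this commit (struct and function names here are
illustrative only):

    #include <cstdint>

    // Illustrative only: split a format tag into the flag bits described
    // below, the same way Format(uint32_t tag_in) does in simple_tensor.cpp.
    struct FormatFlags {
        bool is_sparse;       // bit 0
        bool is_dense;        // bit 1
        bool with_cell_type;  // bit 2
    };

    FormatFlags decode_tag(uint32_t tag) {
        return FormatFlags{(tag & 0x1) != 0,
                           (tag & 0x2) != 0,
                           (tag & 0x4) != 0};
    }

    // e.g. 5 = sparse_with_cell_type (1 | 4)
    //      6 = dense_with_cell_type  (2 | 4)
    //      7 = mixed_with_cell_type  (3 | 4)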
//-----------------------------------------------------------------------------
-1_4_int: type (1:sparse, 2:dense, 3:mixed)
+1_4_int: type (1/5:sparse, 2/6:dense, 3/7:mixed)
bit 0 -> 'sparse'
bit 1 -> 'dense'
+ bit 2 -> 'with_cell_type'
(mixed tensors are tagged as both 'sparse' and 'dense')
+if ('with_cell_type'):
+    1_4_int: cell_type (0:double, 1:float)
+
if ('sparse'):
1_4_int: number of mapped dimensions -> 'n_mapped'
'n_mapped' times: (sorted by dimension name)
@@ -33,10 +43,16 @@ else:
'n_mapped' times:
small_string: dimension label (same order as dimension names)
prod('size_i') times: (product of all indexed dimension sizes)
- double: cell value (last indexed dimension is nested innermost)
+ cell_type: cell value (last indexed dimension is nested innermost)
//-----------------------------------------------------------------------------
+Note: The mixed_with_cell_type format can be used to encode any
+tensor.
+
+Note: cell_type defaults to double, but can be overridden by using any
+of the '_with_cell_type' formats.
+
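For reference, a minimal sketch of how a single cell value would be read
once cell_type is known, using the nbostream readValue calls shown in the
simple_tensor.cpp diff above (the helper name is illustrative, not part of
the source):

    #include <vespa/vespalib/objects/nbostream.h>
    #include <cstdint>

    // Illustrative only: read one cell value according to cell_type
    // (0:double, 1:float), as decode_cells() in simple_tensor.cpp does.
    double read_cell(vespalib::nbostream &input, uint32_t cell_type) {
        return (cell_type == 1) // 1 == FLOAT_CELL_TYPE
            ? input.readValue<float>()
            : input.readValue<double>();
    }
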
Note: A tensor with no dimensions should not be serialized as
sparse[1], but when it is, it will contain an integer indicating the
number of cells.