From 9b3b15a8f6ea14cd0d81d4714b253761cd4309d4 Mon Sep 17 00:00:00 2001
From: Haavard <havardpe@yahoo-inc.com>
Date: Tue, 2 May 2017 15:27:11 +0000
Subject: added description of new mixed serialization format ... and how it
 relates to old formats (sparse/dense)

---
 .../src/vespa/eval/tensor/serialization/format.txt | 37 ++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 eval/src/vespa/eval/tensor/serialization/format.txt

(limited to 'eval/src')

diff --git a/eval/src/vespa/eval/tensor/serialization/format.txt b/eval/src/vespa/eval/tensor/serialization/format.txt
new file mode 100644
index 00000000000..9d0a387c36a
--- /dev/null
+++ b/eval/src/vespa/eval/tensor/serialization/format.txt
@@ -0,0 +1,37 @@
+This file explains how the typed binary formats of serialized tensors
+for different archetypes (sparse[1], dense[2] and mixed[3]) can be
+interpreted as a single unified binary format. The description below
+uses data types defined by document serialization (nbostream) combined
+with some comments and python-inspired flow-control. The mixed[3]
+binary format is defined in such a way that it overlays as
+effortlessly as possible with both existing formats. The only thing
+needed to go from sparse[1] or dense[2] binary formats to the mixed[3]
+format for a specific tensor is to add a single byte indicating there
+are no dimensions of the other kind (mapped/indexed).
+
+byte: type (1:sparse, 2:dense, 3:mixed)
+  bit 0 -> 'sparse'
+  bit 1 -> 'dense'
+  (mixed tensors are tagged as both 'sparse' and 'dense')
+
+if ('sparse'):
+  1_4_int: number of mapped dimensions -> ''n_mapped'
+  'n_mapped' times: (sorted by dimension name)
+    small_string: dimension name
+
+if ('dense'):
+  1_4_int: number of indexed dimensions -> 'n_indexed'
+  'n_indexed' times: (sorted by dimension name)
+    small_string: dimensions name
+    1_4_int: dimensions size (must be at least 1) -> 'size_i'
+
+if ('n_mapped > 0'):
+  1_4_int: number of named dense sub-spaces -> 'n_blocks'
+else:
+  'n_blocks' = 1 (a single dense space)
+
+'n_blocks' times:
+  'n_mapped' times:
+    small_string: dimension label (same order as dimension names)
+  prod('size_i') times: (product of all indexed dimension sizes)
+    double: cell value (last indexed dimension is nested innermost)
-- 
cgit v1.2.3


From 603635be530e6fa6d0115fc5a9f0a0b1edb66a40 Mon Sep 17 00:00:00 2001
From: Haavard <havardpe@yahoo-inc.com>
Date: Wed, 3 May 2017 08:38:58 +0000
Subject: account for sparse tensors without dimensions

---
 eval/src/vespa/eval/tensor/serialization/format.txt | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'eval/src')

diff --git a/eval/src/vespa/eval/tensor/serialization/format.txt b/eval/src/vespa/eval/tensor/serialization/format.txt
index 9d0a387c36a..02db6114ab2 100644
--- a/eval/src/vespa/eval/tensor/serialization/format.txt
+++ b/eval/src/vespa/eval/tensor/serialization/format.txt
@@ -4,10 +4,9 @@ interpreted as a single unified binary format. The description below
 uses data types defined by document serialization (nbostream) combined
 with some comments and python-inspired flow-control. The mixed[3]
 binary format is defined in such a way that it overlays as
-effortlessly as possible with both existing formats. The only thing
-needed to go from sparse[1] or dense[2] binary formats to the mixed[3]
-format for a specific tensor is to add a single byte indicating there
-are no dimensions of the other kind (mapped/indexed).
+effortlessly as possible with both existing formats.
+
+//-----------------------------------------------------------------------------
 
 byte: type (1:sparse, 2:dense, 3:mixed)
   bit 0 -> 'sparse'
@@ -15,7 +14,7 @@ byte: type (1:sparse, 2:dense, 3:mixed)
   (mixed tensors are tagged as both 'sparse' and 'dense')
 
 if ('sparse'):
-  1_4_int: number of mapped dimensions -> ''n_mapped'
+  1_4_int: number of mapped dimensions -> 'n_mapped'
   'n_mapped' times: (sorted by dimension name)
     small_string: dimension name
 
@@ -25,7 +24,7 @@ if ('dense'):
     small_string: dimensions name
     1_4_int: dimensions size (must be at least 1) -> 'size_i'
 
-if ('n_mapped > 0'):
+if ('n_mapped > 0' || !'dense'):
   1_4_int: number of named dense sub-spaces -> 'n_blocks'
 else:
   'n_blocks' = 1 (a single dense space)
@@ -35,3 +34,9 @@ else:
     small_string: dimension label (same order as dimension names)
   prod('size_i') times: (product of all indexed dimension sizes)
     double: cell value (last indexed dimension is nested innermost)
+
+//-----------------------------------------------------------------------------
+
+Note: A tensor with no dimensions should not be serialized as
+sparse[1]. If it is, the data will still contain the 'n_blocks'
+integer, which then gives the number of cells.
-- 
cgit v1.2.3


From b4af39d746ad0993b646fabf5114f3cdfb49030e Mon Sep 17 00:00:00 2001
From: Haavard <havardpe@yahoo-inc.com>
Date: Wed, 3 May 2017 08:49:57 +0000
Subject: minor fixup

---
 eval/src/vespa/eval/tensor/serialization/format.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'eval/src')

diff --git a/eval/src/vespa/eval/tensor/serialization/format.txt b/eval/src/vespa/eval/tensor/serialization/format.txt
index 02db6114ab2..8c5d3b331d2 100644
--- a/eval/src/vespa/eval/tensor/serialization/format.txt
+++ b/eval/src/vespa/eval/tensor/serialization/format.txt
@@ -24,7 +24,7 @@ if ('dense'):
     small_string: dimensions name
     1_4_int: dimensions size (must be at least 1) -> 'size_i'
 
-if ('n_mapped > 0' || !'dense'):
+if ('n_mapped' > 0 || !'dense'):
   1_4_int: number of named dense sub-spaces -> 'n_blocks'
 else:
   'n_blocks' = 1 (a single dense space)
-- 
cgit v1.2.3