From 9b3b15a8f6ea14cd0d81d4714b253761cd4309d4 Mon Sep 17 00:00:00 2001
From: Haavard
Date: Tue, 2 May 2017 15:27:11 +0000
Subject: added description of new mixed serialization format

... and how it relates to old formats (sparse/dense)
---
 .../src/vespa/eval/tensor/serialization/format.txt | 37 ++++++++++++++++++++++
 1 file changed, 37 insertions(+)
 create mode 100644 eval/src/vespa/eval/tensor/serialization/format.txt

(limited to 'eval/src')

diff --git a/eval/src/vespa/eval/tensor/serialization/format.txt b/eval/src/vespa/eval/tensor/serialization/format.txt
new file mode 100644
index 00000000000..9d0a387c36a
--- /dev/null
+++ b/eval/src/vespa/eval/tensor/serialization/format.txt
@@ -0,0 +1,37 @@
+This file explains how the typed binary formats of serialized tensors
+for different archetypes (sparse[1], dense[2] and mixed[3]) can be
+interpreted as a single unified binary format. The description below
+uses data types defined by document serialization (nbostream) combined
+with some comments and python-inspired flow-control. The mixed[3]
+binary format is defined in such a way that it overlays as
+effortlessly as possible with both existing formats. The only thing
+needed to go from sparse[1] or dense[2] binary formats to the mixed[3]
+format for a specific tensor is to add a single byte indicating there
+are no dimensions of the other kind (mapped/indexed).
+
+byte: type (1:sparse, 2:dense, 3:mixed)
+    bit 0 -> 'sparse'
+    bit 1 -> 'dense'
+    (mixed tensors are tagged as both 'sparse' and 'dense')
+
+if ('sparse'):
+    1_4_int: number of mapped dimensions -> ''n_mapped'
+    'n_mapped' times: (sorted by dimension name)
+        small_string: dimension name
+
+if ('dense'):
+    1_4_int: number of indexed dimensions -> 'n_indexed'
+    'n_indexed' times: (sorted by dimension name)
+        small_string: dimensions name
+        1_4_int: dimensions size (must be at least 1) -> 'size_i'
+
+if ('n_mapped > 0'):
+    1_4_int: number of named dense sub-spaces -> 'n_blocks'
+else:
+    'n_blocks' = 1 (a single dense space)
+
+'n_blocks' times:
+    'n_mapped' times:
+        small_string: dimension label (same order as dimension names)
+    prod('size_i') times: (product of all indexed dimension sizes)
+        double: cell value (last indexed dimension is nested innermost)
-- 
cgit v1.2.3

From 603635be530e6fa6d0115fc5a9f0a0b1edb66a40 Mon Sep 17 00:00:00 2001
From: Haavard
Date: Wed, 3 May 2017 08:38:58 +0000
Subject: account for sparse tensors without dimensions

---
 eval/src/vespa/eval/tensor/serialization/format.txt | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'eval/src')

diff --git a/eval/src/vespa/eval/tensor/serialization/format.txt b/eval/src/vespa/eval/tensor/serialization/format.txt
index 9d0a387c36a..02db6114ab2 100644
--- a/eval/src/vespa/eval/tensor/serialization/format.txt
+++ b/eval/src/vespa/eval/tensor/serialization/format.txt
@@ -4,10 +4,9 @@ interpreted as a single unified binary format. The description below
 uses data types defined by document serialization (nbostream) combined
 with some comments and python-inspired flow-control. The mixed[3]
 binary format is defined in such a way that it overlays as
-effortlessly as possible with both existing formats. The only thing
-needed to go from sparse[1] or dense[2] binary formats to the mixed[3]
-format for a specific tensor is to add a single byte indicating there
-are no dimensions of the other kind (mapped/indexed).
+effortlessly as possible with both existing formats.
+
+//-----------------------------------------------------------------------------
 
 byte: type (1:sparse, 2:dense, 3:mixed)
     bit 0 -> 'sparse'
@@ -15,7 +14,7 @@ byte: type (1:sparse, 2:dense, 3:mixed)
     (mixed tensors are tagged as both 'sparse' and 'dense')
 
 if ('sparse'):
-    1_4_int: number of mapped dimensions -> ''n_mapped'
+    1_4_int: number of mapped dimensions -> 'n_mapped'
     'n_mapped' times: (sorted by dimension name)
         small_string: dimension name
 
@@ -25,7 +24,7 @@ if ('dense'):
         small_string: dimensions name
         1_4_int: dimensions size (must be at least 1) -> 'size_i'
 
-if ('n_mapped > 0'):
+if ('n_mapped > 0' || !'dense'):
     1_4_int: number of named dense sub-spaces -> 'n_blocks'
 else:
     'n_blocks' = 1 (a single dense space)
@@ -35,3 +34,9 @@ else:
         small_string: dimension label (same order as dimension names)
     prod('size_i') times: (product of all indexed dimension sizes)
         double: cell value (last indexed dimension is nested innermost)
+
+//-----------------------------------------------------------------------------
+
+Note: A tensor with no dimensions should not be serialized as
+sparse[1], but when it is, it will contain an integer indicating the
+number of cells.
-- 
cgit v1.2.3

From b4af39d746ad0993b646fabf5114f3cdfb49030e Mon Sep 17 00:00:00 2001
From: Haavard
Date: Wed, 3 May 2017 08:49:57 +0000
Subject: minor fixup

---
 eval/src/vespa/eval/tensor/serialization/format.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'eval/src')

diff --git a/eval/src/vespa/eval/tensor/serialization/format.txt b/eval/src/vespa/eval/tensor/serialization/format.txt
index 02db6114ab2..8c5d3b331d2 100644
--- a/eval/src/vespa/eval/tensor/serialization/format.txt
+++ b/eval/src/vespa/eval/tensor/serialization/format.txt
@@ -24,7 +24,7 @@ if ('dense'):
         small_string: dimensions name
         1_4_int: dimensions size (must be at least 1) -> 'size_i'
 
-if ('n_mapped > 0' || !'dense'):
+if ('n_mapped' > 0 || !'dense'):
     1_4_int: number of named dense sub-spaces -> 'n_blocks'
 else:
     'n_blocks' = 1 (a single dense space)
-- 
cgit v1.2.3