Add decoding of sentencepiece token sequence to text

author: Lester Solbakken <lesters@oath.com> 2023-02-10 14:06:29 +0100
committer: Lester Solbakken <lesters@oath.com> 2023-02-10 14:06:29 +0100
commit: f5118dcd8b04293cf65434f1509fa0e06833492b (patch)
tree: c6d77c5a81c7fbfe697e219897a459879871ef0e /linguistics
parent: f62bb48baf715609606faa82a6119012b8a727de (diff)
1 files changed, 11 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Embedder.java b/linguistics/src/main/java/com/yahoo/language/process/Embedder.java
index c8ba3395c3c..055861c5388 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Embedder.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Embedder.java
@@ -42,6 +42,17 @@ public interface Embedder {
     List<Integer> embed(String text, Context context);
 
     /**
+     * Converts a list of token ids into text. The opposite operation of embed.
+     *
+     * @param tokens the list of tokens to decode to a string
+     * @param context the context which specifies the language used to select a model
+     * @return the string formed by decoding the tokens back to their string representation
+     */
+    default String decode(List<Integer> tokens, Context context) {
+        throw new UnsupportedOperationException("Decode is not implemented");
+    }
+
+    /**
      * Converts text into tokens in a tensor.
      * The information contained in the embedding may depend on the tensor type.
      *
author	Lester Solbakken <lesters@oath.com>	2023-02-10 14:06:29 +0100
committer	Lester Solbakken <lesters@oath.com>	2023-02-10 14:06:29 +0100
commit	f5118dcd8b04293cf65434f1509fa0e06833492b (patch)
tree	c6d77c5a81c7fbfe697e219897a459879871ef0e /linguistics
parent	f62bb48baf715609606faa82a6119012b8a727de (diff)