summaryrefslogtreecommitdiffstats
path: root/linguistics
diff options
context:
space:
mode:
authorLester Solbakken <lesters@oath.com>2023-02-10 14:06:29 +0100
committerLester Solbakken <lesters@oath.com>2023-02-10 14:06:29 +0100
commitf5118dcd8b04293cf65434f1509fa0e06833492b (patch)
treec6d77c5a81c7fbfe697e219897a459879871ef0e /linguistics
parentf62bb48baf715609606faa82a6119012b8a727de (diff)
Add decoding of sentencepiece token sequence to text
Diffstat (limited to 'linguistics')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/process/Embedder.java11
1 files changed, 11 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Embedder.java b/linguistics/src/main/java/com/yahoo/language/process/Embedder.java
index c8ba3395c3c..055861c5388 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Embedder.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Embedder.java
@@ -42,6 +42,17 @@ public interface Embedder {
List<Integer> embed(String text, Context context);
/**
+ * Converts the list of token ids into text. The opposite operation of embed.
+ *
+ * @param tokens the list of tokens to decode to a string
+ * @param context the context which specifies the language used to select a model
+ * @return the string formed by decoding the tokens back to their string representation
+ */
+ default String decode(List<Integer> tokens, Context context) {
+ throw new UnsupportedOperationException("Decode is not implemented");
+ }
+
+ /**
* Converts text into tokens in a tensor.
* The information contained in the embedding may depend on the tensor type.
*