diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-09-28 21:19:41 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-09-28 21:19:41 +0200 |
commit | e7e659e9d26401c8c36300d4760d4e34acd26d0a (patch) | |
tree | 4c8b869a9ef991a6edda1c3a80e433b3b1690bbd /linguistics/src/main/java/com/yahoo/language/process/Embedder.java | |
parent | 35223653327b86a059d23c543bbac3611d43775f (diff) |
encode -> embed
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/process/Embedder.java')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/process/Embedder.java | 56 |
1 files changed, 56 insertions, 0 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Embedder.java b/linguistics/src/main/java/com/yahoo/language/process/Embedder.java new file mode 100644 index 00000000000..56c401a7c61 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Embedder.java @@ -0,0 +1,56 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; +import com.yahoo.tensor.Tensor; +import com.yahoo.tensor.TensorType; + +import java.util.List; + +/** + * An embedder converts a text string to a tensor + * + * @author bratseth + */ +public interface Embedder { + + /** An instance of this which throws IllegalStateException if an attempt is made to use it */ + Embedder throwsOnUse = new FailingEmbedder(); + + /** + * Converts text into a list of token ids (a vector embedding) + * + * @param text the text to embed + * @param language the language of the text, or UNKNOWN to use language independent embedding + * @return the text embedded as a list of token ids + * @throws IllegalArgumentException if the language is not supported by this embedder + */ + List<Integer> embed(String text, Language language); + + /** + * Converts text into tokens in a tensor. + * The information contained in the embedding may depend on the tensor type. 
+ * + * @param text the text to embed + * @param language the language of the text, or UNKNOWN to use language independent embedding + * @param tensorType the type of the tensor to be returned + * @return the tensor embedding of the text, as the specified tensor type + * @throws IllegalArgumentException if the language or tensor type is not supported by this embedder + */ + Tensor embed(String text, Language language, TensorType tensorType); + + class FailingEmbedder implements Embedder { + + @Override + public List<Integer> embed(String text, Language language) { + throw new IllegalStateException("No embedder has been configured"); + } + + @Override + public Tensor embed(String text, Language language, TensorType tensorType) { + throw new IllegalStateException("No embedder has been configured"); + } + + } + +} |