diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-09-28 21:19:41 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-09-28 21:19:41 +0200 |
commit | e7e659e9d26401c8c36300d4760d4e34acd26d0a (patch) | |
tree | 4c8b869a9ef991a6edda1c3a80e433b3b1690bbd /linguistics/src/main | |
parent | 35223653327b86a059d23c543bbac3611d43775f (diff) |
encode -> embed
Diffstat (limited to 'linguistics/src/main')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/process/Embedder.java | 56 | ||||
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/process/Encoder.java | 56 |
2 files changed, 56 insertions, 56 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Embedder.java b/linguistics/src/main/java/com/yahoo/language/process/Embedder.java new file mode 100644 index 00000000000..56c401a7c61 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Embedder.java @@ -0,0 +1,56 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; +import com.yahoo.tensor.Tensor; +import com.yahoo.tensor.TensorType; + +import java.util.List; + +/** + * An embedder converts a text string to a tensor + * + * @author bratseth + */ +public interface Embedder { + + /** An instance of this which throws IllegalStateException if an attempt is made to use it */ + Embedder throwsOnUse = new FailingEmbedder(); + + /** + * Converts text into a list of token ids (a vector embedding) + * + * @param text the text to embed + * @param language the language of the text, or UNKNOWN to use language independent embedding + * @return the text embedded as a list of token ids + * @throws IllegalArgumentException if the language is not supported by this embedder + */ + List<Integer> embed(String text, Language language); + + /** + * Converts text into tokens in a tensor. + * The information contained in the embedding may depend on the tensor type. 
+ * + * @param text the text to embed + * @param language the language of the text, or UNKNOWN to use language independent embedding + * @param tensorType the type of the tensor to be returned + * @return the tensor embedding of the text, as the specified tensor type + * @throws IllegalArgumentException if the language or tensor type is not supported by this embedder + */ + Tensor embed(String text, Language language, TensorType tensorType); + + class FailingEmbedder implements Embedder { + + @Override + public List<Integer> embed(String text, Language language) { + throw new IllegalStateException("No embedder has been configured"); + } + + @Override + public Tensor embed(String text, Language language, TensorType tensorType) { + throw new IllegalStateException("No embedder has been configured"); + } + + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Encoder.java b/linguistics/src/main/java/com/yahoo/language/process/Encoder.java deleted file mode 100644 index 27f73d15e54..00000000000 --- a/linguistics/src/main/java/com/yahoo/language/process/Encoder.java +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.process; - -import com.yahoo.language.Language; -import com.yahoo.tensor.Tensor; -import com.yahoo.tensor.TensorType; - -import java.util.List; - -/** - * An encoder converts a text string to a tensor or list of tokens - * - * @author bratseth - */ -public interface Encoder { - - /** An instance of this which throws IllegalStateException if attempted used */ - Encoder throwsOnUse = new FailingEncoder(); - - /** - * Encodes text into tokens in a list of ids. 
- * - * @param text the text to encode - * @param language the language of the text, or UNKNOWN to use language independent encoding - * @return the text encoded to a list of segment ids - * @throws IllegalArgumentException if the language is not supported by this encoder - */ - List<Integer> encode(String text, Language language); - - /** - * Encodes text into tokens in a tensor. - * The information contained in the encoding may depend on the tensor type. - * - * @param text the text to encode - * @param language the language of the text, or UNKNOWN to use language independent encoding - * @param tensorType the type of the ttensor to be returned - * @return the tex encoded into a tensor of the supplied type - * @throws IllegalArgumentException if the language or tensor type is not supported by this encoder - */ - Tensor encode(String text, Language language, TensorType tensorType); - - class FailingEncoder implements Encoder { - - @Override - public List<Integer> encode(String text, Language language) { - throw new IllegalStateException("No encoder has been configured"); - } - - @Override - public Tensor encode(String text, Language language, TensorType tensorType) { - throw new IllegalStateException("No encoder has been configured"); - } - - } - -} |