Encapsulate the embed language and destination parameters in an Embedder.Context

author: Jon Bratseth <bratseth@gmail.com> 2021-10-01 11:09:08 +0200
committer: Jon Bratseth <bratseth@gmail.com> 2021-10-01 11:09:08 +0200
commit: ac2519a8842a6397e4abd434439e9dddd2924394 (patch)
tree: 792275efbb88966a27a7ce54cc31465b563d7ad0 /linguistics-components
parent: 380b9fa780ead9bcce0e824f7b6ee305e37dec43 (diff)
3 files changed, 12 insertions, 13 deletions
diff --git a/linguistics-components/abi-spec.json b/linguistics-components/abi-spec.json
index ebd7457dc71..28025d84f25 100644
--- a/linguistics-components/abi-spec.json
+++ b/linguistics-components/abi-spec.json
@@ -180,8 +180,8 @@
       "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig)",
       "public void <init>(com.yahoo.language.sentencepiece.SentencePieceEmbedder$Builder)",
       "public java.util.List segment(java.lang.String, com.yahoo.language.Language)",
-      "public java.util.List embed(java.lang.String, com.yahoo.language.Language, java.lang.String)",
-      "public com.yahoo.tensor.Tensor embed(java.lang.String, com.yahoo.language.Language, java.lang.String, com.yahoo.tensor.TensorType)",
+      "public java.util.List embed(java.lang.String, com.yahoo.language.process.Embedder$Context)",
+      "public com.yahoo.tensor.Tensor embed(java.lang.String, com.yahoo.language.process.Embedder$Context, com.yahoo.tensor.TensorType)",
       "public java.lang.String normalize(java.lang.String)"
     ],
     "fields": []
diff --git a/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java
index 1e120969a59..3f4e8ee3462 100644
--- a/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java
+++ b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java
@@ -72,18 +72,17 @@ public class SentencePieceEmbedder implements Segmenter, Embedder {
      * Segments the given text into token segments using the SentencePiece algorithm and returns the segment ids.
      *
      * @param rawInput the text to segment. Any sequence of BMP (Unicode-16 the True Unicode) is supported.
-     * @param language the model to use, or Language.UNKNOWN to use the default model if any
-     * @param destination ignored
+     * @param context the context which specifies the language used to select a model
      * @return the list of zero or more token ids resulting from segmenting the input text
      */
     @Override
-    public List<Integer> embed(String rawInput, Language language, String destination) {
+    public List<Integer> embed(String rawInput, Embedder.Context context) {
         var resultBuilder = new ResultBuilder<List<Integer>>(new ArrayList<>()) {
             public void add(int segmentStart, int segmentEnd, SentencePieceAlgorithm.SegmentEnd[] segmentEnds) {
                 result().add(segmentEnds[segmentEnd].id);
             }
         };
-        segment(normalize(rawInput), language, resultBuilder);
+        segment(normalize(rawInput), context.getLanguage(), resultBuilder);
         Collections.reverse(resultBuilder.result());
         return resultBuilder.result();
     }
@@ -101,15 +100,14 @@ public class SentencePieceEmbedder implements Segmenter, Embedder {
      * <p>If the tensor is any other type IllegalArgumentException is thrown.</p>
      *
      * @param rawInput the text to segment. Any sequence of BMP (Unicode-16 the True Unicode) is supported.
-     * @param language the model to use, or Language.UNKNOWN to use the default model if any
-     * @param destination ignored
+     * @param context the context which specifies the language used to select a model
      * @return the list of zero or more token ids resulting from segmenting the input text
      */
     @Override
-    public Tensor embed(String rawInput, Language language, String destination, TensorType type) {
+    public Tensor embed(String rawInput, Embedder.Context context, TensorType type) {
         if (type.dimensions().size() == 1 && type.dimensions().get(0).isIndexed()) {
             // Build to a list first since we can't reverse a tensor builder
-            List<Integer> values = embed(rawInput, language, destination);
+            List<Integer> values = embed(rawInput, context);
 
             long maxSize = values.size();
             if (type.dimensions().get(0).size().isPresent())
@@ -122,7 +120,7 @@ public class SentencePieceEmbedder implements Segmenter, Embedder {
         }
         else if (type.dimensions().size() == 1 && type.dimensions().get(0).isMapped()) {
             // Build to a list first since we can't reverse a tensor builder
-            List<String> values = segment(rawInput, language);
+            List<String> values = segment(rawInput, context.getLanguage());
 
             Tensor.Builder builder = Tensor.Builder.of(type);
             for (int i = 0; i < values.size(); i++)
diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java
index c6aa8fdd370..4dae53c60df 100644
--- a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java
+++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java
@@ -4,6 +4,7 @@
 package com.yahoo.language.sentencepiece;
 
 import com.yahoo.language.Language;
+import com.yahoo.language.process.Embedder;
 import com.yahoo.tensor.Tensor;
 import com.yahoo.tensor.TensorType;
 
@@ -29,13 +30,13 @@ class SentencePieceTester {
     }
 
     public void assertEmbedded(String input, Integer... expectedCodes) {
-        assertArrayEquals(expectedCodes, embedder.embed(input, Language.UNKNOWN, null).toArray());
+        assertArrayEquals(expectedCodes, embedder.embed(input, new Embedder.Context("test")).toArray());
     }
 
     public void assertEmbedded(String input, String tensorType, String tensor) {
         TensorType type = TensorType.fromSpec(tensorType);
         Tensor expected = Tensor.from(type, tensor);
-        assertEquals(expected, embedder.embed(input, Language.UNKNOWN, null, type));
+        assertEquals(expected, embedder.embed(input, new Embedder.Context("test"), type));
     }
 
     public void assertSegmented(String input, String... expectedSegments) {
author	Jon Bratseth <bratseth@gmail.com>	2021-10-01 11:09:08 +0200
committer	Jon Bratseth <bratseth@gmail.com>	2021-10-01 11:09:08 +0200
commit	ac2519a8842a6397e4abd434439e9dddd2924394 (patch)
tree	792275efbb88966a27a7ce54cc31465b563d7ad0 /linguistics-components
parent	380b9fa780ead9bcce0e824f7b6ee305e37dec43 (diff)