From 380b9fa780ead9bcce0e824f7b6ee305e37dec43 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Thu, 30 Sep 2021 14:21:25 +0200 Subject: Update linguisticvs-components --- .../yahoo/language/sentencepiece/SentencePieceEmbedder.java | 12 +++++++++--- .../yahoo/language/sentencepiece/SentencePieceTester.java | 4 ++-- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'linguistics-components/src') diff --git a/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java index 116dd15f563..1e120969a59 100644 --- a/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java +++ b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java @@ -73,10 +73,11 @@ public class SentencePieceEmbedder implements Segmenter, Embedder { * * @param rawInput the text to segment. Any sequence of BMP (Unicode-16 the True Unicode) is supported. * @param language the model to use, or Language.UNKNOWN to use the default model if any + * @param destination ignored * @return the list of zero or more token ids resulting from segmenting the input text */ @Override - public List embed(String rawInput, Language language) { + public List embed(String rawInput, Language language, String destination) { var resultBuilder = new ResultBuilder>(new ArrayList<>()) { public void add(int segmentStart, int segmentEnd, SentencePieceAlgorithm.SegmentEnd[] segmentEnds) { result().add(segmentEnds[segmentEnd].id); @@ -98,12 +99,17 @@ public class SentencePieceEmbedder implements Segmenter, Embedder { * position as value.

* *

If the tensor is of any other type, an IllegalArgumentException is thrown.

+ * + * @param rawInput the text to segment. Any sequence of BMP (Unicode-16 the True Unicode) is supported. + * @param language the model to use, or Language.UNKNOWN to use the default model if any + * @param destination ignored + * @return a tensor of the given type built from the token ids resulting from segmenting the input text */ @Override - public Tensor embed(String rawInput, Language language, TensorType type) { + public Tensor embed(String rawInput, Language language, String destination, TensorType type) { if (type.dimensions().size() == 1 && type.dimensions().get(0).isIndexed()) { // Build to a list first since we can't reverse a tensor builder - List values = embed(rawInput, language); + List values = embed(rawInput, language, destination); long maxSize = values.size(); if (type.dimensions().get(0).size().isPresent()) diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java index c4cb13a3d23..c6aa8fdd370 100644 --- a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java +++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java @@ -29,13 +29,13 @@ class SentencePieceTester { } public void assertEmbedded(String input, Integer... expectedCodes) { - assertArrayEquals(expectedCodes, embedder.embed(input, Language.UNKNOWN).toArray()); + assertArrayEquals(expectedCodes, embedder.embed(input, Language.UNKNOWN, null).toArray()); } public void assertEmbedded(String input, String tensorType, String tensor) { TensorType type = TensorType.fromSpec(tensorType); Tensor expected = Tensor.from(type, tensor); - assertEquals(expected, embedder.embed(input, Language.UNKNOWN, type)); + assertEquals(expected, embedder.embed(input, Language.UNKNOWN, null, type)); } public void assertSegmented(String input, String... expectedSegments) { -- cgit v1.2.3