From e7e659e9d26401c8c36300d4760d4e34acd26d0a Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Tue, 28 Sep 2021 21:19:41 +0200 Subject: encode -> embed --- .../yahoo/vespa/indexinglanguage/ScriptParser.java | 2 +- .../indexinglanguage/ScriptParserContext.java | 13 ++-- .../expressions/EmbedExpression.java | 69 ++++++++++++++++++++++ .../expressions/EncodeExpression.java | 69 ---------------------- .../indexinglanguage/expressions/Expression.java | 8 +-- .../expressions/ScriptExpression.java | 8 +-- .../expressions/StatementExpression.java | 10 ++-- indexinglanguage/src/main/javacc/IndexingParser.jj | 20 +++---- .../indexinglanguage/ScriptParserTestCase.java | 4 +- .../vespa/indexinglanguage/ScriptTestCase.java | 18 ++---- .../parser/DefaultFieldNameTestCase.java | 4 +- .../parser/ExpressionTestCase.java | 6 +- 12 files changed, 111 insertions(+), 120 deletions(-) create mode 100644 indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EmbedExpression.java delete mode 100644 indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java (limited to 'indexinglanguage') diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParser.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParser.java index 34da5b47655..649095d1db8 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParser.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParser.java @@ -62,7 +62,7 @@ public final class ScriptParser { parser.setAnnotatorConfig(context.getAnnotatorConfig()); parser.setDefaultFieldName(context.getDefaultFieldName()); parser.setLinguistics(context.getLinguistcs()); - parser.setEncoder(context.getEncoder()); + parser.setEmbedder(context.getEmbedder()); try { return method.call(parser); } catch (ParseException e) { diff --git 
a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParserContext.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParserContext.java index 06be91703fa..77c2af8dd42 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParserContext.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParserContext.java @@ -2,8 +2,7 @@ package com.yahoo.vespa.indexinglanguage; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; -import com.yahoo.language.simple.SimpleLinguistics; +import com.yahoo.language.process.Embedder; import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig; import com.yahoo.vespa.indexinglanguage.parser.CharStream; @@ -14,13 +13,13 @@ public class ScriptParserContext { private AnnotatorConfig annotatorConfig = new AnnotatorConfig(); private Linguistics linguistics; - private final Encoder encoder; + private final Embedder embedder; private String defaultFieldName = null; private CharStream inputStream = null; - public ScriptParserContext(Linguistics linguistics, Encoder encoder) { + public ScriptParserContext(Linguistics linguistics, Embedder embedder) { this.linguistics = linguistics; - this.encoder = encoder; + this.embedder = embedder; } public AnnotatorConfig getAnnotatorConfig() { @@ -41,8 +40,8 @@ public class ScriptParserContext { return this; } - public Encoder getEncoder() { - return encoder; + public Embedder getEmbedder() { + return embedder; } public String getDefaultFieldName() { diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EmbedExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EmbedExpression.java new file mode 100644 index 00000000000..aa579ed729e --- /dev/null +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EmbedExpression.java @@ -0,0 +1,69 @@ +// Copyright Yahoo. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.indexinglanguage.expressions; + +import com.yahoo.document.DataType; +import com.yahoo.document.TensorDataType; +import com.yahoo.document.datatypes.StringFieldValue; +import com.yahoo.document.datatypes.TensorFieldValue; +import com.yahoo.language.process.Embedder; +import com.yahoo.tensor.Tensor; +import com.yahoo.tensor.TensorType; + +/** + * Embeds a string in a tensor space using the configured Embedder component + * + * @author bratseth + */ +public class EmbedExpression extends Expression { + + private final Embedder embedder; + + /** The target type we are embedding into. Assigned by setStatementOutputType and by doVerify. */ + private TensorType targetType; + + public EmbedExpression(Embedder embedder) { + super(DataType.STRING); + this.embedder = embedder; + } + + @Override + public void setStatementOutputType(DataType type) { + targetType = ((TensorDataType)type).getTensorType(); + } + + @Override + protected void doExecute(ExecutionContext context) { + StringFieldValue input = (StringFieldValue) context.getValue(); + Tensor tensor = embedder.embed(input.getString(), context.getLanguage(), targetType); + context.setValue(new TensorFieldValue(tensor)); + } + + @Override + protected void doVerify(VerificationContext context) { + String outputField = context.getOutputField(); + if (outputField == null) + throw new VerificationException(this, "No output field in this statement: " + + "Don't know what tensor type to embed into."); + DataType outputFieldType = context.getInputType(this, outputField); + if ( ! 
(outputFieldType instanceof TensorDataType) ) + throw new VerificationException(this, "The type of the output field " + outputField + + " is not a tensor but " + outputFieldType); + targetType = ((TensorDataType) outputFieldType).getTensorType(); + context.setValueType(createdOutputType()); + } + + @Override + public DataType createdOutputType() { + return new TensorDataType(targetType); + } + + @Override + public String toString() { return "embed"; } + + @Override + public int hashCode() { return 1; } + + @Override + public boolean equals(Object o) { return o instanceof EmbedExpression; } + +} diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java deleted file mode 100644 index f84da9ddef8..00000000000 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.indexinglanguage.expressions; - -import com.yahoo.document.DataType; -import com.yahoo.document.TensorDataType; -import com.yahoo.document.datatypes.StringFieldValue; -import com.yahoo.document.datatypes.TensorFieldValue; -import com.yahoo.language.process.Encoder; -import com.yahoo.tensor.Tensor; -import com.yahoo.tensor.TensorType; - -/** - * Encodes a string as a tensor using the configured Encoder component - * - * @author bratseth - */ -public class EncodeExpression extends Expression { - - private final Encoder encoder; - - /** The target type we are encoding to. Set during verification. 
*/ - private TensorType targetType; - - public EncodeExpression(Encoder encoder) { - super(DataType.STRING); - this.encoder = encoder; - } - - @Override - public void setStatementOutputType(DataType type) { - targetType = ((TensorDataType)type).getTensorType(); - } - - @Override - protected void doExecute(ExecutionContext context) { - StringFieldValue input = (StringFieldValue) context.getValue(); - Tensor tensor = encoder.encode(input.getString(), context.getLanguage(), targetType); - context.setValue(new TensorFieldValue(tensor)); - } - - @Override - protected void doVerify(VerificationContext context) { - String outputField = context.getOutputField(); - if (outputField == null) - throw new VerificationException(this, "No output field in this statement: " + - "Don't know what tensor type to encode to."); - DataType outputFieldType = context.getInputType(this, outputField); - if ( ! (outputFieldType instanceof TensorDataType) ) - throw new VerificationException(this, "The type of the output field " + outputField + - " is not a tensor but " + outputField); - targetType = ((TensorDataType) outputFieldType).getTensorType(); - context.setValueType(createdOutputType()); - } - - @Override - public DataType createdOutputType() { - return new TensorDataType(targetType); - } - - @Override - public String toString() { return "encode"; } - - @Override - public int hashCode() { return 1; } - - @Override - public boolean equals(Object o) { return o instanceof EncodeExpression; } - -} diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java index 67459c2b035..20a0c9804a9 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java @@ -6,7 +6,7 @@ import com.yahoo.document.Document; import 
com.yahoo.document.DocumentUpdate; import com.yahoo.document.datatypes.FieldValue; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.*; import com.yahoo.vespa.indexinglanguage.parser.IndexingInput; @@ -188,11 +188,11 @@ public abstract class Expression extends Selectable { /** Creates an expression with simple lingustics for testing */ public static Expression fromString(String expression) throws ParseException { - return fromString(expression, new SimpleLinguistics(), Encoder.throwsOnUse); + return fromString(expression, new SimpleLinguistics(), Embedder.throwsOnUse); } - public static Expression fromString(String expression, Linguistics linguistics, Encoder encoder) throws ParseException { - return newInstance(new ScriptParserContext(linguistics, encoder).setInputStream(new IndexingInput(expression))); + public static Expression fromString(String expression, Linguistics linguistics, Embedder embedder) throws ParseException { + return newInstance(new ScriptParserContext(linguistics, embedder).setInputStream(new IndexingInput(expression))); } public static Expression newInstance(ScriptParserContext context) throws ParseException { diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java index 7317cb2216f..b5f71813de3 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java @@ -4,7 +4,7 @@ package com.yahoo.vespa.indexinglanguage.expressions; import com.yahoo.document.DataType; import com.yahoo.document.datatypes.FieldValue; import com.yahoo.language.Linguistics; -import 
com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.ScriptParser; import com.yahoo.vespa.indexinglanguage.ScriptParserContext; @@ -92,11 +92,11 @@ public final class ScriptExpression extends ExpressionList /** Creates an expression with simple lingustics for testing */ @SuppressWarnings("deprecation") public static ScriptExpression fromString(String expression) throws ParseException { - return fromString(expression, new SimpleLinguistics(), Encoder.throwsOnUse); + return fromString(expression, new SimpleLinguistics(), Embedder.throwsOnUse); } - public static ScriptExpression fromString(String expression, Linguistics linguistics, Encoder encoder) throws ParseException { - return newInstance(new ScriptParserContext(linguistics, encoder).setInputStream(new IndexingInput(expression))); + public static ScriptExpression fromString(String expression, Linguistics linguistics, Embedder embedder) throws ParseException { + return newInstance(new ScriptParserContext(linguistics, embedder).setInputStream(new IndexingInput(expression))); } public static ScriptExpression newInstance(ScriptParserContext config) throws ParseException { diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java index 145133e210d..7d157af1a19 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java @@ -2,9 +2,8 @@ package com.yahoo.vespa.indexinglanguage.expressions; import com.yahoo.document.DataType; -import com.yahoo.document.TensorDataType; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import 
com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.ScriptParser; import com.yahoo.vespa.indexinglanguage.ScriptParserContext; @@ -12,7 +11,6 @@ import com.yahoo.vespa.indexinglanguage.parser.IndexingInput; import com.yahoo.vespa.indexinglanguage.parser.ParseException; import java.util.Arrays; -import java.util.Collection; import java.util.Iterator; import java.util.LinkedList; import java.util.List; @@ -100,11 +98,11 @@ public final class StatementExpression extends ExpressionList { /** Creates an expression with simple lingustics for testing */ public static StatementExpression fromString(String expression) throws ParseException { - return fromString(expression, new SimpleLinguistics(), Encoder.throwsOnUse); + return fromString(expression, new SimpleLinguistics(), Embedder.throwsOnUse); } - public static StatementExpression fromString(String expression, Linguistics linguistics, Encoder encoder) throws ParseException { - return newInstance(new ScriptParserContext(linguistics, encoder).setInputStream(new IndexingInput(expression))); + public static StatementExpression fromString(String expression, Linguistics linguistics, Embedder embedder) throws ParseException { + return newInstance(new ScriptParserContext(linguistics, embedder).setInputStream(new IndexingInput(expression))); } public static StatementExpression newInstance(ScriptParserContext config) throws ParseException { diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj index 4533a17954c..3eee4ea6f08 100644 --- a/indexinglanguage/src/main/javacc/IndexingParser.jj +++ b/indexinglanguage/src/main/javacc/IndexingParser.jj @@ -34,7 +34,7 @@ import com.yahoo.text.StringUtilities; import com.yahoo.vespa.indexinglanguage.expressions.*; import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig; import com.yahoo.language.process.StemMode; -import 
com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.Linguistics; /** @@ -45,7 +45,7 @@ public class IndexingParser { private String defaultFieldName; private Linguistics linguistics; - private Encoder encoder; + private Embedder embedder; private AnnotatorConfig annotatorCfg; public IndexingParser(String str) { @@ -62,8 +62,8 @@ public class IndexingParser { return this; } - public IndexingParser setEncoder(Encoder encoder) { - this.encoder = encoder; + public IndexingParser setEmbedder(Embedder embedder) { + this.embedder = embedder; return this; } @@ -157,7 +157,7 @@ TOKEN : | | | - | + | | | | @@ -283,7 +283,7 @@ Expression value() : val = base64EncodeExp() | val = clearStateExp() | val = echoExp() | - val = encodeExp() | + val = embedExp() | val = exactExp() | val = flattenExp() | val = forEachExp() | @@ -365,10 +365,10 @@ Expression echoExp() : { } { return new EchoExpression(); } } -Expression encodeExp() : { } +Expression embedExp() : { } { - ( ) - { return new EncodeExpression(encoder); } + ( ) + { return new EmbedExpression(embedder); } } Expression exactExp() : { } @@ -744,7 +744,7 @@ String identifier() : | | | - | + | | | | diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptParserTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptParserTestCase.java index 32e38dbee6f..06d185339a6 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptParserTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptParserTestCase.java @@ -1,7 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.vespa.indexinglanguage; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.expressions.EchoExpression; import com.yahoo.vespa.indexinglanguage.expressions.InputExpression; @@ -96,7 +96,7 @@ public class ScriptParserTestCase { } private static ScriptParserContext newContext(String input) { - return new ScriptParserContext(new SimpleLinguistics(), Encoder.throwsOnUse).setInputStream(new IndexingInput(input)); + return new ScriptParserContext(new SimpleLinguistics(), Embedder.throwsOnUse).setInputStream(new IndexingInput(input)); } } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java index 9d3d0abb256..188426b1a06 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java @@ -5,26 +5,20 @@ import com.yahoo.document.DataType; import com.yahoo.document.Document; import com.yahoo.document.DocumentType; import com.yahoo.document.Field; -import com.yahoo.document.FieldPath; import com.yahoo.document.TensorDataType; import com.yahoo.document.datatypes.BoolFieldValue; -import com.yahoo.document.datatypes.FieldValue; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.document.datatypes.TensorFieldValue; import com.yahoo.language.Language; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.tensor.Tensor; -import com.yahoo.tensor.TensorAddress; import com.yahoo.tensor.TensorType; import com.yahoo.vespa.indexinglanguage.expressions.*; import com.yahoo.vespa.indexinglanguage.parser.ParseException; import org.junit.Test; -import java.util.Iterator; import 
java.util.List; -import java.util.Map; -import java.util.Set; import static org.junit.Assert.*; @@ -106,9 +100,9 @@ public class ScriptTestCase { } @Test - public void testEncode() throws ParseException { + public void testEmbed() throws ParseException { TensorType tensorType = TensorType.fromSpec("tensor(d[4])"); - var expression = Expression.fromString("input myText | encode | attribute 'myTensor'", + var expression = Expression.fromString("input myText | embed | attribute 'myTensor'", new SimpleLinguistics(), new MockEncoder()); @@ -131,15 +125,15 @@ public class ScriptTestCase { ((TensorFieldValue)adapter.values.get("myTensor")).getTensor().get()); } - private static class MockEncoder implements Encoder { + private static class MockEncoder implements Embedder { @Override - public List encode(String text, Language language) { + public List embed(String text, Language language) { return null; } @Override - public Tensor encode(String text, Language language, TensorType tensorType) { + public Tensor embed(String text, Language language, TensorType tensorType) { return Tensor.from(tensorType, "[7,3,0,0]"); } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/DefaultFieldNameTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/DefaultFieldNameTestCase.java index 2a71aeb564c..ea0d9f9cf69 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/DefaultFieldNameTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/DefaultFieldNameTestCase.java @@ -1,7 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.vespa.indexinglanguage.parser; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.ScriptParserContext; import com.yahoo.vespa.indexinglanguage.expressions.Expression; @@ -19,7 +19,7 @@ public class DefaultFieldNameTestCase { public void requireThatDefaultFieldNameIsAppliedWhenArgumentIsMissing() throws ParseException { IndexingInput input = new IndexingInput("input"); InputExpression exp = (InputExpression)Expression.newInstance(new ScriptParserContext(new SimpleLinguistics(), - Encoder.throwsOnUse) + Embedder.throwsOnUse) .setInputStream(input) .setDefaultFieldName("foo")); assertEquals("foo", exp.getFieldName()); diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java index d7c5ae5c15a..44aa562028c 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java @@ -2,7 +2,7 @@ package com.yahoo.vespa.indexinglanguage.parser; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.expressions.*; import org.junit.Test; @@ -85,9 +85,9 @@ public class ExpressionTestCase { private static void assertExpression(Class expectedClass, String str) throws ParseException { Linguistics linguistics = new SimpleLinguistics(); - Expression foo = Expression.fromString(str, linguistics, Encoder.throwsOnUse); + Expression foo = Expression.fromString(str, linguistics, Embedder.throwsOnUse); assertEquals(expectedClass, foo.getClass()); - Expression bar = 
Expression.fromString(foo.toString(), linguistics, Encoder.throwsOnUse); + Expression bar = Expression.fromString(foo.toString(), linguistics, Embedder.throwsOnUse); assertEquals(foo.hashCode(), bar.hashCode()); assertEquals(foo, bar); } -- cgit v1.2.3