From e7e659e9d26401c8c36300d4760d4e34acd26d0a Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Tue, 28 Sep 2021 21:19:41 +0200 Subject: encode -> embed --- .../yahoo/searchdefinition/document/SDField.java | 8 +- .../fieldoperation/IndexingOperation.java | 8 +- .../container/ApplicationContainerCluster.java | 2 +- config-model/src/main/javacc/SDParser.jj | 8 +- .../testutil/HandlersConfigurerTestWrapper.java | 4 +- .../language/provider/DefaultEmbedderProvider.java | 26 +++ .../language/provider/DefaultEncoderProvider.java | 31 --- container-search/abi-spec.json | 12 +- .../src/main/java/com/yahoo/search/Query.java | 26 +-- .../com/yahoo/search/handler/SearchHandler.java | 20 +- .../query/profile/QueryProfileProperties.java | 12 +- .../query/profile/types/ConversionContext.java | 12 +- .../search/query/profile/types/FieldType.java | 2 - .../query/profile/types/PrimitiveFieldType.java | 2 - .../search/query/profile/types/QueryFieldType.java | 2 - .../query/profile/types/QueryProfileFieldType.java | 2 - .../query/profile/types/TensorFieldType.java | 15 +- .../search/query/properties/QueryProperties.java | 10 +- .../grouping/vespa/IntegerEmbedderTestCase.java | 35 ++++ .../grouping/vespa/IntegerEncoderTestCase.java | 35 ---- .../types/test/QueryProfileTypeTestCase.java | 27 ++- .../yahoo/docprocs/indexing/IndexingProcessor.java | 6 +- .../com/yahoo/docprocs/indexing/ScriptManager.java | 10 +- .../indexing/IndexingProcessorTestCase.java | 4 +- .../docprocs/indexing/ScriptManagerTestCase.java | 10 +- .../yahoo/vespa/indexinglanguage/ScriptParser.java | 2 +- .../indexinglanguage/ScriptParserContext.java | 13 +- .../expressions/EmbedExpression.java | 69 +++++++ .../expressions/EncodeExpression.java | 69 ------- .../indexinglanguage/expressions/Expression.java | 8 +- .../expressions/ScriptExpression.java | 8 +- .../expressions/StatementExpression.java | 10 +- indexinglanguage/src/main/javacc/IndexingParser.jj | 20 +- .../indexinglanguage/ScriptParserTestCase.java | 4 +- 
.../vespa/indexinglanguage/ScriptTestCase.java | 18 +- .../parser/DefaultFieldNameTestCase.java | 4 +- .../parser/ExpressionTestCase.java | 6 +- linguistics-components/abi-spec.json | 20 +- .../sentencepiece/SentencePieceEmbedder.java | 219 ++++++++++++++++++++ .../sentencepiece/SentencePieceEncoder.java | 220 --------------------- .../language.sentencepiece.sentence-piece.def | 2 +- .../SentencePieceConfigurationTest.java | 8 +- .../language/sentencepiece/SentencePieceTest.java | 18 +- .../sentencepiece/SentencePieceTester.java | 20 +- linguistics/abi-spec.json | 16 +- .../java/com/yahoo/language/process/Embedder.java | 56 ++++++ .../java/com/yahoo/language/process/Encoder.java | 56 ------ 47 files changed, 585 insertions(+), 610 deletions(-) create mode 100644 container-core/src/main/java/com/yahoo/language/provider/DefaultEmbedderProvider.java delete mode 100644 container-core/src/main/java/com/yahoo/language/provider/DefaultEncoderProvider.java create mode 100644 container-search/src/test/java/com/yahoo/search/grouping/vespa/IntegerEmbedderTestCase.java delete mode 100644 container-search/src/test/java/com/yahoo/search/grouping/vespa/IntegerEncoderTestCase.java create mode 100644 indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EmbedExpression.java delete mode 100644 indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java create mode 100644 linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java delete mode 100644 linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java create mode 100644 linguistics/src/main/java/com/yahoo/language/process/Embedder.java delete mode 100644 linguistics/src/main/java/com/yahoo/language/process/Encoder.java diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java b/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java index 
9c89517f72d..02df81fbbb3 100644 --- a/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java +++ b/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java @@ -9,7 +9,7 @@ import com.yahoo.document.MapDataType; import com.yahoo.document.StructDataType; import com.yahoo.document.TensorDataType; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.searchdefinition.Index; import com.yahoo.searchdefinition.Search; @@ -426,12 +426,12 @@ public class SDField extends Field implements TypedKey, FieldOperationContainer, /** Parse an indexing expression which will use the simple linguistics implementatino suitable for testing */ public void parseIndexingScript(String script) { - parseIndexingScript(script, new SimpleLinguistics(), Encoder.throwsOnUse); + parseIndexingScript(script, new SimpleLinguistics(), Embedder.throwsOnUse); } - public void parseIndexingScript(String script, Linguistics linguistics, Encoder encoder) { + public void parseIndexingScript(String script, Linguistics linguistics, Embedder embedder) { try { - ScriptParserContext config = new ScriptParserContext(linguistics, encoder); + ScriptParserContext config = new ScriptParserContext(linguistics, embedder); config.setInputStream(new IndexingInput(script)); setIndexingScript(ScriptExpression.newInstance(config)); } catch (ParseException e) { diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java b/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java index 87fa74b92fe..18e187fd921 100644 --- a/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java +++ b/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java @@ -2,7 +2,7 @@ package 
com.yahoo.searchdefinition.fieldoperation; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.searchdefinition.document.SDField; import com.yahoo.searchdefinition.parser.ParseException; @@ -30,13 +30,13 @@ public class IndexingOperation implements FieldOperation { /** Creates an indexing operation which will use the simple linguistics implementation suitable for testing */ public static IndexingOperation fromStream(SimpleCharStream input, boolean multiLine) throws ParseException { - return fromStream(input, multiLine, new SimpleLinguistics(), Encoder.throwsOnUse); + return fromStream(input, multiLine, new SimpleLinguistics(), Embedder.throwsOnUse); } public static IndexingOperation fromStream(SimpleCharStream input, boolean multiLine, - Linguistics linguistics, Encoder encoder) + Linguistics linguistics, Embedder embedder) throws ParseException { - ScriptParserContext config = new ScriptParserContext(linguistics, encoder); + ScriptParserContext config = new ScriptParserContext(linguistics, embedder); config.setAnnotatorConfig(new AnnotatorConfig()); config.setInputStream(input); ScriptExpression exp; diff --git a/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java b/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java index 5574082e334..0bb04a1266d 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java @@ -100,7 +100,7 @@ public final class ApplicationContainerCluster extends ContainerCluster { + + @Inject + public DefaultEmbedderProvider() { } + + @Override + public Embedder get() { return Embedder.throwsOnUse; } + + @Override + public void deconstruct() {} + +} diff --git 
a/container-core/src/main/java/com/yahoo/language/provider/DefaultEncoderProvider.java b/container-core/src/main/java/com/yahoo/language/provider/DefaultEncoderProvider.java deleted file mode 100644 index f8550d04d1c..00000000000 --- a/container-core/src/main/java/com/yahoo/language/provider/DefaultEncoderProvider.java +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.provider; - -import com.google.inject.Inject; -import com.yahoo.container.di.componentgraph.Provider; -import com.yahoo.language.Language; -import com.yahoo.language.process.Encoder; -import com.yahoo.tensor.Tensor; -import com.yahoo.tensor.TensorType; - -import java.util.List; - -/** - * Provides the default encoder implementation if no encoder component has been explicitly configured - * (dependency injection will fallback to providers if no components of the requested type is found). - * - * @author bratseth - */ -@SuppressWarnings("unused") // Injected -public class DefaultEncoderProvider implements Provider { - - @Inject - public DefaultEncoderProvider() { } - - @Override - public Encoder get() { return Encoder.throwsOnUse; } - - @Override - public void deconstruct() {} - -} diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json index 7016eff3185..40071f90c34 100644 --- a/container-search/abi-spec.json +++ b/container-search/abi-spec.json @@ -1801,8 +1801,8 @@ "public java.util.Map getRequestMap()", "public com.yahoo.search.Query$Builder setQueryProfile(com.yahoo.search.query.profile.compiled.CompiledQueryProfile)", "public com.yahoo.search.query.profile.compiled.CompiledQueryProfile getQueryProfile()", - "public com.yahoo.search.Query$Builder setEncoder(com.yahoo.language.process.Encoder)", - "public com.yahoo.language.process.Encoder getEncoder()", + "public com.yahoo.search.Query$Builder setEmbedder(com.yahoo.language.process.Embedder)", + 
"public com.yahoo.language.process.Embedder getEmbedder()", "public com.yahoo.search.Query build()" ], "fields": [] @@ -4258,7 +4258,7 @@ "public" ], "methods": [ - "public void (com.yahoo.statistics.Statistics, com.yahoo.jdisc.Metric, com.yahoo.container.handler.threadpool.ContainerThreadPool, com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.container.core.ContainerHttpConfig, com.yahoo.language.process.Encoder, com.yahoo.search.searchchain.ExecutionFactory)", + "public void (com.yahoo.statistics.Statistics, com.yahoo.jdisc.Metric, com.yahoo.container.handler.threadpool.ContainerThreadPool, com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.container.core.ContainerHttpConfig, com.yahoo.language.process.Embedder, com.yahoo.search.searchchain.ExecutionFactory)", "public void (com.yahoo.statistics.Statistics, com.yahoo.jdisc.Metric, com.yahoo.container.handler.threadpool.ContainerThreadPool, com.yahoo.container.logging.AccessLog, com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.container.core.ContainerHttpConfig, com.yahoo.search.searchchain.ExecutionFactory)", "public void (com.yahoo.statistics.Statistics, com.yahoo.jdisc.Metric, java.util.concurrent.Executor, com.yahoo.container.logging.AccessLog, com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.container.core.ContainerHttpConfig, com.yahoo.search.searchchain.ExecutionFactory)", "public void (com.yahoo.statistics.Statistics, com.yahoo.jdisc.Metric, java.util.concurrent.Executor, com.yahoo.container.logging.AccessLog, com.yahoo.search.query.profile.config.QueryProfilesConfig, com.yahoo.container.core.ContainerHttpConfig, com.yahoo.search.searchchain.ExecutionFactory)", @@ -5885,7 +5885,7 @@ ], "methods": [ "public void (com.yahoo.search.query.profile.compiled.CompiledQueryProfile)", - "public void (com.yahoo.search.query.profile.compiled.CompiledQueryProfile, com.yahoo.language.process.Encoder)", 
+ "public void (com.yahoo.search.query.profile.compiled.CompiledQueryProfile, com.yahoo.language.process.Embedder)", "public com.yahoo.search.query.profile.compiled.CompiledQueryProfile getQueryProfile()", "public java.lang.Object get(com.yahoo.processing.request.CompoundName, java.util.Map, com.yahoo.processing.request.Properties)", "public void set(com.yahoo.processing.request.CompoundName, java.lang.Object, java.util.Map)", @@ -6259,7 +6259,7 @@ "public" ], "methods": [ - "public void (com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.language.process.Encoder, java.util.Map)", + "public void (com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.language.process.Embedder, java.util.Map)", "public static com.yahoo.search.query.profile.types.ConversionContext empty()" ], "fields": [] @@ -6531,7 +6531,7 @@ "public" ], "methods": [ - "public void (com.yahoo.search.Query, com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.language.process.Encoder)", + "public void (com.yahoo.search.Query, com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.language.process.Embedder)", "public void setParentQuery(com.yahoo.search.Query)", "public java.lang.Object get(com.yahoo.processing.request.CompoundName, java.util.Map, com.yahoo.processing.request.Properties)", "public void set(com.yahoo.processing.request.CompoundName, java.lang.Object, java.util.Map)", diff --git a/container-search/src/main/java/com/yahoo/search/Query.java b/container-search/src/main/java/com/yahoo/search/Query.java index 06b71599103..08ebd74da5a 100644 --- a/container-search/src/main/java/com/yahoo/search/Query.java +++ b/container-search/src/main/java/com/yahoo/search/Query.java @@ -7,7 +7,7 @@ import com.yahoo.collections.Tuple2; import com.yahoo.component.Version; import com.yahoo.container.jdisc.HttpRequest; import com.yahoo.fs4.MapEncoder; -import com.yahoo.language.process.Encoder; +import 
com.yahoo.language.process.Embedder; import com.yahoo.prelude.fastsearch.DocumentDatabase; import com.yahoo.prelude.query.Highlight; import com.yahoo.prelude.query.textualrepresentation.TextualQueryRepresentation; @@ -334,32 +334,32 @@ public class Query extends com.yahoo.processing.Request implements Cloneable { public Query(HttpRequest request, Map requestMap, CompiledQueryProfile queryProfile) { super(new QueryPropertyAliases(propertyAliases)); this.httpRequest = request; - init(requestMap, queryProfile, Encoder.throwsOnUse); + init(requestMap, queryProfile, Embedder.throwsOnUse); } // TODO: Deprecate most constructors above here private Query(Builder builder) { - this(builder.getRequest(), builder.getRequestMap(), builder.getQueryProfile(), builder.getEncoder()); + this(builder.getRequest(), builder.getRequestMap(), builder.getQueryProfile(), builder.getEmbedder()); } - private Query(HttpRequest request, Map requestMap, CompiledQueryProfile queryProfile, Encoder encoder) { + private Query(HttpRequest request, Map requestMap, CompiledQueryProfile queryProfile, Embedder embedder) { super(new QueryPropertyAliases(propertyAliases)); this.httpRequest = request; - init(requestMap, queryProfile, encoder); + init(requestMap, queryProfile, embedder); } - private void init(Map requestMap, CompiledQueryProfile queryProfile, Encoder encoder) { + private void init(Map requestMap, CompiledQueryProfile queryProfile, Embedder embedder) { startTime = httpRequest.getJDiscRequest().creationTime(TimeUnit.MILLISECONDS); if (queryProfile != null) { // Move all request parameters to the query profile just to validate that the parameter settings are legal - Properties queryProfileProperties = new QueryProfileProperties(queryProfile, encoder); + Properties queryProfileProperties = new QueryProfileProperties(queryProfile, embedder); properties().chain(queryProfileProperties); // TODO: Just checking legality rather than actually setting would be faster 
setPropertiesFromRequestMap(requestMap, properties(), true); // Adds errors to the query for illegal set attempts // Create the full chain - properties().chain(new QueryProperties(this, queryProfile.getRegistry(), encoder)). + properties().chain(new QueryProperties(this, queryProfile.getRegistry(), embedder)). chain(new ModelObjectMap()). chain(new RequestContextProperties(requestMap)). chain(queryProfileProperties). @@ -378,7 +378,7 @@ public class Query extends com.yahoo.processing.Request implements Cloneable { } else { // bypass these complications if there is no query profile to get values from and validate against properties(). - chain(new QueryProperties(this, CompiledQueryProfileRegistry.empty, encoder)). + chain(new QueryProperties(this, CompiledQueryProfileRegistry.empty, embedder)). chain(new PropertyMap()). chain(new DefaultProperties()); setPropertiesFromRequestMap(requestMap, properties(), false); @@ -1130,7 +1130,7 @@ public class Query extends com.yahoo.processing.Request implements Cloneable { private HttpRequest request = null; private Map requestMap = null; private CompiledQueryProfile queryProfile = null; - private Encoder encoder = Encoder.throwsOnUse; + private Embedder embedder = Embedder.throwsOnUse; public Builder setRequest(String query) { request = HttpRequest.createTestRequest(query, com.yahoo.jdisc.http.HttpRequest.Method.GET); @@ -1168,12 +1168,12 @@ public class Query extends com.yahoo.processing.Request implements Cloneable { /** Returns the query profile of this query, or null if none. */ public CompiledQueryProfile getQueryProfile() { return queryProfile; } - public Builder setEncoder(Encoder encoder) { - this.encoder = encoder; + public Builder setEmbedder(Embedder embedder) { + this.embedder = embedder; return this; } - public Encoder getEncoder() { return encoder; } + public Embedder getEmbedder() { return embedder; } /** Creates a new query from this builder. No properties are required to before calling this. 
*/ public Query build() { return new Query(this); } diff --git a/container-search/src/main/java/com/yahoo/search/handler/SearchHandler.java b/container-search/src/main/java/com/yahoo/search/handler/SearchHandler.java index d1e57a30206..c15aef44f3d 100644 --- a/container-search/src/main/java/com/yahoo/search/handler/SearchHandler.java +++ b/container-search/src/main/java/com/yahoo/search/handler/SearchHandler.java @@ -23,7 +23,7 @@ import com.yahoo.io.IOUtils; import com.yahoo.jdisc.Metric; import com.yahoo.jdisc.Request; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.net.HostName; import com.yahoo.net.UriTools; import com.yahoo.prelude.query.parser.ParseException; @@ -106,7 +106,7 @@ public class SearchHandler extends LoggingRequestHandler { private final String selfHostname = HostName.getLocalhost(); - private final Encoder encoder; + private final Embedder embedder; private final ExecutionFactory executionFactory; @@ -134,9 +134,9 @@ public class SearchHandler extends LoggingRequestHandler { ContainerThreadPool threadpool, CompiledQueryProfileRegistry queryProfileRegistry, ContainerHttpConfig config, - Encoder encoder, + Embedder embedder, ExecutionFactory executionFactory) { - this(statistics, metric, threadpool.executor(), queryProfileRegistry, encoder, executionFactory, + this(statistics, metric, threadpool.executor(), queryProfileRegistry, embedder, executionFactory, config.numQueriesToTraceOnDebugAfterConstruction(), config.hostResponseHeaderKey().equals("") ? Optional.empty() : Optional.of(config.hostResponseHeaderKey())); } @@ -170,7 +170,7 @@ public class SearchHandler extends LoggingRequestHandler { metric, executor, queryProfileRegistry, - Encoder.throwsOnUse, + Embedder.throwsOnUse, executionFactory, containerHttpConfig.numQueriesToTraceOnDebugAfterConstruction(), containerHttpConfig.hostResponseHeaderKey().equals("") ? 
@@ -192,7 +192,7 @@ public class SearchHandler extends LoggingRequestHandler { metric, executor, QueryProfileConfigurer.createFromConfig(queryProfileConfig).compile(), - Encoder.throwsOnUse, + Embedder.throwsOnUse, executionFactory, containerHttpConfig.numQueriesToTraceOnDebugAfterConstruction(), containerHttpConfig.hostResponseHeaderKey().equals("") ? @@ -210,7 +210,7 @@ public class SearchHandler extends LoggingRequestHandler { CompiledQueryProfileRegistry queryProfileRegistry, ExecutionFactory executionFactory, Optional hostResponseHeaderKey) { - this(statistics, metric, executor, queryProfileRegistry, Encoder.throwsOnUse, + this(statistics, metric, executor, queryProfileRegistry, Embedder.throwsOnUse, executionFactory, 0, hostResponseHeaderKey); } @@ -218,14 +218,14 @@ public class SearchHandler extends LoggingRequestHandler { Metric metric, Executor executor, CompiledQueryProfileRegistry queryProfileRegistry, - Encoder encoder, + Embedder embedder, ExecutionFactory executionFactory, long numQueriesToTraceOnDebugAfterStartup, Optional hostResponseHeaderKey) { super(executor, metric, true); log.log(Level.FINE, () -> "SearchHandler.init " + System.identityHashCode(this)); this.queryProfileRegistry = queryProfileRegistry; - this.encoder = encoder; + this.embedder = embedder; this.executionFactory = executionFactory; this.maxThreads = examineExecutor(executor); @@ -332,7 +332,7 @@ public class SearchHandler extends LoggingRequestHandler { Query query = new Query.Builder().setRequest(request) .setRequestMap(requestMap) .setQueryProfile(queryProfile) - .setEncoder(encoder) + .setEmbedder(embedder) .build(); boolean benchmarking = VespaHeaders.benchmarkOutput(request); diff --git a/container-search/src/main/java/com/yahoo/search/query/profile/QueryProfileProperties.java b/container-search/src/main/java/com/yahoo/search/query/profile/QueryProfileProperties.java index e555000272d..53be827073c 100644 --- 
a/container-search/src/main/java/com/yahoo/search/query/profile/QueryProfileProperties.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/QueryProfileProperties.java @@ -2,7 +2,7 @@ package com.yahoo.search.query.profile; import com.yahoo.collections.Pair; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.processing.IllegalInputException; import com.yahoo.processing.request.CompoundName; import com.yahoo.processing.request.properties.PropertyMap; @@ -30,7 +30,7 @@ import java.util.Map; public class QueryProfileProperties extends Properties { private final CompiledQueryProfile profile; - private final Encoder encoder; + private final Embedder embedder; // Note: The priority order is: values has precedence over references @@ -45,14 +45,14 @@ public class QueryProfileProperties extends Properties { private List> references = null; public QueryProfileProperties(CompiledQueryProfile profile) { - this(profile, Encoder.throwsOnUse); + this(profile, Embedder.throwsOnUse); } /** Creates an instance from a profile, throws an exception if the given profile is null */ - public QueryProfileProperties(CompiledQueryProfile profile, Encoder encoder) { + public QueryProfileProperties(CompiledQueryProfile profile, Embedder embedder) { Validator.ensureNotNull("The profile wrapped by this cannot be null", profile); this.profile = profile; - this.encoder = encoder; + this.embedder = embedder; } /** Returns the query profile backing this, or null if none */ @@ -122,7 +122,7 @@ public class QueryProfileProperties extends Properties { if (fieldDescription != null) { if (i == name.size() - 1) { // at the end of the path, check the assignment type value = fieldDescription.getType().convertFrom(value, new ConversionContext(profile.getRegistry(), - encoder, + embedder, context)); if (value == null) throw new IllegalInputException("'" + value + "' is not a " + diff --git 
a/container-search/src/main/java/com/yahoo/search/query/profile/types/ConversionContext.java b/container-search/src/main/java/com/yahoo/search/query/profile/types/ConversionContext.java index 4aa95741b06..e5b9eb1c1cd 100644 --- a/container-search/src/main/java/com/yahoo/search/query/profile/types/ConversionContext.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/types/ConversionContext.java @@ -2,7 +2,7 @@ package com.yahoo.search.query.profile.types; import com.yahoo.language.Language; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry; import java.util.Map; @@ -13,12 +13,12 @@ import java.util.Map; public class ConversionContext { private final CompiledQueryProfileRegistry registry; - private final Encoder encoder; + private final Embedder embedder; private final Language language; - public ConversionContext(CompiledQueryProfileRegistry registry, Encoder encoder, Map context) { + public ConversionContext(CompiledQueryProfileRegistry registry, Embedder embedder, Map context) { this.registry = registry; - this.encoder = encoder; + this.embedder = embedder; this.language = context.containsKey("language") ? 
Language.fromLanguageTag(context.get("language")) : Language.UNKNOWN; } @@ -27,14 +27,14 @@ public class ConversionContext { CompiledQueryProfileRegistry getRegistry() {return registry;} /** Returns the configured encoder, never null */ - Encoder getEncoder() { return encoder; } + Embedder getEncoder() { return embedder; } /** Returns the language, which is never null but may be UNKNOWN */ Language getLanguage() { return language; } /** Returns an empty context */ public static ConversionContext empty() { - return new ConversionContext(null, Encoder.throwsOnUse, Map.of()); + return new ConversionContext(null, Embedder.throwsOnUse, Map.of()); } } diff --git a/container-search/src/main/java/com/yahoo/search/query/profile/types/FieldType.java b/container-search/src/main/java/com/yahoo/search/query/profile/types/FieldType.java index 511b64c7b6e..7a06f9ef534 100644 --- a/container-search/src/main/java/com/yahoo/search/query/profile/types/FieldType.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/types/FieldType.java @@ -1,10 +1,8 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.search.query.profile.types; -import com.yahoo.language.process.Encoder; import com.yahoo.search.query.profile.QueryProfile; import com.yahoo.search.query.profile.QueryProfileRegistry; -import com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry; import com.yahoo.search.yql.YqlQuery; import com.yahoo.tensor.Tensor; import com.yahoo.tensor.TensorType; diff --git a/container-search/src/main/java/com/yahoo/search/query/profile/types/PrimitiveFieldType.java b/container-search/src/main/java/com/yahoo/search/query/profile/types/PrimitiveFieldType.java index b1a9820c6fa..f9d8950908b 100644 --- a/container-search/src/main/java/com/yahoo/search/query/profile/types/PrimitiveFieldType.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/types/PrimitiveFieldType.java @@ -1,9 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.search.query.profile.types; -import com.yahoo.language.process.Encoder; import com.yahoo.search.query.profile.QueryProfileRegistry; -import com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry; import static com.yahoo.text.Lowercase.toLowerCase; diff --git a/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryFieldType.java b/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryFieldType.java index 09c1a4d0cc0..cbae6402039 100644 --- a/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryFieldType.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryFieldType.java @@ -1,9 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.search.query.profile.types; -import com.yahoo.language.process.Encoder; import com.yahoo.search.query.profile.QueryProfileRegistry; -import com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry; import com.yahoo.search.yql.YqlQuery; /** diff --git a/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryProfileFieldType.java b/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryProfileFieldType.java index 6958318bee4..ff12224823f 100644 --- a/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryProfileFieldType.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryProfileFieldType.java @@ -1,11 +1,9 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.search.query.profile.types; -import com.yahoo.language.process.Encoder; import com.yahoo.search.query.profile.QueryProfile; import com.yahoo.search.query.profile.QueryProfileRegistry; import com.yahoo.search.query.profile.compiled.CompiledQueryProfile; -import com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry; /** * Represents a query profile field type which is a reference to a query profile. 
diff --git a/container-search/src/main/java/com/yahoo/search/query/profile/types/TensorFieldType.java b/container-search/src/main/java/com/yahoo/search/query/profile/types/TensorFieldType.java index 34a9f8d41c3..cd21f0b3a61 100644 --- a/container-search/src/main/java/com/yahoo/search/query/profile/types/TensorFieldType.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/types/TensorFieldType.java @@ -2,9 +2,8 @@ package com.yahoo.search.query.profile.types; import com.yahoo.language.Language; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.search.query.profile.QueryProfileRegistry; -import com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry; import com.yahoo.tensor.Tensor; import com.yahoo.tensor.TensorType; @@ -48,18 +47,18 @@ public class TensorFieldType extends FieldType { return convertFrom(o, context.getEncoder(), context.getLanguage()); } - private Object convertFrom(Object o, Encoder encoder, Language language) { + private Object convertFrom(Object o, Embedder embedder, Language language) { if (o instanceof Tensor) return o; - if (o instanceof String && ((String)o).startsWith("encode(")) return encode((String)o, encoder, language); + if (o instanceof String && ((String)o).startsWith("embed(")) return encode((String)o, embedder, language); if (o instanceof String) return Tensor.from(type, (String)o); return null; } - private Tensor encode(String s, Encoder encoder, Language language) { + private Tensor encode(String s, Embedder embedder, Language language) { if ( ! 
s.endsWith(")")) - throw new IllegalArgumentException("Expected any string enclosed in encode(), but the argument does not end by ')'"); - String text = s.substring("encode(".length(), s.length() - 1); - return encoder.encode(text, language, type); + throw new IllegalArgumentException("Expected any string enclosed in embed(), but the argument does not end by ')'"); + String text = s.substring("embed(".length(), s.length() - 1); + return embedder.embed(text, language, type); } public static TensorFieldType fromTypeString(String s) { diff --git a/container-search/src/main/java/com/yahoo/search/query/properties/QueryProperties.java b/container-search/src/main/java/com/yahoo/search/query/properties/QueryProperties.java index 02648f84066..3a426656185 100644 --- a/container-search/src/main/java/com/yahoo/search/query/properties/QueryProperties.java +++ b/container-search/src/main/java/com/yahoo/search/query/properties/QueryProperties.java @@ -1,7 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.search.query.properties; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.processing.IllegalInputException; import com.yahoo.processing.request.CompoundName; import com.yahoo.search.Query; @@ -34,12 +34,12 @@ public class QueryProperties extends Properties { private Query query; private final CompiledQueryProfileRegistry profileRegistry; - private final Encoder encoder; + private final Embedder embedder; - public QueryProperties(Query query, CompiledQueryProfileRegistry profileRegistry, Encoder encoder) { + public QueryProperties(Query query, CompiledQueryProfileRegistry profileRegistry, Embedder embedder) { this.query = query; this.profileRegistry = profileRegistry; - this.encoder = encoder; + this.embedder = embedder; } public void setParentQuery(Query query) { @@ -380,7 +380,7 @@ public class QueryProperties extends Properties { if (type == null) return value; // no type info -> keep as string FieldDescription field = type.getField(key); if (field == null) return value; // ditto - return field.getType().convertFrom(value, new ConversionContext(profileRegistry, encoder, context)); + return field.getType().convertFrom(value, new ConversionContext(profileRegistry, embedder, context)); } private void throwIllegalParameter(String key,String namespace) { diff --git a/container-search/src/test/java/com/yahoo/search/grouping/vespa/IntegerEmbedderTestCase.java b/container-search/src/test/java/com/yahoo/search/grouping/vespa/IntegerEmbedderTestCase.java new file mode 100644 index 00000000000..18a9f11e15e --- /dev/null +++ b/container-search/src/test/java/com/yahoo/search/grouping/vespa/IntegerEmbedderTestCase.java @@ -0,0 +1,35 @@ +// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.search.grouping.vespa; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +/** + * @author Simon Thoresen Hult + */ +public class IntegerEmbedderTestCase { + + @Test + public void requireThatIntEncoderWorksAsExpected() { + assertEncode("A", 0); + assertEncode("BC", 1); + assertEncode("CBI", 12); + assertEncode("CPG", 123); + assertEncode("DJKE", 1234); + assertEncode("EGAHC", 12345); + assertEncode("FDMEIA", 123456); + assertEncode("GCFKNAO", 1234567); + assertEncode("HBHIMCJM", 12345678); + assertEncode("HOLHJKCK", 123456789); + assertEncode("IJDCMAFKE", 1234567890); + assertEncode("IIKKEBPOF", -1163005939); + assertEncode("IECKEIKID", -559039810); + } + + private static void assertEncode(String expected, int toEncode) { + IntegerEncoder actual = new IntegerEncoder(); + actual.append(toEncode); + assertEquals(expected, actual.toString()); + } +} diff --git a/container-search/src/test/java/com/yahoo/search/grouping/vespa/IntegerEncoderTestCase.java b/container-search/src/test/java/com/yahoo/search/grouping/vespa/IntegerEncoderTestCase.java deleted file mode 100644 index 3b48ae35fcf..00000000000 --- a/container-search/src/test/java/com/yahoo/search/grouping/vespa/IntegerEncoderTestCase.java +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
-package com.yahoo.search.grouping.vespa; - -import org.junit.Test; - -import static org.junit.Assert.assertEquals; - -/** - * @author Simon Thoresen Hult - */ -public class IntegerEncoderTestCase { - - @Test - public void requireThatIntEncoderWorksAsExpected() { - assertEncode("A", 0); - assertEncode("BC", 1); - assertEncode("CBI", 12); - assertEncode("CPG", 123); - assertEncode("DJKE", 1234); - assertEncode("EGAHC", 12345); - assertEncode("FDMEIA", 123456); - assertEncode("GCFKNAO", 1234567); - assertEncode("HBHIMCJM", 12345678); - assertEncode("HOLHJKCK", 123456789); - assertEncode("IJDCMAFKE", 1234567890); - assertEncode("IIKKEBPOF", -1163005939); - assertEncode("IECKEIKID", -559039810); - } - - private static void assertEncode(String expected, int toEncode) { - IntegerEncoder actual = new IntegerEncoder(); - actual.append(toEncode); - assertEquals(expected, actual.toString()); - } -} diff --git a/container-search/src/test/java/com/yahoo/search/query/profile/types/test/QueryProfileTypeTestCase.java b/container-search/src/test/java/com/yahoo/search/query/profile/types/test/QueryProfileTypeTestCase.java index 45f53a1cdb9..e22263070e0 100644 --- a/container-search/src/test/java/com/yahoo/search/query/profile/types/test/QueryProfileTypeTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/query/profile/types/test/QueryProfileTypeTestCase.java @@ -4,7 +4,7 @@ package com.yahoo.search.query.profile.types.test; import com.yahoo.component.ComponentId; import com.yahoo.container.jdisc.HttpRequest; import com.yahoo.language.Language; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.tensor.Tensor; import com.yahoo.tensor.TensorType; import com.yahoo.yolean.Exceptions; @@ -22,7 +22,6 @@ import com.yahoo.search.query.profile.types.QueryProfileTypeRegistry; import org.junit.Before; import org.junit.Test; -import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import 
java.nio.charset.StandardCharsets; import java.util.List; @@ -441,19 +440,19 @@ public class QueryProfileTypeTestCase { } @Test - public void testUnencodedTensorRankFeatureInRequest() { + public void testUnembeddedTensorRankFeatureInRequest() { QueryProfile profile = new QueryProfile("test"); profile.setType(testtype); registry.register(profile); CompiledQueryProfileRegistry cRegistry = registry.compile(); - String textToEncode = "text to encode as tensor"; + String textToEmbed = "text to embed into a tensor"; Tensor expectedTensor = Tensor.from("tensor(x[5]):[3,7,4,0,0]]"); Query query1 = new Query.Builder().setRequest(HttpRequest.createTestRequest("?" + urlEncode("ranking.features.query(myTensor4)") + - "=" + urlEncode("encode(" + textToEncode + ")"), + "=" + urlEncode("embed(" + textToEmbed + ")"), com.yahoo.jdisc.http.HttpRequest.Method.GET)) .setQueryProfile(cRegistry.getComponent("test")) - .setEncoder(new MockEncoder(textToEncode, Language.UNKNOWN, expectedTensor)) + .setEmbedder(new MockEmbedder(textToEmbed, Language.UNKNOWN, expectedTensor)) .build(); assertEquals(0, query1.errors().size()); assertEquals(expectedTensor, query1.properties().get("ranking.features.query(myTensor4)")); @@ -461,11 +460,11 @@ public class QueryProfileTypeTestCase { // Explicit language Query query2 = new Query.Builder().setRequest(HttpRequest.createTestRequest("?" 
+ urlEncode("ranking.features.query(myTensor4)") + - "=" + urlEncode("encode(" + textToEncode + ")") + + "=" + urlEncode("embed(" + textToEmbed + ")") + "&language=en", com.yahoo.jdisc.http.HttpRequest.Method.GET)) .setQueryProfile(cRegistry.getComponent("test")) - .setEncoder(new MockEncoder(textToEncode, Language.ENGLISH, expectedTensor)) + .setEmbedder(new MockEmbedder(textToEmbed, Language.ENGLISH, expectedTensor)) .build(); assertEquals(0, query2.errors().size()); assertEquals(expectedTensor, query2.properties().get("ranking.features.query(myTensor4)")); @@ -723,28 +722,28 @@ public class QueryProfileTypeTestCase { } } - private static final class MockEncoder implements Encoder { + private static final class MockEmbedder implements Embedder { private final String expectedText; private final Language expectedLanguage; private final Tensor tensorToReturn; - public MockEncoder(String expectedText, - Language expectedLanguage, - Tensor tensorToReturn) { + public MockEmbedder(String expectedText, + Language expectedLanguage, + Tensor tensorToReturn) { this.expectedText = expectedText; this.expectedLanguage = expectedLanguage; this.tensorToReturn = tensorToReturn; } @Override - public List encode(String text, Language language) { + public List embed(String text, Language language) { fail("Unexpected call"); return null; } @Override - public Tensor encode(String text, Language language, TensorType tensorType) { + public Tensor embed(String text, Language language, TensorType tensorType) { assertEquals(expectedText, text); assertEquals(expectedLanguage, language); assertEquals(tensorToReturn.type(), tensorType); diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java index 53709c4ff87..f3a67f855e9 100644 --- a/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java +++ b/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java 
@@ -21,7 +21,7 @@ import com.yahoo.document.config.DocumentmanagerConfig; import com.yahoo.language.Linguistics; import java.util.logging.Level; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.vespa.configdefinition.IlscriptsConfig; import com.yahoo.vespa.indexinglanguage.AdapterFactory; import com.yahoo.vespa.indexinglanguage.SimpleAdapterFactory; @@ -55,9 +55,9 @@ public class IndexingProcessor extends DocumentProcessor { public IndexingProcessor(DocumentmanagerConfig documentmanagerConfig, IlscriptsConfig ilscriptsConfig, Linguistics linguistics, - Encoder encoder) { + Embedder embedder) { docTypeMgr = DocumentTypeManagerConfigurer.configureNewManager(documentmanagerConfig); - scriptMgr = new ScriptManager(docTypeMgr, ilscriptsConfig, linguistics, encoder); + scriptMgr = new ScriptManager(docTypeMgr, ilscriptsConfig, linguistics, embedder); adapterFactory = new SimpleAdapterFactory(new ExpressionSelector()); } diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java index fa5f794f652..7e1d5b5b6ce 100644 --- a/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java +++ b/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java @@ -6,7 +6,7 @@ import com.yahoo.document.DocumentTypeManager; import com.yahoo.language.Linguistics; import java.util.logging.Level; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.vespa.configdefinition.IlscriptsConfig; import com.yahoo.vespa.indexinglanguage.ScriptParserContext; import com.yahoo.vespa.indexinglanguage.expressions.InputExpression; @@ -28,9 +28,9 @@ public class ScriptManager { private final Map> documentFieldScripts; private final DocumentTypeManager docTypeMgr; - public ScriptManager(DocumentTypeManager docTypeMgr, IlscriptsConfig config, Linguistics linguistics, Encoder encoder) { + 
public ScriptManager(DocumentTypeManager docTypeMgr, IlscriptsConfig config, Linguistics linguistics, Embedder embedder) { this.docTypeMgr = docTypeMgr; - documentFieldScripts = createScriptsMap(docTypeMgr, config, linguistics, encoder); + documentFieldScripts = createScriptsMap(docTypeMgr, config, linguistics, embedder); } @@ -75,9 +75,9 @@ public class ScriptManager { private static Map> createScriptsMap(DocumentTypeManager docTypeMgr, IlscriptsConfig config, Linguistics linguistics, - Encoder encoder) { + Embedder embedder) { Map> documentFieldScripts = new HashMap<>(config.ilscript().size()); - ScriptParserContext parserContext = new ScriptParserContext(linguistics, encoder); + ScriptParserContext parserContext = new ScriptParserContext(linguistics, embedder); parserContext.getAnnotatorConfig().setMaxTermOccurrences(config.maxtermoccurrences()); parserContext.getAnnotatorConfig().setMaxTokenLength(config.fieldmatchmaxlength()); diff --git a/docprocs/src/test/java/com/yahoo/docprocs/indexing/IndexingProcessorTestCase.java b/docprocs/src/test/java/com/yahoo/docprocs/indexing/IndexingProcessorTestCase.java index dc9b1ffba73..f54435329f9 100644 --- a/docprocs/src/test/java/com/yahoo/docprocs/indexing/IndexingProcessorTestCase.java +++ b/docprocs/src/test/java/com/yahoo/docprocs/indexing/IndexingProcessorTestCase.java @@ -13,7 +13,7 @@ import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.document.update.AssignValueUpdate; import com.yahoo.document.update.FieldUpdate; import com.yahoo.document.update.ValueUpdate; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.configdefinition.IlscriptsConfig; import org.junit.Test; @@ -127,6 +127,6 @@ public class IndexingProcessorTestCase { return new IndexingProcessor(ConfigGetter.getConfig(DocumentmanagerConfig.class, configId), ConfigGetter.getConfig(IlscriptsConfig.class, configId), new 
SimpleLinguistics(), - Encoder.throwsOnUse); + Embedder.throwsOnUse); } } diff --git a/docprocs/src/test/java/com/yahoo/docprocs/indexing/ScriptManagerTestCase.java b/docprocs/src/test/java/com/yahoo/docprocs/indexing/ScriptManagerTestCase.java index ec05fcbe422..a849f437b44 100644 --- a/docprocs/src/test/java/com/yahoo/docprocs/indexing/ScriptManagerTestCase.java +++ b/docprocs/src/test/java/com/yahoo/docprocs/indexing/ScriptManagerTestCase.java @@ -3,7 +3,7 @@ package com.yahoo.docprocs.indexing; import com.yahoo.document.DocumentType; import com.yahoo.document.DocumentTypeManager; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.vespa.configdefinition.IlscriptsConfig; import com.yahoo.vespa.indexinglanguage.parser.ParseException; import org.junit.Test; @@ -29,7 +29,7 @@ public class ScriptManagerTestCase { IlscriptsConfig.Builder config = new IlscriptsConfig.Builder(); config.ilscript(new IlscriptsConfig.Ilscript.Builder().doctype("newssummary") .content("input title | index title")); - ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(config), null, Encoder.throwsOnUse); + ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(config), null, Embedder.throwsOnUse); assertNotNull(scriptMgr.getScript(typeMgr.getDocumentType("newsarticle"))); assertNull(scriptMgr.getScript(new DocumentType("unknown"))); } @@ -44,7 +44,7 @@ public class ScriptManagerTestCase { IlscriptsConfig.Builder config = new IlscriptsConfig.Builder(); config.ilscript(new IlscriptsConfig.Ilscript.Builder().doctype("newsarticle") .content("input title | index title")); - ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(config), null, Encoder.throwsOnUse); + ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(config), null, Embedder.throwsOnUse); assertNotNull(scriptMgr.getScript(typeMgr.getDocumentType("newssummary"))); assertNull(scriptMgr.getScript(new 
DocumentType("unknown"))); } @@ -53,7 +53,7 @@ public class ScriptManagerTestCase { public void requireThatEmptyConfigurationDoesNotThrow() { DocumentTypeManager typeMgr = new DocumentTypeManager(); typeMgr.configure("file:src/test/cfg/documentmanager_inherit.cfg"); - ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(new IlscriptsConfig.Builder()), null, Encoder.throwsOnUse); + ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(new IlscriptsConfig.Builder()), null, Embedder.throwsOnUse); assertNull(scriptMgr.getScript(new DocumentType("unknown"))); } @@ -61,7 +61,7 @@ public class ScriptManagerTestCase { public void requireThatUnknownDocumentTypeReturnsNull() { DocumentTypeManager typeMgr = new DocumentTypeManager(); typeMgr.configure("file:src/test/cfg/documentmanager_inherit.cfg"); - ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(new IlscriptsConfig.Builder()), null, Encoder.throwsOnUse); + ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(new IlscriptsConfig.Builder()), null, Embedder.throwsOnUse); for (Iterator it = typeMgr.documentTypeIterator(); it.hasNext(); ) { assertNull(scriptMgr.getScript(it.next())); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParser.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParser.java index 34da5b47655..649095d1db8 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParser.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParser.java @@ -62,7 +62,7 @@ public final class ScriptParser { parser.setAnnotatorConfig(context.getAnnotatorConfig()); parser.setDefaultFieldName(context.getDefaultFieldName()); parser.setLinguistics(context.getLinguistcs()); - parser.setEncoder(context.getEncoder()); + parser.setEmbedder(context.getEmbedder()); try { return method.call(parser); } catch (ParseException e) { diff --git 
a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParserContext.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParserContext.java index 06be91703fa..77c2af8dd42 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParserContext.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParserContext.java @@ -2,8 +2,7 @@ package com.yahoo.vespa.indexinglanguage; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; -import com.yahoo.language.simple.SimpleLinguistics; +import com.yahoo.language.process.Embedder; import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig; import com.yahoo.vespa.indexinglanguage.parser.CharStream; @@ -14,13 +13,13 @@ public class ScriptParserContext { private AnnotatorConfig annotatorConfig = new AnnotatorConfig(); private Linguistics linguistics; - private final Encoder encoder; + private final Embedder embedder; private String defaultFieldName = null; private CharStream inputStream = null; - public ScriptParserContext(Linguistics linguistics, Encoder encoder) { + public ScriptParserContext(Linguistics linguistics, Embedder embedder) { this.linguistics = linguistics; - this.encoder = encoder; + this.embedder = embedder; } public AnnotatorConfig getAnnotatorConfig() { @@ -41,8 +40,8 @@ public class ScriptParserContext { return this; } - public Encoder getEncoder() { - return encoder; + public Embedder getEmbedder() { + return embedder; } public String getDefaultFieldName() { diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EmbedExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EmbedExpression.java new file mode 100644 index 00000000000..aa579ed729e --- /dev/null +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EmbedExpression.java @@ -0,0 +1,69 @@ +// Copyright Yahoo. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.indexinglanguage.expressions; + +import com.yahoo.document.DataType; +import com.yahoo.document.TensorDataType; +import com.yahoo.document.datatypes.StringFieldValue; +import com.yahoo.document.datatypes.TensorFieldValue; +import com.yahoo.language.process.Embedder; +import com.yahoo.tensor.Tensor; +import com.yahoo.tensor.TensorType; + +/** + * Embeds a string in a tensor space using the configured Embedder component + * + * @author bratseth + */ +public class EmbedExpression extends Expression { + + private final Embedder embedder; + + /** The target type we are embedding into. */ + private TensorType targetType; + + public EmbedExpression(Embedder embedder) { + super(DataType.STRING); + this.embedder = embedder; + } + + @Override + public void setStatementOutputType(DataType type) { + targetType = ((TensorDataType)type).getTensorType(); + } + + @Override + protected void doExecute(ExecutionContext context) { + StringFieldValue input = (StringFieldValue) context.getValue(); + Tensor tensor = embedder.embed(input.getString(), context.getLanguage(), targetType); + context.setValue(new TensorFieldValue(tensor)); + } + + @Override + protected void doVerify(VerificationContext context) { + String outputField = context.getOutputField(); + if (outputField == null) + throw new VerificationException(this, "No output field in this statement: " + + "Don't know what tensor type to embed into."); + DataType outputFieldType = context.getInputType(this, outputField); + if ( ! 
(outputFieldType instanceof TensorDataType) ) + throw new VerificationException(this, "The type of the output field " + outputField + + " is not a tensor but " + outputField); + targetType = ((TensorDataType) outputFieldType).getTensorType(); + context.setValueType(createdOutputType()); + } + + @Override + public DataType createdOutputType() { + return new TensorDataType(targetType); + } + + @Override + public String toString() { return "embed"; } + + @Override + public int hashCode() { return 1; } + + @Override + public boolean equals(Object o) { return o instanceof EmbedExpression; } + +} diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java deleted file mode 100644 index f84da9ddef8..00000000000 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java +++ /dev/null @@ -1,69 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.indexinglanguage.expressions; - -import com.yahoo.document.DataType; -import com.yahoo.document.TensorDataType; -import com.yahoo.document.datatypes.StringFieldValue; -import com.yahoo.document.datatypes.TensorFieldValue; -import com.yahoo.language.process.Encoder; -import com.yahoo.tensor.Tensor; -import com.yahoo.tensor.TensorType; - -/** - * Encodes a string as a tensor using the configured Encoder component - * - * @author bratseth - */ -public class EncodeExpression extends Expression { - - private final Encoder encoder; - - /** The target type we are encoding to. Set during verification. 
*/ - private TensorType targetType; - - public EncodeExpression(Encoder encoder) { - super(DataType.STRING); - this.encoder = encoder; - } - - @Override - public void setStatementOutputType(DataType type) { - targetType = ((TensorDataType)type).getTensorType(); - } - - @Override - protected void doExecute(ExecutionContext context) { - StringFieldValue input = (StringFieldValue) context.getValue(); - Tensor tensor = encoder.encode(input.getString(), context.getLanguage(), targetType); - context.setValue(new TensorFieldValue(tensor)); - } - - @Override - protected void doVerify(VerificationContext context) { - String outputField = context.getOutputField(); - if (outputField == null) - throw new VerificationException(this, "No output field in this statement: " + - "Don't know what tensor type to encode to."); - DataType outputFieldType = context.getInputType(this, outputField); - if ( ! (outputFieldType instanceof TensorDataType) ) - throw new VerificationException(this, "The type of the output field " + outputField + - " is not a tensor but " + outputField); - targetType = ((TensorDataType) outputFieldType).getTensorType(); - context.setValueType(createdOutputType()); - } - - @Override - public DataType createdOutputType() { - return new TensorDataType(targetType); - } - - @Override - public String toString() { return "encode"; } - - @Override - public int hashCode() { return 1; } - - @Override - public boolean equals(Object o) { return o instanceof EncodeExpression; } - -} diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java index 67459c2b035..20a0c9804a9 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java @@ -6,7 +6,7 @@ import com.yahoo.document.Document; import 
com.yahoo.document.DocumentUpdate; import com.yahoo.document.datatypes.FieldValue; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.*; import com.yahoo.vespa.indexinglanguage.parser.IndexingInput; @@ -188,11 +188,11 @@ public abstract class Expression extends Selectable { /** Creates an expression with simple lingustics for testing */ public static Expression fromString(String expression) throws ParseException { - return fromString(expression, new SimpleLinguistics(), Encoder.throwsOnUse); + return fromString(expression, new SimpleLinguistics(), Embedder.throwsOnUse); } - public static Expression fromString(String expression, Linguistics linguistics, Encoder encoder) throws ParseException { - return newInstance(new ScriptParserContext(linguistics, encoder).setInputStream(new IndexingInput(expression))); + public static Expression fromString(String expression, Linguistics linguistics, Embedder embedder) throws ParseException { + return newInstance(new ScriptParserContext(linguistics, embedder).setInputStream(new IndexingInput(expression))); } public static Expression newInstance(ScriptParserContext context) throws ParseException { diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java index 7317cb2216f..b5f71813de3 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java @@ -4,7 +4,7 @@ package com.yahoo.vespa.indexinglanguage.expressions; import com.yahoo.document.DataType; import com.yahoo.document.datatypes.FieldValue; import com.yahoo.language.Linguistics; -import 
com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.ScriptParser; import com.yahoo.vespa.indexinglanguage.ScriptParserContext; @@ -92,11 +92,11 @@ public final class ScriptExpression extends ExpressionList /** Creates an expression with simple lingustics for testing */ @SuppressWarnings("deprecation") public static ScriptExpression fromString(String expression) throws ParseException { - return fromString(expression, new SimpleLinguistics(), Encoder.throwsOnUse); + return fromString(expression, new SimpleLinguistics(), Embedder.throwsOnUse); } - public static ScriptExpression fromString(String expression, Linguistics linguistics, Encoder encoder) throws ParseException { - return newInstance(new ScriptParserContext(linguistics, encoder).setInputStream(new IndexingInput(expression))); + public static ScriptExpression fromString(String expression, Linguistics linguistics, Embedder embedder) throws ParseException { + return newInstance(new ScriptParserContext(linguistics, embedder).setInputStream(new IndexingInput(expression))); } public static ScriptExpression newInstance(ScriptParserContext config) throws ParseException { diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java index 145133e210d..7d157af1a19 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java @@ -2,9 +2,8 @@ package com.yahoo.vespa.indexinglanguage.expressions; import com.yahoo.document.DataType; -import com.yahoo.document.TensorDataType; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import 
com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.ScriptParser; import com.yahoo.vespa.indexinglanguage.ScriptParserContext; @@ -12,7 +11,6 @@ import com.yahoo.vespa.indexinglanguage.parser.IndexingInput; import com.yahoo.vespa.indexinglanguage.parser.ParseException; import java.util.Arrays; -import java.util.Collection; import java.util.Iterator; import java.util.LinkedList; import java.util.List; @@ -100,11 +98,11 @@ public final class StatementExpression extends ExpressionList { /** Creates an expression with simple lingustics for testing */ public static StatementExpression fromString(String expression) throws ParseException { - return fromString(expression, new SimpleLinguistics(), Encoder.throwsOnUse); + return fromString(expression, new SimpleLinguistics(), Embedder.throwsOnUse); } - public static StatementExpression fromString(String expression, Linguistics linguistics, Encoder encoder) throws ParseException { - return newInstance(new ScriptParserContext(linguistics, encoder).setInputStream(new IndexingInput(expression))); + public static StatementExpression fromString(String expression, Linguistics linguistics, Embedder embedder) throws ParseException { + return newInstance(new ScriptParserContext(linguistics, embedder).setInputStream(new IndexingInput(expression))); } public static StatementExpression newInstance(ScriptParserContext config) throws ParseException { diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj index 4533a17954c..3eee4ea6f08 100644 --- a/indexinglanguage/src/main/javacc/IndexingParser.jj +++ b/indexinglanguage/src/main/javacc/IndexingParser.jj @@ -34,7 +34,7 @@ import com.yahoo.text.StringUtilities; import com.yahoo.vespa.indexinglanguage.expressions.*; import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig; import com.yahoo.language.process.StemMode; -import 
com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.Linguistics; /** @@ -45,7 +45,7 @@ public class IndexingParser { private String defaultFieldName; private Linguistics linguistics; - private Encoder encoder; + private Embedder embedder; private AnnotatorConfig annotatorCfg; public IndexingParser(String str) { @@ -62,8 +62,8 @@ public class IndexingParser { return this; } - public IndexingParser setEncoder(Encoder encoder) { - this.encoder = encoder; + public IndexingParser setEmbedder(Embedder embedder) { + this.embedder = embedder; return this; } @@ -157,7 +157,7 @@ TOKEN : | | | - | + | | | | @@ -283,7 +283,7 @@ Expression value() : val = base64EncodeExp() | val = clearStateExp() | val = echoExp() | - val = encodeExp() | + val = embedExp() | val = exactExp() | val = flattenExp() | val = forEachExp() | @@ -365,10 +365,10 @@ Expression echoExp() : { } { return new EchoExpression(); } } -Expression encodeExp() : { } +Expression embedExp() : { } { - ( ) - { return new EncodeExpression(encoder); } + ( ) + { return new EmbedExpression(embedder); } } Expression exactExp() : { } @@ -744,7 +744,7 @@ String identifier() : | | | - | + | | | | diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptParserTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptParserTestCase.java index 32e38dbee6f..06d185339a6 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptParserTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptParserTestCase.java @@ -1,7 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.vespa.indexinglanguage; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.expressions.EchoExpression; import com.yahoo.vespa.indexinglanguage.expressions.InputExpression; @@ -96,7 +96,7 @@ public class ScriptParserTestCase { } private static ScriptParserContext newContext(String input) { - return new ScriptParserContext(new SimpleLinguistics(), Encoder.throwsOnUse).setInputStream(new IndexingInput(input)); + return new ScriptParserContext(new SimpleLinguistics(), Embedder.throwsOnUse).setInputStream(new IndexingInput(input)); } } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java index 9d3d0abb256..188426b1a06 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java @@ -5,26 +5,20 @@ import com.yahoo.document.DataType; import com.yahoo.document.Document; import com.yahoo.document.DocumentType; import com.yahoo.document.Field; -import com.yahoo.document.FieldPath; import com.yahoo.document.TensorDataType; import com.yahoo.document.datatypes.BoolFieldValue; -import com.yahoo.document.datatypes.FieldValue; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.document.datatypes.TensorFieldValue; import com.yahoo.language.Language; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.tensor.Tensor; -import com.yahoo.tensor.TensorAddress; import com.yahoo.tensor.TensorType; import com.yahoo.vespa.indexinglanguage.expressions.*; import com.yahoo.vespa.indexinglanguage.parser.ParseException; import org.junit.Test; -import java.util.Iterator; import 
java.util.List; -import java.util.Map; -import java.util.Set; import static org.junit.Assert.*; @@ -106,9 +100,9 @@ public class ScriptTestCase { } @Test - public void testEncode() throws ParseException { + public void testEmbed() throws ParseException { TensorType tensorType = TensorType.fromSpec("tensor(d[4])"); - var expression = Expression.fromString("input myText | encode | attribute 'myTensor'", + var expression = Expression.fromString("input myText | embed | attribute 'myTensor'", new SimpleLinguistics(), new MockEncoder()); @@ -131,15 +125,15 @@ public class ScriptTestCase { ((TensorFieldValue)adapter.values.get("myTensor")).getTensor().get()); } - private static class MockEncoder implements Encoder { + private static class MockEncoder implements Embedder { @Override - public List encode(String text, Language language) { + public List embed(String text, Language language) { return null; } @Override - public Tensor encode(String text, Language language, TensorType tensorType) { + public Tensor embed(String text, Language language, TensorType tensorType) { return Tensor.from(tensorType, "[7,3,0,0]"); } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/DefaultFieldNameTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/DefaultFieldNameTestCase.java index 2a71aeb564c..ea0d9f9cf69 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/DefaultFieldNameTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/DefaultFieldNameTestCase.java @@ -1,7 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.vespa.indexinglanguage.parser; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.ScriptParserContext; import com.yahoo.vespa.indexinglanguage.expressions.Expression; @@ -19,7 +19,7 @@ public class DefaultFieldNameTestCase { public void requireThatDefaultFieldNameIsAppliedWhenArgumentIsMissing() throws ParseException { IndexingInput input = new IndexingInput("input"); InputExpression exp = (InputExpression)Expression.newInstance(new ScriptParserContext(new SimpleLinguistics(), - Encoder.throwsOnUse) + Embedder.throwsOnUse) .setInputStream(input) .setDefaultFieldName("foo")); assertEquals("foo", exp.getFieldName()); diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java index d7c5ae5c15a..44aa562028c 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java @@ -2,7 +2,7 @@ package com.yahoo.vespa.indexinglanguage.parser; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.expressions.*; import org.junit.Test; @@ -85,9 +85,9 @@ public class ExpressionTestCase { private static void assertExpression(Class expectedClass, String str) throws ParseException { Linguistics linguistics = new SimpleLinguistics(); - Expression foo = Expression.fromString(str, linguistics, Encoder.throwsOnUse); + Expression foo = Expression.fromString(str, linguistics, Embedder.throwsOnUse); assertEquals(expectedClass, foo.getClass()); - Expression bar = 
Expression.fromString(foo.toString(), linguistics, Encoder.throwsOnUse); + Expression bar = Expression.fromString(foo.toString(), linguistics, Embedder.throwsOnUse); assertEquals(foo.hashCode(), bar.hashCode()); assertEquals(foo, bar); } diff --git a/linguistics-components/abi-spec.json b/linguistics-components/abi-spec.json index 5b6729c58ef..808ec3af082 100644 --- a/linguistics-components/abi-spec.json +++ b/linguistics-components/abi-spec.json @@ -148,7 +148,7 @@ "public static final java.lang.String[] CONFIG_DEF_SCHEMA" ] }, - "com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder": { + "com.yahoo.language.sentencepiece.SentencePieceEmbedder$Builder": { "superClass": "java.lang.Object", "interfaces": [], "attributes": [ @@ -157,31 +157,31 @@ "methods": [ "public void ()", "public void addModel(com.yahoo.language.Language, java.nio.file.Path)", - "public com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder addDefaultModel(java.nio.file.Path)", + "public com.yahoo.language.sentencepiece.SentencePieceEmbedder$Builder addDefaultModel(java.nio.file.Path)", "public java.util.Map getModels()", - "public com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder setCollapseUnknowns(boolean)", + "public com.yahoo.language.sentencepiece.SentencePieceEmbedder$Builder setCollapseUnknowns(boolean)", "public boolean getCollapseUnknowns()", - "public com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder setScoring(com.yahoo.language.sentencepiece.Scoring)", + "public com.yahoo.language.sentencepiece.SentencePieceEmbedder$Builder setScoring(com.yahoo.language.sentencepiece.Scoring)", "public com.yahoo.language.sentencepiece.Scoring getScoring()", - "public com.yahoo.language.sentencepiece.SentencePieceEncoder build()" + "public com.yahoo.language.sentencepiece.SentencePieceEmbedder build()" ], "fields": [] }, - "com.yahoo.language.sentencepiece.SentencePieceEncoder": { + "com.yahoo.language.sentencepiece.SentencePieceEmbedder": { "superClass": 
"java.lang.Object", "interfaces": [ "com.yahoo.language.process.Segmenter", - "com.yahoo.language.process.Encoder" + "com.yahoo.language.process.Embedder" ], "attributes": [ "public" ], "methods": [ "public void (com.yahoo.language.sentencepiece.SentencePieceConfig)", - "public void (com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder)", + "public void (com.yahoo.language.sentencepiece.SentencePieceEmbedder$Builder)", "public java.util.List segment(java.lang.String, com.yahoo.language.Language)", - "public java.util.List encode(java.lang.String, com.yahoo.language.Language)", - "public com.yahoo.tensor.Tensor encode(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)", + "public java.util.List embed(java.lang.String, com.yahoo.language.Language)", + "public com.yahoo.tensor.Tensor embed(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)", "public java.lang.String normalize(java.lang.String)" ], "fields": [] diff --git a/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java new file mode 100644 index 00000000000..116dd15f563 --- /dev/null +++ b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java @@ -0,0 +1,219 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.sentencepiece; + +import com.google.common.annotations.Beta; +import com.google.inject.Inject; +import com.yahoo.language.Language; +import com.yahoo.language.process.Embedder; +import com.yahoo.language.process.Segmenter; +import com.yahoo.tensor.Tensor; +import com.yahoo.tensor.TensorAddress; +import com.yahoo.tensor.TensorType; + +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +/** + * A native Java implementation of SentencePiece - see https://github.com/google/sentencepiece + * + * SentencePiece is a language-agnostic segmenter and embedder for neural nets. + * + * @author bratseth + */ +@Beta +public class SentencePieceEmbedder implements Segmenter, Embedder { + + private final Map models; + + private final SentencePieceAlgorithm algorithm; + + @Inject + public SentencePieceEmbedder(SentencePieceConfig config) { + this(new Builder(config)); + } + + public SentencePieceEmbedder(Builder builder) { + algorithm = new SentencePieceAlgorithm(builder.collapseUnknowns, builder.getScoring()); + + models = builder.getModels().entrySet() + .stream() + .map(e -> new Model(e.getKey(), e.getValue())) + .collect(Collectors.toUnmodifiableMap(m -> m.language, m -> m)); + if (models.isEmpty()) + throw new IllegalArgumentException("SentencePieceEmbedder requires at least one model configured"); + } + + /** + * Segments the given text into token segments using the SentencePiece algorithm + * + * @param rawInput the text to segment. Any sequence of BMP (Unicode-16 the True Unicode) is supported. 
+ * @param language the model to use, or Language.UNKNOWN to use the default model if any + * @return the list of zero or more tokens resulting from segmenting the input text + */ + @Override + public List segment(String rawInput, Language language) { + String input = normalize(rawInput); + var resultBuilder = new ResultBuilder>(new ArrayList<>()) { + public void add(int segmentStart, int segmentEnd, SentencePieceAlgorithm.SegmentEnd[] segmentEnds) { + result().add(input.substring(segmentStart, segmentEnd)); + } + }; + segment(input, language, resultBuilder); + Collections.reverse(resultBuilder.result()); + return resultBuilder.result(); + } + + /** + * Segments the given text into token segments using the SentencePiece algorithm and returns the segment ids. + * + * @param rawInput the text to segment. Any sequence of BMP (Unicode-16 the True Unicode) is supported. + * @param language the model to use, or Language.UNKNOWN to use the default model if any + * @return the list of zero or more token ids resulting from segmenting the input text + */ + @Override + public List embed(String rawInput, Language language) { + var resultBuilder = new ResultBuilder>(new ArrayList<>()) { + public void add(int segmentStart, int segmentEnd, SentencePieceAlgorithm.SegmentEnd[] segmentEnds) { + result().add(segmentEnds[segmentEnd].id); + } + }; + segment(normalize(rawInput), language, resultBuilder); + Collections.reverse(resultBuilder.result()); + return resultBuilder.result(); + } + + /** + *

Embeds text into a tensor.

+ * + *

If the tensor type is indexed 1-d (bound or unbound) this will return a tensor containing the token ids in the order + * they were encountered in the text. If the dimension is bound and larger than the number of tokens, the tensor is zero padded; + * if it is smaller, the token ids are truncated to fit.

+ * + *

If the tensor type is 1-d sparse this will return a tensor containing the token strings as keys and the token + * position as value.

+ * + *

If the tensor is any other type IllegalArgumentException is thrown.

+ */ + @Override + public Tensor embed(String rawInput, Language language, TensorType type) { + if (type.dimensions().size() == 1 && type.dimensions().get(0).isIndexed()) { + // Build to a list first since we can't reverse a tensor builder + List values = embed(rawInput, language); + + long maxSize = values.size(); + if (type.dimensions().get(0).size().isPresent()) + maxSize = Math.min(maxSize, type.dimensions().get(0).size().get()); + + Tensor.Builder builder = Tensor.Builder.of(type); + for (int i = 0; i < maxSize; i++) + builder.cell(values.get(i), i); + return builder.build(); + } + else if (type.dimensions().size() == 1 && type.dimensions().get(0).isMapped()) { + // Build to a list first since we can't reverse a tensor builder + List values = segment(rawInput, language); + + Tensor.Builder builder = Tensor.Builder.of(type); + for (int i = 0; i < values.size(); i++) + builder.cell(TensorAddress.ofLabels(values.get(i)), i); + return builder.build(); + } + else { + throw new IllegalArgumentException("Don't know how to embed with SentencePiece into " + type); + } + } + + private void segment(String input, Language language, + ResultBuilder resultBuilder) { + Model model = resolveFrom(language); + algorithm.segment(input, resultBuilder, model); + } + + private Model resolveFrom(Language language) { + // Disregard language if there is default model + if (models.size() == 1 && models.containsKey(Language.UNKNOWN)) return models.get(Language.UNKNOWN); + if (models.containsKey(language)) return models.get(language); + throw new IllegalArgumentException("No SentencePiece model for language " + language + " is configured"); + } + + public String normalize(String s) { + StringBuilder b = new StringBuilder(s.length() + 1); + boolean queuedSpace = true; // Always start by one space + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + if (s.charAt(i) == ' ') { + queuedSpace = true; + } + else { + if (queuedSpace) { + 
b.append(SentencePieceAlgorithm.spaceSymbol); + queuedSpace = false; + } + b.append(c); + } + } + return b.toString(); + } + + public static class Builder { + + private final Map models = new HashMap<>(); + private boolean collapseUnknowns = true; + private Scoring scoring = Scoring.fewestSegments; + + public Builder() { + } + + private Builder(SentencePieceConfig config) { + collapseUnknowns = config.collapseUnknowns(); + scoring = config.scoring() == SentencePieceConfig.Scoring.fewestSegments ? Scoring.fewestSegments + : Scoring.highestScore; + for (SentencePieceConfig.Model model : config.model()) { + addModel(Language.fromLanguageTag(model.language()), model.path()); + } + } + + public void addModel(Language language, Path model) { + models.put(language, model); + } + + /** + * Adds the model that will be used if the language is unknown, OR only one model is specified. + * The same as addModel(Language.UNKNOWN, model). + */ + public Builder addDefaultModel(Path model) { + addModel(Language.UNKNOWN, model); + return this; + } + public Map getModels() { return models; } + + /** + * Sets whether consecutive unknown character should be collapsed into one large unknown token (default) + * or be returned as single character tokens. + */ + public Builder setCollapseUnknowns(boolean collapseUnknowns) { + this.collapseUnknowns = collapseUnknowns; + return this; + } + public boolean getCollapseUnknowns() { return collapseUnknowns; } + + /** Sets the scoring strategy to use when picking a segmentation. Default: fewestSegments. 
*/ + public Builder setScoring(Scoring scoring) { + this.scoring = scoring; + return this; + } + public Scoring getScoring() { return scoring; } + + public SentencePieceEmbedder build() { + if (models.isEmpty()) throw new IllegalStateException("At least one model must be supplied"); + return new SentencePieceEmbedder(this); + } + + } + +} diff --git a/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java deleted file mode 100644 index b6659ebeaa3..00000000000 --- a/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java +++ /dev/null @@ -1,220 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.sentencepiece; - -import com.google.common.annotations.Beta; -import com.google.inject.Inject; -import com.yahoo.language.Language; -import com.yahoo.language.process.Encoder; -import com.yahoo.language.process.Segmenter; -import com.yahoo.tensor.Tensor; -import com.yahoo.tensor.TensorAddress; -import com.yahoo.tensor.TensorType; - -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.Collections; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.stream.Collectors; - -/** - * Integration with https://github.com/google/sentencepiece - * through http://docs.djl.ai/extensions/sentencepiece/index.html - * - * SentencePiece is a language-agnostic tokenizer for neural nets. 
- * - * @author bratseth - */ -@Beta -public class SentencePieceEncoder implements Segmenter, Encoder { - - private final Map models; - - private final SentencePieceAlgorithm algorithm; - - @Inject - public SentencePieceEncoder(SentencePieceConfig config) { - this(new Builder(config)); - } - - public SentencePieceEncoder(Builder builder) { - algorithm = new SentencePieceAlgorithm(builder.collapseUnknowns, builder.getScoring()); - - models = builder.getModels().entrySet() - .stream() - .map(e -> new Model(e.getKey(), e.getValue())) - .collect(Collectors.toUnmodifiableMap(m -> m.language, m -> m)); - if (models.isEmpty()) - throw new IllegalArgumentException("SentencePieceEncoder requires at least one model configured"); - } - - /** - * Segments the given text into token segments using the SentencePiece algorithm - * - * @param rawInput the text to segment. Any sequence of BMP (Unicode-16 the True Unicode) is supported. - * @param language the model to use, or Language.UNKNOWN to use the default model if any - * @return the list of zero or more tokens resulting from segmenting the input text - */ - @Override - public List segment(String rawInput, Language language) { - String input = normalize(rawInput); - var resultBuilder = new ResultBuilder>(new ArrayList<>()) { - public void add(int segmentStart, int segmentEnd, SentencePieceAlgorithm.SegmentEnd[] segmentEnds) { - result().add(input.substring(segmentStart, segmentEnd)); - } - }; - segment(input, language, resultBuilder); - Collections.reverse(resultBuilder.result()); - return resultBuilder.result(); - } - - /** - * Segments the given text into token segments using the SentencePiece algorithm and returns the segment ids. - * - * @param rawInput the text to segment. Any sequence of BMP (Unicode-16 the True Unicode) is supported. 
- * @param language the model to use, or Language.UNKNOWN to use the default model if any - * @return the list of zero or more token ids resulting from segmenting the input text - */ - @Override - public List encode(String rawInput, Language language) { - var resultBuilder = new ResultBuilder>(new ArrayList<>()) { - public void add(int segmentStart, int segmentEnd, SentencePieceAlgorithm.SegmentEnd[] segmentEnds) { - result().add(segmentEnds[segmentEnd].id); - } - }; - segment(normalize(rawInput), language, resultBuilder); - Collections.reverse(resultBuilder.result()); - return resultBuilder.result(); - } - - /** - *

Encodes directly to a tensor.

- * - *

If the tensor type is indexed 1-d (bound or unbound) this will return a tensor containing the token ids in the order - * they were encountered in the text. If the dimension is bound and too large it will be zero padded, if too small - * it will be truncated.

- * - *

If the tensor type is1-d sparse this will return a tensor containing the token strings as keys and the token - * position as value.

- * - *

If the tensor is any other type IllegalArgumentException is thrown.

- */ - @Override - public Tensor encode(String rawInput, Language language, TensorType type) { - if (type.dimensions().size() == 1 && type.dimensions().get(0).isIndexed()) { - // Build to a list first since we can't reverse a tensor builder - List values = encode(rawInput, language); - - long maxSize = values.size(); - if (type.dimensions().get(0).size().isPresent()) - maxSize = Math.min(maxSize, type.dimensions().get(0).size().get()); - - Tensor.Builder builder = Tensor.Builder.of(type); - for (int i = 0; i < maxSize; i++) - builder.cell(values.get(i), i); - return builder.build(); - } - else if (type.dimensions().size() == 1 && type.dimensions().get(0).isMapped()) { - // Build to a list first since we can't reverse a tensor builder - List values = segment(rawInput, language); - - Tensor.Builder builder = Tensor.Builder.of(type); - for (int i = 0; i < values.size(); i++) - builder.cell(TensorAddress.ofLabels(values.get(i)), i); - return builder.build(); - } - else { - throw new IllegalArgumentException("Don't know how to encode with SentencePiece into " + type); - } - } - - private void segment(String input, Language language, - ResultBuilder resultBuilder) { - Model model = resolveFrom(language); - algorithm.segment(input, resultBuilder, model); - } - - private Model resolveFrom(Language language) { - // Disregard language if there is default model - if (models.size() == 1 && models.containsKey(Language.UNKNOWN)) return models.get(Language.UNKNOWN); - if (models.containsKey(language)) return models.get(language); - throw new IllegalArgumentException("No SentencePiece model for language " + language + " is configured"); - } - - public String normalize(String s) { - StringBuilder b = new StringBuilder(s.length() + 1); - boolean queuedSpace = true; // Always start by one space - for (int i = 0; i < s.length(); i++) { - char c = s.charAt(i); - if (s.charAt(i) == ' ') { - queuedSpace = true; - } - else { - if (queuedSpace) { - 
b.append(SentencePieceAlgorithm.spaceSymbol); - queuedSpace = false; - } - b.append(c); - } - } - return b.toString(); - } - - public static class Builder { - - private final Map models = new HashMap<>(); - private boolean collapseUnknowns = true; - private Scoring scoring = Scoring.fewestSegments; - - public Builder() { - } - - private Builder(SentencePieceConfig config) { - collapseUnknowns = config.collapseUnknowns(); - scoring = config.scoring() == SentencePieceConfig.Scoring.fewestSegments ? Scoring.fewestSegments - : Scoring.highestScore; - for (SentencePieceConfig.Model model : config.model()) { - addModel(Language.fromLanguageTag(model.language()), model.path()); - } - } - - public void addModel(Language language, Path model) { - models.put(language, model); - } - - /** - * Adds the model that will be used if the language is unknown, OR only one model is specified. - * The same as addModel(Language.UNKNOWN, model). - */ - public Builder addDefaultModel(Path model) { - addModel(Language.UNKNOWN, model); - return this; - } - public Map getModels() { return models; } - - /** - * Sets whether consecutive unknown character should be collapsed into one large unknown token (default) - * or be returned as single character tokens. - */ - public Builder setCollapseUnknowns(boolean collapseUnknowns) { - this.collapseUnknowns = collapseUnknowns; - return this; - } - public boolean getCollapseUnknowns() { return collapseUnknowns; } - - /** Sets the scoring strategy to use when picking a segmentation. Default: fewestSegments. 
*/ - public Builder setScoring(Scoring scoring) { - this.scoring = scoring; - return this; - } - public Scoring getScoring() { return scoring; } - - public SentencePieceEncoder build() { - if (models.isEmpty()) throw new IllegalStateException("At least one model must be supplied"); - return new SentencePieceEncoder(this); - } - - } - -} diff --git a/linguistics-components/src/main/resources/configdefinitions/language.sentencepiece.sentence-piece.def b/linguistics-components/src/main/resources/configdefinitions/language.sentencepiece.sentence-piece.def index b91c0c45dc4..16ada78688a 100644 --- a/linguistics-components/src/main/resources/configdefinitions/language.sentencepiece.sentence-piece.def +++ b/linguistics-components/src/main/resources/configdefinitions/language.sentencepiece.sentence-piece.def @@ -1,6 +1,6 @@ # Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -# Configures com.yahoo.language.sentencepiece.SentencePieceEncoder +# Configures com.yahoo.language.sentencepiece.SentencePieceEmbedder namespace=language.sentencepiece diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java index edbbe21ec53..1ed2271f774 100644 --- a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java +++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java @@ -15,7 +15,7 @@ public class SentencePieceConfigurationTest { public void testEnglishTokenization() { var b = new SentencePieceConfig.Builder(); addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); + var tester = new SentencePieceTester(new SentencePieceEmbedder(b.build())); 
tester.assertSegmented("this is another sentence", "▁this", "▁is", "▁another", "▁sentence"); tester.assertSegmented("KHJKJHHKJHHSH hello", "▁", "KHJKJHHKJHHSH", "▁hel", "lo"); } @@ -25,7 +25,7 @@ public class SentencePieceConfigurationTest { var b = new SentencePieceConfig.Builder(); addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); b.collapseUnknowns(false); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); + var tester = new SentencePieceTester(new SentencePieceEmbedder(b.build())); tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo"); } @@ -34,7 +34,7 @@ public class SentencePieceConfigurationTest { var b = new SentencePieceConfig.Builder(); addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); b.scoring(SentencePieceConfig.Scoring.highestScore); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); + var tester = new SentencePieceTester(new SentencePieceEmbedder(b.build())); tester.assertSegmented("hello", "▁h", "el", "lo"); } @@ -43,7 +43,7 @@ public class SentencePieceConfigurationTest { var b = new SentencePieceConfig.Builder(); addModel("ja", "src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model", b); addModel("en", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); + var tester = new SentencePieceTester(new SentencePieceEmbedder(b.build())); tester.assertSegmented(Language.JAPANESE, "いくつかの通常のテキスト", "▁", "いく", "つか", "の", "通常", "の", "テ", "キ", "スト"); tester.assertSegmented(Language.ENGLISH, "hello", "▁hel", "lo"); tester.assertSegmented(Language.JAPANESE, "hello", "▁h", "ell", "o"); diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java index d60d7386d4b..939f8ebe9d3 100644 --- 
a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java +++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java @@ -38,27 +38,27 @@ public class SentencePieceTest { @Test public void testIntegerListEncoding() { var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - tester.assertEncoded("hello, world!", 908, 1418, 9934, 501, 9960); - tester.assertEncoded("Hello, world!", 9912, 0, 6595, 9934, 501, 9960); + tester.assertEmbedded("hello, world!", 908, 1418, 9934, 501, 9960); + tester.assertEmbedded("Hello, world!", 9912, 0, 6595, 9934, 501, 9960); } @Test public void testDenseTensorEncoding() { var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - tester.assertEncoded("hello, world!", "tensor(d[10])", "[908,1418,9934,501,9960,0,0,0,0,0]"); - tester.assertEncoded("Hello, world!", "tensor(d[10])", "[9912,0,6595,9934,501,9960,0,0,0,0]"); - tester.assertEncoded("hello, world!", "tensor(d[2])", "[908,1418]"); + tester.assertEmbedded("hello, world!", "tensor(d[10])", "[908,1418,9934,501,9960,0,0,0,0,0]"); + tester.assertEmbedded("Hello, world!", "tensor(d[10])", "[9912,0,6595,9934,501,9960,0,0,0,0]"); + tester.assertEmbedded("hello, world!", "tensor(d[2])", "[908,1418]"); } @Test public void testSparseTensorEncoding() { var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - tester.assertEncoded("hello", "tensor(token{})", "{lo:1.0,'▁hel':0.0}"); + tester.assertEmbedded("hello", "tensor(token{})", "{lo:1.0,'▁hel':0.0}"); } @Test public void testNoCollapse() { - var tester = new SentencePieceTester(new SentencePieceEncoder.Builder() + var tester = new SentencePieceTester(new SentencePieceEmbedder.Builder() .addDefaultModel(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()) 
.setCollapseUnknowns(false)); tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo"); @@ -66,7 +66,7 @@ public class SentencePieceTest { @Test public void testHighestScore() { - var tester = new SentencePieceTester(new SentencePieceEncoder.Builder() + var tester = new SentencePieceTester(new SentencePieceEmbedder.Builder() .addDefaultModel(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()) .setScoring(Scoring.highestScore)); tester.assertSegmented("h", "▁h"); @@ -77,7 +77,7 @@ public class SentencePieceTest { @Test public void testMultiLanguageTokenization() { - SentencePieceEncoder.Builder builder = new SentencePieceEncoder.Builder(); + SentencePieceEmbedder.Builder builder = new SentencePieceEmbedder.Builder(); builder.addModel(Language.JAPANESE, new File("src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model").toPath()); builder.addModel(Language.ENGLISH, new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); var tester = new SentencePieceTester(builder); diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java index 1ba7c9b472d..c4cb13a3d23 100644 --- a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java +++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java @@ -14,28 +14,28 @@ import static org.junit.Assert.assertEquals; class SentencePieceTester { - private final SentencePieceEncoder encoder; + private final SentencePieceEmbedder embedder; public SentencePieceTester(Path model) { - this(new SentencePieceEncoder.Builder().addDefaultModel(model)); + this(new SentencePieceEmbedder.Builder().addDefaultModel(model)); } - public SentencePieceTester(SentencePieceEncoder.Builder builder) { + public SentencePieceTester(SentencePieceEmbedder.Builder builder) { 
this(builder.build()); } - public SentencePieceTester(SentencePieceEncoder encoder) { - this.encoder = encoder; + public SentencePieceTester(SentencePieceEmbedder embedder) { + this.embedder = embedder; } - public void assertEncoded(String input, Integer... expectedCodes) { - assertArrayEquals(expectedCodes, encoder.encode(input, Language.UNKNOWN).toArray()); + public void assertEmbedded(String input, Integer... expectedCodes) { + assertArrayEquals(expectedCodes, embedder.embed(input, Language.UNKNOWN).toArray()); } - public void assertEncoded(String input, String tensorType, String tensor) { + public void assertEmbedded(String input, String tensorType, String tensor) { TensorType type = TensorType.fromSpec(tensorType); Tensor expected = Tensor.from(type, tensor); - assertEquals(expected, encoder.encode(input, Language.UNKNOWN, type)); + assertEquals(expected, embedder.embed(input, Language.UNKNOWN, type)); } public void assertSegmented(String input, String... expectedSegments) { @@ -43,7 +43,7 @@ class SentencePieceTester { } public void assertSegmented(Language language, String input, String... 
expectedSegments) { - assertArrayEquals(expectedSegments, encoder.segment(input, language).toArray()); + assertArrayEquals(expectedSegments, embedder.segment(input, language).toArray()); } } diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json index cfbf2abda1a..d257c451739 100644 --- a/linguistics/abi-spec.json +++ b/linguistics/abi-spec.json @@ -328,22 +328,22 @@ ], "fields": [] }, - "com.yahoo.language.process.Encoder$FailingEncoder": { + "com.yahoo.language.process.Embedder$FailingEmbedder": { "superClass": "java.lang.Object", "interfaces": [ - "com.yahoo.language.process.Encoder" + "com.yahoo.language.process.Embedder" ], "attributes": [ "public" ], "methods": [ "public void ()", - "public java.util.List encode(java.lang.String, com.yahoo.language.Language)", - "public com.yahoo.tensor.Tensor encode(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)" + "public java.util.List embed(java.lang.String, com.yahoo.language.Language)", + "public com.yahoo.tensor.Tensor embed(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)" ], "fields": [] }, - "com.yahoo.language.process.Encoder": { + "com.yahoo.language.process.Embedder": { "superClass": "java.lang.Object", "interfaces": [], "attributes": [ @@ -352,11 +352,11 @@ "abstract" ], "methods": [ - "public abstract java.util.List encode(java.lang.String, com.yahoo.language.Language)", - "public abstract com.yahoo.tensor.Tensor encode(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)" + "public abstract java.util.List embed(java.lang.String, com.yahoo.language.Language)", + "public abstract com.yahoo.tensor.Tensor embed(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)" ], "fields": [ - "public static final com.yahoo.language.process.Encoder throwsOnUse" + "public static final com.yahoo.language.process.Embedder throwsOnUse" ] }, "com.yahoo.language.process.GramSplitter$Gram": { diff --git 
a/linguistics/src/main/java/com/yahoo/language/process/Embedder.java b/linguistics/src/main/java/com/yahoo/language/process/Embedder.java new file mode 100644 index 00000000000..56c401a7c61 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Embedder.java @@ -0,0 +1,56 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; +import com.yahoo.tensor.Tensor; +import com.yahoo.tensor.TensorType; + +import java.util.List; + +/** + * An embedder converts a text string to a tensor + * + * @author bratseth + */ +public interface Embedder { + + /** An instance of this which throws IllegalStateException if use is attempted */ + Embedder throwsOnUse = new FailingEmbedder(); + + /** + * Converts text into a list of token ids (a vector embedding) + * + * @param text the text to embed + * @param language the language of the text, or UNKNOWN to use language independent embedding + * @return the text embedded as a list of token ids + * @throws IllegalArgumentException if the language is not supported by this embedder + */ + List embed(String text, Language language); + + /** + * Converts text into tokens in a tensor. + * The information contained in the embedding may depend on the tensor type. 
+ * + * @param text the text to embed + * @param language the language of the text, or UNKNOWN to use language independent embedding + * @param tensorType the type of the tensor to be returned + * @return the tensor embedding of the text, as the specified tensor type + * @throws IllegalArgumentException if the language or tensor type is not supported by this embedder + */ + Tensor embed(String text, Language language, TensorType tensorType); + + class FailingEmbedder implements Embedder { + + @Override + public List embed(String text, Language language) { + throw new IllegalStateException("No embedder has been configured"); + } + + @Override + public Tensor embed(String text, Language language, TensorType tensorType) { + throw new IllegalStateException("No embedder has been configured"); + } + + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Encoder.java b/linguistics/src/main/java/com/yahoo/language/process/Encoder.java deleted file mode 100644 index 27f73d15e54..00000000000 --- a/linguistics/src/main/java/com/yahoo/language/process/Encoder.java +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.process; - -import com.yahoo.language.Language; -import com.yahoo.tensor.Tensor; -import com.yahoo.tensor.TensorType; - -import java.util.List; - -/** - * An encoder converts a text string to a tensor or list of tokens - * - * @author bratseth - */ -public interface Encoder { - - /** An instance of this which throws IllegalStateException if attempted used */ - Encoder throwsOnUse = new FailingEncoder(); - - /** - * Encodes text into tokens in a list of ids. 
- * - * @param text the text to encode - * @param language the language of the text, or UNKNOWN to use language independent encoding - * @return the text encoded to a list of segment ids - * @throws IllegalArgumentException if the language is not supported by this encoder - */ - List encode(String text, Language language); - - /** - * Encodes text into tokens in a tensor. - * The information contained in the encoding may depend on the tensor type. - * - * @param text the text to encode - * @param language the language of the text, or UNKNOWN to use language independent encoding - * @param tensorType the type of the ttensor to be returned - * @return the tex encoded into a tensor of the supplied type - * @throws IllegalArgumentException if the language or tensor type is not supported by this encoder - */ - Tensor encode(String text, Language language, TensorType tensorType); - - class FailingEncoder implements Encoder { - - @Override - public List encode(String text, Language language) { - throw new IllegalStateException("No encoder has been configured"); - } - - @Override - public Tensor encode(String text, Language language, TensorType tensorType) { - throw new IllegalStateException("No encoder has been configured"); - } - - } - -} -- cgit v1.2.3