diff options
author | Jon Bratseth <bratseth@oath.com> | 2021-09-28 21:51:45 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-09-28 21:51:45 +0200 |
commit | 12a415efca5749433fd22424592ddc18f04160f6 (patch) | |
tree | 9324cb3aa2aabfa4fb8f0dc5fc0f7639869db7b1 | |
parent | b57543dc1a1e3d32bcd03afb7af972490d691bf1 (diff) | |
parent | e7e659e9d26401c8c36300d4760d4e34acd26d0a (diff) |
Merge pull request #19337 from vespa-engine/bratseth/encoder-to-embedder (tag: v7.474.25)
encode -> embed
43 files changed, 267 insertions, 292 deletions
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java b/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java index 9c89517f72d..02df81fbbb3 100644 --- a/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java +++ b/config-model/src/main/java/com/yahoo/searchdefinition/document/SDField.java @@ -9,7 +9,7 @@ import com.yahoo.document.MapDataType; import com.yahoo.document.StructDataType; import com.yahoo.document.TensorDataType; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.searchdefinition.Index; import com.yahoo.searchdefinition.Search; @@ -426,12 +426,12 @@ public class SDField extends Field implements TypedKey, FieldOperationContainer, /** Parse an indexing expression which will use the simple linguistics implementatino suitable for testing */ public void parseIndexingScript(String script) { - parseIndexingScript(script, new SimpleLinguistics(), Encoder.throwsOnUse); + parseIndexingScript(script, new SimpleLinguistics(), Embedder.throwsOnUse); } - public void parseIndexingScript(String script, Linguistics linguistics, Encoder encoder) { + public void parseIndexingScript(String script, Linguistics linguistics, Embedder embedder) { try { - ScriptParserContext config = new ScriptParserContext(linguistics, encoder); + ScriptParserContext config = new ScriptParserContext(linguistics, embedder); config.setInputStream(new IndexingInput(script)); setIndexingScript(ScriptExpression.newInstance(config)); } catch (ParseException e) { diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java b/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java index 87fa74b92fe..18e187fd921 100644 --- 
a/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java +++ b/config-model/src/main/java/com/yahoo/searchdefinition/fieldoperation/IndexingOperation.java @@ -2,7 +2,7 @@ package com.yahoo.searchdefinition.fieldoperation; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.searchdefinition.document.SDField; import com.yahoo.searchdefinition.parser.ParseException; @@ -30,13 +30,13 @@ public class IndexingOperation implements FieldOperation { /** Creates an indexing operation which will use the simple linguistics implementation suitable for testing */ public static IndexingOperation fromStream(SimpleCharStream input, boolean multiLine) throws ParseException { - return fromStream(input, multiLine, new SimpleLinguistics(), Encoder.throwsOnUse); + return fromStream(input, multiLine, new SimpleLinguistics(), Embedder.throwsOnUse); } public static IndexingOperation fromStream(SimpleCharStream input, boolean multiLine, - Linguistics linguistics, Encoder encoder) + Linguistics linguistics, Embedder embedder) throws ParseException { - ScriptParserContext config = new ScriptParserContext(linguistics, encoder); + ScriptParserContext config = new ScriptParserContext(linguistics, embedder); config.setAnnotatorConfig(new AnnotatorConfig()); config.setInputStream(input); ScriptExpression exp; diff --git a/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java b/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java index 5574082e334..0bb04a1266d 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/container/ApplicationContainerCluster.java @@ -100,7 +100,7 @@ public final class ApplicationContainerCluster extends 
ContainerCluster<Applicat .collect(Collectors.toUnmodifiableSet()); addSimpleComponent("com.yahoo.language.provider.DefaultLinguisticsProvider"); - addSimpleComponent("com.yahoo.language.provider.DefaultEncoderProvider"); + addSimpleComponent("com.yahoo.language.provider.DefaultEmbedderProvider"); addSimpleComponent("com.yahoo.container.jdisc.SecretStoreProvider"); addSimpleComponent("com.yahoo.container.jdisc.DeprecatedSecretStoreProvider"); addSimpleComponent("com.yahoo.container.jdisc.CertificateStoreProvider"); diff --git a/config-model/src/main/javacc/SDParser.jj b/config-model/src/main/javacc/SDParser.jj index 7df77588fe8..6fd2f09f445 100644 --- a/config-model/src/main/javacc/SDParser.jj +++ b/config-model/src/main/javacc/SDParser.jj @@ -58,7 +58,7 @@ import com.yahoo.config.application.api.DeployLogger; import com.yahoo.config.application.api.FileRegistry; import com.yahoo.config.model.api.ModelContext; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.search.query.ranking.Diversity; import java.util.Map; @@ -112,7 +112,7 @@ public class SDParser { */ @SuppressWarnings("deprecation") private IndexingOperation newIndexingOperation(boolean multiline) throws ParseException { - return newIndexingOperation(multiline, new SimpleLinguistics(), Encoder.throwsOnUse); + return newIndexingOperation(multiline, new SimpleLinguistics(), Embedder.throwsOnUse); } /** @@ -121,13 +121,13 @@ public class SDParser { * @param multiline Whether or not to allow multi-line expressions. * @param linguistics What to use for tokenizing. 
*/ - private IndexingOperation newIndexingOperation(boolean multiline, Linguistics linguistics, Encoder encoder) throws ParseException { + private IndexingOperation newIndexingOperation(boolean multiline, Linguistics linguistics, Embedder embedder) throws ParseException { SimpleCharStream input = (SimpleCharStream)token_source.input_stream; if (token.next != null) { input.backup(token.next.image.length()); } try { - return IndexingOperation.fromStream(input, multiline, linguistics, encoder); + return IndexingOperation.fromStream(input, multiline, linguistics, embedder); } finally { token.next = null; jj_ntk = -1; diff --git a/container-core/src/main/java/com/yahoo/container/core/config/testutil/HandlersConfigurerTestWrapper.java b/container-core/src/main/java/com/yahoo/container/core/config/testutil/HandlersConfigurerTestWrapper.java index 0c4709e4a2c..e6231f11ae5 100644 --- a/container-core/src/main/java/com/yahoo/container/core/config/testutil/HandlersConfigurerTestWrapper.java +++ b/container-core/src/main/java/com/yahoo/container/core/config/testutil/HandlersConfigurerTestWrapper.java @@ -17,7 +17,7 @@ import com.yahoo.jdisc.Metric; import com.yahoo.jdisc.handler.RequestHandler; import com.yahoo.jdisc.test.MockMetric; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import java.io.File; @@ -141,7 +141,7 @@ public class HandlersConfigurerTestWrapper { protected void configure() { // Needed by e.g. 
SearchHandler bind(Linguistics.class).to(SimpleLinguistics.class).in(Scopes.SINGLETON); - bind(Encoder.class).to(Encoder.FailingEncoder.class).in(Scopes.SINGLETON); + bind(Embedder.class).to(Embedder.FailingEmbedder.class).in(Scopes.SINGLETON); bind(ContainerThreadPool.class).to(SimpleContainerThreadpool.class); bind(Metric.class).to(MockMetric.class); } diff --git a/container-core/src/main/java/com/yahoo/language/provider/DefaultEncoderProvider.java b/container-core/src/main/java/com/yahoo/language/provider/DefaultEmbedderProvider.java index f8550d04d1c..ef371106b7d 100644 --- a/container-core/src/main/java/com/yahoo/language/provider/DefaultEncoderProvider.java +++ b/container-core/src/main/java/com/yahoo/language/provider/DefaultEmbedderProvider.java @@ -3,27 +3,22 @@ package com.yahoo.language.provider; import com.google.inject.Inject; import com.yahoo.container.di.componentgraph.Provider; -import com.yahoo.language.Language; -import com.yahoo.language.process.Encoder; -import com.yahoo.tensor.Tensor; -import com.yahoo.tensor.TensorType; - -import java.util.List; +import com.yahoo.language.process.Embedder; /** - * Provides the default encoder implementation if no encoder component has been explicitly configured + * Provides the default embedder implementation if no embedder component has been explicitly configured * (dependency injection will fallback to providers if no components of the requested type is found). 
* * @author bratseth */ @SuppressWarnings("unused") // Injected -public class DefaultEncoderProvider implements Provider<Encoder> { +public class DefaultEmbedderProvider implements Provider<Embedder> { @Inject - public DefaultEncoderProvider() { } + public DefaultEmbedderProvider() { } @Override - public Encoder get() { return Encoder.throwsOnUse; } + public Embedder get() { return Embedder.throwsOnUse; } @Override public void deconstruct() {} diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json index 7016eff3185..40071f90c34 100644 --- a/container-search/abi-spec.json +++ b/container-search/abi-spec.json @@ -1801,8 +1801,8 @@ "public java.util.Map getRequestMap()", "public com.yahoo.search.Query$Builder setQueryProfile(com.yahoo.search.query.profile.compiled.CompiledQueryProfile)", "public com.yahoo.search.query.profile.compiled.CompiledQueryProfile getQueryProfile()", - "public com.yahoo.search.Query$Builder setEncoder(com.yahoo.language.process.Encoder)", - "public com.yahoo.language.process.Encoder getEncoder()", + "public com.yahoo.search.Query$Builder setEmbedder(com.yahoo.language.process.Embedder)", + "public com.yahoo.language.process.Embedder getEmbedder()", "public com.yahoo.search.Query build()" ], "fields": [] @@ -4258,7 +4258,7 @@ "public" ], "methods": [ - "public void <init>(com.yahoo.statistics.Statistics, com.yahoo.jdisc.Metric, com.yahoo.container.handler.threadpool.ContainerThreadPool, com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.container.core.ContainerHttpConfig, com.yahoo.language.process.Encoder, com.yahoo.search.searchchain.ExecutionFactory)", + "public void <init>(com.yahoo.statistics.Statistics, com.yahoo.jdisc.Metric, com.yahoo.container.handler.threadpool.ContainerThreadPool, com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.container.core.ContainerHttpConfig, com.yahoo.language.process.Embedder, com.yahoo.search.searchchain.ExecutionFactory)", 
"public void <init>(com.yahoo.statistics.Statistics, com.yahoo.jdisc.Metric, com.yahoo.container.handler.threadpool.ContainerThreadPool, com.yahoo.container.logging.AccessLog, com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.container.core.ContainerHttpConfig, com.yahoo.search.searchchain.ExecutionFactory)", "public void <init>(com.yahoo.statistics.Statistics, com.yahoo.jdisc.Metric, java.util.concurrent.Executor, com.yahoo.container.logging.AccessLog, com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.container.core.ContainerHttpConfig, com.yahoo.search.searchchain.ExecutionFactory)", "public void <init>(com.yahoo.statistics.Statistics, com.yahoo.jdisc.Metric, java.util.concurrent.Executor, com.yahoo.container.logging.AccessLog, com.yahoo.search.query.profile.config.QueryProfilesConfig, com.yahoo.container.core.ContainerHttpConfig, com.yahoo.search.searchchain.ExecutionFactory)", @@ -5885,7 +5885,7 @@ ], "methods": [ "public void <init>(com.yahoo.search.query.profile.compiled.CompiledQueryProfile)", - "public void <init>(com.yahoo.search.query.profile.compiled.CompiledQueryProfile, com.yahoo.language.process.Encoder)", + "public void <init>(com.yahoo.search.query.profile.compiled.CompiledQueryProfile, com.yahoo.language.process.Embedder)", "public com.yahoo.search.query.profile.compiled.CompiledQueryProfile getQueryProfile()", "public java.lang.Object get(com.yahoo.processing.request.CompoundName, java.util.Map, com.yahoo.processing.request.Properties)", "public void set(com.yahoo.processing.request.CompoundName, java.lang.Object, java.util.Map)", @@ -6259,7 +6259,7 @@ "public" ], "methods": [ - "public void <init>(com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.language.process.Encoder, java.util.Map)", + "public void <init>(com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.language.process.Embedder, java.util.Map)", "public static 
com.yahoo.search.query.profile.types.ConversionContext empty()" ], "fields": [] @@ -6531,7 +6531,7 @@ "public" ], "methods": [ - "public void <init>(com.yahoo.search.Query, com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.language.process.Encoder)", + "public void <init>(com.yahoo.search.Query, com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry, com.yahoo.language.process.Embedder)", "public void setParentQuery(com.yahoo.search.Query)", "public java.lang.Object get(com.yahoo.processing.request.CompoundName, java.util.Map, com.yahoo.processing.request.Properties)", "public void set(com.yahoo.processing.request.CompoundName, java.lang.Object, java.util.Map)", diff --git a/container-search/src/main/java/com/yahoo/search/Query.java b/container-search/src/main/java/com/yahoo/search/Query.java index 06b71599103..08ebd74da5a 100644 --- a/container-search/src/main/java/com/yahoo/search/Query.java +++ b/container-search/src/main/java/com/yahoo/search/Query.java @@ -7,7 +7,7 @@ import com.yahoo.collections.Tuple2; import com.yahoo.component.Version; import com.yahoo.container.jdisc.HttpRequest; import com.yahoo.fs4.MapEncoder; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.prelude.fastsearch.DocumentDatabase; import com.yahoo.prelude.query.Highlight; import com.yahoo.prelude.query.textualrepresentation.TextualQueryRepresentation; @@ -334,32 +334,32 @@ public class Query extends com.yahoo.processing.Request implements Cloneable { public Query(HttpRequest request, Map<String, String> requestMap, CompiledQueryProfile queryProfile) { super(new QueryPropertyAliases(propertyAliases)); this.httpRequest = request; - init(requestMap, queryProfile, Encoder.throwsOnUse); + init(requestMap, queryProfile, Embedder.throwsOnUse); } // TODO: Deprecate most constructors above here private Query(Builder builder) { - this(builder.getRequest(), builder.getRequestMap(), 
builder.getQueryProfile(), builder.getEncoder()); + this(builder.getRequest(), builder.getRequestMap(), builder.getQueryProfile(), builder.getEmbedder()); } - private Query(HttpRequest request, Map<String, String> requestMap, CompiledQueryProfile queryProfile, Encoder encoder) { + private Query(HttpRequest request, Map<String, String> requestMap, CompiledQueryProfile queryProfile, Embedder embedder) { super(new QueryPropertyAliases(propertyAliases)); this.httpRequest = request; - init(requestMap, queryProfile, encoder); + init(requestMap, queryProfile, embedder); } - private void init(Map<String, String> requestMap, CompiledQueryProfile queryProfile, Encoder encoder) { + private void init(Map<String, String> requestMap, CompiledQueryProfile queryProfile, Embedder embedder) { startTime = httpRequest.getJDiscRequest().creationTime(TimeUnit.MILLISECONDS); if (queryProfile != null) { // Move all request parameters to the query profile just to validate that the parameter settings are legal - Properties queryProfileProperties = new QueryProfileProperties(queryProfile, encoder); + Properties queryProfileProperties = new QueryProfileProperties(queryProfile, embedder); properties().chain(queryProfileProperties); // TODO: Just checking legality rather than actually setting would be faster setPropertiesFromRequestMap(requestMap, properties(), true); // Adds errors to the query for illegal set attempts // Create the full chain - properties().chain(new QueryProperties(this, queryProfile.getRegistry(), encoder)). + properties().chain(new QueryProperties(this, queryProfile.getRegistry(), embedder)). chain(new ModelObjectMap()). chain(new RequestContextProperties(requestMap)). chain(queryProfileProperties). @@ -378,7 +378,7 @@ public class Query extends com.yahoo.processing.Request implements Cloneable { } else { // bypass these complications if there is no query profile to get values from and validate against properties(). 
- chain(new QueryProperties(this, CompiledQueryProfileRegistry.empty, encoder)). + chain(new QueryProperties(this, CompiledQueryProfileRegistry.empty, embedder)). chain(new PropertyMap()). chain(new DefaultProperties()); setPropertiesFromRequestMap(requestMap, properties(), false); @@ -1130,7 +1130,7 @@ public class Query extends com.yahoo.processing.Request implements Cloneable { private HttpRequest request = null; private Map<String, String> requestMap = null; private CompiledQueryProfile queryProfile = null; - private Encoder encoder = Encoder.throwsOnUse; + private Embedder embedder = Embedder.throwsOnUse; public Builder setRequest(String query) { request = HttpRequest.createTestRequest(query, com.yahoo.jdisc.http.HttpRequest.Method.GET); @@ -1168,12 +1168,12 @@ public class Query extends com.yahoo.processing.Request implements Cloneable { /** Returns the query profile of this query, or null if none. */ public CompiledQueryProfile getQueryProfile() { return queryProfile; } - public Builder setEncoder(Encoder encoder) { - this.encoder = encoder; + public Builder setEmbedder(Embedder embedder) { + this.embedder = embedder; return this; } - public Encoder getEncoder() { return encoder; } + public Embedder getEmbedder() { return embedder; } /** Creates a new query from this builder. No properties are required to before calling this. 
*/ public Query build() { return new Query(this); } diff --git a/container-search/src/main/java/com/yahoo/search/handler/SearchHandler.java b/container-search/src/main/java/com/yahoo/search/handler/SearchHandler.java index d1e57a30206..c15aef44f3d 100644 --- a/container-search/src/main/java/com/yahoo/search/handler/SearchHandler.java +++ b/container-search/src/main/java/com/yahoo/search/handler/SearchHandler.java @@ -23,7 +23,7 @@ import com.yahoo.io.IOUtils; import com.yahoo.jdisc.Metric; import com.yahoo.jdisc.Request; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.net.HostName; import com.yahoo.net.UriTools; import com.yahoo.prelude.query.parser.ParseException; @@ -106,7 +106,7 @@ public class SearchHandler extends LoggingRequestHandler { private final String selfHostname = HostName.getLocalhost(); - private final Encoder encoder; + private final Embedder embedder; private final ExecutionFactory executionFactory; @@ -134,9 +134,9 @@ public class SearchHandler extends LoggingRequestHandler { ContainerThreadPool threadpool, CompiledQueryProfileRegistry queryProfileRegistry, ContainerHttpConfig config, - Encoder encoder, + Embedder embedder, ExecutionFactory executionFactory) { - this(statistics, metric, threadpool.executor(), queryProfileRegistry, encoder, executionFactory, + this(statistics, metric, threadpool.executor(), queryProfileRegistry, embedder, executionFactory, config.numQueriesToTraceOnDebugAfterConstruction(), config.hostResponseHeaderKey().equals("") ? Optional.empty() : Optional.of(config.hostResponseHeaderKey())); } @@ -170,7 +170,7 @@ public class SearchHandler extends LoggingRequestHandler { metric, executor, queryProfileRegistry, - Encoder.throwsOnUse, + Embedder.throwsOnUse, executionFactory, containerHttpConfig.numQueriesToTraceOnDebugAfterConstruction(), containerHttpConfig.hostResponseHeaderKey().equals("") ? 
@@ -192,7 +192,7 @@ public class SearchHandler extends LoggingRequestHandler { metric, executor, QueryProfileConfigurer.createFromConfig(queryProfileConfig).compile(), - Encoder.throwsOnUse, + Embedder.throwsOnUse, executionFactory, containerHttpConfig.numQueriesToTraceOnDebugAfterConstruction(), containerHttpConfig.hostResponseHeaderKey().equals("") ? @@ -210,7 +210,7 @@ public class SearchHandler extends LoggingRequestHandler { CompiledQueryProfileRegistry queryProfileRegistry, ExecutionFactory executionFactory, Optional<String> hostResponseHeaderKey) { - this(statistics, metric, executor, queryProfileRegistry, Encoder.throwsOnUse, + this(statistics, metric, executor, queryProfileRegistry, Embedder.throwsOnUse, executionFactory, 0, hostResponseHeaderKey); } @@ -218,14 +218,14 @@ public class SearchHandler extends LoggingRequestHandler { Metric metric, Executor executor, CompiledQueryProfileRegistry queryProfileRegistry, - Encoder encoder, + Embedder embedder, ExecutionFactory executionFactory, long numQueriesToTraceOnDebugAfterStartup, Optional<String> hostResponseHeaderKey) { super(executor, metric, true); log.log(Level.FINE, () -> "SearchHandler.init " + System.identityHashCode(this)); this.queryProfileRegistry = queryProfileRegistry; - this.encoder = encoder; + this.embedder = embedder; this.executionFactory = executionFactory; this.maxThreads = examineExecutor(executor); @@ -332,7 +332,7 @@ public class SearchHandler extends LoggingRequestHandler { Query query = new Query.Builder().setRequest(request) .setRequestMap(requestMap) .setQueryProfile(queryProfile) - .setEncoder(encoder) + .setEmbedder(embedder) .build(); boolean benchmarking = VespaHeaders.benchmarkOutput(request); diff --git a/container-search/src/main/java/com/yahoo/search/query/profile/QueryProfileProperties.java b/container-search/src/main/java/com/yahoo/search/query/profile/QueryProfileProperties.java index e555000272d..53be827073c 100644 --- 
a/container-search/src/main/java/com/yahoo/search/query/profile/QueryProfileProperties.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/QueryProfileProperties.java @@ -2,7 +2,7 @@ package com.yahoo.search.query.profile; import com.yahoo.collections.Pair; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.processing.IllegalInputException; import com.yahoo.processing.request.CompoundName; import com.yahoo.processing.request.properties.PropertyMap; @@ -30,7 +30,7 @@ import java.util.Map; public class QueryProfileProperties extends Properties { private final CompiledQueryProfile profile; - private final Encoder encoder; + private final Embedder embedder; // Note: The priority order is: values has precedence over references @@ -45,14 +45,14 @@ public class QueryProfileProperties extends Properties { private List<Pair<CompoundName, CompiledQueryProfile>> references = null; public QueryProfileProperties(CompiledQueryProfile profile) { - this(profile, Encoder.throwsOnUse); + this(profile, Embedder.throwsOnUse); } /** Creates an instance from a profile, throws an exception if the given profile is null */ - public QueryProfileProperties(CompiledQueryProfile profile, Encoder encoder) { + public QueryProfileProperties(CompiledQueryProfile profile, Embedder embedder) { Validator.ensureNotNull("The profile wrapped by this cannot be null", profile); this.profile = profile; - this.encoder = encoder; + this.embedder = embedder; } /** Returns the query profile backing this, or null if none */ @@ -122,7 +122,7 @@ public class QueryProfileProperties extends Properties { if (fieldDescription != null) { if (i == name.size() - 1) { // at the end of the path, check the assignment type value = fieldDescription.getType().convertFrom(value, new ConversionContext(profile.getRegistry(), - encoder, + embedder, context)); if (value == null) throw new IllegalInputException("'" + value + "' is not a " + diff --git 
a/container-search/src/main/java/com/yahoo/search/query/profile/types/ConversionContext.java b/container-search/src/main/java/com/yahoo/search/query/profile/types/ConversionContext.java index 4aa95741b06..e5b9eb1c1cd 100644 --- a/container-search/src/main/java/com/yahoo/search/query/profile/types/ConversionContext.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/types/ConversionContext.java @@ -2,7 +2,7 @@ package com.yahoo.search.query.profile.types; import com.yahoo.language.Language; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry; import java.util.Map; @@ -13,12 +13,12 @@ import java.util.Map; public class ConversionContext { private final CompiledQueryProfileRegistry registry; - private final Encoder encoder; + private final Embedder embedder; private final Language language; - public ConversionContext(CompiledQueryProfileRegistry registry, Encoder encoder, Map<String, String> context) { + public ConversionContext(CompiledQueryProfileRegistry registry, Embedder embedder, Map<String, String> context) { this.registry = registry; - this.encoder = encoder; + this.embedder = embedder; this.language = context.containsKey("language") ? 
Language.fromLanguageTag(context.get("language")) : Language.UNKNOWN; } @@ -27,14 +27,14 @@ public class ConversionContext { CompiledQueryProfileRegistry getRegistry() {return registry;} /** Returns the configured encoder, never null */ - Encoder getEncoder() { return encoder; } + Embedder getEncoder() { return embedder; } /** Returns the language, which is never null but may be UNKNOWN */ Language getLanguage() { return language; } /** Returns an empty context */ public static ConversionContext empty() { - return new ConversionContext(null, Encoder.throwsOnUse, Map.of()); + return new ConversionContext(null, Embedder.throwsOnUse, Map.of()); } } diff --git a/container-search/src/main/java/com/yahoo/search/query/profile/types/FieldType.java b/container-search/src/main/java/com/yahoo/search/query/profile/types/FieldType.java index 511b64c7b6e..7a06f9ef534 100644 --- a/container-search/src/main/java/com/yahoo/search/query/profile/types/FieldType.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/types/FieldType.java @@ -1,10 +1,8 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.search.query.profile.types; -import com.yahoo.language.process.Encoder; import com.yahoo.search.query.profile.QueryProfile; import com.yahoo.search.query.profile.QueryProfileRegistry; -import com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry; import com.yahoo.search.yql.YqlQuery; import com.yahoo.tensor.Tensor; import com.yahoo.tensor.TensorType; diff --git a/container-search/src/main/java/com/yahoo/search/query/profile/types/PrimitiveFieldType.java b/container-search/src/main/java/com/yahoo/search/query/profile/types/PrimitiveFieldType.java index b1a9820c6fa..f9d8950908b 100644 --- a/container-search/src/main/java/com/yahoo/search/query/profile/types/PrimitiveFieldType.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/types/PrimitiveFieldType.java @@ -1,9 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.search.query.profile.types; -import com.yahoo.language.process.Encoder; import com.yahoo.search.query.profile.QueryProfileRegistry; -import com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry; import static com.yahoo.text.Lowercase.toLowerCase; diff --git a/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryFieldType.java b/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryFieldType.java index 09c1a4d0cc0..cbae6402039 100644 --- a/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryFieldType.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryFieldType.java @@ -1,9 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.search.query.profile.types; -import com.yahoo.language.process.Encoder; import com.yahoo.search.query.profile.QueryProfileRegistry; -import com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry; import com.yahoo.search.yql.YqlQuery; /** diff --git a/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryProfileFieldType.java b/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryProfileFieldType.java index 6958318bee4..ff12224823f 100644 --- a/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryProfileFieldType.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/types/QueryProfileFieldType.java @@ -1,11 +1,9 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.search.query.profile.types; -import com.yahoo.language.process.Encoder; import com.yahoo.search.query.profile.QueryProfile; import com.yahoo.search.query.profile.QueryProfileRegistry; import com.yahoo.search.query.profile.compiled.CompiledQueryProfile; -import com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry; /** * Represents a query profile field type which is a reference to a query profile. 
diff --git a/container-search/src/main/java/com/yahoo/search/query/profile/types/TensorFieldType.java b/container-search/src/main/java/com/yahoo/search/query/profile/types/TensorFieldType.java index 34a9f8d41c3..cd21f0b3a61 100644 --- a/container-search/src/main/java/com/yahoo/search/query/profile/types/TensorFieldType.java +++ b/container-search/src/main/java/com/yahoo/search/query/profile/types/TensorFieldType.java @@ -2,9 +2,8 @@ package com.yahoo.search.query.profile.types; import com.yahoo.language.Language; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.search.query.profile.QueryProfileRegistry; -import com.yahoo.search.query.profile.compiled.CompiledQueryProfileRegistry; import com.yahoo.tensor.Tensor; import com.yahoo.tensor.TensorType; @@ -48,18 +47,18 @@ public class TensorFieldType extends FieldType { return convertFrom(o, context.getEncoder(), context.getLanguage()); } - private Object convertFrom(Object o, Encoder encoder, Language language) { + private Object convertFrom(Object o, Embedder embedder, Language language) { if (o instanceof Tensor) return o; - if (o instanceof String && ((String)o).startsWith("encode(")) return encode((String)o, encoder, language); + if (o instanceof String && ((String)o).startsWith("embed(")) return encode((String)o, embedder, language); if (o instanceof String) return Tensor.from(type, (String)o); return null; } - private Tensor encode(String s, Encoder encoder, Language language) { + private Tensor encode(String s, Embedder embedder, Language language) { if ( ! 
s.endsWith(")")) - throw new IllegalArgumentException("Expected any string enclosed in encode(), but the argument does not end by ')'"); - String text = s.substring("encode(".length(), s.length() - 1); - return encoder.encode(text, language, type); + throw new IllegalArgumentException("Expected any string enclosed in embed(), but the argument does not end by ')'"); + String text = s.substring("embed(".length(), s.length() - 1); + return embedder.embed(text, language, type); } public static TensorFieldType fromTypeString(String s) { diff --git a/container-search/src/main/java/com/yahoo/search/query/properties/QueryProperties.java b/container-search/src/main/java/com/yahoo/search/query/properties/QueryProperties.java index 02648f84066..3a426656185 100644 --- a/container-search/src/main/java/com/yahoo/search/query/properties/QueryProperties.java +++ b/container-search/src/main/java/com/yahoo/search/query/properties/QueryProperties.java @@ -1,7 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.search.query.properties; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.processing.IllegalInputException; import com.yahoo.processing.request.CompoundName; import com.yahoo.search.Query; @@ -34,12 +34,12 @@ public class QueryProperties extends Properties { private Query query; private final CompiledQueryProfileRegistry profileRegistry; - private final Encoder encoder; + private final Embedder embedder; - public QueryProperties(Query query, CompiledQueryProfileRegistry profileRegistry, Encoder encoder) { + public QueryProperties(Query query, CompiledQueryProfileRegistry profileRegistry, Embedder embedder) { this.query = query; this.profileRegistry = profileRegistry; - this.encoder = encoder; + this.embedder = embedder; } public void setParentQuery(Query query) { @@ -380,7 +380,7 @@ public class QueryProperties extends Properties { if (type == null) return value; // no type info -> keep as string FieldDescription field = type.getField(key); if (field == null) return value; // ditto - return field.getType().convertFrom(value, new ConversionContext(profileRegistry, encoder, context)); + return field.getType().convertFrom(value, new ConversionContext(profileRegistry, embedder, context)); } private void throwIllegalParameter(String key,String namespace) { diff --git a/container-search/src/test/java/com/yahoo/search/grouping/vespa/IntegerEncoderTestCase.java b/container-search/src/test/java/com/yahoo/search/grouping/vespa/IntegerEmbedderTestCase.java index 3b48ae35fcf..18a9f11e15e 100644 --- a/container-search/src/test/java/com/yahoo/search/grouping/vespa/IntegerEncoderTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/grouping/vespa/IntegerEmbedderTestCase.java @@ -8,7 +8,7 @@ import static org.junit.Assert.assertEquals; /** * @author Simon Thoresen Hult */ -public class IntegerEncoderTestCase { +public class IntegerEmbedderTestCase { @Test public void 
requireThatIntEncoderWorksAsExpected() { diff --git a/container-search/src/test/java/com/yahoo/search/query/profile/types/test/QueryProfileTypeTestCase.java b/container-search/src/test/java/com/yahoo/search/query/profile/types/test/QueryProfileTypeTestCase.java index 45f53a1cdb9..e22263070e0 100644 --- a/container-search/src/test/java/com/yahoo/search/query/profile/types/test/QueryProfileTypeTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/query/profile/types/test/QueryProfileTypeTestCase.java @@ -4,7 +4,7 @@ package com.yahoo.search.query.profile.types.test; import com.yahoo.component.ComponentId; import com.yahoo.container.jdisc.HttpRequest; import com.yahoo.language.Language; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.tensor.Tensor; import com.yahoo.tensor.TensorType; import com.yahoo.yolean.Exceptions; @@ -22,7 +22,6 @@ import com.yahoo.search.query.profile.types.QueryProfileTypeRegistry; import org.junit.Before; import org.junit.Test; -import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.nio.charset.StandardCharsets; import java.util.List; @@ -441,19 +440,19 @@ public class QueryProfileTypeTestCase { } @Test - public void testUnencodedTensorRankFeatureInRequest() { + public void testUnembeddedTensorRankFeatureInRequest() { QueryProfile profile = new QueryProfile("test"); profile.setType(testtype); registry.register(profile); CompiledQueryProfileRegistry cRegistry = registry.compile(); - String textToEncode = "text to encode as tensor"; + String textToEmbed = "text to embed into a tensor"; Tensor expectedTensor = Tensor.from("tensor<float>(x[5]):[3,7,4,0,0]]"); Query query1 = new Query.Builder().setRequest(HttpRequest.createTestRequest("?" 
+ urlEncode("ranking.features.query(myTensor4)") + - "=" + urlEncode("encode(" + textToEncode + ")"), + "=" + urlEncode("embed(" + textToEmbed + ")"), com.yahoo.jdisc.http.HttpRequest.Method.GET)) .setQueryProfile(cRegistry.getComponent("test")) - .setEncoder(new MockEncoder(textToEncode, Language.UNKNOWN, expectedTensor)) + .setEmbedder(new MockEmbedder(textToEmbed, Language.UNKNOWN, expectedTensor)) .build(); assertEquals(0, query1.errors().size()); assertEquals(expectedTensor, query1.properties().get("ranking.features.query(myTensor4)")); @@ -461,11 +460,11 @@ public class QueryProfileTypeTestCase { // Explicit language Query query2 = new Query.Builder().setRequest(HttpRequest.createTestRequest("?" + urlEncode("ranking.features.query(myTensor4)") + - "=" + urlEncode("encode(" + textToEncode + ")") + + "=" + urlEncode("embed(" + textToEmbed + ")") + "&language=en", com.yahoo.jdisc.http.HttpRequest.Method.GET)) .setQueryProfile(cRegistry.getComponent("test")) - .setEncoder(new MockEncoder(textToEncode, Language.ENGLISH, expectedTensor)) + .setEmbedder(new MockEmbedder(textToEmbed, Language.ENGLISH, expectedTensor)) .build(); assertEquals(0, query2.errors().size()); assertEquals(expectedTensor, query2.properties().get("ranking.features.query(myTensor4)")); @@ -723,28 +722,28 @@ public class QueryProfileTypeTestCase { } } - private static final class MockEncoder implements Encoder { + private static final class MockEmbedder implements Embedder { private final String expectedText; private final Language expectedLanguage; private final Tensor tensorToReturn; - public MockEncoder(String expectedText, - Language expectedLanguage, - Tensor tensorToReturn) { + public MockEmbedder(String expectedText, + Language expectedLanguage, + Tensor tensorToReturn) { this.expectedText = expectedText; this.expectedLanguage = expectedLanguage; this.tensorToReturn = tensorToReturn; } @Override - public List<Integer> encode(String text, Language language) { + public List<Integer> 
embed(String text, Language language) { fail("Unexpected call"); return null; } @Override - public Tensor encode(String text, Language language, TensorType tensorType) { + public Tensor embed(String text, Language language, TensorType tensorType) { assertEquals(expectedText, text); assertEquals(expectedLanguage, language); assertEquals(tensorToReturn.type(), tensorType); diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java index 53709c4ff87..f3a67f855e9 100644 --- a/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java +++ b/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java @@ -21,7 +21,7 @@ import com.yahoo.document.config.DocumentmanagerConfig; import com.yahoo.language.Linguistics; import java.util.logging.Level; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.vespa.configdefinition.IlscriptsConfig; import com.yahoo.vespa.indexinglanguage.AdapterFactory; import com.yahoo.vespa.indexinglanguage.SimpleAdapterFactory; @@ -55,9 +55,9 @@ public class IndexingProcessor extends DocumentProcessor { public IndexingProcessor(DocumentmanagerConfig documentmanagerConfig, IlscriptsConfig ilscriptsConfig, Linguistics linguistics, - Encoder encoder) { + Embedder embedder) { docTypeMgr = DocumentTypeManagerConfigurer.configureNewManager(documentmanagerConfig); - scriptMgr = new ScriptManager(docTypeMgr, ilscriptsConfig, linguistics, encoder); + scriptMgr = new ScriptManager(docTypeMgr, ilscriptsConfig, linguistics, embedder); adapterFactory = new SimpleAdapterFactory(new ExpressionSelector()); } diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java index fa5f794f652..7e1d5b5b6ce 100644 --- a/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java +++ 
b/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java @@ -6,7 +6,7 @@ import com.yahoo.document.DocumentTypeManager; import com.yahoo.language.Linguistics; import java.util.logging.Level; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.vespa.configdefinition.IlscriptsConfig; import com.yahoo.vespa.indexinglanguage.ScriptParserContext; import com.yahoo.vespa.indexinglanguage.expressions.InputExpression; @@ -28,9 +28,9 @@ public class ScriptManager { private final Map<String, Map<String, DocumentScript>> documentFieldScripts; private final DocumentTypeManager docTypeMgr; - public ScriptManager(DocumentTypeManager docTypeMgr, IlscriptsConfig config, Linguistics linguistics, Encoder encoder) { + public ScriptManager(DocumentTypeManager docTypeMgr, IlscriptsConfig config, Linguistics linguistics, Embedder embedder) { this.docTypeMgr = docTypeMgr; - documentFieldScripts = createScriptsMap(docTypeMgr, config, linguistics, encoder); + documentFieldScripts = createScriptsMap(docTypeMgr, config, linguistics, embedder); } @@ -75,9 +75,9 @@ public class ScriptManager { private static Map<String, Map<String, DocumentScript>> createScriptsMap(DocumentTypeManager docTypeMgr, IlscriptsConfig config, Linguistics linguistics, - Encoder encoder) { + Embedder embedder) { Map<String, Map<String, DocumentScript>> documentFieldScripts = new HashMap<>(config.ilscript().size()); - ScriptParserContext parserContext = new ScriptParserContext(linguistics, encoder); + ScriptParserContext parserContext = new ScriptParserContext(linguistics, embedder); parserContext.getAnnotatorConfig().setMaxTermOccurrences(config.maxtermoccurrences()); parserContext.getAnnotatorConfig().setMaxTokenLength(config.fieldmatchmaxlength()); diff --git a/docprocs/src/test/java/com/yahoo/docprocs/indexing/IndexingProcessorTestCase.java b/docprocs/src/test/java/com/yahoo/docprocs/indexing/IndexingProcessorTestCase.java index 
dc9b1ffba73..f54435329f9 100644 --- a/docprocs/src/test/java/com/yahoo/docprocs/indexing/IndexingProcessorTestCase.java +++ b/docprocs/src/test/java/com/yahoo/docprocs/indexing/IndexingProcessorTestCase.java @@ -13,7 +13,7 @@ import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.document.update.AssignValueUpdate; import com.yahoo.document.update.FieldUpdate; import com.yahoo.document.update.ValueUpdate; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.configdefinition.IlscriptsConfig; import org.junit.Test; @@ -127,6 +127,6 @@ public class IndexingProcessorTestCase { return new IndexingProcessor(ConfigGetter.getConfig(DocumentmanagerConfig.class, configId), ConfigGetter.getConfig(IlscriptsConfig.class, configId), new SimpleLinguistics(), - Encoder.throwsOnUse); + Embedder.throwsOnUse); } } diff --git a/docprocs/src/test/java/com/yahoo/docprocs/indexing/ScriptManagerTestCase.java b/docprocs/src/test/java/com/yahoo/docprocs/indexing/ScriptManagerTestCase.java index ec05fcbe422..a849f437b44 100644 --- a/docprocs/src/test/java/com/yahoo/docprocs/indexing/ScriptManagerTestCase.java +++ b/docprocs/src/test/java/com/yahoo/docprocs/indexing/ScriptManagerTestCase.java @@ -3,7 +3,7 @@ package com.yahoo.docprocs.indexing; import com.yahoo.document.DocumentType; import com.yahoo.document.DocumentTypeManager; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.vespa.configdefinition.IlscriptsConfig; import com.yahoo.vespa.indexinglanguage.parser.ParseException; import org.junit.Test; @@ -29,7 +29,7 @@ public class ScriptManagerTestCase { IlscriptsConfig.Builder config = new IlscriptsConfig.Builder(); config.ilscript(new IlscriptsConfig.Ilscript.Builder().doctype("newssummary") .content("input title | index title")); - ScriptManager scriptMgr = new ScriptManager(typeMgr, new 
IlscriptsConfig(config), null, Encoder.throwsOnUse); + ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(config), null, Embedder.throwsOnUse); assertNotNull(scriptMgr.getScript(typeMgr.getDocumentType("newsarticle"))); assertNull(scriptMgr.getScript(new DocumentType("unknown"))); } @@ -44,7 +44,7 @@ public class ScriptManagerTestCase { IlscriptsConfig.Builder config = new IlscriptsConfig.Builder(); config.ilscript(new IlscriptsConfig.Ilscript.Builder().doctype("newsarticle") .content("input title | index title")); - ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(config), null, Encoder.throwsOnUse); + ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(config), null, Embedder.throwsOnUse); assertNotNull(scriptMgr.getScript(typeMgr.getDocumentType("newssummary"))); assertNull(scriptMgr.getScript(new DocumentType("unknown"))); } @@ -53,7 +53,7 @@ public class ScriptManagerTestCase { public void requireThatEmptyConfigurationDoesNotThrow() { DocumentTypeManager typeMgr = new DocumentTypeManager(); typeMgr.configure("file:src/test/cfg/documentmanager_inherit.cfg"); - ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(new IlscriptsConfig.Builder()), null, Encoder.throwsOnUse); + ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(new IlscriptsConfig.Builder()), null, Embedder.throwsOnUse); assertNull(scriptMgr.getScript(new DocumentType("unknown"))); } @@ -61,7 +61,7 @@ public class ScriptManagerTestCase { public void requireThatUnknownDocumentTypeReturnsNull() { DocumentTypeManager typeMgr = new DocumentTypeManager(); typeMgr.configure("file:src/test/cfg/documentmanager_inherit.cfg"); - ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(new IlscriptsConfig.Builder()), null, Encoder.throwsOnUse); + ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(new IlscriptsConfig.Builder()), null, Embedder.throwsOnUse); for 
(Iterator<DocumentType> it = typeMgr.documentTypeIterator(); it.hasNext(); ) { assertNull(scriptMgr.getScript(it.next())); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParser.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParser.java index 34da5b47655..649095d1db8 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParser.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParser.java @@ -62,7 +62,7 @@ public final class ScriptParser { parser.setAnnotatorConfig(context.getAnnotatorConfig()); parser.setDefaultFieldName(context.getDefaultFieldName()); parser.setLinguistics(context.getLinguistcs()); - parser.setEncoder(context.getEncoder()); + parser.setEmbedder(context.getEmbedder()); try { return method.call(parser); } catch (ParseException e) { diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParserContext.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParserContext.java index 06be91703fa..77c2af8dd42 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParserContext.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/ScriptParserContext.java @@ -2,8 +2,7 @@ package com.yahoo.vespa.indexinglanguage; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; -import com.yahoo.language.simple.SimpleLinguistics; +import com.yahoo.language.process.Embedder; import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig; import com.yahoo.vespa.indexinglanguage.parser.CharStream; @@ -14,13 +13,13 @@ public class ScriptParserContext { private AnnotatorConfig annotatorConfig = new AnnotatorConfig(); private Linguistics linguistics; - private final Encoder encoder; + private final Embedder embedder; private String defaultFieldName = null; private CharStream inputStream = null; - public 
ScriptParserContext(Linguistics linguistics, Encoder encoder) { + public ScriptParserContext(Linguistics linguistics, Embedder embedder) { this.linguistics = linguistics; - this.encoder = encoder; + this.embedder = embedder; } public AnnotatorConfig getAnnotatorConfig() { @@ -41,8 +40,8 @@ public class ScriptParserContext { return this; } - public Encoder getEncoder() { - return encoder; + public Embedder getEmbedder() { + return embedder; } public String getDefaultFieldName() { diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EmbedExpression.java index f84da9ddef8..aa579ed729e 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EmbedExpression.java @@ -5,25 +5,25 @@ import com.yahoo.document.DataType; import com.yahoo.document.TensorDataType; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.document.datatypes.TensorFieldValue; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.tensor.Tensor; import com.yahoo.tensor.TensorType; /** - * Encodes a string as a tensor using the configured Encoder component + * Embeds a string in a tensor space using the configured Embedder component * * @author bratseth */ -public class EncodeExpression extends Expression { +public class EmbedExpression extends Expression { - private final Encoder encoder; + private final Embedder embedder; - /** The target type we are encoding to. Set during verification. */ + /** The target type we are embedding into. 
*/ private TensorType targetType; - public EncodeExpression(Encoder encoder) { + public EmbedExpression(Embedder embedder) { super(DataType.STRING); - this.encoder = encoder; + this.embedder = embedder; } @Override @@ -34,7 +34,7 @@ public class EncodeExpression extends Expression { @Override protected void doExecute(ExecutionContext context) { StringFieldValue input = (StringFieldValue) context.getValue(); - Tensor tensor = encoder.encode(input.getString(), context.getLanguage(), targetType); + Tensor tensor = embedder.embed(input.getString(), context.getLanguage(), targetType); context.setValue(new TensorFieldValue(tensor)); } @@ -43,7 +43,7 @@ public class EncodeExpression extends Expression { String outputField = context.getOutputField(); if (outputField == null) throw new VerificationException(this, "No output field in this statement: " + - "Don't know what tensor type to encode to."); + "Don't know what tensor type to embed into."); DataType outputFieldType = context.getInputType(this, outputField); if ( ! 
(outputFieldType instanceof TensorDataType) ) throw new VerificationException(this, "The type of the output field " + outputField + @@ -58,12 +58,12 @@ public class EncodeExpression extends Expression { } @Override - public String toString() { return "encode"; } + public String toString() { return "embed"; } @Override public int hashCode() { return 1; } @Override - public boolean equals(Object o) { return o instanceof EncodeExpression; } + public boolean equals(Object o) { return o instanceof EmbedExpression; } } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java index 67459c2b035..20a0c9804a9 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java @@ -6,7 +6,7 @@ import com.yahoo.document.Document; import com.yahoo.document.DocumentUpdate; import com.yahoo.document.datatypes.FieldValue; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.*; import com.yahoo.vespa.indexinglanguage.parser.IndexingInput; @@ -188,11 +188,11 @@ public abstract class Expression extends Selectable { /** Creates an expression with simple lingustics for testing */ public static Expression fromString(String expression) throws ParseException { - return fromString(expression, new SimpleLinguistics(), Encoder.throwsOnUse); + return fromString(expression, new SimpleLinguistics(), Embedder.throwsOnUse); } - public static Expression fromString(String expression, Linguistics linguistics, Encoder encoder) throws ParseException { - return newInstance(new ScriptParserContext(linguistics, encoder).setInputStream(new IndexingInput(expression))); + 
public static Expression fromString(String expression, Linguistics linguistics, Embedder embedder) throws ParseException { + return newInstance(new ScriptParserContext(linguistics, embedder).setInputStream(new IndexingInput(expression))); } public static Expression newInstance(ScriptParserContext context) throws ParseException { diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java index 7317cb2216f..b5f71813de3 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ScriptExpression.java @@ -4,7 +4,7 @@ package com.yahoo.vespa.indexinglanguage.expressions; import com.yahoo.document.DataType; import com.yahoo.document.datatypes.FieldValue; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.ScriptParser; import com.yahoo.vespa.indexinglanguage.ScriptParserContext; @@ -92,11 +92,11 @@ public final class ScriptExpression extends ExpressionList<StatementExpression> /** Creates an expression with simple lingustics for testing */ @SuppressWarnings("deprecation") public static ScriptExpression fromString(String expression) throws ParseException { - return fromString(expression, new SimpleLinguistics(), Encoder.throwsOnUse); + return fromString(expression, new SimpleLinguistics(), Embedder.throwsOnUse); } - public static ScriptExpression fromString(String expression, Linguistics linguistics, Encoder encoder) throws ParseException { - return newInstance(new ScriptParserContext(linguistics, encoder).setInputStream(new IndexingInput(expression))); + public static ScriptExpression fromString(String expression, Linguistics 
linguistics, Embedder embedder) throws ParseException { + return newInstance(new ScriptParserContext(linguistics, embedder).setInputStream(new IndexingInput(expression))); } public static ScriptExpression newInstance(ScriptParserContext config) throws ParseException { diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java index 145133e210d..7d157af1a19 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/StatementExpression.java @@ -2,9 +2,8 @@ package com.yahoo.vespa.indexinglanguage.expressions; import com.yahoo.document.DataType; -import com.yahoo.document.TensorDataType; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.ScriptParser; import com.yahoo.vespa.indexinglanguage.ScriptParserContext; @@ -12,7 +11,6 @@ import com.yahoo.vespa.indexinglanguage.parser.IndexingInput; import com.yahoo.vespa.indexinglanguage.parser.ParseException; import java.util.Arrays; -import java.util.Collection; import java.util.Iterator; import java.util.LinkedList; import java.util.List; @@ -100,11 +98,11 @@ public final class StatementExpression extends ExpressionList<Expression> { /** Creates an expression with simple lingustics for testing */ public static StatementExpression fromString(String expression) throws ParseException { - return fromString(expression, new SimpleLinguistics(), Encoder.throwsOnUse); + return fromString(expression, new SimpleLinguistics(), Embedder.throwsOnUse); } - public static StatementExpression fromString(String expression, Linguistics linguistics, Encoder encoder) throws ParseException 
{ - return newInstance(new ScriptParserContext(linguistics, encoder).setInputStream(new IndexingInput(expression))); + public static StatementExpression fromString(String expression, Linguistics linguistics, Embedder embedder) throws ParseException { + return newInstance(new ScriptParserContext(linguistics, embedder).setInputStream(new IndexingInput(expression))); } public static StatementExpression newInstance(ScriptParserContext config) throws ParseException { diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj index 4533a17954c..3eee4ea6f08 100644 --- a/indexinglanguage/src/main/javacc/IndexingParser.jj +++ b/indexinglanguage/src/main/javacc/IndexingParser.jj @@ -34,7 +34,7 @@ import com.yahoo.text.StringUtilities; import com.yahoo.vespa.indexinglanguage.expressions.*; import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig; import com.yahoo.language.process.StemMode; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.Linguistics; /** @@ -45,7 +45,7 @@ public class IndexingParser { private String defaultFieldName; private Linguistics linguistics; - private Encoder encoder; + private Embedder embedder; private AnnotatorConfig annotatorCfg; public IndexingParser(String str) { @@ -62,8 +62,8 @@ public class IndexingParser { return this; } - public IndexingParser setEncoder(Encoder encoder) { - this.encoder = encoder; + public IndexingParser setEmbedder(Embedder embedder) { + this.embedder = embedder; return this; } @@ -157,7 +157,7 @@ TOKEN : <CREATE_IF_NON_EXISTENT: "create_if_non_existent"> | <ECHO: "echo"> | <ELSE: "else"> | - <ENCODE: "encode"> | + <EMBED: "embed"> | <EXACT: "exact"> | <FLATTEN: "flatten"> | <FOR_EACH: "for_each"> | @@ -283,7 +283,7 @@ Expression value() : val = base64EncodeExp() | val = clearStateExp() | val = echoExp() | - val = encodeExp() | + val = embedExp() | val = exactExp() | val = flattenExp() | 
val = forEachExp() | @@ -365,10 +365,10 @@ Expression echoExp() : { } { return new EchoExpression(); } } -Expression encodeExp() : { } +Expression embedExp() : { } { - ( <ENCODE> ) - { return new EncodeExpression(encoder); } + ( <EMBED> ) + { return new EmbedExpression(embedder); } } Expression exactExp() : { } @@ -744,7 +744,7 @@ String identifier() : <ECHO> | <EXACT> | <ELSE> | - <ENCODE> | + <EMBED> | <FLATTEN> | <FOR_EACH> | <GET_FIELD> | diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptParserTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptParserTestCase.java index 32e38dbee6f..06d185339a6 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptParserTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptParserTestCase.java @@ -1,7 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.vespa.indexinglanguage; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.expressions.EchoExpression; import com.yahoo.vespa.indexinglanguage.expressions.InputExpression; @@ -96,7 +96,7 @@ public class ScriptParserTestCase { } private static ScriptParserContext newContext(String input) { - return new ScriptParserContext(new SimpleLinguistics(), Encoder.throwsOnUse).setInputStream(new IndexingInput(input)); + return new ScriptParserContext(new SimpleLinguistics(), Embedder.throwsOnUse).setInputStream(new IndexingInput(input)); } } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java index 9d3d0abb256..188426b1a06 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java @@ -5,26 +5,20 @@ import com.yahoo.document.DataType; import com.yahoo.document.Document; import com.yahoo.document.DocumentType; import com.yahoo.document.Field; -import com.yahoo.document.FieldPath; import com.yahoo.document.TensorDataType; import com.yahoo.document.datatypes.BoolFieldValue; -import com.yahoo.document.datatypes.FieldValue; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.document.datatypes.TensorFieldValue; import com.yahoo.language.Language; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.tensor.Tensor; -import com.yahoo.tensor.TensorAddress; import com.yahoo.tensor.TensorType; import com.yahoo.vespa.indexinglanguage.expressions.*; import com.yahoo.vespa.indexinglanguage.parser.ParseException; import org.junit.Test; -import java.util.Iterator; import 
java.util.List; -import java.util.Map; -import java.util.Set; import static org.junit.Assert.*; @@ -106,9 +100,9 @@ public class ScriptTestCase { } @Test - public void testEncode() throws ParseException { + public void testEmbed() throws ParseException { TensorType tensorType = TensorType.fromSpec("tensor(d[4])"); - var expression = Expression.fromString("input myText | encode | attribute 'myTensor'", + var expression = Expression.fromString("input myText | embed | attribute 'myTensor'", new SimpleLinguistics(), new MockEncoder()); @@ -131,15 +125,15 @@ public class ScriptTestCase { ((TensorFieldValue)adapter.values.get("myTensor")).getTensor().get()); } - private static class MockEncoder implements Encoder { + private static class MockEncoder implements Embedder { @Override - public List<Integer> encode(String text, Language language) { + public List<Integer> embed(String text, Language language) { return null; } @Override - public Tensor encode(String text, Language language, TensorType tensorType) { + public Tensor embed(String text, Language language, TensorType tensorType) { return Tensor.from(tensorType, "[7,3,0,0]"); } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/DefaultFieldNameTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/DefaultFieldNameTestCase.java index 2a71aeb564c..ea0d9f9cf69 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/DefaultFieldNameTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/DefaultFieldNameTestCase.java @@ -1,7 +1,7 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.vespa.indexinglanguage.parser; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.ScriptParserContext; import com.yahoo.vespa.indexinglanguage.expressions.Expression; @@ -19,7 +19,7 @@ public class DefaultFieldNameTestCase { public void requireThatDefaultFieldNameIsAppliedWhenArgumentIsMissing() throws ParseException { IndexingInput input = new IndexingInput("input"); InputExpression exp = (InputExpression)Expression.newInstance(new ScriptParserContext(new SimpleLinguistics(), - Encoder.throwsOnUse) + Embedder.throwsOnUse) .setInputStream(input) .setDefaultFieldName("foo")); assertEquals("foo", exp.getFieldName()); diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java index d7c5ae5c15a..44aa562028c 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/parser/ExpressionTestCase.java @@ -2,7 +2,7 @@ package com.yahoo.vespa.indexinglanguage.parser; import com.yahoo.language.Linguistics; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.expressions.*; import org.junit.Test; @@ -85,9 +85,9 @@ public class ExpressionTestCase { private static void assertExpression(Class expectedClass, String str) throws ParseException { Linguistics linguistics = new SimpleLinguistics(); - Expression foo = Expression.fromString(str, linguistics, Encoder.throwsOnUse); + Expression foo = Expression.fromString(str, linguistics, Embedder.throwsOnUse); assertEquals(expectedClass, foo.getClass()); - Expression bar = 
Expression.fromString(foo.toString(), linguistics, Encoder.throwsOnUse); + Expression bar = Expression.fromString(foo.toString(), linguistics, Embedder.throwsOnUse); assertEquals(foo.hashCode(), bar.hashCode()); assertEquals(foo, bar); } diff --git a/linguistics-components/abi-spec.json b/linguistics-components/abi-spec.json index 5b6729c58ef..808ec3af082 100644 --- a/linguistics-components/abi-spec.json +++ b/linguistics-components/abi-spec.json @@ -148,7 +148,7 @@ "public static final java.lang.String[] CONFIG_DEF_SCHEMA" ] }, - "com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder": { + "com.yahoo.language.sentencepiece.SentencePieceEmbedder$Builder": { "superClass": "java.lang.Object", "interfaces": [], "attributes": [ @@ -157,31 +157,31 @@ "methods": [ "public void <init>()", "public void addModel(com.yahoo.language.Language, java.nio.file.Path)", - "public com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder addDefaultModel(java.nio.file.Path)", + "public com.yahoo.language.sentencepiece.SentencePieceEmbedder$Builder addDefaultModel(java.nio.file.Path)", "public java.util.Map getModels()", - "public com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder setCollapseUnknowns(boolean)", + "public com.yahoo.language.sentencepiece.SentencePieceEmbedder$Builder setCollapseUnknowns(boolean)", "public boolean getCollapseUnknowns()", - "public com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder setScoring(com.yahoo.language.sentencepiece.Scoring)", + "public com.yahoo.language.sentencepiece.SentencePieceEmbedder$Builder setScoring(com.yahoo.language.sentencepiece.Scoring)", "public com.yahoo.language.sentencepiece.Scoring getScoring()", - "public com.yahoo.language.sentencepiece.SentencePieceEncoder build()" + "public com.yahoo.language.sentencepiece.SentencePieceEmbedder build()" ], "fields": [] }, - "com.yahoo.language.sentencepiece.SentencePieceEncoder": { + "com.yahoo.language.sentencepiece.SentencePieceEmbedder": { "superClass": 
"java.lang.Object", "interfaces": [ "com.yahoo.language.process.Segmenter", - "com.yahoo.language.process.Encoder" + "com.yahoo.language.process.Embedder" ], "attributes": [ "public" ], "methods": [ "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig)", - "public void <init>(com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder)", + "public void <init>(com.yahoo.language.sentencepiece.SentencePieceEmbedder$Builder)", "public java.util.List segment(java.lang.String, com.yahoo.language.Language)", - "public java.util.List encode(java.lang.String, com.yahoo.language.Language)", - "public com.yahoo.tensor.Tensor encode(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)", + "public java.util.List embed(java.lang.String, com.yahoo.language.Language)", + "public com.yahoo.tensor.Tensor embed(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)", "public java.lang.String normalize(java.lang.String)" ], "fields": [] diff --git a/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java index b6659ebeaa3..116dd15f563 100644 --- a/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java +++ b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEmbedder.java @@ -4,7 +4,7 @@ package com.yahoo.language.sentencepiece; import com.google.common.annotations.Beta; import com.google.inject.Inject; import com.yahoo.language.Language; -import com.yahoo.language.process.Encoder; +import com.yahoo.language.process.Embedder; import com.yahoo.language.process.Segmenter; import com.yahoo.tensor.Tensor; import com.yahoo.tensor.TensorAddress; @@ -19,26 +19,25 @@ import java.util.Map; import java.util.stream.Collectors; /** - * Integration with https://github.com/google/sentencepiece - * through 
http://docs.djl.ai/extensions/sentencepiece/index.html + * A native Java implementation of SentencePiece - see https://github.com/google/sentencepiece * - * SentencePiece is a language-agnostic tokenizer for neural nets. + * SentencePiece is a language-agnostic segmenter and embedder for neural nets. * * @author bratseth */ @Beta -public class SentencePieceEncoder implements Segmenter, Encoder { +public class SentencePieceEmbedder implements Segmenter, Embedder { private final Map<Language, Model> models; private final SentencePieceAlgorithm algorithm; @Inject - public SentencePieceEncoder(SentencePieceConfig config) { + public SentencePieceEmbedder(SentencePieceConfig config) { this(new Builder(config)); } - public SentencePieceEncoder(Builder builder) { + public SentencePieceEmbedder(Builder builder) { algorithm = new SentencePieceAlgorithm(builder.collapseUnknowns, builder.getScoring()); models = builder.getModels().entrySet() @@ -46,7 +45,7 @@ public class SentencePieceEncoder implements Segmenter, Encoder { .map(e -> new Model(e.getKey(), e.getValue())) .collect(Collectors.toUnmodifiableMap(m -> m.language, m -> m)); if (models.isEmpty()) - throw new IllegalArgumentException("SentencePieceEncoder requires at least one model configured"); + throw new IllegalArgumentException("SentencePieceEmbedder requires at least one model configured"); } /** @@ -77,7 +76,7 @@ public class SentencePieceEncoder implements Segmenter, Encoder { * @return the list of zero or more token ids resulting from segmenting the input text */ @Override - public List<Integer> encode(String rawInput, Language language) { + public List<Integer> embed(String rawInput, Language language) { var resultBuilder = new ResultBuilder<List<Integer>>(new ArrayList<>()) { public void add(int segmentStart, int segmentEnd, SentencePieceAlgorithm.SegmentEnd[] segmentEnds) { result().add(segmentEnds[segmentEnd].id); @@ -89,7 +88,7 @@ public class SentencePieceEncoder implements Segmenter, Encoder { } /** - * 
<p>Encodes directly to a tensor.</p> + * <p>Embeds text into a tensor.</p> * * <p>If the tensor type is indexed 1-d (bound or unbound) this will return a tensor containing the token ids in the order * they were encountered in the text. If the dimension is bound and too large it will be zero padded, if too small @@ -101,10 +100,10 @@ public class SentencePieceEncoder implements Segmenter, Encoder { * <p>If the tensor is any other type IllegalArgumentException is thrown.</p> */ @Override - public Tensor encode(String rawInput, Language language, TensorType type) { + public Tensor embed(String rawInput, Language language, TensorType type) { if (type.dimensions().size() == 1 && type.dimensions().get(0).isIndexed()) { // Build to a list first since we can't reverse a tensor builder - List<Integer> values = encode(rawInput, language); + List<Integer> values = embed(rawInput, language); long maxSize = values.size(); if (type.dimensions().get(0).size().isPresent()) @@ -125,7 +124,7 @@ public class SentencePieceEncoder implements Segmenter, Encoder { return builder.build(); } else { - throw new IllegalArgumentException("Don't know how to encode with SentencePiece into " + type); + throw new IllegalArgumentException("Don't know how to embed with SentencePiece into " + type); } } @@ -210,9 +209,9 @@ public class SentencePieceEncoder implements Segmenter, Encoder { } public Scoring getScoring() { return scoring; } - public SentencePieceEncoder build() { + public SentencePieceEmbedder build() { if (models.isEmpty()) throw new IllegalStateException("At least one model must be supplied"); - return new SentencePieceEncoder(this); + return new SentencePieceEmbedder(this); } } diff --git a/linguistics-components/src/main/resources/configdefinitions/language.sentencepiece.sentence-piece.def b/linguistics-components/src/main/resources/configdefinitions/language.sentencepiece.sentence-piece.def index b91c0c45dc4..16ada78688a 100644 --- 
a/linguistics-components/src/main/resources/configdefinitions/language.sentencepiece.sentence-piece.def +++ b/linguistics-components/src/main/resources/configdefinitions/language.sentencepiece.sentence-piece.def @@ -1,6 +1,6 @@ # Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -# Configures com.yahoo.language.sentencepiece.SentencePieceEncoder +# Configures com.yahoo.language.sentencepiece.SentencePieceEmbedder namespace=language.sentencepiece diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java index edbbe21ec53..1ed2271f774 100644 --- a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java +++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java @@ -15,7 +15,7 @@ public class SentencePieceConfigurationTest { public void testEnglishTokenization() { var b = new SentencePieceConfig.Builder(); addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); + var tester = new SentencePieceTester(new SentencePieceEmbedder(b.build())); tester.assertSegmented("this is another sentence", "▁this", "▁is", "▁another", "▁sentence"); tester.assertSegmented("KHJKJHHKJHHSH hello", "▁", "KHJKJHHKJHHSH", "▁hel", "lo"); } @@ -25,7 +25,7 @@ public class SentencePieceConfigurationTest { var b = new SentencePieceConfig.Builder(); addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); b.collapseUnknowns(false); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); + var tester = new SentencePieceTester(new SentencePieceEmbedder(b.build())); tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo"); } 
@@ -34,7 +34,7 @@ public class SentencePieceConfigurationTest { var b = new SentencePieceConfig.Builder(); addModel("unknown", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); b.scoring(SentencePieceConfig.Scoring.highestScore); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); + var tester = new SentencePieceTester(new SentencePieceEmbedder(b.build())); tester.assertSegmented("hello", "▁h", "el", "lo"); } @@ -43,7 +43,7 @@ public class SentencePieceConfigurationTest { var b = new SentencePieceConfig.Builder(); addModel("ja", "src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model", b); addModel("en", "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model", b); - var tester = new SentencePieceTester(new SentencePieceEncoder(b.build())); + var tester = new SentencePieceTester(new SentencePieceEmbedder(b.build())); tester.assertSegmented(Language.JAPANESE, "いくつかの通常のテキスト", "▁", "いく", "つか", "の", "通常", "の", "テ", "キ", "スト"); tester.assertSegmented(Language.ENGLISH, "hello", "▁hel", "lo"); tester.assertSegmented(Language.JAPANESE, "hello", "▁h", "ell", "o"); diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java index d60d7386d4b..939f8ebe9d3 100644 --- a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java +++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java @@ -38,27 +38,27 @@ public class SentencePieceTest { @Test public void testIntegerListEncoding() { var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - tester.assertEncoded("hello, world!", 908, 1418, 9934, 501, 9960); - tester.assertEncoded("Hello, world!", 9912, 0, 6595, 9934, 501, 9960); + tester.assertEmbedded("hello, world!", 908, 1418, 9934, 501, 9960); + 
tester.assertEmbedded("Hello, world!", 9912, 0, 6595, 9934, 501, 9960); } @Test public void testDenseTensorEncoding() { var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - tester.assertEncoded("hello, world!", "tensor(d[10])", "[908,1418,9934,501,9960,0,0,0,0,0]"); - tester.assertEncoded("Hello, world!", "tensor(d[10])", "[9912,0,6595,9934,501,9960,0,0,0,0]"); - tester.assertEncoded("hello, world!", "tensor(d[2])", "[908,1418]"); + tester.assertEmbedded("hello, world!", "tensor(d[10])", "[908,1418,9934,501,9960,0,0,0,0,0]"); + tester.assertEmbedded("Hello, world!", "tensor(d[10])", "[9912,0,6595,9934,501,9960,0,0,0,0]"); + tester.assertEmbedded("hello, world!", "tensor(d[2])", "[908,1418]"); } @Test public void testSparseTensorEncoding() { var tester = new SentencePieceTester(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); - tester.assertEncoded("hello", "tensor(token{})", "{lo:1.0,'▁hel':0.0}"); + tester.assertEmbedded("hello", "tensor(token{})", "{lo:1.0,'▁hel':0.0}"); } @Test public void testNoCollapse() { - var tester = new SentencePieceTester(new SentencePieceEncoder.Builder() + var tester = new SentencePieceTester(new SentencePieceEmbedder.Builder() .addDefaultModel(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()) .setCollapseUnknowns(false)); tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo"); @@ -66,7 +66,7 @@ public class SentencePieceTest { @Test public void testHighestScore() { - var tester = new SentencePieceTester(new SentencePieceEncoder.Builder() + var tester = new SentencePieceTester(new SentencePieceEmbedder.Builder() .addDefaultModel(new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()) .setScoring(Scoring.highestScore)); tester.assertSegmented("h", "▁h"); @@ -77,7 +77,7 @@ public class SentencePieceTest { @Test public void testMultiLanguageTokenization() { - 
SentencePieceEncoder.Builder builder = new SentencePieceEncoder.Builder(); + SentencePieceEmbedder.Builder builder = new SentencePieceEmbedder.Builder(); builder.addModel(Language.JAPANESE, new File("src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model").toPath()); builder.addModel(Language.ENGLISH, new File("src/test/models/sentencepiece/en.wiki.bpe.vs10000.model").toPath()); var tester = new SentencePieceTester(builder); diff --git a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java index 1ba7c9b472d..c4cb13a3d23 100644 --- a/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java +++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java @@ -14,28 +14,28 @@ import static org.junit.Assert.assertEquals; class SentencePieceTester { - private final SentencePieceEncoder encoder; + private final SentencePieceEmbedder embedder; public SentencePieceTester(Path model) { - this(new SentencePieceEncoder.Builder().addDefaultModel(model)); + this(new SentencePieceEmbedder.Builder().addDefaultModel(model)); } - public SentencePieceTester(SentencePieceEncoder.Builder builder) { + public SentencePieceTester(SentencePieceEmbedder.Builder builder) { this(builder.build()); } - public SentencePieceTester(SentencePieceEncoder encoder) { - this.encoder = encoder; + public SentencePieceTester(SentencePieceEmbedder embedder) { + this.embedder = embedder; } - public void assertEncoded(String input, Integer... expectedCodes) { - assertArrayEquals(expectedCodes, encoder.encode(input, Language.UNKNOWN).toArray()); + public void assertEmbedded(String input, Integer... 
expectedCodes) { + assertArrayEquals(expectedCodes, embedder.embed(input, Language.UNKNOWN).toArray()); } - public void assertEncoded(String input, String tensorType, String tensor) { + public void assertEmbedded(String input, String tensorType, String tensor) { TensorType type = TensorType.fromSpec(tensorType); Tensor expected = Tensor.from(type, tensor); - assertEquals(expected, encoder.encode(input, Language.UNKNOWN, type)); + assertEquals(expected, embedder.embed(input, Language.UNKNOWN, type)); } public void assertSegmented(String input, String... expectedSegments) { @@ -43,7 +43,7 @@ class SentencePieceTester { } public void assertSegmented(Language language, String input, String... expectedSegments) { - assertArrayEquals(expectedSegments, encoder.segment(input, language).toArray()); + assertArrayEquals(expectedSegments, embedder.segment(input, language).toArray()); } } diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json index cfbf2abda1a..d257c451739 100644 --- a/linguistics/abi-spec.json +++ b/linguistics/abi-spec.json @@ -328,22 +328,22 @@ ], "fields": [] }, - "com.yahoo.language.process.Encoder$FailingEncoder": { + "com.yahoo.language.process.Embedder$FailingEmbedder": { "superClass": "java.lang.Object", "interfaces": [ - "com.yahoo.language.process.Encoder" + "com.yahoo.language.process.Embedder" ], "attributes": [ "public" ], "methods": [ "public void <init>()", - "public java.util.List encode(java.lang.String, com.yahoo.language.Language)", - "public com.yahoo.tensor.Tensor encode(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)" + "public java.util.List embed(java.lang.String, com.yahoo.language.Language)", + "public com.yahoo.tensor.Tensor embed(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)" ], "fields": [] }, - "com.yahoo.language.process.Encoder": { + "com.yahoo.language.process.Embedder": { "superClass": "java.lang.Object", "interfaces": [], "attributes": [ @@ -352,11 +352,11 
@@ "abstract" ], "methods": [ - "public abstract java.util.List encode(java.lang.String, com.yahoo.language.Language)", - "public abstract com.yahoo.tensor.Tensor encode(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)" + "public abstract java.util.List embed(java.lang.String, com.yahoo.language.Language)", + "public abstract com.yahoo.tensor.Tensor embed(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)" ], "fields": [ - "public static final com.yahoo.language.process.Encoder throwsOnUse" + "public static final com.yahoo.language.process.Embedder throwsOnUse" ] }, "com.yahoo.language.process.GramSplitter$Gram": { diff --git a/linguistics/src/main/java/com/yahoo/language/process/Embedder.java b/linguistics/src/main/java/com/yahoo/language/process/Embedder.java new file mode 100644 index 00000000000..56c401a7c61 --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/process/Embedder.java @@ -0,0 +1,56 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.process; + +import com.yahoo.language.Language; +import com.yahoo.tensor.Tensor; +import com.yahoo.tensor.TensorType; + +import java.util.List; + +/** + * An embedder converts a text string to a tensor + * + * @author bratseth + */ +public interface Embedder { + + /** An instance of this which throws IllegalStateException if attempted used */ + Embedder throwsOnUse = new FailingEmbedder(); + + /** + * Converts text into a list of token id's (a vector embedding) + * + * @param text the text to embed + * @param language the language of the text, or UNKNOWN to use language independent embedding + * @return the text embedded as a list of token ids + * @throws IllegalArgumentException if the language is not supported by this embedder + */ + List<Integer> embed(String text, Language language); + + /** + * Converts text into tokens in a tensor. 
+ * The information contained in the embedding may depend on the tensor type. + * + * @param text the text to embed + * @param language the language of the text, or UNKNOWN to use language independent embedding + * @param tensorType the type of the tensor to be returned + * @return the tensor embedding of the text, as the specified tensor type + * @throws IllegalArgumentException if the language or tensor type is not supported by this embedder + */ + Tensor embed(String text, Language language, TensorType tensorType); + + class FailingEmbedder implements Embedder { + + @Override + public List<Integer> embed(String text, Language language) { + throw new IllegalStateException("No embedder has been configured"); + } + + @Override + public Tensor embed(String text, Language language, TensorType tensorType) { + throw new IllegalStateException("No embedder has been configured"); + } + + } + +} diff --git a/linguistics/src/main/java/com/yahoo/language/process/Encoder.java b/linguistics/src/main/java/com/yahoo/language/process/Encoder.java deleted file mode 100644 index 27f73d15e54..00000000000 --- a/linguistics/src/main/java/com/yahoo/language/process/Encoder.java +++ /dev/null @@ -1,56 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.language.process; - -import com.yahoo.language.Language; -import com.yahoo.tensor.Tensor; -import com.yahoo.tensor.TensorType; - -import java.util.List; - -/** - * An encoder converts a text string to a tensor or list of tokens - * - * @author bratseth - */ -public interface Encoder { - - /** An instance of this which throws IllegalStateException if attempted used */ - Encoder throwsOnUse = new FailingEncoder(); - - /** - * Encodes text into tokens in a list of ids.
- * - * @param text the text to encode - * @param language the language of the text, or UNKNOWN to use language independent encoding - * @return the text encoded to a list of segment ids - * @throws IllegalArgumentException if the language is not supported by this encoder - */ - List<Integer> encode(String text, Language language); - - /** - * Encodes text into tokens in a tensor. - * The information contained in the encoding may depend on the tensor type. - * - * @param text the text to encode - * @param language the language of the text, or UNKNOWN to use language independent encoding - * @param tensorType the type of the ttensor to be returned - * @return the tex encoded into a tensor of the supplied type - * @throws IllegalArgumentException if the language or tensor type is not supported by this encoder - */ - Tensor encode(String text, Language language, TensorType tensorType); - - class FailingEncoder implements Encoder { - - @Override - public List<Integer> encode(String text, Language language) { - throw new IllegalStateException("No encoder has been configured"); - } - - @Override - public Tensor encode(String text, Language language, TensorType tensorType) { - throw new IllegalStateException("No encoder has been configured"); - } - - } - -} |