diff options
author | Jon Bratseth <bratseth@oath.com> | 2021-09-27 23:09:03 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-09-27 23:09:03 +0200 |
commit | 2df97d23d9f25ae60f010a2e9f273cb5b38e049b (patch) | |
tree | d2923a45682e91d80e7011c60cfb301e05acead3 | |
parent | 037f756caf4cfb99bcd988174839d7bc385267b9 (diff) | |
parent | 8f3fb1a105ded07144f6de527266a438e48a1766 (diff) |
Merge pull request #19294 from vespa-engine/bratseth/linguistics-components v7.473.17
Bratseth/linguistics components
34 files changed, 342 insertions, 197 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 7d9968b6329..c9980fb1928 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -95,6 +95,7 @@ add_subdirectory(jdisc_jetty) add_subdirectory(jrt_test) add_subdirectory(juniper) add_subdirectory(linguistics) +add_subdirectory(linguistics-components) add_subdirectory(logd) add_subdirectory(logserver) add_subdirectory(logforwarder) diff --git a/container-disc/pom.xml b/container-disc/pom.xml index e3b2830a974..a362fb11be6 100644 --- a/container-disc/pom.xml +++ b/container-disc/pom.xml @@ -174,6 +174,7 @@ docprocs-jar-with-dependencies.jar, hosted-zone-api-jar-with-dependencies.jar, jdisc-security-filters-jar-with-dependencies.jar, + linguistics-components-jar-with-dependencies.jar, vespaclient-container-plugin-jar-with-dependencies.jar, vespa-athenz-jar-with-dependencies.jar, container-apache-http-client-bundle-jar-with-dependencies.jar, <!-- Apache http client repackaged as bundle --> diff --git a/dist/vespa.spec b/dist/vespa.spec index e45c45d41f9..1fa9fbc9796 100644 --- a/dist/vespa.spec +++ b/dist/vespa.spec @@ -814,6 +814,7 @@ fi %{_prefix}/lib/jars/jdisc_core-jar-with-dependencies.jar %{_prefix}/lib/jars/jdisc-security-filters-jar-with-dependencies.jar %{_prefix}/lib/jars/jersey-*.jar +%{_prefix}/lib/jars/linguistics-components-jar-with-dependencies.jar %{_prefix}/lib/jars/alpn-*.jar %{_prefix}/lib/jars/http2-*.jar %{_prefix}/lib/jars/jetty-*.jar diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java index 8f3f75af795..fa5f794f652 100644 --- a/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java +++ b/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java @@ -10,13 +10,13 @@ import com.yahoo.language.process.Encoder; import com.yahoo.vespa.configdefinition.IlscriptsConfig; import com.yahoo.vespa.indexinglanguage.ScriptParserContext; import 
com.yahoo.vespa.indexinglanguage.expressions.InputExpression; +import com.yahoo.vespa.indexinglanguage.expressions.OutputExpression; import com.yahoo.vespa.indexinglanguage.expressions.ScriptExpression; import com.yahoo.vespa.indexinglanguage.expressions.StatementExpression; import com.yahoo.vespa.indexinglanguage.parser.IndexingInput; import com.yahoo.vespa.indexinglanguage.parser.ParseException; import java.util.*; -import java.util.logging.Level; /** * @author Simon Thoresen Hult @@ -86,11 +86,17 @@ public class ScriptManager { List<StatementExpression> expressions = new ArrayList<>(ilscript.content().size()); Map<String, DocumentScript> fieldScripts = new HashMap<>(ilscript.content().size()); for (String content : ilscript.content()) { - expressions.add(parse(ilscript.doctype(), parserContext, content)); StatementExpression statement = parse(ilscript.doctype(), parserContext, content); + expressions.add(statement); InputExpression.InputFieldNameExtractor inputFieldNameExtractor = new InputExpression.InputFieldNameExtractor(); statement.select(inputFieldNameExtractor, inputFieldNameExtractor); + OutputExpression.OutputFieldNameExtractor outputFieldNameExtractor = new OutputExpression.OutputFieldNameExtractor(); + statement.select(outputFieldNameExtractor, outputFieldNameExtractor); statement.select(fieldPathOptimizer, fieldPathOptimizer); + if ( ! 
outputFieldNameExtractor.getOutputFieldNames().isEmpty()) { + String outputFieldName = outputFieldNameExtractor.getOutputFieldNames().get(0); + statement.setStatementOutputType(docTypeMgr.getDocumentType(ilscript.doctype()).getField(outputFieldName).getDataType()); + } if (inputFieldNameExtractor.getInputFieldNames().size() == 1) { String fieldName = inputFieldNameExtractor.getInputFieldNames().get(0); ScriptExpression script; diff --git a/docprocs/src/test/java/com/yahoo/docprocs/indexing/ScriptManagerTestCase.java b/docprocs/src/test/java/com/yahoo/docprocs/indexing/ScriptManagerTestCase.java index 607fee4f10d..ec05fcbe422 100644 --- a/docprocs/src/test/java/com/yahoo/docprocs/indexing/ScriptManagerTestCase.java +++ b/docprocs/src/test/java/com/yahoo/docprocs/indexing/ScriptManagerTestCase.java @@ -28,7 +28,7 @@ public class ScriptManagerTestCase { IlscriptsConfig.Builder config = new IlscriptsConfig.Builder(); config.ilscript(new IlscriptsConfig.Ilscript.Builder().doctype("newssummary") - .content("index")); + .content("input title | index title")); ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(config), null, Encoder.throwsOnUse); assertNotNull(scriptMgr.getScript(typeMgr.getDocumentType("newsarticle"))); assertNull(scriptMgr.getScript(new DocumentType("unknown"))); @@ -43,7 +43,7 @@ public class ScriptManagerTestCase { IlscriptsConfig.Builder config = new IlscriptsConfig.Builder(); config.ilscript(new IlscriptsConfig.Ilscript.Builder().doctype("newsarticle") - .content("index")); + .content("input title | index title")); ScriptManager scriptMgr = new ScriptManager(typeMgr, new IlscriptsConfig(config), null, Encoder.throwsOnUse); assertNotNull(scriptMgr.getScript(typeMgr.getDocumentType("newssummary"))); assertNull(scriptMgr.getScript(new DocumentType("unknown"))); diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java 
b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java index 09034659ad0..f84da9ddef8 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/EncodeExpression.java @@ -27,6 +27,11 @@ public class EncodeExpression extends Expression { } @Override + public void setStatementOutputType(DataType type) { + targetType = ((TensorDataType)type).getTensorType(); + } + + @Override protected void doExecute(ExecutionContext context) { StringFieldValue input = (StringFieldValue) context.getValue(); Tensor tensor = encoder.encode(input.getString(), context.getLanguage(), targetType); diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java index a121df8e5a8..67459c2b035 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/Expression.java @@ -31,6 +31,8 @@ public abstract class Expression extends Selectable { this.inputType = inputType; } + public void setStatementOutputType(DataType type) {} + public final FieldValue execute(FieldValue val) { return execute(new ExecutionContext().setValue(val)); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExpressionList.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExpressionList.java index 0ac195efb5d..0f7c2a411de 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExpressionList.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExpressionList.java @@ -24,6 +24,12 @@ public abstract class ExpressionList<T extends Expression> extends 
CompositeExpr } } + @Override + public void setStatementOutputType(DataType type) { + for (Expression expression : expressions) + expression.setStatementOutputType(type); + } + public int size() { return expressions.size(); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/GuardExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/GuardExpression.java index 179f202788c..78c261caccb 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/GuardExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/GuardExpression.java @@ -27,6 +27,11 @@ public final class GuardExpression extends CompositeExpression { } @Override + public void setStatementOutputType(DataType type) { + exp.setStatementOutputType(type); + } + + @Override protected void doExecute(ExecutionContext context) { if (!shouldExecute && context.getAdapter() instanceof UpdateAdapter) { context.setValue(null); diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/OutputExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/OutputExpression.java index 398c2751bd8..267fb6fc51b 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/OutputExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/OutputExpression.java @@ -2,6 +2,11 @@ package com.yahoo.vespa.indexinglanguage.expressions; import com.yahoo.document.DataType; +import com.yahoo.vespa.objects.ObjectOperation; +import com.yahoo.vespa.objects.ObjectPredicate; + +import java.util.ArrayList; +import java.util.List; /** * @author Simon Thoresen Hult @@ -58,4 +63,22 @@ public abstract class OutputExpression extends Expression { return getClass().hashCode() + (fieldName != null ? 
fieldName.hashCode() : 0); } + public static class OutputFieldNameExtractor implements ObjectOperation, ObjectPredicate { + + private final List<String> outputFieldNames = new ArrayList<>(1); + + public List<String> getOutputFieldNames() { return outputFieldNames; } + + @Override + public void execute(Object obj) { + outputFieldNames.add(((OutputExpression) obj).getFieldName()); + } + + @Override + public boolean check(Object obj) { + return obj instanceof OutputExpression; + } + + } + } diff --git a/linguistics-components/.gitignore b/linguistics-components/.gitignore new file mode 100644 index 00000000000..8b990078588 --- /dev/null +++ b/linguistics-components/.gitignore @@ -0,0 +1,5 @@ +target +*.iml +*.ipr +*.iws +/pom.xml.build diff --git a/linguistics-components/CMakeLists.txt b/linguistics-components/CMakeLists.txt new file mode 100644 index 00000000000..b53c8001959 --- /dev/null +++ b/linguistics-components/CMakeLists.txt @@ -0,0 +1,5 @@ +# Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +install_fat_java_artifact(linguistics-components) + +install_config_definitions() diff --git a/linguistics-components/OWNERS b/linguistics-components/OWNERS new file mode 100644 index 00000000000..cd50f7a263a --- /dev/null +++ b/linguistics-components/OWNERS @@ -0,0 +1,2 @@ +bratseth +arnej27959 diff --git a/linguistics-components/README b/linguistics-components/README new file mode 100644 index 00000000000..e26a51e2f53 --- /dev/null +++ b/linguistics-components/README @@ -0,0 +1,4 @@ +Java library for linguistic operations in Vespa. + +This API is pluggable - multiple implementations may be supplied. +This module contains a default pure Java implementation, "simple".
\ No newline at end of file diff --git a/linguistics-components/abi-spec.json b/linguistics-components/abi-spec.json new file mode 100644 index 00000000000..5b6729c58ef --- /dev/null +++ b/linguistics-components/abi-spec.json @@ -0,0 +1,189 @@ +{ + "com.yahoo.language.sentencepiece.Scoring": { + "superClass": "java.lang.Enum", + "interfaces": [], + "attributes": [ + "public", + "final", + "enum" + ], + "methods": [ + "public static com.yahoo.language.sentencepiece.Scoring[] values()", + "public static com.yahoo.language.sentencepiece.Scoring valueOf(java.lang.String)" + ], + "fields": [ + "public static final enum com.yahoo.language.sentencepiece.Scoring highestScore", + "public static final enum com.yahoo.language.sentencepiece.Scoring fewestSegments" + ] + }, + "com.yahoo.language.sentencepiece.SentencePieceConfig$Builder": { + "superClass": "java.lang.Object", + "interfaces": [ + "com.yahoo.config.ConfigInstance$Builder" + ], + "attributes": [ + "public" + ], + "methods": [ + "public void <init>()", + "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig)", + "public com.yahoo.language.sentencepiece.SentencePieceConfig$Builder collapseUnknowns(boolean)", + "public com.yahoo.language.sentencepiece.SentencePieceConfig$Builder scoring(com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum)", + "public com.yahoo.language.sentencepiece.SentencePieceConfig$Builder model(com.yahoo.language.sentencepiece.SentencePieceConfig$Model$Builder)", + "public com.yahoo.language.sentencepiece.SentencePieceConfig$Builder model(java.util.List)", + "public final boolean dispatchGetConfig(com.yahoo.config.ConfigInstance$Producer)", + "public final java.lang.String getDefMd5()", + "public final java.lang.String getDefName()", + "public final java.lang.String getDefNamespace()", + "public final boolean getApplyOnRestart()", + "public final void setApplyOnRestart(boolean)", + "public com.yahoo.language.sentencepiece.SentencePieceConfig build()" + ], + 
"fields": [ + "public java.util.List model" + ] + }, + "com.yahoo.language.sentencepiece.SentencePieceConfig$Model$Builder": { + "superClass": "java.lang.Object", + "interfaces": [ + "com.yahoo.config.ConfigBuilder" + ], + "attributes": [ + "public" + ], + "methods": [ + "public void <init>()", + "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig$Model)", + "public com.yahoo.language.sentencepiece.SentencePieceConfig$Model$Builder language(java.lang.String)", + "public com.yahoo.language.sentencepiece.SentencePieceConfig$Model$Builder path(com.yahoo.config.FileReference)", + "public com.yahoo.language.sentencepiece.SentencePieceConfig$Model build()" + ], + "fields": [] + }, + "com.yahoo.language.sentencepiece.SentencePieceConfig$Model": { + "superClass": "com.yahoo.config.InnerNode", + "interfaces": [], + "attributes": [ + "public", + "final" + ], + "methods": [ + "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig$Model$Builder)", + "public java.lang.String language()", + "public java.nio.file.Path path()" + ], + "fields": [] + }, + "com.yahoo.language.sentencepiece.SentencePieceConfig$Producer": { + "superClass": "java.lang.Object", + "interfaces": [ + "com.yahoo.config.ConfigInstance$Producer" + ], + "attributes": [ + "public", + "interface", + "abstract" + ], + "methods": [ + "public abstract void getConfig(com.yahoo.language.sentencepiece.SentencePieceConfig$Builder)" + ], + "fields": [] + }, + "com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum": { + "superClass": "java.lang.Enum", + "interfaces": [], + "attributes": [ + "public", + "final", + "enum" + ], + "methods": [ + "public static com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum[] values()", + "public static com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum valueOf(java.lang.String)" + ], + "fields": [ + "public static final enum com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum highestScore", + 
"public static final enum com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum fewestSegments" + ] + }, + "com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring": { + "superClass": "com.yahoo.config.EnumNode", + "interfaces": [], + "attributes": [ + "public", + "final" + ], + "methods": [ + "public void <init>()", + "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum)" + ], + "fields": [ + "public static final com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum highestScore", + "public static final com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum fewestSegments" + ] + }, + "com.yahoo.language.sentencepiece.SentencePieceConfig": { + "superClass": "com.yahoo.config.ConfigInstance", + "interfaces": [], + "attributes": [ + "public", + "final" + ], + "methods": [ + "public static java.lang.String getDefMd5()", + "public static java.lang.String getDefName()", + "public static java.lang.String getDefNamespace()", + "public static java.lang.String getDefVersion()", + "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig$Builder)", + "public boolean collapseUnknowns()", + "public com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum scoring()", + "public java.util.List model()", + "public com.yahoo.language.sentencepiece.SentencePieceConfig$Model model(int)" + ], + "fields": [ + "public static final java.lang.String CONFIG_DEF_MD5", + "public static final java.lang.String CONFIG_DEF_NAME", + "public static final java.lang.String CONFIG_DEF_NAMESPACE", + "public static final java.lang.String CONFIG_DEF_VERSION", + "public static final java.lang.String[] CONFIG_DEF_SCHEMA" + ] + }, + "com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder": { + "superClass": "java.lang.Object", + "interfaces": [], + "attributes": [ + "public" + ], + "methods": [ + "public void <init>()", + "public void addModel(com.yahoo.language.Language, java.nio.file.Path)", + "public 
com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder addDefaultModel(java.nio.file.Path)", + "public java.util.Map getModels()", + "public com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder setCollapseUnknowns(boolean)", + "public boolean getCollapseUnknowns()", + "public com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder setScoring(com.yahoo.language.sentencepiece.Scoring)", + "public com.yahoo.language.sentencepiece.Scoring getScoring()", + "public com.yahoo.language.sentencepiece.SentencePieceEncoder build()" + ], + "fields": [] + }, + "com.yahoo.language.sentencepiece.SentencePieceEncoder": { + "superClass": "java.lang.Object", + "interfaces": [ + "com.yahoo.language.process.Segmenter", + "com.yahoo.language.process.Encoder" + ], + "attributes": [ + "public" + ], + "methods": [ + "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig)", + "public void <init>(com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder)", + "public java.util.List segment(java.lang.String, com.yahoo.language.Language)", + "public java.util.List encode(java.lang.String, com.yahoo.language.Language)", + "public com.yahoo.tensor.Tensor encode(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)", + "public java.lang.String normalize(java.lang.String)" + ], + "fields": [] + } +}
\ No newline at end of file diff --git a/linguistics-components/pom.xml b/linguistics-components/pom.xml new file mode 100644 index 00000000000..44e58fb7588 --- /dev/null +++ b/linguistics-components/pom.xml @@ -0,0 +1,80 @@ +<?xml version="1.0"?> +<!-- Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. --> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>com.yahoo.vespa</groupId> + <artifactId>parent</artifactId> + <version>7-SNAPSHOT</version> + <relativePath>../parent/pom.xml</relativePath> + </parent> + <artifactId>linguistics-components</artifactId> + <packaging>container-plugin</packaging> + <version>7-SNAPSHOT</version> + <dependencies> + <dependency> + <groupId>com.google.protobuf</groupId> + <artifactId>protobuf-java</artifactId> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>com.yahoo.vespa</groupId> + <artifactId>component</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>com.yahoo.vespa</groupId> + <artifactId>linguistics</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>com.yahoo.vespa</groupId> + <artifactId>config-bundle</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>com.google.inject</groupId> + <artifactId>guice</artifactId> + <scope>provided</scope> + <classifier>no_aop</classifier> + </dependency> + </dependencies> + <build> + <plugins> + <plugin> + <groupId>com.github.os72</groupId> + <artifactId>protoc-jar-maven-plugin</artifactId> + </plugin> + 
<plugin> + <groupId>com.yahoo.vespa</groupId> + <artifactId>bundle-plugin</artifactId> + <extensions>true</extensions> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <configuration> + <compilerArgs> + <arg>-Xlint:rawtypes</arg> + <arg>-Xlint:unchecked</arg> + <arg>-Xlint:deprecation</arg> + <arg>-Werror</arg> + </compilerArgs> + </configuration> + </plugin> + <plugin> + <groupId>com.yahoo.vespa</groupId> + <artifactId>abi-check-plugin</artifactId> + </plugin> + </plugins> + </build> +</project> diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/Model.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/Model.java index 74f300057dc..74f300057dc 100644 --- a/linguistics/src/main/java/com/yahoo/language/sentencepiece/Model.java +++ b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/Model.java diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/ResultBuilder.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/ResultBuilder.java index 2141505374c..2141505374c 100644 --- a/linguistics/src/main/java/com/yahoo/language/sentencepiece/ResultBuilder.java +++ b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/ResultBuilder.java diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/Scoring.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/Scoring.java index 6c8560abee7..6c8560abee7 100644 --- a/linguistics/src/main/java/com/yahoo/language/sentencepiece/Scoring.java +++ b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/Scoring.java diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceAlgorithm.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceAlgorithm.java index 1659e3c0fa7..1659e3c0fa7 100644 --- 
a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceAlgorithm.java +++ b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceAlgorithm.java diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java index b6659ebeaa3..b6659ebeaa3 100644 --- a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java +++ b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/TokenType.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/TokenType.java index 782030a8e4d..782030a8e4d 100644 --- a/linguistics/src/main/java/com/yahoo/language/sentencepiece/TokenType.java +++ b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/TokenType.java diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/Trie.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/Trie.java index 8e7c2db2ed3..8e7c2db2ed3 100644 --- a/linguistics/src/main/java/com/yahoo/language/sentencepiece/Trie.java +++ b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/Trie.java diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/package-info.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/package-info.java index 4a8673705ec..3f97277c489 100644 --- a/linguistics/src/main/java/com/yahoo/language/sentencepiece/package-info.java +++ b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/package-info.java @@ -1,4 +1,4 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// Copyright 2021 Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
@ExportPackage @PublicApi package com.yahoo.language.sentencepiece; diff --git a/linguistics/src/main/protobuf/sentencepiece_model.proto b/linguistics-components/src/main/protobuf/sentencepiece_model.proto index 39626aede53..39626aede53 100644 --- a/linguistics/src/main/protobuf/sentencepiece_model.proto +++ b/linguistics-components/src/main/protobuf/sentencepiece_model.proto diff --git a/linguistics/src/main/resources/configdefinitions/language.sentencepiece.sentence-piece.def b/linguistics-components/src/main/resources/configdefinitions/language.sentencepiece.sentence-piece.def index 0b8e5103aa3..b91c0c45dc4 100644 --- a/linguistics/src/main/resources/configdefinitions/language.sentencepiece.sentence-piece.def +++ b/linguistics-components/src/main/resources/configdefinitions/language.sentencepiece.sentence-piece.def @@ -15,4 +15,4 @@ scoring enum { highestScore, fewestSegments } default=fewestSegments # Use "unknown" for models to be used with any language. model[].language string # The path to the model relative to the application package root -model[].path path +model[].path path
\ No newline at end of file diff --git a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java index edbbe21ec53..edbbe21ec53 100644 --- a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java +++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceConfigurationTest.java diff --git a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java index d60d7386d4b..d60d7386d4b 100644 --- a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java +++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTest.java diff --git a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java index 1ba7c9b472d..1ba7c9b472d 100644 --- a/linguistics/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java +++ b/linguistics-components/src/test/java/com/yahoo/language/sentencepiece/SentencePieceTester.java diff --git a/linguistics/src/test/models/sentencepiece/en.wiki.bpe.vs10000.model b/linguistics-components/src/test/models/sentencepiece/en.wiki.bpe.vs10000.model Binary files differindex 89f93ef3517..89f93ef3517 100644 --- a/linguistics/src/test/models/sentencepiece/en.wiki.bpe.vs10000.model +++ b/linguistics-components/src/test/models/sentencepiece/en.wiki.bpe.vs10000.model diff --git a/linguistics/src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model b/linguistics-components/src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model Binary files differindex 41c0688d9df..41c0688d9df 100644 --- 
a/linguistics/src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model +++ b/linguistics-components/src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json index dbf4842ea1a..cfbf2abda1a 100644 --- a/linguistics/abi-spec.json +++ b/linguistics/abi-spec.json @@ -731,192 +731,5 @@ "public abstract java.lang.String accentDrop(java.lang.String, com.yahoo.language.Language)" ], "fields": [] - }, - "com.yahoo.language.sentencepiece.Scoring": { - "superClass": "java.lang.Enum", - "interfaces": [], - "attributes": [ - "public", - "final", - "enum" - ], - "methods": [ - "public static com.yahoo.language.sentencepiece.Scoring[] values()", - "public static com.yahoo.language.sentencepiece.Scoring valueOf(java.lang.String)" - ], - "fields": [ - "public static final enum com.yahoo.language.sentencepiece.Scoring highestScore", - "public static final enum com.yahoo.language.sentencepiece.Scoring fewestSegments" - ] - }, - "com.yahoo.language.sentencepiece.SentencePieceConfig$Builder": { - "superClass": "java.lang.Object", - "interfaces": [ - "com.yahoo.config.ConfigInstance$Builder" - ], - "attributes": [ - "public" - ], - "methods": [ - "public void <init>()", - "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig)", - "public com.yahoo.language.sentencepiece.SentencePieceConfig$Builder collapseUnknowns(boolean)", - "public com.yahoo.language.sentencepiece.SentencePieceConfig$Builder scoring(com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum)", - "public com.yahoo.language.sentencepiece.SentencePieceConfig$Builder model(com.yahoo.language.sentencepiece.SentencePieceConfig$Model$Builder)", - "public com.yahoo.language.sentencepiece.SentencePieceConfig$Builder model(java.util.List)", - "public final boolean dispatchGetConfig(com.yahoo.config.ConfigInstance$Producer)", - "public final java.lang.String getDefMd5()", - "public final java.lang.String getDefName()", - "public final 
java.lang.String getDefNamespace()", - "public final boolean getApplyOnRestart()", - "public final void setApplyOnRestart(boolean)", - "public com.yahoo.language.sentencepiece.SentencePieceConfig build()" - ], - "fields": [ - "public java.util.List model" - ] - }, - "com.yahoo.language.sentencepiece.SentencePieceConfig$Model$Builder": { - "superClass": "java.lang.Object", - "interfaces": [ - "com.yahoo.config.ConfigBuilder" - ], - "attributes": [ - "public" - ], - "methods": [ - "public void <init>()", - "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig$Model)", - "public com.yahoo.language.sentencepiece.SentencePieceConfig$Model$Builder language(java.lang.String)", - "public com.yahoo.language.sentencepiece.SentencePieceConfig$Model$Builder path(com.yahoo.config.FileReference)", - "public com.yahoo.language.sentencepiece.SentencePieceConfig$Model build()" - ], - "fields": [] - }, - "com.yahoo.language.sentencepiece.SentencePieceConfig$Model": { - "superClass": "com.yahoo.config.InnerNode", - "interfaces": [], - "attributes": [ - "public", - "final" - ], - "methods": [ - "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig$Model$Builder)", - "public java.lang.String language()", - "public java.nio.file.Path path()" - ], - "fields": [] - }, - "com.yahoo.language.sentencepiece.SentencePieceConfig$Producer": { - "superClass": "java.lang.Object", - "interfaces": [ - "com.yahoo.config.ConfigInstance$Producer" - ], - "attributes": [ - "public", - "interface", - "abstract" - ], - "methods": [ - "public abstract void getConfig(com.yahoo.language.sentencepiece.SentencePieceConfig$Builder)" - ], - "fields": [] - }, - "com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum": { - "superClass": "java.lang.Enum", - "interfaces": [], - "attributes": [ - "public", - "final", - "enum" - ], - "methods": [ - "public static com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum[] values()", - "public static 
com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum valueOf(java.lang.String)" - ], - "fields": [ - "public static final enum com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum highestScore", - "public static final enum com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum fewestSegments" - ] - }, - "com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring": { - "superClass": "com.yahoo.config.EnumNode", - "interfaces": [], - "attributes": [ - "public", - "final" - ], - "methods": [ - "public void <init>()", - "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum)" - ], - "fields": [ - "public static final com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum highestScore", - "public static final com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum fewestSegments" - ] - }, - "com.yahoo.language.sentencepiece.SentencePieceConfig": { - "superClass": "com.yahoo.config.ConfigInstance", - "interfaces": [], - "attributes": [ - "public", - "final" - ], - "methods": [ - "public static java.lang.String getDefMd5()", - "public static java.lang.String getDefName()", - "public static java.lang.String getDefNamespace()", - "public static java.lang.String getDefVersion()", - "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig$Builder)", - "public boolean collapseUnknowns()", - "public com.yahoo.language.sentencepiece.SentencePieceConfig$Scoring$Enum scoring()", - "public java.util.List model()", - "public com.yahoo.language.sentencepiece.SentencePieceConfig$Model model(int)" - ], - "fields": [ - "public static final java.lang.String CONFIG_DEF_MD5", - "public static final java.lang.String CONFIG_DEF_NAME", - "public static final java.lang.String CONFIG_DEF_NAMESPACE", - "public static final java.lang.String CONFIG_DEF_VERSION", - "public static final java.lang.String[] CONFIG_DEF_SCHEMA" - ] - }, - 
"com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder": { - "superClass": "java.lang.Object", - "interfaces": [], - "attributes": [ - "public" - ], - "methods": [ - "public void <init>()", - "public void addModel(com.yahoo.language.Language, java.nio.file.Path)", - "public com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder addDefaultModel(java.nio.file.Path)", - "public java.util.Map getModels()", - "public com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder setCollapseUnknowns(boolean)", - "public boolean getCollapseUnknowns()", - "public com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder setScoring(com.yahoo.language.sentencepiece.Scoring)", - "public com.yahoo.language.sentencepiece.Scoring getScoring()", - "public com.yahoo.language.sentencepiece.SentencePieceEncoder build()" - ], - "fields": [] - }, - "com.yahoo.language.sentencepiece.SentencePieceEncoder": { - "superClass": "java.lang.Object", - "interfaces": [ - "com.yahoo.language.process.Segmenter", - "com.yahoo.language.process.Encoder" - ], - "attributes": [ - "public" - ], - "methods": [ - "public void <init>(com.yahoo.language.sentencepiece.SentencePieceConfig)", - "public void <init>(com.yahoo.language.sentencepiece.SentencePieceEncoder$Builder)", - "public java.util.List segment(java.lang.String, com.yahoo.language.Language)", - "public java.util.List encode(java.lang.String, com.yahoo.language.Language)", - "public com.yahoo.tensor.Tensor encode(java.lang.String, com.yahoo.language.Language, com.yahoo.tensor.TensorType)", - "public java.lang.String normalize(java.lang.String)" - ], - "fields": [] } }
\ No newline at end of file diff --git a/linguistics/pom.xml b/linguistics/pom.xml index 221d7181616..0e5f9e15b85 100644 --- a/linguistics/pom.xml +++ b/linguistics/pom.xml @@ -15,10 +15,6 @@ <version>7-SNAPSHOT</version> <dependencies> <dependency> - <groupId>com.google.protobuf</groupId> - <artifactId>protobuf-java</artifactId> - </dependency> - <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> <scope>test</scope> @@ -94,6 +94,7 @@ <module>jdisc_jetty</module> <module>jrt</module> <module>linguistics</module> + <module>linguistics-components</module> <module>logd</module> <module>logserver</module> <module>messagebus</module> |