summary | refs | log | tree | commit | diff | stats
path: root/linguistics-components
diff options
context:
space:
mode:
author Bjørn Christian Seime <bjorncs@yahooinc.com> 2023-06-02 12:10:32 +0200
committer Bjørn Christian Seime <bjorncs@yahooinc.com> 2023-06-02 12:10:32 +0200
commit a67788f2b7786a2cfcb9244d1e72a7fb1815425b (patch)
tree fa34be2f0f13ef4ea116dd12853c734de3bc2eca /linguistics-components
parent e757e5ff2e6dadbe31389c7dfeb3f52827a1668b (diff)
Introduce services.xml syntax for configuring HuggingFace embedders
Diffstat (limited to 'linguistics-components')
-rw-r--r-- linguistics-components/pom.xml 6
-rw-r--r-- linguistics-components/src/main/java/com/yahoo/language/huggingface/HuggingFaceTokenizer.java 1
-rw-r--r-- linguistics-components/src/main/resources/configdefinitions/language.huggingface.hugging-face-tokenizer.def 13
3 files changed, 7 insertions, 13 deletions
diff --git a/linguistics-components/pom.xml b/linguistics-components/pom.xml
index 5031ad73556..b3bc52c5e23 100644
--- a/linguistics-components/pom.xml
+++ b/linguistics-components/pom.xml
@@ -89,6 +89,12 @@
<scope>provided</scope>
<classifier>no_aop</classifier>
</dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>configdefinitions</artifactId>
+ <version>${project.version}</version>
+ <scope>compile</scope>
+ </dependency>
</dependencies>
<build>
<plugins>
diff --git a/linguistics-components/src/main/java/com/yahoo/language/huggingface/HuggingFaceTokenizer.java b/linguistics-components/src/main/java/com/yahoo/language/huggingface/HuggingFaceTokenizer.java
index f9a37bc477b..2c66fc18c9b 100644
--- a/linguistics-components/src/main/java/com/yahoo/language/huggingface/HuggingFaceTokenizer.java
+++ b/linguistics-components/src/main/java/com/yahoo/language/huggingface/HuggingFaceTokenizer.java
@@ -6,6 +6,7 @@ import com.yahoo.api.annotations.Beta;
import com.yahoo.component.AbstractComponent;
import com.yahoo.component.annotation.Inject;
import com.yahoo.language.Language;
+import com.yahoo.language.huggingface.config.HuggingFaceTokenizerConfig;
import com.yahoo.language.process.Embedder;
import com.yahoo.language.process.Segmenter;
import com.yahoo.language.tools.Embed;
diff --git a/linguistics-components/src/main/resources/configdefinitions/language.huggingface.hugging-face-tokenizer.def b/linguistics-components/src/main/resources/configdefinitions/language.huggingface.hugging-face-tokenizer.def
deleted file mode 100644
index 67b3b927f94..00000000000
--- a/linguistics-components/src/main/resources/configdefinitions/language.huggingface.hugging-face-tokenizer.def
+++ /dev/null
@@ -1,13 +0,0 @@
-# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-
-namespace=language.huggingface
-
-# The language a model is for, one of the language tags in com.yahoo.language.Language.
-# Use "unknown" for models to be used with any language.
-model[].language string
-# The path to the model relative to the application package root
-model[].path model
-
-addSpecialTokens bool default=true
-maxLength int default=-1
-truncation bool default=false
\ No newline at end of file