summaryrefslogtreecommitdiffstats
path: root/config-model
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2022-08-23 11:12:39 +0200
committerJon Bratseth <bratseth@gmail.com>2022-08-23 11:12:39 +0200
commit207f2963125c9094d65f50f5ea41d98cf3ba8524 (patch)
tree2011e1e3cfddc4eed10411af02eb728526e12e31 /config-model
parent7e290cd7574f69071490dbfb78da9e2773a863e7 (diff)
Simplify and generalize
Let embedder rewrites simply always change <x path="y" url="z"> to <xPath>y</xPath> <xUrl>z</xUrl>, as well as translating the id attribute to the corresponding path if on hosted. This means that the current built-in embedder accepting "vocab" and "model" is broken as these names are different from the names in the config model, but this isn't documented yet so should be okay. The effect of this is that our built-in models can be used in any embedder and config, and the embedder syntax can be used in application-specific embedders.
Diffstat (limited to 'config-model')
-rw-r--r--config-model/src/main/java/com/yahoo/vespa/model/container/xml/embedder/EmbedderConfig.java65
-rw-r--r--config-model/src/main/resources/schema/common.rnc3
-rw-r--r--config-model/src/main/resources/schema/containercluster.rnc9
-rw-r--r--config-model/src/main/resources/schema/docproc.rnc1
-rw-r--r--config-model/src/test/cfg/application/embed/services.xml4
-rw-r--r--config-model/src/test/cfg/application/embed_generic/configdefinitions/sentence-embedder.def3
-rw-r--r--config-model/src/test/java/com/yahoo/vespa/model/container/xml/EmbedderTestCase.java134
-rw-r--r--config-model/src/test/schema-test-files/services.xml1
8 files changed, 109 insertions, 111 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/container/xml/embedder/EmbedderConfig.java b/config-model/src/main/java/com/yahoo/vespa/model/container/xml/embedder/EmbedderConfig.java
index fa531176b9c..f28776fd0ae 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/container/xml/embedder/EmbedderConfig.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/container/xml/embedder/EmbedderConfig.java
@@ -1,9 +1,10 @@
package com.yahoo.vespa.model.container.xml.embedder;
import com.yahoo.config.model.deploy.DeployState;
+import com.yahoo.text.XML;
import org.w3c.dom.Element;
-import org.w3c.dom.Node;
-import org.w3c.dom.NodeList;
+
+import java.util.Map;
/**
* Translates config in services.xml of the form
@@ -26,23 +27,57 @@ import org.w3c.dom.NodeList;
*/
public class EmbedderConfig {
+ // Until we have optional path parameters, use services.xml as it is guaranteed to exist
+ private final static String dummyPath = "services.xml";
+
/**
* Transforms the &lt;embedder ...&gt; element to component configuration.
*
* @param deployState the deploy state - as config generation can depend on context
- * @param embedderSpec the XML element containing the &lt;embedder ...&gt;
+ * @param embedder the XML element containing the &lt;embedder ...&gt;
* @return a new XML element containting the &lt;component ...&gt; configuration
*/
- public static Element transform(DeployState deployState, Element embedderSpec) {
- EmbedderConfigTransformer transformer = getEmbedderTransformer(embedderSpec, deployState.isHosted());
- NodeList children = embedderSpec.getChildNodes();
- for (int i = 0; i < children.getLength(); i++) {
- Node child = children.item(i);
- if (child instanceof Element) {
- transformer.addOption((Element) child);
- }
+ public static Element transform(DeployState deployState, Element embedder) {
+ Element component = XML.getDocumentBuilder().newDocument().createElement("component");
+ component.setAttribute("id", embedder.getAttribute("id"));
+ component.setAttribute("class", embedderClassFrom(embedder));
+ component.setAttribute("bundle", embedder.hasAttribute("bundle") ? embedder.getAttribute("bundle") : "model-integration");
+
+ String configDef = embedderConfigFrom(embedder);
+ if ( ! configDef.isEmpty()) {
+ Element config = component.getOwnerDocument().createElement("config");
+ config.setAttribute("name", configDef);
+ for (Element child : XML.getChildren(embedder))
+ addConfigValue(child, config, deployState.isHosted());
+ component.appendChild(config);
+ }
+
+ return component;
+ }
+
+ /** Adds a config value from an embedder element into a regular config. */
+ private static void addConfigValue(Element value, Element config, boolean hosted) {
+ if (value.hasAttribute("path")) {
+ addChild(value.getTagName() + "Url", "", config);
+ addChild(value.getTagName() + "Path", value.getAttribute("path"), config);
+ }
+ else if (value.hasAttribute("id") && hosted) {
+ addChild(value.getTagName() + "Url", modelIdToUrl(value.getAttribute("id")), config);
+ addChild(value.getTagName() + "Path", dummyPath, config);
+ }
+ else if (value.hasAttribute("url")) {
+ addChild(value.getTagName() + "Url", value.getAttribute("url"), config);
+ addChild(value.getTagName() + "Path", dummyPath, config);
}
- return transformer.createComponentConfig(deployState);
+ else {
+ addChild(value.getTagName(), value.getTextContent(), config);
+ }
+ }
+
+ private static void addChild(String name, String value, Element parent) {
+ Element element = parent.getOwnerDocument().createElement(name);
+ element.setTextContent(value);
+ parent.appendChild(element);
}
private static EmbedderConfigTransformer getEmbedderTransformer(Element spec, boolean hosted) {
@@ -57,7 +92,7 @@ public class EmbedderConfig {
if ( ! explicitDefinition.isEmpty()) return explicitDefinition;
// Implicit from class name
- return switch (getEmbedderClass(spec)) {
+ return switch (embedderClassFrom(spec)) {
case "ai.vespa.embedding.BertBaseEmbedder" -> "embedding.bert-base-embedder";
default -> "";
};
@@ -72,10 +107,10 @@ public class EmbedderConfig {
case "bert-base-uncased":
return "https://data.vespa.oath.cloud/onnx_models/bert-base-uncased-vocab.txt";
}
- throw new IllegalArgumentException("Unknown model id: '" + id + "'");
+ throw new IllegalArgumentException("Unknown model id '" + id + "'");
}
- private static String getEmbedderClass(Element spec) {
+ private static String embedderClassFrom(Element spec) {
if (spec.hasAttribute("class")) {
return spec.getAttribute("class");
}
diff --git a/config-model/src/main/resources/schema/common.rnc b/config-model/src/main/resources/schema/common.rnc
index 7259a3159ff..2584725eb9c 100644
--- a/config-model/src/main/resources/schema/common.rnc
+++ b/config-model/src/main/resources/schema/common.rnc
@@ -65,7 +65,8 @@ ComponentDefinition =
ComponentId &
BundleSpec &
GenericConfig* &
- Component*
+ Component* &
+ Embedder*
diff --git a/config-model/src/main/resources/schema/containercluster.rnc b/config-model/src/main/resources/schema/containercluster.rnc
index 006db8fe510..b3b021260e4 100644
--- a/config-model/src/main/resources/schema/containercluster.rnc
+++ b/config-model/src/main/resources/schema/containercluster.rnc
@@ -31,7 +31,8 @@ ClientAuthorize = element client-authorize { empty }
Components = element components {
Include* &
- Component*
+ Component* &
+ Embedder*
}
Include = element \include {
@@ -97,10 +98,10 @@ ZooKeeper = element zookeeper {
}
Embedder = element embedder {
- attribute id { string }? &
+ attribute id { string } &
attribute class { xsd:Name | JavaId }? &
- attribute bundle { xsd:Name }? &
- attribute def { xsd:Name }? &
+ attribute bundle { xsd:Name } &
+ attribute def { xsd:Name } &
anyElement*
}
diff --git a/config-model/src/main/resources/schema/docproc.rnc b/config-model/src/main/resources/schema/docproc.rnc
index 11f8e14fb2d..b4db09f2fb8 100644
--- a/config-model/src/main/resources/schema/docproc.rnc
+++ b/config-model/src/main/resources/schema/docproc.rnc
@@ -50,6 +50,7 @@ ClusterV3 = element cluster {
GenericConfig* &
SchemaMapping? &
Component* &
+ Embedder* &
Handler* &
DocprocChainsV3?
}
diff --git a/config-model/src/test/cfg/application/embed/services.xml b/config-model/src/test/cfg/application/embed/services.xml
index f319d875ed9..9a05337f954 100644
--- a/config-model/src/test/cfg/application/embed/services.xml
+++ b/config-model/src/test/cfg/application/embed/services.xml
@@ -11,8 +11,8 @@
<embedder id="transformer" class="ai.vespa.embedding.BertBaseEmbedder">
<!-- model specifics -->
- <model id="test-model-id" url="test-model-url"/>
- <vocab path="files/vocab.txt"/>
+ <transformerModel id="test-model-id" url="test-model-url"/>
+ <tokenizerVocab path="files/vocab.txt"/>
<!-- tunable parameters: number of threads etc -->
<onnxIntraOpThreads>4</onnxIntraOpThreads>
diff --git a/config-model/src/test/cfg/application/embed_generic/configdefinitions/sentence-embedder.def b/config-model/src/test/cfg/application/embed_generic/configdefinitions/sentence-embedder.def
index f62e2019189..ac5c79d2714 100644
--- a/config-model/src/test/cfg/application/embed_generic/configdefinitions/sentence-embedder.def
+++ b/config-model/src/test/cfg/application/embed_generic/configdefinitions/sentence-embedder.def
@@ -4,7 +4,8 @@ package=ai.vespa.example.paragraph
vocab path
# Transformer model settings
-model path
+modelPath path
+modelUrl string
# Max length of token sequence model can handle
transforerMaxTokens int default=128
diff --git a/config-model/src/test/java/com/yahoo/vespa/model/container/xml/EmbedderTestCase.java b/config-model/src/test/java/com/yahoo/vespa/model/container/xml/EmbedderTestCase.java
index 766c2b11256..fef461a4b7a 100644
--- a/config-model/src/test/java/com/yahoo/vespa/model/container/xml/EmbedderTestCase.java
+++ b/config-model/src/test/java/com/yahoo/vespa/model/container/xml/EmbedderTestCase.java
@@ -34,101 +34,63 @@ public class EmbedderTestCase {
@Test
void testGenericEmbedConfig() throws IOException, SAXException {
- String embedder = "<embedder id=\"test\" class=\"ai.vespa.test\" bundle=\"bundle\" def=\"def.name\">" +
- " <val>123</val>" +
- "</embedder>";
- String component = "<component id=\"test\" class=\"ai.vespa.test\" bundle=\"bundle\">" +
- " <config name=\"def.name\">" +
- " <val>123</val>" +
- " </config>" +
- "</component>";
+ String embedder = "<embedder id='test' class='ai.vespa.test' bundle='bundle' def='def.name'>" +
+ " <val>123</val>" +
+ "</embedder>";
+ String component = "<component id='test' class='ai.vespa.test' bundle='bundle'>" +
+ " <config name='def.name'>" +
+ " <val>123</val>" +
+ " </config>" +
+ "</component>";
assertTransform(embedder, component);
}
@Test
- void testGenericEmbedConfigRequiresBundleAndDef() throws IOException, SAXException {
- assertTransformThrows("<embedder id=\"test\" class=\"ai.vespa.test\"></embedder>",
- "Embedder configuration requires a bundle name");
- assertTransformThrows("<embedder id=\"test\" class=\"ai.vespa.test\" bundle=\"bundle\"></embedder>",
- "Embedder configuration requires a config definition name");
- }
-
- @Test
void testPredefinedEmbedConfigSelfHosted() throws IOException, SAXException {
- String embedder = "<embedder id=\"test\" class=\"" + PREDEFINED_EMBEDDER_CLASS + "\">" +
- " <model id=\"my_model_id\" url=\"my-model-url\" />" +
- " <vocab id=\"my_vocab_id\" url=\"my-vocab-url\" />" +
+ String embedder = "<embedder id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "'>" +
+ " <transformerModel id='my_model_id' url='my-model-url' />" +
+ " <tokenizerVocab id='my_vocab_id' url='my-vocab-url' />" +
"</embedder>";
- String component = "<component id=\"test\" class=\"" + PREDEFINED_EMBEDDER_CLASS + "\" bundle=\"model-integration\">" +
- " <config name=\"" + PREDEFINED_EMBEDDER_CONFIG + "\">" +
- " <tokenizerVocabUrl>my-vocab-url</tokenizerVocabUrl>" +
- " <tokenizerVocabPath></tokenizerVocabPath>" +
+ String component = "<component id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "' bundle='model-integration'>" +
+ " <config name='" + PREDEFINED_EMBEDDER_CONFIG + "'>" +
" <transformerModelUrl>my-model-url</transformerModelUrl>" +
" <transformerModelPath></transformerModelPath>" +
+ " <tokenizerVocabUrl>my-vocab-url</tokenizerVocabUrl>" +
+ " <tokenizerVocabPath></tokenizerVocabPath>" +
" </config>" +
"</component>";
assertTransform(embedder, component, false);
}
@Test
- void testIncorrectEmbedderOptionsSelfHosted() throws IOException, SAXException {
- assertTransformThrows("<embedder id=\"test\" class=\"" + PREDEFINED_EMBEDDER_CLASS + "\"></embedder>",
- "Embedder '" + PREDEFINED_EMBEDDER_CLASS + "' requires options for [vocab, model]");
- assertTransformThrows("<embedder id=\"test\" class=\"" + PREDEFINED_EMBEDDER_CLASS + "\">" +
- " <model />" +
- " <vocab />" +
- "</embedder>",
- "Model option requires either a 'path' or a 'url' attribute");
- assertTransformThrows("<embedder id=\"test\" class=\"" + PREDEFINED_EMBEDDER_CLASS + "\">" +
- " <model id=\"my_model_id\" />" +
- " <vocab id=\"my_vocab_id\" />" +
- "</embedder>",
- "Model option 'id' is not valid here");
- }
-
- @Test
- void testPathHasprioritySelfHosted() throws IOException, SAXException {
- String embedder = "<embedder id=\"test\" class=\"" + PREDEFINED_EMBEDDER_CLASS + "\">" +
- " <model id=\"my_model_id\" url=\"my-model-url\" path=\"files/model.onnx\" />" +
- " <vocab id=\"my_vocab_id\" url=\"my-vocab-url\" path=\"files/vocab.txt\" />" +
+ void testPathHasPrioritySelfHosted() throws IOException, SAXException {
+ String embedder = "<embedder id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "'>" +
+ " <transformerModel id='my_model_id' url='my-model-url' path='files/model.onnx' />" +
+ " <tokenizerVocab id='my_vocab_id' url='my-vocab-url' path='files/vocab.txt' />" +
"</embedder>";
- String component = "<component id=\"test\" class=\"" + PREDEFINED_EMBEDDER_CLASS + "\" bundle=\"model-integration\">" +
- " <config name=\"" + PREDEFINED_EMBEDDER_CONFIG + "\">" +
- " <tokenizerVocabPath>files/vocab.txt</tokenizerVocabPath>" +
- " <tokenizerVocabUrl></tokenizerVocabUrl>" +
- " <transformerModelPath>files/model.onnx</transformerModelPath>" +
+ String component = "<component id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "' bundle='model-integration'>" +
+ " <config name='" + PREDEFINED_EMBEDDER_CONFIG + "'>" +
" <transformerModelUrl></transformerModelUrl>" +
+ " <transformerModelPath>files/model.onnx</transformerModelPath>" +
+ " <tokenizerVocabUrl></tokenizerVocabUrl>" +
+ " <tokenizerVocabPath>files/vocab.txt</tokenizerVocabPath>" +
" </config>" +
"</component>";
assertTransform(embedder, component, false);
}
@Test
- void testPredefinedEmptyEmbedConfigCloud() throws IOException, SAXException {
- String embedder = "<embedder id=\"test\" class=\"" + PREDEFINED_EMBEDDER_CLASS + "\" />";
- String component = "<component id=\"test\" class=\"" + PREDEFINED_EMBEDDER_CLASS + "\" bundle=\"model-integration\">" +
- " <config name=\"" + PREDEFINED_EMBEDDER_CONFIG + "\">" +
- " <tokenizerVocabUrl>some url</tokenizerVocabUrl>" +
- " <tokenizerVocabPath></tokenizerVocabPath>" +
- " <transformerModelUrl>some url</transformerModelUrl>" +
- " <transformerModelPath></transformerModelPath>" +
- " </config>" +
- "</component>";
- assertTransform(embedder, component, true);
- }
-
- @Test
void testPredefinedEmbedConfigCloud() throws IOException, SAXException {
- String embedder = "<embedder id=\"test\" class=\"" + PREDEFINED_EMBEDDER_CLASS + "\">" +
- " <model id=\"test-model-id\" />" +
- " <vocab id=\"test-model-id\" />" +
+ String embedder = "<embedder id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "'>" +
+ " <transformerModel id='test-model-id' />" +
+ " <tokenizerVocab id='test-model-id' />" +
"</embedder>";
- String component = "<component id=\"test\" class=\"" + PREDEFINED_EMBEDDER_CLASS + "\" bundle=\"model-integration\">" +
- " <config name=\"" + PREDEFINED_EMBEDDER_CONFIG + "\">" +
- " <tokenizerVocabUrl>test-model-url</tokenizerVocabUrl>" +
- " <tokenizerVocabPath></tokenizerVocabPath>" +
+ String component = "<component id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "' bundle='model-integration'>" +
+ " <config name='" + PREDEFINED_EMBEDDER_CONFIG + "'>" +
" <transformerModelUrl>test-model-url</transformerModelUrl>" +
" <transformerModelPath></transformerModelPath>" +
+ " <tokenizerVocabUrl>test-model-url</tokenizerVocabUrl>" +
+ " <tokenizerVocabPath></tokenizerVocabPath>" +
" </config>" +
"</component>";
assertTransform(embedder, component, true);
@@ -136,16 +98,16 @@ public class EmbedderTestCase {
@Test
void testCustomEmbedderWithPredefinedConfigCloud() throws IOException, SAXException {
- String embedder = "<embedder id=\"test\" class=\"ApplicationSpecificEmbedder\" def=\"" + PREDEFINED_EMBEDDER_CONFIG + "\">" +
- " <model id=\"test-model-id\" />" +
- " <vocab id=\"test-model-id\" />" +
+ String embedder = "<embedder id='test' class='ApplicationSpecificEmbedder' def='" + PREDEFINED_EMBEDDER_CONFIG + "'>" +
+ " <transformerModel id='test-model-id' />" +
+ " <tokenizerVocab id='test-model-id' />" +
"</embedder>";
- String component = "<component id=\"test\" class=\"ApplicationSpecificEmbedder\" bundle=\"model-integration\">" +
- " <config name=\"" + PREDEFINED_EMBEDDER_CONFIG + "\">" +
- " <tokenizerVocabUrl>test-model-url</tokenizerVocabUrl>" +
- " <tokenizerVocabPath></tokenizerVocabPath>" +
+ String component = "<component id='test' class='ApplicationSpecificEmbedder' bundle='model-integration'>" +
+ " <config name='" + PREDEFINED_EMBEDDER_CONFIG + "'>" +
" <transformerModelUrl>test-model-url</transformerModelUrl>" +
" <transformerModelPath></transformerModelPath>" +
+ " <tokenizerVocabUrl>test-model-url</tokenizerVocabUrl>" +
+ " <tokenizerVocabPath></tokenizerVocabPath>" +
" </config>" +
"</component>";
assertTransform(embedder, component, true);
@@ -153,11 +115,11 @@ public class EmbedderTestCase {
@Test
void testUnknownModelIdCloud() throws IOException, SAXException {
- String embedder = "<embedder id=\"test\" class=\"" + PREDEFINED_EMBEDDER_CLASS + "\">" +
- " <model id=\"my_model_id\" />" +
- " <vocab id=\"my_vocab_id\" />" +
+ String embedder = "<embedder id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "'>" +
+ " <transformerModel id='my_model_id' />" +
+ " <tokenizerVocab id='my_vocab_id' />" +
"</embedder>";
- assertTransformThrows(embedder, "Unknown model id: 'my_vocab_id'", true);
+ assertTransformThrows(embedder, "Unknown model id 'my_model_id'", true);
}
@Test
@@ -190,7 +152,7 @@ public class EmbedderTestCase {
Component<?, ?> testComponent = containerCluster.getComponentsMap().get(new ComponentId("transformer"));
ConfigPayloadBuilder config = testComponent.getUserConfigs().get(new ConfigDefinitionKey("sentence-embedder", "ai.vespa.example.paragraph"));
assertEquals("files/vocab.txt", config.getObject("vocab").getValue());
- assertEquals("files/model.onnx", config.getObject("model").getValue());
+ assertEquals("files/model.onnx", config.getObject("modelPath").getValue());
}
private VespaModel loadModel(Path path, boolean hosted) throws Exception {
@@ -237,16 +199,12 @@ public class EmbedderTestCase {
}
}
- private void assertTransformThrows(String embedder, String msg) throws IOException, SAXException {
- assertTransformThrows(embedder, msg, false);
- }
-
- private void assertTransformThrows(String embedder, String msg, boolean hosted) throws IOException, SAXException {
+ private void assertTransformThrows(String embedder, String expectedMessage, boolean hosted) throws IOException, SAXException {
try {
EmbedderConfig.transform(createEmptyDeployState(hosted), createElement(embedder));
- fail("Expected exception was not thrown: " + msg);
+ fail("Expected exception was not thrown: " + expectedMessage);
} catch (IllegalArgumentException e) {
- assertEquals(e.getMessage(), msg);
+ assertEquals(expectedMessage, e.getMessage());
}
}
diff --git a/config-model/src/test/schema-test-files/services.xml b/config-model/src/test/schema-test-files/services.xml
index b32849bb55f..ffb28726d9a 100644
--- a/config-model/src/test/schema-test-files/services.xml
+++ b/config-model/src/test/schema-test-files/services.xml
@@ -196,6 +196,7 @@
<component id="injected-to-handler">
<config name="foo"/>
</component>
+ <embedder id="transformer" class="ai.vespa.example.SomeEmbedder" bundle="myBundle" def="my.def-file"/>
</handler>
<server id="server-provider">