diff options
author | Jon Bratseth <bratseth@gmail.com> | 2022-08-26 09:58:10 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-08-26 09:58:10 +0200 |
commit | 40bb8680dbef01e603b8947a194c86e9acc14e30 (patch) | |
tree | 3badd5c97ff41449514805921e567c218661ab79 /config-model/src/test | |
parent | d227d62f0cef26ebdb30c0d5280a2462cd39767d (diff) | |
parent | ffab68b3f5c28034eaf3a606c1b220c14f7204fa (diff) |
Merge pull request #23770 from vespa-engine/bratseth/embedder-syntax-5
Bratseth/embedder syntax 5
Diffstat (limited to 'config-model/src/test')
5 files changed, 132 insertions, 104 deletions
diff --git a/config-model/src/test/cfg/application/embed/services.xml b/config-model/src/test/cfg/application/embed/services.xml index 9a05337f954..88558ace4bf 100644 --- a/config-model/src/test/cfg/application/embed/services.xml +++ b/config-model/src/test/cfg/application/embed/services.xml @@ -4,19 +4,16 @@ <container version="1.0"> - <embedder id="test" class="ai.vespa.embedding.UndefinedEmbedder" bundle="dummy" def="test.dummy"> - <num>12</num> - <str>some text</str> - </embedder> + <component id="transformer" class="ai.vespa.embedding.BertBaseEmbedder" bindle="model-integration"> + <config name="embedding.bert-base-embedder"> + <!-- model specifics --> + <transformerModel id="minilm-l6-v2" url="application-url"/> + <tokenizerVocab path="files/vocab.txt"/> - <embedder id="transformer" class="ai.vespa.embedding.BertBaseEmbedder"> - <!-- model specifics --> - <transformerModel id="test-model-id" url="test-model-url"/> - <tokenizerVocab path="files/vocab.txt"/> - - <!-- tunable parameters: number of threads etc --> - <onnxIntraOpThreads>4</onnxIntraOpThreads> - </embedder> + <!-- tunable parameters: number of threads etc --> + <onnxIntraOpThreads>4</onnxIntraOpThreads> + </config> + </component> <nodes> <node hostalias="node1" /> diff --git a/config-model/src/test/cfg/application/embed_generic/configdefinitions/sentence-embedder.def b/config-model/src/test/cfg/application/embed_generic/configdefinitions/sentence-embedder.def index ac5c79d2714..81fc88dbf01 100644 --- a/config-model/src/test/cfg/application/embed_generic/configdefinitions/sentence-embedder.def +++ b/config-model/src/test/cfg/application/embed_generic/configdefinitions/sentence-embedder.def @@ -1,12 +1,15 @@ package=ai.vespa.example.paragraph # Settings for wordpiece tokenizer -vocab path +vocabPath path +vocabUrl string # Transformer model settings modelPath path modelUrl string +myValue string + # Max length of token sequence model can handle transforerMaxTokens int default=128 diff --git a/config-model/src/test/cfg/application/embed_generic/services.xml b/config-model/src/test/cfg/application/embed_generic/services.xml index ab2c1be9745..ea430f24e2f 100644 --- a/config-model/src/test/cfg/application/embed_generic/services.xml +++ b/config-model/src/test/cfg/application/embed_generic/services.xml @@ -4,13 +4,15 @@ <container version="1.0"> - <embedder id='transformer' - class='ai.vespa.example.paragraph.ApplicationSpecificEmbedder' - bundle='exampleEmbedder' - def='ai.vespa.example.paragraph.sentence-embedder'> - <model path="files/model.onnx" /> <!-- Embedder syntax for file path --> - <vocab>files/vocab.txt</vocab> <!-- Generic config syntax for file path --> - </embedder> + <component id='transformer' + class='ai.vespa.example.paragraph.ApplicationSpecificEmbedder' + bundle='exampleEmbedder'> + <config name='ai.vespa.example.paragraph.sentence-embedder'> + <model id="minilm-l6-v2" url="application-url" /> + <vocab path="files/vocab.txt"/> + <myValue>foo</myValue> + </config> + </component> <nodes> <node hostalias='node1'/> diff --git a/config-model/src/test/java/com/yahoo/vespa/model/container/xml/EmbedderTestCase.java b/config-model/src/test/java/com/yahoo/vespa/model/container/xml/EmbedderTestCase.java index d64e726eb6a..ffa7e52136f 100644 --- a/config-model/src/test/java/com/yahoo/vespa/model/container/xml/EmbedderTestCase.java +++ b/config-model/src/test/java/com/yahoo/vespa/model/container/xml/EmbedderTestCase.java @@ -28,130 +28,158 @@ import static org.junit.jupiter.api.Assertions.fail; public class EmbedderTestCase { - private static final String PREDEFINED_EMBEDDER_CLASS = "ai.vespa.embedding.BertBaseEmbedder"; - private static final String PREDEFINED_EMBEDDER_CONFIG = "embedding.bert-base-embedder"; + private static final String emptyPathFileName = "services.xml"; + private static final String BUNDLED_EMBEDDER_CLASS = "ai.vespa.embedding.BertBaseEmbedder"; + private static final String BUNDLED_EMBEDDER_CONFIG = "embedding.bert-base-embedder"; @Test - void testGenericEmbedConfig() throws IOException, SAXException { - String embedder = "<embedder id='test' class='ai.vespa.test' bundle='bundle' def='def.name'>" + - " <val>123</val>" + - "</embedder>"; - String component = "<component id='test' class='ai.vespa.test' bundle='bundle'>" + - " <config name='def.name'>" + - " <val>123</val>" + - " </config>" + - "</component>"; - assertTransform(embedder, component); - } - - @Test - void testPredefinedEmbedConfigSelfHosted() throws IOException, SAXException { - String embedder = "<embedder id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "'>" + - " <transformerModel id='my_model_id' url='my-model-url' />" + - " <tokenizerVocab id='my_vocab_id' url='my-vocab-url' />" + - "</embedder>"; - String component = "<component id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "' bundle='model-integration'>" + - " <config name='" + PREDEFINED_EMBEDDER_CONFIG + "'>" + + void testBundledEmbedder_selfhosted() throws IOException, SAXException { + String input = "<component id='test' class='" + BUNDLED_EMBEDDER_CLASS + "' bundle='model-integration'>" + + " <config name='" + BUNDLED_EMBEDDER_CONFIG + "'>" + + " <transformerModel id='my_model_id' url='my-model-url' />" + + " <tokenizerVocab id='my_vocab_id' url='my-vocab-url' />" + + " </config>" + + "</component>"; + String component = "<component id='test' class='" + BUNDLED_EMBEDDER_CLASS + "' bundle='model-integration'>" + + " <config name='" + BUNDLED_EMBEDDER_CONFIG + "'>" + " <transformerModelUrl>my-model-url</transformerModelUrl>" + - " <transformerModelPath></transformerModelPath>" + + " <transformerModelPath>services.xml</transformerModelPath>" + " <tokenizerVocabUrl>my-vocab-url</tokenizerVocabUrl>" + - " <tokenizerVocabPath></tokenizerVocabPath>" + + " <tokenizerVocabPath>services.xml</tokenizerVocabPath>" + " </config>" + "</component>"; - assertTransform(embedder, component, false); + assertTransform(input, component, false); } @Test - void testPathHasPrioritySelfHosted() throws IOException, SAXException { - String embedder = "<embedder id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "'>" + - " <transformerModel id='my_model_id' url='my-model-url' path='files/model.onnx' />" + - " <tokenizerVocab id='my_vocab_id' url='my-vocab-url' path='files/vocab.txt' />" + - "</embedder>"; - String component = "<component id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "' bundle='model-integration'>" + - " <config name='" + PREDEFINED_EMBEDDER_CONFIG + "'>" + + void testPathHasPriority_selfhosted() throws IOException, SAXException { + String input = "<component id='test' class='" + BUNDLED_EMBEDDER_CLASS + "' bundle='model-integration'>" + + " <config name='" + BUNDLED_EMBEDDER_CONFIG + "'>" + + " <transformerModel id='my_model_id' url='my-model-url' path='files/model.onnx' />" + + " <tokenizerVocab id='my_vocab_id' url='my-vocab-url' path='files/vocab.txt' />" + + " </config>" + + "</component>"; + String component = "<component id='test' class='" + BUNDLED_EMBEDDER_CLASS + "' bundle='model-integration'>" + + " <config name='" + BUNDLED_EMBEDDER_CONFIG + "'>" + " <transformerModelUrl></transformerModelUrl>" + " <transformerModelPath>files/model.onnx</transformerModelPath>" + " <tokenizerVocabUrl></tokenizerVocabUrl>" + " <tokenizerVocabPath>files/vocab.txt</tokenizerVocabPath>" + " </config>" + "</component>"; - assertTransform(embedder, component, false); + assertTransform(input, component, false); } @Test - void testPredefinedEmbedConfigCloud() throws IOException, SAXException { - String embedder = "<embedder id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "'>" + - " <transformerModel id='test-model-id' />" + - " <tokenizerVocab id='test-model-id' />" + - "</embedder>"; - String component = "<component id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "' bundle='model-integration'>" + - " <config name='" + PREDEFINED_EMBEDDER_CONFIG + "'>" + - " <transformerModelUrl>test-model-url</transformerModelUrl>" + - " <transformerModelPath></transformerModelPath>" + - " <tokenizerVocabUrl>test-model-url</tokenizerVocabUrl>" + - " <tokenizerVocabPath></tokenizerVocabPath>" + + void testBundledEmbedder_hosted() throws IOException, SAXException { + String input = "<component id='test' class='" + BUNDLED_EMBEDDER_CLASS + "' bundle='model-integration'>" + + " <config name='" + BUNDLED_EMBEDDER_CONFIG + "'>" + + " <transformerModel id='minilm-l6-v2' />" + + " <tokenizerVocab id='bert-base-uncased' />" + + " </config>" + + "</component>"; + String component = "<component id='test' class='" + BUNDLED_EMBEDDER_CLASS + "' bundle='model-integration'>" + + " <config name='" + BUNDLED_EMBEDDER_CONFIG + "'>" + + " <transformerModelUrl>https://data.vespa.oath.cloud/onnx_models/sentence_all_MiniLM_L6_v2.onnx</transformerModelUrl>" + + " <transformerModelPath>services.xml</transformerModelPath>" + + " <tokenizerVocabUrl>https://data.vespa.oath.cloud/onnx_models/bert-base-uncased-vocab.txt</tokenizerVocabUrl>" + + " <tokenizerVocabPath>services.xml</tokenizerVocabPath>" + " </config>" + "</component>"; - assertTransform(embedder, component, true); + assertTransform(input, component, true); } @Test - void testCustomEmbedderWithPredefinedConfigCloud() throws IOException, SAXException { - String embedder = "<embedder id='test' class='ApplicationSpecificEmbedder' def='" + PREDEFINED_EMBEDDER_CONFIG + "'>" + - " <transformerModel id='test-model-id' />" + - " <tokenizerVocab id='test-model-id' />" + - "</embedder>"; + void testApplicationEmbedderWithBundledConfig_hosted() throws IOException, SAXException { + String input = "<component id='test' class='ApplicationSpecificEmbedder' bundle='model-integration'>" + + " <config name='" + BUNDLED_EMBEDDER_CONFIG + "'>" + + " <transformerModel id='minilm-l6-v2' />" + + " <tokenizerVocab id='bert-base-uncased' />" + + " </config>" + + "</component>"; String component = "<component id='test' class='ApplicationSpecificEmbedder' bundle='model-integration'>" + - " <config name='" + PREDEFINED_EMBEDDER_CONFIG + "'>" + - " <transformerModelUrl>test-model-url</transformerModelUrl>" + - " <transformerModelPath></transformerModelPath>" + - " <tokenizerVocabUrl>test-model-url</tokenizerVocabUrl>" + - " <tokenizerVocabPath></tokenizerVocabPath>" + + " <config name='" + BUNDLED_EMBEDDER_CONFIG + "'>" + + " <transformerModelUrl>https://data.vespa.oath.cloud/onnx_models/sentence_all_MiniLM_L6_v2.onnx</transformerModelUrl>" + + " <transformerModelPath>services.xml</transformerModelPath>" + + " <tokenizerVocabUrl>https://data.vespa.oath.cloud/onnx_models/bert-base-uncased-vocab.txt</tokenizerVocabUrl>" + + " <tokenizerVocabPath>services.xml</tokenizerVocabPath>" + " </config>" + "</component>"; - assertTransform(embedder, component, true); + assertTransform(input, component, true); } @Test - void testUnknownModelIdCloud() throws IOException, SAXException { - String embedder = "<embedder id='test' class='" + PREDEFINED_EMBEDDER_CLASS + "'>" + - " <transformerModel id='my_model_id' />" + - " <tokenizerVocab id='my_vocab_id' />" + - "</embedder>"; - assertTransformThrows(embedder, "Unknown model id 'my_model_id'", true); + void testUnknownModelId_hosted() throws IOException, SAXException { + String embedder = "<component id='test' class='" + BUNDLED_EMBEDDER_CLASS + "'>" + + " <config name='" + BUNDLED_EMBEDDER_CONFIG + "'>" + + " <transformerModel id='my_model_id' />" + + " <tokenizerVocab id='my_vocab_id' />" + + " </config>" + + "</component>"; + assertTransformThrows(embedder, + "Unknown embedder model 'my_model_id'. " + + "Available models are [bert-base-uncased, minilm-l6-v2]", + true); } @Test - void testApplicationWithEmbedConfig() throws Exception { - final String emptyPathFileName = "services.xml"; - + void testApplicationPackageWithEmbedder_selfhosted() throws Exception { Path applicationDir = Path.fromString("src/test/cfg/application/embed/"); VespaModel model = loadModel(applicationDir, false); ApplicationContainerCluster containerCluster = model.getContainerClusters().get("container"); - Component<?, ?> testComponent = containerCluster.getComponentsMap().get(new ComponentId("test")); - ConfigPayloadBuilder testConfig = testComponent.getUserConfigs().get(new ConfigDefinitionKey("dummy", "test")); - assertEquals("12", testConfig.getObject("num").getValue()); - assertEquals("some text", testConfig.getObject("str").getValue()); + Component<?, ?> transformer = containerCluster.getComponentsMap().get(new ComponentId("transformer")); + ConfigPayloadBuilder config = transformer.getUserConfigs().get(new ConfigDefinitionKey("bert-base-embedder", "embedding")); + assertEquals("application-url", config.getObject("transformerModelUrl").getValue()); + assertEquals(emptyPathFileName, config.getObject("transformerModelPath").getValue()); + assertEquals("", config.getObject("tokenizerVocabUrl").getValue()); + assertEquals("files/vocab.txt", config.getObject("tokenizerVocabPath").getValue()); + assertEquals("4", config.getObject("onnxIntraOpThreads").getValue()); + } + + @Test + void testApplicationPackageWithEmbedder_hosted() throws Exception { + Path applicationDir = Path.fromString("src/test/cfg/application/embed/"); + VespaModel model = loadModel(applicationDir, true); + ApplicationContainerCluster containerCluster = model.getContainerClusters().get("container"); Component<?, ?> transformer = containerCluster.getComponentsMap().get(new ComponentId("transformer")); - ConfigPayloadBuilder transformerConfig = transformer.getUserConfigs().get(new ConfigDefinitionKey("bert-base-embedder", "embedding")); - assertEquals("test-model-url", transformerConfig.getObject("transformerModelUrl").getValue()); - assertEquals(emptyPathFileName, transformerConfig.getObject("transformerModelPath").getValue()); - assertEquals("", transformerConfig.getObject("tokenizerVocabUrl").getValue()); - assertEquals("files/vocab.txt", transformerConfig.getObject("tokenizerVocabPath").getValue()); + ConfigPayloadBuilder config = transformer.getUserConfigs().get(new ConfigDefinitionKey("bert-base-embedder", "embedding")); + assertEquals("https://data.vespa.oath.cloud/onnx_models/sentence_all_MiniLM_L6_v2.onnx", + config.getObject("transformerModelUrl").getValue()); + assertEquals(emptyPathFileName, config.getObject("transformerModelPath").getValue()); + assertEquals("", config.getObject("tokenizerVocabUrl").getValue()); + assertEquals("files/vocab.txt", config.getObject("tokenizerVocabPath").getValue()); + assertEquals("4", config.getObject("onnxIntraOpThreads").getValue()); } @Test - void testApplicationWithGenericEmbedConfig() throws Exception { + void testApplicationPackageWithApplicationEmbedder_selfhosted() throws Exception { Path applicationDir = Path.fromString("src/test/cfg/application/embed_generic/"); VespaModel model = loadModel(applicationDir, false); ApplicationContainerCluster containerCluster = model.getContainerClusters().get("container"); Component<?, ?> testComponent = containerCluster.getComponentsMap().get(new ComponentId("transformer")); ConfigPayloadBuilder config = testComponent.getUserConfigs().get(new ConfigDefinitionKey("sentence-embedder", "ai.vespa.example.paragraph")); - assertEquals("files/vocab.txt", config.getObject("vocab").getValue()); - assertEquals("files/model.onnx", config.getObject("modelPath").getValue()); + assertEquals("application-url", config.getObject("modelUrl").getValue()); + assertEquals(emptyPathFileName, config.getObject("modelPath").getValue()); + assertEquals("files/vocab.txt", config.getObject("vocabPath").getValue()); + assertEquals("foo", config.getObject("myValue").getValue()); + } + + @Test + void testApplicationPackageWithApplicationEmbedder_hosted() throws Exception { + Path applicationDir = Path.fromString("src/test/cfg/application/embed_generic/"); + VespaModel model = loadModel(applicationDir, true); + ApplicationContainerCluster containerCluster = model.getContainerClusters().get("container"); + + Component<?, ?> testComponent = containerCluster.getComponentsMap().get(new ComponentId("transformer")); + ConfigPayloadBuilder config = testComponent.getUserConfigs().get(new ConfigDefinitionKey("sentence-embedder", "ai.vespa.example.paragraph")); + assertEquals("https://data.vespa.oath.cloud/onnx_models/sentence_all_MiniLM_L6_v2.onnx", + config.getObject("modelUrl").getValue()); + assertEquals(emptyPathFileName, config.getObject("modelPath").getValue()); + assertEquals("files/vocab.txt", config.getObject("vocabPath").getValue()); + assertEquals("foo", config.getObject("myValue").getValue()); } private VespaModel loadModel(Path path, boolean hosted) throws Exception { @@ -165,17 +193,16 @@ public class EmbedderTestCase { assertTransform(embedder, component, false); } - private void assertTransform(String embedder, String component, boolean hosted) throws IOException, SAXException { - Element emb = createElement(embedder); - Element cmp = createElement(component); - Element trans = EmbedderConfigTransformer.transform(createEmptyDeployState(hosted), emb); - assertSpec(cmp, trans); + private void assertTransform(String embedder, String expectedComponent, boolean hosted) throws IOException, SAXException { + assertSpec(createElement(expectedComponent), + ModelConfigTransformer.transform(createEmptyDeployState(hosted), createElement(embedder))); } private void assertSpec(Element e1, Element e2) { assertEquals(e1.getTagName(), e2.getTagName()); assertAttributes(e1, e2); assertAttributes(e2, e1); + assertEquals(XML.getValue(e1).trim(), XML.getValue(e2).trim(), "Content of " + e1.getTagName() + "' is identical"); assertChildren(e1, e2); } @@ -200,7 +227,7 @@ public class EmbedderTestCase { private void assertTransformThrows(String embedder, String expectedMessage, boolean hosted) throws IOException, SAXException { try { - EmbedderConfigTransformer.transform(createEmptyDeployState(hosted), createElement(embedder)); + ModelConfigTransformer.transform(createEmptyDeployState(hosted), createElement(embedder)); fail("Expected exception was not thrown: " + expectedMessage); } catch (IllegalArgumentException e) { assertEquals(expectedMessage, e.getMessage()); diff --git a/config-model/src/test/schema-test-files/services.xml b/config-model/src/test/schema-test-files/services.xml index ffb28726d9a..b32849bb55f 100644 --- a/config-model/src/test/schema-test-files/services.xml +++ b/config-model/src/test/schema-test-files/services.xml @@ -196,7 +196,6 @@ <component id="injected-to-handler"> <config name="foo"/> </component> - <embedder id="transformer" class="ai.vespa.example.SomeEmbedder" bundle="myBundle" def="my.def-file"/> </handler> <server id="server-provider"> |