diff options
author | MariusArhaug <mariusarhaug@hotmail.com> | 2024-06-21 10:29:30 +0200 |
---|---|---|
committer | Bjørn Christian Seime <bjorncs@vespa.ai> | 2024-06-21 13:39:11 +0200 |
commit | 0b28acb8655862f1f57693c9e4ccf36fe5f78dbd (patch) | |
tree | 9935856f2fa1349180058cf33aa084283892f8d2 /vespaclient-java/src/test/java/com/yahoo | |
parent | 7304b3bd208ed5c8997106dcee90e8d00d9e05dc (diff) |
Add zst support for significance cli and registry
Diffstat (limited to 'vespaclient-java/src/test/java/com/yahoo')
-rw-r--r-- | vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java | 55 |
1 files changed, 48 insertions, 7 deletions
diff --git a/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java index 8e1e8fd5627..916fe05ef7b 100644 --- a/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java +++ b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java @@ -5,11 +5,14 @@ package com.yahoo.vespasignificance; import com.fasterxml.jackson.databind.ObjectMapper; import com.yahoo.language.significance.impl.DocumentFrequencyFile; import com.yahoo.language.significance.impl.SignificanceModelFile; +import io.airlift.compress.zstd.ZstdInputStream; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; import java.nio.file.Path; import java.util.HashMap; @@ -25,7 +28,7 @@ public class SignificanceModelGeneratorTest { @TempDir private Path tempDir; - private ClientParameters.Builder createParameters(String inputPath, String outputPath, String field, String language, String docType) { + private ClientParameters.Builder createParameters(String inputPath, String outputPath, String field, String language, String docType, String zstCompression) { tempDir.toFile().mkdirs(); return new ClientParameters.Builder() @@ -33,7 +36,8 @@ public class SignificanceModelGeneratorTest { .setOutputFile(tempDir.resolve(outputPath).toString()) .setField(field) .setLanguage(language) - .setDocType(docType); + .setDocType(docType) + .setZstCompression(zstCompression); } private SignificanceModelGenerator createSignificanceModelGenerator(ClientParameters params) { @@ -44,7 +48,7 @@ public class SignificanceModelGeneratorTest { void testGenerateSimpleFile() throws IOException { String inputPath = "no.jsonl"; String outputPath = "output.json"; - ClientParameters params = createParameters(inputPath, outputPath, "text", "NB", "nb").build(); + ClientParameters params = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build(); SignificanceModelGenerator generator = createSignificanceModelGenerator(params); generator.generate(); @@ -68,10 +72,47 @@ public class SignificanceModelGeneratorTest { } @Test + void testGenerateSimpleFileWithZST() throws IOException { + String inputPath = "no.jsonl"; + ClientParameters params1 = createParameters(inputPath, "output.json", "text", "NB", "nb", "true").build(); + + // Throws exception when outputfile does not have .zst extension when using zst compression + assertThrows(IllegalArgumentException.class, () -> createSignificanceModelGenerator(params1)); + + String outputPath = "output.json.zst"; + ClientParameters params = createParameters(inputPath, outputPath, "text", "NB", "nb", "true").build(); + + SignificanceModelGenerator generator = createSignificanceModelGenerator(params); + generator.generate(); + + + + File outputFile = new File(tempDir.resolve(outputPath ).toString()); + assertTrue(outputFile.exists()); + + InputStream in = new ZstdInputStream(new FileInputStream(outputFile)); + + SignificanceModelFile modelFile = objectMapper.readValue(in, SignificanceModelFile.class); + + HashMap<String, DocumentFrequencyFile> languages = modelFile.languages(); + assertEquals(1, languages.size()); + + assertTrue(languages.containsKey("NB")); + + DocumentFrequencyFile documentFrequencyFile = languages.get("NB"); + + assertEquals(3, documentFrequencyFile.frequencies().get("fra")); + assertEquals(3, documentFrequencyFile.frequencies().get("skriveform")); + assertEquals(3, documentFrequencyFile.frequencies().get("kategori")); + assertEquals(3, documentFrequencyFile.frequencies().get("eldr")); + + } + + @Test void testGenerateFileWithMultipleLanguages() throws IOException { String inputPath = "no.jsonl"; String outputPath = "output.json"; - ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb").build(); + ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build(); SignificanceModelGenerator generator = createSignificanceModelGenerator(params1); generator.generate(); @@ -79,7 +120,7 @@ public class SignificanceModelGeneratorTest { assertTrue(outputFile.exists()); String inputPath2 = "en.jsonl"; - ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "EN", "en").build(); + ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "EN", "en", "false").build(); generator = createSignificanceModelGenerator(params2); generator.generate(); @@ -113,7 +154,7 @@ public class SignificanceModelGeneratorTest { void testOverwriteExistingDocumentFrequencyLanguage() throws IOException { String inputPath = "no.jsonl"; String outputPath = "output.json"; - ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb").build(); + ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build(); SignificanceModelGenerator generator = createSignificanceModelGenerator(params1); generator.generate(); @@ -134,7 +175,7 @@ public class SignificanceModelGeneratorTest { assertFalse(oldDf.frequencies().containsKey("nytt")); String inputPath2 = "no_2.jsonl"; - ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "NB", "nb").build(); + ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "NB", "nb", "false").build(); SignificanceModelGenerator generator2 = createSignificanceModelGenerator(params2); generator2.generate(); |