diff options
9 files changed, 137 insertions, 18 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java index 72874c15d9e..6f3a108e9e1 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java @@ -7,9 +7,12 @@ import com.yahoo.language.Language; import com.yahoo.language.significance.SignificanceModel; import com.yahoo.language.significance.SignificanceModelRegistry; import com.yahoo.search.significance.config.SignificanceConfig; +import io.airlift.compress.zstd.ZstdInputStream; import java.io.IOException; import java.io.UncheckedIOException; +import java.io.FileInputStream; +import java.io.InputStream; import java.nio.file.Path; import java.util.EnumMap; import java.util.List; @@ -44,7 +47,11 @@ public class DefaultSignificanceModelRegistry implements SignificanceModelRegist public void addModel(Path path) { ObjectMapper objectMapper = new ObjectMapper(); try { - SignificanceModelFile file = objectMapper.readValue(path.toFile(), SignificanceModelFile.class); + InputStream in = path.toString().endsWith(".zst") ? + new ZstdInputStream(new FileInputStream(path.toFile())) : + new FileInputStream(path.toFile()); + + SignificanceModelFile file = objectMapper.readValue(in, SignificanceModelFile.class); for (var pair : file.languages().entrySet()) { this.models.put( Language.fromLanguageTag(pair.getKey()), diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java index e8594885b9e..a5be567717e 100644 --- a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java +++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java @@ -55,6 +55,27 @@ public class DefaultSignificanceModelRegistryTest { } @Test + public void testDefaultSignificanceModelRegistryWithZSTDecompressing() { + List<Path> models = new ArrayList<>(); + + models.add(Path.of("src/test/models/docv1.json.zst")); + + DefaultSignificanceModelRegistry defaultSignificanceModelRegistry = new DefaultSignificanceModelRegistry(models); + + var optionalEnglishModel = defaultSignificanceModelRegistry.getModel(Language.ENGLISH); + assertTrue(optionalEnglishModel.isPresent()); + + var englishModel = optionalEnglishModel.get(); + + assertTrue( defaultSignificanceModelRegistry.getModel(Language.FRENCH).isEmpty()); + assertNotNull(englishModel); + assertEquals("test::1", englishModel.getId()); + assertEquals(2, englishModel.documentFrequency("test").frequency()); + assertEquals(10, englishModel.documentFrequency("test").corpusSize()); + + } + + @Test public void testDefaultSignificanceModelRegistryInOppsiteOrder() { List<Path> models = new ArrayList<>(); diff --git a/linguistics/src/test/models/docv1.json.zst b/linguistics/src/test/models/docv1.json.zst Binary files differnew file mode 100644 index 00000000000..61de6c7cb72 --- /dev/null +++ b/linguistics/src/test/models/docv1.json.zst diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java index 326b932cabc..ad14708015b 100644 --- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java +++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java @@ -26,19 +26,24 @@ public class ClientParameters { // Document type identifier public final String docType; + // Zstandard compression + public final boolean zstCompression; + public ClientParameters( boolean help, String inputFile, String outputFile, String field, String language, - String docType) { + String docType, + boolean zstCompression) { this.help = help; this.inputFile = inputFile; this.outputFile = outputFile; this.field = field; this.language = language; this.docType = docType; + this.zstCompression = zstCompression; } public static class Builder { @@ -47,8 +52,8 @@ public class ClientParameters { private String outputFile; private String field; private String language; - private String docType; + private boolean zstCompression; public Builder setHelp(boolean help) { this.help = help; @@ -79,8 +84,13 @@ public class ClientParameters { return this; } + public Builder setZstCompression(String useZstCompression) { + this.zstCompression = Boolean.parseBoolean(useZstCompression); + return this; + } + public ClientParameters build() { - return new ClientParameters(help, inputFile, outputFile, field, language, docType); + return new ClientParameters(help, inputFile, outputFile, field, language, docType, zstCompression); } } } diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java index e5d16854647..2deaee983fe 100644 --- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java +++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java @@ -26,6 +26,7 @@ public class CommandLineOptions { public static final String FIELD_OPTION = "field"; public static final String LANGUAGE_OPTION = "language"; public static final String DOC_TYPE_OPTION = "doc-type"; + public static final String ZST_COMPRESSION = "zst-compression"; private final Options options = createOptions(); @@ -40,35 +41,46 @@ public class CommandLineOptions { .build()); options.addOption(Option.builder("i") + .required() .hasArg(true) .desc("Input file") .longOpt(INPUT_OPTION) .build()); options.addOption(Option.builder("i") + .required() .hasArg(true) .desc("Output file") .longOpt(OUTPUT_OPTION) .build()); options.addOption(Option.builder("f") + .required() .hasArg(true) .desc("Field to analyze") .longOpt(FIELD_OPTION) .build()); options.addOption(Option.builder("l") + .required() .hasArg(true) .desc("Language tag for output file") .longOpt(LANGUAGE_OPTION) .build()); options.addOption(Option.builder("d") + .required() .hasArg(true) .desc("Document type identifier") .longOpt(DOC_TYPE_OPTION) .build()); + options.addOption(Option.builder("zst") + .hasArg(true) + .desc("Use Zstandard compression") + .longOpt(ZST_COMPRESSION) + .build()); + return options; } @@ -93,6 +105,7 @@ public class CommandLineOptions { builder.setField(cl.getOptionValue(FIELD_OPTION)); builder.setLanguage(cl.getOptionValue(LANGUAGE_OPTION)); builder.setDocType(cl.getOptionValue(DOC_TYPE_OPTION)); + builder.setZstCompression(cl.hasOption(ZST_COMPRESSION) ? cl.getOptionValue(ZST_COMPRESSION) : "true"); return builder.build(); } catch (ParseException e) { diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java index a60408b1f96..e368eebefa5 100644 --- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java +++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java @@ -32,6 +32,7 @@ public class Main { if (params.help) { options.printHelp(); } else { + System.setProperty("vespa.replace_invalid_unicode", "true"); SignificanceModelGenerator significanceModelGenerator = createSignificanceModelGenerator(params); significanceModelGenerator.generate(); diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java index 7972a70cd10..d620820e14f 100644 --- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java +++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java @@ -2,8 +2,6 @@ package com.yahoo.vespasignificance; -import com.fasterxml.jackson.annotation.JsonAutoDetect; -import com.fasterxml.jackson.annotation.PropertyAccessor; import com.fasterxml.jackson.core.JsonFactory; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.ObjectWriter; @@ -27,6 +25,8 @@ import com.yahoo.language.process.Tokenizer; import com.yahoo.language.significance.impl.DocumentFrequencyFile; import com.yahoo.language.significance.impl.SignificanceModelFile; import com.yahoo.text.Utf8; +import io.airlift.compress.zstd.ZstdInputStream; +import io.airlift.compress.zstd.ZstdOutputStream; import java.io.IOException; import java.io.BufferedReader; @@ -34,6 +34,9 @@ import java.io.ByteArrayInputStream; import java.io.File; import java.io.InputStream; import java.io.InputStreamReader; +import java.io.FileOutputStream; +import java.io.FileInputStream; +import java.io.OutputStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; @@ -59,6 +62,7 @@ public class SignificanceModelGenerator { final DocumentTypeManager types = new DocumentTypeManager(); final DocumentType docType; + private final boolean useZstCompression; private final static String VERSION = "1.0"; private final static String ID = "1"; private final static String SIGNIFICANCE_DESCRIPTION = "Significance model for input file"; @@ -66,14 +70,24 @@ public class SignificanceModelGenerator { public SignificanceModelGenerator(ClientParameters clientParameters) { this.clientParameters = clientParameters; + + if (clientParameters.zstCompression && !clientParameters.outputFile.endsWith(".zst")) { + throw new IllegalArgumentException("Output file must have .zst extension when using zst compression"); + } + + language = Language.fromLanguageTag(clientParameters.language); + if (language == Language.UNKNOWN) { + throw new IllegalArgumentException("Unknown language: " + clientParameters.language); + } + OpenNlpLinguistics openNlpLinguistics = new OpenNlpLinguistics(); tokenizer = openNlpLinguistics.getTokenizer(); objectMapper = new ObjectMapper(); - language = Language.fromLanguageTag(clientParameters.language); - docType = new DocumentType(clientParameters.docType); docType.addField(new Field(clientParameters.field, DataType.STRING)); + useZstCompression = clientParameters.zstCompression; + types.registerDocumentType(docType); } @@ -103,8 +117,14 @@ public class SignificanceModelGenerator { long pageCount = i - 1; SignificanceModelFile modelFile; - if (Paths.get(clientParameters.outputFile).toFile().exists()) { - modelFile = objectMapper.readValue(new File(clientParameters.outputFile), SignificanceModelFile.class); + File outputFile = Paths.get(clientParameters.outputFile).toFile(); + if (outputFile.exists()) { + + InputStream in = outputFile.toString().endsWith(".zst") ? + new ZstdInputStream(new FileInputStream(outputFile)) : + new FileInputStream(outputFile); + + modelFile = objectMapper.readValue(in, SignificanceModelFile.class); modelFile.addLanguage(clientParameters.language, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency())); @@ -117,7 +137,12 @@ public class SignificanceModelGenerator { } try { ObjectWriter writer = objectMapper.writerWithDefaultPrettyPrinter(); - writer.writeValue(new File(clientParameters.outputFile), modelFile); + + OutputStream out = useZstCompression ? + new ZstdOutputStream(new FileOutputStream(clientParameters.outputFile)) : + new FileOutputStream(clientParameters.outputFile); + + writer.writeValue(out, modelFile); } catch (IOException e) { throw new IllegalStateException("Failed to write model to output file", e); } diff --git a/vespaclient-java/src/main/sh/vespa-significance.sh b/vespaclient-java/src/main/sh/vespa-significance.sh index 0c0bac275e6..f26c8dfca46 100755 --- a/vespaclient-java/src/main/sh/vespa-significance.sh +++ b/vespaclient-java/src/main/sh/vespa-significance.sh @@ -79,6 +79,7 @@ export ROOT export MALLOC_ARENA_MAX=1 #Does not need fast allocation exec java \ +-D vespa.replace_invalid_unicode=true \ -server -enableassertions \ -XX:ThreadStackSize=512 \ -XX:MaxJavaStackTraceDepth=1000000 \ diff --git a/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java index 8e1e8fd5627..916fe05ef7b 100644 --- a/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java +++ b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java @@ -5,11 +5,14 @@ package com.yahoo.vespasignificance; import com.fasterxml.jackson.databind.ObjectMapper; import com.yahoo.language.significance.impl.DocumentFrequencyFile; import com.yahoo.language.significance.impl.SignificanceModelFile; +import io.airlift.compress.zstd.ZstdInputStream; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.io.TempDir; import java.io.File; +import java.io.FileInputStream; import java.io.IOException; +import java.io.InputStream; import java.nio.file.Path; import java.util.HashMap; @@ -25,7 +28,7 @@ public class SignificanceModelGeneratorTest { @TempDir private Path tempDir; - private ClientParameters.Builder createParameters(String inputPath, String outputPath, String field, String language, String docType) { + private ClientParameters.Builder createParameters(String inputPath, String outputPath, String field, String language, String docType, String zstCompression) { tempDir.toFile().mkdirs(); return new ClientParameters.Builder() @@ -33,7 +36,8 @@ public class SignificanceModelGeneratorTest { .setOutputFile(tempDir.resolve(outputPath).toString()) .setField(field) .setLanguage(language) - .setDocType(docType); + .setDocType(docType) + .setZstCompression(zstCompression); } private SignificanceModelGenerator createSignificanceModelGenerator(ClientParameters params) { @@ -44,7 +48,7 @@ public class SignificanceModelGeneratorTest { void testGenerateSimpleFile() throws IOException { String inputPath = "no.jsonl"; String outputPath = "output.json"; - ClientParameters params = createParameters(inputPath, outputPath, "text", "NB", "nb").build(); + ClientParameters params = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build(); SignificanceModelGenerator generator = createSignificanceModelGenerator(params); generator.generate(); @@ -68,10 +72,47 @@ public class SignificanceModelGeneratorTest { } @Test + void testGenerateSimpleFileWithZST() throws IOException { + String inputPath = "no.jsonl"; + ClientParameters params1 = createParameters(inputPath, "output.json", "text", "NB", "nb", "true").build(); + + // Throws exception when outputfile does not have .zst extension when using zst compression + assertThrows(IllegalArgumentException.class, () -> createSignificanceModelGenerator(params1)); + + String outputPath = "output.json.zst"; + ClientParameters params = createParameters(inputPath, outputPath, "text", "NB", "nb", "true").build(); + + SignificanceModelGenerator generator = createSignificanceModelGenerator(params); + generator.generate(); + + + + File outputFile = new File(tempDir.resolve(outputPath ).toString()); + assertTrue(outputFile.exists()); + + InputStream in = new ZstdInputStream(new FileInputStream(outputFile)); + + SignificanceModelFile modelFile = objectMapper.readValue(in, SignificanceModelFile.class); + + HashMap<String, DocumentFrequencyFile> languages = modelFile.languages(); + assertEquals(1, languages.size()); + + assertTrue(languages.containsKey("NB")); + + DocumentFrequencyFile documentFrequencyFile = languages.get("NB"); + + assertEquals(3, documentFrequencyFile.frequencies().get("fra")); + assertEquals(3, documentFrequencyFile.frequencies().get("skriveform")); + assertEquals(3, documentFrequencyFile.frequencies().get("kategori")); + assertEquals(3, documentFrequencyFile.frequencies().get("eldr")); + + } + + @Test void testGenerateFileWithMultipleLanguages() throws IOException { String inputPath = "no.jsonl"; String outputPath = "output.json"; - ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb").build(); + ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build(); SignificanceModelGenerator generator = createSignificanceModelGenerator(params1); generator.generate(); @@ -79,7 +120,7 @@ public class SignificanceModelGeneratorTest { assertTrue(outputFile.exists()); String inputPath2 = "en.jsonl"; - ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "EN", "en").build(); + ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "EN", "en", "false").build(); generator = createSignificanceModelGenerator(params2); generator.generate(); @@ -113,7 +154,7 @@ public class SignificanceModelGeneratorTest { void testOverwriteExistingDocumentFrequencyLanguage() throws IOException { String inputPath = "no.jsonl"; String outputPath = "output.json"; - ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb").build(); + ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build(); SignificanceModelGenerator generator = createSignificanceModelGenerator(params1); generator.generate(); @@ -134,7 +175,7 @@ public class SignificanceModelGeneratorTest { assertFalse(oldDf.frequencies().containsKey("nytt")); String inputPath2 = "no_2.jsonl"; - ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "NB", "nb").build(); + ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "NB", "nb", "false").build(); SignificanceModelGenerator generator2 = createSignificanceModelGenerator(params2); generator2.generate(); |