about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java | 9
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java | 21
-rw-r--r-- linguistics/src/test/models/docv1.json.zst | bin 0 -> 190 bytes
-rw-r--r-- vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java | 16
-rw-r--r-- vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java | 13
-rw-r--r-- vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java | 1
-rw-r--r-- vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java | 39
-rwxr-xr-x vespaclient-java/src/main/sh/vespa-significance.sh | 1
-rw-r--r-- vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java | 55
9 files changed, 137 insertions, 18 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
index 72874c15d9e..6f3a108e9e1 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
@@ -7,9 +7,12 @@ import com.yahoo.language.Language;
import com.yahoo.language.significance.SignificanceModel;
import com.yahoo.language.significance.SignificanceModelRegistry;
import com.yahoo.search.significance.config.SignificanceConfig;
+import io.airlift.compress.zstd.ZstdInputStream;
import java.io.IOException;
import java.io.UncheckedIOException;
+import java.io.FileInputStream;
+import java.io.InputStream;
import java.nio.file.Path;
import java.util.EnumMap;
import java.util.List;
@@ -44,7 +47,11 @@ public class DefaultSignificanceModelRegistry implements SignificanceModelRegist
public void addModel(Path path) {
ObjectMapper objectMapper = new ObjectMapper();
try {
- SignificanceModelFile file = objectMapper.readValue(path.toFile(), SignificanceModelFile.class);
+ InputStream in = path.toString().endsWith(".zst") ?
+ new ZstdInputStream(new FileInputStream(path.toFile())) :
+ new FileInputStream(path.toFile());
+
+ SignificanceModelFile file = objectMapper.readValue(in, SignificanceModelFile.class);
for (var pair : file.languages().entrySet()) {
this.models.put(
Language.fromLanguageTag(pair.getKey()),
diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
index e8594885b9e..a5be567717e 100644
--- a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
+++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
@@ -55,6 +55,27 @@ public class DefaultSignificanceModelRegistryTest {
}
@Test
+    // A model stored as zstd-compressed JSON (".zst" suffix) must be loaded by the registry
+    // just like an uncompressed JSON model.
+    public void testDefaultSignificanceModelRegistryWithZSTDecompressing() {
+        List<Path> modelPaths = new ArrayList<>();
+        modelPaths.add(Path.of("src/test/models/docv1.json.zst"));
+        DefaultSignificanceModelRegistry registry = new DefaultSignificanceModelRegistry(modelPaths);
+
+        // The fixture must yield an English model and no French one.
+        var english = registry.getModel(Language.ENGLISH);
+        assertTrue(english.isPresent());
+        var englishModel = english.get();
+        assertTrue(registry.getModel(Language.FRENCH).isEmpty());
+        assertNotNull(englishModel);
+
+        // Identifier and document-frequency statistics for the term "test".
+        assertEquals("test::1", englishModel.getId());
+        assertEquals(2, englishModel.documentFrequency("test").frequency());
+        assertEquals(10, englishModel.documentFrequency("test").corpusSize());
+    }
+
+ @Test
public void testDefaultSignificanceModelRegistryInOppsiteOrder() {
List<Path> models = new ArrayList<>();
diff --git a/linguistics/src/test/models/docv1.json.zst b/linguistics/src/test/models/docv1.json.zst
new file mode 100644
index 00000000000..61de6c7cb72
--- /dev/null
+++ b/linguistics/src/test/models/docv1.json.zst
Binary files differ
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java
index 326b932cabc..ad14708015b 100644
--- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java
@@ -26,19 +26,24 @@ public class ClientParameters {
// Document type identifier
public final String docType;
+ // Zstandard compression
+ public final boolean zstCompression;
+
public ClientParameters(
boolean help,
String inputFile,
String outputFile,
String field,
String language,
- String docType) {
+ String docType,
+ boolean zstCompression) {
this.help = help;
this.inputFile = inputFile;
this.outputFile = outputFile;
this.field = field;
this.language = language;
this.docType = docType;
+ this.zstCompression = zstCompression;
}
public static class Builder {
@@ -47,8 +52,8 @@ public class ClientParameters {
private String outputFile;
private String field;
private String language;
-
private String docType;
+ private boolean zstCompression;
public Builder setHelp(boolean help) {
this.help = help;
@@ -79,8 +84,13 @@ public class ClientParameters {
return this;
}
+        public Builder setZstCompression(String useZstCompression) {
+            zstCompression = Boolean.parseBoolean(useZstCompression); // only "true" (any case) enables compression
+            return this;
+        }
+
public ClientParameters build() {
- return new ClientParameters(help, inputFile, outputFile, field, language, docType);
+ return new ClientParameters(help, inputFile, outputFile, field, language, docType, zstCompression);
}
}
}
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java
index e5d16854647..2deaee983fe 100644
--- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java
@@ -26,6 +26,7 @@ public class CommandLineOptions {
public static final String FIELD_OPTION = "field";
public static final String LANGUAGE_OPTION = "language";
public static final String DOC_TYPE_OPTION = "doc-type";
+ public static final String ZST_COMPRESSION = "zst-compression";
private final Options options = createOptions();
@@ -40,35 +41,46 @@ public class CommandLineOptions {
.build());
options.addOption(Option.builder("i")
+ .required()
.hasArg(true)
.desc("Input file")
.longOpt(INPUT_OPTION)
.build());
options.addOption(Option.builder("i")
+ .required()
.hasArg(true)
.desc("Output file")
.longOpt(OUTPUT_OPTION)
.build());
options.addOption(Option.builder("f")
+ .required()
.hasArg(true)
.desc("Field to analyze")
.longOpt(FIELD_OPTION)
.build());
options.addOption(Option.builder("l")
+ .required()
.hasArg(true)
.desc("Language tag for output file")
.longOpt(LANGUAGE_OPTION)
.build());
options.addOption(Option.builder("d")
+ .required()
.hasArg(true)
.desc("Document type identifier")
.longOpt(DOC_TYPE_OPTION)
.build());
+ options.addOption(Option.builder("zst")
+ .hasArg(true)
+ .desc("Use Zstandard compression")
+ .longOpt(ZST_COMPRESSION)
+ .build());
+
return options;
}
@@ -93,6 +105,7 @@ public class CommandLineOptions {
builder.setField(cl.getOptionValue(FIELD_OPTION));
builder.setLanguage(cl.getOptionValue(LANGUAGE_OPTION));
builder.setDocType(cl.getOptionValue(DOC_TYPE_OPTION));
+ builder.setZstCompression(cl.hasOption(ZST_COMPRESSION) ? cl.getOptionValue(ZST_COMPRESSION) : "true");
return builder.build();
} catch (ParseException e) {
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
index a60408b1f96..e368eebefa5 100644
--- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
@@ -32,6 +32,7 @@ public class Main {
if (params.help) {
options.printHelp();
} else {
+ System.setProperty("vespa.replace_invalid_unicode", "true");
SignificanceModelGenerator significanceModelGenerator = createSignificanceModelGenerator(params);
significanceModelGenerator.generate();
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
index 7972a70cd10..d620820e14f 100644
--- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
@@ -2,8 +2,6 @@
package com.yahoo.vespasignificance;
-import com.fasterxml.jackson.annotation.JsonAutoDetect;
-import com.fasterxml.jackson.annotation.PropertyAccessor;
import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
@@ -27,6 +25,8 @@ import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.significance.impl.DocumentFrequencyFile;
import com.yahoo.language.significance.impl.SignificanceModelFile;
import com.yahoo.text.Utf8;
+import io.airlift.compress.zstd.ZstdInputStream;
+import io.airlift.compress.zstd.ZstdOutputStream;
import java.io.IOException;
import java.io.BufferedReader;
@@ -34,6 +34,9 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.InputStream;
import java.io.InputStreamReader;
+import java.io.FileOutputStream;
+import java.io.FileInputStream;
+import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
@@ -59,6 +62,7 @@ public class SignificanceModelGenerator {
final DocumentTypeManager types = new DocumentTypeManager();
final DocumentType docType;
+ private final boolean useZstCompression;
private final static String VERSION = "1.0";
private final static String ID = "1";
private final static String SIGNIFICANCE_DESCRIPTION = "Significance model for input file";
@@ -66,14 +70,24 @@ public class SignificanceModelGenerator {
public SignificanceModelGenerator(ClientParameters clientParameters) {
this.clientParameters = clientParameters;
+
+ if (clientParameters.zstCompression && !clientParameters.outputFile.endsWith(".zst")) {
+ throw new IllegalArgumentException("Output file must have .zst extension when using zst compression");
+ }
+
+ language = Language.fromLanguageTag(clientParameters.language);
+ if (language == Language.UNKNOWN) {
+ throw new IllegalArgumentException("Unknown language: " + clientParameters.language);
+ }
+
OpenNlpLinguistics openNlpLinguistics = new OpenNlpLinguistics();
tokenizer = openNlpLinguistics.getTokenizer();
objectMapper = new ObjectMapper();
- language = Language.fromLanguageTag(clientParameters.language);
-
docType = new DocumentType(clientParameters.docType);
docType.addField(new Field(clientParameters.field, DataType.STRING));
+ useZstCompression = clientParameters.zstCompression;
+
types.registerDocumentType(docType);
}
@@ -103,8 +117,14 @@ public class SignificanceModelGenerator {
long pageCount = i - 1;
SignificanceModelFile modelFile;
- if (Paths.get(clientParameters.outputFile).toFile().exists()) {
- modelFile = objectMapper.readValue(new File(clientParameters.outputFile), SignificanceModelFile.class);
+ File outputFile = Paths.get(clientParameters.outputFile).toFile();
+ if (outputFile.exists()) {
+
+ InputStream in = outputFile.toString().endsWith(".zst") ?
+ new ZstdInputStream(new FileInputStream(outputFile)) :
+ new FileInputStream(outputFile);
+
+ modelFile = objectMapper.readValue(in, SignificanceModelFile.class);
modelFile.addLanguage(clientParameters.language, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency()));
@@ -117,7 +137,12 @@ public class SignificanceModelGenerator {
}
try {
ObjectWriter writer = objectMapper.writerWithDefaultPrettyPrinter();
- writer.writeValue(new File(clientParameters.outputFile), modelFile);
+
+ OutputStream out = useZstCompression ?
+ new ZstdOutputStream(new FileOutputStream(clientParameters.outputFile)) :
+ new FileOutputStream(clientParameters.outputFile);
+
+ writer.writeValue(out, modelFile);
} catch (IOException e) {
throw new IllegalStateException("Failed to write model to output file", e);
}
diff --git a/vespaclient-java/src/main/sh/vespa-significance.sh b/vespaclient-java/src/main/sh/vespa-significance.sh
index 0c0bac275e6..f26c8dfca46 100755
--- a/vespaclient-java/src/main/sh/vespa-significance.sh
+++ b/vespaclient-java/src/main/sh/vespa-significance.sh
@@ -79,6 +79,7 @@ export ROOT
export MALLOC_ARENA_MAX=1 #Does not need fast allocation
exec java \
+-Dvespa.replace_invalid_unicode=true \
-server -enableassertions \
-XX:ThreadStackSize=512 \
-XX:MaxJavaStackTraceDepth=1000000 \
diff --git a/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java
index 8e1e8fd5627..916fe05ef7b 100644
--- a/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java
+++ b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java
@@ -5,11 +5,14 @@ package com.yahoo.vespasignificance;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.yahoo.language.significance.impl.DocumentFrequencyFile;
import com.yahoo.language.significance.impl.SignificanceModelFile;
+import io.airlift.compress.zstd.ZstdInputStream;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.io.TempDir;
import java.io.File;
+import java.io.FileInputStream;
import java.io.IOException;
+import java.io.InputStream;
import java.nio.file.Path;
import java.util.HashMap;
@@ -25,7 +28,7 @@ public class SignificanceModelGeneratorTest {
@TempDir
private Path tempDir;
- private ClientParameters.Builder createParameters(String inputPath, String outputPath, String field, String language, String docType) {
+ private ClientParameters.Builder createParameters(String inputPath, String outputPath, String field, String language, String docType, String zstCompression) {
tempDir.toFile().mkdirs();
return new ClientParameters.Builder()
@@ -33,7 +36,8 @@ public class SignificanceModelGeneratorTest {
.setOutputFile(tempDir.resolve(outputPath).toString())
.setField(field)
.setLanguage(language)
- .setDocType(docType);
+ .setDocType(docType)
+ .setZstCompression(zstCompression);
}
private SignificanceModelGenerator createSignificanceModelGenerator(ClientParameters params) {
@@ -44,7 +48,7 @@ public class SignificanceModelGeneratorTest {
void testGenerateSimpleFile() throws IOException {
String inputPath = "no.jsonl";
String outputPath = "output.json";
- ClientParameters params = createParameters(inputPath, outputPath, "text", "NB", "nb").build();
+ ClientParameters params = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build();
SignificanceModelGenerator generator = createSignificanceModelGenerator(params);
generator.generate();
@@ -68,10 +72,47 @@ public class SignificanceModelGeneratorTest {
}
@Test
+    // With zst compression enabled the generator must insist on a ".zst" output path and
+    // write a file that can be read back through a zstd decoder.
+    void testGenerateSimpleFileWithZST() throws IOException {
+        String inputPath = "no.jsonl";
+
+        // Compression requested but the output name lacks ".zst": creating the generator must fail.
+        ClientParameters mismatched =
+                createParameters(inputPath, "output.json", "text", "NB", "nb", "true").build();
+        assertThrows(IllegalArgumentException.class, () -> createSignificanceModelGenerator(mismatched));
+
+        String outputPath = "output.json.zst";
+        ClientParameters params =
+                createParameters(inputPath, outputPath, "text", "NB", "nb", "true").build();
+        createSignificanceModelGenerator(params).generate();
+
+        File outputFile = tempDir.resolve(outputPath).toFile();
+        assertTrue(outputFile.exists());
+
+        // Decompress and parse the generated model; the stream is closed when the block exits.
+        SignificanceModelFile modelFile;
+        try (InputStream in = new ZstdInputStream(new FileInputStream(outputFile))) {
+            modelFile = objectMapper.readValue(in, SignificanceModelFile.class);
+        }
+
+        HashMap<String, DocumentFrequencyFile> languages = modelFile.languages();
+        assertEquals(1, languages.size());
+        assertTrue(languages.containsKey("NB"));
+
+        // Every checked term is expected to have document frequency 3.
+        DocumentFrequencyFile documentFrequencyFile = languages.get("NB");
+        assertEquals(3, documentFrequencyFile.frequencies().get("fra"));
+        assertEquals(3, documentFrequencyFile.frequencies().get("skriveform"));
+        assertEquals(3, documentFrequencyFile.frequencies().get("kategori"));
+        assertEquals(3, documentFrequencyFile.frequencies().get("eldr"));
+    }
+
+ @Test
void testGenerateFileWithMultipleLanguages() throws IOException {
String inputPath = "no.jsonl";
String outputPath = "output.json";
- ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb").build();
+ ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build();
SignificanceModelGenerator generator = createSignificanceModelGenerator(params1);
generator.generate();
@@ -79,7 +120,7 @@ public class SignificanceModelGeneratorTest {
assertTrue(outputFile.exists());
String inputPath2 = "en.jsonl";
- ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "EN", "en").build();
+ ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "EN", "en", "false").build();
generator = createSignificanceModelGenerator(params2);
generator.generate();
@@ -113,7 +154,7 @@ public class SignificanceModelGeneratorTest {
void testOverwriteExistingDocumentFrequencyLanguage() throws IOException {
String inputPath = "no.jsonl";
String outputPath = "output.json";
- ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb").build();
+ ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build();
SignificanceModelGenerator generator = createSignificanceModelGenerator(params1);
generator.generate();
@@ -134,7 +175,7 @@ public class SignificanceModelGeneratorTest {
assertFalse(oldDf.frequencies().containsKey("nytt"));
String inputPath2 = "no_2.jsonl";
- ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "NB", "nb").build();
+ ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "NB", "nb", "false").build();
SignificanceModelGenerator generator2 = createSignificanceModelGenerator(params2);
generator2.generate();