aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMariusArhaug <mariusarhaug@hotmail.com>2024-05-14 16:18:01 +0200
committerMariusArhaug <mariusarhaug@hotmail.com>2024-05-14 16:23:24 +0200
commit74280df2d2ce04dd14a9ee325c9dec8080145da7 (patch)
treee2c23b0ff19b9a9b4108b25c3ffae2ac16a127ad
parenta26a156f27785c3d84c0d8ca25d75f35ebbb0e90 (diff)
Add significance model generator cli
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java7
-rw-r--r--linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java4
-rw-r--r--opennlp-linguistics/pom.xml10
-rw-r--r--vespaclient-java/pom.xml5
-rw-r--r--vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java73
-rw-r--r--vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java104
-rw-r--r--vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java49
-rw-r--r--vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java147
-rwxr-xr-xvespaclient-java/src/main/sh/vespa-significance.sh87
-rw-r--r--vespaclient-java/src/test/files/en.jsonl3
-rw-r--r--vespaclient-java/src/test/files/no.jsonl3
-rw-r--r--vespaclient-java/src/test/files/no_2.jsonl3
-rw-r--r--vespaclient-java/src/test/files/temp-dir/output.json20
-rw-r--r--vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java150
14 files changed, 661 insertions, 4 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
index b62754ac8ad..9b7cbae834a 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
@@ -17,15 +17,14 @@ import java.util.HashMap;
public class DocumentFrequencyFile {
private final String description;
- private final int documentCount;
-
+ private final long documentCount;
private final HashMap<String, Long> frequencies;
@JsonCreator
public DocumentFrequencyFile(
@JsonProperty("description") String description,
- @JsonProperty("document-count") int documentCount,
+ @JsonProperty("document-count") long documentCount,
@JsonProperty("document-frequencies") HashMap<String, Long> frequencies) {
this.description = description;
this.documentCount = documentCount;
@@ -36,7 +35,7 @@ public class DocumentFrequencyFile {
public String description() { return description; }
@JsonProperty("document-count")
- public int documentCount() { return documentCount; }
+ public long documentCount() { return documentCount; }
@JsonProperty("document-frequencies")
public HashMap<String, Long> frequencies() { return frequencies; }
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
index 902613379f0..94030108671 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
@@ -45,4 +45,8 @@ public class SignificanceModelFile {
@JsonProperty("languages")
public HashMap<String, DocumentFrequencyFile> languages() { return languages; }
+
+ /**
+  * Stores the document frequency data for a language in this model file.
+  * The entry is added with {@code put} on the languages map, so data already
+  * stored under the same key is replaced.
+  *
+  * @param language the key the data is stored under
+  * @param documentFrequencyFile the document frequency data to store
+  */
+ public void addLanguage(String language, DocumentFrequencyFile documentFrequencyFile) {
+ languages.put(language, documentFrequencyFile);
+ }
}
diff --git a/opennlp-linguistics/pom.xml b/opennlp-linguistics/pom.xml
index 726309c902d..0c53571dc5e 100644
--- a/opennlp-linguistics/pom.xml
+++ b/opennlp-linguistics/pom.xml
@@ -26,6 +26,16 @@
<scope>provided</scope>
</dependency>
<dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-core</artifactId>
+ <scope>compile</scope>
+ </dependency>
+ <dependency>
<groupId>com.yahoo.vespa</groupId>
<artifactId>config-bundle</artifactId>
<version>${project.version}</version>
diff --git a/vespaclient-java/pom.xml b/vespaclient-java/pom.xml
index 7af8160c1aa..21afed7335d 100644
--- a/vespaclient-java/pom.xml
+++ b/vespaclient-java/pom.xml
@@ -70,6 +70,11 @@
</dependency>
<dependency>
<groupId>com.yahoo.vespa</groupId>
+ <artifactId>opennlp-linguistics</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
<artifactId>container-apache-http-client-bundle</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java
new file mode 100644
index 00000000000..f0c351581e4
--- /dev/null
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java
@@ -0,0 +1,73 @@
+package com.yahoo.vespasignificance;
+
+
+/**
+ * Immutable holder for the option values given to the vespa-significance tool.
+ * Instances are normally assembled through {@link Builder}.
+ *
+ * @author MariusArhaug
+ */
+public class ClientParameters {
+
+    /** True when the help page should be shown. */
+    public final boolean help;
+
+    /** Input file for the program. */
+    public final String inputFile;
+
+    /** Output file for the program. */
+    public final String outputFile;
+
+    /** Field for the program. */
+    public final String field;
+
+    /** Language for the program. */
+    public final String language;
+
+    /** Creates a parameter set from the given values; no validation is performed. */
+    public ClientParameters(boolean help, String inputFile, String outputFile, String field, String language) {
+        this.help = help;
+        this.inputFile = inputFile;
+        this.outputFile = outputFile;
+        this.field = field;
+        this.language = language;
+    }
+
+    /**
+     * Fluent builder for {@link ClientParameters}. Values that are never set
+     * keep their Java defaults ({@code false} / {@code null}).
+     */
+    public static class Builder {
+
+        private boolean help;
+        private String inputFile;
+        private String outputFile;
+        private String field;
+        private String language;
+
+        public Builder setHelp(boolean help) {
+            this.help = help;
+            return this;
+        }
+
+        public Builder setInputFile(String inputFile) {
+            this.inputFile = inputFile;
+            return this;
+        }
+
+        public Builder setOutputFile(String outputFile) {
+            this.outputFile = outputFile;
+            return this;
+        }
+
+        public Builder setField(String field) {
+            this.field = field;
+            return this;
+        }
+
+        public Builder setLanguage(String language) {
+            this.language = language;
+            return this;
+        }
+
+        /** Returns a new {@link ClientParameters} holding the values collected so far. */
+        public ClientParameters build() {
+            return new ClientParameters(help, inputFile, outputFile, field, language);
+        }
+    }
+}
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java
new file mode 100644
index 00000000000..3090b31b9b5
--- /dev/null
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java
@@ -0,0 +1,104 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespasignificance;
+
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+
+
+import java.io.InputStream;
+
+
+/**
+ * Parses the command line arguments of the vespa-significance tool and prints its help page.
+ *
+ * @author MariusArhaug
+ */
+public class CommandLineOptions {
+
+    public static final String HELP_OPTION = "help";
+    public static final String INPUT_OPTION = "in";
+    public static final String OUTPUT_OPTION = "out";
+    public static final String FIELD_OPTION = "field";
+    public static final String LANGUAGE_OPTION = "language";
+
+    private final Options options = createOptions();
+    // Kept for the constructor contract; no method in this class reads from it.
+    private final InputStream stdIn;
+
+    public CommandLineOptions(InputStream stdIn) {
+        this.stdIn = stdIn;
+    }
+
+    public CommandLineOptions() {
+        this(System.in);
+    }
+
+    /** Builds the set of options understood by the tool, each with a distinct short name. */
+    @SuppressWarnings("AccessStaticViaInstance")
+    private static Options createOptions() {
+        Options options = new Options();
+
+        options.addOption(Option.builder("h")
+                .hasArg(false)
+                .desc("Show this syntax page.")
+                .longOpt(HELP_OPTION)
+                .build());
+
+        options.addOption(Option.builder("i")
+                .hasArg(true)
+                .desc("Input file")
+                .longOpt(INPUT_OPTION)
+                .build());
+
+        // Was registered as "i" as well, so the short flag -i could not address both
+        // the input and the output option. The output file gets its own short name.
+        options.addOption(Option.builder("o")
+                .hasArg(true)
+                .desc("Output file")
+                .longOpt(OUTPUT_OPTION)
+                .build());
+
+        options.addOption(Option.builder("f")
+                .hasArg(true)
+                .desc("Field to analyze")
+                .longOpt(FIELD_OPTION)
+                .build());
+
+        options.addOption(Option.builder("l")
+                .hasArg(true)
+                .desc("Language tag for output file")
+                .longOpt(LANGUAGE_OPTION)
+                .build());
+
+        return options;
+    }
+
+    /** Prints the usage text and the list of options using the commons-cli {@link HelpFormatter}. */
+    public void printHelp() {
+        HelpFormatter formatter = new HelpFormatter();
+
+        formatter.printHelp(
+                "vespa-significance <command> <options>", "Perform a significance value related operation.", options,
+                "The generate command generates a significance model file for a given corpus type .jsonl file.\n",
+                false);
+    }
+
+    /**
+     * Parses the given arguments into a {@link ClientParameters} instance.
+     * Options that are absent from {@code args} end up as {@code null} (or {@code false} for help).
+     *
+     * @param args the arguments following the command name
+     * @return the parsed parameters
+     * @throws IllegalArgumentException if the arguments cannot be parsed; the original
+     *         {@link ParseException} is attached as the cause
+     */
+    public ClientParameters parseCommandLineArguments(String[] args) throws IllegalArgumentException {
+        try {
+            CommandLineParser clp = new DefaultParser();
+            CommandLine cl = clp.parse(options, args);
+            ClientParameters.Builder builder = new ClientParameters.Builder();
+
+            builder.setHelp(cl.hasOption(HELP_OPTION));
+            builder.setInputFile(cl.getOptionValue(INPUT_OPTION));
+            builder.setOutputFile(cl.getOptionValue(OUTPUT_OPTION));
+            builder.setField(cl.getOptionValue(FIELD_OPTION));
+            builder.setLanguage(cl.getOptionValue(LANGUAGE_OPTION));
+
+            return builder.build();
+        } catch (ParseException e) {
+            // Keep the cause so the underlying parser failure is not lost.
+            throw new IllegalArgumentException("Failed to parse command line arguments: " + e.getMessage(), e);
+        }
+    }
+}
+
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
new file mode 100644
index 00000000000..a60408b1f96
--- /dev/null
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
@@ -0,0 +1,49 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.vespasignificance;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * The vespa-significance tool generates significance models based on input feed files.
+ *
+ * @author MariusArhaug
+ */
+public class Main {
+
+    /**
+     * Entry point. Expects the command {@code generate} followed by its options, or a
+     * help flag. Exits with status 1 when no arguments are given, the command is unknown,
+     * or the options are invalid or incomplete.
+     *
+     * @param args the command line arguments
+     */
+    public static void main(String[] args) {
+        if (args.length == 0) {
+            System.err.println("No arguments provided. Use --help to see available options.");
+            System.exit(1);
+        }
+
+        CommandLineOptions options = new CommandLineOptions();
+
+        // The message above points users to --help, so honour it as the first argument too.
+        if (isHelpFlag(args[0])) {
+            options.printHelp();
+            return;
+        }
+
+        if (!args[0].equals("generate")) {
+            System.err.println("Invalid command. Use 'generate' to generate significance models.");
+            System.exit(1);
+        }
+        String[] commandLineArgs = List.of(args).subList(1, args.length).toArray(new String[0]);
+
+        try {
+            ClientParameters params = options.parseCommandLineArguments(commandLineArgs);
+
+            if (params.help) {
+                options.printHelp();
+                return;
+            }
+
+            // Fail with a readable message instead of passing null values on to the generator.
+            requireOption(params.inputFile, CommandLineOptions.INPUT_OPTION);
+            requireOption(params.outputFile, CommandLineOptions.OUTPUT_OPTION);
+            requireOption(params.field, CommandLineOptions.FIELD_OPTION);
+            requireOption(params.language, CommandLineOptions.LANGUAGE_OPTION);
+
+            SignificanceModelGenerator significanceModelGenerator = createSignificanceModelGenerator(params);
+            significanceModelGenerator.generate();
+        } catch (IllegalArgumentException e) {
+            System.err.printf("Failed to parse command line arguments: %s.\n", e.getMessage());
+            // Signal the failure to the calling shell; previously the process exited with status 0.
+            System.exit(1);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Returns true if the argument is the long or short help flag. */
+    private static boolean isHelpFlag(String arg) {
+        return ("--" + CommandLineOptions.HELP_OPTION).equals(arg) || "-h".equals(arg);
+    }
+
+    /** Throws IllegalArgumentException if a mandatory option has no value. */
+    private static void requireOption(String value, String optionName) {
+        if (value == null || value.isBlank()) {
+            throw new IllegalArgumentException("Missing required option --" + optionName);
+        }
+    }
+
+    private static SignificanceModelGenerator createSignificanceModelGenerator(ClientParameters params) {
+        return new SignificanceModelGenerator(params);
+    }
+}
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
new file mode 100644
index 00000000000..16a7ee18a02
--- /dev/null
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
@@ -0,0 +1,147 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.vespasignificance;
+
+import com.fasterxml.jackson.annotation.JsonAutoDetect;
+import com.fasterxml.jackson.annotation.PropertyAccessor;
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.ObjectWriter;
+import com.yahoo.document.DataType;
+import com.yahoo.document.Document;
+import com.yahoo.document.DocumentPut;
+import com.yahoo.document.DocumentType;
+import com.yahoo.document.DocumentTypeManager;
+import com.yahoo.document.Field;
+import com.yahoo.document.datatypes.FieldValue;
+import com.yahoo.document.json.DocumentOperationType;
+import com.yahoo.document.json.JsonReader;
+import com.yahoo.document.json.ParsedDocumentOperation;
+import com.yahoo.language.Language;
+import com.yahoo.language.opennlp.OpenNlpLinguistics;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
+import com.yahoo.language.process.TokenScript;
+import com.yahoo.language.process.TokenType;
+import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.significance.impl.DocumentFrequencyFile;
+import com.yahoo.language.significance.impl.SignificanceModelFile;
+import com.yahoo.text.Utf8;
+
+import java.io.IOException;
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+/**
+ * Generates a significance model file from a .jsonl feed file.
+ * <p>
+ * Each line of the input is parsed as a document put, the configured string field is
+ * tokenized, and for every token the number of documents containing it is counted.
+ * The result is written as JSON to the output file; if that file already exists, the
+ * entry for the configured language is added to (or replaced in) the existing model.
+ *
+ * @author MariusArhaug
+ */
+public class SignificanceModelGenerator {
+
+    private final ClientParameters clientParameters;
+    private final Tokenizer tokenizer;
+    private final HashMap<String, Long> documentFrequency = new HashMap<>();
+    private final Language language;
+    private final ObjectMapper objectMapper;
+    private final static JsonFactory parserFactory = new JsonFactory();
+
+    final DocumentTypeManager types = new DocumentTypeManager();
+    final DocumentType docType;
+    private final static String VERSION = "1.0";
+    private final static String ID = "1";
+    private final static String SIGNIFICANCE_DESCRIPTION = "Significance model for input file";
+    private final static String DOC_FREQ_DESCRIPTION = "Document frequency for language";
+
+    /**
+     * Sets up the tokenizer and registers a document type, named after the lower-cased
+     * language code, that has the configured field as its only (string) field.
+     *
+     * @param clientParameters the input file, output file, field and language to use
+     */
+    public SignificanceModelGenerator(ClientParameters clientParameters) {
+        this.clientParameters = clientParameters;
+        OpenNlpLinguistics openNlpLinguistics = new OpenNlpLinguistics();
+        tokenizer = openNlpLinguistics.getTokenizer();
+        objectMapper = new ObjectMapper();
+
+        language = Language.fromLanguageTag(clientParameters.language);
+
+        docType = new DocumentType(language.languageCode().toLowerCase());
+        docType.addField(new Field(clientParameters.field, DataType.STRING));
+        types.registerDocumentType(docType);
+    }
+
+    /**
+     * Reads the input file, counts document frequencies and writes the model file.
+     *
+     * @throws IOException if the input file or an existing output file cannot be read
+     * @throws IllegalStateException if the model cannot be written to the output file
+     */
+    public void generate() throws IOException {
+        long pageCount = readDocuments();
+
+        DocumentFrequencyFile frequencyFile =
+                new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency());
+        File outputFile = new File(clientParameters.outputFile);
+
+        SignificanceModelFile modelFile;
+        if (outputFile.exists()) {
+            // Extend the existing model; an entry with the same language key is replaced.
+            modelFile = objectMapper.readValue(outputFile, SignificanceModelFile.class);
+            modelFile.addLanguage(clientParameters.language, frequencyFile);
+        } else {
+            HashMap<String, DocumentFrequencyFile> languages = new HashMap<>();
+            languages.put(clientParameters.language, frequencyFile);
+            modelFile = new SignificanceModelFile(VERSION, ID, SIGNIFICANCE_DESCRIPTION, languages);
+        }
+
+        try {
+            objectMapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY);
+            ObjectWriter writer = objectMapper.writerWithDefaultPrettyPrinter();
+            writer.writeValue(outputFile, modelFile);
+        } catch (IOException e) {
+            throw new IllegalStateException("Failed to write model to output file", e);
+        }
+    }
+
+    /**
+     * Parses every non-blank line of the input file as one document and feeds the
+     * configured field to {@link #handleTokenization(String)}.
+     *
+     * @return the number of documents read
+     */
+    private long readDocuments() throws IOException {
+        Path inputPath = Paths.get("").toAbsolutePath().resolve(clientParameters.inputFile);
+
+        long documents = 0;
+        // Files.newBufferedReader decodes as UTF-8, matching the Utf8.toBytes call below,
+        // and try-with-resources closes the file even if parsing fails.
+        try (BufferedReader reader = Files.newBufferedReader(inputPath)) {
+            String line;
+            // readLine() returning null is the end-of-stream signal; ready() only says
+            // whether a read would block.
+            while ((line = reader.readLine()) != null) {
+                if (line.isBlank()) {
+                    continue; // e.g. a trailing empty line; not a document
+                }
+                documents++;
+
+                JsonReader jsonReader = new JsonReader(types, new ByteArrayInputStream(Utf8.toBytes(line)), parserFactory);
+                String wikimediaId = "id:wikimedia:" + language.languageCode() + "::" + documents;
+
+                ParsedDocumentOperation operation = jsonReader.readSingleDocumentStreaming(DocumentOperationType.PUT, wikimediaId);
+                DocumentPut put = (DocumentPut) operation.operation();
+                Document document = put.getDocument();
+                FieldValue fieldValue = document.getFieldValue(clientParameters.field);
+                // A document without the field still counts as a document, but contributes no terms.
+                if (fieldValue != null) {
+                    handleTokenization(fieldValue.toString());
+                }
+            }
+        }
+        return documents;
+    }
+
+    /**
+     * Tokenizes the text with {@link StemMode#ALL} and increments the document count of
+     * every distinct alphabetic, Latin-script token string found in it.
+     */
+    private void handleTokenization(String field) {
+        var tokens = tokenizer.tokenize(field, language, StemMode.ALL, false);
+
+        // Collect into a set so that a word is counted once per document.
+        Set<String> uniqueWords = StreamSupport.stream(tokens.spliterator(), false)
+                .filter(t -> t.getType() == TokenType.ALPHABETIC)
+                .filter(t -> t.getScript() == TokenScript.LATIN)
+                .map(Token::getTokenString)
+                .collect(Collectors.toSet());
+
+        for (String word : uniqueWords) {
+            // merge inserts 1 for a new key and adds 1 to an existing one.
+            documentFrequency.merge(word, 1L, Long::sum);
+        }
+    }
+
+    /**
+     * Returns a copy of the counted document frequencies that only contains words
+     * found in more than one document.
+     */
+    public HashMap<String, Long> getFinalDocumentFrequency() {
+        return documentFrequency.entrySet().stream()
+                .filter(k -> k.getValue() > 1)
+                .collect(HashMap::new, (m, v) -> m.put(v.getKey(), v.getValue()), HashMap::putAll);
+    }
+}
diff --git a/vespaclient-java/src/main/sh/vespa-significance.sh b/vespaclient-java/src/main/sh/vespa-significance.sh
new file mode 100755
index 00000000000..0c0bac275e6
--- /dev/null
+++ b/vespaclient-java/src/main/sh/vespa-significance.sh
@@ -0,0 +1,87 @@
+#!/bin/sh
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+# BEGIN environment bootstrap section
+# Do not edit between here and END as this section should stay identical in all scripts
+
+findpath () {
+ myname=${0}
+ mypath=${myname%/*}
+ myname=${myname##*/}
+ empty_if_start_slash=${mypath%%/*}
+ if [ "${empty_if_start_slash}" ]; then
+ mypath=$(pwd)/${mypath}
+ fi
+ if [ "$mypath" ] && [ -d "$mypath" ]; then
+ return
+ fi
+ mypath=$(pwd)
+ if [ -f "${mypath}/${myname}" ]; then
+ return
+ fi
+ echo "FATAL: Could not figure out the path where $myname lives from $0"
+ exit 1
+}
+
+COMMON_ENV=libexec/vespa/common-env.sh
+
+source_common_env () {
+ if [ "$VESPA_HOME" ] && [ -d "$VESPA_HOME" ]; then
+ export VESPA_HOME
+ common_env=$VESPA_HOME/$COMMON_ENV
+ if [ -f "$common_env" ]; then
+ . $common_env
+ return
+ fi
+ fi
+ return 1
+}
+
+findroot () {
+ source_common_env && return
+ if [ "$VESPA_HOME" ]; then
+ echo "FATAL: bad VESPA_HOME value '$VESPA_HOME'"
+ exit 1
+ fi
+ if [ "$ROOT" ] && [ -d "$ROOT" ]; then
+ VESPA_HOME="$ROOT"
+ source_common_env && return
+ fi
+ findpath
+ while [ "$mypath" ]; do
+ VESPA_HOME=${mypath}
+ source_common_env && return
+ mypath=${mypath%/*}
+ done
+ echo "FATAL: missing VESPA_HOME environment variable"
+ echo "Could not locate $COMMON_ENV anywhere"
+ exit 1
+}
+
+findhost () {
+ if [ "${VESPA_HOSTNAME}" = "" ]; then
+ VESPA_HOSTNAME=$(vespa-detect-hostname || hostname -f || hostname || echo "localhost") || exit 1
+ fi
+ validate="${VESPA_HOME}/bin/vespa-validate-hostname"
+ if [ -f "$validate" ]; then
+ "$validate" "${VESPA_HOSTNAME}" || exit 1
+ fi
+ export VESPA_HOSTNAME
+}
+
+findroot
+findhost
+
+ROOT=${VESPA_HOME%/}
+export ROOT
+
+# END environment bootstrap section
+
+export MALLOC_ARENA_MAX=1 #Does not need fast allocation
+exec java \
+-server -enableassertions \
+-XX:ThreadStackSize=512 \
+-XX:MaxJavaStackTraceDepth=1000000 \
+-Djava.awt.headless=true \
+-Xms128m -Xmx1024m $(getJavaOptionsIPV46) \
+-cp ${VESPA_HOME}/lib/jars/vespaclient-java-jar-with-dependencies.jar com.yahoo.vespasignificance.Main "$@"
diff --git a/vespaclient-java/src/test/files/en.jsonl b/vespaclient-java/src/test/files/en.jsonl
new file mode 100644
index 00000000000..b4b16e4c773
--- /dev/null
+++ b/vespaclient-java/src/test/files/en.jsonl
@@ -0,0 +1,3 @@
+{"put": "id:wikimedia:en::1", "fields": {"text": "Some english wiki dump"}}
+{"put": "id:wikimedia:en::2", "fields": {"text": "Some english wiki dump"}}
+{"put": "id:wikimedia:en::3", "fields": {"text": "Some english wiki dump"}}
diff --git a/vespaclient-java/src/test/files/no.jsonl b/vespaclient-java/src/test/files/no.jsonl
new file mode 100644
index 00000000000..90cc0da15c5
--- /dev/null
+++ b/vespaclient-java/src/test/files/no.jsonl
@@ -0,0 +1,3 @@
+{"put": "id:wikimedia:nb::1", "fields": {"text": "[[Arbeiderpartiet]][[Kategori:Omdirigeringer fra eldre skriveform]]"}}
+{"put": "id:wikimedia:nb::2", "fields": {"text": "[[Arbeiderpartiet]][[Kategori:Omdirigeringer fra eldre skriveform]]"}}
+{"put": "id:wikimedia:nb::3", "fields": {"text": "[[Arbeiderpartiet]][[Kategori:Omdirigeringer fra eldre skriveform]]"}}
diff --git a/vespaclient-java/src/test/files/no_2.jsonl b/vespaclient-java/src/test/files/no_2.jsonl
new file mode 100644
index 00000000000..f5e996d4810
--- /dev/null
+++ b/vespaclient-java/src/test/files/no_2.jsonl
@@ -0,0 +1,3 @@
+{"put": "id:wikimedia:nb::1", "fields": {"text": "[[Arbeiderpartiet]][[Kategori:Omdirigeringer fra eldre eldre skriveform]]"}}
+{"put": "id:wikimedia:nb::2", "fields": {"text": "[[Arbeiderpartiet]][[Kategori:Omdirigeringer fra eldre skriveform nytt]]"}}
+{"put": "id:wikimedia:nb::3", "fields": {"text": "[[Arbeiderpartiet]][[Kategori:Omdirigeringer frå eldre skriveform nytt]]"}}
diff --git a/vespaclient-java/src/test/files/temp-dir/output.json b/vespaclient-java/src/test/files/temp-dir/output.json
new file mode 100644
index 00000000000..2abc3de91e3
--- /dev/null
+++ b/vespaclient-java/src/test/files/temp-dir/output.json
@@ -0,0 +1,20 @@
+{
+ "version" : "1.0",
+ "id" : "1",
+ "description" : "Significance model for input file",
+ "languages" : {
+ "NB" : {
+ "description" : "Document frequency for language",
+ "document-count" : 3,
+ "document-frequencies" : {
+ "fra" : 2,
+ "skriveform" : 3,
+ "nytt" : 2,
+ "kategori" : 3,
+ "eldr" : 3,
+ "omdirigering" : 3,
+ "arbeiderparti" : 3
+ }
+ }
+ }
+}
diff --git a/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java
new file mode 100644
index 00000000000..e1471d30049
--- /dev/null
+++ b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java
@@ -0,0 +1,150 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.vespasignificance;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.yahoo.language.significance.impl.DocumentFrequencyFile;
+import com.yahoo.language.significance.impl.SignificanceModelFile;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * Tests that {@link SignificanceModelGenerator} writes the expected significance model file.
+ *
+ * @author MariusArhaug
+ */
+public class SignificanceModelGeneratorTest {
+
+    private static final String TEST_FILES_DIR = "src/test/files/";
+    private static final String OUTPUT_DIR = TEST_FILES_DIR + "temp-dir";
+    private static final String OUTPUT_FILE = OUTPUT_DIR + "/output.json";
+
+    private final ObjectMapper objectMapper = new ObjectMapper();
+
+    /**
+     * Removes the output of a previous test. The generator merges into an existing
+     * output file, so a leftover file would make the language-count assertions depend
+     * on test execution order. (The previous cleanup targeted "temp-output", a
+     * directory the tests never write to.)
+     */
+    @BeforeEach
+    void removeOutputFile() {
+        Path output = Paths.get(OUTPUT_FILE);
+        if (output.toFile().exists()) {
+            assertTrue(output.toFile().delete(), "Could not delete " + OUTPUT_FILE);
+        }
+    }
+
+    private static ClientParameters.Builder createParameters(String inputFile, String field, String language) {
+        Paths.get(OUTPUT_DIR).toFile().mkdirs();
+
+        return new ClientParameters.Builder()
+                .setInputFile(TEST_FILES_DIR + inputFile)
+                .setOutputFile(OUTPUT_FILE)
+                .setField(field)
+                .setLanguage(language);
+    }
+
+    private SignificanceModelGenerator createSignificanceModelGenerator(ClientParameters params) {
+        return new SignificanceModelGenerator(params);
+    }
+
+    /** Asserts that the output file exists and returns its deserialized content. */
+    private SignificanceModelFile readModelFile() throws IOException {
+        assertTrue(Paths.get(OUTPUT_FILE).toFile().exists());
+        return objectMapper.readValue(Paths.get(OUTPUT_FILE).toFile(), SignificanceModelFile.class);
+    }
+
+    @Test
+    void testGenerateSimpleFile() throws IOException {
+        ClientParameters params = createParameters("no.jsonl", "text", "NB").build();
+        SignificanceModelGenerator generator = createSignificanceModelGenerator(params);
+        generator.generate();
+
+        SignificanceModelFile modelFile = readModelFile();
+
+        HashMap<String, DocumentFrequencyFile> languages = modelFile.languages();
+        assertEquals(1, languages.size());
+
+        assertTrue(languages.containsKey("NB"));
+
+        DocumentFrequencyFile documentFrequencyFile = languages.get("NB");
+
+        assertEquals(3, documentFrequencyFile.frequencies().get("fra"));
+        assertEquals(3, documentFrequencyFile.frequencies().get("skriveform"));
+        assertEquals(3, documentFrequencyFile.frequencies().get("kategori"));
+        assertEquals(3, documentFrequencyFile.frequencies().get("eldr"));
+    }
+
+    @Test
+    void testGenerateFileWithMultipleLanguages() throws IOException {
+        ClientParameters params1 = createParameters("no.jsonl", "text", "NB").build();
+        SignificanceModelGenerator generator = createSignificanceModelGenerator(params1);
+        generator.generate();
+        assertTrue(Paths.get(OUTPUT_FILE).toFile().exists());
+
+        // A second run against the same output file must add a language, not replace the file.
+        ClientParameters params2 = createParameters("en.jsonl", "text", "EN").build();
+        generator = createSignificanceModelGenerator(params2);
+        generator.generate();
+
+        SignificanceModelFile modelFile = readModelFile();
+
+        HashMap<String, DocumentFrequencyFile> languages = modelFile.languages();
+
+        assertEquals(2, languages.size());
+
+        assertTrue(languages.containsKey("NB"));
+        assertTrue(languages.containsKey("EN"));
+
+        DocumentFrequencyFile nb = languages.get("NB");
+        DocumentFrequencyFile en = languages.get("EN");
+
+        assertEquals(3, nb.documentCount());
+        assertEquals(3, en.documentCount());
+
+        assertEquals(3, nb.frequencies().get("fra"));
+        assertEquals(3, nb.frequencies().get("skriveform"));
+
+        assertEquals(3, en.frequencies().get("some"));
+        assertEquals(3, en.frequencies().get("wiki"));
+    }
+
+    @Test
+    void testOverwriteExistingDocumentFrequencyLanguage() throws IOException {
+        ClientParameters params1 = createParameters("no.jsonl", "text", "NB").build();
+        SignificanceModelGenerator generator = createSignificanceModelGenerator(params1);
+        generator.generate();
+
+        SignificanceModelFile preUpdatedFile = readModelFile();
+        HashMap<String, DocumentFrequencyFile> oldLanguages = preUpdatedFile.languages();
+        assertEquals(1, oldLanguages.size());
+
+        assertTrue(oldLanguages.containsKey("NB"));
+
+        DocumentFrequencyFile oldDf = oldLanguages.get("NB");
+
+        assertEquals(3, oldDf.frequencies().get("fra"));
+        assertEquals(3, oldDf.frequencies().get("skriveform"));
+        assertFalse(oldDf.frequencies().containsKey("nytt"));
+
+        // Generating again for the same language must replace its entry.
+        ClientParameters params2 = createParameters("no_2.jsonl", "text", "NB").build();
+        SignificanceModelGenerator generator2 = createSignificanceModelGenerator(params2);
+        generator2.generate();
+
+        SignificanceModelFile modelFile = readModelFile();
+
+        HashMap<String, DocumentFrequencyFile> languages = modelFile.languages();
+
+        assertEquals(1, languages.size());
+
+        assertTrue(languages.containsKey("NB"));
+
+        DocumentFrequencyFile df = languages.get("NB");
+
+        assertEquals(2, df.frequencies().get("fra"));
+        assertEquals(3, df.frequencies().get("skriveform"));
+        assertTrue(df.frequencies().containsKey("nytt"));
+        assertEquals(2, df.frequencies().get("nytt"));
+    }
+}