From 74280df2d2ce04dd14a9ee325c9dec8080145da7 Mon Sep 17 00:00:00 2001 From: MariusArhaug Date: Tue, 14 May 2024 16:18:01 +0200 Subject: Add significance model generator cli --- .../significance/impl/DocumentFrequencyFile.java | 7 +- .../significance/impl/SignificanceModelFile.java | 4 + opennlp-linguistics/pom.xml | 10 ++ vespaclient-java/pom.xml | 5 + .../yahoo/vespasignificance/ClientParameters.java | 73 ++++++++++ .../vespasignificance/CommandLineOptions.java | 104 ++++++++++++++ .../java/com/yahoo/vespasignificance/Main.java | 49 +++++++ .../SignificanceModelGenerator.java | 147 ++++++++++++++++++++ vespaclient-java/src/main/sh/vespa-significance.sh | 87 ++++++++++++ vespaclient-java/src/test/files/en.jsonl | 3 + vespaclient-java/src/test/files/no.jsonl | 3 + vespaclient-java/src/test/files/no_2.jsonl | 3 + .../src/test/files/temp-dir/output.json | 20 +++ .../SignificanceModelGeneratorTest.java | 150 +++++++++++++++++++++ 14 files changed, 661 insertions(+), 4 deletions(-) create mode 100644 vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java create mode 100644 vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java create mode 100644 vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java create mode 100644 vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java create mode 100755 vespaclient-java/src/main/sh/vespa-significance.sh create mode 100644 vespaclient-java/src/test/files/en.jsonl create mode 100644 vespaclient-java/src/test/files/no.jsonl create mode 100644 vespaclient-java/src/test/files/no_2.jsonl create mode 100644 vespaclient-java/src/test/files/temp-dir/output.json create mode 100644 vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java 
b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java index b62754ac8ad..9b7cbae834a 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java @@ -17,15 +17,14 @@ import java.util.HashMap; public class DocumentFrequencyFile { private final String description; - private final int documentCount; - + private final long documentCount; private final HashMap frequencies; @JsonCreator public DocumentFrequencyFile( @JsonProperty("description") String description, - @JsonProperty("document-count") int documentCount, + @JsonProperty("document-count") long documentCount, @JsonProperty("document-frequencies") HashMap frequencies) { this.description = description; this.documentCount = documentCount; @@ -36,7 +35,7 @@ public class DocumentFrequencyFile { public String description() { return description; } @JsonProperty("document-count") - public int documentCount() { return documentCount; } + public long documentCount() { return documentCount; } @JsonProperty("document-frequencies") public HashMap frequencies() { return frequencies; } diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java index 902613379f0..94030108671 100644 --- a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java +++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java @@ -45,4 +45,8 @@ public class SignificanceModelFile { @JsonProperty("languages") public HashMap languages() { return languages; } + + public void addLanguage(String language, DocumentFrequencyFile documentFrequencyFile) { + languages.put(language, documentFrequencyFile); + } } diff --git a/opennlp-linguistics/pom.xml b/opennlp-linguistics/pom.xml index 
726309c902d..0c53571dc5e 100644 --- a/opennlp-linguistics/pom.xml +++ b/opennlp-linguistics/pom.xml @@ -25,6 +25,16 @@ ${project.version} provided + + com.fasterxml.jackson.core + jackson-databind + compile + + + com.fasterxml.jackson.core + jackson-core + compile + com.yahoo.vespa config-bundle diff --git a/vespaclient-java/pom.xml b/vespaclient-java/pom.xml index 7af8160c1aa..21afed7335d 100644 --- a/vespaclient-java/pom.xml +++ b/vespaclient-java/pom.xml @@ -68,6 +68,11 @@ commons-cli commons-cli + + com.yahoo.vespa + opennlp-linguistics + ${project.version} + com.yahoo.vespa container-apache-http-client-bundle
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java
new file mode 100644
index 00000000000..f0c351581e4
--- /dev/null
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java
@@ -0,0 +1,75 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespasignificance;
+
+/**
+ * This class contains the program parameters.
+ *
+ * @author MariusArhaug
+ */
+public class ClientParameters {
+    /** Show help page if true. */
+    public final boolean help;
+
+    /** Input file for the program. */
+    public final String inputFile;
+
+    /** Output file for the program. */
+    public final String outputFile;
+
+    /** Name of the document field to analyze. */
+    public final String field;
+
+    /** Language tag of the input; also the key the result is stored under in the output file. */
+    public final String language;
+
+    public ClientParameters(
+            boolean help,
+            String inputFile,
+            String outputFile,
+            String field,
+            String language) {
+        this.help = help;
+        this.inputFile = inputFile;
+        this.outputFile = outputFile;
+        this.field = field;
+        this.language = language;
+    }
+
+    /** Collects the parameter values one at a time and creates the {@link ClientParameters}. */
+    public static class Builder {
+        private boolean help;
+        private String inputFile;
+        private String outputFile;
+        private String field;
+        private String language;
+
+        public Builder setHelp(boolean help) {
+            this.help = help;
+            return this;
+        }
+
+        public Builder setInputFile(String inputFile) {
+            this.inputFile = inputFile;
+            return this;
+        }
+
+        public Builder setOutputFile(String outputFile) {
+            this.outputFile = outputFile;
+            return this;
+        }
+
+        public Builder setField(String field) {
+            this.field = field;
+            return this;
+        }
+
+        public Builder setLanguage(String language) {
+            this.language = language;
+            return this;
+        }
+
+        public ClientParameters build() {
+            return new ClientParameters(help, inputFile, outputFile, field, language);
+        }
+    }
+}
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java
new file mode 100644
index 00000000000..3090b31b9b5
--- /dev/null
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java
@@ -0,0 +1,104 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespasignificance;
+
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+
+import java.io.InputStream;
+
+/**
+ * This class is responsible for parsing the command line arguments and printing the help page.
+ *
+ * @author MariusArhaug
+ */
+public class CommandLineOptions {
+
+    public static final String HELP_OPTION = "help";
+    public static final String INPUT_OPTION = "in";
+    public static final String OUTPUT_OPTION = "out";
+    public static final String FIELD_OPTION = "field";
+    public static final String LANGUAGE_OPTION = "language";
+
+    private final Options options = createOptions();
+    private final InputStream stdIn;
+
+    public CommandLineOptions(InputStream stdIn) {
+        this.stdIn = stdIn;
+    }
+
+    public CommandLineOptions() {
+        this(System.in);
+    }
+
+    // Every option gets its own short name; input and output must not both claim "i".
+    private static Options createOptions() {
+        Options options = new Options();
+
+        options.addOption(Option.builder("h")
+                .hasArg(false)
+                .desc("Show this syntax page.")
+                .longOpt(HELP_OPTION)
+                .build());
+
+        options.addOption(Option.builder("i")
+                .hasArg(true)
+                .desc("Input file")
+                .longOpt(INPUT_OPTION)
+                .build());
+
+        options.addOption(Option.builder("o")
+                .hasArg(true)
+                .desc("Output file")
+                .longOpt(OUTPUT_OPTION)
+                .build());
+
+        options.addOption(Option.builder("f")
+                .hasArg(true)
+                .desc("Field to analyze")
+                .longOpt(FIELD_OPTION)
+                .build());
+
+        options.addOption(Option.builder("l")
+                .hasArg(true)
+                .desc("Language tag for output file")
+                .longOpt(LANGUAGE_OPTION)
+                .build());
+
+        return options;
+    }
+
+    /** Prints the help page listing all options. */
+    public void printHelp() {
+        HelpFormatter formatter = new HelpFormatter();
+
+        formatter.printHelp(
+                "vespa-significance ", "Perform a significance value related operation.", options,
+                "The generate command generates a significance model file for a given corpus type .jsonl file.\n",
+                false);
+    }
+
+    /**
+     * Parses the arguments that follow the command name into {@link ClientParameters}.
+     *
+     * @throws IllegalArgumentException if the arguments cannot be parsed; the parser's exception is kept as cause
+     */
+    public ClientParameters parseCommandLineArguments(String[] args) throws IllegalArgumentException {
+        try {
+            CommandLineParser clp = new DefaultParser();
+            CommandLine cl = clp.parse(options, args);
+            return new ClientParameters.Builder()
+                    .setHelp(cl.hasOption(HELP_OPTION))
+                    .setInputFile(cl.getOptionValue(INPUT_OPTION))
+                    .setOutputFile(cl.getOptionValue(OUTPUT_OPTION))
+                    .setField(cl.getOptionValue(FIELD_OPTION))
+                    .setLanguage(cl.getOptionValue(LANGUAGE_OPTION))
+                    .build();
+        } catch (ParseException e) {
+            throw new IllegalArgumentException("Failed to parse command line arguments: " + e.getMessage(), e);
+        }
+    }
+}
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
new file mode 100644
index 00000000000..a60408b1f96
--- /dev/null
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
@@ -0,0 +1,59 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.vespasignificance;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * The vespa-significance tool generates significance models based on input feed files.
+ *
+ * @author MariusArhaug
+ */
+public class Main {
+
+    /**
+     * Runs the tool: expects the command "generate" followed by its options.
+     * Usage and processing errors are reported on stderr and end the process with status 1.
+     */
+    public static void main(String[] args) {
+        if (args.length == 0) {
+            System.err.println("No arguments provided. Use --help to see available options.");
+            System.exit(1);
+        }
+
+        CommandLineOptions options = new CommandLineOptions();
+        // The message above points users at --help, so it has to work without a command too.
+        if (args[0].equals("--" + CommandLineOptions.HELP_OPTION) || args[0].equals("-h")) {
+            options.printHelp();
+            return;
+        }
+
+        if (!args[0].equals("generate")) {
+            System.err.println("Invalid command. Use 'generate' to generate significance models.");
+            System.exit(1);
+        }
+        String[] commandLineArgs = List.of(args).subList(1, args.length).toArray(new String[0]);
+
+        try {
+            ClientParameters params = options.parseCommandLineArguments(commandLineArgs);
+
+            if (params.help) {
+                options.printHelp();
+            } else {
+                createSignificanceModelGenerator(params).generate();
+            }
+        } catch (IllegalArgumentException e) {
+            // The exception message already states what was wrong; do not prefix it a second time.
+            System.err.println(e.getMessage());
+            System.exit(1);
+        } catch (IOException e) {
+            System.err.printf("Failed to generate significance model: %s%n", e.getMessage());
+            System.exit(1);
+        }
+    }
+
+    private static SignificanceModelGenerator createSignificanceModelGenerator(ClientParameters params) {
+        return new SignificanceModelGenerator(params);
+    }
+}
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
new file mode 100644
index 00000000000..16a7ee18a02
--- /dev/null
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
@@ -0,0 +1,147 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.vespasignificance;
+
+import com.fasterxml.jackson.annotation.JsonAutoDetect;
+import com.fasterxml.jackson.annotation.PropertyAccessor;
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.ObjectWriter;
+import com.yahoo.document.DataType;
+import com.yahoo.document.Document;
+import com.yahoo.document.DocumentPut;
+import com.yahoo.document.DocumentType;
+import com.yahoo.document.DocumentTypeManager;
+import com.yahoo.document.Field;
+import com.yahoo.document.datatypes.FieldValue;
+import com.yahoo.document.json.DocumentOperationType;
+import com.yahoo.document.json.JsonReader;
+import com.yahoo.document.json.ParsedDocumentOperation;
+import com.yahoo.language.Language;
+import com.yahoo.language.opennlp.OpenNlpLinguistics;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
+import com.yahoo.language.process.TokenScript;
+import com.yahoo.language.process.TokenType;
+import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.significance.impl.DocumentFrequencyFile;
+import com.yahoo.language.significance.impl.SignificanceModelFile;
+import com.yahoo.text.Utf8;
+
+import java.io.IOException;
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+/**
+ * Generates a significance model file with per-language document frequencies from a JSONL feed file.
+ *
+ * @author MariusArhaug
+ */
+public class SignificanceModelGenerator {
+
+    private final ClientParameters clientParameters;
+    private final Tokenizer tokenizer;
+    private final HashMap<String, Long> documentFrequency = new HashMap<>();
+    private final Language language;
+    private final ObjectMapper objectMapper;
+    private final static JsonFactory parserFactory = new JsonFactory();
+
+    final DocumentTypeManager types = new DocumentTypeManager();
+    final DocumentType docType;
+    private final static String VERSION = "1.0";
+    private final static String ID = "1";
+    private final static String SIGNIFICANCE_DESCRIPTION = "Significance model for input file";
+    private final static String DOC_FREQ_DESCRIPTION = "Document frequency for language";
+
+    /** @throws IllegalArgumentException if the input file, output file, field or language is missing or blank */
+    public SignificanceModelGenerator(ClientParameters clientParameters) {
+        this.clientParameters = clientParameters;
+        requireOption(clientParameters.inputFile, CommandLineOptions.INPUT_OPTION);
+        requireOption(clientParameters.outputFile, CommandLineOptions.OUTPUT_OPTION);
+        requireOption(clientParameters.field, CommandLineOptions.FIELD_OPTION);
+        requireOption(clientParameters.language, CommandLineOptions.LANGUAGE_OPTION);
+        tokenizer = new OpenNlpLinguistics().getTokenizer();
+        objectMapper = new ObjectMapper();
+        language = Language.fromLanguageTag(clientParameters.language);
+        docType = new DocumentType(language.languageCode().toLowerCase());
+        docType.addField(new Field(clientParameters.field, DataType.STRING));
+        types.registerDocumentType(docType);
+    }
+
+    private static void requireOption(String value, String optionName) {
+        if (value == null || value.isBlank()) {
+            throw new IllegalArgumentException("Missing required option --" + optionName);
+        }
+    }
+
+    /**
+     * Reads the input file one JSON document per line, counts in how many documents each token occurs,
+     * and stores the result under the configured language in the output file (merging into it if it exists).
+     */
+    public void generate() throws IOException {
+        Path inputPath = Paths.get("").toAbsolutePath().resolve(clientParameters.inputFile);
+        long pageCount = 0;
+        // try-with-resources closes the file; Files.newBufferedReader decodes UTF-8 on every platform.
+        try (BufferedReader reader = Files.newBufferedReader(inputPath)) {
+            String line;
+            while ((line = reader.readLine()) != null) { // null, not ready(), marks the end of the file
+                pageCount++;
+                String wikimediaId = "id:wikimedia:" + language.languageCode() + "::" + pageCount;
+                JsonReader jsonReader = new JsonReader(types, new ByteArrayInputStream(Utf8.toBytes(line)), parserFactory);
+                ParsedDocumentOperation operation = jsonReader.readSingleDocumentStreaming(DocumentOperationType.PUT, wikimediaId);
+                Document document = ((DocumentPut) operation.operation()).getDocument();
+                FieldValue fieldValue = document.getFieldValue(clientParameters.field);
+                if (fieldValue != null) { // a document without the field is still counted, it just adds no tokens
+                    handleTokenization(fieldValue.toString());
+                }
+            }
+        }
+
+        File outputFile = new File(clientParameters.outputFile);
+        DocumentFrequencyFile frequencyFile = new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency());
+        SignificanceModelFile modelFile;
+        if (outputFile.exists()) {
+            modelFile = objectMapper.readValue(outputFile, SignificanceModelFile.class);
+            modelFile.addLanguage(clientParameters.language, frequencyFile);
+        } else {
+            HashMap<String, DocumentFrequencyFile> languages = new HashMap<>();
+            languages.put(clientParameters.language, frequencyFile);
+            modelFile = new SignificanceModelFile(VERSION, ID, SIGNIFICANCE_DESCRIPTION, languages);
+        }
+        try {
+            objectMapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY);
+            objectMapper.writerWithDefaultPrettyPrinter().writeValue(outputFile, modelFile);
+        } catch (IOException e) {
+            throw new IllegalStateException("Failed to write model to output file", e);
+        }
+    }
+
+    private void handleTokenization(String field) {
+        var tokens = tokenizer.tokenize(field, language, StemMode.ALL, false);
+        Set<String> uniqueWords = StreamSupport.stream(tokens.spliterator(), false)
+                .filter(t -> t.getType() == TokenType.ALPHABETIC)
+                .filter(t -> t.getScript() == TokenScript.LATIN)
+                .map(Token::getTokenString)
+                .collect(Collectors.toSet());
+
+        for (String word : uniqueWords) {
+            documentFrequency.merge(word, 1L, Long::sum);
+        }
+    }
+
+    public HashMap<String, Long> getFinalDocumentFrequency() {
+        return documentFrequency.entrySet().stream()
+                .filter(k -> k.getValue() > 1)
+                .collect(HashMap::new, (m, v) -> m.put(v.getKey(), v.getValue()), HashMap::putAll);
+    }
+}
diff --git a/vespaclient-java/src/main/sh/vespa-significance.sh b/vespaclient-java/src/main/sh/vespa-significance.sh
new file mode 100755
index 00000000000..0c0bac275e6
--- /dev/null
+++ b/vespaclient-java/src/main/sh/vespa-significance.sh
@@ -0,0 +1,87 @@
+#!/bin/sh
+# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+ +# BEGIN environment bootstrap section +# Do not edit between here and END as this section should stay identical in all scripts + +findpath () { + myname=${0} + mypath=${myname%/*} + myname=${myname##*/} + empty_if_start_slash=${mypath%%/*} + if [ "${empty_if_start_slash}" ]; then + mypath=$(pwd)/${mypath} + fi + if [ "$mypath" ] && [ -d "$mypath" ]; then + return + fi + mypath=$(pwd) + if [ -f "${mypath}/${myname}" ]; then + return + fi + echo "FATAL: Could not figure out the path where $myname lives from $0" + exit 1 +} + +COMMON_ENV=libexec/vespa/common-env.sh + +source_common_env () { + if [ "$VESPA_HOME" ] && [ -d "$VESPA_HOME" ]; then + export VESPA_HOME + common_env=$VESPA_HOME/$COMMON_ENV + if [ -f "$common_env" ]; then + . $common_env + return + fi + fi + return 1 +} + +findroot () { + source_common_env && return + if [ "$VESPA_HOME" ]; then + echo "FATAL: bad VESPA_HOME value '$VESPA_HOME'" + exit 1 + fi + if [ "$ROOT" ] && [ -d "$ROOT" ]; then + VESPA_HOME="$ROOT" + source_common_env && return + fi + findpath + while [ "$mypath" ]; do + VESPA_HOME=${mypath} + source_common_env && return + mypath=${mypath%/*} + done + echo "FATAL: missing VESPA_HOME environment variable" + echo "Could not locate $COMMON_ENV anywhere" + exit 1 +} + +findhost () { + if [ "${VESPA_HOSTNAME}" = "" ]; then + VESPA_HOSTNAME=$(vespa-detect-hostname || hostname -f || hostname || echo "localhost") || exit 1 + fi + validate="${VESPA_HOME}/bin/vespa-validate-hostname" + if [ -f "$validate" ]; then + "$validate" "${VESPA_HOSTNAME}" || exit 1 + fi + export VESPA_HOSTNAME +} + +findroot +findhost + +ROOT=${VESPA_HOME%/} +export ROOT + +# END environment bootstrap section + +export MALLOC_ARENA_MAX=1 #Does not need fast allocation +exec java \ +-server -enableassertions \ +-XX:ThreadStackSize=512 \ +-XX:MaxJavaStackTraceDepth=1000000 \ +-Djava.awt.headless=true \ +-Xms128m -Xmx1024m $(getJavaOptionsIPV46) \ +-cp ${VESPA_HOME}/lib/jars/vespaclient-java-jar-with-dependencies.jar 
com.yahoo.vespasignificance.Main "$@"
diff --git a/vespaclient-java/src/test/files/en.jsonl b/vespaclient-java/src/test/files/en.jsonl
new file mode 100644
index 00000000000..b4b16e4c773
--- /dev/null
+++ b/vespaclient-java/src/test/files/en.jsonl
@@ -0,0 +1,3 @@
+{"put": "id:wikimedia:en::1", "fields": {"text": "Some english wiki dump"}}
+{"put": "id:wikimedia:en::2", "fields": {"text": "Some english wiki dump"}}
+{"put": "id:wikimedia:en::3", "fields": {"text": "Some english wiki dump"}}
diff --git a/vespaclient-java/src/test/files/no.jsonl b/vespaclient-java/src/test/files/no.jsonl
new file mode 100644
index 00000000000..90cc0da15c5
--- /dev/null
+++ b/vespaclient-java/src/test/files/no.jsonl
@@ -0,0 +1,3 @@
+{"put": "id:wikimedia:nb::1", "fields": {"text": "[[Arbeiderpartiet]][[Kategori:Omdirigeringer fra eldre skriveform]]"}}
+{"put": "id:wikimedia:nb::2", "fields": {"text": "[[Arbeiderpartiet]][[Kategori:Omdirigeringer fra eldre skriveform]]"}}
+{"put": "id:wikimedia:nb::3", "fields": {"text": "[[Arbeiderpartiet]][[Kategori:Omdirigeringer fra eldre skriveform]]"}}
diff --git a/vespaclient-java/src/test/files/no_2.jsonl b/vespaclient-java/src/test/files/no_2.jsonl
new file mode 100644
index 00000000000..f5e996d4810
--- /dev/null
+++ b/vespaclient-java/src/test/files/no_2.jsonl
@@ -0,0 +1,3 @@
+{"put": "id:wikimedia:nb::1", "fields": {"text": "[[Arbeiderpartiet]][[Kategori:Omdirigeringer fra eldre eldre skriveform]]"}}
+{"put": "id:wikimedia:nb::2", "fields": {"text": "[[Arbeiderpartiet]][[Kategori:Omdirigeringer fra eldre skriveform nytt]]"}}
+{"put": "id:wikimedia:nb::3", "fields": {"text": "[[Arbeiderpartiet]][[Kategori:Omdirigeringer frå eldre skriveform nytt]]"}}
diff --git a/vespaclient-java/src/test/files/temp-dir/output.json b/vespaclient-java/src/test/files/temp-dir/output.json
new file mode 100644
index 00000000000..2abc3de91e3
--- /dev/null
+++ b/vespaclient-java/src/test/files/temp-dir/output.json
@@ -0,0 +1,20 @@
+{
+  "version" : "1.0",
+  "id" : "1",
+  "description" : "Significance model for input file",
+  "languages" : {
+    "NB" : {
+      "description" : "Document frequency for language",
+      "document-count" : 3,
+      "document-frequencies" : {
+        "fra" : 2,
+        "skriveform" : 3,
+        "nytt" : 2,
+        "kategori" : 3,
+        "eldr" : 3,
+        "omdirigering" : 3,
+        "arbeiderparti" : 3
+      }
+    }
+  }
+}
diff --git a/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java
new file mode 100644
index 00000000000..e1471d30049
--- /dev/null
+++ b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java
@@ -0,0 +1,154 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.vespasignificance;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.yahoo.language.significance.impl.DocumentFrequencyFile;
+import com.yahoo.language.significance.impl.SignificanceModelFile;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+
+import static org.junit.jupiter.api.Assertions.*;
+
+/**
+ * @author MariusArhaug
+ */
+public class SignificanceModelGeneratorTest {
+
+    private static final Path OUTPUT_DIR = Paths.get("src/test/files/temp-dir");
+    private static final Path OUTPUT_FILE = OUTPUT_DIR.resolve("output.json");
+
+    private final ObjectMapper objectMapper = new ObjectMapper();
+
+    /**
+     * The generator merges into an existing output file, so a file left behind by an earlier
+     * test (or run) must be removed for each test to see only its own result.
+     */
+    @BeforeEach
+    void removeStaleOutputFile() throws IOException {
+        Files.deleteIfExists(OUTPUT_FILE);
+    }
+
+    private static ClientParameters.Builder createParameters(String inputFile, String field, String language) {
+        OUTPUT_DIR.toFile().mkdirs();
+
+        return new ClientParameters.Builder()
+                .setInputFile("src/test/files/" + inputFile)
+                .setOutputFile(OUTPUT_FILE.toString())
+                .setField(field)
+                .setLanguage(language);
+    }
+
+    private SignificanceModelGenerator createSignificanceModelGenerator(ClientParameters params) {
+        return new SignificanceModelGenerator(params);
+    }
+
+    private SignificanceModelFile readOutputFile() throws IOException {
+        return objectMapper.readValue(OUTPUT_FILE.toFile(), SignificanceModelFile.class);
+    }
+
+    @Test
+    void testGenerateSimpleFile() throws IOException {
+        ClientParameters params = createParameters("no.jsonl", "text", "NB").build();
+        SignificanceModelGenerator generator = createSignificanceModelGenerator(params);
+        generator.generate();
+
+        assertTrue(OUTPUT_FILE.toFile().exists());
+
+        SignificanceModelFile modelFile = readOutputFile();
+
+        HashMap<String, DocumentFrequencyFile> languages = modelFile.languages();
+        assertEquals(1, languages.size());
+
+        assertTrue(languages.containsKey("NB"));
+
+        DocumentFrequencyFile documentFrequencyFile = languages.get("NB");
+
+        assertEquals(3, documentFrequencyFile.frequencies().get("fra"));
+        assertEquals(3, documentFrequencyFile.frequencies().get("skriveform"));
+        assertEquals(3, documentFrequencyFile.frequencies().get("kategori"));
+        assertEquals(3, documentFrequencyFile.frequencies().get("eldr"));
+    }
+
+    @Test
+    void testGenerateFileWithMultipleLanguages() throws IOException {
+        ClientParameters params1 = createParameters("no.jsonl", "text", "NB").build();
+        SignificanceModelGenerator generator = createSignificanceModelGenerator(params1);
+        generator.generate();
+        assertTrue(OUTPUT_FILE.toFile().exists());
+
+        ClientParameters params2 = createParameters("en.jsonl", "text", "EN").build();
+        generator = createSignificanceModelGenerator(params2);
+        generator.generate();
+
+        assertTrue(OUTPUT_FILE.toFile().exists());
+
+        SignificanceModelFile modelFile = readOutputFile();
+
+        HashMap<String, DocumentFrequencyFile> languages = modelFile.languages();
+
+        assertEquals(2, languages.size());
+
+        assertTrue(languages.containsKey("NB"));
+        assertTrue(languages.containsKey("EN"));
+
+        DocumentFrequencyFile nb = languages.get("NB");
+        DocumentFrequencyFile en = languages.get("EN");
+
+        assertEquals(3, nb.documentCount());
+        assertEquals(3, en.documentCount());
+
+        assertEquals(3, nb.frequencies().get("fra"));
+        assertEquals(3, nb.frequencies().get("skriveform"));
+
+        assertEquals(3, en.frequencies().get("some"));
+        assertEquals(3, en.frequencies().get("wiki"));
+    }
+
+    @Test
+    void testOverwriteExistingDocumentFrequencyLanguage() throws IOException {
+        ClientParameters params1 = createParameters("no.jsonl", "text", "NB").build();
+        SignificanceModelGenerator generator = createSignificanceModelGenerator(params1);
+        generator.generate();
+        assertTrue(OUTPUT_FILE.toFile().exists());
+
+        SignificanceModelFile preUpdatedFile = readOutputFile();
+        HashMap<String, DocumentFrequencyFile> oldLanguages = preUpdatedFile.languages();
+        assertEquals(1, oldLanguages.size());
+
+        assertTrue(oldLanguages.containsKey("NB"));
+
+        DocumentFrequencyFile oldDf = oldLanguages.get("NB");
+
+        assertEquals(3, oldDf.frequencies().get("fra"));
+        assertEquals(3, oldDf.frequencies().get("skriveform"));
+        assertFalse(oldDf.frequencies().containsKey("nytt"));
+
+        ClientParameters params2 = createParameters("no_2.jsonl", "text", "NB").build();
+        SignificanceModelGenerator generator2 = createSignificanceModelGenerator(params2);
+        generator2.generate();
+
+        assertTrue(OUTPUT_FILE.toFile().exists());
+
+        SignificanceModelFile modelFile = readOutputFile();
+
+        HashMap<String, DocumentFrequencyFile> languages = modelFile.languages();
+
+        assertEquals(1, languages.size());
+
+        assertTrue(languages.containsKey("NB"));
+
+        DocumentFrequencyFile df = languages.get("NB");
+
+        assertEquals(2, df.frequencies().get("fra"));
+        assertEquals(3, df.frequencies().get("skriveform"));
+        assertTrue(df.frequencies().containsKey("nytt"));
+        assertEquals(2, df.frequencies().get("nytt"));
+    }
+}
-- cgit v1.2.3