about | summary | refs | log | tree | commit | diff | stats
path: root/vespaclient-java/src/main/java/com/yahoo
diff options
context:
space:
mode:
author MariusArhaug <mariusarhaug@hotmail.com> 2024-05-14 16:18:01 +0200
committer MariusArhaug <mariusarhaug@hotmail.com> 2024-05-14 16:23:24 +0200
commit 74280df2d2ce04dd14a9ee325c9dec8080145da7 (patch)
tree e2c23b0ff19b9a9b4108b25c3ffae2ac16a127ad /vespaclient-java/src/main/java/com/yahoo
parent a26a156f27785c3d84c0d8ca25d75f35ebbb0e90 (diff)
Add significance model generator cli
Diffstat (limited to 'vespaclient-java/src/main/java/com/yahoo')
-rw-r--r-- vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java 73
-rw-r--r-- vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java 104
-rw-r--r-- vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java 49
-rw-r--r-- vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java 147
4 files changed, 373 insertions, 0 deletions
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java
new file mode 100644
index 00000000000..f0c351581e4
--- /dev/null
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/ClientParameters.java
@@ -0,0 +1,73 @@
+package com.yahoo.vespasignificance;
+
+
+/**
+ * Immutable holder for the parameters the program was started with.
+ * A {@link Builder} is provided for setting the values one at a time.
+ *
+ * @author MariusArhaug
+ */
+public class ClientParameters {
+
+    /** Show the help page if true. */
+    public final boolean help;
+
+    /** Input file for the program. */
+    public final String inputFile;
+
+    /** Output file for the program. */
+    public final String outputFile;
+
+    /** Field for the program. */
+    public final String field;
+
+    /** Language for the program. */
+    public final String language;
+
+    /**
+     * Creates a parameter set from explicit values; none of them is validated here.
+     */
+    public ClientParameters(boolean help, String inputFile, String outputFile, String field, String language) {
+        this.help = help;
+        this.inputFile = inputFile;
+        this.outputFile = outputFile;
+        this.field = field;
+        this.language = language;
+    }
+
+    /**
+     * Fluent builder for {@link ClientParameters}. Values that are never set stay at
+     * their Java defaults ({@code false} / {@code null}).
+     */
+    public static class Builder {
+
+        private boolean helpFlag;
+        private String input;
+        private String output;
+        private String fieldName;
+        private String languageTag;
+
+        public Builder setHelp(boolean help) {
+            helpFlag = help;
+            return this;
+        }
+
+        public Builder setInputFile(String inputFile) {
+            input = inputFile;
+            return this;
+        }
+
+        public Builder setOutputFile(String outputFile) {
+            output = outputFile;
+            return this;
+        }
+
+        public Builder setField(String field) {
+            fieldName = field;
+            return this;
+        }
+
+        public Builder setLanguage(String language) {
+            languageTag = language;
+            return this;
+        }
+
+        /** Returns a new {@link ClientParameters} carrying the values set so far. */
+        public ClientParameters build() {
+            return new ClientParameters(helpFlag, input, output, fieldName, languageTag);
+        }
+    }
+}
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java
new file mode 100644
index 00000000000..3090b31b9b5
--- /dev/null
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java
@@ -0,0 +1,104 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespasignificance;
+
+import org.apache.commons.cli.ParseException;
+import org.apache.commons.cli.CommandLine;
+import org.apache.commons.cli.CommandLineParser;
+import org.apache.commons.cli.DefaultParser;
+import org.apache.commons.cli.HelpFormatter;
+import org.apache.commons.cli.Option;
+import org.apache.commons.cli.Options;
+
+
+import java.io.InputStream;
+
+
+/**
+ * Declares the options understood by the vespa-significance tool, parses command line
+ * arguments into {@link ClientParameters} and prints the help page.
+ *
+ * @author MariusArhaug
+ */
+public class CommandLineOptions {
+
+    public static final String HELP_OPTION = "help";
+    public static final String INPUT_OPTION = "in";
+    public static final String OUTPUT_OPTION = "out";
+    public static final String FIELD_OPTION = "field";
+    public static final String LANGUAGE_OPTION = "language";
+
+    private final Options options = createOptions();
+    // Stored for callers that supply their own input stream; this class never reads from it.
+    private final InputStream stdIn;
+
+    /**
+     * @param stdIn the stream to treat as standard input
+     */
+    public CommandLineOptions(InputStream stdIn) {
+        this.stdIn = stdIn;
+    }
+
+    /** Creates an instance that uses {@link System#in} as standard input. */
+    public CommandLineOptions() {
+        this(System.in);
+    }
+
+    /**
+     * Builds the option set. Every option has a long name (one of the {@code *_OPTION}
+     * constants) and its own single-letter short name.
+     */
+    private static Options createOptions() {
+        Options options = new Options();
+
+        options.addOption(Option.builder("h")
+                .hasArg(false)
+                .desc("Show this syntax page.")
+                .longOpt(HELP_OPTION)
+                .build());
+
+        options.addOption(Option.builder("i")
+                .hasArg(true)
+                .desc("Input file")
+                .longOpt(INPUT_OPTION)
+                .build());
+
+        // Short names must be unique per option: "o" keeps -i reserved for the input file.
+        options.addOption(Option.builder("o")
+                .hasArg(true)
+                .desc("Output file")
+                .longOpt(OUTPUT_OPTION)
+                .build());
+
+        options.addOption(Option.builder("f")
+                .hasArg(true)
+                .desc("Field to analyze")
+                .longOpt(FIELD_OPTION)
+                .build());
+
+        options.addOption(Option.builder("l")
+                .hasArg(true)
+                .desc("Language tag for output file")
+                .longOpt(LANGUAGE_OPTION)
+                .build());
+
+        return options;
+    }
+
+    /** Prints the usage line, the option list and a short description of the generate command. */
+    public void printHelp() {
+        HelpFormatter formatter = new HelpFormatter();
+
+        formatter.printHelp(
+                "vespa-significance <command> <options>", "Perform a significance value related operation.", options,
+                "The generate command generates a significance model file for a given corpus type .jsonl file.\n",
+                false);
+    }
+
+    /**
+     * Parses the given arguments into a {@link ClientParameters} instance. Options that
+     * were not given are left as {@code null} ({@code false} for help); no option is
+     * checked for presence here.
+     *
+     * @param args the option arguments, without the leading command word
+     * @return the parsed parameters
+     * @throws IllegalArgumentException if the arguments cannot be parsed; the underlying
+     *         {@link ParseException} is attached as the cause
+     */
+    public ClientParameters parseCommandLineArguments(String[] args) throws IllegalArgumentException {
+        try {
+            CommandLineParser clp = new DefaultParser();
+            CommandLine cl = clp.parse(options, args);
+            ClientParameters.Builder builder = new ClientParameters.Builder();
+
+            builder.setHelp(cl.hasOption(HELP_OPTION));
+            builder.setInputFile(cl.getOptionValue(INPUT_OPTION));
+            builder.setOutputFile(cl.getOptionValue(OUTPUT_OPTION));
+            builder.setField(cl.getOptionValue(FIELD_OPTION));
+            builder.setLanguage(cl.getOptionValue(LANGUAGE_OPTION));
+
+            return builder.build();
+        } catch (ParseException e) {
+            // Keep the original exception as cause so the parser's stack trace is not lost.
+            throw new IllegalArgumentException("Failed to parse command line arguments: " + e.getMessage(), e);
+        }
+    }
+}
+
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
new file mode 100644
index 00000000000..a60408b1f96
--- /dev/null
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
@@ -0,0 +1,49 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.vespasignificance;
+
+import java.io.IOException;
+import java.util.List;
+
+/**
+ * The vespa-significance tool generates significance models based on input feed files.
+ *
+ * @author MariusArhaug
+ */
+
+public class Main {
+
+    /**
+     * Entry point of the tool. The first argument must be the {@code generate} command,
+     * followed by its options; a leading {@code --help} or {@code -h} prints the help page.
+     * The process exits with status 1 when no arguments are given, the command is unknown,
+     * or the arguments are rejected.
+     *
+     * @param args the raw command line arguments
+     */
+    public static void main(String[] args) {
+        if (args.length == 0) {
+            System.err.println("No arguments provided. Use --help to see available options.");
+            System.exit(1);
+            return;
+        }
+
+        CommandLineOptions options = new CommandLineOptions();
+
+        // The message above points users at --help, so honour it before the command check.
+        if (args[0].equals("--help") || args[0].equals("-h")) {
+            options.printHelp();
+            return;
+        }
+
+        if (!args[0].equals("generate")) {
+            System.err.println("Invalid command. Use 'generate' to generate significance models.");
+            System.exit(1);
+            return;
+        }
+
+        try {
+            // Everything after the command word is handed to the option parser.
+            String[] commandLineArgs = List.of(args).subList(1, args.length).toArray(new String[0]);
+            ClientParameters params = options.parseCommandLineArguments(commandLineArgs);
+
+            if (params.help) {
+                options.printHelp();
+                return;
+            }
+
+            SignificanceModelGenerator significanceModelGenerator = createSignificanceModelGenerator(params);
+            significanceModelGenerator.generate();
+        } catch (IllegalArgumentException e) {
+            // The parser's message already starts with "Failed to parse command line arguments",
+            // so print it as-is instead of prefixing it a second time.
+            System.err.println(e.getMessage());
+            // Signal the failure to the calling shell instead of exiting with status 0.
+            System.exit(1);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    /** Creates the generator that will run with the given parameters. */
+    private static SignificanceModelGenerator createSignificanceModelGenerator(ClientParameters params) {
+        return new SignificanceModelGenerator(params);
+    }
+}
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
new file mode 100644
index 00000000000..16a7ee18a02
--- /dev/null
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
@@ -0,0 +1,147 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+package com.yahoo.vespasignificance;
+
+import com.fasterxml.jackson.annotation.JsonAutoDetect;
+import com.fasterxml.jackson.annotation.PropertyAccessor;
+import com.fasterxml.jackson.core.JsonFactory;
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.databind.ObjectWriter;
+import com.yahoo.document.DataType;
+import com.yahoo.document.Document;
+import com.yahoo.document.DocumentPut;
+import com.yahoo.document.DocumentType;
+import com.yahoo.document.DocumentTypeManager;
+import com.yahoo.document.Field;
+import com.yahoo.document.datatypes.FieldValue;
+import com.yahoo.document.json.DocumentOperationType;
+import com.yahoo.document.json.JsonReader;
+import com.yahoo.document.json.ParsedDocumentOperation;
+import com.yahoo.language.Language;
+import com.yahoo.language.opennlp.OpenNlpLinguistics;
+import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
+import com.yahoo.language.process.TokenScript;
+import com.yahoo.language.process.TokenType;
+import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.significance.impl.DocumentFrequencyFile;
+import com.yahoo.language.significance.impl.SignificanceModelFile;
+import com.yahoo.text.Utf8;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.stream.Collectors;
+import java.util.stream.StreamSupport;
+
+/**
+ * @author MariusArhaug
+ */
+public class SignificanceModelGenerator {
+
+ private final ClientParameters clientParameters;
+ private final Tokenizer tokenizer;
+ private final HashMap<String, Long> documentFrequency = new HashMap<>();
+ private final Language language;
+ private final ObjectMapper objectMapper;
+ private final static JsonFactory parserFactory = new JsonFactory();
+
+ final DocumentTypeManager types = new DocumentTypeManager();
+ final DocumentType docType;
+ private final static String VERSION = "1.0";
+ private final static String ID = "1";
+ private final static String SIGNIFICANCE_DESCRIPTION = "Significance model for input file";
+ private final static String DOC_FREQ_DESCRIPTION = "Document frequency for language";
+
+ public SignificanceModelGenerator(ClientParameters clientParameters) {
+ this.clientParameters = clientParameters;
+ OpenNlpLinguistics openNlpLinguistics = new OpenNlpLinguistics();
+ tokenizer = openNlpLinguistics.getTokenizer();
+ objectMapper = new ObjectMapper();
+
+ language = Language.fromLanguageTag(clientParameters.language);
+
+ docType = new DocumentType(language.languageCode().toLowerCase());
+ docType.addField(new Field(clientParameters.field, DataType.STRING));
+ types.registerDocumentType(docType);
+ }
+
+
+ public void generate() throws IOException {
+
+ Path currentWorkingDir = Paths.get("").toAbsolutePath();
+
+ final InputStream rawDoc = Files.newInputStream(currentWorkingDir.resolve(clientParameters.inputFile));
+
+ BufferedReader reader = new BufferedReader(new InputStreamReader(rawDoc));
+
+ long i = 1;
+ while (reader.ready()) {
+ String line = reader.readLine();
+ JsonReader jsonReader = new JsonReader(types, new ByteArrayInputStream(Utf8.toBytes(line)), parserFactory);
+ String wikimediaId = "id:wikimedia:" + language.languageCode() + "::" + i;
+
+ ParsedDocumentOperation operation = jsonReader.readSingleDocumentStreaming(DocumentOperationType.PUT, wikimediaId);
+ DocumentPut put = (DocumentPut) operation.operation();
+ Document document = put.getDocument();
+ FieldValue fieldValue = document.getFieldValue(clientParameters.field);
+ this.handleTokenization(fieldValue.toString());
+ i++;
+ }
+
+ long pageCount = i - 1;
+
+ SignificanceModelFile modelFile;
+ if (Paths.get(clientParameters.outputFile).toFile().exists()) {
+ modelFile = objectMapper.readValue(new File(clientParameters.outputFile), SignificanceModelFile.class);
+
+ modelFile.addLanguage(clientParameters.language, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency()));
+
+ } else {
+ HashMap<String, DocumentFrequencyFile> languages = new HashMap<>() {{
+ put(clientParameters.language, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency()));
+ }};
+
+ modelFile = new SignificanceModelFile(VERSION, ID, SIGNIFICANCE_DESCRIPTION, languages);
+ }
+ try {
+ objectMapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY);
+ ObjectWriter writer = objectMapper.writerWithDefaultPrettyPrinter();
+ writer.writeValue(new File(clientParameters.outputFile), modelFile);
+ } catch (IOException e) {
+ throw new IllegalStateException("Failed to write model to output file", e);
+ }
+ }
+
+ private void handleTokenization(String field) {
+ var tokens = tokenizer.tokenize(field, language, StemMode.ALL, false);
+
+ Set<String> uniqueWords = StreamSupport.stream(tokens.spliterator(), false)
+ .filter(t -> t.getType() == TokenType.ALPHABETIC)
+ .filter(t -> t.getScript() == TokenScript.LATIN)
+ .map(Token::getTokenString)
+ .collect(Collectors.toSet());
+
+ for (String word : uniqueWords) {
+ if (documentFrequency.containsKey(word)) {
+ documentFrequency.merge(word, 1L, Long::sum);
+ } else {
+ documentFrequency.put(word, 1L);
+ }
+ }
+ }
+
+ public HashMap<String, Long> getFinalDocumentFrequency() {
+ return documentFrequency.entrySet().stream()
+ .filter(k -> k.getValue() > 1)
+ .collect(HashMap::new, (m, v) -> m.put(v.getKey(), v.getValue()), HashMap::putAll);
+ }
+}