diff options
author | MariusArhaug <mariusarhaug@hotmail.com> | 2024-05-14 16:18:01 +0200 |
---|---|---|
committer | MariusArhaug <mariusarhaug@hotmail.com> | 2024-05-14 16:23:24 +0200 |
commit | 74280df2d2ce04dd14a9ee325c9dec8080145da7 (patch) | |
tree | e2c23b0ff19b9a9b4108b25c3ffae2ac16a127ad /vespaclient-java/src/main/java/com/yahoo | |
parent | a26a156f27785c3d84c0d8ca25d75f35ebbb0e90 (diff) |
Add significance model generator cli
Diffstat (limited to 'vespaclient-java/src/main/java/com/yahoo')
4 files changed, 373 insertions, 0 deletions
/**
 * Immutable holder for the parameters given to the program.
 *
 * Instances are normally assembled through {@link Builder}; every value is exposed
 * as a public final field.
 *
 * @author MariusArhaug
 */
public class ClientParameters {

    /** True when the help page should be shown. */
    public final boolean help;

    /** Input file for the program. */
    public final String inputFile;

    /** Output file for the program. */
    public final String outputFile;

    /** Field for the program. */
    public final String field;

    /** Language for the program. */
    public final String language;

    /**
     * Creates a parameter set from explicit values. No validation is done here;
     * the values are stored exactly as given.
     */
    public ClientParameters(boolean help, String inputFile, String outputFile, String field, String language) {
        this.language = language;
        this.field = field;
        this.outputFile = outputFile;
        this.inputFile = inputFile;
        this.help = help;
    }

    /**
     * Chainable builder for {@link ClientParameters}. Values that are never set
     * keep their Java defaults ({@code false} / {@code null}).
     */
    public static class Builder {

        private boolean help;
        private String inputFile;
        private String outputFile;
        private String field;
        private String language;

        public Builder setHelp(boolean value) { help = value; return this; }

        public Builder setInputFile(String value) { inputFile = value; return this; }

        public Builder setOutputFile(String value) { outputFile = value; return this; }

        public Builder setField(String value) { field = value; return this; }

        public Builder setLanguage(String value) { language = value; return this; }

        /** Returns a new parameter object holding the values collected so far. */
        public ClientParameters build() {
            return new ClientParameters(help, inputFile, outputFile, field, language);
        }
    }
}
b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java new file mode 100644 index 00000000000..3090b31b9b5 --- /dev/null +++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/CommandLineOptions.java @@ -0,0 +1,104 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespasignificance; + +import org.apache.commons.cli.ParseException; +import org.apache.commons.cli.CommandLine; +import org.apache.commons.cli.CommandLineParser; +import org.apache.commons.cli.DefaultParser; +import org.apache.commons.cli.HelpFormatter; +import org.apache.commons.cli.Option; +import org.apache.commons.cli.Options; + + +import java.io.InputStream; + + +/** + * This class is responsible for parsing the command line arguments and print the help page. + * + * @author MariusArhaug + */ +public class CommandLineOptions { + + public static final String HELP_OPTION = "help"; + public static final String INPUT_OPTION = "in"; + public static final String OUTPUT_OPTION = "out"; + public static final String FIELD_OPTION = "field"; + public static final String LANGUAGE_OPTION = "language"; + + private final Options options = createOptions(); + private final InputStream stdIn; + + public CommandLineOptions(InputStream stdIn) { + this.stdIn = stdIn; + } + + public CommandLineOptions() { + this(System.in); + } + + @SuppressWarnings("AccessStaticViaInstance") + private static Options createOptions() { + Options options = new Options(); + + options.addOption(Option.builder("h") + .hasArg(false) + .desc("Show this syntax page.") + .longOpt(HELP_OPTION) + .build()); + + options.addOption(Option.builder("i") + .hasArg(true) + .desc("Input file") + .longOpt(INPUT_OPTION) + .build()); + + options.addOption(Option.builder("i") + .hasArg(true) + .desc("Output file") + .longOpt(OUTPUT_OPTION) + .build()); + + options.addOption(Option.builder("f") + .hasArg(true) + .desc("Field to analyze") + 
.longOpt(FIELD_OPTION) + .build()); + + options.addOption(Option.builder("l") + .hasArg(true) + .desc("Language tag for output file") + .longOpt(LANGUAGE_OPTION) + .build()); + + return options; + } + + public void printHelp() { + HelpFormatter formatter = new HelpFormatter(); + + formatter.printHelp( + "vespa-significance <command> <options>", "Perform a significance value related operation.", options, + "The generate command generates a significance model file for a given corpus type .jsonl file.\n", + false); + } + + public ClientParameters parseCommandLineArguments(String[] args) throws IllegalArgumentException { + try { + CommandLineParser clp = new DefaultParser(); + CommandLine cl = clp.parse(options, args); + ClientParameters.Builder builder = new ClientParameters.Builder(); + + builder.setHelp(cl.hasOption(HELP_OPTION)); + builder.setInputFile(cl.getOptionValue(INPUT_OPTION)); + builder.setOutputFile(cl.getOptionValue(OUTPUT_OPTION)); + builder.setField(cl.getOptionValue(FIELD_OPTION)); + builder.setLanguage(cl.getOptionValue(LANGUAGE_OPTION)); + + return builder.build(); + } catch (ParseException e) { + throw new IllegalArgumentException("Failed to parse command line arguments: " + e.getMessage()); + } + } +} + diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java new file mode 100644 index 00000000000..a60408b1f96 --- /dev/null +++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java @@ -0,0 +1,49 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +package com.yahoo.vespasignificance; + +import java.io.IOException; +import java.util.List; + +/** + * The vespa-significance tool generates significance models based on input feed files. 
+ * + * @author MariusArhaug + */ + +public class Main { + + public static void main(String[] args) { + try { + if (args.length == 0) { + System.err.println("No arguments provided. Use --help to see available options."); + System.exit(1); + } + + if (!args[0].equals("generate")) { + System.err.println("Invalid command. Use 'generate' to generate significance models."); + System.exit(1); + } + String[] commandLineArgs = List.of(args).subList(1, args.length).toArray(new String[0]); + + CommandLineOptions options = new CommandLineOptions(); + ClientParameters params = options.parseCommandLineArguments(commandLineArgs); + + if (params.help) { + options.printHelp(); + } else { + SignificanceModelGenerator significanceModelGenerator = createSignificanceModelGenerator(params); + + significanceModelGenerator.generate(); + } + } catch (IllegalArgumentException e) { + System.err.printf("Failed to parse command line arguments: %s.\n", e.getMessage()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private static SignificanceModelGenerator createSignificanceModelGenerator(ClientParameters params) { + return new SignificanceModelGenerator(params); + } +} diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java new file mode 100644 index 00000000000..16a7ee18a02 --- /dev/null +++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java @@ -0,0 +1,147 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +package com.yahoo.vespasignificance; + +import com.fasterxml.jackson.annotation.JsonAutoDetect; +import com.fasterxml.jackson.annotation.PropertyAccessor; +import com.fasterxml.jackson.core.JsonFactory; +import com.fasterxml.jackson.databind.ObjectMapper; +import com.fasterxml.jackson.databind.ObjectWriter; +import com.yahoo.document.DataType; +import com.yahoo.document.Document; +import com.yahoo.document.DocumentPut; +import com.yahoo.document.DocumentType; +import com.yahoo.document.DocumentTypeManager; +import com.yahoo.document.Field; +import com.yahoo.document.datatypes.FieldValue; +import com.yahoo.document.json.DocumentOperationType; +import com.yahoo.document.json.JsonReader; +import com.yahoo.document.json.ParsedDocumentOperation; +import com.yahoo.language.Language; +import com.yahoo.language.opennlp.OpenNlpLinguistics; +import com.yahoo.language.process.StemMode; +import com.yahoo.language.process.Token; +import com.yahoo.language.process.TokenScript; +import com.yahoo.language.process.TokenType; +import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.significance.impl.DocumentFrequencyFile; +import com.yahoo.language.significance.impl.SignificanceModelFile; +import com.yahoo.text.Utf8; + +import java.io.IOException; +import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +/** + * @author MariusArhaug + */ +public class SignificanceModelGenerator { + + private final ClientParameters clientParameters; + private final Tokenizer tokenizer; + private final HashMap<String, Long> documentFrequency = new HashMap<>(); + private final Language language; + private final ObjectMapper objectMapper; + private final static JsonFactory 
parserFactory = new JsonFactory(); + + final DocumentTypeManager types = new DocumentTypeManager(); + final DocumentType docType; + private final static String VERSION = "1.0"; + private final static String ID = "1"; + private final static String SIGNIFICANCE_DESCRIPTION = "Significance model for input file"; + private final static String DOC_FREQ_DESCRIPTION = "Document frequency for language"; + + public SignificanceModelGenerator(ClientParameters clientParameters) { + this.clientParameters = clientParameters; + OpenNlpLinguistics openNlpLinguistics = new OpenNlpLinguistics(); + tokenizer = openNlpLinguistics.getTokenizer(); + objectMapper = new ObjectMapper(); + + language = Language.fromLanguageTag(clientParameters.language); + + docType = new DocumentType(language.languageCode().toLowerCase()); + docType.addField(new Field(clientParameters.field, DataType.STRING)); + types.registerDocumentType(docType); + } + + + public void generate() throws IOException { + + Path currentWorkingDir = Paths.get("").toAbsolutePath(); + + final InputStream rawDoc = Files.newInputStream(currentWorkingDir.resolve(clientParameters.inputFile)); + + BufferedReader reader = new BufferedReader(new InputStreamReader(rawDoc)); + + long i = 1; + while (reader.ready()) { + String line = reader.readLine(); + JsonReader jsonReader = new JsonReader(types, new ByteArrayInputStream(Utf8.toBytes(line)), parserFactory); + String wikimediaId = "id:wikimedia:" + language.languageCode() + "::" + i; + + ParsedDocumentOperation operation = jsonReader.readSingleDocumentStreaming(DocumentOperationType.PUT, wikimediaId); + DocumentPut put = (DocumentPut) operation.operation(); + Document document = put.getDocument(); + FieldValue fieldValue = document.getFieldValue(clientParameters.field); + this.handleTokenization(fieldValue.toString()); + i++; + } + + long pageCount = i - 1; + + SignificanceModelFile modelFile; + if (Paths.get(clientParameters.outputFile).toFile().exists()) { + modelFile = 
objectMapper.readValue(new File(clientParameters.outputFile), SignificanceModelFile.class); + + modelFile.addLanguage(clientParameters.language, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency())); + + } else { + HashMap<String, DocumentFrequencyFile> languages = new HashMap<>() {{ + put(clientParameters.language, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency())); + }}; + + modelFile = new SignificanceModelFile(VERSION, ID, SIGNIFICANCE_DESCRIPTION, languages); + } + try { + objectMapper.setVisibility(PropertyAccessor.FIELD, JsonAutoDetect.Visibility.ANY); + ObjectWriter writer = objectMapper.writerWithDefaultPrettyPrinter(); + writer.writeValue(new File(clientParameters.outputFile), modelFile); + } catch (IOException e) { + throw new IllegalStateException("Failed to write model to output file", e); + } + } + + private void handleTokenization(String field) { + var tokens = tokenizer.tokenize(field, language, StemMode.ALL, false); + + Set<String> uniqueWords = StreamSupport.stream(tokens.spliterator(), false) + .filter(t -> t.getType() == TokenType.ALPHABETIC) + .filter(t -> t.getScript() == TokenScript.LATIN) + .map(Token::getTokenString) + .collect(Collectors.toSet()); + + for (String word : uniqueWords) { + if (documentFrequency.containsKey(word)) { + documentFrequency.merge(word, 1L, Long::sum); + } else { + documentFrequency.put(word, 1L); + } + } + } + + public HashMap<String, Long> getFinalDocumentFrequency() { + return documentFrequency.entrySet().stream() + .filter(k -> k.getValue() > 1) + .collect(HashMap::new, (m, v) -> m.put(v.getKey(), v.getValue()), HashMap::putAll); + } +} |