diff options
Diffstat (limited to 'vespaclient-java/src')
3 files changed, 63 insertions, 30 deletions
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java index e368eebefa5..3bd836d7a14 100644 --- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java +++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java @@ -34,7 +34,6 @@ public class Main { } else { System.setProperty("vespa.replace_invalid_unicode", "true"); SignificanceModelGenerator significanceModelGenerator = createSignificanceModelGenerator(params); - significanceModelGenerator.generate(); } } catch (IllegalArgumentException e) { diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java index d620820e14f..3f257fc6df0 100644 --- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java +++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java @@ -44,6 +44,8 @@ import java.util.HashMap; import java.util.Map; import java.util.Set; import java.util.TreeMap; +import java.util.List; +import java.util.Arrays; import java.util.stream.Collectors; import java.util.stream.StreamSupport; @@ -56,7 +58,9 @@ public class SignificanceModelGenerator { private final Tokenizer tokenizer; private final TreeMap<String, Long> documentFrequency = new TreeMap<>(); - private final Language language; + private final List<Language> languages; + + private final Language languageTag; private final ObjectMapper objectMapper; private final static JsonFactory parserFactory = new JsonFactory(); @@ -75,11 +79,16 @@ public class SignificanceModelGenerator { throw new IllegalArgumentException("Output file must have .zst extension when using zst compression"); } - language = Language.fromLanguageTag(clientParameters.language); - if (language == Language.UNKNOWN) { - throw new IllegalArgumentException("Unknown language: " + clientParameters.language); + if (!clientParameters.zstCompression && clientParameters.outputFile.endsWith(".zst")) { + throw new IllegalArgumentException("Output file must not have .zst extension when not using zst compression"); } + this.languages = Arrays.stream(clientParameters.language.split(",")) + .map(Language::fromLanguageTag) + .collect(Collectors.toList()); + + this.languageTag = this.languages.get(0); + OpenNlpLinguistics openNlpLinguistics = new OpenNlpLinguistics(); tokenizer = openNlpLinguistics.getTokenizer(); objectMapper = new ObjectMapper(); @@ -104,7 +113,7 @@ public class SignificanceModelGenerator { while (reader.ready()) { String line = reader.readLine(); JsonReader jsonReader = new JsonReader(types, new ByteArrayInputStream(Utf8.toBytes(line)), parserFactory); - String wikimediaId = "id:wikimedia:" + language.languageCode() + "::" + i; + String wikimediaId = "id:wikimedia:" + languageTag.languageCode() + "::" + i; ParsedDocumentOperation operation = jsonReader.readSingleDocumentStreaming(DocumentOperationType.PUT, wikimediaId); DocumentPut put = (DocumentPut) operation.operation(); @@ -118,6 +127,7 @@ public class SignificanceModelGenerator { SignificanceModelFile modelFile; File outputFile = Paths.get(clientParameters.outputFile).toFile(); + String languagesKey = String.join(",", this.languages.stream().map(Language::languageCode).toList()); if (outputFile.exists()) { InputStream in = outputFile.toString().endsWith(".zst") ? @@ -126,11 +136,11 @@ public class SignificanceModelGenerator { modelFile = objectMapper.readValue(in, SignificanceModelFile.class); - modelFile.addLanguage(clientParameters.language, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency())); + modelFile.addLanguage(languagesKey, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency())); } else { HashMap<String, DocumentFrequencyFile> languages = new HashMap<>() {{ - put(clientParameters.language, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency())); + put(languagesKey, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency())); }}; modelFile = new SignificanceModelFile(VERSION, ID, SIGNIFICANCE_DESCRIPTION + clientParameters.inputFile, languages); @@ -149,7 +159,7 @@ public class SignificanceModelGenerator { } private void handleTokenization(String field) { - var tokens = tokenizer.tokenize(field, language, StemMode.ALL, false); + var tokens = tokenizer.tokenize(field, languageTag, StemMode.ALL, false); Set<String> uniqueWords = StreamSupport.stream(tokens.spliterator(), false) .filter(t -> t.getType() == TokenType.ALPHABETIC) diff --git a/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java index 916fe05ef7b..4791d78f0a2 100644 --- a/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java +++ b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java @@ -48,7 +48,7 @@ public class SignificanceModelGeneratorTest { void testGenerateSimpleFile() throws IOException { String inputPath = "no.jsonl"; String outputPath = "output.json"; - ClientParameters params = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build(); + ClientParameters params = createParameters(inputPath, outputPath, "text", "nb", "nb", "false").build(); SignificanceModelGenerator generator = createSignificanceModelGenerator(params); generator.generate(); @@ -60,9 +60,9 @@ public class SignificanceModelGeneratorTest { HashMap<String, DocumentFrequencyFile> languages = modelFile.languages(); assertEquals(1, languages.size()); - assertTrue(languages.containsKey("NB")); + assertTrue(languages.containsKey("nb")); - DocumentFrequencyFile documentFrequencyFile = languages.get("NB"); + DocumentFrequencyFile documentFrequencyFile = languages.get("nb"); assertEquals(3, documentFrequencyFile.frequencies().get("fra")); assertEquals(3, documentFrequencyFile.frequencies().get("skriveform")); @@ -74,19 +74,17 @@ public class SignificanceModelGeneratorTest { @Test void testGenerateSimpleFileWithZST() throws IOException { String inputPath = "no.jsonl"; - ClientParameters params1 = createParameters(inputPath, "output.json", "text", "NB", "nb", "true").build(); + ClientParameters params1 = createParameters(inputPath, "output.json", "text", "nb", "nb", "true").build(); // Throws exception when outputfile does not have .zst extension when using zst compression assertThrows(IllegalArgumentException.class, () -> createSignificanceModelGenerator(params1)); String outputPath = "output.json.zst"; - ClientParameters params = createParameters(inputPath, outputPath, "text", "NB", "nb", "true").build(); + ClientParameters params = createParameters(inputPath, outputPath, "text", "nb", "nb", "true").build(); SignificanceModelGenerator generator = createSignificanceModelGenerator(params); generator.generate(); - - File outputFile = new File(tempDir.resolve(outputPath ).toString()); assertTrue(outputFile.exists()); @@ -97,9 +95,9 @@ public class SignificanceModelGeneratorTest { HashMap<String, DocumentFrequencyFile> languages = modelFile.languages(); assertEquals(1, languages.size()); - assertTrue(languages.containsKey("NB")); + assertTrue(languages.containsKey("nb")); - DocumentFrequencyFile documentFrequencyFile = languages.get("NB"); + DocumentFrequencyFile documentFrequencyFile = languages.get("nb"); assertEquals(3, documentFrequencyFile.frequencies().get("fra")); assertEquals(3, documentFrequencyFile.frequencies().get("skriveform")); @@ -112,7 +110,7 @@ public class SignificanceModelGeneratorTest { void testGenerateFileWithMultipleLanguages() throws IOException { String inputPath = "no.jsonl"; String outputPath = "output.json"; - ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build(); + ClientParameters params1 = createParameters(inputPath, outputPath, "text", "nb", "nb", "false").build(); SignificanceModelGenerator generator = createSignificanceModelGenerator(params1); generator.generate(); @@ -120,7 +118,7 @@ public class SignificanceModelGeneratorTest { assertTrue(outputFile.exists()); String inputPath2 = "en.jsonl"; - ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "EN", "en", "false").build(); + ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "en", "en", "false").build(); generator = createSignificanceModelGenerator(params2); generator.generate(); @@ -133,11 +131,11 @@ public class SignificanceModelGeneratorTest { assertEquals(2, languages.size()); - assertTrue(languages.containsKey("NB")); - assertTrue(languages.containsKey("EN")); + assertTrue(languages.containsKey("nb")); + assertTrue(languages.containsKey("en")); - DocumentFrequencyFile nb = languages.get("NB"); - DocumentFrequencyFile en = languages.get("EN"); + DocumentFrequencyFile nb = languages.get("nb"); + DocumentFrequencyFile en = languages.get("en"); assertEquals(3, nb.documentCount()); assertEquals(3, en.documentCount()); @@ -154,7 +152,7 @@ public class SignificanceModelGeneratorTest { void testOverwriteExistingDocumentFrequencyLanguage() throws IOException { String inputPath = "no.jsonl"; String outputPath = "output.json"; - ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build(); + ClientParameters params1 = createParameters(inputPath, outputPath, "text", "nb", "nb", "false").build(); SignificanceModelGenerator generator = createSignificanceModelGenerator(params1); generator.generate(); @@ -166,16 +164,16 @@ public class SignificanceModelGeneratorTest { HashMap<String, DocumentFrequencyFile> oldLanguages = preUpdatedFile.languages(); assertEquals(1, oldLanguages.size()); - assertTrue(oldLanguages.containsKey("NB")); + assertTrue(oldLanguages.containsKey("nb")); - DocumentFrequencyFile oldDf = oldLanguages.get("NB"); + DocumentFrequencyFile oldDf = oldLanguages.get("nb"); assertEquals(3, oldDf.frequencies().get("fra")); assertEquals(3, oldDf.frequencies().get("skriveform")); assertFalse(oldDf.frequencies().containsKey("nytt")); String inputPath2 = "no_2.jsonl"; - ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "NB", "nb", "false").build(); + ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "nb", "nb", "false").build(); SignificanceModelGenerator generator2 = createSignificanceModelGenerator(params2); generator2.generate(); @@ -188,13 +186,39 @@ public class SignificanceModelGeneratorTest { assertEquals(1, languages.size()); - assertTrue(languages.containsKey("NB")); + assertTrue(languages.containsKey("nb")); - DocumentFrequencyFile df = languages.get("NB"); + DocumentFrequencyFile df = languages.get("nb"); assertEquals(2, df.frequencies().get("fra")); assertEquals(3, df.frequencies().get("skriveform")); assertTrue(df.frequencies().containsKey("nytt")); assertEquals(2, df.frequencies().get("nytt")); } + + @Test + void testGenerateFileWithMultipleLanguagesForSingleDocumentFrequency() throws IOException { + String inputPath = "no.jsonl"; + String outputPath = "output.json"; + ClientParameters params = createParameters(inputPath, outputPath, "text", "nb,un", "nb", "false").build(); + SignificanceModelGenerator generator = createSignificanceModelGenerator(params); + generator.generate(); + + File outputFile = new File(tempDir.resolve(outputPath).toString()); + assertTrue(outputFile.exists()); + + SignificanceModelFile modelFile = objectMapper.readValue(outputFile, SignificanceModelFile.class); + + HashMap<String, DocumentFrequencyFile> languages = modelFile.languages(); + assertEquals(1, languages.size()); + + assertTrue(languages.containsKey("nb,un")); + + DocumentFrequencyFile documentFrequencyFile = languages.get("nb,un"); + + assertEquals(3, documentFrequencyFile.frequencies().get("fra")); + assertEquals(3, documentFrequencyFile.frequencies().get("skriveform")); + assertEquals(3, documentFrequencyFile.frequencies().get("kategori")); + assertEquals(3, documentFrequencyFile.frequencies().get("eldr")); + } } |