about | summary | refs | log | tree | commit | diff | stats
path: root/vespaclient-java/src
diff options
context:
space:
mode:
Diffstat (limited to 'vespaclient-java/src')
-rw-r--r-- vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java 1
-rw-r--r-- vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java 26
-rw-r--r-- vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java 66
3 files changed, 63 insertions, 30 deletions
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
index e368eebefa5..3bd836d7a14 100644
--- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
@@ -34,7 +34,6 @@ public class Main {
} else {
System.setProperty("vespa.replace_invalid_unicode", "true");
SignificanceModelGenerator significanceModelGenerator = createSignificanceModelGenerator(params);
-
significanceModelGenerator.generate();
}
} catch (IllegalArgumentException e) {
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
index d620820e14f..3f257fc6df0 100644
--- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
@@ -44,6 +44,8 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
+import java.util.List;
+import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
@@ -56,7 +58,9 @@ public class SignificanceModelGenerator {
private final Tokenizer tokenizer;
private final TreeMap<String, Long> documentFrequency = new TreeMap<>();
- private final Language language;
+ private final List<Language> languages;
+
+ private final Language languageTag;
private final ObjectMapper objectMapper;
private final static JsonFactory parserFactory = new JsonFactory();
@@ -75,11 +79,16 @@ public class SignificanceModelGenerator {
throw new IllegalArgumentException("Output file must have .zst extension when using zst compression");
}
- language = Language.fromLanguageTag(clientParameters.language);
- if (language == Language.UNKNOWN) {
- throw new IllegalArgumentException("Unknown language: " + clientParameters.language);
+ if (!clientParameters.zstCompression && clientParameters.outputFile.endsWith(".zst")) {
+ throw new IllegalArgumentException("Output file must not have .zst extension when not using zst compression");
}
+ this.languages = Arrays.stream(clientParameters.language.split(","))
+ .map(Language::fromLanguageTag)
+ .collect(Collectors.toList());
+
+ this.languageTag = this.languages.get(0);
+
OpenNlpLinguistics openNlpLinguistics = new OpenNlpLinguistics();
tokenizer = openNlpLinguistics.getTokenizer();
objectMapper = new ObjectMapper();
@@ -104,7 +113,7 @@ public class SignificanceModelGenerator {
while (reader.ready()) {
String line = reader.readLine();
JsonReader jsonReader = new JsonReader(types, new ByteArrayInputStream(Utf8.toBytes(line)), parserFactory);
- String wikimediaId = "id:wikimedia:" + language.languageCode() + "::" + i;
+ String wikimediaId = "id:wikimedia:" + languageTag.languageCode() + "::" + i;
ParsedDocumentOperation operation = jsonReader.readSingleDocumentStreaming(DocumentOperationType.PUT, wikimediaId);
DocumentPut put = (DocumentPut) operation.operation();
@@ -118,6 +127,7 @@ public class SignificanceModelGenerator {
SignificanceModelFile modelFile;
File outputFile = Paths.get(clientParameters.outputFile).toFile();
+ String languagesKey = String.join(",", this.languages.stream().map(Language::languageCode).toList());
if (outputFile.exists()) {
InputStream in = outputFile.toString().endsWith(".zst") ?
@@ -126,11 +136,11 @@ public class SignificanceModelGenerator {
modelFile = objectMapper.readValue(in, SignificanceModelFile.class);
- modelFile.addLanguage(clientParameters.language, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency()));
+ modelFile.addLanguage(languagesKey, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency()));
} else {
HashMap<String, DocumentFrequencyFile> languages = new HashMap<>() {{
- put(clientParameters.language, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency()));
+ put(languagesKey, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency()));
}};
modelFile = new SignificanceModelFile(VERSION, ID, SIGNIFICANCE_DESCRIPTION + clientParameters.inputFile, languages);
@@ -149,7 +159,7 @@ public class SignificanceModelGenerator {
}
private void handleTokenization(String field) {
- var tokens = tokenizer.tokenize(field, language, StemMode.ALL, false);
+ var tokens = tokenizer.tokenize(field, languageTag, StemMode.ALL, false);
Set<String> uniqueWords = StreamSupport.stream(tokens.spliterator(), false)
.filter(t -> t.getType() == TokenType.ALPHABETIC)
diff --git a/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java
index 916fe05ef7b..4791d78f0a2 100644
--- a/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java
+++ b/vespaclient-java/src/test/java/com/yahoo/vespasignificance/SignificanceModelGeneratorTest.java
@@ -48,7 +48,7 @@ public class SignificanceModelGeneratorTest {
void testGenerateSimpleFile() throws IOException {
String inputPath = "no.jsonl";
String outputPath = "output.json";
- ClientParameters params = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build();
+ ClientParameters params = createParameters(inputPath, outputPath, "text", "nb", "nb", "false").build();
SignificanceModelGenerator generator = createSignificanceModelGenerator(params);
generator.generate();
@@ -60,9 +60,9 @@ public class SignificanceModelGeneratorTest {
HashMap<String, DocumentFrequencyFile> languages = modelFile.languages();
assertEquals(1, languages.size());
- assertTrue(languages.containsKey("NB"));
+ assertTrue(languages.containsKey("nb"));
- DocumentFrequencyFile documentFrequencyFile = languages.get("NB");
+ DocumentFrequencyFile documentFrequencyFile = languages.get("nb");
assertEquals(3, documentFrequencyFile.frequencies().get("fra"));
assertEquals(3, documentFrequencyFile.frequencies().get("skriveform"));
@@ -74,19 +74,17 @@ public class SignificanceModelGeneratorTest {
@Test
void testGenerateSimpleFileWithZST() throws IOException {
String inputPath = "no.jsonl";
- ClientParameters params1 = createParameters(inputPath, "output.json", "text", "NB", "nb", "true").build();
+ ClientParameters params1 = createParameters(inputPath, "output.json", "text", "nb", "nb", "true").build();
// Throws exception when outputfile does not have .zst extension when using zst compression
assertThrows(IllegalArgumentException.class, () -> createSignificanceModelGenerator(params1));
String outputPath = "output.json.zst";
- ClientParameters params = createParameters(inputPath, outputPath, "text", "NB", "nb", "true").build();
+ ClientParameters params = createParameters(inputPath, outputPath, "text", "nb", "nb", "true").build();
SignificanceModelGenerator generator = createSignificanceModelGenerator(params);
generator.generate();
-
-
File outputFile = new File(tempDir.resolve(outputPath ).toString());
assertTrue(outputFile.exists());
@@ -97,9 +95,9 @@ public class SignificanceModelGeneratorTest {
HashMap<String, DocumentFrequencyFile> languages = modelFile.languages();
assertEquals(1, languages.size());
- assertTrue(languages.containsKey("NB"));
+ assertTrue(languages.containsKey("nb"));
- DocumentFrequencyFile documentFrequencyFile = languages.get("NB");
+ DocumentFrequencyFile documentFrequencyFile = languages.get("nb");
assertEquals(3, documentFrequencyFile.frequencies().get("fra"));
assertEquals(3, documentFrequencyFile.frequencies().get("skriveform"));
@@ -112,7 +110,7 @@ public class SignificanceModelGeneratorTest {
void testGenerateFileWithMultipleLanguages() throws IOException {
String inputPath = "no.jsonl";
String outputPath = "output.json";
- ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build();
+ ClientParameters params1 = createParameters(inputPath, outputPath, "text", "nb", "nb", "false").build();
SignificanceModelGenerator generator = createSignificanceModelGenerator(params1);
generator.generate();
@@ -120,7 +118,7 @@ public class SignificanceModelGeneratorTest {
assertTrue(outputFile.exists());
String inputPath2 = "en.jsonl";
- ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "EN", "en", "false").build();
+ ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "en", "en", "false").build();
generator = createSignificanceModelGenerator(params2);
generator.generate();
@@ -133,11 +131,11 @@ public class SignificanceModelGeneratorTest {
assertEquals(2, languages.size());
- assertTrue(languages.containsKey("NB"));
- assertTrue(languages.containsKey("EN"));
+ assertTrue(languages.containsKey("nb"));
+ assertTrue(languages.containsKey("en"));
- DocumentFrequencyFile nb = languages.get("NB");
- DocumentFrequencyFile en = languages.get("EN");
+ DocumentFrequencyFile nb = languages.get("nb");
+ DocumentFrequencyFile en = languages.get("en");
assertEquals(3, nb.documentCount());
assertEquals(3, en.documentCount());
@@ -154,7 +152,7 @@ public class SignificanceModelGeneratorTest {
void testOverwriteExistingDocumentFrequencyLanguage() throws IOException {
String inputPath = "no.jsonl";
String outputPath = "output.json";
- ClientParameters params1 = createParameters(inputPath, outputPath, "text", "NB", "nb", "false").build();
+ ClientParameters params1 = createParameters(inputPath, outputPath, "text", "nb", "nb", "false").build();
SignificanceModelGenerator generator = createSignificanceModelGenerator(params1);
generator.generate();
@@ -166,16 +164,16 @@ public class SignificanceModelGeneratorTest {
HashMap<String, DocumentFrequencyFile> oldLanguages = preUpdatedFile.languages();
assertEquals(1, oldLanguages.size());
- assertTrue(oldLanguages.containsKey("NB"));
+ assertTrue(oldLanguages.containsKey("nb"));
- DocumentFrequencyFile oldDf = oldLanguages.get("NB");
+ DocumentFrequencyFile oldDf = oldLanguages.get("nb");
assertEquals(3, oldDf.frequencies().get("fra"));
assertEquals(3, oldDf.frequencies().get("skriveform"));
assertFalse(oldDf.frequencies().containsKey("nytt"));
String inputPath2 = "no_2.jsonl";
- ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "NB", "nb", "false").build();
+ ClientParameters params2 = createParameters(inputPath2, outputPath, "text", "nb", "nb", "false").build();
SignificanceModelGenerator generator2 = createSignificanceModelGenerator(params2);
generator2.generate();
@@ -188,13 +186,39 @@ public class SignificanceModelGeneratorTest {
assertEquals(1, languages.size());
- assertTrue(languages.containsKey("NB"));
+ assertTrue(languages.containsKey("nb"));
- DocumentFrequencyFile df = languages.get("NB");
+ DocumentFrequencyFile df = languages.get("nb");
assertEquals(2, df.frequencies().get("fra"));
assertEquals(3, df.frequencies().get("skriveform"));
assertTrue(df.frequencies().containsKey("nytt"));
assertEquals(2, df.frequencies().get("nytt"));
}
+
+ @Test
+ void testGenerateFileWithMultipleLanguagesForSingleDocumentFrequency() throws IOException {
+ String inputPath = "no.jsonl";
+ String outputPath = "output.json";
+ ClientParameters params = createParameters(inputPath, outputPath, "text", "nb,un", "nb", "false").build();
+ SignificanceModelGenerator generator = createSignificanceModelGenerator(params);
+ generator.generate();
+
+ File outputFile = new File(tempDir.resolve(outputPath).toString());
+ assertTrue(outputFile.exists());
+
+ SignificanceModelFile modelFile = objectMapper.readValue(outputFile, SignificanceModelFile.class);
+
+ HashMap<String, DocumentFrequencyFile> languages = modelFile.languages();
+ assertEquals(1, languages.size());
+
+ assertTrue(languages.containsKey("nb,un"));
+
+ DocumentFrequencyFile documentFrequencyFile = languages.get("nb,un");
+
+ assertEquals(3, documentFrequencyFile.frequencies().get("fra"));
+ assertEquals(3, documentFrequencyFile.frequencies().get("skriveform"));
+ assertEquals(3, documentFrequencyFile.frequencies().get("kategori"));
+ assertEquals(3, documentFrequencyFile.frequencies().get("eldr"));
+ }
}