about | summary | refs | log | tree | commit | diff | stats
path: root/vespaclient-java/src/main/java/com/yahoo
diff options
context:
space:
mode:
Diffstat (limited to 'vespaclient-java/src/main/java/com/yahoo')
-rw-r--r-- vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java 1
-rw-r--r-- vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java 26
2 files changed, 18 insertions, 9 deletions
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
index e368eebefa5..3bd836d7a14 100644
--- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/Main.java
@@ -34,7 +34,6 @@ public class Main {
} else {
System.setProperty("vespa.replace_invalid_unicode", "true");
SignificanceModelGenerator significanceModelGenerator = createSignificanceModelGenerator(params);
-
significanceModelGenerator.generate();
}
} catch (IllegalArgumentException e) {
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
index d620820e14f..3f257fc6df0 100644
--- a/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
+++ b/vespaclient-java/src/main/java/com/yahoo/vespasignificance/SignificanceModelGenerator.java
@@ -44,6 +44,8 @@ import java.util.HashMap;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
+import java.util.List;
+import java.util.Arrays;
import java.util.stream.Collectors;
import java.util.stream.StreamSupport;
@@ -56,7 +58,9 @@ public class SignificanceModelGenerator {
private final Tokenizer tokenizer;
private final TreeMap<String, Long> documentFrequency = new TreeMap<>();
- private final Language language;
+ private final List<Language> languages;
+
+ private final Language languageTag;
private final ObjectMapper objectMapper;
private final static JsonFactory parserFactory = new JsonFactory();
@@ -75,11 +79,16 @@ public class SignificanceModelGenerator {
throw new IllegalArgumentException("Output file must have .zst extension when using zst compression");
}
- language = Language.fromLanguageTag(clientParameters.language);
- if (language == Language.UNKNOWN) {
- throw new IllegalArgumentException("Unknown language: " + clientParameters.language);
+ if (!clientParameters.zstCompression && clientParameters.outputFile.endsWith(".zst")) {
+ throw new IllegalArgumentException("Output file must not have .zst extension when not using zst compression");
}
+ this.languages = Arrays.stream(clientParameters.language.split(","))
+ .map(Language::fromLanguageTag)
+ .collect(Collectors.toList());
+
+ this.languageTag = this.languages.get(0);
+
OpenNlpLinguistics openNlpLinguistics = new OpenNlpLinguistics();
tokenizer = openNlpLinguistics.getTokenizer();
objectMapper = new ObjectMapper();
@@ -104,7 +113,7 @@ public class SignificanceModelGenerator {
while (reader.ready()) {
String line = reader.readLine();
JsonReader jsonReader = new JsonReader(types, new ByteArrayInputStream(Utf8.toBytes(line)), parserFactory);
- String wikimediaId = "id:wikimedia:" + language.languageCode() + "::" + i;
+ String wikimediaId = "id:wikimedia:" + languageTag.languageCode() + "::" + i;
ParsedDocumentOperation operation = jsonReader.readSingleDocumentStreaming(DocumentOperationType.PUT, wikimediaId);
DocumentPut put = (DocumentPut) operation.operation();
@@ -118,6 +127,7 @@ public class SignificanceModelGenerator {
SignificanceModelFile modelFile;
File outputFile = Paths.get(clientParameters.outputFile).toFile();
+ String languagesKey = String.join(",", this.languages.stream().map(Language::languageCode).toList());
if (outputFile.exists()) {
InputStream in = outputFile.toString().endsWith(".zst") ?
@@ -126,11 +136,11 @@ public class SignificanceModelGenerator {
modelFile = objectMapper.readValue(in, SignificanceModelFile.class);
- modelFile.addLanguage(clientParameters.language, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency()));
+ modelFile.addLanguage(languagesKey, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency()));
} else {
HashMap<String, DocumentFrequencyFile> languages = new HashMap<>() {{
- put(clientParameters.language, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency()));
+ put(languagesKey, new DocumentFrequencyFile(DOC_FREQ_DESCRIPTION, pageCount, getFinalDocumentFrequency()));
}};
modelFile = new SignificanceModelFile(VERSION, ID, SIGNIFICANCE_DESCRIPTION + clientParameters.inputFile, languages);
@@ -149,7 +159,7 @@ public class SignificanceModelGenerator {
}
private void handleTokenization(String field) {
- var tokens = tokenizer.tokenize(field, language, StemMode.ALL, false);
+ var tokens = tokenizer.tokenize(field, languageTag, StemMode.ALL, false);
Set<String> uniqueWords = StreamSupport.stream(tokens.spliterator(), false)
.filter(t -> t.getType() == TokenType.ALPHABETIC)