12 files changed, 250 insertions, 126 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
index 7e7ee44bf74..b82450bc443 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/Segmenter.java
@@ -6,22 +6,23 @@ import com.yahoo.language.Language;
 import java.util.List;
 
 /**
- * Interface providing segmentation, i.e. splitting of CJK character blocks into separate tokens. This is primarily a
- * convenience feature for users who don't need full tokenization (or who use a separate tokenizer and only need CJK
- * processing).
+ * A segmenter splits a string into separate segments (such as words) without applying any further
+ * processing (such as stemming) on each segment.
+ *
+ * This is useful when token processing should be done separately from segmentation, such as in
+ * linguistic processing of queries, where token processing depends on field settings in a specific
+ * schema, while segmentation only depends on language and happens before schema-specific processing.
  *
  * @author Mathias Mølster Lidal
  */
 public interface Segmenter {
 
     /**
-     * Split input-string into tokens, and returned a list of tokens in unprocessed form (i.e. lowercased, normalized
-     * and stemmed if applicable, see @link{StemMode} for list of stemming options). It is assumed that the input only
-     * contains word-characters, any punctuation and spacing tokens will be removed.
+     * Returns a list of segments produced from a string.
      *
-     * @param input the text to segment.
-     * @param language language of input text.
-     * @return the list of segments.
+     * @param input the text to segment
+     * @param language the language of the input text
+     * @return the resulting list of segments
      * @throws ProcessingException if an exception is encountered during processing
      */
     List<String> segment(String input, Language language);
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
index a9f1e48af62..c8a31e1892c 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/SignificanceModel.java
@@ -9,4 +9,11 @@ import com.yahoo.api.annotations.Beta;
 @Beta
 public interface SignificanceModel {
     DocumentFrequency documentFrequency(String word);
+
+    /**
+     * Returns the id of this model.
+     *
+     * @return the model id, or null if the implementation was created without one
+     */
+    String getId();
 }
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
index 7ed6f442610..6e024c3025e 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
@@ -1,15 +1,15 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.language.significance.impl;
 
-import com.fasterxml.jackson.annotation.JsonCreator;
-import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
-import com.fasterxml.jackson.annotation.JsonProperty;
 import com.fasterxml.jackson.databind.ObjectMapper;
 import com.yahoo.language.significance.DocumentFrequency;
 import com.yahoo.language.significance.SignificanceModel;
 
+import java.io.IOException;
 import java.nio.file.Path;
 import java.util.HashMap;
+import java.util.Map;
+import java.util.TreeMap;
 
 /**
  *
@@ -17,71 +17,38 @@ import java.util.HashMap;
  */
 public class DefaultSignificanceModel implements SignificanceModel {
     private final long corpusSize;
-    private final HashMap<String, Long> frequencies;
-    private final Path path;
+    private final Map<String, Long> frequencies;
 
-    @JsonIgnoreProperties(ignoreUnknown = true)
-    public static class SignificanceModelFile {
-        private final String version;
-        private final String id;
-        private final String description;
-        private final long corpusSize;
-        private final String language;
-
-        private final long wordCount;
-        private final HashMap<String, Long> frequencies;
-
-        @JsonCreator
-        public SignificanceModelFile(
-                @JsonProperty("version") String version,
-                @JsonProperty("id") String id,
-                @JsonProperty("description") String description,
-                @JsonProperty("corpus-size") long corpusSize,
-                @JsonProperty("language") String language,
-                @JsonProperty("word-count") long wordCount,
-                @JsonProperty("frequencies") HashMap<String, Long> frequencies) {
-            this.version = version;
-            this.id = id;
-            this.description = description;
-            this.corpusSize = corpusSize;
-            this.language = language;
-            this.wordCount = wordCount;
-            this.frequencies = frequencies;
-        }
-
-        @JsonProperty("version")
-        public String version() { return version; }
-
-        @JsonProperty("id")
-        public String id() { return id; }
-
-        @JsonProperty("description")
-        public String description() { return description; }
-
-        @JsonProperty("corpus-size")
-        public long corpusSize() { return corpusSize; }
-
-        @JsonProperty("language")
-        public String language() { return language; }
-
-        @JsonProperty("frequencies")
-        public HashMap<String, Long> frequencies() { return frequencies; }
-
-        @JsonProperty("word-count")
-        public long wordCount() { return wordCount; }
+    /** The id of the model file this model came from, or null when created from a path. */
+    private final String id;
 
+    /**
+     * Creates a model from one already parsed language entry of a significance model file.
+     *
+     * @param file the document count and per-word document frequencies of one language
+     * @param id the id of the significance model file the entry belongs to
+     */
+    public DefaultSignificanceModel(DocumentFrequencyFile file, String id) {
+        this.frequencies = file.frequencies();
+        this.corpusSize = file.documentCount();
+        this.id = id;
     }
 
+    /**
+     * Creates a model by parsing a JSON file in the {@link DocumentFrequencyFile} format.
+     * No id is read from the file, so {@link #getId()} returns null for this model.
+     *
+     * @param path the JSON file to read
+     * @throws RuntimeException if the file cannot be read or parsed
+     */
     public DefaultSignificanceModel(Path path) {
-        this.path = path;
-
+        this.id = null;
         ObjectMapper objectMapper = new ObjectMapper();
-
         try {
-            SignificanceModelFile model = objectMapper.readValue(this.path.toFile(), SignificanceModelFile.class);
-            this.corpusSize = model.corpusSize;
-            this.frequencies = model.frequencies;
-        } catch (Exception e) {
+            var file = objectMapper.readValue(path.toFile(), DocumentFrequencyFile.class);
+            this.frequencies = file.frequencies();
+            this.corpusSize = file.documentCount();
+        } catch (IOException e) {
             throw new RuntimeException("Failed to load model from " + path, e);
         }
     }
@@ -93,4 +45,10 @@ public class DefaultSignificanceModel implements SignificanceModel {
         }
         return new DocumentFrequency(1, corpusSize);
     }
+
+    @Override
+    public String getId() {
+        return this.id;
+    }
+
 }
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
index 1be1d3f13b5..72874c15d9e 100644
--- a/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModelRegistry.java
@@ -1,20 +1,21 @@
 // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.language.significance.impl;
 
+import com.fasterxml.jackson.databind.ObjectMapper;
 import com.yahoo.component.annotation.Inject;
 import com.yahoo.language.Language;
 import com.yahoo.language.significance.SignificanceModel;
 import com.yahoo.language.significance.SignificanceModelRegistry;
 import com.yahoo.search.significance.config.SignificanceConfig;
 
+import java.io.IOException;
+import java.io.UncheckedIOException;
 import java.nio.file.Path;
 import java.util.EnumMap;
-import java.util.HashMap;
+import java.util.List;
 import java.util.Map;
 import java.util.Optional;
-import java.util.function.Supplier;
 
-import static com.yahoo.yolean.Exceptions.uncheck;
 /**
  * Default implementation of {@link SignificanceModelRegistry}.
- * This implementation loads models lazily and caches them.
+ * This implementation loads all models eagerly when constructed and keeps them in memory.
@@ -24,24 +25,55 @@ import static com.yahoo.yolean.Exceptions.uncheck;
 public class DefaultSignificanceModelRegistry implements SignificanceModelRegistry {
 
+    /** Shared JSON mapper, reused for every model file instead of being rebuilt per file. */
+    private static final ObjectMapper jsonMapper = new ObjectMapper();
+
     private final Map<Language, SignificanceModel> models;
+
+    /**
+     * Creates a registry and loads every model file listed in the given config.
+     *
+     * @param cfg the config whose model paths are loaded, in config order
+     */
     @Inject
-    public DefaultSignificanceModelRegistry(SignificanceConfig cfg) { this(new Builder(cfg)); }
-    private DefaultSignificanceModelRegistry(Builder b) {
+    public DefaultSignificanceModelRegistry(SignificanceConfig cfg) {
         this.models = new EnumMap<>(Language.class);
-        b.models.forEach((language, path) -> {
-            models.put(language,
-                    uncheck(() -> new DefaultSignificanceModel(path)));
-        });
+        for (var model : cfg.model()) {
+            addModel(model.path());
+        }
     }
 
+    /**
+     * Creates a registry and loads the given model files in list order.
+     *
+     * @param models paths to significance model files in JSON format
+     */
-    public DefaultSignificanceModelRegistry(HashMap<Language, Path> map) {
+    public DefaultSignificanceModelRegistry(List<Path> models) {
         this.models = new EnumMap<>(Language.class);
-        map.forEach((language, path) -> {
-            models.put(language,
-                    uncheck(() -> new DefaultSignificanceModel(path)));
-        });
+        for (var path : models) {
+            addModel(path);
+        }
     }
 
+    /**
+     * Loads a significance model file and registers one model per language it contains.
+     * A language already present in this registry is replaced, so for files sharing a
+     * language the one added last is the one returned by {@link #getModel(Language)}.
+     *
+     * @param path the significance model file to read
+     * @throws UncheckedIOException if the file cannot be read or parsed
+     */
+    public void addModel(Path path) {
+        try {
+            SignificanceModelFile file = jsonMapper.readValue(path.toFile(), SignificanceModelFile.class);
+            for (var pair : file.languages().entrySet()) {
+                this.models.put(
+                        Language.fromLanguageTag(pair.getKey()),
+                        new DefaultSignificanceModel(pair.getValue(), file.id()));
+            }
+        } catch (IOException e) {
+            throw new UncheckedIOException("Failed to load model from " + path, e);
+        }
+    }
 
     @Override
     public Optional<SignificanceModel> getModel(Language language) {
@@ -51,20 +63,4 @@ public class DefaultSignificanceModelRegistry implements SignificanceModelRegist
         }
         return Optional.of(models.get(language));
     }
-
-
-    public static final class Builder {
-        private final Map<Language, Path> models = new EnumMap<>(Language.class);
-
-        public Builder() {}
-        public Builder(SignificanceConfig cfg) {
-            for (var model : cfg.model()) {
-                addModel(Language.fromLanguageTag(model.language()), model.path());
-            }
-        }
-
-        public Builder addModel(Language lang, Path path) { models.put(lang, path); return this; }
-        public DefaultSignificanceModelRegistry build() { return new DefaultSignificanceModelRegistry(this); }
-    }
-
 }
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
new file mode 100644
index 00000000000..34e73e1b547
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/DocumentFrequencyFile.java
@@ -0,0 +1,58 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.TreeMap;
+
+/**
+ * JSON mapping of the document frequencies of a single language: a free-text description,
+ * a document count and the per-word document frequencies.
+ * Unknown JSON properties are ignored when reading, and null values are left out when writing.
+ *
+ * @author MariusArhaug
+ */
+@JsonIgnoreProperties(ignoreUnknown = true)
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class DocumentFrequencyFile {
+
+    private final String description;
+    private final long documentCount;
+    private final Map<String, Long> frequencies;
+
+    /**
+     * Creates a document frequency entry.
+     *
+     * @param description the "description" property, may be null
+     * @param documentCount the "document-count" property
+     * @param frequencies the "document-frequencies" property, mapping each word to its frequency;
+     *                    null (property absent) is treated as an empty map
+     */
+    @JsonCreator
+    public DocumentFrequencyFile(
+            @JsonProperty("description") String description,
+            @JsonProperty("document-count") long documentCount,
+            @JsonProperty("document-frequencies") Map<String, Long> frequencies) {
+        this.description = description;
+        this.documentCount = documentCount;
+        // Jackson passes null for an absent property; normalize so readers never see a null map
+        this.frequencies = frequencies != null ? frequencies : new HashMap<>();
+    }
+
+    /** Returns the "description" property, or null if none was given. */
+    @JsonProperty("description")
+    public String description() { return description; }
+
+    /** Returns the "document-count" property. */
+    @JsonProperty("document-count")
+    public long documentCount() { return documentCount; }
+
+    /** Returns the map from word to document frequency; never null. */
+    @JsonProperty("document-frequencies")
+    public Map<String, Long> frequencies() { return frequencies; }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
new file mode 100644
index 00000000000..94030108671
--- /dev/null
+++ b/linguistics/src/main/java/com/yahoo/language/significance/impl/SignificanceModelFile.java
@@ -0,0 +1,75 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.significance.impl;
+
+import com.fasterxml.jackson.annotation.JsonCreator;
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import com.fasterxml.jackson.annotation.JsonProperty;
+
+import java.util.HashMap;
+import java.util.List;
+
+/**
+ * JSON mapping of a significance model file: a version, an id, a description and
+ * one {@link DocumentFrequencyFile} per language, keyed by language tag.
+ * Unknown JSON properties are ignored when reading.
+ *
+ * @author MariusArhaug
+ */
+@JsonIgnoreProperties(ignoreUnknown = true)
+public class SignificanceModelFile {
+
+    private final String version;
+    private final String id;
+    private final String description;
+    private final HashMap<String, DocumentFrequencyFile> languages;
+
+    /**
+     * Creates a significance model file.
+     *
+     * @param version the "version" property
+     * @param id the "id" property
+     * @param description the "description" property
+     * @param languages the "languages" property, mapping language tags to their frequencies;
+     *                  null (property absent) is treated as an empty map
+     */
+    @JsonCreator
+    public SignificanceModelFile(
+            @JsonProperty("version") String version,
+            @JsonProperty("id") String id,
+            @JsonProperty("description") String description,
+            @JsonProperty("languages") HashMap<String, DocumentFrequencyFile> languages) {
+        this.version = version;
+        this.id = id;
+        this.description = description;
+        // Jackson passes null for an absent property; normalize so that languages() can be
+        // iterated and addLanguage can be called on any instance
+        this.languages = languages != null ? languages : new HashMap<>();
+    }
+
+    /** Returns the "version" property. */
+    @JsonProperty("version")
+    public String version() { return version; }
+
+    /** Returns the "id" property. */
+    @JsonProperty("id")
+    public String id() { return id; }
+
+    /** Returns the "description" property. */
+    @JsonProperty("description")
+    public String description() { return description; }
+
+    /** Returns the map from language tag to the frequencies of that language; never null. */
+    @JsonProperty("languages")
+    public HashMap<String, DocumentFrequencyFile> languages() { return languages; }
+
+    /**
+     * Adds the frequencies of a language, replacing any existing entry for that language tag.
+     *
+     * @param language the language tag to store the entry under
+     * @param documentFrequencyFile the frequencies of that language
+     */
+    public void addLanguage(String language, DocumentFrequencyFile documentFrequencyFile) {
+        languages.put(language, documentFrequencyFile);
+    }
+}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
index f8d0dc83abc..662d4a807c5 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleDetector.java
@@ -47,7 +47,7 @@ public class SimpleDetector implements Detector {
     }
 
     public Language guessLanguage(String input) {
-        if (input == null || input.length() == 0) {
+        if (input == null || input.isEmpty()) {
             return Language.UNKNOWN;
         }
 
diff --git a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
index d4849571b5e..e8594885b9e 100644
--- a/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
+++ b/linguistics/src/test/java/com/yahoo/language/significance/DefaultSignificanceModelRegistryTest.java
@@ -6,7 +6,8 @@ import com.yahoo.language.significance.impl.DefaultSignificanceModelRegistry;
 import org.junit.Test;
 
 import java.nio.file.Path;
-import java.util.HashMap;
+import java.util.ArrayList;
+import java.util.List;
 
 import static org.junit.jupiter.api.Assertions.*;
 
@@ -18,10 +19,10 @@ public class DefaultSignificanceModelRegistryTest {
 
     @Test
     public void testDefaultSignificanceModelRegistry() {
-        HashMap<Language, Path> models = new HashMap<>();
+        List<Path> models = new ArrayList<>();
 
-        models.put(Language.ENGLISH, Path.of("src/test/models/en.json"));
-        models.put(Language.NORWEGIAN_BOKMAL, Path.of("src/test/models/no.json"));
+        models.add(Path.of("src/test/models/docv1.json"));
+        models.add(Path.of("src/test/models/docv2.json"));
 
         DefaultSignificanceModelRegistry defaultSignificanceModelRegistry = new DefaultSignificanceModelRegistry(models);
 
@@ -39,6 +40,47 @@ public class DefaultSignificanceModelRegistryTest {
         assertNotNull(englishModel);
         assertNotNull(norwegianModel);
 
+        assertEquals("test::2", englishModel.getId());
+        assertEquals("test::2", norwegianModel.getId());
+
+        assertEquals(4, englishModel.documentFrequency("test").frequency());
+        assertEquals(14, englishModel.documentFrequency("test").corpusSize());
+
+        assertEquals(3, norwegianModel.documentFrequency("nei").frequency());
+        assertEquals(20, norwegianModel.documentFrequency("nei").corpusSize());
+
+        assertEquals(1, norwegianModel.documentFrequency("non-existent-word").frequency());
+        assertEquals(20, norwegianModel.documentFrequency("non-existent-word").corpusSize());
+
+    }
+
+    @Test
+    public void testDefaultSignificanceModelRegistryInOppositeOrder() {
+
+        List<Path> models = new ArrayList<>();
+
+        // docv1 is added last, so its English model replaces the one from docv2;
+        // Norwegian only exists in docv2 and is unaffected by the order
+        models.add(Path.of("src/test/models/docv2.json"));
+        models.add(Path.of("src/test/models/docv1.json"));
+
+        DefaultSignificanceModelRegistry defaultSignificanceModelRegistry = new DefaultSignificanceModelRegistry(models);
+
+        var optionalEnglishModel = defaultSignificanceModelRegistry.getModel(Language.ENGLISH);
+        var optionalNorwegianModel = defaultSignificanceModelRegistry.getModel(Language.NORWEGIAN_BOKMAL);
+
+        assertTrue(optionalEnglishModel.isPresent());
+        assertTrue(optionalNorwegianModel.isPresent());
+
+        var englishModel = optionalEnglishModel.get();
+        var norwegianModel = optionalNorwegianModel.get();
+
+        assertNotNull(englishModel);
+        assertNotNull(norwegianModel);
+
+        assertEquals("test::1", englishModel.getId());
+        assertEquals("test::2", norwegianModel.getId());
+
         assertEquals(2, englishModel.documentFrequency("test").frequency());
         assertEquals(10, englishModel.documentFrequency("test").corpusSize());
 
@@ -47,6 +87,5 @@ public class DefaultSignificanceModelRegistryTest {
 
         assertEquals(1, norwegianModel.documentFrequency("non-existent-word").frequency());
         assertEquals(20, norwegianModel.documentFrequency("non-existent-word").corpusSize());
-
     }
 }
diff --git a/linguistics/src/test/models/docv1.json b/linguistics/src/test/models/docv1.json
new file mode 100644
index 00000000000..04010959a58
--- /dev/null
+++ b/linguistics/src/test/models/docv1.json
@@ -0,0 +1,18 @@
+{
+  "version" : "1.0",
+  "id" : "test::1",
+  "description" : "desc",
+  "languages" : {
+    "en": {
+      "description" : "english model",
+      "document-count" : 10,
+      "language" : "en",
+      "document-frequencies" : {
+        "usa" : 2,
+        "hello": 3,
+        "world": 5,
+        "test": 2
+      }
+    }
+  }
+}
diff --git a/linguistics/src/test/models/docv2.json b/linguistics/src/test/models/docv2.json
new file mode 100644
index 00000000000..c00d02fb744
--- /dev/null
+++ b/linguistics/src/test/models/docv2.json
@@ -0,0 +1,31 @@
+{
+  "version" : "2.0",
+  "id" : "test::2",
+  "description" : "desc",
+  "languages" : {
+    "en": {
+      "description" : "english model",
+      "document-count" : 14,
+      "document-frequencies" : {
+        "usa" : 2,
+        "hello": 3,
+        "world": 5,
+        "test": 4,
+        "additional": 2
+      }
+    },
+    "nb": {
+      "description" : "norwegian model",
+      "document-count" : 20,
+      "document-frequencies" : {
+        "usa" : 2,
+        "hello": 10,
+        "verden": 5,
+        "test": 2,
+        "norge": 11,
+        "ja": 12,
+        "nei": 3
+      }
+    }
+  }
+}
diff --git a/linguistics/src/test/models/en.json b/linguistics/src/test/models/en.json
index 50bae5e3451..87b7b2faa08 100644
--- a/linguistics/src/test/models/en.json
+++ b/linguistics/src/test/models/en.json
@@ -1,11 +1,11 @@
 {
   "version" : "1.0",
   "id" : "test::1",
-  "description" : "desc",
-  "corpus-size" : 10,
+  "description" : "english model",
+  "document-count" : 10,
   "language" : "en",
   "word-count" : 4,
-  "frequencies" : {
+  "document-frequencies" : {
     "usa" : 2,
     "hello": 3,
     "world": 5,
diff --git a/linguistics/src/test/models/no.json b/linguistics/src/test/models/no.json
deleted file mode 100644
index 5fca8929e74..00000000000
--- a/linguistics/src/test/models/no.json
+++ /dev/null
@@ -1,17 +0,0 @@
-{
-  "version" : "1.0",
-  "id" : "test::2",
-  "description" : "norsk beskrivelse",
-  "corpus-size" : 20,
-  "language" : "nb",
-  "word-count" : 7,
-  "frequencies" : {
-    "usa" : 2,
-    "hello": 10,
-    "verden": 5,
-    "test": 2,
-    "norge": 11,
-    "ja": 12,
-    "nei": 3
-  }
-}