Revert "Merge pull request #29328 from vespa-engine/revert-29314-bratseth/casing-take-2"

This reverts commit a72e949533a46d665440a9c72ca2b8fb58f3a9c3, reversing changes made to 944d635d00e165166508ef23399e9ed65a87a9c8.
author: Jon Bratseth <bratseth@vespa.ai> 2023-11-14 11:28:40 +0100
committer: Jon Bratseth <bratseth@vespa.ai> 2023-11-14 11:28:40 +0100
commit: 997896e40f47770b22a81b5ec8281d2e962ec4d9 (patch)
tree: 3784800283871e8f99b5579e3a8e545a11f86df1
parent: 29109450c8c2c98d969a711b8f6240bb5594c150 (diff)
18 files changed, 157 insertions, 136 deletions
diff --git a/document/src/main/java/com/yahoo/document/annotation/Annotation.java b/document/src/main/java/com/yahoo/document/annotation/Annotation.java
index 3d9300550ff..237ca6db58b 100644
--- a/document/src/main/java/com/yahoo/document/annotation/Annotation.java
+++ b/document/src/main/java/com/yahoo/document/annotation/Annotation.java
@@ -129,7 +129,7 @@ public class Annotation implements Comparable<Annotation> {
     }
 
     /**
-     * WARNING! Should only be used by deserializers!&nbsp;Sets the span node that this annotation points to.
+     * WARNING! Should only be used by deserializers! Sets the span node that this annotation points to.
      *
      * @param spanNode the span node that this annotation shall point to.
      */
@@ -221,10 +221,9 @@ public class Annotation implements Comparable<Annotation> {
 
     @Override
     public String toString() {
-        String retval = "annotation of type " + type;
-        retval += ((value == null) ? " (no value)" : " (with value)");
-        retval += ((spanNode == null) ? " (no span)" : (" with span "+spanNode));
-        return retval;
+        return type + " annotation " +
+               ((value == null) ? " (no value)" : " (with value)") +
+               ((spanNode == null) ? " (no span)" : (" with span "+spanNode));
     }
 
 
diff --git a/document/src/main/java/com/yahoo/document/annotation/AnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/AnnotationContainer.java
index 6e2b986a478..ac2e6aefa1b 100644
--- a/document/src/main/java/com/yahoo/document/annotation/AnnotationContainer.java
+++ b/document/src/main/java/com/yahoo/document/annotation/AnnotationContainer.java
@@ -23,11 +23,7 @@ abstract class AnnotationContainer {
      */
     abstract void annotate(Annotation annotation);
 
-    /**
-     * Returns a mutable collection of annotations.
-     *
-     * @return a mutable collection of annotations.
-     */
+    /** Returns a mutable collection of the annotations in this. */
     abstract Collection<Annotation> annotations();
 
     /**
diff --git a/document/src/main/java/com/yahoo/document/annotation/AnnotationType2AnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/AnnotationType2AnnotationContainer.java
index d8709baa3a1..01b8f990bb4 100644
--- a/document/src/main/java/com/yahoo/document/annotation/AnnotationType2AnnotationContainer.java
+++ b/document/src/main/java/com/yahoo/document/annotation/AnnotationType2AnnotationContainer.java
@@ -16,6 +16,7 @@ import java.util.NoSuchElementException;
  */
 // TODO: Should this be removed?
 public class AnnotationType2AnnotationContainer extends IteratingAnnotationContainer {
+
     private final Multimap<AnnotationType, Annotation> annotationType2Annotation = Multimaps.newMultimap(new IdentityHashMap<>(), ArrayList::new);
 
     @Override
@@ -31,7 +32,6 @@ public class AnnotationType2AnnotationContainer extends IteratingAnnotationConta
     }
 
     @Override
-    @SuppressWarnings("unchecked")
     Collection<Annotation> annotations() {
         return annotationType2Annotation.values();
     }
@@ -56,12 +56,12 @@ public class AnnotationType2AnnotationContainer extends IteratingAnnotationConta
     }
 
     private class NonRecursiveIterator implements Iterator<Annotation> {
+
         private final IdentityHashMap<SpanNode, SpanNode> nodes;
         private final Iterator<Annotation> annotationIt;
         private Annotation next = null;
         private boolean nextCalled;
 
-        @SuppressWarnings("unchecked")
         public NonRecursiveIterator(IdentityHashMap<SpanNode, SpanNode> nodes) {
             this.nodes = nodes;
             this.annotationIt = annotationType2Annotation.values().iterator();
@@ -106,4 +106,5 @@ public class AnnotationType2AnnotationContainer extends IteratingAnnotationConta
             nextCalled = false;
         }
     }
+
 }
diff --git a/document/src/main/java/com/yahoo/document/annotation/IteratingAnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/IteratingAnnotationContainer.java
index 54af80e6e43..c7c3545fc09 100644
--- a/document/src/main/java/com/yahoo/document/annotation/IteratingAnnotationContainer.java
+++ b/document/src/main/java/com/yahoo/document/annotation/IteratingAnnotationContainer.java
@@ -5,30 +5,29 @@ import java.util.IdentityHashMap;
 import java.util.Iterator;
 
 /**
- * @author <a href="mailto:einarmr@yahoo-inc.com">Einar M R Rosenvinge</a>
+ * @author Einar M R Rosenvinge
  */
 abstract class IteratingAnnotationContainer extends AnnotationContainer {
 
     @Override
         Iterator<Annotation> iterator(SpanNode node) {
-        IdentityHashMap<SpanNode, SpanNode> nodes = new IdentityHashMap<SpanNode, SpanNode>();
+        IdentityHashMap<SpanNode, SpanNode> nodes = new IdentityHashMap<>();
         nodes.put(node, node);
         return iterator(nodes);
     }
 
     @Override
     Iterator<Annotation> iteratorRecursive(SpanNode node) {
-        IdentityHashMap<SpanNode, SpanNode> nodes = new IdentityHashMap<SpanNode, SpanNode>();
+        IdentityHashMap<SpanNode, SpanNode> nodes = new IdentityHashMap<>();
         nodes.put(node, node);
-        {
-            Iterator<SpanNode> childrenIt = node.childIteratorRecursive();
-            while (childrenIt.hasNext()) {
-                SpanNode child = childrenIt.next();
-                nodes.put(child, child);
-            }
+        Iterator<SpanNode> childrenIt = node.childIteratorRecursive();
+        while (childrenIt.hasNext()) {
+            SpanNode child = childrenIt.next();
+            nodes.put(child, child);
         }
         return iterator(nodes);
     }
 
     abstract Iterator<Annotation> iterator(IdentityHashMap<SpanNode, SpanNode> nodes);
+
 }
diff --git a/document/src/main/java/com/yahoo/document/annotation/ListAnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/ListAnnotationContainer.java
index c2c22558a32..b8d8cd692d8 100644
--- a/document/src/main/java/com/yahoo/document/annotation/ListAnnotationContainer.java
+++ b/document/src/main/java/com/yahoo/document/annotation/ListAnnotationContainer.java
@@ -13,6 +13,7 @@ import java.util.NoSuchElementException;
  * @author Einar M R Rosenvinge
  */
 public class ListAnnotationContainer extends IteratingAnnotationContainer {
+
     private final List<Annotation> annotations = new LinkedList<>();
 
     @Override
@@ -38,9 +39,8 @@ public class ListAnnotationContainer extends IteratingAnnotationContainer {
     @Override
     public boolean equals(Object o) {
         if (this == o) return true;
-        if (!(o instanceof ListAnnotationContainer)) return false;
-        ListAnnotationContainer that = (ListAnnotationContainer) o;
-        if (!annotations.equals(that.annotations)) return false;
+        if (!(o instanceof ListAnnotationContainer other)) return false;
+        if (!annotations.equals(other.annotations)) return false;
         return true;
     }
 
@@ -50,8 +50,9 @@ public class ListAnnotationContainer extends IteratingAnnotationContainer {
     }
 
     private class AnnotationIterator implements Iterator<Annotation> {
-        private IdentityHashMap<SpanNode, SpanNode> nodes;
-        private PeekableListIterator<Annotation> base;
+
+        private final IdentityHashMap<SpanNode, SpanNode> nodes;
+        private final PeekableListIterator<Annotation> base;
         private boolean nextCalled = false;
 
         AnnotationIterator(ListIterator<Annotation> baseIt, IdentityHashMap<SpanNode, SpanNode> nodes) {
@@ -91,4 +92,5 @@ public class ListAnnotationContainer extends IteratingAnnotationContainer {
             nextCalled = false;
         }
     }
+
 }
diff --git a/document/src/main/java/com/yahoo/document/annotation/SpanList.java b/document/src/main/java/com/yahoo/document/annotation/SpanList.java
index 9530689c3f1..04bdf5c892e 100644
--- a/document/src/main/java/com/yahoo/document/annotation/SpanList.java
+++ b/document/src/main/java/com/yahoo/document/annotation/SpanList.java
@@ -11,13 +11,14 @@ import java.util.ListIterator;
 /**
  * A node in a Span tree that can have child nodes.
  *
- * @author <a href="mailto:einarmr@yahoo-inc.com">Einar M R Rosenvinge</a>
+ * @author Einar M R Rosenvinge
  */
 public class SpanList extends SpanNode {
+
     public static final byte ID = 2;
     private final List<SpanNode> children;
-    private int cachedFrom = Integer.MIN_VALUE; //triggers calculateFrom()
-    private int cachedTo = Integer.MIN_VALUE;  //triggers calculateTo()
+    private int cachedFrom = Integer.MIN_VALUE; // triggers calculateFrom()
+    private int cachedTo = Integer.MIN_VALUE;  // triggers calculateTo()
 
     /** Creates a new SpanList. */
     public SpanList() {
@@ -39,7 +40,7 @@ public class SpanList extends SpanNode {
      * @param other the SpanList to copy.
      */
     public SpanList(SpanList other) {
-        this.children = new LinkedList<SpanNode>();
+        this.children = new LinkedList<>();
         for (SpanNode otherNode : other.children) {
             if (otherNode instanceof Span) {
                 children.add(new Span((Span) otherNode));
@@ -86,15 +87,15 @@ public class SpanList extends SpanNode {
 
     /** Create a span, add it to this list and return it */
     public Span span(int from, int length) {
-        Span span=new Span(from,length);
+        Span span = new Span(from, length);
         add(span);
         return span;
     }
 
     void setInvalid() {
-        //invalidate ourselves:
+        // invalidate ourselves:
         super.setInvalid();
-        //invalidate all our children:
+        // invalidate all our children:
         for (SpanNode node : children()) {
             node.setInvalid();
         }
@@ -213,20 +214,12 @@ public class SpanList extends SpanNode {
         return this;
     }
 
-    /**
-     * Returns a <strong>modifiable</strong> list of the immediate children of this SpanList.
-     *
-     * @return a <strong>modifiable</strong> list of the immediate children of this SpanList.
-     */
+    /** Returns a modifiable list of the immediate children of this SpanList. */
     protected List<SpanNode> children() {
         return children;
     }
 
-    /**
-     * Returns the number of children this SpanList holds.
-     *
-     * @return the number of children this SpanList holds.
-     */
+    /** Returns the number of children this SpanList holds. */
     public int numChildren() {
         return children().size();
     }
@@ -394,11 +387,9 @@ public class SpanList extends SpanNode {
     @Override
     public boolean equals(Object o) {
         if (this == o) return true;
-        if (!(o instanceof SpanList)) return false;
+        if (!(o instanceof SpanList spanList)) return false;
         if (!super.equals(o)) return false;
 
-        SpanList spanList = (SpanList) o;
-
         if (children() != null ? !children().equals(spanList.children()) : spanList.children() != null) return false;
 
         return true;
@@ -415,4 +406,5 @@ public class SpanList extends SpanNode {
     public String toString() {
         return "SpanList with " + children().size() + " children";
     }
+
 }
diff --git a/document/src/main/java/com/yahoo/document/annotation/SpanNode2AnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/SpanNode2AnnotationContainer.java
index a66639b3bfd..f563fd0d6cd 100644
--- a/document/src/main/java/com/yahoo/document/annotation/SpanNode2AnnotationContainer.java
+++ b/document/src/main/java/com/yahoo/document/annotation/SpanNode2AnnotationContainer.java
@@ -18,6 +18,7 @@ import java.util.List;
  * @author Einar M R Rosenvinge
  */
 class SpanNode2AnnotationContainer extends AnnotationContainer {
+
     private final Multimap<SpanNode, Annotation> spanNode2Annotation = Multimaps.newMultimap(new IdentityHashMap<>(), ArrayList::new);
 
     @Override
diff --git a/document/src/main/java/com/yahoo/document/annotation/SpanTree.java b/document/src/main/java/com/yahoo/document/annotation/SpanTree.java
index f785cf3b3ec..2faf17ce428 100644
--- a/document/src/main/java/com/yahoo/document/annotation/SpanTree.java
+++ b/document/src/main/java/com/yahoo/document/annotation/SpanTree.java
@@ -19,6 +19,7 @@ import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
+import java.util.stream.Collectors;
 
 /**
  * A SpanTree holds a root node of a tree of SpanNodes, and a List of Annotations pointing to these nodes
@@ -431,7 +432,7 @@ public class SpanTree implements Iterable<Annotation>, SpanNodeParent, Comparabl
     }
 
     /**
-     * Adds an Annotation to the internal list of annotations for this SpanTree.&nbsp;Use this when
+     * Adds an Annotation to the internal list of annotations for this SpanTree. Use this when
      * adding an Annotation that shall annotate a SpanNode. Upon return, Annotation.getSpanNode()
      * returns the given node.
      *
@@ -446,7 +447,7 @@ public class SpanTree implements Iterable<Annotation>, SpanNodeParent, Comparabl
     }
 
     /**
-     * Adds an Annotation to the internal list of annotations for this SpanTree.&nbsp;Use this when
+     * Adds an Annotation to the internal list of annotations for this SpanTree. Use this when
      * adding an Annotation that shall annotate a SpanNode. Upon return, Annotation.getSpanNode()
      * returns the given node. This one is unchecked and assumes that the SpanNode is valid and has
      * already been attached to the Annotation.
@@ -538,10 +539,7 @@ public class SpanTree implements Iterable<Annotation>, SpanNodeParent, Comparabl
         }
     }
 
-    /**
-     * Returns an Iterator over all annotations in this tree.&nbsp;Note that the iteration order is non-deterministic.
-     * @return an Iterator over all annotations in this tree.
-     */
+    /** Returns an Iterator over all annotations in this tree. Note that the iteration order is non-deterministic. */
     public Iterator<Annotation> iterator() {
         return annotations.annotations().iterator();
     }
@@ -641,7 +639,9 @@ public class SpanTree implements Iterable<Annotation>, SpanNodeParent, Comparabl
 
     @Override
     public String toString() {
-        return "SpanTree '" + name + "'";
+        return "SpanTree '" + name + "' with root: " + root +
+               ( annotations.annotations().size() > 5 ? "" :
+                 ", annotations: " + annotations.annotations().stream().map(Annotation::toString).collect(Collectors.joining(", ")));
     }
 
     @Override
diff --git a/document/src/main/java/com/yahoo/document/datatypes/Struct.java b/document/src/main/java/com/yahoo/document/datatypes/Struct.java
index 0b1bbf5d3ca..bb54b41069b 100644
--- a/document/src/main/java/com/yahoo/document/datatypes/Struct.java
+++ b/document/src/main/java/com/yahoo/document/datatypes/Struct.java
@@ -218,8 +218,7 @@ public class Struct extends StructuredFieldValue {
         StringBuilder retVal = new StringBuilder();
         retVal.append("Struct (").append(getDataType()).append("): ");
         int [] increasing = getInOrder();
-        for (int i = 0; i < increasing.length; i++) {
-            int id = increasing[i];
+        for (int id : increasing) {
             retVal.append(getDataType().getField(id)).append("=").append(values.get(id)).append(", ");
         }
         return retVal.toString();
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
index 26058eeb8f3..fdfadf65400 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
@@ -15,6 +15,8 @@ import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator;
 
 import java.util.Iterator;
 
+import static com.yahoo.language.LinguisticsCase.toLowerCase;
+
 /**
  * A filter which splits incoming text into n-grams.
  *
@@ -68,8 +70,9 @@ public final class NGramExpression extends Expression {
 
             // annotate gram as a word term
             String gramString = gram.extractFrom(output.getString());
-            typedSpan(gram.getStart(), gram.getCodePointCount(), TokenType.ALPHABETIC, spanList).
-                    annotate(LinguisticsAnnotator.lowerCaseTermAnnotation(gramString, gramString));
+            typedSpan(gram.getStart(),
+                      gram.getCodePointCount(),
+                      TokenType.ALPHABETIC, spanList).annotate(LinguisticsAnnotator.termAnnotation(toLowerCase(gramString), gramString));
 
             lastPosition = gram.getStart() + gram.getCodePointCount();
         }
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
index 684bae3bf97..5c1bf0813c4 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
@@ -13,7 +13,7 @@ public class AnnotatorConfig implements Cloneable {
     private Language language;
     private StemMode stemMode;
     private boolean removeAccents;
-    private int maxTermOccurences;
+    private int maxTermOccurrences;
     private int maxTokenizeLength;
 
     public static final int DEFAULT_MAX_TERM_OCCURRENCES;
@@ -29,7 +29,7 @@ public class AnnotatorConfig implements Cloneable {
         language = Language.ENGLISH;
         stemMode = StemMode.NONE;
         removeAccents = false;
-        maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES;
+        maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES;
         maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH;
     }
 
@@ -37,7 +37,7 @@ public class AnnotatorConfig implements Cloneable {
         language = rhs.language;
         stemMode = rhs.stemMode;
         removeAccents = rhs.removeAccents;
-        maxTermOccurences = rhs.maxTermOccurences;
+        maxTermOccurrences = rhs.maxTermOccurrences;
         maxTokenizeLength = rhs.maxTokenizeLength;
     }
 
@@ -74,11 +74,11 @@ public class AnnotatorConfig implements Cloneable {
     }
 
     public int getMaxTermOccurrences() {
-        return maxTermOccurences;
+        return maxTermOccurrences;
     }
 
     public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) {
-        this.maxTermOccurences = maxTermCount;
+        this.maxTermOccurrences = maxTermCount;
         return this;
     }
 
@@ -109,7 +109,7 @@ public class AnnotatorConfig implements Cloneable {
         if (removeAccents != rhs.removeAccents) {
             return false;
         }
-        if (maxTermOccurences != rhs.maxTermOccurences) {
+        if (maxTermOccurrences != rhs.maxTermOccurrences) {
             return false;
         }
         if (maxTokenizeLength != rhs.maxTokenizeLength) {
@@ -121,6 +121,7 @@ public class AnnotatorConfig implements Cloneable {
     @Override
     public int hashCode() {
         return getClass().hashCode() + language.hashCode() + stemMode.hashCode() +
-               Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength;
+               Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength;
     }
+
 }
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 191d067effe..52cd8a8ff54 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -34,8 +34,8 @@ public class LinguisticsAnnotator {
         final Map<String, Integer> termOccurrences = new HashMap<>();
         final int maxOccurrences;
 
-        public TermOccurrences(int maxOccurences) {
-            this.maxOccurrences = maxOccurences;
+        public TermOccurrences(int maxOccurrences) {
+            this.maxOccurrences = maxOccurrences;
         }
 
         boolean termCountBelowLimit(String term) {
@@ -86,24 +86,23 @@ public class LinguisticsAnnotator {
     }
 
     /**
-     * Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the
+     * Creates a TERM annotation which has the term as annotation (only) if it is different from the
      * original.
      *
-     * @param termToLowerCase the term to lower case
-     * @param origTerm        the original term
+     * @param term the term
+     * @param origTerm the original term
      * @return the created TERM annotation
      */
-    public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) {
-        String annotationValue = toLowerCase(termToLowerCase);
-        if (annotationValue.equals(origTerm)) {
+    public static Annotation termAnnotation(String term, String origTerm) {
+        if (term.equals(origTerm))
             return new Annotation(AnnotationTypes.TERM);
-        }
-        return new Annotation(AnnotationTypes.TERM, new StringFieldValue(annotationValue));
+        else
+            return new Annotation(AnnotationTypes.TERM, new StringFieldValue(term));
     }
 
     private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences) {
         if (termOccurrences.termCountBelowLimit(term)) {
-            here.annotate(lowerCaseTermAnnotation(term, orig));
+            here.annotate(termAnnotation(term, orig));
         }
     }
 
@@ -127,29 +126,24 @@ public class LinguisticsAnnotator {
         }
         if (mode == StemMode.ALL) {
             Span where = parent.span((int)token.getOffset(), token.getOrig().length());
-            String lowercasedOrig = toLowerCase(token.getOrig());
-            addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);
 
-            String lowercasedTerm = lowercasedOrig;
+            String lowercasedOrig = toLowerCase(token.getOrig());
             String term = token.getTokenString();
             if (term != null) {
-                lowercasedTerm = toLowerCase(term);
-            }
-            if (! lowercasedOrig.equals(lowercasedTerm)) {
                 addAnnotation(where, term, token.getOrig(), termOccurrences);
+                if ( ! term.equals(lowercasedOrig))
+                    addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);
             }
             for (int i = 0; i < token.getNumStems(); i++) {
                 String stem = token.getStem(i);
-                String lowercasedStem = toLowerCase(stem);
-                if (! (lowercasedOrig.equals(lowercasedStem) || lowercasedTerm.equals(lowercasedStem))) {
+                if (! (stem.equals(lowercasedOrig) || stem.equals(term)))
                     addAnnotation(where, stem, token.getOrig(), termOccurrences);
-                }
             }
         } else {
             String term = token.getTokenString();
             if (term == null || term.trim().isEmpty()) return;
             if (termOccurrences.termCountBelowLimit(term))  {
-                parent.span((int)token.getOffset(), token.getOrig().length()).annotate(lowerCaseTermAnnotation(term, token.getOrig()));
+                parent.span((int)token.getOffset(), token.getOrig().length()).annotate(termAnnotation(term, token.getOrig()));
             }
         }
     }
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java
index bcde8751de8..b4e266ab3eb 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java
@@ -57,8 +57,8 @@ public class NGramTestCase {
         new NGramExpression(new SimpleLinguistics(), 3).execute(context);
 
         StringFieldValue value = (StringFieldValue)context.getValue();
-        assertEquals("Grams are pure annotations - field value is unchanged", "en gul Bille sang... ",
-                     value.getString());
+        assertEquals("Grams are pure annotations - field value is unchanged",
+                     "en gul Bille sang... ", value.getString());
         SpanTree gramTree = value.getSpanTree(SpanTrees.LINGUISTICS);
         assertNotNull(gramTree);
         SpanList grams = (SpanList)gramTree.getRoot();
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index 67bff3843ee..a4dbe1fe826 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -19,7 +19,6 @@ import org.junit.Test;
 import org.mockito.Mockito;
 
 import java.util.*;
-import java.util.stream.Collectors;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
@@ -29,8 +28,6 @@ import static org.junit.Assert.assertTrue;
  */
 public class LinguisticsAnnotatorTestCase {
 
-    private static final AnnotatorConfig CONFIG = new AnnotatorConfig();
-
     @Test
     public void requireThatAnnotateFailsWithZeroTokens() {
         assertAnnotations(null, "foo");
@@ -42,7 +39,7 @@ public class LinguisticsAnnotatorTestCase {
             if (type.isIndexable()) {
                 continue;
             }
-            assertAnnotations(null, "foo", newToken("foo", "bar", type));
+            assertAnnotations(null, "foo", token("foo", "bar", type));
         }
     }
 
@@ -54,7 +51,27 @@ public class LinguisticsAnnotatorTestCase {
             if (!type.isIndexable()) {
                 continue;
             }
-            assertAnnotations(expected, "foo", newToken("foo", "bar", type));
+            assertAnnotations(expected, "foo", token("foo", "bar", type));
+        }
+    }
+
+    @Test
+    public void requireThatIndexableTokenStringsAreAnnotatedWithModeALL() {
+        SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
+        expected.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla")));
+        var span2 = expected.spanList().span(0, 4);
+        span2.annotate(new Annotation(AnnotationTypes.TERM));
+        span2.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("car")));
+        var span3 = expected.spanList().span(0, 8);
+        span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelxes")));
+        span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelx")));
+        span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("mex")));
+        for (TokenType type : TokenType.values()) {
+            if (!type.isIndexable()) continue;
+            assertAnnotations(expected, "Tesla cars", new AnnotatorConfig().setStemMode("ALL"),
+                              token("Tesla", "tesla", type),
+                              token("cars", "car", type),
+                              SimpleToken.fromStems("ModelXes", List.of("modelxes", "modelx", "mex")));
         }
     }
 
@@ -63,7 +80,7 @@ public class LinguisticsAnnotatorTestCase {
         SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
         expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
         for (TokenType type : TokenType.values()) {
-            assertAnnotations(expected, "foo", newToken("foo", "bar", type, true));
+            assertAnnotations(expected, "foo", token("foo", "bar", type, true));
         }
     }
 
@@ -76,21 +93,21 @@ public class LinguisticsAnnotatorTestCase {
                 if (!specialToken && !type.isIndexable()) {
                     continue;
                 }
-                assertAnnotations(expected, "foo", newToken("foo", "foo", type, specialToken));
+                assertAnnotations(expected, "foo", token("foo", "foo", type, specialToken));
             }
         }
     }
 
     @Test
-    public void requireThatTermAnnotationsAreLowerCased() {
+    public void requireThatTermAnnotationsPreserveCasing() {
         SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
-        expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
+        expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("BaR")));
         for (boolean specialToken : Arrays.asList(true, false)) {
             for (TokenType type : TokenType.values()) {
                 if (!specialToken && !type.isIndexable()) {
                     continue;
                 }
-                assertAnnotations(expected, "foo", newToken("foo", "BAR", type, specialToken));
+                assertAnnotations(expected, "foo", token("foo", "BaR", type, specialToken));
             }
         }
     }
@@ -102,11 +119,11 @@ public class LinguisticsAnnotatorTestCase {
         expected.spanList().span(3, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
         expected.spanList().span(6, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz")));
 
-        SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
-                .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
-                .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
-                                      .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
-                                      .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
+        SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
+                .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
+                .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
+                                                                             .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
+                                                                             .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
         assertAnnotations(expected, "foobarbaz", token);
     }
 
@@ -116,11 +133,11 @@ public class LinguisticsAnnotatorTestCase {
         expected.spanList().span(0, 9).annotate(new Annotation(AnnotationTypes.TERM,
                                                                new StringFieldValue("foobarbaz")));
 
-        SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true)
-                .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
-                .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
-                                      .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
-                                      .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
+        SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true)
+                                                                                 .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
+                                                                                 .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
+                                                                                                                                              .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
+                                                                                                                                              .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
         assertAnnotations(expected, "foobarbaz", token);
     }
 
@@ -140,7 +157,8 @@ public class LinguisticsAnnotatorTestCase {
                     continue;
                 }
                 assertAnnotations(expected, "foo",
-                                  newLinguistics(List.of(newToken("foo", "foo", type, specialToken)),
+                                  new AnnotatorConfig(),
+                                  newLinguistics(List.of(token("foo", "foo", type, specialToken)),
                                                  Collections.singletonMap("foo", "bar")));
             }
         }
@@ -154,11 +172,9 @@ public class LinguisticsAnnotatorTestCase {
         StringFieldValue val = new StringFieldValue("foo");
         val.setSpanTree(spanTree);
 
-        Linguistics linguistics = newLinguistics(List.of(newToken("foo", "bar", TokenType.ALPHABETIC, false)),
+        Linguistics linguistics = newLinguistics(List.of(token("foo", "bar", TokenType.ALPHABETIC, false)),
                                                  Collections.<String, String>emptyMap());
-        new LinguisticsAnnotator(linguistics, CONFIG).annotate(val);
-
-        assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
+        assertTrue(new LinguisticsAnnotator(linguistics, new AnnotatorConfig()).annotate(val));
         assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS));
     }
 
@@ -186,7 +202,7 @@ public class LinguisticsAnnotatorTestCase {
     }
 
     @Test
-    public void requireThatMaxTermOccurencesIsHonored() {
+    public void requireThatMaxTermOccurrencesIsHonored() {
         final String inputTerm = "foo";
         final String stemmedInputTerm = "bar"; // completely different from
                                                // inputTerm for safer test
@@ -204,7 +220,7 @@ public class LinguisticsAnnotatorTestCase {
             StringBuilder input = new StringBuilder();
             Token[] tokens = new Token[inputTermOccurence];
             for (int i = 0; i < inputTermOccurence; ++i) {
-                SimpleToken t = newToken(inputTerm, stemmedInputTerm, type);
+                SimpleToken t = token(inputTerm, stemmedInputTerm, type);
                 t.setOffset(i * paddedInputTerm.length());
                 tokens[i] = t;
                 input.append(paddedInputTerm);
@@ -214,28 +230,29 @@ public class LinguisticsAnnotatorTestCase {
     }
 
     // --------------------------------------------------------------------------------
-    //
     // Utilities
-    //
-    // --------------------------------------------------------------------------------
 
-    private static SimpleToken newToken(String orig, String stem, TokenType type) {
-        return newToken(orig, stem, type, false);
+    private static SimpleToken token(String orig, String stem, TokenType type) {
+        return token(orig, stem, type, false);
     }
 
-    private static SimpleToken newToken(String orig, String stem, TokenType type, boolean specialToken) {
+    private static SimpleToken token(String orig, String stem, TokenType type, boolean specialToken) {
         return new SimpleToken(orig).setTokenString(stem)
                                     .setType(type)
                                     .setSpecialToken(specialToken);
     }
 
     private static void assertAnnotations(SpanTree expected, String value, Token... tokens) {
-        assertAnnotations(expected, value, newLinguistics(Arrays.asList(tokens), Collections.<String, String>emptyMap()));
+        assertAnnotations(expected, value, new AnnotatorConfig(), newLinguistics(Arrays.asList(tokens), Collections.emptyMap()));
+    }
+
+    private static void assertAnnotations(SpanTree expected, String value, AnnotatorConfig config, Token... tokens) {
+        assertAnnotations(expected, value, config, newLinguistics(Arrays.asList(tokens), Collections.emptyMap()));
     }
 
-    private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) {
+    private static void assertAnnotations(SpanTree expected, String str, AnnotatorConfig config, Linguistics linguistics) {
         StringFieldValue val = new StringFieldValue(str);
-        assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
+        assertEquals(expected != null, new LinguisticsAnnotator(linguistics, config).annotate(val));
         assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS));
     }
 
diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
index 5ad6a382abd..f0439a21fec 100644
--- a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
+++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
@@ -26,6 +26,7 @@ public class LinguisticsCase {
     public static String toLowerCase(String in) {
         // def is picked from http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#toLowerCase%28%29
         // Also, at the time of writing, English is the default language for queries
+        if (in == null) return null;
         return Lowercase.toLowerCase(in);
     }
 
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
index 33f5ee7e4bb..9178c2d7e09 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
@@ -189,9 +189,8 @@ public class GramSplitter {
         @Override
         public boolean equals(Object o) {
             if (this == o) return true;
-            if ( ! (o instanceof Gram)) return false;
+            if ( ! (o instanceof Gram gram)) return false;
 
-            Gram gram = (Gram)o;
             if (codePointCount != gram.codePointCount) return false;
             if (start != gram.start) return false;
             return true;
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
index 6cc68c7ac14..809e9b8d133 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
@@ -15,35 +15,48 @@ import java.util.Objects;
 public class SimpleToken implements Token {
 
     private final List<Token> components = new ArrayList<>();
-    private final String orig;
+    private final String original;
     private TokenType type = TokenType.UNKNOWN;
     private TokenScript script = TokenScript.UNKNOWN;
     private String tokenString;
+    private List<String> stems = null; // Any additional stems after tokenString
     private boolean specialToken = false;
     private long offset = 0;
 
-    public SimpleToken(String orig) {
-        this(orig, null);
+    public SimpleToken(String original) {
+        this(original, (String)null);
     }
 
-    public SimpleToken(String orig, String tokenString) {
-        this.orig = orig;
+    public SimpleToken(String original, String tokenString) {
+        this.original = original;
         this.tokenString = tokenString;
     }
 
+    /** Exposed as fromStems */
+    private SimpleToken(String original, List<String> stems) {
+        this.type = TokenType.ALPHABETIC; // Only type which may have stems
+        this.original = original;
+        this.tokenString = stems.get(0);
+        this.stems = List.copyOf(stems.subList(1, stems.size()));
+    }
+
     @Override
     public String getOrig() {
-        return orig;
+        return original;
     }
 
     @Override
     public int getNumStems() {
-        return tokenString != null ? 1 : 0;
+        return (tokenString != null ? 1 : 0) + (stems != null ? stems.size() : 0);
     }
 
     @Override
     public String getStem(int i) {
-        return tokenString;
+        if (i == 0)
+            return tokenString;
+        if (stems != null && i-1 < stems.size())
+            return stems.get(i-1);
+        return tokenString; // TODO Vespa 9: throw new IllegalArgumentException() instead
     }
 
     @Override
@@ -131,12 +144,12 @@ public class SimpleToken implements Token {
 
     @Override
     public int hashCode() {
-        return orig.hashCode();
+        return original.hashCode();
     }
 
     @Override
     public String toString() {
-        return "token '" + orig + "'";
+        return "token '" + original + "'";
     }
 
     public String toDetailString() {
@@ -171,4 +184,8 @@ public class SimpleToken implements Token {
         return getType().isIndexable() && (getOrig().length() > 0);
     }
 
+    public static SimpleToken fromStems(String original, List<String> stems) {
+        return new SimpleToken(original, stems);
+    }
+
 }
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 98a84a48095..b72d2bd6d37 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -106,7 +106,7 @@ public class SimpleTokenizer implements Tokenizer {
             String oldToken = token;
             token = stemmer.stem(token);
             String newToken = token;
-            log.log(Level.FINEST, () -> "stem '" + oldToken+"' to '" + newToken+"'");
+            log.log(Level.FINEST, () -> "stem '" + oldToken + "' to '" + newToken + "'");
         }
         String result = token;
         log.log(Level.FINEST, () -> "processed token is: " + result);
author	Jon Bratseth <bratseth@vespa.ai>	2023-11-14 11:28:40 +0100
committer	Jon Bratseth <bratseth@vespa.ai>	2023-11-14 11:28:40 +0100
commit	997896e40f47770b22a81b5ec8281d2e962ec4d9 (patch)
tree	3784800283871e8f99b5579e3a8e545a11f86df1
parent	29109450c8c2c98d969a711b8f6240bb5594c150 (diff)