From ef5be496bc4857c5923f566251dd527873b248bf Mon Sep 17 00:00:00 2001 From: Harald Musum Date: Mon, 13 Nov 2023 21:34:45 +0100 Subject: Revert "Bratseth/casing take 2" --- .../com/yahoo/document/annotation/Annotation.java | 9 ++- .../document/annotation/AnnotationContainer.java | 6 +- .../AnnotationType2AnnotationContainer.java | 5 +- .../annotation/IteratingAnnotationContainer.java | 17 +++-- .../annotation/ListAnnotationContainer.java | 12 ++- .../com/yahoo/document/annotation/SpanList.java | 32 +++++--- .../annotation/SpanNode2AnnotationContainer.java | 1 - .../com/yahoo/document/annotation/SpanTree.java | 14 ++-- .../java/com/yahoo/document/datatypes/Struct.java | 3 +- .../expressions/NGramExpression.java | 7 +- .../linguistics/AnnotatorConfig.java | 15 ++-- .../linguistics/LinguisticsAnnotator.java | 36 +++++---- .../expressions/NGramTestCase.java | 4 +- .../linguistics/LinguisticsAnnotatorTestCase.java | 89 +++++++++------------- .../java/com/yahoo/language/LinguisticsCase.java | 1 - .../com/yahoo/language/process/GramSplitter.java | 3 +- .../com/yahoo/language/simple/SimpleToken.java | 37 +++------ .../com/yahoo/language/simple/SimpleTokenizer.java | 2 +- 18 files changed, 136 insertions(+), 157 deletions(-) diff --git a/document/src/main/java/com/yahoo/document/annotation/Annotation.java b/document/src/main/java/com/yahoo/document/annotation/Annotation.java index 237ca6db58b..3d9300550ff 100644 --- a/document/src/main/java/com/yahoo/document/annotation/Annotation.java +++ b/document/src/main/java/com/yahoo/document/annotation/Annotation.java @@ -129,7 +129,7 @@ public class Annotation implements Comparable { } /** - * WARNING! Should only be used by deserializers! Sets the span node that this annotation points to. + * WARNING! Should only be used by deserializers! Sets the span node that this annotation points to. * * @param spanNode the span node that this annotation shall point to. */ @@ -221,9 +221,10 @@ public class Annotation implements Comparable { @Override public String toString() { - return type + " annotation " + - ((value == null) ? " (no value)" : " (with value)") + - ((spanNode == null) ? " (no span)" : (" with span "+spanNode)); + String retval = "annotation of type " + type; + retval += ((value == null) ? " (no value)" : " (with value)"); + retval += ((spanNode == null) ? " (no span)" : (" with span "+spanNode)); + return retval; } diff --git a/document/src/main/java/com/yahoo/document/annotation/AnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/AnnotationContainer.java index ac2e6aefa1b..6e2b986a478 100644 --- a/document/src/main/java/com/yahoo/document/annotation/AnnotationContainer.java +++ b/document/src/main/java/com/yahoo/document/annotation/AnnotationContainer.java @@ -23,7 +23,11 @@ abstract class AnnotationContainer { */ abstract void annotate(Annotation annotation); - /** Returns a mutable collection of the annotations in this. */ + /** + * Returns a mutable collection of annotations. + * + * @return a mutable collection of annotations. + */ abstract Collection annotations(); /** diff --git a/document/src/main/java/com/yahoo/document/annotation/AnnotationType2AnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/AnnotationType2AnnotationContainer.java index 01b8f990bb4..d8709baa3a1 100644 --- a/document/src/main/java/com/yahoo/document/annotation/AnnotationType2AnnotationContainer.java +++ b/document/src/main/java/com/yahoo/document/annotation/AnnotationType2AnnotationContainer.java @@ -16,7 +16,6 @@ import java.util.NoSuchElementException; */ // TODO: Should this be removed? public class AnnotationType2AnnotationContainer extends IteratingAnnotationContainer { - private final Multimap annotationType2Annotation = Multimaps.newMultimap(new IdentityHashMap<>(), ArrayList::new); @Override @@ -32,6 +31,7 @@ public class AnnotationType2AnnotationContainer extends IteratingAnnotationConta } @Override + @SuppressWarnings("unchecked") Collection annotations() { return annotationType2Annotation.values(); } @@ -56,12 +56,12 @@ public class AnnotationType2AnnotationContainer extends IteratingAnnotationConta } private class NonRecursiveIterator implements Iterator { - private final IdentityHashMap nodes; private final Iterator annotationIt; private Annotation next = null; private boolean nextCalled; + @SuppressWarnings("unchecked") public NonRecursiveIterator(IdentityHashMap nodes) { this.nodes = nodes; this.annotationIt = annotationType2Annotation.values().iterator(); @@ -106,5 +106,4 @@ public class AnnotationType2AnnotationContainer extends IteratingAnnotationConta nextCalled = false; } } - } diff --git a/document/src/main/java/com/yahoo/document/annotation/IteratingAnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/IteratingAnnotationContainer.java index c7c3545fc09..54af80e6e43 100644 --- a/document/src/main/java/com/yahoo/document/annotation/IteratingAnnotationContainer.java +++ b/document/src/main/java/com/yahoo/document/annotation/IteratingAnnotationContainer.java @@ -5,29 +5,30 @@ import java.util.IdentityHashMap; import java.util.Iterator; /** - * @author Einar M R Rosenvinge + * @author Einar M R Rosenvinge */ abstract class IteratingAnnotationContainer extends AnnotationContainer { @Override Iterator iterator(SpanNode node) { - IdentityHashMap nodes = new IdentityHashMap<>(); + IdentityHashMap nodes = new IdentityHashMap(); nodes.put(node, node); return iterator(nodes); } @Override Iterator iteratorRecursive(SpanNode node) { - IdentityHashMap nodes = new IdentityHashMap<>(); + IdentityHashMap nodes = new IdentityHashMap(); nodes.put(node, node); - Iterator childrenIt = node.childIteratorRecursive(); - while (childrenIt.hasNext()) { - SpanNode child = childrenIt.next(); - nodes.put(child, child); + { + Iterator childrenIt = node.childIteratorRecursive(); + while (childrenIt.hasNext()) { + SpanNode child = childrenIt.next(); + nodes.put(child, child); + } } return iterator(nodes); } abstract Iterator iterator(IdentityHashMap nodes); - } diff --git a/document/src/main/java/com/yahoo/document/annotation/ListAnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/ListAnnotationContainer.java index b8d8cd692d8..c2c22558a32 100644 --- a/document/src/main/java/com/yahoo/document/annotation/ListAnnotationContainer.java +++ b/document/src/main/java/com/yahoo/document/annotation/ListAnnotationContainer.java @@ -13,7 +13,6 @@ import java.util.NoSuchElementException; * @author Einar M R Rosenvinge */ public class ListAnnotationContainer extends IteratingAnnotationContainer { - private final List annotations = new LinkedList<>(); @Override @@ -39,8 +38,9 @@ public class ListAnnotationContainer extends IteratingAnnotationContainer { @Override public boolean equals(Object o) { if (this == o) return true; - if (!(o instanceof ListAnnotationContainer other)) return false; - if (!annotations.equals(other.annotations)) return false; + if (!(o instanceof ListAnnotationContainer)) return false; + ListAnnotationContainer that = (ListAnnotationContainer) o; + if (!annotations.equals(that.annotations)) return false; return true; } @@ -50,9 +50,8 @@ public class ListAnnotationContainer extends IteratingAnnotationContainer { } private class AnnotationIterator implements Iterator { - - private final IdentityHashMap nodes; - private final PeekableListIterator base; + private IdentityHashMap nodes; + private PeekableListIterator base; private boolean nextCalled = false; AnnotationIterator(ListIterator baseIt, IdentityHashMap nodes) { @@ -92,5 +91,4 @@ public class ListAnnotationContainer extends IteratingAnnotationContainer { nextCalled = false; } } - } diff --git a/document/src/main/java/com/yahoo/document/annotation/SpanList.java b/document/src/main/java/com/yahoo/document/annotation/SpanList.java index 04bdf5c892e..9530689c3f1 100644 --- a/document/src/main/java/com/yahoo/document/annotation/SpanList.java +++ b/document/src/main/java/com/yahoo/document/annotation/SpanList.java @@ -11,14 +11,13 @@ import java.util.ListIterator; /** * A node in a Span tree that can have child nodes. * - * @author Einar M R Rosenvinge + * @author Einar M R Rosenvinge */ public class SpanList extends SpanNode { - public static final byte ID = 2; private final List children; - private int cachedFrom = Integer.MIN_VALUE; // triggers calculateFrom() - private int cachedTo = Integer.MIN_VALUE; // triggers calculateTo() + private int cachedFrom = Integer.MIN_VALUE; //triggers calculateFrom() + private int cachedTo = Integer.MIN_VALUE; //triggers calculateTo() /** Creates a new SpanList. */ public SpanList() { @@ -40,7 +39,7 @@ public class SpanList extends SpanNode { * @param other the SpanList to copy. */ public SpanList(SpanList other) { - this.children = new LinkedList<>(); + this.children = new LinkedList(); for (SpanNode otherNode : other.children) { if (otherNode instanceof Span) { children.add(new Span((Span) otherNode)); @@ -87,15 +86,15 @@ public class SpanList extends SpanNode { /** Create a span, add it to this list and return it */ public Span span(int from, int length) { - Span span = new Span(from, length); + Span span=new Span(from,length); add(span); return span; } void setInvalid() { - // invalidate ourselves: + //invalidate ourselves: super.setInvalid(); - // invalidate all our children: + //invalidate all our children: for (SpanNode node : children()) { node.setInvalid(); } @@ -214,12 +213,20 @@ public class SpanList extends SpanNode { return this; } - /** Returns a modifiable list of the immediate children of this SpanList. */ + /** + * Returns a modifiable list of the immediate children of this SpanList. + * + * @return a modifiable list of the immediate children of this SpanList. + */ protected List children() { return children; } - /** Returns the number of children this SpanList holds. */ + /** + * Returns the number of children this SpanList holds. + * + * @return the number of children this SpanList holds. + */ public int numChildren() { return children().size(); } @@ -387,9 +394,11 @@ public class SpanList extends SpanNode { @Override public boolean equals(Object o) { if (this == o) return true; - if (!(o instanceof SpanList spanList)) return false; + if (!(o instanceof SpanList)) return false; if (!super.equals(o)) return false; + SpanList spanList = (SpanList) o; + if (children() != null ? !children().equals(spanList.children()) : spanList.children() != null) return false; return true; @@ -406,5 +415,4 @@ public class SpanList extends SpanNode { public String toString() { return "SpanList with " + children().size() + " children"; } - } diff --git a/document/src/main/java/com/yahoo/document/annotation/SpanNode2AnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/SpanNode2AnnotationContainer.java index f563fd0d6cd..a66639b3bfd 100644 --- a/document/src/main/java/com/yahoo/document/annotation/SpanNode2AnnotationContainer.java +++ b/document/src/main/java/com/yahoo/document/annotation/SpanNode2AnnotationContainer.java @@ -18,7 +18,6 @@ import java.util.List; * @author Einar M R Rosenvinge */ class SpanNode2AnnotationContainer extends AnnotationContainer { - private final Multimap spanNode2Annotation = Multimaps.newMultimap(new IdentityHashMap<>(), ArrayList::new); @Override diff --git a/document/src/main/java/com/yahoo/document/annotation/SpanTree.java b/document/src/main/java/com/yahoo/document/annotation/SpanTree.java index 2faf17ce428..f785cf3b3ec 100644 --- a/document/src/main/java/com/yahoo/document/annotation/SpanTree.java +++ b/document/src/main/java/com/yahoo/document/annotation/SpanTree.java @@ -19,7 +19,6 @@ import java.util.Iterator; import java.util.LinkedList; import java.util.List; import java.util.Map; -import java.util.stream.Collectors; /** * A SpanTree holds a root node of a tree of SpanNodes, and a List of Annotations pointing to these nodes @@ -432,7 +431,7 @@ public class SpanTree implements Iterable, SpanNodeParent, Comparabl } /** - * Adds an Annotation to the internal list of annotations for this SpanTree. Use this when + * Adds an Annotation to the internal list of annotations for this SpanTree. Use this when * adding an Annotation that shall annotate a SpanNode. Upon return, Annotation.getSpanNode() * returns the given node. * @@ -447,7 +446,7 @@ public class SpanTree implements Iterable, SpanNodeParent, Comparabl } /** - * Adds an Annotation to the internal list of annotations for this SpanTree. Use this when + * Adds an Annotation to the internal list of annotations for this SpanTree. Use this when * adding an Annotation that shall annotate a SpanNode. Upon return, Annotation.getSpanNode() * returns the given node. This one is unchecked and assumes that the SpanNode is valid and has * already been attached to the Annotation. @@ -539,7 +538,10 @@ public class SpanTree implements Iterable, SpanNodeParent, Comparabl } } - /** Returns an Iterator over all annotations in this tree. Note that the iteration order is non-deterministic. */ + /** + * Returns an Iterator over all annotations in this tree. Note that the iteration order is non-deterministic. + * @return an Iterator over all annotations in this tree. + */ public Iterator iterator() { return annotations.annotations().iterator(); } @@ -639,9 +641,7 @@ public class SpanTree implements Iterable, SpanNodeParent, Comparabl @Override public String toString() { - return "SpanTree '" + name + "' with root: " + root + - ( annotations.annotations().size() > 5 ? "" : - ", annotations: " + annotations.annotations().stream().map(Annotation::toString).collect(Collectors.joining(", "))); + return "SpanTree '" + name + "'"; } @Override diff --git a/document/src/main/java/com/yahoo/document/datatypes/Struct.java b/document/src/main/java/com/yahoo/document/datatypes/Struct.java index bb54b41069b..0b1bbf5d3ca 100644 --- a/document/src/main/java/com/yahoo/document/datatypes/Struct.java +++ b/document/src/main/java/com/yahoo/document/datatypes/Struct.java @@ -218,7 +218,8 @@ public class Struct extends StructuredFieldValue { StringBuilder retVal = new StringBuilder(); retVal.append("Struct (").append(getDataType()).append("): "); int [] increasing = getInOrder(); - for (int id : increasing) { + for (int i = 0; i < increasing.length; i++) { + int id = increasing[i]; retVal.append(getDataType().getField(id)).append("=").append(values.get(id)).append(", "); } return retVal.toString(); diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java index fdfadf65400..26058eeb8f3 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java @@ -15,8 +15,6 @@ import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator; import java.util.Iterator; -import static com.yahoo.language.LinguisticsCase.toLowerCase; - /** * A filter which splits incoming text into n-grams. * @@ -70,9 +68,8 @@ public final class NGramExpression extends Expression { // annotate gram as a word term String gramString = gram.extractFrom(output.getString()); - typedSpan(gram.getStart(), - gram.getCodePointCount(), - TokenType.ALPHABETIC, spanList).annotate(LinguisticsAnnotator.termAnnotation(toLowerCase(gramString), gramString)); + typedSpan(gram.getStart(), gram.getCodePointCount(), TokenType.ALPHABETIC, spanList). + annotate(LinguisticsAnnotator.lowerCaseTermAnnotation(gramString, gramString)); lastPosition = gram.getStart() + gram.getCodePointCount(); } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java index 5c1bf0813c4..684bae3bf97 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java @@ -13,7 +13,7 @@ public class AnnotatorConfig implements Cloneable { private Language language; private StemMode stemMode; private boolean removeAccents; - private int maxTermOccurrences; + private int maxTermOccurences; private int maxTokenizeLength; public static final int DEFAULT_MAX_TERM_OCCURRENCES; @@ -29,7 +29,7 @@ public class AnnotatorConfig implements Cloneable { language = Language.ENGLISH; stemMode = StemMode.NONE; removeAccents = false; - maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES; + maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES; maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH; } @@ -37,7 +37,7 @@ public class AnnotatorConfig implements Cloneable { language = rhs.language; stemMode = rhs.stemMode; removeAccents = rhs.removeAccents; - maxTermOccurrences = rhs.maxTermOccurrences; + maxTermOccurences = rhs.maxTermOccurences; maxTokenizeLength = rhs.maxTokenizeLength; } @@ -74,11 +74,11 @@ public class AnnotatorConfig implements Cloneable { } public int getMaxTermOccurrences() { - return maxTermOccurrences; + return maxTermOccurences; } public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) { - this.maxTermOccurrences = maxTermCount; + this.maxTermOccurences = maxTermCount; return this; } @@ -109,7 +109,7 @@ public class AnnotatorConfig implements Cloneable { if (removeAccents != rhs.removeAccents) { return false; } - if (maxTermOccurrences != rhs.maxTermOccurrences) { + if (maxTermOccurences != rhs.maxTermOccurences) { return false; } if (maxTokenizeLength != rhs.maxTokenizeLength) { @@ -121,7 +121,6 @@ public class AnnotatorConfig implements Cloneable { @Override public int hashCode() { return getClass().hashCode() + language.hashCode() + stemMode.hashCode() + - Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength; + Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength; } - } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java index 52cd8a8ff54..191d067effe 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java @@ -34,8 +34,8 @@ public class LinguisticsAnnotator { final Map termOccurrences = new HashMap<>(); final int maxOccurrences; - public TermOccurrences(int maxOccurrences) { - this.maxOccurrences = maxOccurrences; + public TermOccurrences(int maxOccurences) { + this.maxOccurrences = maxOccurences; } boolean termCountBelowLimit(String term) { @@ -86,23 +86,24 @@ public class LinguisticsAnnotator { } /** - * Creates a TERM annotation which has the term as annotation (only) if it is different from the + * Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the * original. * - * @param term the term - * @param origTerm the original term + * @param termToLowerCase the term to lower case + * @param origTerm the original term * @return the created TERM annotation */ - public static Annotation termAnnotation(String term, String origTerm) { - if (term.equals(origTerm)) + public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) { + String annotationValue = toLowerCase(termToLowerCase); + if (annotationValue.equals(origTerm)) { return new Annotation(AnnotationTypes.TERM); - else - return new Annotation(AnnotationTypes.TERM, new StringFieldValue(term)); + } + return new Annotation(AnnotationTypes.TERM, new StringFieldValue(annotationValue)); } private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences) { if (termOccurrences.termCountBelowLimit(term)) { - here.annotate(termAnnotation(term, orig)); + here.annotate(lowerCaseTermAnnotation(term, orig)); } } @@ -126,24 +127,29 @@ public class LinguisticsAnnotator { } if (mode == StemMode.ALL) { Span where = parent.span((int)token.getOffset(), token.getOrig().length()); - String lowercasedOrig = toLowerCase(token.getOrig()); + addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences); + + String lowercasedTerm = lowercasedOrig; String term = token.getTokenString(); if (term != null) { + lowercasedTerm = toLowerCase(term); + } + if (! lowercasedOrig.equals(lowercasedTerm)) { addAnnotation(where, term, token.getOrig(), termOccurrences); - if ( ! term.equals(lowercasedOrig)) - addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences); } for (int i = 0; i < token.getNumStems(); i++) { String stem = token.getStem(i); - if (! (stem.equals(lowercasedOrig) || stem.equals(term))) + String lowercasedStem = toLowerCase(stem); + if (! (lowercasedOrig.equals(lowercasedStem) || lowercasedTerm.equals(lowercasedStem))) { addAnnotation(where, stem, token.getOrig(), termOccurrences); + } } } else { String term = token.getTokenString(); if (term == null || term.trim().isEmpty()) return; if (termOccurrences.termCountBelowLimit(term)) { - parent.span((int)token.getOffset(), token.getOrig().length()).annotate(termAnnotation(term, token.getOrig())); + parent.span((int)token.getOffset(), token.getOrig().length()).annotate(lowerCaseTermAnnotation(term, token.getOrig())); } } } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java index b4e266ab3eb..bcde8751de8 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java @@ -57,8 +57,8 @@ public class NGramTestCase { new NGramExpression(new SimpleLinguistics(), 3).execute(context); StringFieldValue value = (StringFieldValue)context.getValue(); - assertEquals("Grams are pure annotations - field value is unchanged", - "en gul Bille sang... ", value.getString()); + assertEquals("Grams are pure annotations - field value is unchanged", "en gul Bille sang... ", + value.getString()); SpanTree gramTree = value.getSpanTree(SpanTrees.LINGUISTICS); assertNotNull(gramTree); SpanList grams = (SpanList)gramTree.getRoot(); diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java index a4dbe1fe826..67bff3843ee 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java @@ -19,6 +19,7 @@ import org.junit.Test; import org.mockito.Mockito; import java.util.*; +import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -28,6 +29,8 @@ import static org.junit.Assert.assertTrue; */ public class LinguisticsAnnotatorTestCase { + private static final AnnotatorConfig CONFIG = new AnnotatorConfig(); + @Test public void requireThatAnnotateFailsWithZeroTokens() { assertAnnotations(null, "foo"); @@ -39,7 +42,7 @@ public class LinguisticsAnnotatorTestCase { if (type.isIndexable()) { continue; } - assertAnnotations(null, "foo", token("foo", "bar", type)); + assertAnnotations(null, "foo", newToken("foo", "bar", type)); } } @@ -51,27 +54,7 @@ public class LinguisticsAnnotatorTestCase { if (!type.isIndexable()) { continue; } - assertAnnotations(expected, "foo", token("foo", "bar", type)); - } - } - - @Test - public void requireThatIndexableTokenStringsAreAnnotatedWithModeALL() { - SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); - expected.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla"))); - var span2 = expected.spanList().span(0, 4); - span2.annotate(new Annotation(AnnotationTypes.TERM)); - span2.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("car"))); - var span3 = expected.spanList().span(0, 8); - span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelxes"))); - span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelx"))); - span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("mex"))); - for (TokenType type : TokenType.values()) { - if (!type.isIndexable()) continue; - assertAnnotations(expected, "Tesla cars", new AnnotatorConfig().setStemMode("ALL"), - token("Tesla", "tesla", type), - token("cars", "car", type), - SimpleToken.fromStems("ModelXes", List.of("modelxes", "modelx", "mex"))); + assertAnnotations(expected, "foo", newToken("foo", "bar", type)); } } @@ -80,7 +63,7 @@ public class LinguisticsAnnotatorTestCase { SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); for (TokenType type : TokenType.values()) { - assertAnnotations(expected, "foo", token("foo", "bar", type, true)); + assertAnnotations(expected, "foo", newToken("foo", "bar", type, true)); } } @@ -93,21 +76,21 @@ public class LinguisticsAnnotatorTestCase { if (!specialToken && !type.isIndexable()) { continue; } - assertAnnotations(expected, "foo", token("foo", "foo", type, specialToken)); + assertAnnotations(expected, "foo", newToken("foo", "foo", type, specialToken)); } } } @Test - public void requireThatTermAnnotationsPreserveCasing() { + public void requireThatTermAnnotationsAreLowerCased() { SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS); - expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("BaR"))); + expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); for (boolean specialToken : Arrays.asList(true, false)) { for (TokenType type : TokenType.values()) { if (!specialToken && !type.isIndexable()) { continue; } - assertAnnotations(expected, "foo", token("foo", "BaR", type, specialToken)); + assertAnnotations(expected, "foo", newToken("foo", "BAR", type, specialToken)); } } } @@ -119,11 +102,11 @@ public class LinguisticsAnnotatorTestCase { expected.spanList().span(3, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar"))); expected.spanList().span(6, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz"))); - SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC) - .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) - .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) - .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) - .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); + SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC) + .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) + .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) + .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) + .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); assertAnnotations(expected, "foobarbaz", token); } @@ -133,11 +116,11 @@ public class LinguisticsAnnotatorTestCase { expected.spanList().span(0, 9).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("foobarbaz"))); - SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true) - .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) - .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) - .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) - .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); + SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true) + .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0)) + .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3) + .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3)) + .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6))); assertAnnotations(expected, "foobarbaz", token); } @@ -157,8 +140,7 @@ public class LinguisticsAnnotatorTestCase { continue; } assertAnnotations(expected, "foo", - new AnnotatorConfig(), - newLinguistics(List.of(token("foo", "foo", type, specialToken)), + newLinguistics(List.of(newToken("foo", "foo", type, specialToken)), Collections.singletonMap("foo", "bar"))); } } @@ -172,9 +154,11 @@ public class LinguisticsAnnotatorTestCase { StringFieldValue val = new StringFieldValue("foo"); val.setSpanTree(spanTree); - Linguistics linguistics = newLinguistics(List.of(token("foo", "bar", TokenType.ALPHABETIC, false)), + Linguistics linguistics = newLinguistics(List.of(newToken("foo", "bar", TokenType.ALPHABETIC, false)), Collections.emptyMap()); - assertTrue(new LinguisticsAnnotator(linguistics, new AnnotatorConfig()).annotate(val)); + new LinguisticsAnnotator(linguistics, CONFIG).annotate(val); + + assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val)); assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS)); } @@ -202,7 +186,7 @@ public class LinguisticsAnnotatorTestCase { } @Test - public void requireThatMaxTermOccurrencesIsHonored() { + public void requireThatMaxTermOccurencesIsHonored() { final String inputTerm = "foo"; final String stemmedInputTerm = "bar"; // completely different from // inputTerm for safer test @@ -220,7 +204,7 @@ public class LinguisticsAnnotatorTestCase { StringBuilder input = new StringBuilder(); Token[] tokens = new Token[inputTermOccurence]; for (int i = 0; i < inputTermOccurence; ++i) { - SimpleToken t = token(inputTerm, stemmedInputTerm, type); + SimpleToken t = newToken(inputTerm, stemmedInputTerm, type); t.setOffset(i * paddedInputTerm.length()); tokens[i] = t; input.append(paddedInputTerm); @@ -230,29 +214,28 @@ public class LinguisticsAnnotatorTestCase { } // -------------------------------------------------------------------------------- + // // Utilities + // + // -------------------------------------------------------------------------------- - private static SimpleToken token(String orig, String stem, TokenType type) { - return token(orig, stem, type, false); + private static SimpleToken newToken(String orig, String stem, TokenType type) { + return newToken(orig, stem, type, false); } - private static SimpleToken token(String orig, String stem, TokenType type, boolean specialToken) { + private static SimpleToken newToken(String orig, String stem, TokenType type, boolean specialToken) { return new SimpleToken(orig).setTokenString(stem) .setType(type) .setSpecialToken(specialToken); } private static void assertAnnotations(SpanTree expected, String value, Token... tokens) { - assertAnnotations(expected, value, new AnnotatorConfig(), newLinguistics(Arrays.asList(tokens), Collections.emptyMap())); - } - - private static void assertAnnotations(SpanTree expected, String value, AnnotatorConfig config, Token... tokens) { - assertAnnotations(expected, value, config, newLinguistics(Arrays.asList(tokens), Collections.emptyMap())); + assertAnnotations(expected, value, newLinguistics(Arrays.asList(tokens), Collections.emptyMap())); } - private static void assertAnnotations(SpanTree expected, String str, AnnotatorConfig config, Linguistics linguistics) { + private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) { StringFieldValue val = new StringFieldValue(str); - assertEquals(expected != null, new LinguisticsAnnotator(linguistics, config).annotate(val)); + assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val)); assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS)); } diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java index f0439a21fec..5ad6a382abd 100644 --- a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java +++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java @@ -26,7 +26,6 @@ public class LinguisticsCase { public static String toLowerCase(String in) { // def is picked from http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#toLowerCase%28%29 // Also, at the time of writing, English is the default language for queries - if (in == null) return null; return Lowercase.toLowerCase(in); } diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java index 9178c2d7e09..33f5ee7e4bb 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java +++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java @@ -189,8 +189,9 @@ public class GramSplitter { @Override public boolean equals(Object o) { if (this == o) return true; - if ( ! (o instanceof Gram gram)) return false; + if ( ! (o instanceof Gram)) return false; + Gram gram = (Gram)o; if (codePointCount != gram.codePointCount) return false; if (start != gram.start) return false; return true; diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java index 809e9b8d133..6cc68c7ac14 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java @@ -15,48 +15,35 @@ import java.util.Objects; public class SimpleToken implements Token { private final List components = new ArrayList<>(); - private final String original; + private final String orig; private TokenType type = TokenType.UNKNOWN; private TokenScript script = TokenScript.UNKNOWN; private String tokenString; - private List stems = null; // Any additional stems after tokenString private boolean specialToken = false; private long offset = 0; - public SimpleToken(String original) { - this(original, (String)null); + public SimpleToken(String orig) { + this(orig, null); } - public SimpleToken(String original, String tokenString) { - this.original = original; + public SimpleToken(String orig, String tokenString) { + this.orig = orig; this.tokenString = tokenString; } - /** Exposed as fromStems */ - private SimpleToken(String original, List stems) { - this.type = TokenType.ALPHABETIC; // Only type which may have stems - this.original = original; - this.tokenString = stems.get(0); - this.stems = List.copyOf(stems.subList(1, stems.size())); - } - @Override public String getOrig() { - return original; + return orig; } @Override public int getNumStems() { - return (tokenString != null ? 1 : 0) + (stems != null ? stems.size() : 0); + return tokenString != null ? 1 : 0; } @Override public String getStem(int i) { - if (i == 0) - return tokenString; - if (stems != null && i-1 < stems.size()) - return stems.get(i-1); - return tokenString; // TODO Vespa 9: throw new IllegalArgumentException() instead + return tokenString; } @Override @@ -144,12 +131,12 @@ public class SimpleToken implements Token { @Override public int hashCode() { - return original.hashCode(); + return orig.hashCode(); } @Override public String toString() { - return "token '" + original + "'"; + return "token '" + orig + "'"; } public String toDetailString() { @@ -184,8 +171,4 @@ public class SimpleToken implements Token { return getType().isIndexable() && (getOrig().length() > 0); } - public static SimpleToken fromStems(String original, List stems) { - return new SimpleToken(original, stems); - } - } diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index b72d2bd6d37..98a84a48095 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -106,7 +106,7 @@ public class SimpleTokenizer implements Tokenizer { String oldToken = token; token = stemmer.stem(token); String newToken = token; - log.log(Level.FINEST, () -> "stem '" + oldToken + "' to '" + newToken + "'"); + log.log(Level.FINEST, () -> "stem '" + oldToken+"' to '" + newToken+"'"); } String result = token; log.log(Level.FINEST, () -> "processed token is: " + result); -- cgit v1.2.3