about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Harald Musum <musum@vespa.ai> 2023-11-13 21:34:45 +0100
committer GitHub <noreply@github.com> 2023-11-13 21:34:45 +0100
commit ef5be496bc4857c5923f566251dd527873b248bf (patch)
tree 657d51a4166d3f7cf40e04f0a5972f11d0261afd
parent 944d635d00e165166508ef23399e9ed65a87a9c8 (diff)
Revert "Bratseth/casing take 2"
-rw-r--r-- document/src/main/java/com/yahoo/document/annotation/Annotation.java | 9
-rw-r--r-- document/src/main/java/com/yahoo/document/annotation/AnnotationContainer.java | 6
-rw-r--r-- document/src/main/java/com/yahoo/document/annotation/AnnotationType2AnnotationContainer.java | 5
-rw-r--r-- document/src/main/java/com/yahoo/document/annotation/IteratingAnnotationContainer.java | 17
-rw-r--r-- document/src/main/java/com/yahoo/document/annotation/ListAnnotationContainer.java | 12
-rw-r--r-- document/src/main/java/com/yahoo/document/annotation/SpanList.java | 32
-rw-r--r-- document/src/main/java/com/yahoo/document/annotation/SpanNode2AnnotationContainer.java | 1
-rw-r--r-- document/src/main/java/com/yahoo/document/annotation/SpanTree.java | 14
-rw-r--r-- document/src/main/java/com/yahoo/document/datatypes/Struct.java | 3
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java | 7
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java | 15
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java | 36
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java | 4
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java | 89
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java | 1
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java | 3
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java | 37
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java | 2
18 files changed, 136 insertions, 157 deletions
diff --git a/document/src/main/java/com/yahoo/document/annotation/Annotation.java b/document/src/main/java/com/yahoo/document/annotation/Annotation.java
index 237ca6db58b..3d9300550ff 100644
--- a/document/src/main/java/com/yahoo/document/annotation/Annotation.java
+++ b/document/src/main/java/com/yahoo/document/annotation/Annotation.java
@@ -129,7 +129,7 @@ public class Annotation implements Comparable<Annotation> {
}
/**
- * WARNING! Should only be used by deserializers! Sets the span node that this annotation points to.
+ * WARNING! Should only be used by deserializers!&nbsp;Sets the span node that this annotation points to.
*
* @param spanNode the span node that this annotation shall point to.
*/
@@ -221,9 +221,10 @@ public class Annotation implements Comparable<Annotation> {
@Override
public String toString() {
- return type + " annotation " +
- ((value == null) ? " (no value)" : " (with value)") +
- ((spanNode == null) ? " (no span)" : (" with span "+spanNode));
+ String retval = "annotation of type " + type;
+ retval += ((value == null) ? " (no value)" : " (with value)");
+ retval += ((spanNode == null) ? " (no span)" : (" with span "+spanNode));
+ return retval;
}
diff --git a/document/src/main/java/com/yahoo/document/annotation/AnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/AnnotationContainer.java
index ac2e6aefa1b..6e2b986a478 100644
--- a/document/src/main/java/com/yahoo/document/annotation/AnnotationContainer.java
+++ b/document/src/main/java/com/yahoo/document/annotation/AnnotationContainer.java
@@ -23,7 +23,11 @@ abstract class AnnotationContainer {
*/
abstract void annotate(Annotation annotation);
- /** Returns a mutable collection of the annotations in this. */
+ /**
+ * Returns a mutable collection of annotations.
+ *
+ * @return a mutable collection of annotations.
+ */
abstract Collection<Annotation> annotations();
/**
diff --git a/document/src/main/java/com/yahoo/document/annotation/AnnotationType2AnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/AnnotationType2AnnotationContainer.java
index 01b8f990bb4..d8709baa3a1 100644
--- a/document/src/main/java/com/yahoo/document/annotation/AnnotationType2AnnotationContainer.java
+++ b/document/src/main/java/com/yahoo/document/annotation/AnnotationType2AnnotationContainer.java
@@ -16,7 +16,6 @@ import java.util.NoSuchElementException;
*/
// TODO: Should this be removed?
public class AnnotationType2AnnotationContainer extends IteratingAnnotationContainer {
-
private final Multimap<AnnotationType, Annotation> annotationType2Annotation = Multimaps.newMultimap(new IdentityHashMap<>(), ArrayList::new);
@Override
@@ -32,6 +31,7 @@ public class AnnotationType2AnnotationContainer extends IteratingAnnotationConta
}
@Override
+ @SuppressWarnings("unchecked")
Collection<Annotation> annotations() {
return annotationType2Annotation.values();
}
@@ -56,12 +56,12 @@ public class AnnotationType2AnnotationContainer extends IteratingAnnotationConta
}
private class NonRecursiveIterator implements Iterator<Annotation> {
-
private final IdentityHashMap<SpanNode, SpanNode> nodes;
private final Iterator<Annotation> annotationIt;
private Annotation next = null;
private boolean nextCalled;
+ @SuppressWarnings("unchecked")
public NonRecursiveIterator(IdentityHashMap<SpanNode, SpanNode> nodes) {
this.nodes = nodes;
this.annotationIt = annotationType2Annotation.values().iterator();
@@ -106,5 +106,4 @@ public class AnnotationType2AnnotationContainer extends IteratingAnnotationConta
nextCalled = false;
}
}
-
}
diff --git a/document/src/main/java/com/yahoo/document/annotation/IteratingAnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/IteratingAnnotationContainer.java
index c7c3545fc09..54af80e6e43 100644
--- a/document/src/main/java/com/yahoo/document/annotation/IteratingAnnotationContainer.java
+++ b/document/src/main/java/com/yahoo/document/annotation/IteratingAnnotationContainer.java
@@ -5,29 +5,30 @@ import java.util.IdentityHashMap;
import java.util.Iterator;
/**
- * @author Einar M R Rosenvinge
+ * @author <a href="mailto:einarmr@yahoo-inc.com">Einar M R Rosenvinge</a>
*/
abstract class IteratingAnnotationContainer extends AnnotationContainer {
@Override
Iterator<Annotation> iterator(SpanNode node) {
- IdentityHashMap<SpanNode, SpanNode> nodes = new IdentityHashMap<>();
+ IdentityHashMap<SpanNode, SpanNode> nodes = new IdentityHashMap<SpanNode, SpanNode>();
nodes.put(node, node);
return iterator(nodes);
}
@Override
Iterator<Annotation> iteratorRecursive(SpanNode node) {
- IdentityHashMap<SpanNode, SpanNode> nodes = new IdentityHashMap<>();
+ IdentityHashMap<SpanNode, SpanNode> nodes = new IdentityHashMap<SpanNode, SpanNode>();
nodes.put(node, node);
- Iterator<SpanNode> childrenIt = node.childIteratorRecursive();
- while (childrenIt.hasNext()) {
- SpanNode child = childrenIt.next();
- nodes.put(child, child);
+ {
+ Iterator<SpanNode> childrenIt = node.childIteratorRecursive();
+ while (childrenIt.hasNext()) {
+ SpanNode child = childrenIt.next();
+ nodes.put(child, child);
+ }
}
return iterator(nodes);
}
abstract Iterator<Annotation> iterator(IdentityHashMap<SpanNode, SpanNode> nodes);
-
}
diff --git a/document/src/main/java/com/yahoo/document/annotation/ListAnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/ListAnnotationContainer.java
index b8d8cd692d8..c2c22558a32 100644
--- a/document/src/main/java/com/yahoo/document/annotation/ListAnnotationContainer.java
+++ b/document/src/main/java/com/yahoo/document/annotation/ListAnnotationContainer.java
@@ -13,7 +13,6 @@ import java.util.NoSuchElementException;
* @author Einar M R Rosenvinge
*/
public class ListAnnotationContainer extends IteratingAnnotationContainer {
-
private final List<Annotation> annotations = new LinkedList<>();
@Override
@@ -39,8 +38,9 @@ public class ListAnnotationContainer extends IteratingAnnotationContainer {
@Override
public boolean equals(Object o) {
if (this == o) return true;
- if (!(o instanceof ListAnnotationContainer other)) return false;
- if (!annotations.equals(other.annotations)) return false;
+ if (!(o instanceof ListAnnotationContainer)) return false;
+ ListAnnotationContainer that = (ListAnnotationContainer) o;
+ if (!annotations.equals(that.annotations)) return false;
return true;
}
@@ -50,9 +50,8 @@ public class ListAnnotationContainer extends IteratingAnnotationContainer {
}
private class AnnotationIterator implements Iterator<Annotation> {
-
- private final IdentityHashMap<SpanNode, SpanNode> nodes;
- private final PeekableListIterator<Annotation> base;
+ private IdentityHashMap<SpanNode, SpanNode> nodes;
+ private PeekableListIterator<Annotation> base;
private boolean nextCalled = false;
AnnotationIterator(ListIterator<Annotation> baseIt, IdentityHashMap<SpanNode, SpanNode> nodes) {
@@ -92,5 +91,4 @@ public class ListAnnotationContainer extends IteratingAnnotationContainer {
nextCalled = false;
}
}
-
}
diff --git a/document/src/main/java/com/yahoo/document/annotation/SpanList.java b/document/src/main/java/com/yahoo/document/annotation/SpanList.java
index 04bdf5c892e..9530689c3f1 100644
--- a/document/src/main/java/com/yahoo/document/annotation/SpanList.java
+++ b/document/src/main/java/com/yahoo/document/annotation/SpanList.java
@@ -11,14 +11,13 @@ import java.util.ListIterator;
/**
* A node in a Span tree that can have child nodes.
*
- * @author Einar M R Rosenvinge
+ * @author <a href="mailto:einarmr@yahoo-inc.com">Einar M R Rosenvinge</a>
*/
public class SpanList extends SpanNode {
-
public static final byte ID = 2;
private final List<SpanNode> children;
- private int cachedFrom = Integer.MIN_VALUE; // triggers calculateFrom()
- private int cachedTo = Integer.MIN_VALUE; // triggers calculateTo()
+ private int cachedFrom = Integer.MIN_VALUE; //triggers calculateFrom()
+ private int cachedTo = Integer.MIN_VALUE; //triggers calculateTo()
/** Creates a new SpanList. */
public SpanList() {
@@ -40,7 +39,7 @@ public class SpanList extends SpanNode {
* @param other the SpanList to copy.
*/
public SpanList(SpanList other) {
- this.children = new LinkedList<>();
+ this.children = new LinkedList<SpanNode>();
for (SpanNode otherNode : other.children) {
if (otherNode instanceof Span) {
children.add(new Span((Span) otherNode));
@@ -87,15 +86,15 @@ public class SpanList extends SpanNode {
/** Create a span, add it to this list and return it */
public Span span(int from, int length) {
- Span span = new Span(from, length);
+ Span span=new Span(from,length);
add(span);
return span;
}
void setInvalid() {
- // invalidate ourselves:
+ //invalidate ourselves:
super.setInvalid();
- // invalidate all our children:
+ //invalidate all our children:
for (SpanNode node : children()) {
node.setInvalid();
}
@@ -214,12 +213,20 @@ public class SpanList extends SpanNode {
return this;
}
- /** Returns a modifiable list of the immediate children of this SpanList. */
+ /**
+ * Returns a <strong>modifiable</strong> list of the immediate children of this SpanList.
+ *
+ * @return a <strong>modifiable</strong> list of the immediate children of this SpanList.
+ */
protected List<SpanNode> children() {
return children;
}
- /** Returns the number of children this SpanList holds. */
+ /**
+ * Returns the number of children this SpanList holds.
+ *
+ * @return the number of children this SpanList holds.
+ */
public int numChildren() {
return children().size();
}
@@ -387,9 +394,11 @@ public class SpanList extends SpanNode {
@Override
public boolean equals(Object o) {
if (this == o) return true;
- if (!(o instanceof SpanList spanList)) return false;
+ if (!(o instanceof SpanList)) return false;
if (!super.equals(o)) return false;
+ SpanList spanList = (SpanList) o;
+
if (children() != null ? !children().equals(spanList.children()) : spanList.children() != null) return false;
return true;
@@ -406,5 +415,4 @@ public class SpanList extends SpanNode {
public String toString() {
return "SpanList with " + children().size() + " children";
}
-
}
diff --git a/document/src/main/java/com/yahoo/document/annotation/SpanNode2AnnotationContainer.java b/document/src/main/java/com/yahoo/document/annotation/SpanNode2AnnotationContainer.java
index f563fd0d6cd..a66639b3bfd 100644
--- a/document/src/main/java/com/yahoo/document/annotation/SpanNode2AnnotationContainer.java
+++ b/document/src/main/java/com/yahoo/document/annotation/SpanNode2AnnotationContainer.java
@@ -18,7 +18,6 @@ import java.util.List;
* @author Einar M R Rosenvinge
*/
class SpanNode2AnnotationContainer extends AnnotationContainer {
-
private final Multimap<SpanNode, Annotation> spanNode2Annotation = Multimaps.newMultimap(new IdentityHashMap<>(), ArrayList::new);
@Override
diff --git a/document/src/main/java/com/yahoo/document/annotation/SpanTree.java b/document/src/main/java/com/yahoo/document/annotation/SpanTree.java
index 2faf17ce428..f785cf3b3ec 100644
--- a/document/src/main/java/com/yahoo/document/annotation/SpanTree.java
+++ b/document/src/main/java/com/yahoo/document/annotation/SpanTree.java
@@ -19,7 +19,6 @@ import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
-import java.util.stream.Collectors;
/**
* A SpanTree holds a root node of a tree of SpanNodes, and a List of Annotations pointing to these nodes
@@ -432,7 +431,7 @@ public class SpanTree implements Iterable<Annotation>, SpanNodeParent, Comparabl
}
/**
- * Adds an Annotation to the internal list of annotations for this SpanTree. Use this when
+ * Adds an Annotation to the internal list of annotations for this SpanTree.&nbsp;Use this when
* adding an Annotation that shall annotate a SpanNode. Upon return, Annotation.getSpanNode()
* returns the given node.
*
@@ -447,7 +446,7 @@ public class SpanTree implements Iterable<Annotation>, SpanNodeParent, Comparabl
}
/**
- * Adds an Annotation to the internal list of annotations for this SpanTree. Use this when
+ * Adds an Annotation to the internal list of annotations for this SpanTree.&nbsp;Use this when
* adding an Annotation that shall annotate a SpanNode. Upon return, Annotation.getSpanNode()
* returns the given node. This one is unchecked and assumes that the SpanNode is valid and has
* already been attached to the Annotation.
@@ -539,7 +538,10 @@ public class SpanTree implements Iterable<Annotation>, SpanNodeParent, Comparabl
}
}
- /** Returns an Iterator over all annotations in this tree. Note that the iteration order is non-deterministic. */
+ /**
+ * Returns an Iterator over all annotations in this tree.&nbsp;Note that the iteration order is non-deterministic.
+ * @return an Iterator over all annotations in this tree.
+ */
public Iterator<Annotation> iterator() {
return annotations.annotations().iterator();
}
@@ -639,9 +641,7 @@ public class SpanTree implements Iterable<Annotation>, SpanNodeParent, Comparabl
@Override
public String toString() {
- return "SpanTree '" + name + "' with root: " + root +
- ( annotations.annotations().size() > 5 ? "" :
- ", annotations: " + annotations.annotations().stream().map(Annotation::toString).collect(Collectors.joining(", ")));
+ return "SpanTree '" + name + "'";
}
@Override
diff --git a/document/src/main/java/com/yahoo/document/datatypes/Struct.java b/document/src/main/java/com/yahoo/document/datatypes/Struct.java
index bb54b41069b..0b1bbf5d3ca 100644
--- a/document/src/main/java/com/yahoo/document/datatypes/Struct.java
+++ b/document/src/main/java/com/yahoo/document/datatypes/Struct.java
@@ -218,7 +218,8 @@ public class Struct extends StructuredFieldValue {
StringBuilder retVal = new StringBuilder();
retVal.append("Struct (").append(getDataType()).append("): ");
int [] increasing = getInOrder();
- for (int id : increasing) {
+ for (int i = 0; i < increasing.length; i++) {
+ int id = increasing[i];
retVal.append(getDataType().getField(id)).append("=").append(values.get(id)).append(", ");
}
return retVal.toString();
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
index fdfadf65400..26058eeb8f3 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
@@ -15,8 +15,6 @@ import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator;
import java.util.Iterator;
-import static com.yahoo.language.LinguisticsCase.toLowerCase;
-
/**
* A filter which splits incoming text into n-grams.
*
@@ -70,9 +68,8 @@ public final class NGramExpression extends Expression {
// annotate gram as a word term
String gramString = gram.extractFrom(output.getString());
- typedSpan(gram.getStart(),
- gram.getCodePointCount(),
- TokenType.ALPHABETIC, spanList).annotate(LinguisticsAnnotator.termAnnotation(toLowerCase(gramString), gramString));
+ typedSpan(gram.getStart(), gram.getCodePointCount(), TokenType.ALPHABETIC, spanList).
+ annotate(LinguisticsAnnotator.lowerCaseTermAnnotation(gramString, gramString));
lastPosition = gram.getStart() + gram.getCodePointCount();
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
index 5c1bf0813c4..684bae3bf97 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
@@ -13,7 +13,7 @@ public class AnnotatorConfig implements Cloneable {
private Language language;
private StemMode stemMode;
private boolean removeAccents;
- private int maxTermOccurrences;
+ private int maxTermOccurences;
private int maxTokenizeLength;
public static final int DEFAULT_MAX_TERM_OCCURRENCES;
@@ -29,7 +29,7 @@ public class AnnotatorConfig implements Cloneable {
language = Language.ENGLISH;
stemMode = StemMode.NONE;
removeAccents = false;
- maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES;
+ maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES;
maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH;
}
@@ -37,7 +37,7 @@ public class AnnotatorConfig implements Cloneable {
language = rhs.language;
stemMode = rhs.stemMode;
removeAccents = rhs.removeAccents;
- maxTermOccurrences = rhs.maxTermOccurrences;
+ maxTermOccurences = rhs.maxTermOccurences;
maxTokenizeLength = rhs.maxTokenizeLength;
}
@@ -74,11 +74,11 @@ public class AnnotatorConfig implements Cloneable {
}
public int getMaxTermOccurrences() {
- return maxTermOccurrences;
+ return maxTermOccurences;
}
public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) {
- this.maxTermOccurrences = maxTermCount;
+ this.maxTermOccurences = maxTermCount;
return this;
}
@@ -109,7 +109,7 @@ public class AnnotatorConfig implements Cloneable {
if (removeAccents != rhs.removeAccents) {
return false;
}
- if (maxTermOccurrences != rhs.maxTermOccurrences) {
+ if (maxTermOccurences != rhs.maxTermOccurences) {
return false;
}
if (maxTokenizeLength != rhs.maxTokenizeLength) {
@@ -121,7 +121,6 @@ public class AnnotatorConfig implements Cloneable {
@Override
public int hashCode() {
return getClass().hashCode() + language.hashCode() + stemMode.hashCode() +
- Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength;
+ Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength;
}
-
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 52cd8a8ff54..191d067effe 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -34,8 +34,8 @@ public class LinguisticsAnnotator {
final Map<String, Integer> termOccurrences = new HashMap<>();
final int maxOccurrences;
- public TermOccurrences(int maxOccurrences) {
- this.maxOccurrences = maxOccurrences;
+ public TermOccurrences(int maxOccurences) {
+ this.maxOccurrences = maxOccurences;
}
boolean termCountBelowLimit(String term) {
@@ -86,23 +86,24 @@ public class LinguisticsAnnotator {
}
/**
- * Creates a TERM annotation which has the term as annotation (only) if it is different from the
+ * Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the
* original.
*
- * @param term the term
- * @param origTerm the original term
+ * @param termToLowerCase the term to lower case
+ * @param origTerm the original term
* @return the created TERM annotation
*/
- public static Annotation termAnnotation(String term, String origTerm) {
- if (term.equals(origTerm))
+ public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) {
+ String annotationValue = toLowerCase(termToLowerCase);
+ if (annotationValue.equals(origTerm)) {
return new Annotation(AnnotationTypes.TERM);
- else
- return new Annotation(AnnotationTypes.TERM, new StringFieldValue(term));
+ }
+ return new Annotation(AnnotationTypes.TERM, new StringFieldValue(annotationValue));
}
private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences) {
if (termOccurrences.termCountBelowLimit(term)) {
- here.annotate(termAnnotation(term, orig));
+ here.annotate(lowerCaseTermAnnotation(term, orig));
}
}
@@ -126,24 +127,29 @@ public class LinguisticsAnnotator {
}
if (mode == StemMode.ALL) {
Span where = parent.span((int)token.getOffset(), token.getOrig().length());
-
String lowercasedOrig = toLowerCase(token.getOrig());
+ addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);
+
+ String lowercasedTerm = lowercasedOrig;
String term = token.getTokenString();
if (term != null) {
+ lowercasedTerm = toLowerCase(term);
+ }
+ if (! lowercasedOrig.equals(lowercasedTerm)) {
addAnnotation(where, term, token.getOrig(), termOccurrences);
- if ( ! term.equals(lowercasedOrig))
- addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);
}
for (int i = 0; i < token.getNumStems(); i++) {
String stem = token.getStem(i);
- if (! (stem.equals(lowercasedOrig) || stem.equals(term)))
+ String lowercasedStem = toLowerCase(stem);
+ if (! (lowercasedOrig.equals(lowercasedStem) || lowercasedTerm.equals(lowercasedStem))) {
addAnnotation(where, stem, token.getOrig(), termOccurrences);
+ }
}
} else {
String term = token.getTokenString();
if (term == null || term.trim().isEmpty()) return;
if (termOccurrences.termCountBelowLimit(term)) {
- parent.span((int)token.getOffset(), token.getOrig().length()).annotate(termAnnotation(term, token.getOrig()));
+ parent.span((int)token.getOffset(), token.getOrig().length()).annotate(lowerCaseTermAnnotation(term, token.getOrig()));
}
}
}
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java
index b4e266ab3eb..bcde8751de8 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java
@@ -57,8 +57,8 @@ public class NGramTestCase {
new NGramExpression(new SimpleLinguistics(), 3).execute(context);
StringFieldValue value = (StringFieldValue)context.getValue();
- assertEquals("Grams are pure annotations - field value is unchanged",
- "en gul Bille sang... ", value.getString());
+ assertEquals("Grams are pure annotations - field value is unchanged", "en gul Bille sang... ",
+ value.getString());
SpanTree gramTree = value.getSpanTree(SpanTrees.LINGUISTICS);
assertNotNull(gramTree);
SpanList grams = (SpanList)gramTree.getRoot();
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index a4dbe1fe826..67bff3843ee 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -19,6 +19,7 @@ import org.junit.Test;
import org.mockito.Mockito;
import java.util.*;
+import java.util.stream.Collectors;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
@@ -28,6 +29,8 @@ import static org.junit.Assert.assertTrue;
*/
public class LinguisticsAnnotatorTestCase {
+ private static final AnnotatorConfig CONFIG = new AnnotatorConfig();
+
@Test
public void requireThatAnnotateFailsWithZeroTokens() {
assertAnnotations(null, "foo");
@@ -39,7 +42,7 @@ public class LinguisticsAnnotatorTestCase {
if (type.isIndexable()) {
continue;
}
- assertAnnotations(null, "foo", token("foo", "bar", type));
+ assertAnnotations(null, "foo", newToken("foo", "bar", type));
}
}
@@ -51,27 +54,7 @@ public class LinguisticsAnnotatorTestCase {
if (!type.isIndexable()) {
continue;
}
- assertAnnotations(expected, "foo", token("foo", "bar", type));
- }
- }
-
- @Test
- public void requireThatIndexableTokenStringsAreAnnotatedWithModeALL() {
- SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla")));
- var span2 = expected.spanList().span(0, 4);
- span2.annotate(new Annotation(AnnotationTypes.TERM));
- span2.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("car")));
- var span3 = expected.spanList().span(0, 8);
- span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelxes")));
- span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelx")));
- span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("mex")));
- for (TokenType type : TokenType.values()) {
- if (!type.isIndexable()) continue;
- assertAnnotations(expected, "Tesla cars", new AnnotatorConfig().setStemMode("ALL"),
- token("Tesla", "tesla", type),
- token("cars", "car", type),
- SimpleToken.fromStems("ModelXes", List.of("modelxes", "modelx", "mex")));
+ assertAnnotations(expected, "foo", newToken("foo", "bar", type));
}
}
@@ -80,7 +63,7 @@ public class LinguisticsAnnotatorTestCase {
SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
for (TokenType type : TokenType.values()) {
- assertAnnotations(expected, "foo", token("foo", "bar", type, true));
+ assertAnnotations(expected, "foo", newToken("foo", "bar", type, true));
}
}
@@ -93,21 +76,21 @@ public class LinguisticsAnnotatorTestCase {
if (!specialToken && !type.isIndexable()) {
continue;
}
- assertAnnotations(expected, "foo", token("foo", "foo", type, specialToken));
+ assertAnnotations(expected, "foo", newToken("foo", "foo", type, specialToken));
}
}
}
@Test
- public void requireThatTermAnnotationsPreserveCasing() {
+ public void requireThatTermAnnotationsAreLowerCased() {
SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("BaR")));
+ expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
for (boolean specialToken : Arrays.asList(true, false)) {
for (TokenType type : TokenType.values()) {
if (!specialToken && !type.isIndexable()) {
continue;
}
- assertAnnotations(expected, "foo", token("foo", "BaR", type, specialToken));
+ assertAnnotations(expected, "foo", newToken("foo", "BAR", type, specialToken));
}
}
}
@@ -119,11 +102,11 @@ public class LinguisticsAnnotatorTestCase {
expected.spanList().span(3, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
expected.spanList().span(6, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz")));
- SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
- .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
- .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
- .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
- .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
+ SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
+ .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
+ .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
+ .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
+ .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
assertAnnotations(expected, "foobarbaz", token);
}
@@ -133,11 +116,11 @@ public class LinguisticsAnnotatorTestCase {
expected.spanList().span(0, 9).annotate(new Annotation(AnnotationTypes.TERM,
new StringFieldValue("foobarbaz")));
- SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true)
- .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
- .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
- .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
- .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
+ SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true)
+ .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
+ .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
+ .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
+ .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
assertAnnotations(expected, "foobarbaz", token);
}
@@ -157,8 +140,7 @@ public class LinguisticsAnnotatorTestCase {
continue;
}
assertAnnotations(expected, "foo",
- new AnnotatorConfig(),
- newLinguistics(List.of(token("foo", "foo", type, specialToken)),
+ newLinguistics(List.of(newToken("foo", "foo", type, specialToken)),
Collections.singletonMap("foo", "bar")));
}
}
@@ -172,9 +154,11 @@ public class LinguisticsAnnotatorTestCase {
StringFieldValue val = new StringFieldValue("foo");
val.setSpanTree(spanTree);
- Linguistics linguistics = newLinguistics(List.of(token("foo", "bar", TokenType.ALPHABETIC, false)),
+ Linguistics linguistics = newLinguistics(List.of(newToken("foo", "bar", TokenType.ALPHABETIC, false)),
Collections.<String, String>emptyMap());
- assertTrue(new LinguisticsAnnotator(linguistics, new AnnotatorConfig()).annotate(val));
+ new LinguisticsAnnotator(linguistics, CONFIG).annotate(val);
+
+ assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS));
}
@@ -202,7 +186,7 @@ public class LinguisticsAnnotatorTestCase {
}
@Test
- public void requireThatMaxTermOccurrencesIsHonored() {
+ public void requireThatMaxTermOccurencesIsHonored() {
final String inputTerm = "foo";
final String stemmedInputTerm = "bar"; // completely different from
// inputTerm for safer test
@@ -220,7 +204,7 @@ public class LinguisticsAnnotatorTestCase {
StringBuilder input = new StringBuilder();
Token[] tokens = new Token[inputTermOccurence];
for (int i = 0; i < inputTermOccurence; ++i) {
- SimpleToken t = token(inputTerm, stemmedInputTerm, type);
+ SimpleToken t = newToken(inputTerm, stemmedInputTerm, type);
t.setOffset(i * paddedInputTerm.length());
tokens[i] = t;
input.append(paddedInputTerm);
@@ -230,29 +214,28 @@ public class LinguisticsAnnotatorTestCase {
}
// --------------------------------------------------------------------------------
+ //
// Utilities
+ //
+ // --------------------------------------------------------------------------------
- private static SimpleToken token(String orig, String stem, TokenType type) {
- return token(orig, stem, type, false);
+ private static SimpleToken newToken(String orig, String stem, TokenType type) {
+ return newToken(orig, stem, type, false);
}
- private static SimpleToken token(String orig, String stem, TokenType type, boolean specialToken) {
+ private static SimpleToken newToken(String orig, String stem, TokenType type, boolean specialToken) {
return new SimpleToken(orig).setTokenString(stem)
.setType(type)
.setSpecialToken(specialToken);
}
private static void assertAnnotations(SpanTree expected, String value, Token... tokens) {
- assertAnnotations(expected, value, new AnnotatorConfig(), newLinguistics(Arrays.asList(tokens), Collections.emptyMap()));
- }
-
- private static void assertAnnotations(SpanTree expected, String value, AnnotatorConfig config, Token... tokens) {
- assertAnnotations(expected, value, config, newLinguistics(Arrays.asList(tokens), Collections.emptyMap()));
+ assertAnnotations(expected, value, newLinguistics(Arrays.asList(tokens), Collections.<String, String>emptyMap()));
}
- private static void assertAnnotations(SpanTree expected, String str, AnnotatorConfig config, Linguistics linguistics) {
+ private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) {
StringFieldValue val = new StringFieldValue(str);
- assertEquals(expected != null, new LinguisticsAnnotator(linguistics, config).annotate(val));
+ assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS));
}
diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
index f0439a21fec..5ad6a382abd 100644
--- a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
+++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
@@ -26,7 +26,6 @@ public class LinguisticsCase {
public static String toLowerCase(String in) {
// def is picked from http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#toLowerCase%28%29
// Also, at the time of writing, English is the default language for queries
- if (in == null) return null;
return Lowercase.toLowerCase(in);
}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
index 9178c2d7e09..33f5ee7e4bb 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
@@ -189,8 +189,9 @@ public class GramSplitter {
@Override
public boolean equals(Object o) {
if (this == o) return true;
- if ( ! (o instanceof Gram gram)) return false;
+ if ( ! (o instanceof Gram)) return false;
+ Gram gram = (Gram)o;
if (codePointCount != gram.codePointCount) return false;
if (start != gram.start) return false;
return true;
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
index 809e9b8d133..6cc68c7ac14 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
@@ -15,48 +15,35 @@ import java.util.Objects;
public class SimpleToken implements Token {
private final List<Token> components = new ArrayList<>();
- private final String original;
+ private final String orig;
private TokenType type = TokenType.UNKNOWN;
private TokenScript script = TokenScript.UNKNOWN;
private String tokenString;
- private List<String> stems = null; // Any additional stems after tokenString
private boolean specialToken = false;
private long offset = 0;
- public SimpleToken(String original) {
- this(original, (String)null);
+ public SimpleToken(String orig) {
+ this(orig, null);
}
- public SimpleToken(String original, String tokenString) {
- this.original = original;
+ public SimpleToken(String orig, String tokenString) {
+ this.orig = orig;
this.tokenString = tokenString;
}
- /** Exposed as fromStems */
- private SimpleToken(String original, List<String> stems) {
- this.type = TokenType.ALPHABETIC; // Only type which may have stems
- this.original = original;
- this.tokenString = stems.get(0);
- this.stems = List.copyOf(stems.subList(1, stems.size()));
- }
-
@Override
public String getOrig() {
- return original;
+ return orig;
}
@Override
public int getNumStems() {
- return (tokenString != null ? 1 : 0) + (stems != null ? stems.size() : 0);
+ return tokenString != null ? 1 : 0;
}
@Override
public String getStem(int i) {
- if (i == 0)
- return tokenString;
- if (stems != null && i-1 < stems.size())
- return stems.get(i-1);
- return tokenString; // TODO Vespa 9: throw new IllegalArgumentException() instead
+ return tokenString;
}
@Override
@@ -144,12 +131,12 @@ public class SimpleToken implements Token {
@Override
public int hashCode() {
- return original.hashCode();
+ return orig.hashCode();
}
@Override
public String toString() {
- return "token '" + original + "'";
+ return "token '" + orig + "'";
}
public String toDetailString() {
@@ -184,8 +171,4 @@ public class SimpleToken implements Token {
return getType().isIndexable() && (getOrig().length() > 0);
}
- public static SimpleToken fromStems(String original, List<String> stems) {
- return new SimpleToken(original, stems);
- }
-
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index b72d2bd6d37..98a84a48095 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -106,7 +106,7 @@ public class SimpleTokenizer implements Tokenizer {
String oldToken = token;
token = stemmer.stem(token);
String newToken = token;
- log.log(Level.FINEST, () -> "stem '" + oldToken + "' to '" + newToken + "'");
+ log.log(Level.FINEST, () -> "stem '" + oldToken+"' to '" + newToken+"'");
}
String result = token;
log.log(Level.FINEST, () -> "processed token is: " + result);