summary | refs | log | tree | commit | diff | stats
path: root/indexinglanguage
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@vespa.ai> 2023-11-14 11:28:40 +0100
committer: Jon Bratseth <bratseth@vespa.ai> 2023-11-14 11:28:40 +0100
commit: 997896e40f47770b22a81b5ec8281d2e962ec4d9 (patch)
tree: 3784800283871e8f99b5579e3a8e545a11f86df1 /indexinglanguage
parent: 29109450c8c2c98d969a711b8f6240bb5594c150 (diff)
Revert "Merge pull request #29328 from vespa-engine/revert-29314-bratseth/casing-take-2"
This reverts commit a72e949533a46d665440a9c72ca2b8fb58f3a9c3, reversing changes made to 944d635d00e165166508ef23399e9ed65a87a9c8.
Diffstat (limited to 'indexinglanguage')
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java 7
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java 15
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java 36
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java 4
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java 89
5 files changed, 83 insertions, 68 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
index 26058eeb8f3..fdfadf65400 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
@@ -15,6 +15,8 @@ import com.yahoo.vespa.indexinglanguage.linguistics.LinguisticsAnnotator;
import java.util.Iterator;
+import static com.yahoo.language.LinguisticsCase.toLowerCase;
+
/**
* A filter which splits incoming text into n-grams.
*
@@ -68,8 +70,9 @@ public final class NGramExpression extends Expression {
// annotate gram as a word term
String gramString = gram.extractFrom(output.getString());
- typedSpan(gram.getStart(), gram.getCodePointCount(), TokenType.ALPHABETIC, spanList).
- annotate(LinguisticsAnnotator.lowerCaseTermAnnotation(gramString, gramString));
+ typedSpan(gram.getStart(),
+ gram.getCodePointCount(),
+ TokenType.ALPHABETIC, spanList).annotate(LinguisticsAnnotator.termAnnotation(toLowerCase(gramString), gramString));
lastPosition = gram.getStart() + gram.getCodePointCount();
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
index 684bae3bf97..5c1bf0813c4 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
@@ -13,7 +13,7 @@ public class AnnotatorConfig implements Cloneable {
private Language language;
private StemMode stemMode;
private boolean removeAccents;
- private int maxTermOccurences;
+ private int maxTermOccurrences;
private int maxTokenizeLength;
public static final int DEFAULT_MAX_TERM_OCCURRENCES;
@@ -29,7 +29,7 @@ public class AnnotatorConfig implements Cloneable {
language = Language.ENGLISH;
stemMode = StemMode.NONE;
removeAccents = false;
- maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES;
+ maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES;
maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH;
}
@@ -37,7 +37,7 @@ public class AnnotatorConfig implements Cloneable {
language = rhs.language;
stemMode = rhs.stemMode;
removeAccents = rhs.removeAccents;
- maxTermOccurences = rhs.maxTermOccurences;
+ maxTermOccurrences = rhs.maxTermOccurrences;
maxTokenizeLength = rhs.maxTokenizeLength;
}
@@ -74,11 +74,11 @@ public class AnnotatorConfig implements Cloneable {
}
public int getMaxTermOccurrences() {
- return maxTermOccurences;
+ return maxTermOccurrences;
}
public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) {
- this.maxTermOccurences = maxTermCount;
+ this.maxTermOccurrences = maxTermCount;
return this;
}
@@ -109,7 +109,7 @@ public class AnnotatorConfig implements Cloneable {
if (removeAccents != rhs.removeAccents) {
return false;
}
- if (maxTermOccurences != rhs.maxTermOccurences) {
+ if (maxTermOccurrences != rhs.maxTermOccurrences) {
return false;
}
if (maxTokenizeLength != rhs.maxTokenizeLength) {
@@ -121,6 +121,7 @@ public class AnnotatorConfig implements Cloneable {
@Override
public int hashCode() {
return getClass().hashCode() + language.hashCode() + stemMode.hashCode() +
- Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength;
+ Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength;
}
+
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 191d067effe..52cd8a8ff54 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -34,8 +34,8 @@ public class LinguisticsAnnotator {
final Map<String, Integer> termOccurrences = new HashMap<>();
final int maxOccurrences;
- public TermOccurrences(int maxOccurences) {
- this.maxOccurrences = maxOccurences;
+ public TermOccurrences(int maxOccurrences) {
+ this.maxOccurrences = maxOccurrences;
}
boolean termCountBelowLimit(String term) {
@@ -86,24 +86,23 @@ public class LinguisticsAnnotator {
}
/**
- * Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the
+ * Creates a TERM annotation which has the term as annotation (only) if it is different from the
* original.
*
- * @param termToLowerCase the term to lower case
- * @param origTerm the original term
+ * @param term the term
+ * @param origTerm the original term
* @return the created TERM annotation
*/
- public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) {
- String annotationValue = toLowerCase(termToLowerCase);
- if (annotationValue.equals(origTerm)) {
+ public static Annotation termAnnotation(String term, String origTerm) {
+ if (term.equals(origTerm))
return new Annotation(AnnotationTypes.TERM);
- }
- return new Annotation(AnnotationTypes.TERM, new StringFieldValue(annotationValue));
+ else
+ return new Annotation(AnnotationTypes.TERM, new StringFieldValue(term));
}
private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences) {
if (termOccurrences.termCountBelowLimit(term)) {
- here.annotate(lowerCaseTermAnnotation(term, orig));
+ here.annotate(termAnnotation(term, orig));
}
}
@@ -127,29 +126,24 @@ public class LinguisticsAnnotator {
}
if (mode == StemMode.ALL) {
Span where = parent.span((int)token.getOffset(), token.getOrig().length());
- String lowercasedOrig = toLowerCase(token.getOrig());
- addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);
- String lowercasedTerm = lowercasedOrig;
+ String lowercasedOrig = toLowerCase(token.getOrig());
String term = token.getTokenString();
if (term != null) {
- lowercasedTerm = toLowerCase(term);
- }
- if (! lowercasedOrig.equals(lowercasedTerm)) {
addAnnotation(where, term, token.getOrig(), termOccurrences);
+ if ( ! term.equals(lowercasedOrig))
+ addAnnotation(where, token.getOrig(), token.getOrig(), termOccurrences);
}
for (int i = 0; i < token.getNumStems(); i++) {
String stem = token.getStem(i);
- String lowercasedStem = toLowerCase(stem);
- if (! (lowercasedOrig.equals(lowercasedStem) || lowercasedTerm.equals(lowercasedStem))) {
+ if (! (stem.equals(lowercasedOrig) || stem.equals(term)))
addAnnotation(where, stem, token.getOrig(), termOccurrences);
- }
}
} else {
String term = token.getTokenString();
if (term == null || term.trim().isEmpty()) return;
if (termOccurrences.termCountBelowLimit(term)) {
- parent.span((int)token.getOffset(), token.getOrig().length()).annotate(lowerCaseTermAnnotation(term, token.getOrig()));
+ parent.span((int)token.getOffset(), token.getOrig().length()).annotate(termAnnotation(term, token.getOrig()));
}
}
}
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java
index bcde8751de8..b4e266ab3eb 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NGramTestCase.java
@@ -57,8 +57,8 @@ public class NGramTestCase {
new NGramExpression(new SimpleLinguistics(), 3).execute(context);
StringFieldValue value = (StringFieldValue)context.getValue();
- assertEquals("Grams are pure annotations - field value is unchanged", "en gul Bille sang... ",
- value.getString());
+ assertEquals("Grams are pure annotations - field value is unchanged",
+ "en gul Bille sang... ", value.getString());
SpanTree gramTree = value.getSpanTree(SpanTrees.LINGUISTICS);
assertNotNull(gramTree);
SpanList grams = (SpanList)gramTree.getRoot();
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index 67bff3843ee..a4dbe1fe826 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -19,7 +19,6 @@ import org.junit.Test;
import org.mockito.Mockito;
import java.util.*;
-import java.util.stream.Collectors;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
@@ -29,8 +28,6 @@ import static org.junit.Assert.assertTrue;
*/
public class LinguisticsAnnotatorTestCase {
- private static final AnnotatorConfig CONFIG = new AnnotatorConfig();
-
@Test
public void requireThatAnnotateFailsWithZeroTokens() {
assertAnnotations(null, "foo");
@@ -42,7 +39,7 @@ public class LinguisticsAnnotatorTestCase {
if (type.isIndexable()) {
continue;
}
- assertAnnotations(null, "foo", newToken("foo", "bar", type));
+ assertAnnotations(null, "foo", token("foo", "bar", type));
}
}
@@ -54,7 +51,27 @@ public class LinguisticsAnnotatorTestCase {
if (!type.isIndexable()) {
continue;
}
- assertAnnotations(expected, "foo", newToken("foo", "bar", type));
+ assertAnnotations(expected, "foo", token("foo", "bar", type));
+ }
+ }
+
+ @Test
+ public void requireThatIndexableTokenStringsAreAnnotatedWithModeALL() {
+ SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
+ expected.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("tesla")));
+ var span2 = expected.spanList().span(0, 4);
+ span2.annotate(new Annotation(AnnotationTypes.TERM));
+ span2.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("car")));
+ var span3 = expected.spanList().span(0, 8);
+ span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelxes")));
+ span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("modelx")));
+ span3.annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("mex")));
+ for (TokenType type : TokenType.values()) {
+ if (!type.isIndexable()) continue;
+ assertAnnotations(expected, "Tesla cars", new AnnotatorConfig().setStemMode("ALL"),
+ token("Tesla", "tesla", type),
+ token("cars", "car", type),
+ SimpleToken.fromStems("ModelXes", List.of("modelxes", "modelx", "mex")));
}
}
@@ -63,7 +80,7 @@ public class LinguisticsAnnotatorTestCase {
SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
for (TokenType type : TokenType.values()) {
- assertAnnotations(expected, "foo", newToken("foo", "bar", type, true));
+ assertAnnotations(expected, "foo", token("foo", "bar", type, true));
}
}
@@ -76,21 +93,21 @@ public class LinguisticsAnnotatorTestCase {
if (!specialToken && !type.isIndexable()) {
continue;
}
- assertAnnotations(expected, "foo", newToken("foo", "foo", type, specialToken));
+ assertAnnotations(expected, "foo", token("foo", "foo", type, specialToken));
}
}
}
@Test
- public void requireThatTermAnnotationsAreLowerCased() {
+ public void requireThatTermAnnotationsPreserveCasing() {
SpanTree expected = new SpanTree(SpanTrees.LINGUISTICS);
- expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
+ expected.spanList().span(0, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("BaR")));
for (boolean specialToken : Arrays.asList(true, false)) {
for (TokenType type : TokenType.values()) {
if (!specialToken && !type.isIndexable()) {
continue;
}
- assertAnnotations(expected, "foo", newToken("foo", "BAR", type, specialToken));
+ assertAnnotations(expected, "foo", token("foo", "BaR", type, specialToken));
}
}
}
@@ -102,11 +119,11 @@ public class LinguisticsAnnotatorTestCase {
expected.spanList().span(3, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("bar")));
expected.spanList().span(6, 3).annotate(new Annotation(AnnotationTypes.TERM, new StringFieldValue("baz")));
- SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
- .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
- .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
- .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
- .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
+ SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC)
+ .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
+ .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
+ .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
+ .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
assertAnnotations(expected, "foobarbaz", token);
}
@@ -116,11 +133,11 @@ public class LinguisticsAnnotatorTestCase {
expected.spanList().span(0, 9).annotate(new Annotation(AnnotationTypes.TERM,
new StringFieldValue("foobarbaz")));
- SimpleToken token = newToken("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true)
- .addComponent(newToken("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
- .addComponent(newToken("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
- .addComponent(newToken("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
- .addComponent(newToken("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
+ SimpleToken token = token("FOOBARBAZ", "foobarbaz", TokenType.ALPHABETIC).setSpecialToken(true)
+ .addComponent(token("FOO", "foo", TokenType.ALPHABETIC).setOffset(0))
+ .addComponent(token("BARBAZ", "barbaz", TokenType.ALPHABETIC).setOffset(3)
+ .addComponent(token("BAR", "bar", TokenType.ALPHABETIC).setOffset(3))
+ .addComponent(token("BAZ", "baz", TokenType.ALPHABETIC).setOffset(6)));
assertAnnotations(expected, "foobarbaz", token);
}
@@ -140,7 +157,8 @@ public class LinguisticsAnnotatorTestCase {
continue;
}
assertAnnotations(expected, "foo",
- newLinguistics(List.of(newToken("foo", "foo", type, specialToken)),
+ new AnnotatorConfig(),
+ newLinguistics(List.of(token("foo", "foo", type, specialToken)),
Collections.singletonMap("foo", "bar")));
}
}
@@ -154,11 +172,9 @@ public class LinguisticsAnnotatorTestCase {
StringFieldValue val = new StringFieldValue("foo");
val.setSpanTree(spanTree);
- Linguistics linguistics = newLinguistics(List.of(newToken("foo", "bar", TokenType.ALPHABETIC, false)),
+ Linguistics linguistics = newLinguistics(List.of(token("foo", "bar", TokenType.ALPHABETIC, false)),
Collections.<String, String>emptyMap());
- new LinguisticsAnnotator(linguistics, CONFIG).annotate(val);
-
- assertTrue(new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
+ assertTrue(new LinguisticsAnnotator(linguistics, new AnnotatorConfig()).annotate(val));
assertEquals(spanTree, val.getSpanTree(SpanTrees.LINGUISTICS));
}
@@ -186,7 +202,7 @@ public class LinguisticsAnnotatorTestCase {
}
@Test
- public void requireThatMaxTermOccurencesIsHonored() {
+ public void requireThatMaxTermOccurrencesIsHonored() {
final String inputTerm = "foo";
final String stemmedInputTerm = "bar"; // completely different from
// inputTerm for safer test
@@ -204,7 +220,7 @@ public class LinguisticsAnnotatorTestCase {
StringBuilder input = new StringBuilder();
Token[] tokens = new Token[inputTermOccurence];
for (int i = 0; i < inputTermOccurence; ++i) {
- SimpleToken t = newToken(inputTerm, stemmedInputTerm, type);
+ SimpleToken t = token(inputTerm, stemmedInputTerm, type);
t.setOffset(i * paddedInputTerm.length());
tokens[i] = t;
input.append(paddedInputTerm);
@@ -214,28 +230,29 @@ public class LinguisticsAnnotatorTestCase {
}
// --------------------------------------------------------------------------------
- //
// Utilities
- //
- // --------------------------------------------------------------------------------
- private static SimpleToken newToken(String orig, String stem, TokenType type) {
- return newToken(orig, stem, type, false);
+ private static SimpleToken token(String orig, String stem, TokenType type) {
+ return token(orig, stem, type, false);
}
- private static SimpleToken newToken(String orig, String stem, TokenType type, boolean specialToken) {
+ private static SimpleToken token(String orig, String stem, TokenType type, boolean specialToken) {
return new SimpleToken(orig).setTokenString(stem)
.setType(type)
.setSpecialToken(specialToken);
}
private static void assertAnnotations(SpanTree expected, String value, Token... tokens) {
- assertAnnotations(expected, value, newLinguistics(Arrays.asList(tokens), Collections.<String, String>emptyMap()));
+ assertAnnotations(expected, value, new AnnotatorConfig(), newLinguistics(Arrays.asList(tokens), Collections.emptyMap()));
+ }
+
+ private static void assertAnnotations(SpanTree expected, String value, AnnotatorConfig config, Token... tokens) {
+ assertAnnotations(expected, value, config, newLinguistics(Arrays.asList(tokens), Collections.emptyMap()));
}
- private static void assertAnnotations(SpanTree expected, String str, Linguistics linguistics) {
+ private static void assertAnnotations(SpanTree expected, String str, AnnotatorConfig config, Linguistics linguistics) {
StringFieldValue val = new StringFieldValue(str);
- assertEquals(expected != null, new LinguisticsAnnotator(linguistics, CONFIG).annotate(val));
+ assertEquals(expected != null, new LinguisticsAnnotator(linguistics, config).annotate(val));
assertEquals(expected, val.getSpanTree(SpanTrees.LINGUISTICS));
}