about | summary | refs | log | tree | commit | diff | stats
path: root/indexinglanguage/src/main/java/com/yahoo
diff options
context:
space:
mode:
Diffstat (limited to 'indexinglanguage/src/main/java/com/yahoo')
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExactExpression.java | 32
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java | 5
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java | 27
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java | 23
4 files changed, 74 insertions, 13 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExactExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExactExpression.java
index 855430f45fc..7481363b737 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExactExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/ExactExpression.java
@@ -12,6 +12,9 @@ import com.yahoo.document.annotation.SpanTrees;
import com.yahoo.document.datatypes.IntegerFieldValue;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.language.process.TokenType;
+import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig;
+
+import java.util.OptionalInt;
import static com.yahoo.language.LinguisticsCase.toLowerCase;
@@ -20,8 +23,19 @@ import static com.yahoo.language.LinguisticsCase.toLowerCase;
*/
public final class ExactExpression extends Expression {
- public ExactExpression() {
+ private int maxTokenLength; // longer input is left without a LINGUISTICS span tree
+
+ private ExactExpression(OptionalInt maxTokenLength) { // shared by both public constructors
super(DataType.STRING);
+ this.maxTokenLength = maxTokenLength.orElse(AnnotatorConfig.getDefaultMaxTokenLength()); // empty means "use the default"
+ }
+
+ public ExactExpression() { // uses AnnotatorConfig.getDefaultMaxTokenLength()
+ this(OptionalInt.empty());
+ }
+
+ public ExactExpression(int maxTokenLength) { // explicit limit, compared against the lowercased input length
+ this(OptionalInt.of(maxTokenLength));
}
@Override
@@ -36,6 +50,12 @@ public final class ExactExpression extends Expression {
String next = toLowerCase(prev);
SpanTree tree = output.getSpanTree(SpanTrees.LINGUISTICS);
+ if (next.length() > maxTokenLength) {
+ if (tree != null) {
+ output.removeSpanTree(SpanTrees.LINGUISTICS);
+ }
+ return;
+ }
SpanList root;
if (tree == null) {
root = new SpanList();
@@ -64,8 +84,14 @@ public final class ExactExpression extends Expression {
}
@Override
- public String toString() {
- return "exact";
+ public String toString() {
+ // Print the limit only when it differs from the default, so that an
+ // expression built with the default still renders as plain "exact".
+ StringBuilder ret = new StringBuilder("exact");
+ if (maxTokenLength != AnnotatorConfig.getDefaultMaxTokenLength()) {
+ ret.append(" max-token-length:").append(maxTokenLength);
+ }
+ return ret.toString();
}
@Override
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
index b807ad4cb65..a3c404e50c3 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
@@ -66,9 +66,12 @@ public final class TokenizeExpression extends Expression {
if (config.getStemMode() != StemMode.NONE) {
ret.append(" stem:\""+config.getStemMode()+"\"");
}
- if (config.hasNonDefaultMaxTokenLength()) {
+ if (config.hasNonDefaultMaxTokenizeLength()) {
ret.append(" max-length:" + config.getMaxTokenizeLength());
}
+ if (config.hasNonDefaultMaxTokenLength()) {
+ ret.append(" max-token-length:" + config.getMaxTokenLength());
+ }
if (config.hasNonDefaultMaxTermOccurrences()) {
ret.append(" max-occurrences:" + config.getMaxTermOccurrences());
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
index 7b6f350d831..6522e284fc8 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
@@ -14,14 +14,17 @@ public class AnnotatorConfig implements Cloneable {
private StemMode stemMode;
private boolean removeAccents;
private int maxTermOccurrences;
+ private int maxTokenLength;
private int maxTokenizeLength;
public static final int DEFAULT_MAX_TERM_OCCURRENCES;
+ private static final int DEFAULT_MAX_TOKEN_LENGTH;
private static final int DEFAULT_MAX_TOKENIZE_LENGTH;
static {
IlscriptsConfig defaults = new IlscriptsConfig(new IlscriptsConfig.Builder());
DEFAULT_MAX_TERM_OCCURRENCES = defaults.maxtermoccurrences();
+ DEFAULT_MAX_TOKEN_LENGTH = defaults.maxtokenlength();
DEFAULT_MAX_TOKENIZE_LENGTH = defaults.fieldmatchmaxlength();
}
@@ -30,6 +33,7 @@ public class AnnotatorConfig implements Cloneable {
stemMode = StemMode.NONE;
removeAccents = false;
maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES;
+ maxTokenLength = DEFAULT_MAX_TOKEN_LENGTH;
maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH;
}
@@ -38,6 +42,7 @@ public class AnnotatorConfig implements Cloneable {
stemMode = rhs.stemMode;
removeAccents = rhs.removeAccents;
maxTermOccurrences = rhs.maxTermOccurrences;
+ maxTokenLength = rhs.maxTokenLength;
maxTokenizeLength = rhs.maxTokenizeLength;
}
@@ -82,7 +87,18 @@ public class AnnotatorConfig implements Cloneable {
return this;
}
- public AnnotatorConfig setMaxTokenLength(int maxTokenizeLength) {
+ public AnnotatorConfig setMaxTokenLength(int maxTokenLength) { // NOTE(review): this name used to set maxTokenizeLength — confirm existing callers moved to setMaxTokenizeLength
+ this.maxTokenLength = maxTokenLength;
+ return this;
+ }
+
+ public int getMaxTokenLength() {
+ return maxTokenLength;
+ }
+
+ public static int getDefaultMaxTokenLength() { return DEFAULT_MAX_TOKEN_LENGTH; } // value read from a default-built IlscriptsConfig in the static initializer
+
+ public AnnotatorConfig setMaxTokenizeLength(int maxTokenizeLength) {
this.maxTokenizeLength = maxTokenizeLength;
return this;
}
@@ -92,6 +108,10 @@ public class AnnotatorConfig implements Cloneable {
}
public boolean hasNonDefaultMaxTokenLength() {
+ return maxTokenLength != DEFAULT_MAX_TOKEN_LENGTH; // default is IlscriptsConfig's maxtokenlength()
+ }
+
+ public boolean hasNonDefaultMaxTokenizeLength() { // the check that previously lived under hasNonDefaultMaxTokenLength
return maxTokenizeLength != DEFAULT_MAX_TOKENIZE_LENGTH;
}
@@ -116,6 +136,9 @@ public class AnnotatorConfig implements Cloneable {
if (maxTermOccurrences != rhs.maxTermOccurrences) {
return false;
}
+ if (maxTokenLength != rhs.maxTokenLength) {
+ return false;
+ }
if (maxTokenizeLength != rhs.maxTokenizeLength) {
return false;
}
@@ -125,7 +148,7 @@ public class AnnotatorConfig implements Cloneable {
@Override
public int hashCode() {
return getClass().hashCode() + language.hashCode() + stemMode.hashCode() +
- Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength;
+ Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenLength + maxTokenizeLength;
}
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 86d4e91a567..913b874c6f6 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -78,7 +78,8 @@ public class LinguisticsAnnotator {
TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS);
for (Token token : tokens)
- addAnnotationSpan(text.getString(), tree.spanList(), token, config.getStemMode(), termOccurrences);
+ addAnnotationSpan(text.getString(), tree.spanList(), token, config.getStemMode(), termOccurrences,
+ config.getMaxTokenLength());
if (tree.numAnnotations() == 0) return false;
text.setSpanTree(tree);
@@ -100,17 +101,22 @@ public class LinguisticsAnnotator {
return new Annotation(AnnotationTypes.TERM, new StringFieldValue(term));
}
- private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences) {
+ private static void addAnnotation(Span here, String term, String orig, TermOccurrences termOccurrences,
+ int maxTokenLength) {
+ if (term.length() > maxTokenLength) { // checked first, so an over-long term never reaches termCountBelowLimit
+ return;
+ }
if (termOccurrences.termCountBelowLimit(term)) {
here.annotate(termAnnotation(term, orig));
}
}
- private static void addAnnotationSpan(String input, SpanList parent, Token token, StemMode mode, TermOccurrences termOccurrences) {
+ private static void addAnnotationSpan(String input, SpanList parent, Token token, StemMode mode,
+ TermOccurrences termOccurrences, int maxTokenLength) {
if ( ! token.isSpecialToken()) {
if (token.getNumComponents() > 0) {
for (int i = 0; i < token.getNumComponents(); ++i) {
- addAnnotationSpan(input, parent, token.getComponent(i), mode, termOccurrences);
+ addAnnotationSpan(input, parent, token.getComponent(i), mode, termOccurrences, maxTokenLength);
}
return;
}
@@ -130,18 +136,21 @@ public class LinguisticsAnnotator {
String lowercasedOrig = toLowerCase(token.getOrig());
String term = token.getTokenString();
if (term != null) {
- addAnnotation(where, term, token.getOrig(), termOccurrences);
+ addAnnotation(where, term, token.getOrig(), termOccurrences, maxTokenLength);
if ( ! term.equals(lowercasedOrig))
- addAnnotation(where, lowercasedOrig, token.getOrig(), termOccurrences);
+ addAnnotation(where, lowercasedOrig, token.getOrig(), termOccurrences, maxTokenLength);
}
for (int i = 0; i < token.getNumStems(); i++) {
String stem = token.getStem(i);
if (! (stem.equals(lowercasedOrig) || stem.equals(term)))
- addAnnotation(where, stem, token.getOrig(), termOccurrences);
+ addAnnotation(where, stem, token.getOrig(), termOccurrences, maxTokenLength);
}
} else {
String term = token.getTokenString();
if (term == null || term.trim().isEmpty()) return;
+ if (term.length() > maxTokenLength) {
+ return;
+ }
if (termOccurrences.termCountBelowLimit(term)) {
parent.span((int)token.getOffset(), token.getOrig().length()).annotate(termAnnotation(term, token.getOrig()));
}