summary | refs | log | tree | commit | diff | stats
path: root/indexinglanguage
diff options
context:
space:
mode:
author: Henning Baldersheim <balder@yahoo-inc.com> 2017-08-07 18:14:53 +0200
committer: Henning Baldersheim <balder@yahoo-inc.com> 2017-08-07 18:15:39 +0200
commit: 92c0fcfe10c06163968dcfa2ff07993c2f2f74f5 (patch)
tree: 03b8a6e54809c2b2d0fe603f426d8ddaf3aaf63e /indexinglanguage
parent: a69f61901d6a243eec05d7a8d60eecbf28d70931 (diff)
Add capping of fields before tokenizing
Diffstat (limited to 'indexinglanguage')
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java 6
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java 21
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java 5
-rw-r--r-- indexinglanguage/src/main/javacc/IndexingParser.jj 4
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java 24
5 files changed, 57 insertions, 3 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
index 661cc6c9c3e..b3cee971258 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
@@ -33,7 +33,8 @@ public class TokenizeExpression extends Expression {
@Override
protected void doExecute(ExecutionContext context) {
- StringFieldValue output = ((StringFieldValue)context.getValue()).clone();
+ StringFieldValue input = (StringFieldValue)context.getValue();
+ StringFieldValue output = input.clone();
context.setValue(output);
AnnotatorConfig cfg = new AnnotatorConfig(config);
@@ -70,6 +71,9 @@ public class TokenizeExpression extends Expression {
if (config.getStemMode() != StemMode.NONE) {
ret.append(" stem:\""+config.getStemMode()+"\"");
}
+ if (config.hasNonDefaultMaxTokenLength()) {
+ ret.append(" max-length:" + config.getMaxTokenizeLength());
+ }
return ret.toString();
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
index ccc1f293112..990a8a513f2 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
@@ -14,8 +14,10 @@ public class AnnotatorConfig implements Cloneable {
private StemMode stemMode;
private boolean removeAccents;
private int maxTermOccurences;
+ private int maxTokenizeLength = MAX_TOKENIZE_LENGTH;
public static final int DEFAULT_MAX_TERM_OCCURRENCES;
+ private static final int MAX_TOKENIZE_LENGTH = 1000000;
static {
IlscriptsConfig defaults = new IlscriptsConfig(new IlscriptsConfig.Builder());
@@ -34,6 +36,7 @@ public class AnnotatorConfig implements Cloneable {
stemMode = rhs.stemMode;
removeAccents = rhs.removeAccents;
maxTermOccurences = rhs.maxTermOccurences;
+ maxTokenizeLength = rhs.maxTokenizeLength;
}
public Language getLanguage() {
@@ -77,6 +80,19 @@ public class AnnotatorConfig implements Cloneable {
return this;
}
+ public AnnotatorConfig setMaxTokenLength(int maxTokenizeLength) {
+ this.maxTokenizeLength = maxTokenizeLength;
+ return this;
+ }
+
+ public int getMaxTokenizeLength() {
+ return maxTokenizeLength;
+ }
+
+ public boolean hasNonDefaultMaxTokenLength() {
+ return maxTokenizeLength != MAX_TOKENIZE_LENGTH;
+ }
+
@Override
public boolean equals(Object obj) {
if (!(obj instanceof AnnotatorConfig)) {
@@ -95,12 +111,15 @@ public class AnnotatorConfig implements Cloneable {
if (maxTermOccurences != rhs.maxTermOccurences) {
return false;
}
+ if (maxTokenizeLength != rhs.maxTokenizeLength) {
+ return false;
+ }
return true;
}
@Override
public int hashCode() {
return getClass().hashCode() + language.hashCode() + stemMode.hashCode() +
- Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences;
+ Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength;
}
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index b320bce6dbf..3adffa30725 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -64,7 +64,10 @@ public class LinguisticsAnnotator {
if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS.
Tokenizer tokenizer = factory.getTokenizer();
- Iterable<Token> tokens = tokenizer.tokenize(text.getString(), config.getLanguage(), config.getStemMode(),
+ String input = (text.getString().length() <= config.getMaxTokenizeLength())
+ ? text.getString()
+ : text.getString().substring(0, config.getMaxTokenizeLength());
+ Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(),
config.getRemoveAccents());
TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS);
diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj
index f1abb76c645..d564443bb48 100644
--- a/indexinglanguage/src/main/javacc/IndexingParser.jj
+++ b/indexinglanguage/src/main/javacc/IndexingParser.jj
@@ -164,6 +164,7 @@ TOKEN :
<INPUT: "input"> |
<JOIN: "join"> |
<LOWER_CASE: "lowercase"> |
+ <MAX_LENGTH: "max-length"> |
<NGRAM: "ngram"> |
<NORMALIZE: "normalize"> |
<NOW: "now"> |
@@ -615,9 +616,11 @@ AnnotatorConfig tokenizeCfg() :
{
AnnotatorConfig val = new AnnotatorConfig(annotatorCfg);
String str = "SHORTEST";
+ Integer maxLength;
}
{
( <STEM> ( <COLON> str = string() ) ? { val.setStemMode(str); } |
+ <MAX_LENGTH> <COLON> maxLength = integer() { val.setMaxTokenLength(maxLength); } |
<NORMALIZE> { val.setRemoveAccents(true); } )+
{ return val; }
}
@@ -723,6 +726,7 @@ String identifier() :
<INPUT> |
<JOIN> |
<LOWER_CASE> |
+ <MAX_LENGTH> |
<NGRAM> |
<NORMALIZE> |
<NOW> |
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index 5805d56aa57..2d18d410e66 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -12,6 +12,7 @@ import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.simple.SimpleLinguistics;
import com.yahoo.language.simple.SimpleToken;
import org.junit.Test;
@@ -167,6 +168,29 @@ public class LinguisticsAnnotatorTestCase {
}
@Test
+ public void requireThatTokenizeCappingWorks() {
+ String shortString = "short string";
+ SpanTree spanTree = new SpanTree(SpanTrees.LINGUISTICS);
+ spanTree.setStringFieldValue(new StringFieldValue(shortString));
+ spanTree.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM));
+ spanTree.spanList().span(6, 6).annotate(new Annotation(AnnotationTypes.TERM));
+
+ StringFieldValue shortValue = new StringFieldValue(shortString);
+
+ Linguistics linguistics = new SimpleLinguistics();
+
+ LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, new AnnotatorConfig().setMaxTokenLength(12));
+
+ assertTrue(annotator.annotate(shortValue));
+ assertEquals(spanTree, shortValue.getSpanTree(SpanTrees.LINGUISTICS));
+ assertEquals(shortString, shortValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());
+
+ StringFieldValue cappedValue = new StringFieldValue(shortString + " a longer string");
+ assertTrue(annotator.annotate(cappedValue));
+ assertEquals((shortString + " a longer string"), cappedValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());
+ }
+
+ @Test
public void requireThatMaxTermOccurencesIsHonored() {
final String inputTerm = "foo";
final String stemmedInputTerm = "bar"; // completely different from