summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorJon Bratseth <jonbratseth@yahoo.com>2017-08-08 09:55:11 +0200
committerGitHub <noreply@github.com>2017-08-08 09:55:11 +0200
commit3b910df3afbf86a7b57eed82d15a00966532de23 (patch)
treebd327cb0b7d9ec82abafc178bcaddfabf34598b4
parenta69f61901d6a243eec05d7a8d60eecbf28d70931 (diff)
parent12355c67aa9bf5bcd6851374761c938188cd21f4 (diff)
Merge pull request #3057 from yahoo/balder/take-field-match-max-length-into-account-during-indexing
Add capping of fields before tokenizing
-rw-r--r--config-model/src/main/java/com/yahoo/searchdefinition/processing/TextMatch.java3
-rw-r--r--config-model/src/test/derived/advanced/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/annotationsimplicitstruct/ilscripts.cfg3
-rw-r--r--[-rwxr-xr-x]config-model/src/test/derived/annotationsinheritance/ilscripts.cfg3
-rw-r--r--[-rwxr-xr-x]config-model/src/test/derived/annotationsinheritance2/ilscripts.cfg3
-rw-r--r--[-rwxr-xr-x]config-model/src/test/derived/annotationsreference/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/annotationssimple/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/arrays/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/attributeprefetch/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/attributes/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/complex/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/emptydefault/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/exactmatch/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/id/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/indexswitches/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/inheritance/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/music/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/newrank/ilscripts.cfg3
-rw-r--r--[-rwxr-xr-x]config-model/src/test/derived/orderilscripts/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/position_array/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/position_attribute/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/position_extra/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/prefixexactattribute/ilscripts.cfg5
-rw-r--r--config-model/src/test/derived/ranktypes/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/structanyorder/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/types/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/uri_array/ilscripts.cfg3
-rw-r--r--config-model/src/test/derived/uri_wset/ilscripts.cfg3
-rw-r--r--configdefinitions/src/vespa/ilscripts.def1
-rw-r--r--docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java1
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java6
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java23
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java5
-rw-r--r--indexinglanguage/src/main/javacc/IndexingParser.jj4
-rw-r--r--indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java24
35 files changed, 119 insertions, 31 deletions
diff --git a/config-model/src/main/java/com/yahoo/searchdefinition/processing/TextMatch.java b/config-model/src/main/java/com/yahoo/searchdefinition/processing/TextMatch.java
index 9edc3dd00b1..a4d7b1b4054 100644
--- a/config-model/src/main/java/com/yahoo/searchdefinition/processing/TextMatch.java
+++ b/config-model/src/main/java/com/yahoo/searchdefinition/processing/TextMatch.java
@@ -71,6 +71,9 @@ public class TextMatch extends Processor {
}
ret.setStemMode(activeStemming.toStemMode());
ret.setRemoveAccents(field.getNormalizing().doRemoveAccents());
+ if ((field.getMatching() != null) && (field.getMatching().maxLength() != null)) {
+ ret.setMaxTokenLength(field.getMatching().maxLength());
+ }
return ret;
}
diff --git a/config-model/src/test/derived/advanced/ilscripts.cfg b/config-model/src/test/derived/advanced/ilscripts.cfg
index ffa9a7e060f..197b47ccdc4 100644
--- a/config-model/src/test/derived/advanced/ilscripts.cfg
+++ b/config-model/src/test/derived/advanced/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "advanced"
ilscript[0].docfield[0] "debug_src"
ilscript[0].docfield[1] "attributes_src"
@@ -21,4 +22,4 @@ ilscript[0].content[10] "input debug_src | passthrough debug_src"
ilscript[0].content[11] "input product2_src | passthrough product2_src"
ilscript[0].content[12] "input product3_src | passthrough product3_src"
ilscript[0].content[13] "input product_src | passthrough product_src"
-ilscript[0].content[14] "input title_src | passthrough title_src" \ No newline at end of file
+ilscript[0].content[14] "input title_src | passthrough title_src"
diff --git a/config-model/src/test/derived/annotationsimplicitstruct/ilscripts.cfg b/config-model/src/test/derived/annotationsimplicitstruct/ilscripts.cfg
index b49e52092c2..82965f82f1a 100644
--- a/config-model/src/test/derived/annotationsimplicitstruct/ilscripts.cfg
+++ b/config-model/src/test/derived/annotationsimplicitstruct/ilscripts.cfg
@@ -1,2 +1,3 @@
maxtermoccurrences 100
-ilscript[0].doctype "annotationsimplicitstruct" \ No newline at end of file
+fieldmatchmaxlength 1000000
+ilscript[0].doctype "annotationsimplicitstruct"
diff --git a/config-model/src/test/derived/annotationsinheritance/ilscripts.cfg b/config-model/src/test/derived/annotationsinheritance/ilscripts.cfg
index a6b1aabfb47..a1a2e216947 100755..100644
--- a/config-model/src/test/derived/annotationsinheritance/ilscripts.cfg
+++ b/config-model/src/test/derived/annotationsinheritance/ilscripts.cfg
@@ -1,2 +1,3 @@
maxtermoccurrences 100
-ilscript[0].doctype "annotationsinheritance" \ No newline at end of file
+fieldmatchmaxlength 1000000
+ilscript[0].doctype "annotationsinheritance"
diff --git a/config-model/src/test/derived/annotationsinheritance2/ilscripts.cfg b/config-model/src/test/derived/annotationsinheritance2/ilscripts.cfg
index 2717f4ab7e5..d5fc9dae6f6 100755..100644
--- a/config-model/src/test/derived/annotationsinheritance2/ilscripts.cfg
+++ b/config-model/src/test/derived/annotationsinheritance2/ilscripts.cfg
@@ -1,2 +1,3 @@
maxtermoccurrences 100
-ilscript[0].doctype "annotationsinheritance2" \ No newline at end of file
+fieldmatchmaxlength 1000000
+ilscript[0].doctype "annotationsinheritance2"
diff --git a/config-model/src/test/derived/annotationsreference/ilscripts.cfg b/config-model/src/test/derived/annotationsreference/ilscripts.cfg
index 2297994c727..63a1c889893 100755..100644
--- a/config-model/src/test/derived/annotationsreference/ilscripts.cfg
+++ b/config-model/src/test/derived/annotationsreference/ilscripts.cfg
@@ -1,2 +1,3 @@
maxtermoccurrences 100
-ilscript[0].doctype "annotationsreference" \ No newline at end of file
+fieldmatchmaxlength 1000000
+ilscript[0].doctype "annotationsreference"
diff --git a/config-model/src/test/derived/annotationssimple/ilscripts.cfg b/config-model/src/test/derived/annotationssimple/ilscripts.cfg
index a234feff99d..cb1bac309a9 100644
--- a/config-model/src/test/derived/annotationssimple/ilscripts.cfg
+++ b/config-model/src/test/derived/annotationssimple/ilscripts.cfg
@@ -1,2 +1,3 @@
maxtermoccurrences 100
-ilscript[0].doctype "annotationssimple" \ No newline at end of file
+fieldmatchmaxlength 1000000
+ilscript[0].doctype "annotationssimple"
diff --git a/config-model/src/test/derived/arrays/ilscripts.cfg b/config-model/src/test/derived/arrays/ilscripts.cfg
index d8fbe752676..d07f65e170f 100644
--- a/config-model/src/test/derived/arrays/ilscripts.cfg
+++ b/config-model/src/test/derived/arrays/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "arrays"
ilscript[0].docfield[0] "tags"
ilscript[0].docfield[1] "ratings"
@@ -9,4 +10,4 @@ ilscript[0].content[0] "clear_state | guard { input tags | for_each { tokenize n
ilscript[0].content[1] "clear_state | guard { input ratings | summary ratings | attribute ratings; }"
ilscript[0].content[2] "clear_state | guard { input a | tokenize normalize stem:\"SHORTEST\" | index a; }"
ilscript[0].content[3] "clear_state | guard { input b | for_each { tokenize normalize stem:\"SHORTEST\" } | index b | attribute b; }"
-ilscript[0].content[4] "clear_state | guard { input c | for_each { tokenize normalize stem:\"SHORTEST\" } | summary c | index c; }" \ No newline at end of file
+ilscript[0].content[4] "clear_state | guard { input c | for_each { tokenize normalize stem:\"SHORTEST\" } | summary c | index c; }"
diff --git a/config-model/src/test/derived/attributeprefetch/ilscripts.cfg b/config-model/src/test/derived/attributeprefetch/ilscripts.cfg
index 201d529ce1d..cda9d7df73e 100644
--- a/config-model/src/test/derived/attributeprefetch/ilscripts.cfg
+++ b/config-model/src/test/derived/attributeprefetch/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "prefetch"
ilscript[0].docfield[0] "singlebyte"
ilscript[0].docfield[1] "multibyte"
@@ -35,4 +36,4 @@ ilscript[0].content[13] "clear_state | guard { input multidouble | attribute mul
ilscript[0].content[14] "clear_state | guard { input wsdouble | attribute wsdouble; }"
ilscript[0].content[15] "clear_state | guard { input singlestring | attribute singlestring; }"
ilscript[0].content[16] "clear_state | guard { input multistring | attribute multistring; }"
-ilscript[0].content[17] "clear_state | guard { input wsstring | attribute wsstring; }" \ No newline at end of file
+ilscript[0].content[17] "clear_state | guard { input wsstring | attribute wsstring; }"
diff --git a/config-model/src/test/derived/attributes/ilscripts.cfg b/config-model/src/test/derived/attributes/ilscripts.cfg
index 450cc8e499c..4879eff9edc 100644
--- a/config-model/src/test/derived/attributes/ilscripts.cfg
+++ b/config-model/src/test/derived/attributes/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "attributes"
ilscript[0].docfield[0] "a1"
ilscript[0].docfield[1] "a2"
@@ -39,4 +40,4 @@ ilscript[0].content[16] "clear_state | guard { input b7 | summary b7 | attribute
ilscript[0].content[17] "clear_state | guard { input a9 | attribute a9; }"
ilscript[0].content[18] "clear_state | guard { input a10 | attribute a10; }"
ilscript[0].content[19] "clear_state | guard { input a11 | attribute a11; }"
-ilscript[0].content[20] "clear_state | guard { input a12 | attribute a12; }" \ No newline at end of file
+ilscript[0].content[20] "clear_state | guard { input a12 | attribute a12; }"
diff --git a/config-model/src/test/derived/complex/ilscripts.cfg b/config-model/src/test/derived/complex/ilscripts.cfg
index 05de728e16e..4341af4ce87 100644
--- a/config-model/src/test/derived/complex/ilscripts.cfg
+++ b/config-model/src/test/derived/complex/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "complex"
ilscript[0].docfield[0] "title"
ilscript[0].docfield[1] "location"
@@ -45,4 +46,4 @@ ilscript[0].content[20] "clear_state | guard { input combineda | attribute combi
ilscript[0].content[21] "clear_state | guard { input combinedb | tokenize normalize stem:\"SHORTEST\" | index combinedb; }"
ilscript[0].content[22] "input category | passthrough category"
ilscript[0].content[23] "input location | passthrough location"
-ilscript[0].content[24] "input yEaR | passthrough yEaR" \ No newline at end of file
+ilscript[0].content[24] "input yEaR | passthrough yEaR"
diff --git a/config-model/src/test/derived/emptydefault/ilscripts.cfg b/config-model/src/test/derived/emptydefault/ilscripts.cfg
index b28b5ade17f..6806ce8bf91 100644
--- a/config-model/src/test/derived/emptydefault/ilscripts.cfg
+++ b/config-model/src/test/derived/emptydefault/ilscripts.cfg
@@ -1,6 +1,7 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "emptydefault"
ilscript[0].docfield[0] "one"
ilscript[0].docfield[1] "two"
ilscript[0].content[0] "clear_state | guard { input one | tokenize normalize stem:\"SHORTEST\" | index one; }"
-ilscript[0].content[1] "clear_state | guard { input two | tokenize normalize stem:\"SHORTEST\" | index two; }" \ No newline at end of file
+ilscript[0].content[1] "clear_state | guard { input two | tokenize normalize stem:\"SHORTEST\" | index two; }"
diff --git a/config-model/src/test/derived/exactmatch/ilscripts.cfg b/config-model/src/test/derived/exactmatch/ilscripts.cfg
index c3ecab6d3a9..c5bcaf70e41 100644
--- a/config-model/src/test/derived/exactmatch/ilscripts.cfg
+++ b/config-model/src/test/derived/exactmatch/ilscripts.cfg
@@ -1,6 +1,7 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "exactmatch"
ilscript[0].docfield[0] "tag"
ilscript[0].docfield[1] "screweduserids"
ilscript[0].content[0] "clear_state | guard { input tag | exact | summary tag | index tag; }"
-ilscript[0].content[1] "clear_state | guard { input screweduserids | exact | index screweduserids | summary screweduserids | attribute screweduserids; }" \ No newline at end of file
+ilscript[0].content[1] "clear_state | guard { input screweduserids | exact | index screweduserids | summary screweduserids | attribute screweduserids; }"
diff --git a/config-model/src/test/derived/id/ilscripts.cfg b/config-model/src/test/derived/id/ilscripts.cfg
index c613431cdf9..0a617538ded 100644
--- a/config-model/src/test/derived/id/ilscripts.cfg
+++ b/config-model/src/test/derived/id/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "id"
ilscript[0].docfield[0] "uri"
-ilscript[0].content[0] "clear_state | guard { input uri | summary uri | index uri; }" \ No newline at end of file
+ilscript[0].content[0] "clear_state | guard { input uri | summary uri | index uri; }"
diff --git a/config-model/src/test/derived/indexswitches/ilscripts.cfg b/config-model/src/test/derived/indexswitches/ilscripts.cfg
index 706f4105224..6d1ef6b9b6d 100644
--- a/config-model/src/test/derived/indexswitches/ilscripts.cfg
+++ b/config-model/src/test/derived/indexswitches/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "indexswitches"
ilscript[0].docfield[0] "title"
ilscript[0].docfield[1] "descr"
@@ -6,4 +7,4 @@ ilscript[0].docfield[2] "source_src"
ilscript[0].content[0] "clear_state | guard { input source_src | switch { case \"theweb\": input source_src | tokenize normalize | summary source | index source; case \"amg\": input source_src | tokenize normalize | summary source; default: input source_src . \" partner\" | tokenize normalize | summary source | index source; }; }"
ilscript[0].content[1] "clear_state | guard { input title | tokenize normalize stem:\"SHORTEST\" | summary title | index title; }"
ilscript[0].content[2] "clear_state | guard { input descr | tokenize normalize stem:\"SHORTEST\" | summary descr | index descr; }"
-ilscript[0].content[3] "input source_src | passthrough source_src" \ No newline at end of file
+ilscript[0].content[3] "input source_src | passthrough source_src"
diff --git a/config-model/src/test/derived/inheritance/ilscripts.cfg b/config-model/src/test/derived/inheritance/ilscripts.cfg
index ba5fdf58ccc..90bc86bd3f3 100644
--- a/config-model/src/test/derived/inheritance/ilscripts.cfg
+++ b/config-model/src/test/derived/inheritance/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "child"
ilscript[0].docfield[0] "onlygrandparent"
ilscript[0].docfield[1] "overridden"
@@ -9,4 +10,4 @@ ilscript[0].content[0] "clear_state | guard { input onlygrandparent | attribute
ilscript[0].content[1] "clear_state | guard { input overridden | attribute overridden; }"
ilscript[0].content[2] "clear_state | guard { input onlyfather | summary onlyfather; }"
ilscript[0].content[3] "clear_state | guard { input onlymother | tokenize normalize stem:\"SHORTEST\" | attribute onlymother | index onlymother; }"
-ilscript[0].content[4] "clear_state | guard { input onlychild | tokenize normalize stem:\"SHORTEST\" | index onlychild; }" \ No newline at end of file
+ilscript[0].content[4] "clear_state | guard { input onlychild | tokenize normalize stem:\"SHORTEST\" | index onlychild; }"
diff --git a/config-model/src/test/derived/music/ilscripts.cfg b/config-model/src/test/derived/music/ilscripts.cfg
index ca2c35ffcba..1df8a818484 100644
--- a/config-model/src/test/derived/music/ilscripts.cfg
+++ b/config-model/src/test/derived/music/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "music"
ilscript[0].docfield[0] "bgndata"
ilscript[0].docfield[1] "sales"
@@ -73,4 +74,4 @@ ilscript[0].content[33] "clear_state | guard { input cbid | summary cbid | attri
ilscript[0].content[34] "clear_state | guard { input metalvalue | summary metalvalue; }"
ilscript[0].content[35] "clear_state | guard { input hiphopvalue | summary hiphopvalue; }"
ilscript[0].content[36] "clear_state | guard { input powermetalvalue | tokenize normalize stem:\"SHORTEST\" | index powermetalvalue | summary powermetalvalue; }"
-ilscript[0].content[37] "clear_state | guard { input progvalue | tokenize normalize stem:\"SHORTEST\" | index progvalue | summary progvalue; }" \ No newline at end of file
+ilscript[0].content[37] "clear_state | guard { input progvalue | tokenize normalize stem:\"SHORTEST\" | index progvalue | summary progvalue; }"
diff --git a/config-model/src/test/derived/newrank/ilscripts.cfg b/config-model/src/test/derived/newrank/ilscripts.cfg
index 8a252be4dd0..d8f9b5101ad 100644
--- a/config-model/src/test/derived/newrank/ilscripts.cfg
+++ b/config-model/src/test/derived/newrank/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "newrank"
ilscript[0].docfield[0] "bgndata"
ilscript[0].docfield[1] "sales"
@@ -63,4 +64,4 @@ ilscript[0].content[27] "clear_state | guard { input bgnpto | tokenize normalize
ilscript[0].content[28] "clear_state | guard { input year | summary year | attribute year; }"
ilscript[0].content[29] "clear_state | guard { input did | summary did | attribute did; }"
ilscript[0].content[30] "clear_state | guard { input scorekey | summary scorekey | attribute scorekey; }"
-ilscript[0].content[31] "clear_state | guard { input cbid | summary cbid | attribute cbid; }" \ No newline at end of file
+ilscript[0].content[31] "clear_state | guard { input cbid | summary cbid | attribute cbid; }"
diff --git a/config-model/src/test/derived/orderilscripts/ilscripts.cfg b/config-model/src/test/derived/orderilscripts/ilscripts.cfg
index 656381d68a8..a11fbe05f5d 100755..100644
--- a/config-model/src/test/derived/orderilscripts/ilscripts.cfg
+++ b/config-model/src/test/derived/orderilscripts/ilscripts.cfg
@@ -1,5 +1,6 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "orderilscripts"
ilscript[0].docfield[0] "foo"
ilscript[0].content[0] "clear_state | guard { input foo | summary bar; }"
-ilscript[0].content[1] "clear_state | guard { input foo | tokenize normalize stem:\"SHORTEST\" | summary foo | index foo; }" \ No newline at end of file
+ilscript[0].content[1] "clear_state | guard { input foo | tokenize normalize stem:\"SHORTEST\" | summary foo | index foo; }"
diff --git a/config-model/src/test/derived/position_array/ilscripts.cfg b/config-model/src/test/derived/position_array/ilscripts.cfg
index dfe3827a1d5..afdd5b548cf 100644
--- a/config-model/src/test/derived/position_array/ilscripts.cfg
+++ b/config-model/src/test/derived/position_array/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "position_array"
ilscript[0].docfield[0] "pos"
-ilscript[0].content[0] "clear_state | guard { input pos | for_each { zcurve } | attribute pos_zcurve; }" \ No newline at end of file
+ilscript[0].content[0] "clear_state | guard { input pos | for_each { zcurve } | attribute pos_zcurve; }"
diff --git a/config-model/src/test/derived/position_attribute/ilscripts.cfg b/config-model/src/test/derived/position_attribute/ilscripts.cfg
index ea6d6c4080b..dff9d583b3e 100644
--- a/config-model/src/test/derived/position_attribute/ilscripts.cfg
+++ b/config-model/src/test/derived/position_attribute/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "position_attribute"
ilscript[0].docfield[0] "pos"
-ilscript[0].content[0] "clear_state | guard { input pos | zcurve | attribute pos_zcurve; }" \ No newline at end of file
+ilscript[0].content[0] "clear_state | guard { input pos | zcurve | attribute pos_zcurve; }"
diff --git a/config-model/src/test/derived/position_extra/ilscripts.cfg b/config-model/src/test/derived/position_extra/ilscripts.cfg
index 33d61e55465..24d80d0df9d 100644
--- a/config-model/src/test/derived/position_extra/ilscripts.cfg
+++ b/config-model/src/test/derived/position_extra/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "position_extra"
ilscript[0].docfield[0] "pos_str"
-ilscript[0].content[0] "clear_state | guard { input pos_str | to_pos | zcurve | attribute pos_ext_zcurve; }" \ No newline at end of file
+ilscript[0].content[0] "clear_state | guard { input pos_str | to_pos | zcurve | attribute pos_ext_zcurve; }"
diff --git a/config-model/src/test/derived/prefixexactattribute/ilscripts.cfg b/config-model/src/test/derived/prefixexactattribute/ilscripts.cfg
index 76c9bf9abf5..eb7dadf8c1a 100644
--- a/config-model/src/test/derived/prefixexactattribute/ilscripts.cfg
+++ b/config-model/src/test/derived/prefixexactattribute/ilscripts.cfg
@@ -1,12 +1,13 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "prefixexactattribute"
ilscript[0].docfield[0] "indexfield0"
ilscript[0].docfield[1] "attributefield1"
ilscript[0].docfield[2] "attributefield2"
ilscript[0].docfield[3] "indexfield1"
ilscript[0].docfield[4] "indexfield2"
-ilscript[0].content[0] "clear_state | guard { input indexfield0 | tokenize normalize stem:\"SHORTEST\" | index indexfield0; }"
+ilscript[0].content[0] "clear_state | guard { input indexfield0 | tokenize normalize stem:\"SHORTEST\" max-length:79 | index indexfield0; }"
ilscript[0].content[1] "clear_state | guard { input attributefield1 | attribute attributefield1; }"
ilscript[0].content[2] "clear_state | guard { input attributefield2 | attribute attributefield2; }"
ilscript[0].content[3] "clear_state | guard { input indexfield1 | exact | index indexfield1; }"
-ilscript[0].content[4] "clear_state | guard { input indexfield2 | exact | index indexfield2; }" \ No newline at end of file
+ilscript[0].content[4] "clear_state | guard { input indexfield2 | exact | index indexfield2; }"
diff --git a/config-model/src/test/derived/ranktypes/ilscripts.cfg b/config-model/src/test/derived/ranktypes/ilscripts.cfg
index 3a917099bfc..ee191aafe90 100644
--- a/config-model/src/test/derived/ranktypes/ilscripts.cfg
+++ b/config-model/src/test/derived/ranktypes/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "ranktypes"
ilscript[0].docfield[0] "title"
ilscript[0].docfield[1] "descr"
@@ -8,4 +9,4 @@ ilscript[0].content[0] "clear_state | guard { input identity | tokenize | index
ilscript[0].content[1] "clear_state | guard { input title | tokenize normalize stem:\"SHORTEST\" | summary title | index title; }"
ilscript[0].content[2] "clear_state | guard { input descr | tokenize normalize stem:\"SHORTEST\" | summary descr | index descr; }"
ilscript[0].content[3] "clear_state | guard { input keywords | tokenize normalize stem:\"SHORTEST\" | index keywords; }"
-ilscript[0].content[4] "clear_state | guard { input identity | tokenize normalize stem:\"SHORTEST\" | index identity; }" \ No newline at end of file
+ilscript[0].content[4] "clear_state | guard { input identity | tokenize normalize stem:\"SHORTEST\" | index identity; }"
diff --git a/config-model/src/test/derived/structanyorder/ilscripts.cfg b/config-model/src/test/derived/structanyorder/ilscripts.cfg
index b86571ce666..60c4ec83c01 100644
--- a/config-model/src/test/derived/structanyorder/ilscripts.cfg
+++ b/config-model/src/test/derived/structanyorder/ilscripts.cfg
@@ -1,6 +1,7 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "annotationsimplicitstruct"
ilscript[0].docfield[0] "structfield"
ilscript[0].docfield[1] "structarrayfield"
ilscript[0].content[0] "input structarrayfield | passthrough structarrayfield"
-ilscript[0].content[1] "input structfield | passthrough structfield" \ No newline at end of file
+ilscript[0].content[1] "input structfield | passthrough structfield"
diff --git a/config-model/src/test/derived/types/ilscripts.cfg b/config-model/src/test/derived/types/ilscripts.cfg
index 70b799d0df0..108fd0ca830 100644
--- a/config-model/src/test/derived/types/ilscripts.cfg
+++ b/config-model/src/test/derived/types/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "types"
ilscript[0].docfield[0] "abyte"
ilscript[0].docfield[1] "along"
@@ -52,4 +53,4 @@ ilscript[0].content[22] "input mystructfield | passthrough mystructfield"
ilscript[0].content[23] "input mystructmap | passthrough mystructmap"
ilscript[0].content[24] "input stringmapfield | passthrough stringmapfield"
ilscript[0].content[25] "input structarrayfield | passthrough structarrayfield"
-ilscript[0].content[26] "input structfield | passthrough structfield" \ No newline at end of file
+ilscript[0].content[26] "input structfield | passthrough structfield"
diff --git a/config-model/src/test/derived/uri_array/ilscripts.cfg b/config-model/src/test/derived/uri_array/ilscripts.cfg
index de78b44199c..721fca76b9a 100644
--- a/config-model/src/test/derived/uri_array/ilscripts.cfg
+++ b/config-model/src/test/derived/uri_array/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "uri_array"
ilscript[0].docfield[0] "my_uri"
-ilscript[0].content[0] "clear_state | guard { input my_uri | index my_uri; }" \ No newline at end of file
+ilscript[0].content[0] "clear_state | guard { input my_uri | index my_uri; }"
diff --git a/config-model/src/test/derived/uri_wset/ilscripts.cfg b/config-model/src/test/derived/uri_wset/ilscripts.cfg
index 81066cd130d..5d32c7b2e63 100644
--- a/config-model/src/test/derived/uri_wset/ilscripts.cfg
+++ b/config-model/src/test/derived/uri_wset/ilscripts.cfg
@@ -1,4 +1,5 @@
maxtermoccurrences 100
+fieldmatchmaxlength 1000000
ilscript[0].doctype "uri_wset"
ilscript[0].docfield[0] "my_uri"
-ilscript[0].content[0] "clear_state | guard { input my_uri | index my_uri; }" \ No newline at end of file
+ilscript[0].content[0] "clear_state | guard { input my_uri | index my_uri; }"
diff --git a/configdefinitions/src/vespa/ilscripts.def b/configdefinitions/src/vespa/ilscripts.def
index 316e046800b..2fe34f678cc 100644
--- a/configdefinitions/src/vespa/ilscripts.def
+++ b/configdefinitions/src/vespa/ilscripts.def
@@ -4,6 +4,7 @@ namespace=vespa.configdefinition
## The maximum number of occurrences of a given term to index per field
maxtermoccurrences int default=100
+fieldmatchmaxlength int default=1000000
ilscript[].doctype string
ilscript[].docfield[] string
diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java
index 015a5ceb79d..b1ec78fa0c4 100644
--- a/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java
+++ b/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java
@@ -76,6 +76,7 @@ public class ScriptManager {
Map<String, Map<String, DocumentScript>> documentFieldScripts = new HashMap<>(config.ilscript().size());
ScriptParserContext parserContext = new ScriptParserContext(linguistics);
parserContext.getAnnotatorConfig().setMaxTermOccurrences(config.maxtermoccurrences());
+ parserContext.getAnnotatorConfig().setMaxTokenLength(config.fieldmatchmaxlength());
for (IlscriptsConfig.Ilscript ilscript : config.ilscript()) {
InputExpression.FieldPathOptimizer fieldPathOptimizer = new InputExpression.FieldPathOptimizer(docTypeMgr.getDocumentType(ilscript.doctype()));
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
index 661cc6c9c3e..b3cee971258 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/TokenizeExpression.java
@@ -33,7 +33,8 @@ public class TokenizeExpression extends Expression {
@Override
protected void doExecute(ExecutionContext context) {
- StringFieldValue output = ((StringFieldValue)context.getValue()).clone();
+ StringFieldValue input = (StringFieldValue)context.getValue();
+ StringFieldValue output = input.clone();
context.setValue(output);
AnnotatorConfig cfg = new AnnotatorConfig(config);
@@ -70,6 +71,9 @@ public class TokenizeExpression extends Expression {
if (config.getStemMode() != StemMode.NONE) {
ret.append(" stem:\""+config.getStemMode()+"\"");
}
+ if (config.hasNonDefaultMaxTokenLength()) {
+ ret.append(" max-length:" + config.getMaxTokenizeLength());
+ }
return ret.toString();
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
index ccc1f293112..6d160c489df 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
@@ -14,12 +14,15 @@ public class AnnotatorConfig implements Cloneable {
private StemMode stemMode;
private boolean removeAccents;
private int maxTermOccurences;
+ private int maxTokenizeLength;
public static final int DEFAULT_MAX_TERM_OCCURRENCES;
+ private static final int DEFAULT_MAX_TOKENIZE_LENGTH;
static {
IlscriptsConfig defaults = new IlscriptsConfig(new IlscriptsConfig.Builder());
DEFAULT_MAX_TERM_OCCURRENCES = defaults.maxtermoccurrences();
+ DEFAULT_MAX_TOKENIZE_LENGTH = defaults.fieldmatchmaxlength();
}
public AnnotatorConfig() {
@@ -27,6 +30,7 @@ public class AnnotatorConfig implements Cloneable {
stemMode = StemMode.NONE;
removeAccents = false;
maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES;
+ maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH;
}
public AnnotatorConfig(AnnotatorConfig rhs) {
@@ -34,6 +38,7 @@ public class AnnotatorConfig implements Cloneable {
stemMode = rhs.stemMode;
removeAccents = rhs.removeAccents;
maxTermOccurences = rhs.maxTermOccurences;
+ maxTokenizeLength = rhs.maxTokenizeLength;
}
public Language getLanguage() {
@@ -77,6 +82,19 @@ public class AnnotatorConfig implements Cloneable {
return this;
}
+ public AnnotatorConfig setMaxTokenLength(int maxTokenizeLength) {
+ this.maxTokenizeLength = maxTokenizeLength;
+ return this;
+ }
+
+ public int getMaxTokenizeLength() {
+ return maxTokenizeLength;
+ }
+
+ public boolean hasNonDefaultMaxTokenLength() {
+ return maxTokenizeLength != DEFAULT_MAX_TOKENIZE_LENGTH;
+ }
+
@Override
public boolean equals(Object obj) {
if (!(obj instanceof AnnotatorConfig)) {
@@ -95,12 +113,15 @@ public class AnnotatorConfig implements Cloneable {
if (maxTermOccurences != rhs.maxTermOccurences) {
return false;
}
+ if (maxTokenizeLength != rhs.maxTokenizeLength) {
+ return false;
+ }
return true;
}
@Override
public int hashCode() {
return getClass().hashCode() + language.hashCode() + stemMode.hashCode() +
- Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences;
+ Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength;
}
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index b320bce6dbf..3adffa30725 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -64,7 +64,10 @@ public class LinguisticsAnnotator {
if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS.
Tokenizer tokenizer = factory.getTokenizer();
- Iterable<Token> tokens = tokenizer.tokenize(text.getString(), config.getLanguage(), config.getStemMode(),
+ String input = (text.getString().length() <= config.getMaxTokenizeLength()) // Cap the text handed to the tokenizer at getMaxTokenizeLength() chars.
+ ? text.getString() // within the limit: tokenize the whole string
+ : text.getString().substring(0, config.getMaxTokenizeLength()); // NOTE(review): substring counts UTF-16 code units, so the cut can land inside a surrogate pair; a negative limit would throw here - confirm neither can occur
+ Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(), // only the (possibly capped) prefix is tokenized
config.getRemoveAccents());
TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS);
diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj
index f1abb76c645..d564443bb48 100644
--- a/indexinglanguage/src/main/javacc/IndexingParser.jj
+++ b/indexinglanguage/src/main/javacc/IndexingParser.jj
@@ -164,6 +164,7 @@ TOKEN :
<INPUT: "input"> |
<JOIN: "join"> |
<LOWER_CASE: "lowercase"> |
+ <MAX_LENGTH: "max-length"> |
<NGRAM: "ngram"> |
<NORMALIZE: "normalize"> |
<NOW: "now"> |
@@ -615,9 +616,11 @@ AnnotatorConfig tokenizeCfg() :
{
AnnotatorConfig val = new AnnotatorConfig(annotatorCfg); // start from a copy of annotatorCfg
String str = "SHORTEST"; // stem mode passed to setStemMode when <STEM> has no <COLON> argument
+ Integer maxLength; // assigned by the <MAX_LENGTH> alternative below
}
{
( <STEM> ( <COLON> str = string() ) ? { val.setStemMode(str); } |
+ <MAX_LENGTH> <COLON> maxLength = integer() { val.setMaxTokenLength(maxLength); } | // max-length requires a <COLON> and an integer argument; the value becomes the tokenize length limit
<NORMALIZE> { val.setRemoveAccents(true); } )+ // one or more of the options above, in any order
{ return val; }
}
@@ -723,6 +726,7 @@ String identifier() :
<INPUT> |
<JOIN> |
<LOWER_CASE> |
+ <MAX_LENGTH> |
<NGRAM> |
<NORMALIZE> |
<NOW> |
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
index 5805d56aa57..2d18d410e66 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotatorTestCase.java
@@ -12,6 +12,7 @@ import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.TokenType;
import com.yahoo.language.process.Tokenizer;
+import com.yahoo.language.simple.SimpleLinguistics;
import com.yahoo.language.simple.SimpleToken;
import org.junit.Test;
@@ -167,6 +168,29 @@ public class LinguisticsAnnotatorTestCase {
}
@Test
+ public void requireThatTokenizeCappingWorks() { // Runs LinguisticsAnnotator with a max tokenize length of 12 on an at-limit and an over-limit string.
+ String shortString = "short string"; // 12 characters, i.e. exactly the limit configured below
+ SpanTree spanTree = new SpanTree(SpanTrees.LINGUISTICS); // expected annotation tree for shortString
+ spanTree.setStringFieldValue(new StringFieldValue(shortString));
+ spanTree.spanList().span(0, 5).annotate(new Annotation(AnnotationTypes.TERM)); // "short" - presumably span(from, length); confirm against SpanList
+ spanTree.spanList().span(6, 6).annotate(new Annotation(AnnotationTypes.TERM)); // "string"
+
+ StringFieldValue shortValue = new StringFieldValue(shortString);
+
+ Linguistics linguistics = new SimpleLinguistics();
+
+ LinguisticsAnnotator annotator = new LinguisticsAnnotator(linguistics, new AnnotatorConfig().setMaxTokenLength(12));
+
+ assertTrue(annotator.annotate(shortValue));
+ assertEquals(spanTree, shortValue.getSpanTree(SpanTrees.LINGUISTICS)); // input at the limit is expected to be annotated in full
+ assertEquals(shortString, shortValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString());
+
+ StringFieldValue cappedValue = new StringFieldValue(shortString + " a longer string"); // longer than the 12-char limit
+ assertTrue(annotator.annotate(cappedValue));
+ assertEquals((shortString + " a longer string"), cappedValue.getSpanTree(SpanTrees.LINGUISTICS).getStringFieldValue().getString()); // NOTE(review): only verifies the stored string is left untruncated; nothing asserts that no spans exist past offset 12 - consider checking the spans too
+ }
+
+ @Test
public void requireThatMaxTermOccurencesIsHonored() {
final String inputTerm = "foo";
final String stemmedInputTerm = "bar"; // completely different from