about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- config-model/src/main/java/com/yahoo/schema/derived/IndexInfo.java | 127
-rw-r--r-- config-model/src/main/java/com/yahoo/schema/derived/VsmFields.java | 10
-rw-r--r-- config-model/src/main/java/com/yahoo/schema/document/Matching.java | 8
-rw-r--r-- config-model/src/test/derived/indexschema/vsmfields.cfg | 24
-rw-r--r-- config-model/src/test/derived/nearestneighbor_streaming/vsmfields.cfg | 4
-rw-r--r-- config-model/src/test/derived/prefixexactattribute/vsmfields.cfg | 5
-rw-r--r-- config-model/src/test/derived/streamingstruct/vsmfields.cfg | 56
-rw-r--r-- config-model/src/test/java/com/yahoo/schema/derived/VsmFieldsTestCase.java | 49
-rw-r--r-- streamingvisitors/src/tests/searcher/searcher_test.cpp | 47
-rw-r--r-- streamingvisitors/src/tests/textutil/textutil_test.cpp | 3
-rw-r--r-- streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp | 42
-rw-r--r-- streamingvisitors/src/vespa/vsm/config/vsmfields.def | 1
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h | 4
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp | 14
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h | 32
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h | 8
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp | 4
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h | 5
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h | 6
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h | 4
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp | 6
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h | 9
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h | 5
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h | 4
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp | 4
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h | 7
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp | 12
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h | 6
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp | 124
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h | 81
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp | 2
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h | 5
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp | 15
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h | 9
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp | 15
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h | 12
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp | 70
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h | 35
38 files changed, 454 insertions, 420 deletions
diff --git a/config-model/src/main/java/com/yahoo/schema/derived/IndexInfo.java b/config-model/src/main/java/com/yahoo/schema/derived/IndexInfo.java
index 7532dec5187..34f485b7f02 100644
--- a/config-model/src/main/java/com/yahoo/schema/derived/IndexInfo.java
+++ b/config-model/src/main/java/com/yahoo/schema/derived/IndexInfo.java
@@ -25,6 +25,7 @@ import com.yahoo.vespa.documentmodel.SummaryField;
import com.yahoo.search.config.IndexInfoConfig;
import java.util.Map;
+import java.util.Objects;
import java.util.Optional;
import java.util.Set;
@@ -238,12 +239,8 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer {
if (DataType.URI.equals(fieldType)) {
return true;
}
- if (fieldType instanceof CollectionDataType &&
- DataType.URI.equals(((CollectionDataType)fieldType).getNestedType()))
- {
- return true;
- }
- return false;
+ return (fieldType instanceof CollectionDataType collectionFieldType) &&
+ DataType.URI.equals(collectionFieldType.getNestedType());
}
private void addUriIndexCommands(ImmutableSDField field) {
@@ -310,7 +307,7 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer {
private boolean notInCommands(String index) {
for (IndexCommand command : commands) {
- if (command.getIndex().equals(index)) {
+ if (command.index().equals(index)) {
return false;
}
}
@@ -322,10 +319,7 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer {
IndexInfoConfig.Indexinfo.Builder iiB = new IndexInfoConfig.Indexinfo.Builder();
iiB.name(getName());
for (IndexCommand command : commands) {
- iiB.command(
- new IndexInfoConfig.Indexinfo.Command.Builder()
- .indexname(command.getIndex())
- .command(command.getCommand()));
+ addIndexCommand(iiB, command.index(), command.command());
}
// Make user defined field sets searchable
for (FieldSet fieldSet : fieldSets.values()) {
@@ -335,18 +329,16 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer {
}
for (Map.Entry<String, String> e : aliases.entrySet()) {
- iiB.alias(
- new IndexInfoConfig.Indexinfo.Alias.Builder()
- .alias(e.getKey())
- .indexname(e.getValue()));
+ iiB.alias(new IndexInfoConfig.Indexinfo.Alias.Builder().alias(e.getKey()).indexname(e.getValue()));
}
builder.indexinfo(iiB);
}
// TODO: Move this to the FieldSetSettings processor (and rename it) as that already has to look at this.
private void addFieldSetCommands(IndexInfoConfig.Indexinfo.Builder iiB, FieldSet fieldSet) {
- for (String qc : fieldSet.queryCommands())
- iiB.command(new IndexInfoConfig.Indexinfo.Command.Builder().indexname(fieldSet.getName()).command(qc));
+ for (String qc : fieldSet.queryCommands()) {
+ addIndexCommand(iiB, fieldSet.getName(), qc);
+ }
boolean anyIndexing = false;
boolean anyAttributing = false;
boolean anyLowerCasing = false;
@@ -397,57 +389,29 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer {
fieldSetMatching = new Matching();
}
if (anyLowerCasing) {
- iiB.command(
- new IndexInfoConfig.Indexinfo.Command.Builder()
- .indexname(fieldSet.getName())
- .command(CMD_LOWERCASE));
+ addIndexCommand(iiB, fieldSet.getName(), CMD_LOWERCASE);
}
if (hasMultiValueField(fieldSet)) {
- iiB.command(
- new IndexInfoConfig.Indexinfo.Command.Builder()
- .indexname(fieldSet.getName())
- .command(CMD_MULTIVALUE));
+ addIndexCommand(iiB, fieldSet.getName(), CMD_MULTIVALUE);
}
if (anyIndexing) {
- iiB.command(
- new IndexInfoConfig.Indexinfo.Command.Builder()
- .indexname(fieldSet.getName())
- .command(CMD_INDEX));
+ addIndexCommand(iiB, fieldSet.getName(), CMD_INDEX);
if ( ! isExactMatch(fieldSetMatching)) {
if (fieldSetMatching == null || fieldSetMatching.getType().equals(MatchType.TEXT)) {
- iiB.command(
- new IndexInfoConfig.Indexinfo.Command.Builder()
- .indexname(fieldSet.getName())
- .command(CMD_PLAIN_TOKENS));
+ addIndexCommand(iiB, fieldSet.getName(), CMD_PLAIN_TOKENS);
}
if (anyStemming) {
- iiB.command(
- new IndexInfoConfig.Indexinfo.Command.Builder()
- .indexname(fieldSet.getName())
- .command(stemmingCommand));
+ addIndexCommand(iiB, fieldSet.getName(), stemmingCommand);
}
if (anyNormalizing)
- iiB.command(
- new IndexInfoConfig.Indexinfo.Command.Builder()
- .indexname(fieldSet.getName())
- .command(CMD_NORMALIZE));
+ addIndexCommand(iiB, fieldSet.getName(), CMD_NORMALIZE);
if (phraseSegmentingCommand != null)
- iiB.command(
- new IndexInfoConfig.Indexinfo.Command.Builder()
- .indexname(fieldSet.getName())
- .command(phraseSegmentingCommand));
+ addIndexCommand(iiB, fieldSet.getName(), phraseSegmentingCommand);
}
} else {
// Assume only attribute fields
- iiB
- .command(
- new IndexInfoConfig.Indexinfo.Command.Builder()
- .indexname(fieldSet.getName())
- .command(CMD_ATTRIBUTE))
- .command(
- new IndexInfoConfig.Indexinfo.Command.Builder()
- .indexname(fieldSet.getName())
- .command(CMD_INDEX));
+ addIndexCommand(iiB, fieldSet.getName(), CMD_ATTRIBUTE);
+ addIndexCommand(iiB, fieldSet.getName(), CMD_INDEX);
}
if (anyString) {
addIndexCommand(iiB, fieldSet.getName(), CMD_STRING);
@@ -460,20 +424,11 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer {
if (fieldSetMatching.getType().equals(MatchType.EXACT)) {
String term = fieldSetMatching.getExactMatchTerminator();
if (term==null) term=ExactMatch.DEFAULT_EXACT_TERMINATOR;
- iiB.command(
- new IndexInfoConfig.Indexinfo.Command.Builder()
- .indexname(fieldSet.getName())
- .command("exact "+term));
+ addIndexCommand(iiB, fieldSet.getName(), "exact "+term);
} else if (fieldSetMatching.getType().equals(MatchType.WORD)) {
- iiB.command(
- new IndexInfoConfig.Indexinfo.Command.Builder()
- .indexname(fieldSet.getName())
- .command(CMD_WORD));
+ addIndexCommand(iiB, fieldSet.getName(), CMD_WORD);
} else if (fieldSetMatching.getType().equals(MatchType.GRAM)) {
- iiB.command(
- new IndexInfoConfig.Indexinfo.Command.Builder()
- .indexname(fieldSet.getName())
- .command("ngram " + fieldSetMatching.getGramSize().orElse(NGramMatch.DEFAULT_GRAM_SIZE)));
+ addIndexCommand(iiB, fieldSet.getName(), "ngram " + fieldSetMatching.getGramSize().orElse(NGramMatch.DEFAULT_GRAM_SIZE));
} else if (fieldSetMatching.getType().equals(MatchType.TEXT)) {
}
@@ -495,10 +450,7 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer {
active = field.getIndex(field.getName()).getStemming();
}
}
- if (active != null) {
- return active;
- }
- return Stemming.BEST; // assume default
+ return Objects.requireNonNullElse(active, Stemming.BEST);
}
private boolean stemming(ImmutableSDField field) {
@@ -514,9 +466,7 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer {
private boolean isExactMatch(Matching m) {
if (m == null) return false;
- if (m.getType().equals(MatchType.EXACT)) return true;
- if (m.getType().equals(MatchType.WORD)) return true;
- return false;
+ return m.getType().equals(MatchType.EXACT) || m.getType().equals(MatchType.WORD);
}
@Override
@@ -528,34 +478,13 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer {
* An index command. Null commands are also represented, to detect consistency issues. This is an (immutable) value
* object.
*/
- public static class IndexCommand {
-
- private final String index;
-
- private final String command;
-
- public IndexCommand(String index, String command) {
- this.index = index;
- this.command = command;
- }
-
- public String getIndex() {
- return index;
- }
-
- public String getCommand() {
- return command;
- }
+ public record IndexCommand(String index, String command) {
/**
* Returns true if this is the null command (do nothing)
*/
public boolean isNull() {
- return command.equals("");
- }
-
- public int hashCode() {
- return index.hashCode() + 17 * command.hashCode();
+ return command.isEmpty();
}
public boolean equals(Object object) {
@@ -564,7 +493,7 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer {
}
return other.index.equals(this.index) &&
- other.command.equals(this.command);
+ other.command.equals(this.command);
}
public String toString() {
@@ -616,9 +545,7 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer {
return false;
}
- if (Stemming.NONE.equals(indexStemming)) {
- // Add nothing
- } else {
+ if ( ! Stemming.NONE.equals(indexStemming)) {
owner.addIndexCommand(indexName, CMD_STEM + ":" + indexStemming.toStemMode());
}
return true;
diff --git a/config-model/src/main/java/com/yahoo/schema/derived/VsmFields.java b/config-model/src/main/java/com/yahoo/schema/derived/VsmFields.java
index cb806d8596e..564161b725d 100644
--- a/config-model/src/main/java/com/yahoo/schema/derived/VsmFields.java
+++ b/config-model/src/main/java/com/yahoo/schema/derived/VsmFields.java
@@ -14,6 +14,7 @@ import com.yahoo.document.datatypes.TensorFieldValue;
import com.yahoo.schema.FieldSets;
import com.yahoo.schema.Schema;
import com.yahoo.schema.document.Attribute;
+import com.yahoo.schema.document.Case;
import com.yahoo.schema.document.FieldSet;
import com.yahoo.schema.document.GeoPos;
import com.yahoo.schema.document.Matching;
@@ -144,7 +145,7 @@ public class VsmFields extends Derived implements VsmfieldsConfig.Producer {
public static Type GEO_POSITION = new Type("GEOPOS");
public static Type NEAREST_NEIGHBOR = new Type("NEAREST_NEIGHBOR");
- private String searchMethod;
+ private final String searchMethod;
private Type(String searchMethod) {
this.searchMethod = searchMethod;
@@ -261,10 +262,17 @@ public class VsmFields extends Derived implements VsmfieldsConfig.Producer {
return getMatchingName();
}
+ private static VsmfieldsConfig.Fieldspec.Normalize.Enum toNormalize(Matching matching) {
+ if (matching.getType() == MatchType.EXACT) return VsmfieldsConfig.Fieldspec.Normalize.Enum.LOWERCASE;
+ if (matching.getCase() == Case.CASED) return VsmfieldsConfig.Fieldspec.Normalize.Enum.NONE;
+ return VsmfieldsConfig.Fieldspec.Normalize.LOWERCASE_AND_FOLD;
+ }
+
public VsmfieldsConfig.Fieldspec.Builder getFieldSpecConfig() {
var fB = new VsmfieldsConfig.Fieldspec.Builder();
fB.name(getName())
.searchmethod(VsmfieldsConfig.Fieldspec.Searchmethod.Enum.valueOf(type.getSearchMethod()))
+ .normalize(toNormalize(matching))
.arg1(getArg1())
.fieldtype(isAttribute
? VsmfieldsConfig.Fieldspec.Fieldtype.ATTRIBUTE
diff --git a/config-model/src/main/java/com/yahoo/schema/document/Matching.java b/config-model/src/main/java/com/yahoo/schema/document/Matching.java
index 0b542f134ad..9d68553fa80 100644
--- a/config-model/src/main/java/com/yahoo/schema/document/Matching.java
+++ b/config-model/src/main/java/com/yahoo/schema/document/Matching.java
@@ -46,12 +46,16 @@ public class Matching implements Cloneable, Serializable {
public MatchType getType() { return type; }
public Case getCase() { return casing; }
- public void setType(MatchType type) {
+ public Matching setType(MatchType type) {
this.type = type;
typeUserSet = true;
+ return this;
}
- public void setCase(Case casing) { this.casing = casing; }
+ public Matching setCase(Case casing) {
+ this.casing = casing;
+ return this;
+ }
public Integer maxLength() { return maxLength; }
public Matching maxLength(int maxLength) { this.maxLength = maxLength; return this; }
diff --git a/config-model/src/test/derived/indexschema/vsmfields.cfg b/config-model/src/test/derived/indexschema/vsmfields.cfg
index 31db622183e..a2152f9787f 100644
--- a/config-model/src/test/derived/indexschema/vsmfields.cfg
+++ b/config-model/src/test/derived/indexschema/vsmfields.cfg
@@ -3,121 +3,145 @@ searchall 1
fieldspec[].name "sa"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "sb"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "sc"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "sd"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "pos"
fieldspec[].searchmethod GEOPOS
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "se"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "word"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
fieldspec[].name "sf"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "sg"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "sh"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "si"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "exact1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "exact"
+fieldspec[].normalize LOWERCASE
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "exact2"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "exact"
+fieldspec[].normalize LOWERCASE
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "bm25_field"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "ia"
fieldspec[].searchmethod INT32
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
fieldspec[].name "ib"
fieldspec[].searchmethod INT32
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
fieldspec[].name "ic"
fieldspec[].searchmethod INT32
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
fieldspec[].name "nostemstring1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "nostemstring2"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "nostemstring3"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "nostemstring4"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "fs9"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "f10.text"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "sd_literal"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "pos_zcurve"
fieldspec[].searchmethod INT64
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
documenttype[].name "indexschema"
diff --git a/config-model/src/test/derived/nearestneighbor_streaming/vsmfields.cfg b/config-model/src/test/derived/nearestneighbor_streaming/vsmfields.cfg
index f8b1cf62048..ec06d01f05a 100644
--- a/config-model/src/test/derived/nearestneighbor_streaming/vsmfields.cfg
+++ b/config-model/src/test/derived/nearestneighbor_streaming/vsmfields.cfg
@@ -3,21 +3,25 @@ searchall 1
fieldspec[].name "vec_a"
fieldspec[].searchmethod NEAREST_NEIGHBOR
fieldspec[].arg1 "EUCLIDEAN"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
fieldspec[].name "vec_b"
fieldspec[].searchmethod NEAREST_NEIGHBOR
fieldspec[].arg1 "ANGULAR"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
fieldspec[].name "vec_c"
fieldspec[].searchmethod NEAREST_NEIGHBOR
fieldspec[].arg1 "INNERPRODUCT"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
fieldspec[].name "vec_d"
fieldspec[].searchmethod NONE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
documenttype[].name "test"
diff --git a/config-model/src/test/derived/prefixexactattribute/vsmfields.cfg b/config-model/src/test/derived/prefixexactattribute/vsmfields.cfg
index 29bcde9faad..75192ef3121 100644
--- a/config-model/src/test/derived/prefixexactattribute/vsmfields.cfg
+++ b/config-model/src/test/derived/prefixexactattribute/vsmfields.cfg
@@ -3,26 +3,31 @@ searchall 1
fieldspec[].name "indexfield0"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "prefix"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 79
fieldspec[].fieldtype INDEX
fieldspec[].name "attributefield1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "exact"
+fieldspec[].normalize LOWERCASE
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
fieldspec[].name "attributefield2"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "exact"
+fieldspec[].normalize LOWERCASE
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
fieldspec[].name "indexfield1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "exact"
+fieldspec[].normalize LOWERCASE
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "indexfield2"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "exact"
+fieldspec[].normalize LOWERCASE
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
documenttype[].name "prefixexactattribute"
diff --git a/config-model/src/test/derived/streamingstruct/vsmfields.cfg b/config-model/src/test/derived/streamingstruct/vsmfields.cfg
index 7178f9d41ea..b5a234e8095 100644
--- a/config-model/src/test/derived/streamingstruct/vsmfields.cfg
+++ b/config-model/src/test/derived/streamingstruct/vsmfields.cfg
@@ -3,281 +3,337 @@ searchall 1
fieldspec[].name "coupleof"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "normalfields"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "a.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "a.f1s"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "substring"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "a.f2"
fieldspec[].searchmethod INT32
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "a.f3"
fieldspec[].searchmethod DOUBLE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "m.key"
fieldspec[].searchmethod INT64
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "m.value"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "b.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "b.f1s"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "substring"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "b.f2"
fieldspec[].searchmethod INT32
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "b.f3"
fieldspec[].searchmethod DOUBLE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "c.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "word"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
fieldspec[].name "c.f1s"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "prefix"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "c.f3"
fieldspec[].searchmethod DOUBLE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "c2.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "word"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
fieldspec[].name "c2.f1s"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "suffix"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "c2.f2"
fieldspec[].searchmethod INT32
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "c2.f3"
fieldspec[].searchmethod DOUBLE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "c3.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "word"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
fieldspec[].name "c3.f1s"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "prefix"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "c3.f2"
fieldspec[].searchmethod INT32
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "c3.f3"
fieldspec[].searchmethod DOUBLE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "n.nf1.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "n.nf1.f1s"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "prefix"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "n.nf1.f3"
fieldspec[].searchmethod DOUBLE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "n.nf1s.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "substring"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "n.nf1s.f1s"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "substring"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "n.nf1s.f2"
fieldspec[].searchmethod INT32
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "n.nf1s.f3"
fieldspec[].searchmethod DOUBLE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "n.nf2"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "array1.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "array1.f1s"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "substring"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "array1.f2"
fieldspec[].searchmethod INT32
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "array1.f3"
fieldspec[].searchmethod DOUBLE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "array2.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "array2.f1s"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "substring"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "array2.f2"
fieldspec[].searchmethod INT32
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "array2.f3"
fieldspec[].searchmethod DOUBLE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "array3.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "word"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype ATTRIBUTE
fieldspec[].name "array3.f1s"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "prefix"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "array3.f3"
fieldspec[].searchmethod DOUBLE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "subject.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "d.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "prefix"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "d.f1s"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "prefix"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "d.f2"
fieldspec[].searchmethod INT32
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "d.f3"
fieldspec[].searchmethod DOUBLE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "e.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "substring"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "e.f1s"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "substring"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "e.f2"
fieldspec[].searchmethod INT32
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "e.f3"
fieldspec[].searchmethod DOUBLE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "f.f1"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "suffix"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "f.f1s"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 "suffix"
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "f.f2"
fieldspec[].searchmethod INT32
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "f.f3"
fieldspec[].searchmethod DOUBLE
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
fieldspec[].name "g"
fieldspec[].searchmethod AUTOUTF8
fieldspec[].arg1 ""
+fieldspec[].normalize LOWERCASE_AND_FOLD
fieldspec[].maxlength 1048576
fieldspec[].fieldtype INDEX
documenttype[].name "streamingstruct"
diff --git a/config-model/src/test/java/com/yahoo/schema/derived/VsmFieldsTestCase.java b/config-model/src/test/java/com/yahoo/schema/derived/VsmFieldsTestCase.java
index 6423b621ab9..601c014bdc1 100644
--- a/config-model/src/test/java/com/yahoo/schema/derived/VsmFieldsTestCase.java
+++ b/config-model/src/test/java/com/yahoo/schema/derived/VsmFieldsTestCase.java
@@ -4,8 +4,12 @@ package com.yahoo.schema.derived;
import com.yahoo.config.model.application.provider.MockFileRegistry;
import com.yahoo.config.model.deploy.TestProperties;
import com.yahoo.config.model.test.MockApplicationPackage;
+import com.yahoo.document.DataType;
import com.yahoo.documentmodel.NewDocumentReferenceDataType;
import com.yahoo.schema.Schema;
+import com.yahoo.schema.document.Case;
+import com.yahoo.schema.document.MatchType;
+import com.yahoo.schema.document.Matching;
import com.yahoo.schema.document.SDDocumentType;
import com.yahoo.schema.document.SDField;
import com.yahoo.schema.document.TemporarySDField;
@@ -19,24 +23,55 @@ import static org.junit.jupiter.api.Assertions.assertEquals;
*/
public class VsmFieldsTestCase {
- @SuppressWarnings("deprecation")
- @Test
- void reference_type_field_is_unsearchable() {
+ private static Schema createSchema() {
Schema schema = new Schema("test", MockApplicationPackage.createEmpty(), new MockFileRegistry(), new TestableDeployLogger(), new TestProperties());
var sdoc = new SDDocumentType("test");
schema.addDocument(sdoc);
- SDField refField = new TemporarySDField(sdoc, "ref_field", NewDocumentReferenceDataType.forDocumentName("parent_type"));
- refField.parseIndexingScript("{ summary }");
- schema.getDocument().addField(refField);
+ return schema;
+ }
+ private static VsmfieldsConfig vsmfieldsConfig(Schema schema) {
VsmFields vsmFields = new VsmFields(schema);
VsmfieldsConfig.Builder cfgBuilder = new VsmfieldsConfig.Builder();
vsmFields.getConfig(cfgBuilder);
- VsmfieldsConfig cfg = cfgBuilder.build();
+ return cfgBuilder.build();
+ }
+
+ @Test
+ void reference_type_field_is_unsearchable() {
+ Schema schema = createSchema();
+ SDField field = new TemporarySDField(schema.getDocument(), "ref_field", NewDocumentReferenceDataType.forDocumentName("parent_type"));
+ field.parseIndexingScript("{ summary }");
+ schema.getDocument().addField(field);
+ VsmfieldsConfig cfg = vsmfieldsConfig(schema);
assertEquals(1, cfg.fieldspec().size());
VsmfieldsConfig.Fieldspec fieldSpec = cfg.fieldspec().get(0);
assertEquals("ref_field", fieldSpec.name());
assertEquals(VsmfieldsConfig.Fieldspec.Searchmethod.NONE, fieldSpec.searchmethod());
}
+
+    /**
+     * Builds a schema with a single indexed string field "f" using the given matching,
+     * derives the vsmfields config and verifies the resulting field spec.
+     *
+     * @param matching  the matching settings applied to the field
+     * @param normalize the normalize mode the derived field spec is expected to carry
+     * @param arg1      the expected arg1 value of the derived field spec
+     */
+    private void testIndexMatching(Matching matching, VsmfieldsConfig.Fieldspec.Normalize.Enum normalize, String arg1) {
+        Schema schema = createSchema();
+        SDField field = new TemporarySDField(schema.getDocument(), "f", DataType.STRING);
+        field.parseIndexingScript("{ index }");
+        field.setMatching(matching);
+        schema.getDocument().addField(field);
+        VsmfieldsConfig cfg = vsmfieldsConfig(schema);
+        // Exactly one field was added, so exactly one spec is expected; checking this first
+        // gives a readable failure instead of an index error from get(0).
+        assertEquals(1, cfg.fieldspec().size());
+        VsmfieldsConfig.Fieldspec fieldSpec = cfg.fieldspec().get(0);
+        assertEquals("f", fieldSpec.name());
+        assertEquals(VsmfieldsConfig.Fieldspec.Searchmethod.AUTOUTF8, fieldSpec.searchmethod());
+        assertEquals(normalize, fieldSpec.normalize());
+        assertEquals(arg1, fieldSpec.arg1());
+    }
+
+    /** Verifies the derived normalize mode and arg1 for three matching configurations of a string field. */
+    @Test
+    void test_exact_string() {
+        // Text matching without an explicit case setting.
+        Matching text = new Matching(MatchType.TEXT);
+        testIndexMatching(text, VsmfieldsConfig.Fieldspec.Normalize.LOWERCASE_AND_FOLD, "");
+
+        // Text matching with case preserved.
+        Matching casedText = new Matching(MatchType.TEXT).setCase(Case.CASED);
+        testIndexMatching(casedText, VsmfieldsConfig.Fieldspec.Normalize.NONE, "");
+
+        // Exact matching with case preserved.
+        Matching casedExact = new Matching(MatchType.EXACT).setCase(Case.CASED);
+        testIndexMatching(casedExact, VsmfieldsConfig.Fieldspec.Normalize.LOWERCASE, "exact");
+    }
}
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
index 83b84fffa11..74d8fdc4bf3 100644
--- a/streamingvisitors/src/tests/searcher/searcher_test.cpp
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -22,6 +22,7 @@ using search::streaming::HitList;
using search::streaming::QueryNodeResultFactory;
using search::streaming::QueryTerm;
using search::streaming::Normalizing;
+using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
using search::streaming::QueryTermList;
using TermType = QueryTerm::Type;
using namespace vsm;
@@ -114,7 +115,7 @@ struct SnippetModifierSetup
SnippetModifierSetup::SnippetModifierSetup(const StringList & terms)
: query(terms),
- searcher(new UTF8SubstringSnippetModifier()),
+ searcher(new UTF8SubstringSnippetModifier(0)),
env(),
modifier(searcher)
{
@@ -361,7 +362,7 @@ assertFieldInfo(FieldSearcher & fs, const StringList & query,
void
assertSnippetModifier(const StringList & query, const std::string & fv, const std::string & exp)
{
- UTF8SubstringSnippetModifier mod;
+ UTF8SubstringSnippetModifier mod(0);
performSearch(mod, query, StringFieldValue(fv));
EXPECT_EQUAL(mod.getModifiedBuf().getPos(), exp.size());
std::string actual(mod.getModifiedBuf().getBuffer(), mod.getModifiedBuf().getPos());
@@ -440,11 +441,11 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs)
assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits()).add(Hits()));
assertString(fs, StringList().add("and").add("overloading"), field, HitsList().add(Hits().add(1)).add(Hits().add(3)));
- fs.setMatchType(FieldSearcher::PREFIX);
+ fs.match_type(FieldSearcher::PREFIX);
assertString(fs, "oper", field, Hits().add(0).add(2));
assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits().add(0).add(2)).add(Hits()));
- fs.setMatchType(FieldSearcher::REGULAR);
+ fs.match_type(FieldSearcher::REGULAR);
if (!EXPECT_TRUE(testStringFieldInfo(fs))) return false;
{ // test handling of several underscores
@@ -553,12 +554,12 @@ TEST("utf8 substring search with empty term")
TEST("utf8 suffix search") {
UTF8SuffixStringFieldSearcher fs(0);
std::string field = "operators and operator overloading";
- assertString(fs, "rsand", field, Hits());
- assertString(fs, "tor", field, Hits().add(2));
- assertString(fs, "tors", field, Hits().add(0));
+ TEST_DO(assertString(fs, "rsand", field, Hits()));
+ TEST_DO(assertString(fs, "tor", field, Hits().add(2)));
+ TEST_DO(assertString(fs, "tors", field, Hits().add(0)));
- assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits()));
- assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3)));
+ TEST_DO(assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits())));
+ TEST_DO(assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3))));
EXPECT_TRUE(testStringFieldInfo(fs));
}
@@ -590,22 +591,22 @@ TEST("utf8 flexible searcher"){
// prefix
assertString(fs, "vesp*", "vespa", Hits().add(0));
- fs.setMatchType(FieldSearcher::PREFIX);
+ fs.match_type(FieldSearcher::PREFIX);
assertString(fs, "vesp", "vespa", Hits().add(0));
// substring
- fs.setMatchType(FieldSearcher::REGULAR);
+ fs.match_type(FieldSearcher::REGULAR);
assertString(fs, "*esp*", "vespa", Hits().add(0));
- fs.setMatchType(FieldSearcher::SUBSTRING);
+ fs.match_type(FieldSearcher::SUBSTRING);
assertString(fs, "esp", "vespa", Hits().add(0));
// suffix
- fs.setMatchType(FieldSearcher::REGULAR);
+ fs.match_type(FieldSearcher::REGULAR);
assertString(fs, "*espa", "vespa", Hits().add(0));
- fs.setMatchType(FieldSearcher::SUFFIX);
+ fs.match_type(FieldSearcher::SUFFIX);
assertString(fs, "espa", "vespa", Hits().add(0));
- fs.setMatchType(FieldSearcher::REGULAR);
+ fs.match_type(FieldSearcher::REGULAR);
EXPECT_TRUE(testStringFieldInfo(fs));
}
@@ -659,7 +660,7 @@ TEST("integer search")
TEST("floating point search")
{
- FloatFieldSearcher fs;
+ FloatFieldSearcher fs(0);
TEST_DO(assertFloat(fs, "10", 10, true));
TEST_DO(assertFloat(fs, "10.5", 10.5, true));
TEST_DO(assertFloat(fs, "-10.5", -10.5, true));
@@ -726,7 +727,7 @@ TEST("Snippet modifier search") {
"\xe7\x9f\xb3\x1f\xe6\x98\x8e\xe5\x87\xb1\x1f\xe5\x9c\xa8");
{ // check that resizing works
- UTF8SubstringSnippetModifier mod;
+ UTF8SubstringSnippetModifier mod(0);
EXPECT_EQUAL(mod.getModifiedBuf().getLength(), 32u);
EXPECT_EQUAL(mod.getModifiedBuf().getPos(), 0u);
performSearch(mod, StringList().add("a"), StringFieldValue("aaaaaaaaaaaaaaaa"));
@@ -763,28 +764,32 @@ TEST("snippet modifier") {
}
}
-TEST("FieldSearchSpec constrution") {
+TEST("FieldSearchSpec construction") {
{
FieldSearchSpec f;
EXPECT_FALSE(f.valid());
EXPECT_EQUAL(0u, f.id());
EXPECT_EQUAL("", f.name());
EXPECT_EQUAL(0x100000u, f.maxLength());
+ EXPECT_EQUAL("", f.arg1());
+ EXPECT_TRUE(Normalizing::LOWERCASE_AND_FOLD == f.normalize_mode());
}
{
- FieldSearchSpec f(7, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 789);
+ FieldSearchSpec f(7, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 789);
EXPECT_TRUE(f.valid());
EXPECT_EQUAL(7u, f.id());
EXPECT_EQUAL("f0", f.name());
EXPECT_EQUAL(789u, f.maxLength());
EXPECT_EQUAL(789u, f.searcher().maxFieldLength());
+ EXPECT_EQUAL("substring", f.arg1());
+ EXPECT_TRUE(Normalizing::LOWERCASE == f.normalize_mode());
}
}
TEST("snippet modifier manager") {
FieldSearchSpecMapT specMap;
- specMap[0] = FieldSearchSpec(0, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 1000);
- specMap[1] = FieldSearchSpec(1, "f1", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "", 1000);
+ specMap[0] = FieldSearchSpec(0, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 1000);
+ specMap[1] = FieldSearchSpec(1, "f1", Searchmethod::AUTOUTF8, Normalizing::NONE, "", 1000);
IndexFieldMapT indexMap;
indexMap["i0"].push_back(0);
indexMap["i1"].push_back(1);
diff --git a/streamingvisitors/src/tests/textutil/textutil_test.cpp b/streamingvisitors/src/tests/textutil/textutil_test.cpp
index b926444e4df..f7f340a2182 100644
--- a/streamingvisitors/src/tests/textutil/textutil_test.cpp
+++ b/streamingvisitors/src/tests/textutil/textutil_test.cpp
@@ -2,7 +2,6 @@
#include <vespa/vespalib/testkit/testapp.h>
#include <vespa/fastlib/text/normwordfolder.h>
-#include <vespa/searchlib/query/base.h>
#include <vespa/vsm/searcher/fold.h>
#include <vespa/vsm/searcher/futf8strchrfieldsearcher.h>
#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
@@ -66,7 +65,7 @@ TextUtilTest::assertSkipSeparators(const char * input, size_t len, const UCS4V &
const byte * srcbuf = reinterpret_cast<const byte *>(input);
auto dstbuf = std::make_unique<ucs4_t[]>(len + 1);
auto offsets = std::make_unique<size_t[]>(len + 1);
- UTF8StrChrFieldSearcher fs;
+ UTF8StrChrFieldSearcher fs(0);
BW bw(dstbuf.get(), offsets.get());
size_t dstlen = fs.skipSeparators(srcbuf, len, bw);
EXPECT_EQUAL(dstlen, expdstbuf.size());
diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
index 4161adaf21f..cdd1a018d84 100644
--- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
+++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp
@@ -42,6 +42,7 @@ using search::aggregation::HitsAggregationResult;
using search::attribute::IAttributeVector;
using search::expression::ConfigureStaticParams;
using search::streaming::Query;
+using search::streaming::Normalizing;
using search::streaming::QueryTermList;
using storage::StorageComponent;
using storage::VisitorEnvironment;
@@ -326,20 +327,41 @@ SearchVisitor::is_text_matching(vespalib::stringref index) const noexcept {
return false;
}
+namespace {
+
+/**
+ * Counts the fields in fieldIdMap that have an entry in specMap whose searcher
+ * uses the given normalize mode. Fields without an entry in specMap are not counted.
+ * Returns size_t so the result compares against container sizes without narrowing.
+ */
+size_t
+count_normalize_mode(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap, Normalizing mode) {
+    size_t count = 0;
+    for (const auto & fieldId : fieldIdMap.map()) {
+        auto found = specMap.find(fieldId.second);
+        if ((found != specMap.end()) && (found->second.searcher().normalize_mode() == mode)) {
+            count++;
+        }
+    }
+    return count;
+}
+
+/// Number of fields in fieldIdMap whose searcher normalizes with Normalizing::LOWERCASE.
+size_t
+count_normalize_lowercase(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) {
+    return count_normalize_mode(specMap, fieldIdMap, Normalizing::LOWERCASE);
+}
+
+/// Number of fields in fieldIdMap whose searcher normalizes with Normalizing::NONE.
+size_t
+count_normalize_none(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) {
+    return count_normalize_mode(specMap, fieldIdMap, Normalizing::NONE);
+}
+
+}
+
SearchMethodInfo::Normalizing
SearchVisitor::normalizing_mode(vespalib::stringref index) const noexcept {
StringFieldIdTMap fieldIdMap;
_fieldSearchSpecMap.addFieldsFromIndex(index, fieldIdMap);
- size_t num_exact = 0;
- for (const auto & fieldId : fieldIdMap.map()) {
- auto found = _fieldSearchSpecMap.specMap().find(fieldId.second);
- if ((found != _fieldSearchSpecMap.specMap().end()) && found->second.searcher().exact()) {
- num_exact++;
- }
- }
- return ((num_exact == 0) || (num_exact != fieldIdMap.map().size()))
- ? Normalizing::LOWERCASE_AND_FOLD
- : Normalizing::LOWERCASE;
+ // Select the normalize mode shared by every field of the index; any mix falls back to LOWERCASE_AND_FOLD.
+ const auto & specMap = _fieldSearchSpecMap.specMap();
+ const size_t num_fields = fieldIdMap.map().size();
+ // An index that resolves to no fields keeps LOWERCASE_AND_FOLD, as the replaced code did;
+ // without this guard the 0 == 0 comparison below would select NONE.
+ if (num_fields == 0) return Normalizing::LOWERCASE_AND_FOLD;
+ if (count_normalize_none(specMap, fieldIdMap) == num_fields) return Normalizing::NONE;
+ if (count_normalize_lowercase(specMap, fieldIdMap) == num_fields) return Normalizing::LOWERCASE;
+ return Normalizing::LOWERCASE_AND_FOLD;
}
void
diff --git a/streamingvisitors/src/vespa/vsm/config/vsmfields.def b/streamingvisitors/src/vespa/vsm/config/vsmfields.def
index 442a044d38f..dac732013d2 100644
--- a/streamingvisitors/src/vespa/vsm/config/vsmfields.def
+++ b/streamingvisitors/src/vespa/vsm/config/vsmfields.def
@@ -14,6 +14,7 @@ fieldspec[].name string
## The search method for a given field. Note: the same field in 2 different document types must match on type; if not, a random result might be expected.
fieldspec[].searchmethod enum { NONE, BOOL, AUTOUTF8, UTF8, SSE2UTF8, INT8, INT16, INT32, INT64, FLOAT16, FLOAT, DOUBLE, GEOPOS, NEAREST_NEIGHBOR } default=AUTOUTF8
fieldspec[].arg1 string default=""
+fieldspec[].normalize enum { NONE, LOWERCASE, LOWERCASE_AND_FOLD } default=LOWERCASE_AND_FOLD
## Maximum number of chars to search per field.
fieldspec[].maxlength int default=1048576
diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h
index c7e7d2e74bd..3708cca85fb 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h
@@ -9,8 +9,8 @@ class BoolFieldSearcher : public FieldSearcher
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- BoolFieldSearcher(FieldIdT fId);
- ~BoolFieldSearcher();
+ explicit BoolFieldSearcher(FieldIdT fId);
+ ~BoolFieldSearcher() override;
void prepare(search::streaming::QueryTermList& qtl,
const SharedSearcherBuf& buf,
const vsm::FieldPathMapT& field_paths,
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
index 55d80413b8c..5e06ae41a03 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
@@ -51,14 +51,13 @@ FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept
: FieldSearcherBase(),
_field(fId),
_matchType(defaultPrefix ? PREFIX : REGULAR),
+ _normalize_mode(Normalizing::LOWERCASE_AND_FOLD),
_maxFieldLength(0x100000),
_currentElementId(0),
_currentElementWeight(1),
_words(0),
- _badUtf8Count(0),
- _zeroCount(0)
+ _badUtf8Count(0)
{
- zeroStat();
}
FieldSearcher::~FieldSearcher() = default;
@@ -71,7 +70,7 @@ FieldSearcher::search(const StorageDocument & doc)
fInfo.setHitOffset(qt->getHitList().size());
}
onSearch(doc);
- for(auto qt : _qtl) {
+ for (auto qt : _qtl) {
QueryTerm::FieldInfo & fInfo = qt->getFieldInfo(field());
fInfo.setHitCount(qt->getHitList().size() - fInfo.getHitOffset());
fInfo.setFieldLength(_words);
@@ -114,13 +113,6 @@ FieldSearcher::prepareFieldId()
}
void
-FieldSearcher::zeroStat()
-{
- _badUtf8Count = 0;
- _zeroCount = 0;
-}
-
-void
FieldSearcher::init()
{
for (unsigned i = 0; i < NELEMS(_foldLowCase); i++) {
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
index e64c41f814f..c5bca6f3899 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
@@ -34,12 +34,13 @@ protected:
class FieldSearcher : public FieldSearcherBase
{
public:
+ using Normalizing = search::streaming::Normalizing;
enum MatchType {
REGULAR,
PREFIX,
SUBSTRING,
SUFFIX,
- EXACT
+ EXACT,
};
explicit FieldSearcher(FieldIdT fId) noexcept : FieldSearcher(fId, false) {}
@@ -50,20 +51,22 @@ public:
virtual void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf,
const vsm::FieldPathMapT& field_paths, search::fef::IQueryEnvironment& query_env);
- FieldIdT field() const { return _field; }
- void field(FieldIdT v) { _field = v; prepareFieldId(); }
- bool prefix() const { return _matchType == PREFIX; }
- bool substring() const { return _matchType == SUBSTRING; }
- bool suffix() const { return _matchType == SUFFIX; }
- bool exact() const { return _matchType == EXACT; }
- void setMatchType(MatchType mt) { _matchType = mt; }
+ FieldIdT field() const noexcept { return _field; }
+ bool prefix() const noexcept { return _matchType == PREFIX; }
+ bool substring() const noexcept { return _matchType == SUBSTRING; }
+ bool suffix() const noexcept { return _matchType == SUFFIX; }
+ bool exact() const noexcept { return _matchType == EXACT; }
+ Normalizing normalize_mode() const noexcept { return _normalize_mode; }
+ MatchType match_type() const noexcept { return _matchType; }
+ void match_type(MatchType mt) noexcept { _matchType = mt; }
+ void normalize_mode(Normalizing mode) noexcept { _normalize_mode = mode; }
+ void field(FieldIdT v) noexcept { _field = v; prepareFieldId(); }
static void init();
static search::byte fold(search::byte c) { return _foldLowCase[c]; }
static search::byte iswordchar(search::byte c) { return _wordChar[c]; }
static search::byte isspace(search::byte c) { return ! iswordchar(c); }
static size_t countWords(const FieldRef & f);
- int32_t getCurrentWeight() const { return _currentElementWeight; }
- void zeroStat();
+ int32_t currentWeight() const { return _currentElementWeight; }
FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; }
size_t maxFieldLength() const { return _maxFieldLength; }
@@ -89,22 +92,21 @@ private:
virtual void onStructValue(const document::StructFieldValue &) { }
FieldIdT _field;
MatchType _matchType;
+ Normalizing _normalize_mode;
unsigned _maxFieldLength;
uint32_t _currentElementId;
int32_t _currentElementWeight; // Contains the weight of the current item being evaluated.
protected:
/// Number of terms searched.
- unsigned _words;
+ unsigned _words;
/// Number of utf8 bytes by utf8 size.
- unsigned _badUtf8Count;
- unsigned _zeroCount;
-protected:
+ unsigned _badUtf8Count;
/**
* Adds a hit to the given query term.
* For each call to onValue() a batch of words are processed, and the position is local to this batch.
**/
void addHit(search::streaming::QueryTerm & qt, uint32_t pos) const {
- qt.add(_words + pos, field(), _currentElementId, getCurrentWeight());
+ qt.add(_words + pos, field(), _currentElementId, _currentElementWeight);
}
public:
static search::byte _foldLowCase[256];
diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h
index 07b3f6e1c5f..85341472c26 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h
@@ -9,8 +9,8 @@ template <typename T>
class FloatFieldSearcherT : public FieldSearcher
{
public:
- FloatFieldSearcherT(FieldIdT fId=0);
- ~FloatFieldSearcherT();
+ explicit FloatFieldSearcherT(FieldIdT fId);
+ ~FloatFieldSearcherT() override;
void prepare(search::streaming::QueryTermList& qtl,
const SharedSearcherBuf& buf,
const vsm::FieldPathMapT& field_paths,
@@ -42,14 +42,14 @@ class FloatFieldSearcher : public FloatFieldSearcherTF
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- FloatFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTF(fId) { }
+ explicit FloatFieldSearcher(FieldIdT fId) : FloatFieldSearcherTF(fId) { }
};
class DoubleFieldSearcher : public FloatFieldSearcherTD
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- DoubleFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTD(fId) { }
+ // explicit, matching FloatFieldSearcher: a bare FieldIdT must not convert implicitly into a searcher.
+ explicit DoubleFieldSearcher(FieldIdT fId) : FloatFieldSearcherTD(fId) { }
};
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp
index d7d73899e53..c0b5117d6bf 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp
@@ -19,10 +19,6 @@ FUTF8StrChrFieldSearcher::duplicate() const
return std::make_unique<FUTF8StrChrFieldSearcher>(*this);
}
-FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher()
- : UTF8StrChrFieldSearcher(),
- _folded(4_Ki)
-{ }
FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher(FieldIdT fId)
: UTF8StrChrFieldSearcher(fId),
_folded(4_Ki)
diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h
index 5d5ca3d6c3c..b8aa287070a 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h
@@ -9,15 +9,14 @@ class FUTF8StrChrFieldSearcher : public UTF8StrChrFieldSearcher
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- FUTF8StrChrFieldSearcher();
- FUTF8StrChrFieldSearcher(FieldIdT fId);
+ explicit FUTF8StrChrFieldSearcher(FieldIdT fId);
~FUTF8StrChrFieldSearcher() override;
static bool ansiFold(const char * toFold, size_t sz, char * folded);
static bool lfoldaa(const char * toFold, size_t sz, char * folded, size_t & unalignedStart);
static bool lfoldua(const char * toFold, size_t sz, char * folded, size_t & alignedStart);
private:
size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
- size_t matchTerms(const FieldRef&, const size_t shortestTerm) override;
+ size_t matchTerms(const FieldRef&, size_t shortestTerm) override;
virtual size_t match(const char *folded, size_t sz, search::streaming::QueryTerm & qt);
size_t match(const char *folded, size_t sz, size_t mintsz, search::streaming::QueryTerm ** qtl, size_t qtlSize);
std::vector<char> _folded;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h
index 741148fbca1..17c9f23fefb 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h
@@ -8,8 +8,8 @@ namespace vsm {
class GeoPosFieldSearcher : public FieldSearcher {
public:
- GeoPosFieldSearcher(FieldIdT fId=0);
- ~GeoPosFieldSearcher();
+ GeoPosFieldSearcher(FieldIdT fId);
+ ~GeoPosFieldSearcher() override;
void prepare(search::streaming::QueryTermList& qtl,
const SharedSearcherBuf& buf,
const vsm::FieldPathMapT& field_paths,
@@ -21,7 +21,7 @@ protected:
using GeoLocation = search::common::GeoLocation;
class GeoPosInfo : public GeoLocation {
public:
- GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {}
+ explicit GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {}
bool cmp(const document::StructFieldValue & fv) const;
};
using GeoPosInfoListT = std::vector<GeoPosInfo>;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h
index 47b83c1538d..9c63d31e3c3 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h
@@ -9,8 +9,8 @@ class IntFieldSearcher : public FieldSearcher
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- IntFieldSearcher(FieldIdT fId=0);
- ~IntFieldSearcher();
+ explicit IntFieldSearcher(FieldIdT fId);
+ ~IntFieldSearcher() override;
void prepare(search::streaming::QueryTermList& qtl,
const SharedSearcherBuf& buf,
const vsm::FieldPathMapT& field_paths,
diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
index 76fedbd1166..816317bf86d 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp
@@ -141,17 +141,17 @@ NearestNeighborFieldSearcher::onValue(const document::FieldValue& fv)
}
DistanceMetric
-NearestNeighborFieldSearcher::distance_metric_from_string(const vespalib::string& value)
+NearestNeighborFieldSearcher::distance_metric_from_string(vespalib::stringref value)
{
// Valid string values must match the definition of DistanceMetric in
// config-model/src/main/java/com/yahoo/schema/document/Attribute.java
- auto v = value;
+ // Work on a lower-cased copy so the lookup is case-insensitive.
+ vespalib::string v = value;
std::transform(v.begin(), v.end(), v.begin(),
[](unsigned char c) { return std::tolower(c); });
try {
return DistanceMetricUtils::to_distance_metric(v);
} catch (vespalib::IllegalStateException&) {
- vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", value.c_str());
+ // Report the value as the caller spelled it (not the lower-cased copy), as before the stringref change.
+ vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", vespalib::string(value).c_str());
return DistanceMetric::Euclidean;
}
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
index 5629b443c78..ecdc64d1336 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h
@@ -11,10 +11,7 @@
#include <vespa/searchlib/tensor/tensor_ext_attribute.h>
namespace search::fef { class IQueryEnvironment; }
-
-namespace search::tensor {
-class TensorExtAttribute;
-}
+namespace search::tensor { class TensorExtAttribute; }
namespace vsm {
@@ -43,7 +40,7 @@ private:
public:
NearestNeighborFieldSearcher(FieldIdT fid,
search::attribute::DistanceMetric metric);
- ~NearestNeighborFieldSearcher();
+ ~NearestNeighborFieldSearcher() override;
std::unique_ptr<FieldSearcher> duplicate() const override;
void prepare(search::streaming::QueryTermList& qtl,
@@ -52,7 +49,7 @@ public:
search::fef::IQueryEnvironment& query_env) override;
void onValue(const document::FieldValue& fv) override;
- static search::attribute::DistanceMetric distance_metric_from_string(const vespalib::string& value);
+ static search::attribute::DistanceMetric distance_metric_from_string(vespalib::stringref value);
};
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h
index 9ad76712092..19c723d060d 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h
@@ -8,8 +8,7 @@ namespace vsm {
class StrChrFieldSearcher : public FieldSearcher
{
public:
- StrChrFieldSearcher() : FieldSearcher(0) { }
- StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { }
+ explicit StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { }
void onValue(const document::FieldValue & fv) override;
void prepare(search::streaming::QueryTermList& qtl,
const SharedSearcherBuf& buf,
@@ -19,7 +18,7 @@ private:
size_t shortestTerm() const;
bool matchDoc(const FieldRef & field);
virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) = 0;
- virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) = 0;
+ virtual size_t matchTerms(const FieldRef & f, size_t shortestTerm) = 0;
};
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
index dd6f31581a0..aaf8b940dc8 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
@@ -17,10 +17,10 @@ protected:
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- UTF8ExactStringFieldSearcher(FieldIdT fId)
+ explicit UTF8ExactStringFieldSearcher(FieldIdT fId)
: UTF8StringFieldSearcherBase(fId)
{
- setMatchType(EXACT);
+ match_type(EXACT);
}
};
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
index 655b068e152..78f491198ad 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
@@ -58,10 +58,6 @@ UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
}
}
-UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher() :
- UTF8StringFieldSearcherBase()
-{ }
-
UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher(FieldIdT fId) :
UTF8StringFieldSearcherBase(fId)
{ }
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
index 5eee6a8862a..04fbee96d36 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
@@ -17,18 +17,17 @@ private:
* Tries to match the given query term against the content of the given field reference.
* Search strategy is chosen based on the query term type.
**/
- virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+ size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
/**
* Tries to match each query term in the underlying query against the content of the given field reference.
* Search strategy is chosen based on the query term type.
**/
- virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+ size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- UTF8FlexibleStringFieldSearcher();
- UTF8FlexibleStringFieldSearcher(FieldIdT fId);
+ explicit UTF8FlexibleStringFieldSearcher(FieldIdT fId);
};
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
index 651d1dcad9f..fa1fc83728c 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
@@ -14,21 +14,19 @@ UTF8StrChrFieldSearcher::duplicate() const
}
size_t
-UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
{
(void) mintsz;
termcount_t words(0);
- const byte * n = reinterpret_cast<const byte *> (f.data());
- const byte * e = n + f.size();
if (f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
cmptype_t * fn = &(*_buf.get())[0];
- size_t fl(0);
- for( ; n < e; ) {
- if (!*n) { _zeroCount++; n++; }
- n = tokenize(n, _buf->capacity(), fn, fl);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t fl = reader.complete();
for (auto qt : _qtl) {
const cmptype_t * term;
termsize_t tsz = qt->term(term);
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h
index cfe546bc6f6..663ee3a1a62 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h
@@ -13,12 +13,10 @@ class UTF8StrChrFieldSearcher : public UTF8StringFieldSearcherBase
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- UTF8StrChrFieldSearcher() : UTF8StringFieldSearcherBase() { }
- UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
-
+ explicit UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
protected:
size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
- size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+ size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
};
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
index ebdf69d0b30..ce63f55ea63 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -1,7 +1,6 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "utf8stringfieldsearcherbase.h"
-#include <vespa/fastlib/text/normwordfolder.h>
#include <cassert>
using search::streaming::QueryTerm;
@@ -10,107 +9,36 @@ using search::byte;
namespace vsm {
-const byte *
-UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen)
-{
- if (maxSz > 0) {
- maxSz--;
- }
- ucs4_t c(*p);
- ucs4_t *q(dstbuf);
- const byte * end(p+maxSz);
-
- // Skip non-word characters between words
- for (; p < end; ) {
- if (c < 128) {
- if (!c) { break; }
- p++;
- if (__builtin_expect(Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) {
- *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c);
- c = 0;
- } else {
- c = *p;
- }
- } else {
- c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
- if (Fast_UnicodeUtil::IsWordChar(c)) {
- const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
- if (repl != nullptr) {
- size_t repllen = strlen(repl);
- if (repllen > 0) {
- q = Fast_UnicodeUtil::ucs4copy(q,repl);
- }
- } else {
- c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
- *q++ = c;
- }
- break;
- } else {
- if (c == Fast_UnicodeUtil::_BadUTF8Char) {
- _badUtf8Count++;
- }
- c = *p;
- }
- }
- }
-
- c = *p; // Next char
- for (; p < end;) {
- if (c < 128) { // Common case, ASCII
- if (!c) { break; }
- p++;
- if (__builtin_expect(!Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) {
- c = 0;
- } else {
- *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c);
- c = *p;
- }
- } else {
- c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
- if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) {
- const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
- if (repl != nullptr) {
- size_t repllen = strlen(repl);
- if (repllen > 0) {
- q = Fast_UnicodeUtil::ucs4copy(q,repl);
- }
- } else {
- c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
- *q++ = c;
- }
-
- c = *p;
- } else {
- if (c == Fast_UnicodeUtil::_BadUTF8Char) {
- _badUtf8Count++;
- }
- break;
- }
+/**
+ * Extracts one token from the reader.
+ * Characters are consumed until the first word character is found; that character and every
+ * directly following word character are handed to reader.normalize() using this searcher's
+ * normalize mode. The character that ends the token has already been consumed by next().
+ * Callers in this file obtain the resulting token length through reader.complete().
+ */
+template<typename Reader>
+void
+UTF8StringFieldSearcherBase::tokenize(Reader & reader) {
+ // c stays 0 when the reader is already exhausted on entry.
+ ucs4_t c(0);
+ Normalizing norm_mode = normalize_mode();
+ // Skip ahead to the first word character (empty loop body, the work happens in the condition).
+ while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next()));
+
+ // False when input ran out before any word character was seen.
+ // NOTE(review): this also relies on IsWordChar(0) being false for exhausted input - confirm.
+ if (Fast_UnicodeUtil::IsWordChar(c)) {
+ reader.normalize(c, norm_mode);
+ // Emit the rest of the run; stops at end of input or at the first non-word character.
+ while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) {
+ reader.normalize(c, norm_mode);
}
}
- *q = 0;
- tokenlen = q - dstbuf;
- return p;
}
size_t
UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt)
{
termcount_t words(0);
- const byte * n = reinterpret_cast<const byte *> (f.data());
- // __builtin_prefetch(n, 0, 0);
const cmptype_t * term;
termsize_t tsz = qt.term(term);
- const byte * e = n + f.size();
if ( f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
- cmptype_t * fn = &(*_buf.get())[0];
- size_t fl(0);
+ cmptype_t * fn = _buf->data();
- for( ; n < e; ) {
- if (!*n) { _zeroCount++; n++; }
- n = tokenize(n, _buf->capacity(), fn, fl);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t fl = reader.complete();
if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) {
const cmptype_t *tt=term, *et=term+tsz;
for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++);
@@ -185,22 +113,17 @@ size_t
UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
{
termcount_t words = 0;
- const byte * srcbuf = reinterpret_cast<const byte *> (f.data());
- const byte * srcend = srcbuf + f.size();
const cmptype_t * term;
termsize_t tsz = qt.term(term);
if (f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
- cmptype_t * dstbuf = &(*_buf.get())[0];
- size_t tokenlen = 0;
+ cmptype_t * dstbuf = _buf->data();
- for( ; srcbuf < srcend; ) {
- if (*srcbuf == 0) {
- ++_zeroCount;
- ++srcbuf;
- }
- srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t tokenlen = reader.complete();
if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) {
addHit(qt, words);
}
@@ -209,11 +132,6 @@ UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
return words;
}
-UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase() :
- StrChrFieldSearcher()
-{
-}
-
UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase(FieldIdT fId) :
StrChrFieldSearcher(fId)
{
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
index 38aac508f4f..115cddce619 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
@@ -2,6 +2,7 @@
#pragma once
#include "strchrfieldsearcher.h"
+#include <vespa/fastlib/text/normwordfolder.h>
namespace vsm {
@@ -28,15 +29,15 @@ public:
ucs4_t * _cbuf;
public:
- BufferWrapper(ucs4_t * buf) : _bbuf(buf), _cbuf(buf) { }
- BufferWrapper(ucs4_t * buf, size_t *) : _bbuf(buf), _cbuf(buf) { }
+ explicit BufferWrapper(ucs4_t * buf) noexcept : _bbuf(buf), _cbuf(buf) { }
+ BufferWrapper(ucs4_t * buf, size_t *) noexcept : _bbuf(buf), _cbuf(buf) { }
void onCharacter(ucs4_t ch, size_t) { *_cbuf++ = ch; }
void onOffset(size_t) { }
void incBuf(size_t inc) { _cbuf += inc; }
ucs4_t * getBuf() { return _cbuf; }
- bool valid() { return true; }
- size_t size() { return (_cbuf - _bbuf); }
- bool hasOffsets() { return false; }
+ bool valid() const noexcept { return true; }
+ size_t size() const noexcept { return (_cbuf - _bbuf); }
+ bool hasOffsets() const noexcept { return false; }
};
/**
@@ -50,17 +51,74 @@ public:
size_t * _coff;
public:
- OffsetWrapper(ucs4_t * buf, size_t * offsets) : BufferWrapper(buf), _boff(offsets), _coff(offsets) {}
+ explicit OffsetWrapper(ucs4_t * buf, size_t * offsets) noexcept : BufferWrapper(buf), _boff(offsets), _coff(offsets) {}
void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; }
void onOffset(size_t of) { *_coff++ = of; }
- bool valid() { return (size() == (size_t)(_coff - _boff)); }
- bool hasOffsets() { return true; }
+ bool valid() const noexcept { return (size() == (size_t)(_coff - _boff)); }
+ bool hasOffsets() const noexcept { return true; }
};
protected:
SharedSearcherBuf _buf;
- const search::byte * tokenize(const search::byte * buf, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen);
+ using byte = search::byte;
+
+ /**
+ * Read/write cursor used while tokenizing: reads characters from the byte range
+ * [p, p + len) and writes normalized ucs4 characters to the buffer starting at q.
+ * NOTE(review): the output buffer must be large enough for the longest token plus a
+ * terminating 0; this class performs no bounds check on writes - callers reserve it.
+ */
+ class TokenizeReader {
+ public:
+ // len is size_t so that a size_t field length is not narrowed when passed in.
+ TokenizeReader(const byte *p, size_t len, ucs4_t *q) noexcept
+ : _p(p),
+ _p_end(p + len),
+ _q(q),
+ _q_start(q)
+ {}
+ // Decodes the character at the read position.
+ // NOTE(review): GetUTF8Char presumably advances _p past the decoded character - confirm.
+ ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); }
+ // Appends c to the output according to normalize_mode:
+ // NONE stores c unchanged, LOWERCASE stores lowercase_and_fold(c),
+ // LOWERCASE_AND_FOLD goes through fold(), which may write a replacement string instead.
+ void normalize(ucs4_t c, Normalizing normalize_mode) {
+ switch (normalize_mode) {
+ case Normalizing::LOWERCASE:
+ c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
+ [[fallthrough]];
+ case Normalizing::NONE:
+ *_q++ = c;
+ break;
+ case Normalizing::LOWERCASE_AND_FOLD:
+ fold(c);
+ break;
+ }
+ }
+ // True while unread input bytes remain.
+ bool hasNext() const noexcept { return _p < _p_end; }
+ // Current read position in the input.
+ const byte * p() const noexcept { return _p; }
+ // Terminates the current token with 0, rewinds the write position to the start of the
+ // output buffer (the next token overwrites this one) and returns the token length,
+ // terminator excluded.
+ size_t complete() noexcept {
+ *_q = 0;
+ size_t token_len = _q - _q_start;
+ _q = _q_start;
+ return token_len;
+ }
+ private:
+ // Writes the replacement string for c when one exists; a non-null but empty replacement
+ // writes nothing. Without a replacement, lowercase_and_fold(c) is stored.
+ void fold(ucs4_t c) {
+ const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
+ if (repl != nullptr) {
+ // Only emptiness matters here, so test the first byte instead of measuring the string.
+ if (*repl != '\0') {
+ _q = Fast_UnicodeUtil::ucs4copy(_q,repl);
+ }
+ } else {
+ c = Fast_NormalizeWordFolder::lowercase_and_fold(c);
+ *_q++ = c;
+ }
+ }
+ const byte *_p; // read position in the input
+ const byte *_p_end; // one past the last input byte
+ ucs4_t *_q; // write position in the output buffer
+ ucs4_t *_q_start; // start of the output buffer
+ };
+
+
+ template<typename Reader>
+ void tokenize(Reader & reader);
/**
* Matches the given query term against the words in the given field reference
@@ -103,9 +161,8 @@ protected:
size_t matchTermExact(const FieldRef & f, search::streaming::QueryTerm & qt);
public:
- UTF8StringFieldSearcherBase();
- UTF8StringFieldSearcherBase(FieldIdT fId);
- ~UTF8StringFieldSearcherBase();
+ explicit UTF8StringFieldSearcherBase(FieldIdT fId);
+ ~UTF8StringFieldSearcherBase() override;
void prepare(search::streaming::QueryTermList& qtl,
const SharedSearcherBuf& buf,
const vsm::FieldPathMapT& field_paths,
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
index 25ef9ae7618..fcc2893a71d 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
@@ -1,6 +1,6 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <vespa/vsm/searcher/utf8substringsearcher.h>
+#include "utf8substringsearcher.h"
#include <vespa/fastlib/text/unicodeutil.h>
using search::byte;
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h
index b1455d5c5f6..22ecf9c41fa 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h
@@ -12,11 +12,10 @@ class UTF8SubStringFieldSearcher : public UTF8StringFieldSearcherBase
{
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- UTF8SubStringFieldSearcher() : UTF8StringFieldSearcherBase() { }
- UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
+ explicit UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
protected:
size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
- size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+ size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
};
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
index 8403e69658f..6d8a399cd33 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
@@ -110,20 +110,11 @@ UTF8SubstringSnippetModifier::insertSeparators(const char * mbegin, const char *
_modified->put(_unitSep);
}
-UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier() :
- UTF8StringFieldSearcherBase(),
- _modified(new CharBuffer(32)),
- _offsets(new std::vector<size_t>(32)),
- _readPtr(NULL),
- _unitSep(juniper::separators::unit_separator)
-{
-}
-
UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId) :
UTF8StringFieldSearcherBase(fId),
_modified(new CharBuffer(32)),
_offsets(new std::vector<size_t>(32)),
- _readPtr(NULL),
+ _readPtr(nullptr),
_unitSep(juniper::separators::unit_separator)
{
}
@@ -134,12 +125,12 @@ UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId,
UTF8StringFieldSearcherBase(fId),
_modified(modBuf),
_offsets(offBuf),
- _readPtr(NULL),
+ _readPtr(nullptr),
_unitSep(juniper::separators::unit_separator)
{
}
-UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() {}
+UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() = default;
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h
index ebb806de61c..99e6c29961f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h
@@ -23,8 +23,8 @@ private:
const char * _readPtr; // buffer to read from (field reference)
char _unitSep; // the unit separator character to use
- virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
- virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+ size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+ size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
/**
* Copies n bytes from the field reference to the modified buffer and updates the read pointer.
@@ -51,9 +51,8 @@ public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- UTF8SubstringSnippetModifier();
- UTF8SubstringSnippetModifier(FieldIdT fId);
- ~UTF8SubstringSnippetModifier();
+ explicit UTF8SubstringSnippetModifier(FieldIdT fId);
+ ~UTF8SubstringSnippetModifier() override;
/**
* Creates a new instance.
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
index e28ce114225..4318d5fe1a3 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
@@ -14,24 +14,19 @@ UTF8SuffixStringFieldSearcher::duplicate() const
}
size_t
-UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz)
{
(void) mintsz;
termcount_t words = 0;
- const byte * srcbuf = reinterpret_cast<const byte *> (f.data());
- const byte * srcend = srcbuf + f.size();
if (f.size() >= _buf->size()) {
_buf->reserve(f.size() + 1);
}
cmptype_t * dstbuf = &(*_buf.get())[0];
- size_t tokenlen = 0;
- for( ; srcbuf < srcend; ) {
- if (*srcbuf == 0) {
- ++_zeroCount;
- ++srcbuf;
- }
- srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen);
+ TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf);
+ while ( reader.hasNext() ) {
+ tokenize(reader);
+ size_t tokenlen = reader.complete();
for (auto qt : _qtl) {
const cmptype_t * term;
termsize_t tsz = qt->term(term);
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h
index 556f61a714f..dc3bc214b49 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h
@@ -1,10 +1,9 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once
-#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
+#include "utf8stringfieldsearcherbase.h"
-namespace vsm
-{
+namespace vsm {
/**
* This class does suffix utf8 searches.
@@ -12,13 +11,12 @@ namespace vsm
class UTF8SuffixStringFieldSearcher : public UTF8StringFieldSearcherBase
{
protected:
- virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
- virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+ size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+ size_t matchTerms(const FieldRef & f, size_t shortestTerm) override;
public:
std::unique_ptr<FieldSearcher> duplicate() const override;
- UTF8SuffixStringFieldSearcher() : UTF8StringFieldSearcherBase() { }
- UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
+ explicit UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
};
}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
index 4b0efd58a56..715c19a0bb7 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
@@ -31,15 +31,13 @@ namespace {
void
setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) {
if (arg1 == "prefix") {
- searcher->setMatchType(FieldSearcher::PREFIX);
+ searcher->match_type(FieldSearcher::PREFIX);
} else if (arg1 == "substring") {
- searcher->setMatchType(FieldSearcher::SUBSTRING);
+ searcher->match_type(FieldSearcher::SUBSTRING);
} else if (arg1 == "suffix") {
- searcher->setMatchType(FieldSearcher::SUFFIX);
- } else if (arg1 == "exact") {
- searcher->setMatchType(FieldSearcher::EXACT);
- } else if (arg1 == "word") {
- searcher->setMatchType(FieldSearcher::EXACT);
+ searcher->match_type(FieldSearcher::SUFFIX);
+ } else if ((arg1 == "exact") || (arg1 == "word")) {
+ searcher->match_type(FieldSearcher::EXACT);
}
}
@@ -51,6 +49,7 @@ FieldSearchSpec::FieldSearchSpec()
_maxLength(0x100000),
_searcher(),
_searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE),
+ _normalize_mode(Normalizing::LOWERCASE_AND_FOLD),
_arg1(),
_reconfigured(false)
{
@@ -60,15 +59,15 @@ FieldSearchSpec::~FieldSearchSpec() = default;
FieldSearchSpec::FieldSearchSpec(FieldSearchSpec&& rhs) noexcept = default;
FieldSearchSpec& FieldSearchSpec::operator=(FieldSearchSpec&& rhs) noexcept = default;
-FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname,
- VsmfieldsConfig::Fieldspec::Searchmethod searchDef,
- const vespalib::string & arg1, size_t maxLength_) :
+FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, Searchmethod searchDef,
+ Normalizing normalize_mode, vespalib::stringref arg1_in, size_t maxLength_in) :
_id(fid),
_name(fname),
- _maxLength(maxLength_),
+ _maxLength(maxLength_in),
_searcher(),
_searchMethod(searchDef),
- _arg1(arg1),
+ _normalize_mode(normalize_mode),
+ _arg1(arg1_in),
_reconfigured(false)
{
switch(searchDef) {
@@ -79,13 +78,11 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string &
case VsmfieldsConfig::Fieldspec::Searchmethod::NONE:
case VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8:
case VsmfieldsConfig::Fieldspec::Searchmethod::UTF8:
- if (arg1 == "substring") {
+ if (_arg1 == "substring") {
_searcher = std::make_unique<UTF8SubStringFieldSearcher>(fid);
- } else if (arg1 == "suffix") {
+ } else if (_arg1 == "suffix") {
_searcher = std::make_unique<UTF8SuffixStringFieldSearcher>(fid);
- } else if (arg1 == "exact") {
- _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid);
- } else if (arg1 == "word") {
+ } else if ((_arg1 == "exact") || (_arg1 == "word")) {
_searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid);
} else if (searchDef == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) {
_searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid);
@@ -112,13 +109,14 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string &
_searcher = std::make_unique<GeoPosFieldSearcher>(fid);
break;
case VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR:
- auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(arg1);
+ auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(_arg1);
_searcher = std::make_unique<NearestNeighborFieldSearcher>(fid, dm);
break;
}
if (_searcher) {
- setMatchType(_searcher, arg1);
+ setMatchType(_searcher, _arg1);
_searcher->maxFieldLength(maxLength());
+ _searcher->normalize_mode(_normalize_mode);
}
}
@@ -166,20 +164,20 @@ FieldSearchSpecMap::FieldSearchSpecMap() = default;
FieldSearchSpecMap::~FieldSearchSpecMap() = default;
namespace {
- const std::string _G_empty("");
- const std::string _G_value(".value");
- const std::regex _G_map1("\\{[a-zA-Z0-9]+\\}");
- const std::regex _G_map2("\\{\".*\"\\}");
- const std::regex _G_array("\\[[0-9]+\\]");
+ const std::string G_empty;
+ const std::string G_value(".value");
+ const std::regex G_map1("\\{[a-zA-Z0-9]+\\}");
+ const std::regex G_map2("\\{\".*\"\\}");
+ const std::regex G_array("\\[[0-9]+\\]");
}
vespalib::string
FieldSearchSpecMap::stripNonFields(vespalib::stringref rawIndex)
{
if ((rawIndex.find('[') != vespalib::string::npos) || (rawIndex.find('{') != vespalib::string::npos)) {
- std::string index = std::regex_replace(std::string(rawIndex), _G_map1, _G_value);
- index = std::regex_replace(index, _G_map2, _G_value);
- index = std::regex_replace(index, _G_array, _G_empty);
+ std::string index = std::regex_replace(std::string(rawIndex), G_map1, G_value);
+ index = std::regex_replace(index, G_map2, G_value);
+ index = std::regex_replace(index, G_array, G_empty);
return index;
}
return rawIndex;
@@ -258,17 +256,26 @@ buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearch
return ifm;
}
+/**
+ * Translates the normalize setting of a vsmfields field spec into the corresponding
+ * search::streaming::Normalizing value.
+ * A value that matches none of the enumerators falls back to LOWERCASE_AND_FOLD.
+ */
+search::streaming::Normalizing
+normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode) {
+ switch (normalize_mode) {
+ case VsmfieldsConfig::Fieldspec::Normalize::NONE: return search::streaming::Normalizing::NONE;
+ case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE: return search::streaming::Normalizing::LOWERCASE;
+ case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE_AND_FOLD: return search::streaming::Normalizing::LOWERCASE_AND_FOLD;
+ }
+ // Reached only for a value outside the enumerators above.
+ return search::streaming::Normalizing::LOWERCASE_AND_FOLD;
}
-bool
+}
+
+void
FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf)
{
- bool retval(true);
LOG(spam, "Parsing %zd fields", conf->fieldspec.size());
for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) {
LOG(spam, "Parsing %s", cfs.name.c_str());
FieldIdT fieldId = specMap().size();
- FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, cfs.arg1.c_str(), cfs.maxlength);
+ FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, normalize_mode(cfs.normalize), cfs.arg1, cfs.maxlength);
_specMap[fieldId] = std::move(fss);
_nameIdMap.add(cfs.name, fieldId);
LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str());
@@ -283,7 +290,6 @@ FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf)
}
_documentTypeMap[di.name] = indexMapp;
}
- return retval;
}
void
@@ -338,7 +344,7 @@ FieldSearchSpecMap::get_distance_metric(const vespalib::string& name) const
if (!itr->second.uses_nearest_neighbor_search_method()) {
return dm;
}
- return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.get_arg1());
+ return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.arg1());
}
vespalib::asciistream &
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
index 43bb5b04481..7ba9799991e 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
@@ -10,25 +10,29 @@ namespace vsm {
class FieldSearchSpec
{
public:
+ using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
+ using Normalizing = search::streaming::Normalizing;
FieldSearchSpec();
- FieldSearchSpec(const FieldIdT & id, const vespalib::string & name,
- VsmfieldsConfig::Fieldspec::Searchmethod searchMethod,
- const vespalib::string & arg1, size_t maxLength);
+ FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, Searchmethod searchMethod,
+ Normalizing normalize_mode, vespalib::stringref arg1, size_t maxLength);
~FieldSearchSpec();
FieldSearchSpec(FieldSearchSpec&& rhs) noexcept;
FieldSearchSpec& operator=(FieldSearchSpec&& rhs) noexcept;
- const FieldSearcher & searcher() const { return *_searcher; }
- const vespalib::string & name() const { return _name; }
- FieldIdT id() const { return _id; }
- bool valid() const { return static_cast<bool>(_searcher); }
- size_t maxLength() const { return _maxLength; }
- bool uses_nearest_neighbor_search_method() const noexcept { return _searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR; }
+ const FieldSearcher & searcher() const noexcept { return *_searcher; }
+ const vespalib::string & name() const noexcept { return _name; }
+ FieldIdT id() const noexcept { return _id; }
+ bool valid() const noexcept { return static_cast<bool>(_searcher); }
+ size_t maxLength() const noexcept { return _maxLength; }
+ Normalizing normalize_mode() const noexcept { return _normalize_mode; }
+ const vespalib::string& arg1() const noexcept { return _arg1; }
+ bool uses_nearest_neighbor_search_method() const noexcept {
+ return _searchMethod == Searchmethod::NEAREST_NEIGHBOR;
+ }
bool uses_string_search_method() const noexcept {
- return (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) ||
- (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8) ||
- (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8);
+ return (_searchMethod == Searchmethod::UTF8) ||
+ (_searchMethod == Searchmethod::AUTOUTF8) ||
+ (_searchMethod == Searchmethod::SSE2UTF8);
}
- const vespalib::string& get_arg1() const noexcept { return _arg1; }
/**
* Reconfigures the field searcher based on information in the given query term.
@@ -42,7 +46,8 @@ private:
vespalib::string _name;
size_t _maxLength;
FieldSearcherContainer _searcher;
- VsmfieldsConfig::Fieldspec::Searchmethod _searchMethod;
+ Searchmethod _searchMethod;
+ Normalizing _normalize_mode;
vespalib::string _arg1;
bool _reconfigured;
};
@@ -60,7 +65,7 @@ public:
* and a mapping from field name to field id. It then iterates over all document types and index names
* and creates a mapping from index name to list of field ids for each document type.
**/
- bool buildFromConfig(const VsmfieldsHandle & conf);
+ void buildFromConfig(const VsmfieldsHandle & conf);
/**
* Iterates over the given field name vector adding extra elements to the mapping from field name to field id.