diff options
38 files changed, 454 insertions, 420 deletions
diff --git a/config-model/src/main/java/com/yahoo/schema/derived/IndexInfo.java b/config-model/src/main/java/com/yahoo/schema/derived/IndexInfo.java index 7532dec5187..34f485b7f02 100644 --- a/config-model/src/main/java/com/yahoo/schema/derived/IndexInfo.java +++ b/config-model/src/main/java/com/yahoo/schema/derived/IndexInfo.java @@ -25,6 +25,7 @@ import com.yahoo.vespa.documentmodel.SummaryField; import com.yahoo.search.config.IndexInfoConfig; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.Set; @@ -238,12 +239,8 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer { if (DataType.URI.equals(fieldType)) { return true; } - if (fieldType instanceof CollectionDataType && - DataType.URI.equals(((CollectionDataType)fieldType).getNestedType())) - { - return true; - } - return false; + return (fieldType instanceof CollectionDataType collectionFieldType) && + DataType.URI.equals(collectionFieldType.getNestedType()); } private void addUriIndexCommands(ImmutableSDField field) { @@ -310,7 +307,7 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer { private boolean notInCommands(String index) { for (IndexCommand command : commands) { - if (command.getIndex().equals(index)) { + if (command.index().equals(index)) { return false; } } @@ -322,10 +319,7 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer { IndexInfoConfig.Indexinfo.Builder iiB = new IndexInfoConfig.Indexinfo.Builder(); iiB.name(getName()); for (IndexCommand command : commands) { - iiB.command( - new IndexInfoConfig.Indexinfo.Command.Builder() - .indexname(command.getIndex()) - .command(command.getCommand())); + addIndexCommand(iiB, command.index(), command.command()); } // Make user defined field sets searchable for (FieldSet fieldSet : fieldSets.values()) { @@ -335,18 +329,16 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer { } for (Map.Entry<String, String> e : aliases.entrySet()) { - iiB.alias( - new IndexInfoConfig.Indexinfo.Alias.Builder() - .alias(e.getKey()) - .indexname(e.getValue())); + iiB.alias(new IndexInfoConfig.Indexinfo.Alias.Builder().alias(e.getKey()).indexname(e.getValue())); } builder.indexinfo(iiB); } // TODO: Move this to the FieldSetSettings processor (and rename it) as that already has to look at this. private void addFieldSetCommands(IndexInfoConfig.Indexinfo.Builder iiB, FieldSet fieldSet) { - for (String qc : fieldSet.queryCommands()) - iiB.command(new IndexInfoConfig.Indexinfo.Command.Builder().indexname(fieldSet.getName()).command(qc)); + for (String qc : fieldSet.queryCommands()) { + addIndexCommand(iiB, fieldSet.getName(), qc); + } boolean anyIndexing = false; boolean anyAttributing = false; boolean anyLowerCasing = false; @@ -397,57 +389,29 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer { fieldSetMatching = new Matching(); } if (anyLowerCasing) { - iiB.command( - new IndexInfoConfig.Indexinfo.Command.Builder() - .indexname(fieldSet.getName()) - .command(CMD_LOWERCASE)); + addIndexCommand(iiB, fieldSet.getName(), CMD_LOWERCASE); } if (hasMultiValueField(fieldSet)) { - iiB.command( - new IndexInfoConfig.Indexinfo.Command.Builder() - .indexname(fieldSet.getName()) - .command(CMD_MULTIVALUE)); + addIndexCommand(iiB, fieldSet.getName(), CMD_MULTIVALUE); } if (anyIndexing) { - iiB.command( - new IndexInfoConfig.Indexinfo.Command.Builder() - .indexname(fieldSet.getName()) - .command(CMD_INDEX)); + addIndexCommand(iiB, fieldSet.getName(), CMD_INDEX); if ( ! isExactMatch(fieldSetMatching)) { if (fieldSetMatching == null || fieldSetMatching.getType().equals(MatchType.TEXT)) { - iiB.command( - new IndexInfoConfig.Indexinfo.Command.Builder() - .indexname(fieldSet.getName()) - .command(CMD_PLAIN_TOKENS)); + addIndexCommand(iiB, fieldSet.getName(), CMD_PLAIN_TOKENS); } if (anyStemming) { - iiB.command( - new IndexInfoConfig.Indexinfo.Command.Builder() - .indexname(fieldSet.getName()) - .command(stemmingCommand)); + addIndexCommand(iiB, fieldSet.getName(), stemmingCommand); } if (anyNormalizing) - iiB.command( - new IndexInfoConfig.Indexinfo.Command.Builder() - .indexname(fieldSet.getName()) - .command(CMD_NORMALIZE)); + addIndexCommand(iiB, fieldSet.getName(), CMD_NORMALIZE); if (phraseSegmentingCommand != null) - iiB.command( - new IndexInfoConfig.Indexinfo.Command.Builder() - .indexname(fieldSet.getName()) - .command(phraseSegmentingCommand)); + addIndexCommand(iiB, fieldSet.getName(), phraseSegmentingCommand); } } else { // Assume only attribute fields - iiB - .command( - new IndexInfoConfig.Indexinfo.Command.Builder() - .indexname(fieldSet.getName()) - .command(CMD_ATTRIBUTE)) - .command( - new IndexInfoConfig.Indexinfo.Command.Builder() - .indexname(fieldSet.getName()) - .command(CMD_INDEX)); + addIndexCommand(iiB, fieldSet.getName(), CMD_ATTRIBUTE); + addIndexCommand(iiB, fieldSet.getName(), CMD_INDEX); } if (anyString) { addIndexCommand(iiB, fieldSet.getName(), CMD_STRING); @@ -460,20 +424,11 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer { if (fieldSetMatching.getType().equals(MatchType.EXACT)) { String term = fieldSetMatching.getExactMatchTerminator(); if (term==null) term=ExactMatch.DEFAULT_EXACT_TERMINATOR; - iiB.command( - new IndexInfoConfig.Indexinfo.Command.Builder() - .indexname(fieldSet.getName()) - .command("exact "+term)); + addIndexCommand(iiB, fieldSet.getName(), "exact "+term); } else if (fieldSetMatching.getType().equals(MatchType.WORD)) { - iiB.command( - new IndexInfoConfig.Indexinfo.Command.Builder() - .indexname(fieldSet.getName()) - .command(CMD_WORD)); + addIndexCommand(iiB, fieldSet.getName(), CMD_WORD); } else if (fieldSetMatching.getType().equals(MatchType.GRAM)) { - iiB.command( - new IndexInfoConfig.Indexinfo.Command.Builder() - .indexname(fieldSet.getName()) - .command("ngram " + fieldSetMatching.getGramSize().orElse(NGramMatch.DEFAULT_GRAM_SIZE))); + addIndexCommand(iiB, fieldSet.getName(), "ngram " + fieldSetMatching.getGramSize().orElse(NGramMatch.DEFAULT_GRAM_SIZE)); } else if (fieldSetMatching.getType().equals(MatchType.TEXT)) { } @@ -495,10 +450,7 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer { active = field.getIndex(field.getName()).getStemming(); } } - if (active != null) { - return active; - } - return Stemming.BEST; // assume default + return Objects.requireNonNullElse(active, Stemming.BEST); } private boolean stemming(ImmutableSDField field) { @@ -514,9 +466,7 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer { private boolean isExactMatch(Matching m) { if (m == null) return false; - if (m.getType().equals(MatchType.EXACT)) return true; - if (m.getType().equals(MatchType.WORD)) return true; - return false; + return m.getType().equals(MatchType.EXACT) || m.getType().equals(MatchType.WORD); } @Override @@ -528,34 +478,13 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer { * An index command. Null commands are also represented, to detect consistency issues. This is an (immutable) value * object. */ - public static class IndexCommand { - - private final String index; - - private final String command; - - public IndexCommand(String index, String command) { - this.index = index; - this.command = command; - } - - public String getIndex() { - return index; - } - - public String getCommand() { - return command; - } + public record IndexCommand(String index, String command) { /** * Returns true if this is the null command (do nothing) */ public boolean isNull() { - return command.equals(""); - } - - public int hashCode() { - return index.hashCode() + 17 * command.hashCode(); + return command.isEmpty(); } public boolean equals(Object object) { @@ -564,7 +493,7 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer { } return other.index.equals(this.index) && - other.command.equals(this.command); + other.command.equals(this.command); } public String toString() { @@ -616,9 +545,7 @@ public class IndexInfo extends Derived implements IndexInfoConfig.Producer { return false; } - if (Stemming.NONE.equals(indexStemming)) { - // Add nothing - } else { + if ( ! Stemming.NONE.equals(indexStemming)) { owner.addIndexCommand(indexName, CMD_STEM + ":" + indexStemming.toStemMode()); } return true; diff --git a/config-model/src/main/java/com/yahoo/schema/derived/VsmFields.java b/config-model/src/main/java/com/yahoo/schema/derived/VsmFields.java index cb806d8596e..564161b725d 100644 --- a/config-model/src/main/java/com/yahoo/schema/derived/VsmFields.java +++ b/config-model/src/main/java/com/yahoo/schema/derived/VsmFields.java @@ -14,6 +14,7 @@ import com.yahoo.document.datatypes.TensorFieldValue; import com.yahoo.schema.FieldSets; import com.yahoo.schema.Schema; import com.yahoo.schema.document.Attribute; +import com.yahoo.schema.document.Case; import com.yahoo.schema.document.FieldSet; import com.yahoo.schema.document.GeoPos; import com.yahoo.schema.document.Matching; @@ -144,7 +145,7 @@ public class VsmFields extends Derived implements VsmfieldsConfig.Producer { public static Type GEO_POSITION = new Type("GEOPOS"); public static Type NEAREST_NEIGHBOR = new Type("NEAREST_NEIGHBOR"); - private String searchMethod; + private final String searchMethod; private Type(String searchMethod) { this.searchMethod = searchMethod; @@ -261,10 +262,17 @@ public class VsmFields extends Derived implements VsmfieldsConfig.Producer { return getMatchingName(); } + private static VsmfieldsConfig.Fieldspec.Normalize.Enum toNormalize(Matching matching) { + if (matching.getType() == MatchType.EXACT) return VsmfieldsConfig.Fieldspec.Normalize.Enum.LOWERCASE; + if (matching.getCase() == Case.CASED) return VsmfieldsConfig.Fieldspec.Normalize.Enum.NONE; + return VsmfieldsConfig.Fieldspec.Normalize.LOWERCASE_AND_FOLD; + } + public VsmfieldsConfig.Fieldspec.Builder getFieldSpecConfig() { var fB = new VsmfieldsConfig.Fieldspec.Builder(); fB.name(getName()) .searchmethod(VsmfieldsConfig.Fieldspec.Searchmethod.Enum.valueOf(type.getSearchMethod())) + .normalize(toNormalize(matching)) .arg1(getArg1()) .fieldtype(isAttribute ? VsmfieldsConfig.Fieldspec.Fieldtype.ATTRIBUTE diff --git a/config-model/src/main/java/com/yahoo/schema/document/Matching.java b/config-model/src/main/java/com/yahoo/schema/document/Matching.java index 0b542f134ad..9d68553fa80 100644 --- a/config-model/src/main/java/com/yahoo/schema/document/Matching.java +++ b/config-model/src/main/java/com/yahoo/schema/document/Matching.java @@ -46,12 +46,16 @@ public class Matching implements Cloneable, Serializable { public MatchType getType() { return type; } public Case getCase() { return casing; } - public void setType(MatchType type) { + public Matching setType(MatchType type) { this.type = type; typeUserSet = true; + return this; } - public void setCase(Case casing) { this.casing = casing; } + public Matching setCase(Case casing) { + this.casing = casing; + return this; + } public Integer maxLength() { return maxLength; } public Matching maxLength(int maxLength) { this.maxLength = maxLength; return this; } diff --git a/config-model/src/test/derived/indexschema/vsmfields.cfg b/config-model/src/test/derived/indexschema/vsmfields.cfg index 31db622183e..a2152f9787f 100644 --- a/config-model/src/test/derived/indexschema/vsmfields.cfg +++ b/config-model/src/test/derived/indexschema/vsmfields.cfg @@ -3,121 +3,145 @@ searchall 1 fieldspec[].name "sa" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "sb" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "sc" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "sd" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "pos" fieldspec[].searchmethod GEOPOS fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "se" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "word" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE fieldspec[].name "sf" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "sg" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "sh" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "si" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "exact1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "exact" +fieldspec[].normalize LOWERCASE fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "exact2" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "exact" +fieldspec[].normalize LOWERCASE fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "bm25_field" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "ia" fieldspec[].searchmethod INT32 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE fieldspec[].name "ib" fieldspec[].searchmethod INT32 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE fieldspec[].name "ic" fieldspec[].searchmethod INT32 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE fieldspec[].name "nostemstring1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "nostemstring2" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "nostemstring3" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "nostemstring4" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "fs9" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "f10.text" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "sd_literal" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "pos_zcurve" fieldspec[].searchmethod INT64 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE documenttype[].name "indexschema" diff --git a/config-model/src/test/derived/nearestneighbor_streaming/vsmfields.cfg b/config-model/src/test/derived/nearestneighbor_streaming/vsmfields.cfg index f8b1cf62048..ec06d01f05a 100644 --- a/config-model/src/test/derived/nearestneighbor_streaming/vsmfields.cfg +++ b/config-model/src/test/derived/nearestneighbor_streaming/vsmfields.cfg @@ -3,21 +3,25 @@ searchall 1 fieldspec[].name "vec_a" fieldspec[].searchmethod NEAREST_NEIGHBOR fieldspec[].arg1 "EUCLIDEAN" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE fieldspec[].name "vec_b" fieldspec[].searchmethod NEAREST_NEIGHBOR fieldspec[].arg1 "ANGULAR" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE fieldspec[].name "vec_c" fieldspec[].searchmethod NEAREST_NEIGHBOR fieldspec[].arg1 "INNERPRODUCT" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE fieldspec[].name "vec_d" fieldspec[].searchmethod NONE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE documenttype[].name "test" diff --git a/config-model/src/test/derived/prefixexactattribute/vsmfields.cfg b/config-model/src/test/derived/prefixexactattribute/vsmfields.cfg index 29bcde9faad..75192ef3121 100644 --- a/config-model/src/test/derived/prefixexactattribute/vsmfields.cfg +++ b/config-model/src/test/derived/prefixexactattribute/vsmfields.cfg @@ -3,26 +3,31 @@ searchall 1 fieldspec[].name "indexfield0" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "prefix" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 79 fieldspec[].fieldtype INDEX fieldspec[].name "attributefield1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "exact" +fieldspec[].normalize LOWERCASE fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE fieldspec[].name "attributefield2" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "exact" +fieldspec[].normalize LOWERCASE fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE fieldspec[].name "indexfield1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "exact" +fieldspec[].normalize LOWERCASE fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "indexfield2" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "exact" +fieldspec[].normalize LOWERCASE fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX documenttype[].name "prefixexactattribute" diff --git a/config-model/src/test/derived/streamingstruct/vsmfields.cfg b/config-model/src/test/derived/streamingstruct/vsmfields.cfg index 7178f9d41ea..b5a234e8095 100644 --- a/config-model/src/test/derived/streamingstruct/vsmfields.cfg +++ b/config-model/src/test/derived/streamingstruct/vsmfields.cfg @@ -3,281 +3,337 @@ searchall 1 fieldspec[].name "coupleof" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "normalfields" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "a.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "a.f1s" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "substring" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "a.f2" fieldspec[].searchmethod INT32 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "a.f3" fieldspec[].searchmethod DOUBLE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "m.key" fieldspec[].searchmethod INT64 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "m.value" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "b.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "b.f1s" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "substring" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "b.f2" fieldspec[].searchmethod INT32 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "b.f3" fieldspec[].searchmethod DOUBLE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "c.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "word" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE fieldspec[].name "c.f1s" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "prefix" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "c.f3" fieldspec[].searchmethod DOUBLE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "c2.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "word" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE fieldspec[].name "c2.f1s" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "suffix" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "c2.f2" fieldspec[].searchmethod INT32 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "c2.f3" fieldspec[].searchmethod DOUBLE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "c3.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "word" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE fieldspec[].name "c3.f1s" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "prefix" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "c3.f2" fieldspec[].searchmethod INT32 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "c3.f3" fieldspec[].searchmethod DOUBLE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "n.nf1.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "n.nf1.f1s" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "prefix" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "n.nf1.f3" fieldspec[].searchmethod DOUBLE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "n.nf1s.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "substring" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "n.nf1s.f1s" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "substring" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "n.nf1s.f2" fieldspec[].searchmethod INT32 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "n.nf1s.f3" fieldspec[].searchmethod DOUBLE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "n.nf2" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "array1.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "array1.f1s" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "substring" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "array1.f2" fieldspec[].searchmethod INT32 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "array1.f3" fieldspec[].searchmethod DOUBLE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "array2.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "array2.f1s" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "substring" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "array2.f2" fieldspec[].searchmethod INT32 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "array2.f3" fieldspec[].searchmethod DOUBLE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "array3.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "word" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype ATTRIBUTE fieldspec[].name "array3.f1s" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "prefix" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "array3.f3" fieldspec[].searchmethod DOUBLE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "subject.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "d.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "prefix" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "d.f1s" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "prefix" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "d.f2" fieldspec[].searchmethod INT32 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "d.f3" fieldspec[].searchmethod DOUBLE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "e.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "substring" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "e.f1s" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "substring" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "e.f2" fieldspec[].searchmethod INT32 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "e.f3" fieldspec[].searchmethod DOUBLE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "f.f1" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "suffix" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "f.f1s" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "suffix" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "f.f2" fieldspec[].searchmethod INT32 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "f.f3" fieldspec[].searchmethod DOUBLE fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX fieldspec[].name "g" fieldspec[].searchmethod AUTOUTF8 fieldspec[].arg1 "" +fieldspec[].normalize LOWERCASE_AND_FOLD fieldspec[].maxlength 1048576 fieldspec[].fieldtype INDEX documenttype[].name "streamingstruct" diff --git a/config-model/src/test/java/com/yahoo/schema/derived/VsmFieldsTestCase.java b/config-model/src/test/java/com/yahoo/schema/derived/VsmFieldsTestCase.java index 6423b621ab9..601c014bdc1 100644 --- a/config-model/src/test/java/com/yahoo/schema/derived/VsmFieldsTestCase.java +++ b/config-model/src/test/java/com/yahoo/schema/derived/VsmFieldsTestCase.java @@ -4,8 +4,12 @@ package com.yahoo.schema.derived; import com.yahoo.config.model.application.provider.MockFileRegistry; import com.yahoo.config.model.deploy.TestProperties; import com.yahoo.config.model.test.MockApplicationPackage; +import com.yahoo.document.DataType; import com.yahoo.documentmodel.NewDocumentReferenceDataType; import com.yahoo.schema.Schema; +import com.yahoo.schema.document.Case; +import com.yahoo.schema.document.MatchType; +import com.yahoo.schema.document.Matching; import com.yahoo.schema.document.SDDocumentType; import com.yahoo.schema.document.SDField; import com.yahoo.schema.document.TemporarySDField; @@ -19,24 +23,55 @@ import static org.junit.jupiter.api.Assertions.assertEquals; */ public class VsmFieldsTestCase { - @SuppressWarnings("deprecation") - @Test - void reference_type_field_is_unsearchable() { + private static Schema createSchema() { Schema schema = new Schema("test", MockApplicationPackage.createEmpty(), new MockFileRegistry(), new TestableDeployLogger(), new TestProperties()); var sdoc = new SDDocumentType("test"); schema.addDocument(sdoc); - SDField refField = new TemporarySDField(sdoc, "ref_field", NewDocumentReferenceDataType.forDocumentName("parent_type")); - refField.parseIndexingScript("{ summary }"); - schema.getDocument().addField(refField); + return schema; + } + private static VsmfieldsConfig vsmfieldsConfig(Schema schema) { VsmFields vsmFields = new VsmFields(schema); VsmfieldsConfig.Builder cfgBuilder = new VsmfieldsConfig.Builder(); vsmFields.getConfig(cfgBuilder); - VsmfieldsConfig cfg = cfgBuilder.build(); + return cfgBuilder.build(); + } + + @Test + void reference_type_field_is_unsearchable() { + Schema schema = createSchema(); + SDField field = new TemporarySDField(schema.getDocument(), "ref_field", NewDocumentReferenceDataType.forDocumentName("parent_type")); + field.parseIndexingScript("{ summary }"); + schema.getDocument().addField(field); + VsmfieldsConfig cfg = vsmfieldsConfig(schema); assertEquals(1, cfg.fieldspec().size()); VsmfieldsConfig.Fieldspec fieldSpec = cfg.fieldspec().get(0); assertEquals("ref_field", fieldSpec.name()); assertEquals(VsmfieldsConfig.Fieldspec.Searchmethod.NONE, fieldSpec.searchmethod()); } + + private void testIndexMatching(Matching matching, VsmfieldsConfig.Fieldspec.Normalize.Enum normalize, String arg1) { + Schema schema = createSchema(); + SDField field = new TemporarySDField(schema.getDocument(), "f", DataType.STRING); + field.parseIndexingScript("{ index }"); + field.setMatching(matching); + schema.getDocument().addField(field); + VsmfieldsConfig cfg = vsmfieldsConfig(schema); + VsmfieldsConfig.Fieldspec fieldSpec = cfg.fieldspec().get(0); + assertEquals("f", fieldSpec.name()); + assertEquals(VsmfieldsConfig.Fieldspec.Searchmethod.AUTOUTF8, fieldSpec.searchmethod()); + assertEquals(normalize, fieldSpec.normalize()); + assertEquals(arg1, fieldSpec.arg1()); + } + + @Test + void test_exact_string() { + testIndexMatching(new Matching(MatchType.TEXT), + VsmfieldsConfig.Fieldspec.Normalize.LOWERCASE_AND_FOLD, ""); + testIndexMatching(new Matching(MatchType.TEXT).setCase(Case.CASED), + VsmfieldsConfig.Fieldspec.Normalize.NONE, ""); + testIndexMatching(new Matching(MatchType.EXACT).setCase(Case.CASED), + VsmfieldsConfig.Fieldspec.Normalize.LOWERCASE, "exact"); + } } diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index 83b84fffa11..74d8fdc4bf3 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -22,6 +22,7 @@ using search::streaming::HitList; using search::streaming::QueryNodeResultFactory; using search::streaming::QueryTerm; using search::streaming::Normalizing; +using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod; using search::streaming::QueryTermList; using TermType = QueryTerm::Type; using namespace vsm; @@ -114,7 +115,7 @@ struct SnippetModifierSetup SnippetModifierSetup::SnippetModifierSetup(const StringList & terms) : query(terms), - searcher(new UTF8SubstringSnippetModifier()), + searcher(new UTF8SubstringSnippetModifier(0)), env(), modifier(searcher) { @@ -361,7 +362,7 @@ assertFieldInfo(FieldSearcher & fs, const StringList & query, void assertSnippetModifier(const StringList & query, const std::string & fv, const std::string & exp) { - UTF8SubstringSnippetModifier mod; + UTF8SubstringSnippetModifier mod(0); performSearch(mod, query, StringFieldValue(fv)); EXPECT_EQUAL(mod.getModifiedBuf().getPos(), exp.size()); std::string actual(mod.getModifiedBuf().getBuffer(), mod.getModifiedBuf().getPos()); @@ -440,11 +441,11 @@ testStrChrFieldSearcher(StrChrFieldSearcher & fs) assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits()).add(Hits())); assertString(fs, StringList().add("and").add("overloading"), field, HitsList().add(Hits().add(1)).add(Hits().add(3))); - fs.setMatchType(FieldSearcher::PREFIX); + fs.match_type(FieldSearcher::PREFIX); assertString(fs, "oper", field, Hits().add(0).add(2)); assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits().add(0).add(2)).add(Hits())); - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); if (!EXPECT_TRUE(testStringFieldInfo(fs))) return false; { // test handling of several underscores @@ -553,12 +554,12 @@ TEST("utf8 substring search with empty term") TEST("utf8 suffix search") { UTF8SuffixStringFieldSearcher fs(0); std::string field = "operators and operator overloading"; - assertString(fs, "rsand", field, Hits()); - assertString(fs, "tor", field, Hits().add(2)); - assertString(fs, "tors", field, Hits().add(0)); + TEST_DO(assertString(fs, "rsand", field, Hits())); + TEST_DO(assertString(fs, "tor", field, Hits().add(2))); + TEST_DO(assertString(fs, "tors", field, Hits().add(0))); - assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits())); - assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3))); + TEST_DO(assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits()))); + TEST_DO(assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3)))); EXPECT_TRUE(testStringFieldInfo(fs)); } @@ -590,22 +591,22 @@ TEST("utf8 flexible searcher"){ // prefix assertString(fs, "vesp*", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::PREFIX); + fs.match_type(FieldSearcher::PREFIX); assertString(fs, "vesp", "vespa", Hits().add(0)); // substring - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); assertString(fs, "*esp*", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::SUBSTRING); + fs.match_type(FieldSearcher::SUBSTRING); assertString(fs, "esp", "vespa", Hits().add(0)); // suffix - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); assertString(fs, "*espa", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::SUFFIX); + fs.match_type(FieldSearcher::SUFFIX); assertString(fs, "espa", "vespa", Hits().add(0)); - fs.setMatchType(FieldSearcher::REGULAR); + fs.match_type(FieldSearcher::REGULAR); EXPECT_TRUE(testStringFieldInfo(fs)); } @@ -659,7 +660,7 @@ TEST("integer search") TEST("floating point search") { - FloatFieldSearcher fs; + FloatFieldSearcher fs(0); TEST_DO(assertFloat(fs, "10", 10, true)); TEST_DO(assertFloat(fs, "10.5", 10.5, true)); TEST_DO(assertFloat(fs, "-10.5", -10.5, true)); @@ -726,7 +727,7 @@ TEST("Snippet modifier search") { "\xe7\x9f\xb3\x1f\xe6\x98\x8e\xe5\x87\xb1\x1f\xe5\x9c\xa8"); { // check that resizing works - UTF8SubstringSnippetModifier mod; + UTF8SubstringSnippetModifier mod(0); EXPECT_EQUAL(mod.getModifiedBuf().getLength(), 32u); EXPECT_EQUAL(mod.getModifiedBuf().getPos(), 0u); performSearch(mod, StringList().add("a"), StringFieldValue("aaaaaaaaaaaaaaaa")); @@ -763,28 +764,32 @@ TEST("snippet modifier") { } } -TEST("FieldSearchSpec constrution") { +TEST("FieldSearchSpec construction") { { FieldSearchSpec f; EXPECT_FALSE(f.valid()); EXPECT_EQUAL(0u, f.id()); EXPECT_EQUAL("", f.name()); EXPECT_EQUAL(0x100000u, f.maxLength()); + EXPECT_EQUAL("", f.arg1()); + EXPECT_TRUE(Normalizing::LOWERCASE_AND_FOLD == f.normalize_mode()); } { - FieldSearchSpec f(7, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 789); + FieldSearchSpec f(7, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 789); EXPECT_TRUE(f.valid()); EXPECT_EQUAL(7u, f.id()); EXPECT_EQUAL("f0", f.name()); EXPECT_EQUAL(789u, f.maxLength()); EXPECT_EQUAL(789u, f.searcher().maxFieldLength()); + EXPECT_EQUAL("substring", f.arg1()); + EXPECT_TRUE(Normalizing::LOWERCASE == f.normalize_mode()); } } TEST("snippet modifier manager") { FieldSearchSpecMapT specMap; - specMap[0] = FieldSearchSpec(0, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 1000); - specMap[1] = FieldSearchSpec(1, "f1", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "", 1000); + specMap[0] = FieldSearchSpec(0, "f0", Searchmethod::AUTOUTF8, Normalizing::LOWERCASE, "substring", 1000); + specMap[1] = FieldSearchSpec(1, "f1", Searchmethod::AUTOUTF8, Normalizing::NONE, "", 1000); IndexFieldMapT indexMap; indexMap["i0"].push_back(0); indexMap["i1"].push_back(1); diff --git a/streamingvisitors/src/tests/textutil/textutil_test.cpp b/streamingvisitors/src/tests/textutil/textutil_test.cpp index b926444e4df..f7f340a2182 100644 --- a/streamingvisitors/src/tests/textutil/textutil_test.cpp +++ b/streamingvisitors/src/tests/textutil/textutil_test.cpp @@ -2,7 +2,6 @@ #include <vespa/vespalib/testkit/testapp.h> #include <vespa/fastlib/text/normwordfolder.h> -#include <vespa/searchlib/query/base.h> #include <vespa/vsm/searcher/fold.h> #include <vespa/vsm/searcher/futf8strchrfieldsearcher.h> #include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> @@ -66,7 +65,7 @@ TextUtilTest::assertSkipSeparators(const char * input, size_t len, const UCS4V & const byte * srcbuf = reinterpret_cast<const byte *>(input); auto dstbuf = std::make_unique<ucs4_t[]>(len + 1); auto offsets = std::make_unique<size_t[]>(len + 1); - UTF8StrChrFieldSearcher fs; + UTF8StrChrFieldSearcher fs(0); BW bw(dstbuf.get(), offsets.get()); size_t dstlen = fs.skipSeparators(srcbuf, len, bw); EXPECT_EQUAL(dstlen, expdstbuf.size()); diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp index 4161adaf21f..cdd1a018d84 100644 --- a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp +++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp @@ -42,6 +42,7 @@ using search::aggregation::HitsAggregationResult; using search::attribute::IAttributeVector; using search::expression::ConfigureStaticParams; using search::streaming::Query; +using search::streaming::Normalizing; using search::streaming::QueryTermList; using storage::StorageComponent; using storage::VisitorEnvironment; @@ -326,20 +327,41 @@ SearchVisitor::is_text_matching(vespalib::stringref index) const noexcept { return false; } +namespace { + +uint32_t +count_normalize_lowercase(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) { + size_t count = 0; + for (const auto & fieldId : fieldIdMap.map()) { + auto found = specMap.find(fieldId.second); + if ((found != specMap.end()) && found->second.searcher().normalize_mode() == Normalizing::LOWERCASE) { + count++; + } + } + return count; +} + +uint32_t +count_normalize_none(const vsm::FieldSearchSpecMapT & specMap, const StringFieldIdTMap & fieldIdMap) { + size_t count = 0; + for (const auto & fieldId : fieldIdMap.map()) { + auto found = specMap.find(fieldId.second); + if ((found != specMap.end()) && found->second.searcher().normalize_mode() == Normalizing::NONE) { + count++; + } + } + return count; +} + +} + SearchMethodInfo::Normalizing SearchVisitor::normalizing_mode(vespalib::stringref index) const noexcept { StringFieldIdTMap fieldIdMap; _fieldSearchSpecMap.addFieldsFromIndex(index, fieldIdMap); - size_t num_exact = 0; - for (const auto & fieldId : fieldIdMap.map()) { - auto found = _fieldSearchSpecMap.specMap().find(fieldId.second); - if ((found != _fieldSearchSpecMap.specMap().end()) && found->second.searcher().exact()) { - num_exact++; - } - } - return ((num_exact == 0) || (num_exact != fieldIdMap.map().size())) - ? Normalizing::LOWERCASE_AND_FOLD - : Normalizing::LOWERCASE; + if (count_normalize_none(_fieldSearchSpecMap.specMap(), fieldIdMap) == fieldIdMap.map().size()) return Normalizing::NONE; + if (count_normalize_lowercase(_fieldSearchSpecMap.specMap(), fieldIdMap) == fieldIdMap.map().size()) return Normalizing::LOWERCASE; + return Normalizing::LOWERCASE_AND_FOLD; } void diff --git a/streamingvisitors/src/vespa/vsm/config/vsmfields.def b/streamingvisitors/src/vespa/vsm/config/vsmfields.def index 442a044d38f..dac732013d2 100644 --- a/streamingvisitors/src/vespa/vsm/config/vsmfields.def +++ b/streamingvisitors/src/vespa/vsm/config/vsmfields.def @@ -14,6 +14,7 @@ fieldspec[].name string ## The search method for a given field. Note: same field in 2 different document types must match on type if not a random result might be expected. fieldspec[].searchmethod enum { NONE, BOOL, AUTOUTF8, UTF8, SSE2UTF8, INT8, INT16, INT32, INT64, FLOAT16, FLOAT, DOUBLE, GEOPOS, NEAREST_NEIGHBOR } default=AUTOUTF8 fieldspec[].arg1 string default="" +fieldspec[].normalize enum { NONE, LOWERCASE, LOWERCASE_AND_FOLD } default=LOWERCASE_AND_FOLD ## Maximum number of chars to search per field. fieldspec[].maxlength int default=1048576 diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h index c7e7d2e74bd..3708cca85fb 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h @@ -9,8 +9,8 @@ class BoolFieldSearcher : public FieldSearcher { public: std::unique_ptr<FieldSearcher> duplicate() const override; - BoolFieldSearcher(FieldIdT fId); - ~BoolFieldSearcher(); + explicit BoolFieldSearcher(FieldIdT fId); + ~BoolFieldSearcher() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp index 55d80413b8c..5e06ae41a03 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp @@ -51,14 +51,13 @@ FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept : FieldSearcherBase(), _field(fId), _matchType(defaultPrefix ? PREFIX : REGULAR), + _normalize_mode(Normalizing::LOWERCASE_AND_FOLD), _maxFieldLength(0x100000), _currentElementId(0), _currentElementWeight(1), _words(0), - _badUtf8Count(0), - _zeroCount(0) + _badUtf8Count(0) { - zeroStat(); } FieldSearcher::~FieldSearcher() = default; @@ -71,7 +70,7 @@ FieldSearcher::search(const StorageDocument & doc) fInfo.setHitOffset(qt->getHitList().size()); } onSearch(doc); - for(auto qt : _qtl) { + for (auto qt : _qtl) { QueryTerm::FieldInfo & fInfo = qt->getFieldInfo(field()); fInfo.setHitCount(qt->getHitList().size() - fInfo.getHitOffset()); fInfo.setFieldLength(_words); @@ -114,13 +113,6 @@ FieldSearcher::prepareFieldId() } void -FieldSearcher::zeroStat() -{ - _badUtf8Count = 0; - _zeroCount = 0; -} - -void FieldSearcher::init() { for (unsigned i = 0; i < NELEMS(_foldLowCase); i++) { diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h index e64c41f814f..c5bca6f3899 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h @@ -34,12 +34,13 @@ protected: class FieldSearcher : public FieldSearcherBase { public: + using Normalizing = search::streaming::Normalizing; enum MatchType { REGULAR, PREFIX, SUBSTRING, SUFFIX, - EXACT + EXACT, }; explicit FieldSearcher(FieldIdT fId) noexcept : FieldSearcher(fId, false) {} @@ -50,20 +51,22 @@ public: virtual void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, search::fef::IQueryEnvironment& query_env); - FieldIdT field() const { return _field; } - void field(FieldIdT v) { _field = v; prepareFieldId(); } - bool prefix() const { return _matchType == PREFIX; } - bool substring() const { return _matchType == SUBSTRING; } - bool suffix() const { return _matchType == SUFFIX; } - bool exact() const { return _matchType == EXACT; } - void setMatchType(MatchType mt) { _matchType = mt; } + FieldIdT field() const noexcept { return _field; } + bool prefix() const noexcept { return _matchType == PREFIX; } + bool substring() const noexcept { return _matchType == SUBSTRING; } + bool suffix() const noexcept { return _matchType == SUFFIX; } + bool exact() const noexcept { return _matchType == EXACT; } + Normalizing normalize_mode() const noexcept { return _normalize_mode; } + MatchType match_type() const noexcept { return _matchType; } + void match_type(MatchType mt) noexcept { _matchType = mt; } + void normalize_mode(Normalizing mode) noexcept { _normalize_mode = mode; } + void field(FieldIdT v) noexcept { _field = v; prepareFieldId(); } static void init(); static search::byte fold(search::byte c) { return _foldLowCase[c]; } static search::byte iswordchar(search::byte c) { return _wordChar[c]; } static search::byte isspace(search::byte c) { return ! iswordchar(c); } static size_t countWords(const FieldRef & f); - int32_t getCurrentWeight() const { return _currentElementWeight; } - void zeroStat(); + int32_t currentWeight() const { return _currentElementWeight; } FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; } size_t maxFieldLength() const { return _maxFieldLength; } @@ -89,22 +92,21 @@ private: virtual void onStructValue(const document::StructFieldValue &) { } FieldIdT _field; MatchType _matchType; + Normalizing _normalize_mode; unsigned _maxFieldLength; uint32_t _currentElementId; int32_t _currentElementWeight; // Contains the weight of the current item being evaluated. protected: /// Number of terms searched. - unsigned _words; + unsigned _words; /// Number of utf8 bytes by utf8 size. - unsigned _badUtf8Count; - unsigned _zeroCount; -protected: + unsigned _badUtf8Count; /** * Adds a hit to the given query term. * For each call to onValue() a batch of words are processed, and the position is local to this batch. **/ void addHit(search::streaming::QueryTerm & qt, uint32_t pos) const { - qt.add(_words + pos, field(), _currentElementId, getCurrentWeight()); + qt.add(_words + pos, field(), _currentElementId, _currentElementWeight); } public: static search::byte _foldLowCase[256]; diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h index 07b3f6e1c5f..85341472c26 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h @@ -9,8 +9,8 @@ template <typename T> class FloatFieldSearcherT : public FieldSearcher { public: - FloatFieldSearcherT(FieldIdT fId=0); - ~FloatFieldSearcherT(); + explicit FloatFieldSearcherT(FieldIdT fId); + ~FloatFieldSearcherT() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, @@ -42,14 +42,14 @@ class FloatFieldSearcher : public FloatFieldSearcherTF { public: std::unique_ptr<FieldSearcher> duplicate() const override; - FloatFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTF(fId) { } + explicit FloatFieldSearcher(FieldIdT fId) : FloatFieldSearcherTF(fId) { } }; class DoubleFieldSearcher : public FloatFieldSearcherTD { public: std::unique_ptr<FieldSearcher> duplicate() const override; - DoubleFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTD(fId) { } + DoubleFieldSearcher(FieldIdT fId) : FloatFieldSearcherTD(fId) { } }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp index d7d73899e53..c0b5117d6bf 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp @@ -19,10 +19,6 @@ FUTF8StrChrFieldSearcher::duplicate() const return std::make_unique<FUTF8StrChrFieldSearcher>(*this); } -FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher() - : UTF8StrChrFieldSearcher(), - _folded(4_Ki) -{ } FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StrChrFieldSearcher(fId), _folded(4_Ki) diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h index 5d5ca3d6c3c..b8aa287070a 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h @@ -9,15 +9,14 @@ class FUTF8StrChrFieldSearcher : public UTF8StrChrFieldSearcher { public: std::unique_ptr<FieldSearcher> duplicate() const override; - FUTF8StrChrFieldSearcher(); - FUTF8StrChrFieldSearcher(FieldIdT fId); + explicit FUTF8StrChrFieldSearcher(FieldIdT fId); ~FUTF8StrChrFieldSearcher() override; static bool ansiFold(const char * toFold, size_t sz, char * folded); static bool lfoldaa(const char * toFold, size_t sz, char * folded, size_t & unalignedStart); static bool lfoldua(const char * toFold, size_t sz, char * folded, size_t & alignedStart); private: size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - size_t matchTerms(const FieldRef&, const size_t shortestTerm) override; + size_t matchTerms(const FieldRef&, size_t shortestTerm) override; virtual size_t match(const char *folded, size_t sz, search::streaming::QueryTerm & qt); size_t match(const char *folded, size_t sz, size_t mintsz, search::streaming::QueryTerm ** qtl, size_t qtlSize); std::vector<char> _folded; diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h index 741148fbca1..17c9f23fefb 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h @@ -8,8 +8,8 @@ namespace vsm { class GeoPosFieldSearcher : public FieldSearcher { public: - GeoPosFieldSearcher(FieldIdT fId=0); - ~GeoPosFieldSearcher(); + GeoPosFieldSearcher(FieldIdT fId); + ~GeoPosFieldSearcher() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, @@ -21,7 +21,7 @@ protected: using GeoLocation = search::common::GeoLocation; class GeoPosInfo : public GeoLocation { public: - GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {} + explicit GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {} bool cmp(const document::StructFieldValue & fv) const; }; using GeoPosInfoListT = std::vector<GeoPosInfo>; diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h index 47b83c1538d..9c63d31e3c3 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h @@ -9,8 +9,8 @@ class IntFieldSearcher : public FieldSearcher { public: std::unique_ptr<FieldSearcher> duplicate() const override; - IntFieldSearcher(FieldIdT fId=0); - ~IntFieldSearcher(); + explicit IntFieldSearcher(FieldIdT fId); + ~IntFieldSearcher() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp index 76fedbd1166..816317bf86d 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.cpp @@ -141,17 +141,17 @@ NearestNeighborFieldSearcher::onValue(const document::FieldValue& fv) } DistanceMetric -NearestNeighborFieldSearcher::distance_metric_from_string(const vespalib::string& value) +NearestNeighborFieldSearcher::distance_metric_from_string(vespalib::stringref value) { // Valid string values must match the definition of DistanceMetric in // config-model/src/main/java/com/yahoo/schema/document/Attribute.java - auto v = value; + vespalib::string v = value; std::transform(v.begin(), v.end(), v.begin(), [](unsigned char c) { return std::tolower(c); }); try { return DistanceMetricUtils::to_distance_metric(v); } catch (vespalib::IllegalStateException&) { - vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", value.c_str()); + vespalib::Issue::report("Distance metric '%s' is not supported. Using 'euclidean' instead", v.c_str()); return DistanceMetric::Euclidean; } } diff --git a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h index 5629b443c78..ecdc64d1336 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/nearest_neighbor_field_searcher.h @@ -11,10 +11,7 @@ #include <vespa/searchlib/tensor/tensor_ext_attribute.h> namespace search::fef { class IQueryEnvironment; } - -namespace search::tensor { -class TensorExtAttribute; -} +namespace search::tensor { class TensorExtAttribute; } namespace vsm { @@ -43,7 +40,7 @@ private: public: NearestNeighborFieldSearcher(FieldIdT fid, search::attribute::DistanceMetric metric); - ~NearestNeighborFieldSearcher(); + ~NearestNeighborFieldSearcher() override; std::unique_ptr<FieldSearcher> duplicate() const override; void prepare(search::streaming::QueryTermList& qtl, @@ -52,7 +49,7 @@ public: search::fef::IQueryEnvironment& query_env) override; void onValue(const document::FieldValue& fv) override; - static search::attribute::DistanceMetric distance_metric_from_string(const vespalib::string& value); + static search::attribute::DistanceMetric distance_metric_from_string(vespalib::stringref value); }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h index 9ad76712092..19c723d060d 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h @@ -8,8 +8,7 @@ namespace vsm { class StrChrFieldSearcher : public FieldSearcher { public: - StrChrFieldSearcher() : FieldSearcher(0) { } - StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { } + explicit StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { } void onValue(const document::FieldValue & fv) override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, @@ -19,7 +18,7 @@ private: size_t shortestTerm() const; bool matchDoc(const FieldRef & field); virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) = 0; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) = 0; + virtual size_t matchTerms(const FieldRef & f, size_t shortestTerm) = 0; }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h index dd6f31581a0..aaf8b940dc8 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h @@ -17,10 +17,10 @@ protected: public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8ExactStringFieldSearcher(FieldIdT fId) + explicit UTF8ExactStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { - setMatchType(EXACT); + match_type(EXACT); } }; diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp index 655b068e152..78f491198ad 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp @@ -58,10 +58,6 @@ UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) } } -UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher() : - UTF8StringFieldSearcherBase() -{ } - UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h index 5eee6a8862a..04fbee96d36 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h @@ -17,18 +17,17 @@ private: * Tries to match the given query term against the content of the given field reference. * Search strategy is choosen based on the query term type. **/ - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; /** * Tries to match each query term in the underlying query against the content of the given field reference. * Search strategy is choosen based on the query term type. **/ - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8FlexibleStringFieldSearcher(); - UTF8FlexibleStringFieldSearcher(FieldIdT fId); + explicit UTF8FlexibleStringFieldSearcher(FieldIdT fId); }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp index 651d1dcad9f..fa1fc83728c 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp @@ -14,21 +14,19 @@ UTF8StrChrFieldSearcher::duplicate() const } size_t -UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz) { (void) mintsz; termcount_t words(0); - const byte * n = reinterpret_cast<const byte *> (f.data()); - const byte * e = n + f.size(); if (f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } cmptype_t * fn = &(*_buf.get())[0]; - size_t fl(0); - for( ; n < e; ) { - if (!*n) { _zeroCount++; n++; } - n = tokenize(n, _buf->capacity(), fn, fl); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn); + while ( reader.hasNext() ) { + tokenize(reader); + size_t fl = reader.complete(); for (auto qt : _qtl) { const cmptype_t * term; termsize_t tsz = qt->term(term); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h index cfe546bc6f6..663ee3a1a62 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h @@ -13,12 +13,10 @@ class UTF8StrChrFieldSearcher : public UTF8StringFieldSearcherBase { public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8StrChrFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } - + explicit UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } protected: size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp index ebdf69d0b30..ce63f55ea63 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp @@ -1,7 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "utf8stringfieldsearcherbase.h" -#include <vespa/fastlib/text/normwordfolder.h> #include <cassert> using search::streaming::QueryTerm; @@ -10,107 +9,36 @@ using search::byte; namespace vsm { -const byte * -UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen) -{ - if (maxSz > 0) { - maxSz--; - } - ucs4_t c(*p); - ucs4_t *q(dstbuf); - const byte * end(p+maxSz); - - // Skip non-word characters between words - for (; p < end; ) { - if (c < 128) { - if (!c) { break; } - p++; - if (__builtin_expect(Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) { - *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c); - c = 0; - } else { - c = *p; - } - } else { - c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); - if (Fast_UnicodeUtil::IsWordChar(c)) { - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != nullptr) { - size_t repllen = strlen(repl); - if (repllen > 0) { - q = Fast_UnicodeUtil::ucs4copy(q,repl); - } - } else { - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - *q++ = c; - } - break; - } else { - if (c == Fast_UnicodeUtil::_BadUTF8Char) { - _badUtf8Count++; - } - c = *p; - } - } - } - - c = *p; // Next char - for (; p < end;) { - if (c < 128) { // Common case, ASCII - if (!c) { break; } - p++; - if (__builtin_expect(!Fast_NormalizeWordFolder::is_wordchar_ascii7bit(c), false)) { - c = 0; - } else { - *q++ = Fast_NormalizeWordFolder::lowercase_and_fold_ascii(c); - c = *p; - } - } else { - c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); - if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) { - const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); - if (repl != nullptr) { - size_t repllen = strlen(repl); - if (repllen > 0) { - q = Fast_UnicodeUtil::ucs4copy(q,repl); - } - } else { - c = Fast_NormalizeWordFolder::lowercase_and_fold(c); - *q++ = c; - } - - c = *p; - } else { - if (c == Fast_UnicodeUtil::_BadUTF8Char) { - _badUtf8Count++; - } - break; - } +template<typename Reader> +void +UTF8StringFieldSearcherBase::tokenize(Reader & reader) { + ucs4_t c(0); + Normalizing norm_mode = normalize_mode(); + while (reader.hasNext() && ! Fast_UnicodeUtil::IsWordChar(c = reader.next())); + + if (Fast_UnicodeUtil::IsWordChar(c)) { + reader.normalize(c, norm_mode); + while (reader.hasNext() && Fast_UnicodeUtil::IsWordChar(c = reader.next())) { + reader.normalize(c, norm_mode); } } - *q = 0; - tokenlen = q - dstbuf; - return p; } size_t UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt) { termcount_t words(0); - const byte * n = reinterpret_cast<const byte *> (f.data()); - // __builtin_prefetch(n, 0, 0); const cmptype_t * term; termsize_t tsz = qt.term(term); - const byte * e = n + f.size(); if ( f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } - cmptype_t * fn = &(*_buf.get())[0]; - size_t fl(0); + cmptype_t * fn = _buf->data(); - for( ; n < e; ) { - if (!*n) { _zeroCount++; n++; } - n = tokenize(n, _buf->capacity(), fn, fl); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), fn); + while ( reader.hasNext() ) { + tokenize(reader); + size_t fl = reader.complete(); if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) { const cmptype_t *tt=term, *et=term+tsz; for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++); @@ -185,22 +113,17 @@ size_t UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt) { termcount_t words = 0; - const byte * srcbuf = reinterpret_cast<const byte *> (f.data()); - const byte * srcend = srcbuf + f.size(); const cmptype_t * term; termsize_t tsz = qt.term(term); if (f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } - cmptype_t * dstbuf = &(*_buf.get())[0]; - size_t tokenlen = 0; + cmptype_t * dstbuf = _buf->data(); - for( ; srcbuf < srcend; ) { - if (*srcbuf == 0) { - ++_zeroCount; - ++srcbuf; - } - srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf); + while ( reader.hasNext() ) { + tokenize(reader); + size_t tokenlen = reader.complete(); if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) { addHit(qt, words); } @@ -209,11 +132,6 @@ UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt) return words; } -UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase() : - StrChrFieldSearcher() -{ -} - UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase(FieldIdT fId) : StrChrFieldSearcher(fId) { diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h index 38aac508f4f..115cddce619 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h @@ -2,6 +2,7 @@ #pragma once #include "strchrfieldsearcher.h" +#include <vespa/fastlib/text/normwordfolder.h> namespace vsm { @@ -28,15 +29,15 @@ public: ucs4_t * _cbuf; public: - BufferWrapper(ucs4_t * buf) : _bbuf(buf), _cbuf(buf) { } - BufferWrapper(ucs4_t * buf, size_t *) : _bbuf(buf), _cbuf(buf) { } + explicit BufferWrapper(ucs4_t * buf) noexcept : _bbuf(buf), _cbuf(buf) { } + BufferWrapper(ucs4_t * buf, size_t *) noexcept : _bbuf(buf), _cbuf(buf) { } void onCharacter(ucs4_t ch, size_t) { *_cbuf++ = ch; } void onOffset(size_t) { } void incBuf(size_t inc) { _cbuf += inc; } ucs4_t * getBuf() { return _cbuf; } - bool valid() { return true; } - size_t size() { return (_cbuf - _bbuf); } - bool hasOffsets() { return false; } + bool valid() const noexcept { return true; } + size_t size() const noexcept { return (_cbuf - _bbuf); } + bool hasOffsets() const noexcept { return false; } }; /** @@ -50,17 +51,74 @@ public: size_t * _coff; public: - OffsetWrapper(ucs4_t * buf, size_t * offsets) : BufferWrapper(buf), _boff(offsets), _coff(offsets) {} + explicit OffsetWrapper(ucs4_t * buf, size_t * offsets) noexcept : BufferWrapper(buf), _boff(offsets), _coff(offsets) {} void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; } void onOffset(size_t of) { *_coff++ = of; } - bool valid() { return (size() == (size_t)(_coff - _boff)); } - bool hasOffsets() { return true; } + bool valid() const noexcept { return (size() == (size_t)(_coff - _boff)); } + bool hasOffsets() const noexcept { return true; } }; protected: SharedSearcherBuf _buf; - const search::byte * tokenize(const search::byte * buf, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen); + using byte = search::byte; + + class TokenizeReader { + public: + TokenizeReader(const byte *p, uint32_t len, ucs4_t *q) noexcept + : _p(p), + _p_end(p + len), + _q(q), + _q_start(q) + {} + ucs4_t next() noexcept { return Fast_UnicodeUtil::GetUTF8Char(_p); } + void normalize(ucs4_t c, Normalizing normalize_mode) { + switch (normalize_mode) { + case Normalizing::LOWERCASE: + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); + [[fallthrough]]; + case Normalizing::NONE: + *_q++ = c; + break; + case Normalizing::LOWERCASE_AND_FOLD: + fold(c); + break; + } + } + bool hasNext() const noexcept { return _p < _p_end; } + const byte * p() const noexcept { return _p; } + size_t complete() noexcept { + *_q = 0; + size_t token_len = _q - _q_start; + _q = _q_start; + return token_len; + } + private: + void fold(ucs4_t c) { + const char *repl = Fast_NormalizeWordFolder::ReplacementString(c); + if (repl != nullptr) { + size_t repllen = strlen(repl); + if (repllen > 0) { + _q = Fast_UnicodeUtil::ucs4copy(_q,repl); + } + } else { + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); + *_q++ = c; + } + } + void lowercase(ucs4_t c) { + c = Fast_NormalizeWordFolder::lowercase_and_fold(c); + *_q++ = c; + } + const byte *_p; + const byte *_p_end; + ucs4_t *_q; + ucs4_t *_q_start; + }; + + + template<typename Reader> + void tokenize(Reader & reader); /** * Matches the given query term against the words in the given field reference @@ -103,9 +161,8 @@ protected: size_t matchTermExact(const FieldRef & f, search::streaming::QueryTerm & qt); public: - UTF8StringFieldSearcherBase(); - UTF8StringFieldSearcherBase(FieldIdT fId); - ~UTF8StringFieldSearcherBase(); + explicit UTF8StringFieldSearcherBase(FieldIdT fId); + ~UTF8StringFieldSearcherBase() override; void prepare(search::streaming::QueryTermList& qtl, const SharedSearcherBuf& buf, const vsm::FieldPathMapT& field_paths, diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp index 25ef9ae7618..fcc2893a71d 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp @@ -1,6 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include <vespa/vsm/searcher/utf8substringsearcher.h> +#include "utf8substringsearcher.h" #include <vespa/fastlib/text/unicodeutil.h> using search::byte; diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h index b1455d5c5f6..22ecf9c41fa 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h @@ -12,11 +12,10 @@ class UTF8SubStringFieldSearcher : public UTF8StringFieldSearcherBase { public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8SubStringFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } + explicit UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } protected: size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; }; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp index 8403e69658f..6d8a399cd33 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp @@ -110,20 +110,11 @@ UTF8SubstringSnippetModifier::insertSeparators(const char * mbegin, const char * _modified->put(_unitSep); } -UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier() : - UTF8StringFieldSearcherBase(), - _modified(new CharBuffer(32)), - _offsets(new std::vector<size_t>(32)), - _readPtr(NULL), - _unitSep(juniper::separators::unit_separator) -{ -} - UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId) : UTF8StringFieldSearcherBase(fId), _modified(new CharBuffer(32)), _offsets(new std::vector<size_t>(32)), - _readPtr(NULL), + _readPtr(nullptr), _unitSep(juniper::separators::unit_separator) { } @@ -134,12 +125,12 @@ UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId, UTF8StringFieldSearcherBase(fId), _modified(modBuf), _offsets(offBuf), - _readPtr(NULL), + _readPtr(nullptr), _unitSep(juniper::separators::unit_separator) { } -UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() {} +UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() = default; } diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h index ebb806de61c..99e6c29961f 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h @@ -23,8 +23,8 @@ private: const char * _readPtr; // buffer to read from (field reference) char _unitSep; // the unit separator character to use - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; /** * Copies n bytes from the field reference to the modified buffer and updates the read pointer. @@ -51,9 +51,8 @@ public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8SubstringSnippetModifier(); - UTF8SubstringSnippetModifier(FieldIdT fId); - ~UTF8SubstringSnippetModifier(); + explicit UTF8SubstringSnippetModifier(FieldIdT fId); + ~UTF8SubstringSnippetModifier() override; /** * Creates a new instance. diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp index e28ce114225..4318d5fe1a3 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp @@ -14,24 +14,19 @@ UTF8SuffixStringFieldSearcher::duplicate() const } size_t -UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, size_t mintsz) { (void) mintsz; termcount_t words = 0; - const byte * srcbuf = reinterpret_cast<const byte *> (f.data()); - const byte * srcend = srcbuf + f.size(); if (f.size() >= _buf->size()) { _buf->reserve(f.size() + 1); } cmptype_t * dstbuf = &(*_buf.get())[0]; - size_t tokenlen = 0; - for( ; srcbuf < srcend; ) { - if (*srcbuf == 0) { - ++_zeroCount; - ++srcbuf; - } - srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen); + TokenizeReader reader(reinterpret_cast<const byte *> (f.data()), f.size(), dstbuf); + while ( reader.hasNext() ) { + tokenize(reader); + size_t tokenlen = reader.complete(); for (auto qt : _qtl) { const cmptype_t * term; termsize_t tsz = qt->term(term); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h index 556f61a714f..dc3bc214b49 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h @@ -1,10 +1,9 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #pragma once -#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> +#include "utf8stringfieldsearcherbase.h" -namespace vsm -{ +namespace vsm { /** * This class does suffix utf8 searches. @@ -12,13 +11,12 @@ namespace vsm class UTF8SuffixStringFieldSearcher : public UTF8StringFieldSearcherBase { protected: - virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; - virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerms(const FieldRef & f, size_t shortestTerm) override; public: std::unique_ptr<FieldSearcher> duplicate() const override; - UTF8SuffixStringFieldSearcher() : UTF8StringFieldSearcherBase() { } - UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } + explicit UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } }; } diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp index 4b0efd58a56..715c19a0bb7 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp @@ -31,15 +31,13 @@ namespace { void setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) { if (arg1 == "prefix") { - searcher->setMatchType(FieldSearcher::PREFIX); + searcher->match_type(FieldSearcher::PREFIX); } else if (arg1 == "substring") { - searcher->setMatchType(FieldSearcher::SUBSTRING); + searcher->match_type(FieldSearcher::SUBSTRING); } else if (arg1 == "suffix") { - searcher->setMatchType(FieldSearcher::SUFFIX); - } else if (arg1 == "exact") { - searcher->setMatchType(FieldSearcher::EXACT); - } else if (arg1 == "word") { - searcher->setMatchType(FieldSearcher::EXACT); + searcher->match_type(FieldSearcher::SUFFIX); + } else if ((arg1 == "exact") || (arg1 == "word")) { + searcher->match_type(FieldSearcher::EXACT); } } @@ -51,6 +49,7 @@ FieldSearchSpec::FieldSearchSpec() _maxLength(0x100000), _searcher(), _searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE), + _normalize_mode(Normalizing::LOWERCASE_AND_FOLD), _arg1(), _reconfigured(false) { @@ -60,15 +59,15 @@ FieldSearchSpec::~FieldSearchSpec() = default; FieldSearchSpec::FieldSearchSpec(FieldSearchSpec&& rhs) noexcept = default; FieldSearchSpec& FieldSearchSpec::operator=(FieldSearchSpec&& rhs) noexcept = default; -FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, - VsmfieldsConfig::Fieldspec::Searchmethod searchDef, - const vespalib::string & arg1, size_t maxLength_) : +FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, Searchmethod searchDef, + Normalizing normalize_mode, vespalib::stringref arg1_in, size_t maxLength_in) : _id(fid), _name(fname), - _maxLength(maxLength_), + _maxLength(maxLength_in), _searcher(), _searchMethod(searchDef), - _arg1(arg1), + _normalize_mode(normalize_mode), + _arg1(arg1_in), _reconfigured(false) { switch(searchDef) { @@ -79,13 +78,11 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & case VsmfieldsConfig::Fieldspec::Searchmethod::NONE: case VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8: case VsmfieldsConfig::Fieldspec::Searchmethod::UTF8: - if (arg1 == "substring") { + if (_arg1 == "substring") { _searcher = std::make_unique<UTF8SubStringFieldSearcher>(fid); - } else if (arg1 == "suffix") { + } else if (_arg1 == "suffix") { _searcher = std::make_unique<UTF8SuffixStringFieldSearcher>(fid); - } else if (arg1 == "exact") { - _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid); - } else if (arg1 == "word") { + } else if ((_arg1 == "exact") || (_arg1 == "word")) { _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid); } else if (searchDef == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) { _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid); @@ -112,13 +109,14 @@ FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & _searcher = std::make_unique<GeoPosFieldSearcher>(fid); break; case VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR: - auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(arg1); + auto dm = NearestNeighborFieldSearcher::distance_metric_from_string(_arg1); _searcher = std::make_unique<NearestNeighborFieldSearcher>(fid, dm); break; } if (_searcher) { - setMatchType(_searcher, arg1); + setMatchType(_searcher, _arg1); _searcher->maxFieldLength(maxLength()); + _searcher->normalize_mode(_normalize_mode); } } @@ -166,20 +164,20 @@ FieldSearchSpecMap::FieldSearchSpecMap() = default; FieldSearchSpecMap::~FieldSearchSpecMap() = default; namespace { - const std::string _G_empty(""); - const std::string _G_value(".value"); - const std::regex _G_map1("\\{[a-zA-Z0-9]+\\}"); - const std::regex _G_map2("\\{\".*\"\\}"); - const std::regex _G_array("\\[[0-9]+\\]"); + const std::string G_empty; + const std::string G_value(".value"); + const std::regex G_map1("\\{[a-zA-Z0-9]+\\}"); + const std::regex G_map2("\\{\".*\"\\}"); + const std::regex G_array("\\[[0-9]+\\]"); } vespalib::string FieldSearchSpecMap::stripNonFields(vespalib::stringref rawIndex) { if ((rawIndex.find('[') != vespalib::string::npos) || (rawIndex.find('{') != vespalib::string::npos)) { - std::string index = std::regex_replace(std::string(rawIndex), _G_map1, _G_value); - index = std::regex_replace(index, _G_map2, _G_value); - index = std::regex_replace(index, _G_array, _G_empty); + std::string index = std::regex_replace(std::string(rawIndex), G_map1, G_value); + index = std::regex_replace(index, G_map2, G_value); + index = std::regex_replace(index, G_array, G_empty); return index; } return rawIndex; @@ -258,17 +256,26 @@ buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearch return ifm; } +search::streaming::Normalizing +normalize_mode(VsmfieldsConfig::Fieldspec::Normalize normalize_mode) { + switch (normalize_mode) { + case VsmfieldsConfig::Fieldspec::Normalize::NONE: return search::streaming::Normalizing::NONE; + case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE: return search::streaming::Normalizing::LOWERCASE; + case VsmfieldsConfig::Fieldspec::Normalize::LOWERCASE_AND_FOLD: return search::streaming::Normalizing::LOWERCASE_AND_FOLD; + } + return search::streaming::Normalizing::LOWERCASE_AND_FOLD; } -bool +} + +void FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf) { - bool retval(true); LOG(spam, "Parsing %zd fields", conf->fieldspec.size()); for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) { LOG(spam, "Parsing %s", cfs.name.c_str()); FieldIdT fieldId = specMap().size(); - FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, cfs.arg1.c_str(), cfs.maxlength); + FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, normalize_mode(cfs.normalize), cfs.arg1, cfs.maxlength); _specMap[fieldId] = std::move(fss); _nameIdMap.add(cfs.name, fieldId); LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str()); @@ -283,7 +290,6 @@ FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf) } _documentTypeMap[di.name] = indexMapp; } - return retval; } void @@ -338,7 +344,7 @@ FieldSearchSpecMap::get_distance_metric(const vespalib::string& name) const if (!itr->second.uses_nearest_neighbor_search_method()) { return dm; } - return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.get_arg1()); + return vsm::NearestNeighborFieldSearcher::distance_metric_from_string(itr->second.arg1()); } vespalib::asciistream & diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h index 43bb5b04481..7ba9799991e 100644 --- a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h @@ -10,25 +10,29 @@ namespace vsm { class FieldSearchSpec { public: + using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod; + using Normalizing = search::streaming::Normalizing; FieldSearchSpec(); - FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, - VsmfieldsConfig::Fieldspec::Searchmethod searchMethod, - const vespalib::string & arg1, size_t maxLength); + FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, Searchmethod searchMethod, + Normalizing normalize_mode, vespalib::stringref arg1, size_t maxLength); ~FieldSearchSpec(); FieldSearchSpec(FieldSearchSpec&& rhs) noexcept; FieldSearchSpec& operator=(FieldSearchSpec&& rhs) noexcept; - const FieldSearcher & searcher() const { return *_searcher; } - const vespalib::string & name() const { return _name; } - FieldIdT id() const { return _id; } - bool valid() const { return static_cast<bool>(_searcher); } - size_t maxLength() const { return _maxLength; } - bool uses_nearest_neighbor_search_method() const noexcept { return _searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::NEAREST_NEIGHBOR; } + const FieldSearcher & searcher() const noexcept { return *_searcher; } + const vespalib::string & name() const noexcept { return _name; } + FieldIdT id() const noexcept { return _id; } + bool valid() const noexcept { return static_cast<bool>(_searcher); } + size_t maxLength() const noexcept { return _maxLength; } + Normalizing normalize_mode() const noexcept { return _normalize_mode; } + const vespalib::string& arg1() const noexcept { return _arg1; } + bool uses_nearest_neighbor_search_method() const noexcept { + return _searchMethod == Searchmethod::NEAREST_NEIGHBOR; + } bool uses_string_search_method() const noexcept { - return (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) || - (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8) || - (_searchMethod == VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8); + return (_searchMethod == Searchmethod::UTF8) || + (_searchMethod == Searchmethod::AUTOUTF8) || + (_searchMethod == Searchmethod::SSE2UTF8); } - const vespalib::string& get_arg1() const noexcept { return _arg1; } /** * Reconfigures the field searcher based on information in the given query term. @@ -42,7 +46,8 @@ private: vespalib::string _name; size_t _maxLength; FieldSearcherContainer _searcher; - VsmfieldsConfig::Fieldspec::Searchmethod _searchMethod; + Searchmethod _searchMethod; + Normalizing _normalize_mode; vespalib::string _arg1; bool _reconfigured; }; @@ -60,7 +65,7 @@ public: * and a mapping from field name to field id. It then iterates over all document types and index names * and creates a mapping from index name to list of field ids for each document type. **/ - bool buildFromConfig(const VsmfieldsHandle & conf); + void buildFromConfig(const VsmfieldsHandle & conf); /** * Iterates over the given field name vector adding extra elements to the mapping from field name to field id. |