From 6fa0791c5bac03554f01fc5a8652741cb33921b5 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Sat, 9 Mar 2024 21:22:26 +0100 Subject: Stem prefix items If we are searching a stemmed index, it's probably better to stem terms also when we are searching for prefixes. --- .../java/com/yahoo/schema/document/Matching.java | 1 + .../application/validation/NoPrefixForIndexes.java | 3 +-- container-search/abi-spec.json | 25 +++++++++++++++++----- .../com/yahoo/prelude/query/ExactStringItem.java | 8 +++++++ .../com/yahoo/prelude/query/MarkerWordItem.java | 10 +++++++++ .../java/com/yahoo/prelude/query/PrefixItem.java | 12 ++++++++++- .../com/yahoo/prelude/query/SubstringItem.java | 8 +++++++ .../java/com/yahoo/prelude/query/SuffixItem.java | 8 +++++++ .../java/com/yahoo/prelude/query/WordItem.java | 8 +++++++ .../prelude/querytransform/StemmingSearcher.java | 15 +++++++------ 10 files changed, 84 insertions(+), 14 deletions(-) diff --git a/config-model/src/main/java/com/yahoo/schema/document/Matching.java b/config-model/src/main/java/com/yahoo/schema/document/Matching.java index 9d68553fa80..9f05045d090 100644 --- a/config-model/src/main/java/com/yahoo/schema/document/Matching.java +++ b/config-model/src/main/java/com/yahoo/schema/document/Matching.java @@ -31,6 +31,7 @@ public class Matching implements Cloneable, Serializable { /** Maximum number of characters to consider when searching in this field. Used for limiting resources, especially in streaming search. 
*/ private Integer maxLength; + /** Maximum number of occurrences for each term */ private Integer maxTermOccurrences; diff --git a/config-model/src/main/java/com/yahoo/vespa/model/application/validation/NoPrefixForIndexes.java b/config-model/src/main/java/com/yahoo/vespa/model/application/validation/NoPrefixForIndexes.java index 15d293e4abc..0aa0dc85ab8 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/application/validation/NoPrefixForIndexes.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/application/validation/NoPrefixForIndexes.java @@ -23,8 +23,7 @@ public class NoPrefixForIndexes implements Validator { @Override public void validate(Context context) { for (SearchCluster cluster : context.model().getSearchClusters()) { - if (cluster instanceof IndexedSearchCluster) { - IndexedSearchCluster sc = (IndexedSearchCluster) cluster; + if (cluster instanceof IndexedSearchCluster sc) { for (DocumentDatabase docDb : sc.getDocumentDbs()) { DerivedConfiguration sdConfig = docDb.getDerivedConfiguration(); Schema schema = sdConfig.getSchema(); diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json index 73376ac4b25..79cc578c6cd 100644 --- a/container-search/abi-spec.json +++ b/container-search/abi-spec.json @@ -524,9 +524,12 @@ "methods" : [ "public void (java.lang.String)", "public void (java.lang.String, boolean)", + "public void (java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", + "public com.yahoo.prelude.query.ExactStringItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public com.yahoo.prelude.query.Item$ItemType getItemType()", "public java.lang.String getName()", - "public java.lang.String stringValue()" + "public java.lang.String stringValue()", + "public bridge synthetic com.yahoo.prelude.query.WordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)" ], "fields" : [ ] }, @@ -914,6 +917,7 @@ 
"public" ], "methods" : [ + "public com.yahoo.prelude.query.MarkerWordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public boolean isStartAnchor()", "public boolean isEndAnchor()", "protected java.lang.String getEncodedWord()", @@ -923,7 +927,8 @@ "public static com.yahoo.prelude.query.MarkerWordItem createStartOfHost(java.lang.String)", "public static com.yahoo.prelude.query.MarkerWordItem createStartOfHost()", "public static com.yahoo.prelude.query.MarkerWordItem createEndOfHost(java.lang.String)", - "public static com.yahoo.prelude.query.MarkerWordItem createEndOfHost()" + "public static com.yahoo.prelude.query.MarkerWordItem createEndOfHost()", + "public bridge synthetic com.yahoo.prelude.query.WordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)" ], "fields" : [ ] }, @@ -1296,9 +1301,12 @@ "public void (java.lang.String)", "public void (java.lang.String, boolean)", "public void (java.lang.String, java.lang.String)", + "public void (java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", + "public com.yahoo.prelude.query.PrefixItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public com.yahoo.prelude.query.Item$ItemType getItemType()", "public java.lang.String getName()", - "public java.lang.String stringValue()" + "public java.lang.String stringValue()", + "public bridge synthetic com.yahoo.prelude.query.WordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)" ], "fields" : [ ] }, @@ -1622,9 +1630,12 @@ "methods" : [ "public void (java.lang.String)", "public void (java.lang.String, boolean)", + "public void (java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", + "public com.yahoo.prelude.query.SubstringItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public 
com.yahoo.prelude.query.Item$ItemType getItemType()", "public java.lang.String getName()", - "public java.lang.String stringValue()" + "public java.lang.String stringValue()", + "public bridge synthetic com.yahoo.prelude.query.WordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)" ], "fields" : [ ] }, @@ -1637,9 +1648,12 @@ "methods" : [ "public void (java.lang.String)", "public void (java.lang.String, boolean)", + "public void (java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", + "public com.yahoo.prelude.query.SuffixItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public com.yahoo.prelude.query.Item$ItemType getItemType()", "public java.lang.String getName()", - "public java.lang.String stringValue()" + "public java.lang.String stringValue()", + "public bridge synthetic com.yahoo.prelude.query.WordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)" ], "fields" : [ ] }, @@ -1962,6 +1976,7 @@ "public void (com.yahoo.prelude.query.parser.Token, boolean)", "public void (java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public void (java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", + "public com.yahoo.prelude.query.WordItem newInstance(java.lang.String, java.lang.String, boolean, com.yahoo.prelude.query.Substring)", "public com.yahoo.prelude.query.Item$ItemType getItemType()", "public java.lang.String getName()", "public void setWord(java.lang.String)", diff --git a/container-search/src/main/java/com/yahoo/prelude/query/ExactStringItem.java b/container-search/src/main/java/com/yahoo/prelude/query/ExactStringItem.java index cb0752e5408..36e24fa81db 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/ExactStringItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/ExactStringItem.java @@ -17,6 +17,14 @@ public class 
ExactStringItem extends WordItem { super(substring, isFromQuery); } + public ExactStringItem(String word, String indexName, boolean isFromQuery, Substring origin) { + super(word, indexName, isFromQuery, origin); + } + + public ExactStringItem newInstance(String word, String indexName, boolean isFromQuery, Substring origin) { + return new ExactStringItem(word, indexName, isFromQuery, origin); + } + @Override public ItemType getItemType() { return ItemType.EXACT; diff --git a/container-search/src/main/java/com/yahoo/prelude/query/MarkerWordItem.java b/container-search/src/main/java/com/yahoo/prelude/query/MarkerWordItem.java index 40ea1e37c47..48309cdd8fa 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/MarkerWordItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/MarkerWordItem.java @@ -25,6 +25,16 @@ public class MarkerWordItem extends WordItem { this.markerWord = markerWord; } + private MarkerWordItem(String publicSymbol, String markerWord, String indexName, boolean isFromQuery, Substring origin) { + super(publicSymbol, indexName, isFromQuery, origin); + this.markerWord = markerWord; + } + + /** Returns a new instance of this kind of WordItem, initialized with the given data and the marker word of this instance. 
*/ + public MarkerWordItem newInstance(String word, String indexName, boolean isFromQuery, Substring origin) { + return new MarkerWordItem(word, markerWord, indexName, isFromQuery, origin); + } + public boolean isStartAnchor() { return getWord().equals(startAnchor); } public boolean isEndAnchor() { return getWord().equals(endAnchor); } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/PrefixItem.java b/container-search/src/main/java/com/yahoo/prelude/query/PrefixItem.java index 5904d805a39..9fc087e70b4 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/PrefixItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/PrefixItem.java @@ -17,7 +17,17 @@ public class PrefixItem extends WordItem { super(prefix, isFromQuery); } - public PrefixItem(String prefix, String indexName) { super(prefix, indexName); } + public PrefixItem(String prefix, String indexName) { + super(prefix, indexName); + } + + public PrefixItem(String prefix, String indexName, boolean isFromQuery, Substring origin) { + super(prefix, indexName, isFromQuery, origin); + } + + public PrefixItem newInstance(String word, String indexName, boolean isFromQuery, Substring origin) { + return new PrefixItem(word, indexName, isFromQuery, origin); + } @Override public ItemType getItemType() { diff --git a/container-search/src/main/java/com/yahoo/prelude/query/SubstringItem.java b/container-search/src/main/java/com/yahoo/prelude/query/SubstringItem.java index 7a05235b199..df9de84b04d 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/SubstringItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/SubstringItem.java @@ -16,6 +16,14 @@ public class SubstringItem extends WordItem { super(substring, isFromQuery); } + public SubstringItem(String substring, String indexName, boolean isFromQuery, Substring origin) { + super(substring, indexName, isFromQuery, origin); + } + + public SubstringItem newInstance(String word, String indexName, 
boolean isFromQuery, Substring origin) { + return new SubstringItem(word, indexName, isFromQuery, origin); + } + @Override public ItemType getItemType() { return ItemType.SUBSTRING; diff --git a/container-search/src/main/java/com/yahoo/prelude/query/SuffixItem.java b/container-search/src/main/java/com/yahoo/prelude/query/SuffixItem.java index 700564853fd..e364330a377 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/SuffixItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/SuffixItem.java @@ -16,6 +16,14 @@ public class SuffixItem extends WordItem { super(suffix, isFromQuery); } + public SuffixItem(String substring, String indexName, boolean isFromQuery, Substring origin) { + super(substring, indexName, isFromQuery, origin); + } + + public SuffixItem newInstance(String word, String indexName, boolean isFromQuery, Substring origin) { + return new SuffixItem(word, indexName, isFromQuery, origin); + } + @Override public ItemType getItemType() { return ItemType.SUFFIX; diff --git a/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java b/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java index 4f8b02a8d13..9cfa33fa07d 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/WordItem.java @@ -62,6 +62,14 @@ public class WordItem extends TermItem { setWord(word); } + /** + * Returns a new instance of this kind of WordItem, initialized with the given data and any other + * fields belonging to the item subclass copied from this instance. 
+ */ + public WordItem newInstance(String word, String indexName, boolean isFromQuery, Substring origin) { + return new WordItem(word, indexName, isFromQuery, origin); + } + public ItemType getItemType() { return ItemType.WORD; } diff --git a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java index e8350831381..e40f161ede2 100644 --- a/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java +++ b/container-search/src/main/java/com/yahoo/prelude/querytransform/StemmingSearcher.java @@ -163,7 +163,7 @@ public class StemmingSearcher extends Searcher { } private Item checkBlock(BlockItem b, StemContext context) { - if (b instanceof PrefixItem || !b.isWords()) return (Item) b; + if (!b.isWords()) return (Item) b; if (b.isFromQuery() && !b.isStemmed()) { Index index = context.indexFacts.getIndex(b.getIndexName()); @@ -190,10 +190,8 @@ public class StemmingSearcher extends Searcher { // The rewriting logic is here private Item stem(BlockItem current, StemContext context, Index index) { - Item blockAsItem = (Item)current; - CompositeItem composite; List segments = linguistics.getStemmer().stem(current.stringValue(), index.getStemMode(), context.language); - if (segments.isEmpty()) return blockAsItem; + if (segments.isEmpty()) return (Item)current; String indexName = current.getIndexName(); Substring substring = getOffsets(current); @@ -203,6 +201,7 @@ public class StemmingSearcher extends Searcher { return (Item)w; } + CompositeItem composite; if (context.isCJK) composite = chooseCompositeForCJK(current, ((Item) current).getParent(), indexName); else @@ -219,7 +218,7 @@ public class StemmingSearcher extends Searcher { if (composite instanceof AndSegmentItem) { andSegmentConnectivity(current, context.reverseConnectivity, composite); } - copyAttributes(blockAsItem, composite); + copyAttributes((Item)current, composite); 
composite.lock(); if (composite instanceof PhraseSegmentItem replacement) { @@ -320,7 +319,11 @@ public class StemmingSearcher extends Searcher { private WordItem singleStemSegment(Item blockAsItem, String stem, String indexName, Substring substring) { - WordItem replacement = new WordItem(stem, indexName, true, substring); + WordItem replacement; + if (blockAsItem instanceof WordItem) // preserve the WordItem subclass type + replacement = ((WordItem)blockAsItem).newInstance(stem, indexName, true, substring); + else + replacement = new WordItem(stem, indexName, true, substring); replacement.setStemmed(true); copyAttributes(blockAsItem, replacement); return replacement; -- cgit v1.2.3