From 9c9962463492d26623dfbd8aa838018ac53a8aa7 Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Tue, 19 Dec 2023 10:45:57 +0000 Subject: Add feature flag for allow sorting blueprints by cost estimate instead of est_hits. --- searchlib/src/vespa/searchlib/fef/indexproperties.cpp | 6 ++++++ searchlib/src/vespa/searchlib/fef/indexproperties.h | 9 +++++++++ searchlib/src/vespa/searchlib/fef/ranksetup.cpp | 2 ++ searchlib/src/vespa/searchlib/fef/ranksetup.h | 2 ++ 4 files changed, 19 insertions(+) (limited to 'searchlib') diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp index cd9dbff99cb..65e31a2bc60 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp @@ -455,6 +455,12 @@ FuzzyAlgorithm::lookup(const Properties& props, vespalib::FuzzyMatchingAlgorithm return vespalib::fuzzy_matching_algorithm_from_string(value, default_value); } +const vespalib::string SortBlueprintsByEstimate::NAME("vespa.matching.sort_blueprints_by_estimate"); +const bool SortBlueprintsByEstimate::DEFAULT_VALUE(false); +bool SortBlueprintsByEstimate::check(const Properties &props, bool fallback) { + return lookupBool(props, NAME, fallback); +} + const vespalib::string AlwaysMarkPhraseExpensive::NAME("vespa.matching.always_mark_phrase_expensive"); const bool AlwaysMarkPhraseExpensive::DEFAULT_VALUE(false); bool AlwaysMarkPhraseExpensive::check(const Properties &props, bool fallback) { diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.h b/searchlib/src/vespa/searchlib/fef/indexproperties.h index 0183fdf1a13..13e053c8dcf 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.h +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.h @@ -336,6 +336,15 @@ namespace matching { static vespalib::FuzzyMatchingAlgorithm lookup(const Properties& props); static vespalib::FuzzyMatchingAlgorithm lookup(const Properties& props, vespalib::FuzzyMatchingAlgorithm default_value); }; + /** + * Sort blueprints based on relative cost estimate rather than est_hits + **/ + struct SortBlueprintsByEstimate { + static const vespalib::string NAME; + static const bool DEFAULT_VALUE; + static bool check(const Properties &props) { return check(props, DEFAULT_VALUE); } + static bool check(const Properties &props, bool fallback); + }; /** * When enabled, the unpacking part of the phrase iterator will be tagged as expensive diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp index 5c28f1814d5..0353879be14 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp @@ -56,6 +56,7 @@ RankSetup::RankSetup(const BlueprintFactory &factory, const IIndexEnvironment &i _dumpFeatures(), _warnings(), _feature_rename_map(), + _sort_blueprints_by_estimate(false), _ignoreDefaultRankFeatures(false), _compiled(false), _compileError(false), @@ -137,6 +138,7 @@ RankSetup::configure() _mutateOnSummary._attribute = mutate::on_summary::Attribute::lookup(_indexEnv.getProperties()); _mutateOnSummary._operation = mutate::on_summary::Operation::lookup(_indexEnv.getProperties()); _mutateAllowQueryOverride = mutate::AllowQueryOverride::check(_indexEnv.getProperties()); + _sort_blueprints_by_estimate = matching::SortBlueprintsByEstimate::check(_indexEnv.getProperties()); _always_mark_phrase_expensive = matching::AlwaysMarkPhraseExpensive::check(_indexEnv.getProperties()); _create_postinglist_when_non_strict = matching::CreatePostingListWhenNonStrict::check(_indexEnv.getProperties()); _use_estimate_for_fetch_postings = matching::UseEstimateForFetchPostings::check(_indexEnv.getProperties()); diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.h b/searchlib/src/vespa/searchlib/fef/ranksetup.h index 04659955490..f90ce2b0475 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.h +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.h @@ -65,6 +65,7 @@ private: std::vector _dumpFeatures; Warnings _warnings; StringStringMap _feature_rename_map; + bool _sort_blueprints_by_estimate; bool _ignoreDefaultRankFeatures; bool _compiled; bool _compileError; @@ -465,6 +466,7 @@ public: const MutateOperation & getMutateOnSummary() const { return _mutateOnSummary; } bool allowMutateQueryOverride() const { return _mutateAllowQueryOverride; } + bool sort_blueprints_by_estimate() const noexcept { return _sort_blueprints_by_estimate; } }; } -- cgit v1.2.3 From 83f42a92ff767452aaf209a15bde9c462381754b Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Tue, 19 Dec 2023 12:47:31 +0000 Subject: Estimate => Cost --- searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp | 2 +- searchcore/src/vespa/searchcore/proton/matching/query.cpp | 4 ++-- searchcore/src/vespa/searchcore/proton/matching/query.h | 2 +- searchlib/src/vespa/searchlib/fef/indexproperties.cpp | 6 +++--- searchlib/src/vespa/searchlib/fef/indexproperties.h | 2 +- searchlib/src/vespa/searchlib/fef/ranksetup.cpp | 4 ++-- searchlib/src/vespa/searchlib/fef/ranksetup.h | 4 ++-- 7 files changed, 12 insertions(+), 12 deletions(-) (limited to 'searchlib') diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp index 71a6475b0fb..b5224281724 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp @@ -203,7 +203,7 @@ MatchToolsFactory(QueryLimiter & queryLimiter, trace.addEvent(5, "Build query execution plan"); _query.reserveHandles(_requestContext, searchContext, _mdl); trace.addEvent(5, "Optimize query execution plan"); - _query.optimize(SortBlueprintsByEstimate::check(_queryEnv.getProperties(), rankSetup.sort_blueprints_by_estimate())); + _query.optimize(SortBlueprintsByCost::check(_queryEnv.getProperties(), rankSetup.sort_blueprints_by_cost())); trace.addEvent(4, "Perform dictionary lookups and posting lists initialization"); double hitRate = std::min(1.0, double(maxNumHits)/double(searchContext.getDocIdLimit())); bool create_postinglist_when_non_strict = CreatePostingListWhenNonStrict::check(_queryEnv.getProperties(), rankSetup.create_postinglist_when_non_strict()); diff --git a/searchcore/src/vespa/searchcore/proton/matching/query.cpp b/searchcore/src/vespa/searchcore/proton/matching/query.cpp index b86ee931a53..149828b0a91 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/query.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/query.cpp @@ -198,9 +198,9 @@ Query::reserveHandles(const IRequestContext & requestContext, ISearchContext &co } void -Query::optimize(bool sort_by_estimate) +Query::optimize(bool sort_by_cost) { - (void) sort_by_estimate; + (void) sort_by_cost; _blueprint = Blueprint::optimize(std::move(_blueprint)); LOG(debug, "optimized blueprint:\n%s\n", _blueprint->asString().c_str()); } diff --git a/searchcore/src/vespa/searchcore/proton/matching/query.h b/searchcore/src/vespa/searchcore/proton/matching/query.h index 5148f8ba402..8062f12b70d 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/query.h +++ b/searchcore/src/vespa/searchcore/proton/matching/query.h @@ -103,7 +103,7 @@ public: * testing becomes harder. Not calling this function enables the * test to verify the original query without optimization. **/ - void optimize(bool sort_by_estimate); + void optimize(bool sort_by_cost); void fetchPostings(const ExecuteInfo & executeInfo) ; void handle_global_filter(const IRequestContext & requestContext, uint32_t docid_limit, diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp index 65e31a2bc60..1344a30e239 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp @@ -455,9 +455,9 @@ FuzzyAlgorithm::lookup(const Properties& props, vespalib::FuzzyMatchingAlgorithm return vespalib::fuzzy_matching_algorithm_from_string(value, default_value); } -const vespalib::string SortBlueprintsByEstimate::NAME("vespa.matching.sort_blueprints_by_estimate"); -const bool SortBlueprintsByEstimate::DEFAULT_VALUE(false); -bool SortBlueprintsByEstimate::check(const Properties &props, bool fallback) { +const vespalib::string SortBlueprintsByCost::NAME("vespa.matching.sort_blueprints_by_cost"); +const bool SortBlueprintsByCost::DEFAULT_VALUE(false); +bool SortBlueprintsByCost::check(const Properties &props, bool fallback) { return lookupBool(props, NAME, fallback); } diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.h b/searchlib/src/vespa/searchlib/fef/indexproperties.h index 13e053c8dcf..e0fee951670 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.h +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.h @@ -339,7 +339,7 @@ namespace matching { /** * Sort blueprints based on relative cost estimate rather than est_hits **/ - struct SortBlueprintsByEstimate { + struct SortBlueprintsByCost { static const vespalib::string NAME; static const bool DEFAULT_VALUE; static bool check(const Properties &props) { return check(props, DEFAULT_VALUE); } diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp index 0353879be14..e4a31d27fb1 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp @@ -56,7 +56,7 @@ RankSetup::RankSetup(const BlueprintFactory &factory, const IIndexEnvironment &i _dumpFeatures(), _warnings(), _feature_rename_map(), - _sort_blueprints_by_estimate(false), + _sort_blueprints_by_cost(false), _ignoreDefaultRankFeatures(false), _compiled(false), _compileError(false), @@ -138,7 +138,7 @@ RankSetup::configure() _mutateOnSummary._attribute = mutate::on_summary::Attribute::lookup(_indexEnv.getProperties()); _mutateOnSummary._operation = mutate::on_summary::Operation::lookup(_indexEnv.getProperties()); _mutateAllowQueryOverride = mutate::AllowQueryOverride::check(_indexEnv.getProperties()); - _sort_blueprints_by_estimate = matching::SortBlueprintsByEstimate::check(_indexEnv.getProperties()); + _sort_blueprints_by_cost = matching::SortBlueprintsByCost::check(_indexEnv.getProperties()); _always_mark_phrase_expensive = matching::AlwaysMarkPhraseExpensive::check(_indexEnv.getProperties()); _create_postinglist_when_non_strict = matching::CreatePostingListWhenNonStrict::check(_indexEnv.getProperties()); _use_estimate_for_fetch_postings = matching::UseEstimateForFetchPostings::check(_indexEnv.getProperties()); diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.h b/searchlib/src/vespa/searchlib/fef/ranksetup.h index f90ce2b0475..a74bf3335c6 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.h +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.h @@ -65,7 +65,7 @@ private: std::vector _dumpFeatures; Warnings _warnings; StringStringMap _feature_rename_map; - bool _sort_blueprints_by_estimate; + bool _sort_blueprints_by_cost; bool _ignoreDefaultRankFeatures; bool _compiled; bool _compileError; @@ -466,7 +466,7 @@ public: const MutateOperation & getMutateOnSummary() const { return _mutateOnSummary; } bool allowMutateQueryOverride() const { return _mutateAllowQueryOverride; } - bool sort_blueprints_by_estimate() const noexcept { return _sort_blueprints_by_estimate; } + bool sort_blueprints_by_cost() const noexcept { return _sort_blueprints_by_cost; } }; } -- cgit v1.2.3 From 29a322a5ed88935cf225b913ec652be0cdb056ef Mon Sep 17 00:00:00 2001 From: Geir Storli Date: Tue, 19 Dec 2023 12:48:19 +0000 Subject: Support TermFieldMatchData where doUnpack() sets docid. This will be needed for an InTerm used for ranking, e.g. the matches rank feature. --- .../multi_term_or_filter_search_test.cpp | 8 ++++- .../attribute/multi_term_or_filter_search.cpp | 36 ++++++++++++++++------ .../attribute/multi_term_or_filter_search.h | 2 ++ 3 files changed, 36 insertions(+), 10 deletions(-) (limited to 'searchlib') diff --git a/searchlib/src/tests/attribute/multi_term_or_filter_search/multi_term_or_filter_search_test.cpp b/searchlib/src/tests/attribute/multi_term_or_filter_search/multi_term_or_filter_search_test.cpp index dea2702ef0d..552a128c518 100644 --- a/searchlib/src/tests/attribute/multi_term_or_filter_search/multi_term_or_filter_search_test.cpp +++ b/searchlib/src/tests/attribute/multi_term_or_filter_search/multi_term_or_filter_search_test.cpp @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #define ENABLE_GTEST_MIGRATION @@ -11,13 +12,16 @@ using PostingList = search::attribute::PostingListTraits::PostingStoreBase; using Iterator = search::attribute::PostingListTraits::const_iterator; using KeyData = PostingList::KeyDataType; + using search::BitVector; using search::attribute::MultiTermOrFilterSearch; +using search::fef::TermFieldMatchData; using search::queryeval::SearchIterator; using vespalib::datastore::EntryRef; class MultiTermOrFilterSearchTest : public ::testing::Test { PostingList _postings; + mutable TermFieldMatchData _tfmd; vespalib::GenerationHandler _gens; std::vector _trees; uint32_t _range_start; @@ -62,7 +66,7 @@ public: for (size_t i = 0; i < num_trees(); ++i) { iterators.emplace_back(get_tree(i)); } - auto result = MultiTermOrFilterSearch::create(std::move(iterators)); + auto result = MultiTermOrFilterSearch::create(std::move(iterators), _tfmd); result->initRange(_range_start, _range_end); return result; }; @@ -73,6 +77,8 @@ public: while (doc_id < _range_end) { if (iterator.seek(doc_id)) { result.emplace_back(doc_id); + iterator.unpack(doc_id); + EXPECT_EQ(doc_id, _tfmd.getDocId()); ++doc_id; } else { doc_id = std::max(doc_id + 1, iterator.getDocId()); diff --git a/searchlib/src/vespa/searchlib/attribute/multi_term_or_filter_search.cpp b/searchlib/src/vespa/searchlib/attribute/multi_term_or_filter_search.cpp index 19668522e17..214851e9681 100644 --- a/searchlib/src/vespa/searchlib/attribute/multi_term_or_filter_search.cpp +++ b/searchlib/src/vespa/searchlib/attribute/multi_term_or_filter_search.cpp @@ -15,14 +15,19 @@ template class MultiTermOrFilterSearchImpl : public MultiTermOrFilterSearch { IteratorPack _children; + fef::TermFieldMatchData* _tfmd; void seek_all(uint32_t docId); public: - explicit MultiTermOrFilterSearchImpl(IteratorPack&& children); + explicit MultiTermOrFilterSearchImpl(IteratorPack&& children, fef::TermFieldMatchData* tfmd); ~MultiTermOrFilterSearchImpl() override; void doSeek(uint32_t docId) override; - void doUnpack(uint32_t) override { } + void doUnpack(uint32_t docid) override { + if (_tfmd != nullptr) { + _tfmd->resetOnlyDocId(docid); + } + } void initRange(uint32_t begin, uint32_t end) override { SearchIterator::initRange(begin, end); @@ -46,9 +51,10 @@ public: }; template -MultiTermOrFilterSearchImpl::MultiTermOrFilterSearchImpl(IteratorPack&& children) +MultiTermOrFilterSearchImpl::MultiTermOrFilterSearchImpl(IteratorPack&& children, fef::TermFieldMatchData* tfmd) : MultiTermOrFilterSearch(), - _children(std::move(children)) + _children(std::move(children)), + _tfmd(tfmd) { } @@ -89,7 +95,7 @@ namespace { template std::unique_ptr -create_helper(std::vector&& children) +create_helper(std::vector&& children, fef::TermFieldMatchData* tfmd) { if (children.empty()) { return std::make_unique(); @@ -97,7 +103,7 @@ create_helper(std::vector&& children) std::sort(children.begin(), children.end(), [](const auto & a, const auto & b) { return a.size() > b.size(); }); using OrFilter = MultiTermOrFilterSearchImpl; - return std::make_unique(IteratorPackType(std::move(children))); + return std::make_unique(IteratorPackType(std::move(children)), tfmd); } } @@ -106,13 +112,25 @@ create_helper(std::vector&& children) std::unique_ptr MultiTermOrFilterSearch::create(std::vector&& children) { - return create_helper(std::move(children)); + return create_helper(std::move(children), nullptr); +} + +std::unique_ptr +MultiTermOrFilterSearch::create(std::vector&& children, fef::TermFieldMatchData& tfmd) +{ + return create_helper(std::move(children), &tfmd); } std::unique_ptr MultiTermOrFilterSearch::create(std::vector&& children) { - return create_helper(std::move(children)); + return create_helper(std::move(children), nullptr); +} + +std::unique_ptr +MultiTermOrFilterSearch::create(std::vector&& children, fef::TermFieldMatchData& tfmd) +{ + return create_helper(std::move(children), &tfmd); } std::unique_ptr @@ -123,7 +141,7 @@ MultiTermOrFilterSearch::create(const std::vector& children, return std::make_unique(); } else { using OrFilter = MultiTermOrFilterSearchImpl; - return std::make_unique(SearchIteratorPack(children, std::move(md))); + return std::make_unique(SearchIteratorPack(children, std::move(md)), nullptr); } } diff --git a/searchlib/src/vespa/searchlib/attribute/multi_term_or_filter_search.h b/searchlib/src/vespa/searchlib/attribute/multi_term_or_filter_search.h index 42eb33d2eed..1e8227c3007 100644 --- a/searchlib/src/vespa/searchlib/attribute/multi_term_or_filter_search.h +++ b/searchlib/src/vespa/searchlib/attribute/multi_term_or_filter_search.h @@ -18,7 +18,9 @@ protected: MultiTermOrFilterSearch() = default; public: static std::unique_ptr create(std::vector&& children); + static std::unique_ptr create(std::vector&& children, fef::TermFieldMatchData& tfmd); static std::unique_ptr create(std::vector&& children); + static std::unique_ptr create(std::vector&& children, fef::TermFieldMatchData& tfmd); static std::unique_ptr create(const std::vector& children, std::unique_ptr md); }; -- cgit v1.2.3 From b6368d66d4169e93c98df5fc6fe4df7cc9986c8b Mon Sep 17 00:00:00 2001 From: Morten Tokle Date: Mon, 18 Dec 2023 15:35:09 +0100 Subject: Fix more xxe prevention --- .../provider/FilesApplicationPackage.java | 4 ++- .../java/com/yahoo/searchlib/gbdt/XmlHelper.java | 36 +++++++++++++--------- vespajlib/src/main/java/com/yahoo/text/XML.java | 15 +++++++++ 3 files changed, 39 insertions(+), 16 deletions(-) (limited to 'searchlib') diff --git a/config-application-package/src/main/java/com/yahoo/config/model/application/provider/FilesApplicationPackage.java b/config-application-package/src/main/java/com/yahoo/config/model/application/provider/FilesApplicationPackage.java index 3df11855f75..ab5645eb50d 100644 --- a/config-application-package/src/main/java/com/yahoo/config/model/application/provider/FilesApplicationPackage.java +++ b/config-application-package/src/main/java/com/yahoo/config/model/application/provider/FilesApplicationPackage.java @@ -27,6 +27,7 @@ import com.yahoo.io.IOUtils; import com.yahoo.io.reader.NamedReader; import com.yahoo.path.Path; import com.yahoo.text.Utf8; +import com.yahoo.text.XML; import com.yahoo.vespa.config.ConfigDefinition; import com.yahoo.vespa.config.ConfigDefinitionBuilder; import com.yahoo.vespa.config.ConfigDefinitionKey; @@ -36,6 +37,7 @@ import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; + import javax.xml.parsers.ParserConfigurationException; import javax.xml.transform.TransformerException; import javax.xml.transform.TransformerFactory; @@ -166,7 +168,7 @@ public class FilesApplicationPackage extends AbstractApplicationPackage { configDefsDir = applicationFile(appDir, CONFIG_DEFINITIONS_DIR); addUserIncludeDirs(); this.metaData = metaData; - transformerFactory = TransformerFactory.newInstance(); + this.transformerFactory = XML.createTransformerFactory(); } @Override diff --git a/searchlib/src/main/java/com/yahoo/searchlib/gbdt/XmlHelper.java b/searchlib/src/main/java/com/yahoo/searchlib/gbdt/XmlHelper.java index fce0485f41a..60617687f44 100644 --- a/searchlib/src/main/java/com/yahoo/searchlib/gbdt/XmlHelper.java +++ b/searchlib/src/main/java/com/yahoo/searchlib/gbdt/XmlHelper.java @@ -7,6 +7,7 @@ import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.xml.sax.SAXException; +import javax.xml.XMLConstants; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; @@ -15,21 +16,21 @@ import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.LinkedList; import java.util.List; import java.util.Optional; +import java.util.logging.Level; +import java.util.logging.Logger; /** * @author Simon Thoresen Hult */ abstract class XmlHelper { - - private static final Charset UTF8 = Charset.forName("UTF-8"); - public static Element parseXml(String xml) throws ParserConfigurationException, IOException, SAXException { - return parseXmlStream(new ByteArrayInputStream(xml.getBytes(UTF8))); + return parseXmlStream(new ByteArrayInputStream(xml.getBytes(StandardCharsets.UTF_8))); } public static Element parseXmlFile(String fileName) @@ -41,22 +42,27 @@ abstract class XmlHelper { public static Element parseXmlStream(InputStream in) throws ParserConfigurationException, IOException, SAXException { - DocumentBuilderFactory factory = createDocumentBuilderFactory(); - DocumentBuilder builder = factory.newDocumentBuilder(); + DocumentBuilder builder = createDocumentBuilderFactory().newDocumentBuilder(); Document doc = builder.parse(in); return doc.getDocumentElement(); } - private static DocumentBuilderFactory createDocumentBuilderFactory() throws ParserConfigurationException { - DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); - factory.setNamespaceAware(true); - factory.setXIncludeAware(false); + private static DocumentBuilderFactory createDocumentBuilderFactory() { + try { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware(true); + factory.setXIncludeAware(false); + factory.setExpandEntityReferences(false); + factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); - // XXE prevention - factory.setFeature("http://xml.org/sax/features/external-general-entities", false); - factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); - factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); - return factory; + // XXE prevention + factory.setFeature("http://xml.org/sax/features/external-general-entities", false); + factory.setFeature("http://xml.org/sax/features/external-parameter-entities", false); + factory.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false); + return factory; + } catch (ParserConfigurationException e) { + throw new RuntimeException("Failed to initialize XML parser", e); + } } public static String getAttributeText(Node node, String name) { diff --git a/vespajlib/src/main/java/com/yahoo/text/XML.java b/vespajlib/src/main/java/com/yahoo/text/XML.java index a6e36a0c3e1..72a2dba54e1 100644 --- a/vespajlib/src/main/java/com/yahoo/text/XML.java +++ b/vespajlib/src/main/java/com/yahoo/text/XML.java @@ -9,9 +9,11 @@ import org.xml.sax.InputSource; import org.xml.sax.SAXException; import org.xml.sax.SAXParseException; +import javax.xml.XMLConstants; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.TransformerFactory; import java.io.File; import java.io.IOException; import java.io.Reader; @@ -445,6 +447,19 @@ public class XML { return valid; } + /** + * Creates a new XML TransformerFactory. + * + * @return a TransformerFactory + */ + public static TransformerFactory createTransformerFactory() { + TransformerFactory transformerFactory = TransformerFactory.newInstance(); + transformerFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_DTD, ""); + transformerFactory.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); + return transformerFactory; + } + + /** * The point of this weird class and the jumble of abstract methods is * linking the scan for characters that must be quoted into the quoting -- cgit v1.2.3 From 5348bb4a85e8a1364971f83f81f3ac458631d7f3 Mon Sep 17 00:00:00 2001 From: Geir Storli Date: Tue, 19 Dec 2023 13:41:57 +0000 Subject: Align naming of test and iterator for low-level posting list access. --- searchlib/CMakeLists.txt | 2 +- .../tests/attribute/bitvector/bitvector_test.cpp | 4 +- .../attribute/direct_posting_store/.gitignore | 1 + .../attribute/direct_posting_store/CMakeLists.txt | 9 + .../direct_posting_store_test.cpp | 226 +++++++++++++++++++++ .../attribute/document_weight_iterator/.gitignore | 1 - .../document_weight_iterator/CMakeLists.txt | 9 - .../document_weight_iterator_test.cpp | 226 --------------------- .../attribute_searchable_adapter_test.cpp | 6 +- .../parallel_weak_and/parallel_weak_and_test.cpp | 4 +- .../attribute/attribute_blueprint_factory.cpp | 4 +- .../src/vespa/searchlib/queryeval/CMakeLists.txt | 2 +- .../docid_with_weight_search_iterator.cpp | 3 + .../queryeval/docid_with_weight_search_iterator.h | 60 ++++++ .../queryeval/document_weight_search_iterator.cpp | 3 - .../queryeval/document_weight_search_iterator.h | 55 ----- .../queryeval/wand/parallel_weak_and_search.cpp | 4 +- 17 files changed, 312 insertions(+), 307 deletions(-) create mode 100644 searchlib/src/tests/attribute/direct_posting_store/.gitignore create mode 100644 searchlib/src/tests/attribute/direct_posting_store/CMakeLists.txt create mode 100644 searchlib/src/tests/attribute/direct_posting_store/direct_posting_store_test.cpp delete mode 100644 searchlib/src/tests/attribute/document_weight_iterator/.gitignore delete mode 100644 searchlib/src/tests/attribute/document_weight_iterator/CMakeLists.txt delete mode 100644 searchlib/src/tests/attribute/document_weight_iterator/document_weight_iterator_test.cpp create mode 100644 searchlib/src/vespa/searchlib/queryeval/docid_with_weight_search_iterator.cpp create mode 100644 searchlib/src/vespa/searchlib/queryeval/docid_with_weight_search_iterator.h delete mode 100644 searchlib/src/vespa/searchlib/queryeval/document_weight_search_iterator.cpp delete mode 100644 searchlib/src/vespa/searchlib/queryeval/document_weight_search_iterator.h (limited to 'searchlib') diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index c9e19b76cef..5628db99171 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -77,7 +77,7 @@ vespa_define_module( src/tests/attribute/compaction src/tests/attribute/dfa_fuzzy_matcher src/tests/attribute/direct_multi_term_blueprint - src/tests/attribute/document_weight_iterator + src/tests/attribute/direct_posting_store src/tests/attribute/enum_attribute_compaction src/tests/attribute/enum_comparator src/tests/attribute/enumeratedsave diff --git a/searchlib/src/tests/attribute/bitvector/bitvector_test.cpp b/searchlib/src/tests/attribute/bitvector/bitvector_test.cpp index 181c0fdf110..f612bdda87f 100644 --- a/searchlib/src/tests/attribute/bitvector/bitvector_test.cpp +++ b/searchlib/src/tests/attribute/bitvector/bitvector_test.cpp @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include #include #include @@ -432,7 +432,7 @@ BitVectorTest::test(BasicType bt, CollectionType ct, const vespalib::string &pre const auto* dww = v->as_docid_with_weight_posting_store(); if (dww != nullptr) { auto lres = dww->lookup(getSearchStr(), dww->get_dictionary_snapshot()); - using DWSI = search::queryeval::DocumentWeightSearchIterator; + using DWSI = search::queryeval::DocidWithWeightSearchIterator; TermFieldMatchData md; auto dwsi = std::make_unique(md, *dww, lres); if (!filter) { diff --git a/searchlib/src/tests/attribute/direct_posting_store/.gitignore b/searchlib/src/tests/attribute/direct_posting_store/.gitignore new file mode 100644 index 00000000000..5516bc721c7 --- /dev/null +++ b/searchlib/src/tests/attribute/direct_posting_store/.gitignore @@ -0,0 +1 @@ +searchlib_direct_posting_store_test_app diff --git a/searchlib/src/tests/attribute/direct_posting_store/CMakeLists.txt b/searchlib/src/tests/attribute/direct_posting_store/CMakeLists.txt new file mode 100644 index 00000000000..0cb45e6b18f --- /dev/null +++ b/searchlib/src/tests/attribute/direct_posting_store/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_direct_posting_store_test_app TEST + SOURCES + direct_posting_store_test.cpp + DEPENDS + searchlib + searchlib_test +) +vespa_add_test(NAME searchlib_direct_posting_store_test_app COMMAND searchlib_direct_posting_store_test_app) diff --git a/searchlib/src/tests/attribute/direct_posting_store/direct_posting_store_test.cpp b/searchlib/src/tests/attribute/direct_posting_store/direct_posting_store_test.cpp new file mode 100644 index 00000000000..d6b0ff2a4b6 --- /dev/null +++ b/searchlib/src/tests/attribute/direct_posting_store/direct_posting_store_test.cpp @@ -0,0 +1,226 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +LOG_SETUP("direct_posting_store_test"); + +using namespace search; +using namespace search::attribute; + +AttributeVector::SP make_attribute(BasicType type, CollectionType collection, bool fast_search) { + Config cfg(type, collection); + cfg.setFastSearch(fast_search); + return AttributeFactory::createAttribute("my_attribute", cfg); +} + +void add_docs(AttributeVector::SP attr_ptr, size_t limit = 1000) { + AttributeVector::DocId docid; + for (size_t i = 0; i < limit; ++i) { + attr_ptr->addDoc(docid); + } + attr_ptr->commit(); + ASSERT_EQUAL((limit - 1), docid); +} + +template +void set_doc(ATTR *attr, uint32_t docid, KEY key, int32_t weight) { + attr->clearDoc(docid); + attr->append(docid, key, weight); + attr->commit(); +} + +void populate_long(AttributeVector::SP attr_ptr) { + IntegerAttribute *attr = static_cast(attr_ptr.get()); + set_doc(attr, 1, int64_t(111), 20); + set_doc(attr, 5, int64_t(111), 5); + set_doc(attr, 7, int64_t(111), 10); +} + +void populate_string(AttributeVector::SP attr_ptr) { + StringAttribute *attr = static_cast(attr_ptr.get()); + set_doc(attr, 1, "foo", 20); + set_doc(attr, 5, "foo", 5); + set_doc(attr, 7, "foo", 10); +} + +struct LongFixture { + AttributeVector::SP attr; + const IDocidWithWeightPostingStore *api; + LongFixture() : attr(make_attribute(BasicType::INT64, CollectionType::WSET, true)), + api(attr->as_docid_with_weight_posting_store()) + { + ASSERT_TRUE(api != nullptr); + add_docs(attr); + populate_long(attr); + } +}; + +struct StringFixture { + AttributeVector::SP attr; + const IDocidWithWeightPostingStore *api; + StringFixture() : attr(make_attribute(BasicType::STRING, CollectionType::WSET, true)), + api(attr->as_docid_with_weight_posting_store()) + { + ASSERT_TRUE(api != nullptr); + add_docs(attr); + populate_string(attr); + } +}; + +TEST("require that appropriate attributes support the IDocidWithWeightPostingStore interface") { + EXPECT_TRUE(make_attribute(BasicType::INT64, CollectionType::WSET, true)->as_docid_with_weight_posting_store() != nullptr); + EXPECT_TRUE(make_attribute(BasicType::STRING, CollectionType::WSET, true)->as_docid_with_weight_posting_store() != nullptr); +} + +TEST("require that inappropriate attributes do not support the IDocidWithWeightPostingStore interface") { + EXPECT_TRUE(make_attribute(BasicType::INT64, CollectionType::SINGLE, false)->as_docid_with_weight_posting_store() == nullptr); + EXPECT_TRUE(make_attribute(BasicType::INT64, CollectionType::ARRAY, false)->as_docid_with_weight_posting_store() == nullptr); + EXPECT_TRUE(make_attribute(BasicType::INT64, CollectionType::WSET, false)->as_docid_with_weight_posting_store() == nullptr); + EXPECT_TRUE(make_attribute(BasicType::INT64, CollectionType::SINGLE, true)->as_docid_with_weight_posting_store() == nullptr); + EXPECT_TRUE(make_attribute(BasicType::INT64, CollectionType::ARRAY, true)->as_docid_with_weight_posting_store() == nullptr); + EXPECT_TRUE(make_attribute(BasicType::STRING, CollectionType::SINGLE, false)->as_docid_with_weight_posting_store() == nullptr); + EXPECT_TRUE(make_attribute(BasicType::STRING, CollectionType::ARRAY, false)->as_docid_with_weight_posting_store() == nullptr); + EXPECT_TRUE(make_attribute(BasicType::STRING, CollectionType::WSET, false)->as_docid_with_weight_posting_store() == nullptr); + EXPECT_TRUE(make_attribute(BasicType::STRING, CollectionType::SINGLE, true)->as_docid_with_weight_posting_store() == nullptr); + EXPECT_TRUE(make_attribute(BasicType::STRING, CollectionType::ARRAY, true)->as_docid_with_weight_posting_store() == nullptr); + EXPECT_TRUE(make_attribute(BasicType::INT32, CollectionType::WSET, true)->as_docid_with_weight_posting_store() == nullptr); + EXPECT_TRUE(make_attribute(BasicType::DOUBLE, CollectionType::WSET, true)->as_docid_with_weight_posting_store() == nullptr); +} + +void verify_valid_lookup(IDirectPostingStore::LookupResult result) { + EXPECT_TRUE(result.posting_idx.valid()); + EXPECT_EQUAL(3u, result.posting_size); + EXPECT_EQUAL(5, result.min_weight); + EXPECT_EQUAL(20, result.max_weight); +} + +void verify_invalid_lookup(IDirectPostingStore::LookupResult result) { + EXPECT_FALSE(result.posting_idx.valid()); + EXPECT_EQUAL(0u, result.posting_size); + EXPECT_EQUAL(0, result.min_weight); + EXPECT_EQUAL(0, result.max_weight); +} + +TEST_F("require that integer lookup works correctly", LongFixture) { + verify_valid_lookup(f1.api->lookup("111", f1.api->get_dictionary_snapshot())); + verify_invalid_lookup(f1.api->lookup("222", f1.api->get_dictionary_snapshot())); +} + +TEST_F("require string lookup works correctly", StringFixture) { + verify_valid_lookup(f1.api->lookup("foo", f1.api->get_dictionary_snapshot())); + verify_invalid_lookup(f1.api->lookup("bar", f1.api->get_dictionary_snapshot())); +} + +void verify_posting(const IDocidWithWeightPostingStore &api, const char *term) { + auto result = api.lookup(term, api.get_dictionary_snapshot()); + ASSERT_TRUE(result.posting_idx.valid()); + std::vector itr_store; + api.create(result.posting_idx, itr_store); + ASSERT_EQUAL(1u, itr_store.size()); + { + DocidWithWeightIterator &itr = itr_store[0]; + if (itr.valid() && itr.getKey() < 1) { + itr.linearSeek(1); + } + ASSERT_TRUE(itr.valid()); + EXPECT_EQUAL(1u, itr.getKey()); // docid + EXPECT_EQUAL(20, itr.getData()); // weight + itr.linearSeek(2); + ASSERT_TRUE(itr.valid()); + EXPECT_EQUAL(5u, itr.getKey()); // docid + EXPECT_EQUAL(5, itr.getData()); // weight + itr.linearSeek(6); + ASSERT_TRUE(itr.valid()); + EXPECT_EQUAL(7u, itr.getKey()); // docid + EXPECT_EQUAL(10, itr.getData()); // weight + itr.linearSeek(8); + EXPECT_FALSE(itr.valid()); + } +} + +TEST_F("require that integer iterators are created correctly", LongFixture) { + verify_posting(*f1.api, "111"); +} + +TEST_F("require that string iterators are created correctly", StringFixture) { + verify_posting(*f1.api, "foo"); +} + +TEST_F("require that collect_folded works for string", StringFixture) +{ + StringAttribute *attr = static_cast(f1.attr.get()); + set_doc(attr, 2, "bar", 30); + attr->commit(); + set_doc(attr, 3, "FOO", 30); + attr->commit(); + auto dictionary_snapshot = f1.api->get_dictionary_snapshot(); + auto lookup1 = f1.api->lookup("foo", dictionary_snapshot); + std::vector folded; + std::function save_folded = [&folded,attr](vespalib::datastore::EntryRef enum_idx) { folded.emplace_back(attr->getFromEnum(enum_idx.ref())); }; + f1.api->collect_folded(lookup1.enum_idx, dictionary_snapshot, save_folded); + std::vector expected_folded{"FOO", "foo"}; + EXPECT_EQUAL(expected_folded, folded); +} + +TEST_F("require that collect_folded works for integers", LongFixture) +{ + IntegerAttributeTemplate *attr = dynamic_cast *>(f1.attr.get()); + set_doc(attr, 2, int64_t(112), 30); + attr->commit(); + auto dictionary_snapshot = f1.api->get_dictionary_snapshot(); + auto lookup1 = f1.api->lookup("111", dictionary_snapshot); + std::vector folded; + std::function save_folded = [&folded,attr](vespalib::datastore::EntryRef enum_idx) { folded.emplace_back(attr->getFromEnum(enum_idx.ref())); }; + f1.api->collect_folded(lookup1.enum_idx, dictionary_snapshot, save_folded); + std::vector expected_folded{int64_t(111)}; + EXPECT_EQUAL(expected_folded, folded); +} + +class Verifier : public search::test::SearchIteratorVerifier { +public: + Verifier(); + ~Verifier(); + SearchIterator::UP create(bool strict) const override { + (void) strict; + const auto* api = _attr->as_docid_with_weight_posting_store(); + ASSERT_TRUE(api != nullptr); + auto dict_entry = api->lookup("123", api->get_dictionary_snapshot()); + ASSERT_TRUE(dict_entry.posting_idx.valid()); + return std::make_unique(_tfmd, *api, dict_entry); + } +private: + mutable fef::TermFieldMatchData _tfmd; + AttributeVector::SP _attr; +}; + +Verifier::Verifier() + : _attr(make_attribute(BasicType::INT64, CollectionType::WSET, true)) +{ + add_docs(_attr, getDocIdLimit()); + auto docids = getExpectedDocIds(); + IntegerAttribute *int_attr = static_cast(_attr.get()); + for (auto docid: docids) { + set_doc(int_attr, docid, int64_t(123), 1); + } +} +Verifier::~Verifier() {} + +TEST("verify document weight search iterator") { + Verifier verifier; + verifier.verify(); +} + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/tests/attribute/document_weight_iterator/.gitignore b/searchlib/src/tests/attribute/document_weight_iterator/.gitignore deleted file mode 100644 index 08cae9a48df..00000000000 --- a/searchlib/src/tests/attribute/document_weight_iterator/.gitignore +++ /dev/null @@ -1 +0,0 @@ -searchlib_document_weight_iterator_test_app diff --git a/searchlib/src/tests/attribute/document_weight_iterator/CMakeLists.txt b/searchlib/src/tests/attribute/document_weight_iterator/CMakeLists.txt deleted file mode 100644 index 4cb480068e3..00000000000 --- a/searchlib/src/tests/attribute/document_weight_iterator/CMakeLists.txt +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -vespa_add_executable(searchlib_document_weight_iterator_test_app TEST - SOURCES - document_weight_iterator_test.cpp - DEPENDS - searchlib - searchlib_test -) -vespa_add_test(NAME searchlib_document_weight_iterator_test_app COMMAND searchlib_document_weight_iterator_test_app) diff --git a/searchlib/src/tests/attribute/document_weight_iterator/document_weight_iterator_test.cpp b/searchlib/src/tests/attribute/document_weight_iterator/document_weight_iterator_test.cpp deleted file mode 100644 index 28416d09d6f..00000000000 --- a/searchlib/src/tests/attribute/document_weight_iterator/document_weight_iterator_test.cpp +++ /dev/null @@ -1,226 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -LOG_SETUP("document_weight_iterator_test"); - -using namespace search; -using namespace search::attribute; - -AttributeVector::SP make_attribute(BasicType type, CollectionType collection, bool fast_search) { - Config cfg(type, collection); - cfg.setFastSearch(fast_search); - return AttributeFactory::createAttribute("my_attribute", cfg); -} - -void add_docs(AttributeVector::SP attr_ptr, size_t limit = 1000) { - AttributeVector::DocId docid; - for (size_t i = 0; i < limit; ++i) { - attr_ptr->addDoc(docid); - } - attr_ptr->commit(); - ASSERT_EQUAL((limit - 1), docid); -} - -template -void set_doc(ATTR *attr, uint32_t docid, KEY key, int32_t weight) { - attr->clearDoc(docid); - attr->append(docid, key, weight); - attr->commit(); -} - -void populate_long(AttributeVector::SP attr_ptr) { - IntegerAttribute *attr = static_cast(attr_ptr.get()); - set_doc(attr, 1, int64_t(111), 20); - set_doc(attr, 5, int64_t(111), 5); - set_doc(attr, 7, int64_t(111), 10); -} - -void populate_string(AttributeVector::SP attr_ptr) { - StringAttribute *attr = static_cast(attr_ptr.get()); - set_doc(attr, 1, "foo", 20); - set_doc(attr, 5, "foo", 5); - set_doc(attr, 7, "foo", 10); -} - -struct LongFixture { - AttributeVector::SP attr; - const IDocidWithWeightPostingStore *api; - LongFixture() : attr(make_attribute(BasicType::INT64, CollectionType::WSET, true)), - api(attr->as_docid_with_weight_posting_store()) - { - ASSERT_TRUE(api != nullptr); - add_docs(attr); - populate_long(attr); - } -}; - -struct StringFixture { - AttributeVector::SP attr; - const IDocidWithWeightPostingStore *api; - StringFixture() : attr(make_attribute(BasicType::STRING, CollectionType::WSET, true)), - api(attr->as_docid_with_weight_posting_store()) - { - ASSERT_TRUE(api != nullptr); - add_docs(attr); - populate_string(attr); - } -}; - -TEST("require that appropriate attributes support the document weight attribute interface") { - EXPECT_TRUE(make_attribute(BasicType::INT64, CollectionType::WSET, true)->as_docid_with_weight_posting_store() != nullptr); - EXPECT_TRUE(make_attribute(BasicType::STRING, CollectionType::WSET, true)->as_docid_with_weight_posting_store() != nullptr); -} - -TEST("require that inappropriate attributes do not support the document weight attribute interface") { - EXPECT_TRUE(make_attribute(BasicType::INT64, CollectionType::SINGLE, false)->as_docid_with_weight_posting_store() == nullptr); - EXPECT_TRUE(make_attribute(BasicType::INT64, CollectionType::ARRAY, false)->as_docid_with_weight_posting_store() == nullptr); - EXPECT_TRUE(make_attribute(BasicType::INT64, CollectionType::WSET, false)->as_docid_with_weight_posting_store() == nullptr); - EXPECT_TRUE(make_attribute(BasicType::INT64, CollectionType::SINGLE, true)->as_docid_with_weight_posting_store() == nullptr); - EXPECT_TRUE(make_attribute(BasicType::INT64, CollectionType::ARRAY, true)->as_docid_with_weight_posting_store() == nullptr); - EXPECT_TRUE(make_attribute(BasicType::STRING, CollectionType::SINGLE, false)->as_docid_with_weight_posting_store() == nullptr); - EXPECT_TRUE(make_attribute(BasicType::STRING, CollectionType::ARRAY, false)->as_docid_with_weight_posting_store() == nullptr); - EXPECT_TRUE(make_attribute(BasicType::STRING, CollectionType::WSET, false)->as_docid_with_weight_posting_store() == nullptr); - EXPECT_TRUE(make_attribute(BasicType::STRING, CollectionType::SINGLE, true)->as_docid_with_weight_posting_store() == nullptr); - EXPECT_TRUE(make_attribute(BasicType::STRING, CollectionType::ARRAY, true)->as_docid_with_weight_posting_store() == nullptr); - EXPECT_TRUE(make_attribute(BasicType::INT32, CollectionType::WSET, true)->as_docid_with_weight_posting_store() == nullptr); - EXPECT_TRUE(make_attribute(BasicType::DOUBLE, CollectionType::WSET, true)->as_docid_with_weight_posting_store() == nullptr); -} - -void verify_valid_lookup(IDirectPostingStore::LookupResult result) { - EXPECT_TRUE(result.posting_idx.valid()); - EXPECT_EQUAL(3u, result.posting_size); - EXPECT_EQUAL(5, result.min_weight); - EXPECT_EQUAL(20, result.max_weight); -} - -void verify_invalid_lookup(IDirectPostingStore::LookupResult result) { - EXPECT_FALSE(result.posting_idx.valid()); - EXPECT_EQUAL(0u, result.posting_size); - EXPECT_EQUAL(0, result.min_weight); - EXPECT_EQUAL(0, result.max_weight); -} - -TEST_F("require that integer lookup works correctly", LongFixture) { - verify_valid_lookup(f1.api->lookup("111", f1.api->get_dictionary_snapshot())); - verify_invalid_lookup(f1.api->lookup("222", f1.api->get_dictionary_snapshot())); -} - -TEST_F("require string lookup works correctly", StringFixture) { - verify_valid_lookup(f1.api->lookup("foo", f1.api->get_dictionary_snapshot())); - verify_invalid_lookup(f1.api->lookup("bar", f1.api->get_dictionary_snapshot())); -} - -void verify_posting(const IDocidWithWeightPostingStore &api, const char *term) { - auto result = api.lookup(term, api.get_dictionary_snapshot()); - ASSERT_TRUE(result.posting_idx.valid()); - std::vector itr_store; - api.create(result.posting_idx, itr_store); - ASSERT_EQUAL(1u, itr_store.size()); - { - DocidWithWeightIterator &itr = itr_store[0]; - if (itr.valid() && itr.getKey() < 1) { - itr.linearSeek(1); - } - ASSERT_TRUE(itr.valid()); - EXPECT_EQUAL(1u, itr.getKey()); // docid - EXPECT_EQUAL(20, itr.getData()); // weight - itr.linearSeek(2); - ASSERT_TRUE(itr.valid()); - EXPECT_EQUAL(5u, itr.getKey()); // docid - EXPECT_EQUAL(5, itr.getData()); // weight - itr.linearSeek(6); - ASSERT_TRUE(itr.valid()); - EXPECT_EQUAL(7u, itr.getKey()); // docid - EXPECT_EQUAL(10, itr.getData()); // weight - itr.linearSeek(8); - EXPECT_FALSE(itr.valid()); - } -} - -TEST_F("require that integer iterators are created correctly", LongFixture) { - verify_posting(*f1.api, "111"); -} - -TEST_F("require that string iterators are created correctly", StringFixture) { - verify_posting(*f1.api, "foo"); -} - -TEST_F("require that collect_folded works for string", StringFixture) -{ - StringAttribute *attr = static_cast(f1.attr.get()); - set_doc(attr, 2, "bar", 30); - attr->commit(); - set_doc(attr, 3, "FOO", 30); - attr->commit(); - auto dictionary_snapshot = f1.api->get_dictionary_snapshot(); - auto lookup1 = f1.api->lookup("foo", dictionary_snapshot); - std::vector folded; - std::function save_folded = [&folded,attr](vespalib::datastore::EntryRef enum_idx) { folded.emplace_back(attr->getFromEnum(enum_idx.ref())); }; - f1.api->collect_folded(lookup1.enum_idx, dictionary_snapshot, save_folded); - std::vector expected_folded{"FOO", "foo"}; - EXPECT_EQUAL(expected_folded, folded); -} - -TEST_F("require that collect_folded works for integers", LongFixture) -{ - IntegerAttributeTemplate *attr = dynamic_cast *>(f1.attr.get()); - set_doc(attr, 2, int64_t(112), 30); - attr->commit(); - auto dictionary_snapshot = f1.api->get_dictionary_snapshot(); - auto lookup1 = f1.api->lookup("111", dictionary_snapshot); - std::vector folded; - std::function save_folded = [&folded,attr](vespalib::datastore::EntryRef enum_idx) { folded.emplace_back(attr->getFromEnum(enum_idx.ref())); }; - f1.api->collect_folded(lookup1.enum_idx, dictionary_snapshot, save_folded); - std::vector expected_folded{int64_t(111)}; - EXPECT_EQUAL(expected_folded, folded); -} - -class Verifier : public search::test::SearchIteratorVerifier { -public: - Verifier(); - ~Verifier(); - SearchIterator::UP create(bool strict) const override { - (void) strict; - const auto* api = _attr->as_docid_with_weight_posting_store(); - ASSERT_TRUE(api != nullptr); - auto dict_entry = api->lookup("123", api->get_dictionary_snapshot()); - ASSERT_TRUE(dict_entry.posting_idx.valid()); - return std::make_unique(_tfmd, *api, dict_entry); - } -private: - mutable fef::TermFieldMatchData _tfmd; - AttributeVector::SP _attr; -}; - -Verifier::Verifier() - : _attr(make_attribute(BasicType::INT64, CollectionType::WSET, true)) -{ - add_docs(_attr, getDocIdLimit()); - auto docids = getExpectedDocIds(); - IntegerAttribute *int_attr = static_cast(_attr.get()); - for (auto docid: docids) { - set_doc(int_attr, docid, int64_t(123), 1); - } -} -Verifier::~Verifier() {} - -TEST("verify document weight search iterator") { - Verifier verifier; - verifier.verify(); -} - -TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/tests/attribute/searchable/attribute_searchable_adapter_test.cpp b/searchlib/src/tests/attribute/searchable/attribute_searchable_adapter_test.cpp index 8831bd1ec75..ecc03ac54c5 100644 --- a/searchlib/src/tests/attribute/searchable/attribute_searchable_adapter_test.cpp +++ b/searchlib/src/tests/attribute/searchable/attribute_searchable_adapter_test.cpp @@ -488,11 +488,11 @@ TEST("require that direct attribute iterators work") { EXPECT_TRUE(result.has_minmax); EXPECT_EQUAL(100, result.min_weight); EXPECT_EQUAL(1000, result.max_weight); - EXPECT_TRUE(result.iterator_dump.find("DocumentWeightSearchIterator") != vespalib::string::npos); + EXPECT_TRUE(result.iterator_dump.find("DocidWithWeightSearchIterator") != vespalib::string::npos); } else { EXPECT_EQUAL(num_docs, result.est_hits); EXPECT_FALSE(result.has_minmax); - EXPECT_TRUE(result.iterator_dump.find("DocumentWeightSearchIterator") == vespalib::string::npos); + EXPECT_TRUE(result.iterator_dump.find("DocidWithWeightSearchIterator") == vespalib::string::npos); } ASSERT_EQUAL(3u, result.hits.size()); EXPECT_FALSE(result.est_empty); @@ -513,7 +513,7 @@ TEST("require that single weighted set turns filter on filter fields") { SimpleStringTerm node("foo", "", 0, Weight(1)); Result result = do_search(attribute_manager, node, strict); EXPECT_EQUAL(3u, result.est_hits); - EXPECT_TRUE(result.iterator_dump.find("DocumentWeightSearchIterator") == vespalib::string::npos); + EXPECT_TRUE(result.iterator_dump.find("DocidWithWeightSearchIterator") == vespalib::string::npos); EXPECT_TRUE(result.iterator_dump.find("FilterAttributePostingListIteratorT") != vespalib::string::npos); ASSERT_EQUAL(3u, result.hits.size()); EXPECT_FALSE(result.est_empty); diff --git a/searchlib/src/tests/queryeval/parallel_weak_and/parallel_weak_and_test.cpp b/searchlib/src/tests/queryeval/parallel_weak_and/parallel_weak_and_test.cpp index fa12b453d8c..0e27c77feae 100644 --- a/searchlib/src/tests/queryeval/parallel_weak_and/parallel_weak_and_test.cpp +++ b/searchlib/src/tests/queryeval/parallel_weak_and/parallel_weak_and_test.cpp @@ -1,6 +1,6 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include -#include +#include #include #include #include @@ -661,7 +661,7 @@ SearchIterator::UP create_wand(bool use_dww, assert(childrenMatchData->getNumTermFields() == dict_entries.size()); wand::Terms terms; for (size_t i = 0; i < dict_entries.size(); ++i) { - terms.push_back(wand::Term(new DocumentWeightSearchIterator(*(childrenMatchData->resolveTermField(handles[i])), attr, dict_entries[i]), + terms.push_back(wand::Term(new DocidWithWeightSearchIterator(*(childrenMatchData->resolveTermField(handles[i])), attr, dict_entries[i]), weights[i], dict_entries[i].posting_size, childrenMatchData->resolveTermField(handles[i]))); diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp index b9adcf3b093..037285fedf0 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -527,7 +527,7 @@ public: } } if (_attr.has_btree_iterator(_dict_entry.posting_idx)) { - return std::make_unique(*tfmda[0], _attr, _dict_entry); + return std::make_unique(*tfmda[0], _attr, _dict_entry); } else { return _attr.make_bitvector_iterator(_dict_entry.posting_idx, get_docid_limit(), *tfmda[0], strict); } diff --git a/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt b/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt index 5e6d31d3761..51fe2d12637 100644 --- a/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt @@ -7,7 +7,7 @@ vespa_add_library(searchlib_queryeval OBJECT booleanmatchiteratorwrapper.cpp children_iterators.cpp create_blueprint_visitor_helper.cpp - document_weight_search_iterator.cpp + docid_with_weight_search_iterator.cpp dot_product_blueprint.cpp dot_product_search.cpp elementiterator.cpp diff --git a/searchlib/src/vespa/searchlib/queryeval/docid_with_weight_search_iterator.cpp b/searchlib/src/vespa/searchlib/queryeval/docid_with_weight_search_iterator.cpp new file mode 100644 index 00000000000..85bd751df27 --- /dev/null +++ b/searchlib/src/vespa/searchlib/queryeval/docid_with_weight_search_iterator.cpp @@ -0,0 +1,3 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "docid_with_weight_search_iterator.h" diff --git a/searchlib/src/vespa/searchlib/queryeval/docid_with_weight_search_iterator.h b/searchlib/src/vespa/searchlib/queryeval/docid_with_weight_search_iterator.h new file mode 100644 index 00000000000..8201c6a78b8 --- /dev/null +++ b/searchlib/src/vespa/searchlib/queryeval/docid_with_weight_search_iterator.h @@ -0,0 +1,60 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "searchiterator.h" +#include +#include + +namespace search::queryeval { + +/** + * SearchIterator implementation over a low-level posting list with {docid, weight} tuples. + * + * This is used by the parallel weak AND search iterator. + */ +class DocidWithWeightSearchIterator : public SearchIterator +{ +private: + fef::TermFieldMatchData &_tfmd; + fef::TermFieldMatchDataPosition * _matchPosition; + DocidWithWeightIterator _iterator; + queryeval::MinMaxPostingInfo _postingInfo; + +public: + DocidWithWeightSearchIterator(fef::TermFieldMatchData &tfmd, + const IDocidWithWeightPostingStore &attr, + IDirectPostingStore::LookupResult dict_entry) + : _tfmd(tfmd), + _matchPosition(_tfmd.populate_fixed()), + _iterator(attr.create(dict_entry.posting_idx)), + _postingInfo(queryeval::MinMaxPostingInfo(dict_entry.min_weight, dict_entry.max_weight)) + { } + void initRange(uint32_t begin, uint32_t end) override { + SearchIterator::initRange(begin, end); + _iterator.lower_bound(begin); + updateDocId(); + } + void updateDocId() { + if (_iterator.valid()) { + setDocId(_iterator.getKey()); + } else { + setAtEnd(); + } + } + + void doSeek(uint32_t docId) override { + _iterator.linearSeek(docId); + updateDocId(); + } + + void doUnpack(uint32_t docId) override { + _tfmd.resetOnlyDocId(docId); + _matchPosition->setElementWeight(_iterator.getData()); + } + + const queryeval::PostingInfo *getPostingInfo() const override { return &_postingInfo; } + Trinary is_strict() const override { return Trinary::True; } +}; + +} diff --git a/searchlib/src/vespa/searchlib/queryeval/document_weight_search_iterator.cpp b/searchlib/src/vespa/searchlib/queryeval/document_weight_search_iterator.cpp deleted file mode 100644 index 6b0bd3ec7fc..00000000000 --- a/searchlib/src/vespa/searchlib/queryeval/document_weight_search_iterator.cpp +++ /dev/null @@ -1,3 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#include "document_weight_search_iterator.h" diff --git a/searchlib/src/vespa/searchlib/queryeval/document_weight_search_iterator.h b/searchlib/src/vespa/searchlib/queryeval/document_weight_search_iterator.h deleted file mode 100644 index 448f1c8f2b4..00000000000 --- a/searchlib/src/vespa/searchlib/queryeval/document_weight_search_iterator.h +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. - -#pragma once - -#include "searchiterator.h" -#include -#include - -namespace search::queryeval { - -class DocumentWeightSearchIterator : public SearchIterator -{ -private: - fef::TermFieldMatchData &_tfmd; - fef::TermFieldMatchDataPosition * _matchPosition; - DocidWithWeightIterator _iterator; - queryeval::MinMaxPostingInfo _postingInfo; - -public: - DocumentWeightSearchIterator(fef::TermFieldMatchData &tfmd, - const IDocidWithWeightPostingStore &attr, - IDirectPostingStore::LookupResult dict_entry) - : _tfmd(tfmd), - _matchPosition(_tfmd.populate_fixed()), - _iterator(attr.create(dict_entry.posting_idx)), - _postingInfo(queryeval::MinMaxPostingInfo(dict_entry.min_weight, dict_entry.max_weight)) - { } - void initRange(uint32_t begin, uint32_t end) override { - SearchIterator::initRange(begin, end); - _iterator.lower_bound(begin); - updateDocId(); - } - void updateDocId() { - if (_iterator.valid()) { - setDocId(_iterator.getKey()); - } else { - setAtEnd(); - } - } - - void doSeek(uint32_t docId) override { - _iterator.linearSeek(docId); - updateDocId(); - } - - void doUnpack(uint32_t docId) override { - _tfmd.resetOnlyDocId(docId); - _matchPosition->setElementWeight(_iterator.getData()); - } - - const queryeval::PostingInfo *getPostingInfo() const override { return &_postingInfo; } - Trinary is_strict() const override { return Trinary::True; } -}; - -} diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.cpp b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.cpp index 828ca4be08d..f3028f5159a 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.cpp @@ -1,7 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "parallel_weak_and_search.h" -#include +#include #include #include #include @@ -243,7 +243,7 @@ ParallelWeakAndSearch::create(search::fef::TermFieldMatchData &tfmd, assert(childrenMatchData->getNumTermFields() == dict_entries.size()); wand::Terms terms; for (size_t i = 0; i < dict_entries.size(); ++i) { - terms.push_back(wand::Term(new DocumentWeightSearchIterator(*(childrenMatchData->resolveTermField(handles[i])), attr, dict_entries[i]), + terms.push_back(wand::Term(new DocidWithWeightSearchIterator(*(childrenMatchData->resolveTermField(handles[i])), attr, dict_entries[i]), weights[i], dict_entries[i].posting_size, childrenMatchData->resolveTermField(handles[i]))); -- cgit v1.2.3 From 10a8162be6cae6a30be24ac7c8ca2e5adc62f4f7 Mon Sep 17 00:00:00 2001 From: Henning Baldersheim Date: Fri, 15 Dec 2023 16:46:06 +0100 Subject: Precompute 1024 bits, 128 bytes, 2 cachelines for intel, and 1 for arm64. --- searchlib/src/vespa/searchlib/common/bitvector.cpp | 2 +- .../searchlib/queryeval/multibitvectoriterator.cpp | 9 ++++----- .../searchlib/queryeval/multibitvectoriterator.h | 2 +- vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp | 8 ++++---- vespalib/src/vespa/vespalib/hwaccelrated/avx2.h | 4 ++-- vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp | 8 ++++---- vespalib/src/vespa/vespalib/hwaccelrated/avx512.h | 4 ++-- vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp | 8 ++++---- vespalib/src/vespa/vespalib/hwaccelrated/generic.h | 4 ++-- .../src/vespa/vespalib/hwaccelrated/iaccelrated.cpp | 20 ++++++++++---------- .../src/vespa/vespalib/hwaccelrated/iaccelrated.h | 8 ++++---- .../vespa/vespalib/hwaccelrated/private_helpers.hpp | 4 ++-- 12 files changed, 40 insertions(+), 41 deletions(-) (limited to 'searchlib') diff --git a/searchlib/src/vespa/searchlib/common/bitvector.cpp b/searchlib/src/vespa/searchlib/common/bitvector.cpp index b79703a8e5c..a75066a67a9 100644 --- a/searchlib/src/vespa/searchlib/common/bitvector.cpp +++ b/searchlib/src/vespa/searchlib/common/bitvector.cpp @@ -39,7 +39,7 @@ BitVector::allocatePaddedAndAligned(Index start, Index end, Index capacity, cons { assert(capacity >= end); uint32_t words = numActiveWords(start, capacity); - words += (-words & 15); // Pad to 64 byte alignment + words += (-words & 15); // Pad to 128 byte alignment const size_t sz(words * sizeof(Word)); Alloc alloc = (init_alloc != nullptr) ? init_alloc->create(sz) : Alloc::alloc(sz, MMAP_LIMIT); assert(alloc.size()/sizeof(Word) >= words); diff --git a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp index 66f505581c7..e90156868fb 100644 --- a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.cpp @@ -4,7 +4,6 @@ #include "andsearch.h" #include "andnotsearch.h" #include "sourceblendersearch.h" -#include #include namespace search::queryeval { @@ -18,7 +17,7 @@ namespace { struct And { using Word = BitWord::Word; void operator () (const IAccelrated & accel, size_t offset, const std::vector & src, void *dest) noexcept { - accel.and64(offset, src, dest); + accel.and128(offset, src, dest); } static constexpr bool isAnd() noexcept { return true; } }; @@ -26,7 +25,7 @@ struct And { struct Or { using Word = BitWord::Word; void operator () (const IAccelrated & accel, size_t offset, const std::vector & src, void *dest) noexcept { - accel.or64(offset, src, dest); + accel.or128(offset, src, dest); } static constexpr bool isAnd() noexcept { return false; } }; @@ -56,8 +55,8 @@ MultiBitVector::MultiBitVector(size_t reserved) _accel(IAccelrated::getAccelerator()), _lastWords() { - static_assert(sizeof(_lastWords) == 64, "Lastwords should have 64 byte size"); - static_assert(NumWordsInBatch == 8, "Batch size should be 8 words."); + static_assert(sizeof(_lastWords) == 128, "Lastwords should have 128 byte size"); + static_assert(NumWordsInBatch == 16, "Batch size should be 16 words."); memset(_lastWords, 0, sizeof(_lastWords)); } diff --git a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h index 0d9e2c4f25f..0ecf9d85b92 100644 --- a/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h +++ b/searchlib/src/vespa/searchlib/queryeval/multibitvectoriterator.h @@ -50,7 +50,7 @@ private: Update _update; const IAccelrated & _accel; - alignas(64) Word _lastWords[8]; + alignas(64) Word _lastWords[16]; static constexpr size_t NumWordsInBatch = sizeof(_lastWords) / sizeof(Word); }; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp index bbba4109fc2..66441b3c08b 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.cpp @@ -26,13 +26,13 @@ Avx2Accelrator::squaredEuclideanDistance(const double * a, const double * b, siz } void -Avx2Accelrator::and64(size_t offset, const std::vector> &src, void *dest) const noexcept { - helper::andChunks<32u, 2u>(offset, src, dest); +Avx2Accelrator::and128(size_t offset, const std::vector> &src, void *dest) const noexcept { + helper::andChunks<32u, 4u>(offset, src, dest); } void -Avx2Accelrator::or64(size_t offset, const std::vector> &src, void *dest) const noexcept { - helper::orChunks<32u, 2u>(offset, src, dest); +Avx2Accelrator::or128(size_t offset, const std::vector> &src, void *dest) const noexcept { + helper::orChunks<32u, 4u>(offset, src, dest); } } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h index 934d815d67b..af46035666c 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx2.h @@ -16,8 +16,8 @@ public: double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const noexcept override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept override; - void and64(size_t offset, const std::vector> &src, void *dest) const noexcept override; - void or64(size_t offset, const std::vector> &src, void *dest) const noexcept override; + void and128(size_t offset, const std::vector> &src, void *dest) const noexcept override; + void or128(size_t offset, const std::vector> &src, void *dest) const noexcept override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp index 035f33cb25e..5f408c05fef 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.cpp @@ -36,13 +36,13 @@ Avx512Accelrator::squaredEuclideanDistance(const double * a, const double * b, s } void -Avx512Accelrator::and64(size_t offset, const std::vector> &src, void *dest) const noexcept { - helper::andChunks<64, 1>(offset, src, dest); +Avx512Accelrator::and128(size_t offset, const std::vector> &src, void *dest) const noexcept { + helper::andChunks<64, 2>(offset, src, dest); } void -Avx512Accelrator::or64(size_t offset, const std::vector> &src, void *dest) const noexcept { - helper::orChunks<64, 1>(offset, src, dest); +Avx512Accelrator::or128(size_t offset, const std::vector> &src, void *dest) const noexcept { + helper::orChunks<64, 2>(offset, src, dest); } } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h index 38eab0a2549..a86a2787d5a 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/avx512.h @@ -18,8 +18,8 @@ public: double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const noexcept override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept override; - void and64(size_t offset, const std::vector> &src, void *dest) const noexcept override; - void or64(size_t offset, const std::vector> &src, void *dest) const noexcept override; + void and128(size_t offset, const std::vector> &src, void *dest) const noexcept override; + void or128(size_t offset, const std::vector> &src, void *dest) const noexcept override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp index a8e5535cc21..f0112aaddf7 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.cpp @@ -173,13 +173,13 @@ GenericAccelrator::squaredEuclideanDistance(const double * a, const double * b, } void -GenericAccelrator::and64(size_t offset, const std::vector> &src, void *dest) const noexcept { - helper::andChunks<16, 4>(offset, src, dest); +GenericAccelrator::and128(size_t offset, const std::vector> &src, void *dest) const noexcept { + helper::andChunks<16, 8>(offset, src, dest); } void -GenericAccelrator::or64(size_t offset, const std::vector> &src, void *dest) const noexcept { - helper::orChunks<16,4>(offset, src, dest); +GenericAccelrator::or128(size_t offset, const std::vector> &src, void *dest) const noexcept { + helper::orChunks<16, 8>(offset, src, dest); } } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h index 16c8bab71da..ba986656635 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/generic.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/generic.h @@ -26,8 +26,8 @@ public: double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const noexcept override; double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept override; double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept override; - void and64(size_t offset, const std::vector> &src, void *dest) const noexcept override; - void or64(size_t offset, const std::vector> &src, void *dest) const noexcept override; + void and128(size_t offset, const std::vector> &src, void *dest) const noexcept override; + void or128(size_t offset, const std::vector> &src, void *dest) const noexcept override; }; } diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp index d707553b504..a02e9545765 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.cpp @@ -153,11 +153,11 @@ verifyOr64(const IAccelrated & accel, const std::vector> & simpleOrWith(expected, optionallyInvert(vRefs[j].second, vectors[j])); } - uint64_t dest[8] __attribute((aligned(64))); - accel.or64(offset*sizeof(uint64_t), vRefs, dest); + uint64_t dest[16] __attribute((aligned(64))); + accel.or128(offset * sizeof(uint64_t), vRefs, dest); int diff = memcmp(&expected[offset], dest, sizeof(dest)); if (diff != 0) { - LOG_ABORT("Accelerator fails to compute correct 64 bytes OR"); + LOG_ABORT("Accelerator fails to compute correct 128 bytes OR"); } } @@ -174,11 +174,11 @@ verifyAnd64(const IAccelrated & accel, const std::vector> simpleAndWith(expected, optionallyInvert(vRefs[j].second, vectors[j])); } - uint64_t dest[8] __attribute((aligned(64))); - accel.and64(offset*sizeof(uint64_t), vRefs, dest); + uint64_t dest[16] __attribute((aligned(64))); + accel.and128(offset * sizeof(uint64_t), vRefs, dest); int diff = memcmp(&expected[offset], dest, sizeof(dest)); if (diff != 0) { - LOG_ABORT("Accelerator fails to compute correct 64 bytes AND"); + LOG_ABORT("Accelerator fails to compute correct 128 bytes AND"); } } @@ -186,9 +186,9 @@ void verifyOr64(const IAccelrated & accel) { std::vector> vectors(3) ; for (auto & v : vectors) { - fill(v, 16); + fill(v, 32); } - for (size_t offset = 0; offset < 8; offset++) { + for (size_t offset = 0; offset < 16; offset++) { for (size_t i = 1; i < vectors.size(); i++) { verifyOr64(accel, vectors, offset, i, false); verifyOr64(accel, vectors, offset, i, true); @@ -200,9 +200,9 @@ void verifyAnd64(const IAccelrated & accel) { std::vector> vectors(3); for (auto & v : vectors) { - fill(v, 16); + fill(v, 32); } - for (size_t offset = 0; offset < 8; offset++) { + for (size_t offset = 0; offset < 16; offset++) { for (size_t i = 1; i < vectors.size(); i++) { verifyAnd64(accel, vectors, offset, i, false); verifyAnd64(accel, vectors, offset, i, true); diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h index 806e77caced..f070f206b7e 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h +++ b/vespalib/src/vespa/vespalib/hwaccelrated/iaccelrated.h @@ -31,10 +31,10 @@ public: virtual double squaredEuclideanDistance(const int8_t * a, const int8_t * b, size_t sz) const noexcept = 0; virtual double squaredEuclideanDistance(const float * a, const float * b, size_t sz) const noexcept = 0; virtual double squaredEuclideanDistance(const double * a, const double * b, size_t sz) const noexcept = 0; - // AND 64 bytes from multiple, optionally inverted sources - virtual void and64(size_t offset, const std::vector> &src, void *dest) const noexcept = 0; - // OR 64 bytes from multiple, optionally inverted sources - virtual void or64(size_t offset, const std::vector> &src, void *dest) const noexcept = 0; + // AND 128 bytes from multiple, optionally inverted sources + virtual void and128(size_t offset, const std::vector> &src, void *dest) const noexcept = 0; + // OR 128 bytes from multiple, optionally inverted sources + virtual void or128(size_t offset, const std::vector> &src, void *dest) const noexcept = 0; static const IAccelrated & getAccelerator() __attribute__((noinline)); }; diff --git a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp index c884f0d7bb9..6731b449462 100644 --- a/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp +++ b/vespalib/src/vespa/vespalib/hwaccelrated/private_helpers.hpp @@ -43,7 +43,7 @@ void andChunks(size_t offset, const std::vector> & src, void * dest) { typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize))); static_assert(sizeof(Chunk) == ChunkSize, "sizeof(Chunk) == ChunkSize"); - static_assert(ChunkSize*Chunks == 64, "ChunkSize*Chunks == 64"); + static_assert(ChunkSize*Chunks == 128, "ChunkSize*Chunks == 128"); Chunk * chunk = static_cast(dest); const Chunk * tmp = cast(src[0].first, offset); for (size_t n=0; n < Chunks; n++) { @@ -62,7 +62,7 @@ void orChunks(size_t offset, const std::vector> & src, void * dest) { typedef uint64_t Chunk __attribute__ ((vector_size (ChunkSize))); static_assert(sizeof(Chunk) == ChunkSize, "sizeof(Chunk) == ChunkSize"); - static_assert(ChunkSize*Chunks == 64, "ChunkSize*Chunks == 64"); + static_assert(ChunkSize*Chunks == 128, "ChunkSize*Chunks == 128"); Chunk * chunk = static_cast(dest); const Chunk * tmp = cast(src[0].first, offset); for (size_t n=0; n < Chunks; n++) { -- cgit v1.2.3 From b845595dc7f07f411a94a4691b388468a5bc3d89 Mon Sep 17 00:00:00 2001 From: HÃ¥vard Pettersen Date: Tue, 19 Dec 2023 17:54:34 +0000 Subject: enable sorting on cost --- .../searchcore/proton/matching/match_tools.cpp | 6 +- .../src/vespa/searchcore/proton/matching/query.cpp | 7 +- .../src/vespa/searchcore/proton/matching/query.h | 3 +- .../tests/queryeval/blueprint/blueprint_test.cpp | 23 ++++-- .../blueprint/intermediate_blueprints_test.cpp | 83 ++++++++++++---------- .../queryeval/filter_search/filter_search_test.cpp | 2 +- .../queryeval/same_element/same_element_test.cpp | 2 +- .../src/vespa/searchlib/queryeval/blueprint.cpp | 20 +++--- .../src/vespa/searchlib/queryeval/blueprint.h | 26 +++++-- .../queryeval/intermediate_blueprints.cpp | 62 +++++++++------- .../searchlib/queryeval/intermediate_blueprints.h | 24 +++---- .../searchlib/queryeval/same_element_blueprint.cpp | 2 +- .../searchlib/queryeval/same_element_blueprint.h | 2 +- 13 files changed, 157 insertions(+), 105 deletions(-) (limited to 'searchlib') diff --git a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp index b5224281724..565604826ee 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/match_tools.cpp @@ -203,7 +203,8 @@ MatchToolsFactory(QueryLimiter & queryLimiter, trace.addEvent(5, "Build query execution plan"); _query.reserveHandles(_requestContext, searchContext, _mdl); trace.addEvent(5, "Optimize query execution plan"); - _query.optimize(SortBlueprintsByCost::check(_queryEnv.getProperties(), rankSetup.sort_blueprints_by_cost())); + bool sort_by_cost = SortBlueprintsByCost::check(_queryEnv.getProperties(), rankSetup.sort_blueprints_by_cost()); + _query.optimize(sort_by_cost); trace.addEvent(4, "Perform dictionary lookups and posting lists initialization"); double hitRate = std::min(1.0, double(maxNumHits)/double(searchContext.getDocIdLimit())); bool create_postinglist_when_non_strict = CreatePostingListWhenNonStrict::check(_queryEnv.getProperties(), rankSetup.create_postinglist_when_non_strict()); @@ -216,7 +217,8 @@ MatchToolsFactory(QueryLimiter & queryLimiter, _query.handle_global_filter(_requestContext, searchContext.getDocIdLimit(), _attribute_blueprint_params.global_filter_lower_limit, _attribute_blueprint_params.global_filter_upper_limit, - trace, create_postinglist_when_non_strict, use_estimate_for_fetch_postings); + trace, create_postinglist_when_non_strict, use_estimate_for_fetch_postings, + sort_by_cost); } _query.freeze(); trace.addEvent(5, "Prepare shared state for multi-threaded rank executors"); diff --git a/searchcore/src/vespa/searchcore/proton/matching/query.cpp b/searchcore/src/vespa/searchcore/proton/matching/query.cpp index 149828b0a91..dfa1bd91157 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/query.cpp +++ b/searchcore/src/vespa/searchcore/proton/matching/query.cpp @@ -201,7 +201,7 @@ void Query::optimize(bool sort_by_cost) { (void) sort_by_cost; - _blueprint = Blueprint::optimize(std::move(_blueprint)); + _blueprint = Blueprint::optimize(std::move(_blueprint), sort_by_cost); LOG(debug, "optimized blueprint:\n%s\n", _blueprint->asString().c_str()); } @@ -215,7 +215,8 @@ void Query::handle_global_filter(const IRequestContext & requestContext, uint32_t docid_limit, double global_filter_lower_limit, double global_filter_upper_limit, search::engine::Trace& trace, - bool create_postinglist_when_non_strict, bool use_estimate_for_fetch_postings) + bool create_postinglist_when_non_strict, bool use_estimate_for_fetch_postings, + bool sort_by_cost) { if (!handle_global_filter(*_blueprint, docid_limit, global_filter_lower_limit, global_filter_upper_limit, requestContext.thread_bundle(), &trace)) @@ -224,7 +225,7 @@ Query::handle_global_filter(const IRequestContext & requestContext, uint32_t doc } // optimized order may change after accounting for global filter: trace.addEvent(5, "Optimize query execution plan to account for global filter"); - _blueprint = Blueprint::optimize(std::move(_blueprint)); + _blueprint = Blueprint::optimize(std::move(_blueprint), sort_by_cost); LOG(debug, "blueprint after handle_global_filter:\n%s\n", _blueprint->asString().c_str()); // strictness may change if optimized order changed: fetchPostings(ExecuteInfo::create(true, 1.0, requestContext.getDoom(), requestContext.thread_bundle(), diff --git a/searchcore/src/vespa/searchcore/proton/matching/query.h b/searchcore/src/vespa/searchcore/proton/matching/query.h index 8062f12b70d..b468b4b8f33 100644 --- a/searchcore/src/vespa/searchcore/proton/matching/query.h +++ b/searchcore/src/vespa/searchcore/proton/matching/query.h @@ -109,7 +109,8 @@ public: void handle_global_filter(const IRequestContext & requestContext, uint32_t docid_limit, double global_filter_lower_limit, double global_filter_upper_limit, search::engine::Trace& trace, - bool create_postinglist_when_non_strict, bool use_estimate_for_fetch_postings); + bool create_postinglist_when_non_strict, bool use_estimate_for_fetch_postings, + bool sort_by_cost); /** * Calculates and handles the global filter if needed by the blueprint tree. diff --git a/searchlib/src/tests/queryeval/blueprint/blueprint_test.cpp b/searchlib/src/tests/queryeval/blueprint/blueprint_test.cpp index eefca5c82e9..f800e124bdc 100644 --- a/searchlib/src/tests/queryeval/blueprint/blueprint_test.cpp +++ b/searchlib/src/tests/queryeval/blueprint/blueprint_test.cpp @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "mysearch.h" #include +#include #include #include #include @@ -22,8 +23,12 @@ class MyOr : public IntermediateBlueprint { private: public: - double calculate_cost() const final { return 1.0; } - double calculate_relative_estimate() const final { return 0.5; } + double calculate_cost() const final { + return cost_of(get_children(), OrFlow()); + } + double calculate_relative_estimate() const final { + return estimate_of(get_children(), OrFlow()); + } HitEstimate combine(const std::vector &data) const override { return max(data); } @@ -32,7 +37,7 @@ public: return mixChildrenFields(); } - void sort(Children &children) const override { + void sort(Children &children, bool) const override { std::sort(children.begin(), children.end(), TieredGreaterEstimate()); } @@ -440,7 +445,8 @@ TEST_F("testChildAndNotCollapsing", Fixture) ) ); TEST_DO(f.check_not_equal(*sorted, *unsorted)); - unsorted = Blueprint::optimize(std::move(unsorted)); + unsorted->setDocIdLimit(1000); + unsorted = Blueprint::optimize(std::move(unsorted), true); TEST_DO(f.check_equal(*sorted, *unsorted)); } @@ -479,7 +485,8 @@ TEST_F("testChildAndCollapsing", Fixture) ); TEST_DO(f.check_not_equal(*sorted, *unsorted)); - unsorted = Blueprint::optimize(std::move(unsorted)); + unsorted->setDocIdLimit(1000); + unsorted = Blueprint::optimize(std::move(unsorted), true); TEST_DO(f.check_equal(*sorted, *unsorted)); } @@ -517,7 +524,8 @@ TEST_F("testChildOrCollapsing", Fixture) .add(MyLeafSpec(1).addField(2, 42).create()) ); TEST_DO(f.check_not_equal(*sorted, *unsorted)); - unsorted = Blueprint::optimize(std::move(unsorted)); + unsorted->setDocIdLimit(1000); + unsorted = Blueprint::optimize(std::move(unsorted), true); TEST_DO(f.check_equal(*sorted, *unsorted)); } @@ -560,7 +568,8 @@ TEST_F("testChildSorting", Fixture) ); TEST_DO(f.check_not_equal(*sorted, *unsorted)); - unsorted = Blueprint::optimize(std::move(unsorted)); + unsorted->setDocIdLimit(1000); + unsorted = Blueprint::optimize(std::move(unsorted), true); TEST_DO(f.check_equal(*sorted, *unsorted)); } diff --git a/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp b/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp index 234ff5a9d19..ab1c004c721 100644 --- a/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp +++ b/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -75,7 +76,8 @@ void check_sort_order(IntermediateBlueprint &self, BlueprintVector children, std for (const auto & child: children) { unordered.push_back(child.get()); } - self.sort(children); + // TODO: sort by cost (requires both setDocIdLimit and optimize to be called) + self.sort(children, false); for (size_t i = 0; i < children.size(); ++i) { EXPECT_EQUAL(children[i].get(), unordered[order[i]]); } @@ -129,7 +131,7 @@ TEST("test AndNot Blueprint") { template void optimize(std::unique_ptr &ref) { - auto optimized = Blueprint::optimize(std::move(ref)); + auto optimized = Blueprint::optimize(std::move(ref), true); ref.reset(dynamic_cast(optimized.get())); ASSERT_TRUE(ref); optimized.release(); @@ -141,8 +143,8 @@ TEST("test And propagates updated histestimate") { bp->addChild(ap(MyLeafSpec(20).create()->setSourceId(2))); bp->addChild(ap(MyLeafSpec(200).create()->setSourceId(2))); bp->addChild(ap(MyLeafSpec(2000).create()->setSourceId(2))); - optimize(bp); bp->setDocIdLimit(5000); + optimize(bp); bp->fetchPostings(ExecuteInfo::TRUE); EXPECT_EQUAL(3u, bp->childCnt()); for (uint32_t i = 0; i < bp->childCnt(); i++) { @@ -161,8 +163,8 @@ TEST("test Or propagates updated histestimate") { bp->addChild(ap(MyLeafSpec(2000).create()->setSourceId(2))); bp->addChild(ap(MyLeafSpec(800).create()->setSourceId(2))); bp->addChild(ap(MyLeafSpec(20).create()->setSourceId(2))); - optimize(bp); bp->setDocIdLimit(5000); + optimize(bp); bp->fetchPostings(ExecuteInfo::TRUE); EXPECT_EQUAL(4u, bp->childCnt()); for (uint32_t i = 0; i < bp->childCnt(); i++) { @@ -514,36 +516,45 @@ vespalib::string to_str(const Inspector &value) { } void compare(const Blueprint &bp1, const Blueprint &bp2, bool expect_eq) { - auto ignore_cost = [expect_eq](const auto &path, const auto &a, const auto &b) { - if (!path.empty() && std::holds_alternative(path.back())) { - vespalib::stringref field = std::get(path.back()); - if (field == "cost") { - return true; - } - } - if (expect_eq) { - fprintf(stderr, " mismatch at %s: %s vs %s\n", path_to_str(path).c_str(), - to_str(a).c_str(), to_str(b).c_str()); - } - return false; - }; + auto cmp_hook = [expect_eq](const auto &path, const auto &a, const auto &b) { + if (!path.empty() && std::holds_alternative(path.back())) { + vespalib::stringref field = std::get(path.back()); + if (field == "cost") { + return true; + } + if (field == "relative_estimate") { + double a_val = a.asDouble(); + double b_val = b.asDouble(); + if (a_val != 0.0 && b_val != 0.0 && vespalib::approx_equal(a_val, b_val)) { + return true; + } + } + } + if (expect_eq) { + fprintf(stderr, " mismatch at %s: %s vs %s\n", path_to_str(path).c_str(), + to_str(a).c_str(), to_str(b).c_str()); + } + return false; + }; Slime a; Slime b; bp1.asSlime(SlimeInserter(a)); bp2.asSlime(SlimeInserter(b)); if (expect_eq) { - EXPECT_TRUE(vespalib::slime::are_equal(a.get(), b.get(), ignore_cost)); + EXPECT_TRUE(vespalib::slime::are_equal(a.get(), b.get(), cmp_hook)); } else { - EXPECT_FALSE(vespalib::slime::are_equal(a.get(), b.get(), ignore_cost)); + EXPECT_FALSE(vespalib::slime::are_equal(a.get(), b.get(), cmp_hook)); } } void -optimize_and_compare(Blueprint::UP top, Blueprint::UP expect) { +optimize_and_compare(Blueprint::UP top, Blueprint::UP expect, bool sort_by_cost = true) { + top->setDocIdLimit(1000); + expect->setDocIdLimit(1000); TEST_DO(compare(*top, *expect, false)); - top = Blueprint::optimize(std::move(top)); + top = Blueprint::optimize(std::move(top), sort_by_cost); TEST_DO(compare(*top, *expect, true)); - expect = Blueprint::optimize(std::move(expect)); + expect = Blueprint::optimize(std::move(expect), sort_by_cost); TEST_DO(compare(*expect, *top, true)); } @@ -670,11 +681,11 @@ TEST("test empty root node optimization and safeness") { //------------------------------------------------------------------------- auto expect_up = std::make_unique(); - EXPECT_EQUAL(expect_up->asString(), Blueprint::optimize(std::move(top1))->asString()); - EXPECT_EQUAL(expect_up->asString(), Blueprint::optimize(std::move(top2))->asString()); - EXPECT_EQUAL(expect_up->asString(), Blueprint::optimize(std::move(top3))->asString()); - EXPECT_EQUAL(expect_up->asString(), Blueprint::optimize(std::move(top4))->asString()); - EXPECT_EQUAL(expect_up->asString(), Blueprint::optimize(std::move(top5))->asString()); + EXPECT_EQUAL(expect_up->asString(), Blueprint::optimize(std::move(top1), true)->asString()); + EXPECT_EQUAL(expect_up->asString(), Blueprint::optimize(std::move(top2), true)->asString()); + EXPECT_EQUAL(expect_up->asString(), Blueprint::optimize(std::move(top3), true)->asString()); + EXPECT_EQUAL(expect_up->asString(), Blueprint::optimize(std::move(top4), true)->asString()); + EXPECT_EQUAL(expect_up->asString(), Blueprint::optimize(std::move(top5), true)->asString()); } TEST("and with one empty child is optimized away") { @@ -682,7 +693,7 @@ TEST("and with one empty child is optimized away") { Blueprint::UP top = ap((new SourceBlenderBlueprint(*selector))-> addChild(ap(MyLeafSpec(10).create())). addChild(addLeafs(std::make_unique(), {{0, true}, 10, 20}))); - top = Blueprint::optimize(std::move(top)); + top = Blueprint::optimize(std::move(top), true); Blueprint::UP expect_up(ap((new SourceBlenderBlueprint(*selector))-> addChild(ap(MyLeafSpec(10).create())). addChild(std::make_unique()))); @@ -857,8 +868,8 @@ TEST("require that replaced blueprints retain source id") { addChild(ap(MyLeafSpec(30).create()->setSourceId(55))))); Blueprint::UP expect2_up(ap(MyLeafSpec(30).create()->setSourceId(42))); //------------------------------------------------------------------------- - top1_up = Blueprint::optimize(std::move(top1_up)); - top2_up = Blueprint::optimize(std::move(top2_up)); + top1_up = Blueprint::optimize(std::move(top1_up), true); + top2_up = Blueprint::optimize(std::move(top2_up), true); EXPECT_EQUAL(expect1_up->asString(), top1_up->asString()); EXPECT_EQUAL(expect2_up->asString(), top2_up->asString()); EXPECT_EQUAL(13u, top1_up->getSourceId()); @@ -1177,7 +1188,7 @@ TEST("require that children of near are not optimized") { auto expect_up = ap((new NearBlueprint(10))-> addChild(addLeafs(std::make_unique(), {20, {0, true}})). addChild(addLeafs(std::make_unique(), {{0, true}, 30}))); - top_up = Blueprint::optimize(std::move(top_up)); + top_up = Blueprint::optimize(std::move(top_up), true); TEST_DO(compare(*top_up, *expect_up, true)); } @@ -1188,27 +1199,27 @@ TEST("require that children of onear are not optimized") { auto expect_up = ap((new ONearBlueprint(10))-> addChild(addLeafs(std::make_unique(), {20, {0, true}})). addChild(addLeafs(std::make_unique(), {{0, true}, 30}))); - top_up = Blueprint::optimize(std::move(top_up)); + top_up = Blueprint::optimize(std::move(top_up), true); TEST_DO(compare(*top_up, *expect_up, true)); } TEST("require that ANDNOT without children is optimized to empty search") { Blueprint::UP top_up = std::make_unique(); auto expect_up = std::make_unique(); - top_up = Blueprint::optimize(std::move(top_up)); + top_up = Blueprint::optimize(std::move(top_up), true); EXPECT_EQUAL(expect_up->asString(), top_up->asString()); } TEST("require that highest cost tier sorts last for OR") { Blueprint::UP top = addLeafsWithCostTier(std::make_unique(), {{50, 1}, {30, 3}, {20, 2}, {10, 1}}); Blueprint::UP expect = addLeafsWithCostTier(std::make_unique(), {{50, 1}, {10, 1}, {20, 2}, {30, 3}}); - optimize_and_compare(std::move(top), std::move(expect)); + optimize_and_compare(std::move(top), std::move(expect), false); } TEST("require that highest cost tier sorts last for AND") { Blueprint::UP top = addLeafsWithCostTier(std::make_unique(), {{10, 1}, {20, 3}, {30, 2}, {50, 1}}); Blueprint::UP expect = addLeafsWithCostTier(std::make_unique(), {{10, 1}, {50, 1}, {30, 2}, {20, 3}}); - optimize_and_compare(std::move(top), std::move(expect)); + optimize_and_compare(std::move(top), std::move(expect), false); } template @@ -1325,7 +1336,7 @@ void verify_cost(make &&mk, double expect) { .cost(1.2).leaf(300) .cost(1.3).leaf(500); bp->setDocIdLimit(1000); - bp = Blueprint::optimize(std::move(bp)); + bp = Blueprint::optimize(std::move(bp), true); EXPECT_EQUAL(bp->cost(), expect); } diff --git a/searchlib/src/tests/queryeval/filter_search/filter_search_test.cpp b/searchlib/src/tests/queryeval/filter_search/filter_search_test.cpp index f910ff5be1b..1180206279d 100644 --- a/searchlib/src/tests/queryeval/filter_search/filter_search_test.cpp +++ b/searchlib/src/tests/queryeval/filter_search/filter_search_test.cpp @@ -48,7 +48,7 @@ concept ChildCollector = requires(T a, std::unique_ptr bp) { // inherit Blueprint to capture the default filter factory struct DefaultBlueprint : Blueprint { double calculate_relative_estimate() const override { abort(); } - void optimize(Blueprint* &, OptimizePass) override { abort(); } + void optimize(Blueprint* &, OptimizePass, bool) override { abort(); } const State &getState() const override { abort(); } void fetchPostings(const ExecuteInfo &) override { abort(); } void freeze() override { abort(); } diff --git a/searchlib/src/tests/queryeval/same_element/same_element_test.cpp b/searchlib/src/tests/queryeval/same_element/same_element_test.cpp index d05e6c8e4f4..7c535e5d3d5 100644 --- a/searchlib/src/tests/queryeval/same_element/same_element_test.cpp +++ b/searchlib/src/tests/queryeval/same_element/same_element_test.cpp @@ -46,7 +46,7 @@ std::unique_ptr make_blueprint(const std::vectorfetchPostings(ExecuteInfo::createForTest(strict)); result->freeze(); return result; diff --git a/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp index 71d53ade2f7..ee383f18ea4 100644 --- a/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp @@ -130,15 +130,15 @@ Blueprint::Blueprint() noexcept Blueprint::~Blueprint() = default; Blueprint::UP -Blueprint::optimize(Blueprint::UP bp) { +Blueprint::optimize(Blueprint::UP bp, bool sort_by_cost) { Blueprint *root = bp.release(); - root->optimize(root, OptimizePass::FIRST); - root->optimize(root, OptimizePass::LAST); + root->optimize(root, OptimizePass::FIRST, sort_by_cost); + root->optimize(root, OptimizePass::LAST, sort_by_cost); return Blueprint::UP(root); } void -Blueprint::optimize_self(OptimizePass) +Blueprint::optimize_self(OptimizePass, bool) { } @@ -549,19 +549,19 @@ IntermediateBlueprint::should_do_termwise_eval(const UnpackInfo &unpack, double } void -IntermediateBlueprint::optimize(Blueprint* &self, OptimizePass pass) +IntermediateBlueprint::optimize(Blueprint* &self, OptimizePass pass, bool sort_by_cost) { assert(self == this); if (should_optimize_children()) { for (auto &child : _children) { auto *child_ptr = child.release(); - child_ptr->optimize(child_ptr, pass); + child_ptr->optimize(child_ptr, pass, sort_by_cost); child.reset(child_ptr); } } - optimize_self(pass); + optimize_self(pass, sort_by_cost); if (pass == OptimizePass::LAST) { - sort(_children); + sort(_children, sort_by_cost); set_cost(calculate_cost()); } maybe_eliminate_self(self, get_replacement()); @@ -759,10 +759,10 @@ LeafBlueprint::getRange(vespalib::string &, vespalib::string &) const { } void -LeafBlueprint::optimize(Blueprint* &self, OptimizePass pass) +LeafBlueprint::optimize(Blueprint* &self, OptimizePass pass, bool sort_by_cost) { assert(self == this); - optimize_self(pass); + optimize_self(pass, sort_by_cost); maybe_eliminate_self(self, get_replacement()); } diff --git a/searchlib/src/vespa/searchlib/queryeval/blueprint.h b/searchlib/src/vespa/searchlib/queryeval/blueprint.h index 66d55015f62..7caeeddd01c 100644 --- a/searchlib/src/vespa/searchlib/queryeval/blueprint.h +++ b/searchlib/src/vespa/searchlib/queryeval/blueprint.h @@ -172,6 +172,20 @@ public: // lower limit for docid_limit: max child estimate static HitEstimate sat_sum(const std::vector &data, uint32_t docid_limit); + // sort children to minimize total cost of OR flow + struct MinimalOrCost { + bool operator () (const auto &a, const auto &b) const noexcept { + return a->estimate() / a->cost() > b->estimate() / b->cost(); + } + }; + + // sort children to minimize total cost of AND flow + struct MinimalAndCost { + bool operator () (const auto &a, const auto &b) const noexcept { + return (1.0 - a->estimate()) / a->cost() > (1.0 - b->estimate()) / b->cost(); + } + }; + // utility to get the greater estimate to sort first, higher tiers last struct TieredGreaterEstimate { bool operator () (const auto &a, const auto &b) const noexcept { @@ -246,9 +260,9 @@ public: virtual void setDocIdLimit(uint32_t limit) noexcept { _docid_limit = limit; } uint32_t get_docid_limit() const noexcept { return _docid_limit; } - static Blueprint::UP optimize(Blueprint::UP bp); - virtual void optimize(Blueprint* &self, OptimizePass pass) = 0; - virtual void optimize_self(OptimizePass pass); + static Blueprint::UP optimize(Blueprint::UP bp, bool sort_by_cost); + virtual void optimize(Blueprint* &self, OptimizePass pass, bool sort_by_cost) = 0; + virtual void optimize_self(OptimizePass pass, bool sort_by_cost); virtual Blueprint::UP get_replacement(); virtual bool should_optimize_children() const { return true; } @@ -376,7 +390,7 @@ public: void setDocIdLimit(uint32_t limit) noexcept final; - void optimize(Blueprint* &self, OptimizePass pass) final; + void optimize(Blueprint* &self, OptimizePass pass, bool sort_by_cost) final; void set_global_filter(const GlobalFilter &global_filter, double estimated_hit_ratio) override; IndexList find(const IPredicate & check) const; @@ -393,7 +407,7 @@ public: virtual double calculate_cost() const = 0; virtual HitEstimate combine(const std::vector &data) const = 0; virtual FieldSpecBaseList exposeFields() const = 0; - virtual void sort(Children &children) const = 0; + virtual void sort(Children &children, bool sort_by_cost) const = 0; virtual bool inheritStrict(size_t i) const = 0; virtual SearchIteratorUP createIntermediateSearch(MultiSearch::Children subSearches, @@ -413,7 +427,7 @@ class LeafBlueprint : public Blueprint private: State _state; protected: - void optimize(Blueprint* &self, OptimizePass pass) final; + void optimize(Blueprint* &self, OptimizePass pass, bool sort_by_cost) final; void setEstimate(HitEstimate est) { _state.estimate(est); _state.relative_estimate(calculate_relative_estimate()); diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp index 364602cba03..a3f32c6eb7b 100644 --- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp @@ -33,7 +33,7 @@ size_t lookup_create_source(std::vector > &sources, } template -void optimize_source_blenders(IntermediateBlueprint &self, size_t begin_idx) { +void optimize_source_blenders(IntermediateBlueprint &self, size_t begin_idx, bool sort_by_cost) { std::vector source_blenders; const SourceBlenderBlueprint * reference = nullptr; for (size_t i = begin_idx; i < self.childCnt(); ++i) { @@ -63,7 +63,7 @@ void optimize_source_blenders(IntermediateBlueprint &self, size_t begin_idx) { top->addChild(std::move(sources.back())); sources.pop_back(); } - blender_up = Blueprint::optimize(std::move(blender_up)); + blender_up = Blueprint::optimize(std::move(blender_up), sort_by_cost); self.addChild(std::move(blender_up)); } } @@ -114,7 +114,7 @@ AndNotBlueprint::exposeFields() const } void -AndNotBlueprint::optimize_self(OptimizePass pass) +AndNotBlueprint::optimize_self(OptimizePass pass, bool sort_by_cost) { if (childCnt() == 0) { return; @@ -152,7 +152,7 @@ AndNotBlueprint::optimize_self(OptimizePass pass) } } if (pass == OptimizePass::LAST) { - optimize_source_blenders(*this, 1); + optimize_source_blenders(*this, 1, sort_by_cost); } } @@ -166,10 +166,14 @@ AndNotBlueprint::get_replacement() } void -AndNotBlueprint::sort(Children &children) const +AndNotBlueprint::sort(Children &children, bool sort_by_cost) const { if (children.size() > 2) { - std::sort(children.begin() + 1, children.end(), TieredGreaterEstimate()); + if (sort_by_cost) { + std::sort(children.begin() + 1, children.end(), MinimalOrCost()); + } else { + std::sort(children.begin() + 1, children.end(), TieredGreaterEstimate()); + } } } @@ -231,7 +235,7 @@ AndBlueprint::exposeFields() const } void -AndBlueprint::optimize_self(OptimizePass pass) +AndBlueprint::optimize_self(OptimizePass pass, bool sort_by_cost) { if (pass == OptimizePass::FIRST) { for (size_t i = 0; i < childCnt(); ++i) { @@ -244,7 +248,7 @@ AndBlueprint::optimize_self(OptimizePass pass) } } if (pass == OptimizePass::LAST) { - optimize_source_blenders(*this, 0); + optimize_source_blenders(*this, 0, sort_by_cost); } } @@ -258,9 +262,13 @@ AndBlueprint::get_replacement() } void -AndBlueprint::sort(Children &children) const +AndBlueprint::sort(Children &children, bool sort_by_cost) const { - std::sort(children.begin(), children.end(), TieredLessEstimate()); + if (sort_by_cost) { + std::sort(children.begin(), children.end(), MinimalAndCost()); + } else { + std::sort(children.begin(), children.end(), TieredLessEstimate()); + } } bool @@ -344,7 +352,7 @@ OrBlueprint::exposeFields() const } void -OrBlueprint::optimize_self(OptimizePass pass) +OrBlueprint::optimize_self(OptimizePass pass, bool sort_by_cost) { if (pass == OptimizePass::FIRST) { for (size_t i = 0; (childCnt() > 1) && (i < childCnt()); ++i) { @@ -359,7 +367,7 @@ OrBlueprint::optimize_self(OptimizePass pass) } } if (pass == OptimizePass::LAST) { - optimize_source_blenders(*this, 0); + optimize_source_blenders(*this, 0, sort_by_cost); } } @@ -373,9 +381,13 @@ OrBlueprint::get_replacement() } void -OrBlueprint::sort(Children &children) const +OrBlueprint::sort(Children &children, bool sort_by_cost) const { - std::sort(children.begin(), children.end(), TieredGreaterEstimate()); + if (sort_by_cost) { + std::sort(children.begin(), children.end(), MinimalOrCost()); + } else { + std::sort(children.begin(), children.end(), TieredGreaterEstimate()); + } } bool @@ -452,7 +464,7 @@ WeakAndBlueprint::exposeFields() const } void -WeakAndBlueprint::sort(Children &) const +WeakAndBlueprint::sort(Children &, bool) const { // order needs to stay the same as _weights } @@ -516,9 +528,13 @@ NearBlueprint::exposeFields() const } void -NearBlueprint::sort(Children &children) const +NearBlueprint::sort(Children &children, bool sort_by_cost) const { - std::sort(children.begin(), children.end(), TieredLessEstimate()); + if (sort_by_cost) { + std::sort(children.begin(), children.end(), MinimalAndCost()); + } else { + std::sort(children.begin(), children.end(), TieredLessEstimate()); + } } bool @@ -579,10 +595,9 @@ ONearBlueprint::exposeFields() const } void -ONearBlueprint::sort(Children &children) const +ONearBlueprint::sort(Children &, bool) const { // ordered near cannot sort children here - (void)children; } bool @@ -648,7 +663,7 @@ RankBlueprint::exposeFields() const } void -RankBlueprint::optimize_self(OptimizePass pass) +RankBlueprint::optimize_self(OptimizePass pass, bool sort_by_cost) { if (pass == OptimizePass::FIRST) { for (size_t i = 1; i < childCnt(); ++i) { @@ -658,7 +673,7 @@ RankBlueprint::optimize_self(OptimizePass pass) } } if (pass == OptimizePass::LAST) { - optimize_source_blenders(*this, 1); + optimize_source_blenders(*this, 1, sort_by_cost); } } @@ -672,9 +687,8 @@ RankBlueprint::get_replacement() } void -RankBlueprint::sort(Children &children) const +RankBlueprint::sort(Children &, bool) const { - (void)children; } bool @@ -751,7 +765,7 @@ SourceBlenderBlueprint::exposeFields() const } void -SourceBlenderBlueprint::sort(Children &) const +SourceBlenderBlueprint::sort(Children &, bool) const { } diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h index 14672c2a5cd..ff4360c0f15 100644 --- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h +++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h @@ -19,10 +19,10 @@ public: double calculate_relative_estimate() const final; HitEstimate combine(const std::vector &data) const override; FieldSpecBaseList exposeFields() const override; - void optimize_self(OptimizePass pass) override; + void optimize_self(OptimizePass pass, bool sort_by_cost) override; AndNotBlueprint * asAndNot() noexcept final { return this; } Blueprint::UP get_replacement() override; - void sort(Children &children) const override; + void sort(Children &children, bool sort_by_cost) const override; bool inheritStrict(size_t i) const override; SearchIterator::UP createIntermediateSearch(MultiSearch::Children subSearches, @@ -47,10 +47,10 @@ public: double calculate_relative_estimate() const final; HitEstimate combine(const std::vector &data) const override; FieldSpecBaseList exposeFields() const override; - void optimize_self(OptimizePass pass) override; + void optimize_self(OptimizePass pass, bool sort_by_cost) override; AndBlueprint * asAnd() noexcept final { return this; } Blueprint::UP get_replacement() override; - void sort(Children &children) const override; + void sort(Children &children, bool sort_by_cost) const override; bool inheritStrict(size_t i) const override; SearchIterator::UP createIntermediateSearch(MultiSearch::Children subSearches, @@ -73,10 +73,10 @@ public: double calculate_relative_estimate() const final; HitEstimate combine(const std::vector &data) const override; FieldSpecBaseList exposeFields() const override; - void optimize_self(OptimizePass pass) override; + void optimize_self(OptimizePass pass, bool sort_by_cost) override; OrBlueprint * asOr() noexcept final { return this; } Blueprint::UP get_replacement() override; - void sort(Children &children) const override; + void sort(Children &children, bool sort_by_cost) const override; bool inheritStrict(size_t i) const override; SearchIterator::UP createIntermediateSearch(MultiSearch::Children subSearches, @@ -101,7 +101,7 @@ public: double calculate_relative_estimate() const final; HitEstimate combine(const std::vector &data) const override; FieldSpecBaseList exposeFields() const override; - void sort(Children &children) const override; + void sort(Children &children, bool sort_by_cost) const override; bool inheritStrict(size_t i) const override; bool always_needs_unpack() const override; WeakAndBlueprint * asWeakAnd() noexcept final { return this; } @@ -133,7 +133,7 @@ public: HitEstimate combine(const std::vector &data) const override; FieldSpecBaseList exposeFields() const override; bool should_optimize_children() const override { return false; } - void sort(Children &children) const override; + void sort(Children &children, bool sort_by_cost) const override; bool inheritStrict(size_t i) const override; SearchIteratorUP createSearch(fef::MatchData &md, bool strict) const override; SearchIterator::UP @@ -157,7 +157,7 @@ public: HitEstimate combine(const std::vector &data) const override; FieldSpecBaseList exposeFields() const override; bool should_optimize_children() const override { return false; } - void sort(Children &children) const override; + void sort(Children &children, bool sort_by_cost) const override; bool inheritStrict(size_t i) const override; SearchIteratorUP createSearch(fef::MatchData &md, bool strict) const override; SearchIterator::UP @@ -177,9 +177,9 @@ public: double calculate_relative_estimate() const final; HitEstimate combine(const std::vector &data) const override; FieldSpecBaseList exposeFields() const override; - void optimize_self(OptimizePass pass) override; + void optimize_self(OptimizePass pass, bool sort_by_cost) override; Blueprint::UP get_replacement() override; - void sort(Children &children) const override; + void sort(Children &children, bool sort_by_cost) const override; bool inheritStrict(size_t i) const override; bool isRank() const noexcept final { return true; } SearchIterator::UP @@ -206,7 +206,7 @@ public: double calculate_relative_estimate() const final; HitEstimate combine(const std::vector &data) const override; FieldSpecBaseList exposeFields() const override; - void sort(Children &children) const override; + void sort(Children &children, bool sort_by_cost) const override; bool inheritStrict(size_t i) const override; SearchIterator::UP createIntermediateSearch(MultiSearch::Children subSearches, diff --git a/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp index f0c75173671..bd677289218 100644 --- a/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp @@ -45,7 +45,7 @@ SameElementBlueprint::addTerm(Blueprint::UP term) } void -SameElementBlueprint::optimize_self(OptimizePass pass) +SameElementBlueprint::optimize_self(OptimizePass pass, bool) { if (pass == OptimizePass::LAST) { std::sort(_terms.begin(), _terms.end(), diff --git a/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.h b/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.h index 6a988e67149..06c20339e81 100644 --- a/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.h +++ b/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.h @@ -34,7 +34,7 @@ public: // used by create visitor void addTerm(Blueprint::UP term); - void optimize_self(OptimizePass pass) override; + void optimize_self(OptimizePass pass, bool sort_by_cost) override; void fetchPostings(const ExecuteInfo &execInfo) override; std::unique_ptr create_same_element_search(search::fef::TermFieldMatchData& tfmd, bool strict) const; -- cgit v1.2.3