diff options
Diffstat (limited to 'searchlib')
111 files changed, 2636 insertions, 847 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index a5453ac5273..43e17417c51 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -60,6 +60,7 @@ vespa_define_module( src/apps/vespa-attribute-inspect src/apps/vespa-fileheader-inspect src/apps/vespa-index-inspect + src/apps/vespa-query-analyzer src/apps/vespa-ranking-expression-analyzer TESTS @@ -140,6 +141,7 @@ vespa_define_module( src/tests/features/element_completeness src/tests/features/element_similarity_feature src/tests/features/euclidean_distance + src/tests/features/first_phase_rank src/tests/features/imported_dot_product src/tests/features/internal_max_reduce_prod_join_feature src/tests/features/item_raw_score diff --git a/searchlib/src/apps/vespa-query-analyzer/.gitignore b/searchlib/src/apps/vespa-query-analyzer/.gitignore new file mode 100644 index 00000000000..e5a31caab09 --- /dev/null +++ b/searchlib/src/apps/vespa-query-analyzer/.gitignore @@ -0,0 +1,3 @@ +/.depend +/Makefile +/vespa-query-analyzer diff --git a/searchlib/src/apps/vespa-query-analyzer/CMakeLists.txt b/searchlib/src/apps/vespa-query-analyzer/CMakeLists.txt new file mode 100644 index 00000000000..f84a413ee70 --- /dev/null +++ b/searchlib/src/apps/vespa-query-analyzer/CMakeLists.txt @@ -0,0 +1,9 @@ +# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_vespa-query-analyzer_app + SOURCES + vespa-query-analyzer.cpp + OUTPUT_NAME vespa-query-analyzer + INSTALL bin + DEPENDS + searchlib +) diff --git a/searchlib/src/apps/vespa-query-analyzer/vespa-query-analyzer.cpp b/searchlib/src/apps/vespa-query-analyzer/vespa-query-analyzer.cpp new file mode 100644 index 00000000000..178c09c02ac --- /dev/null +++ b/searchlib/src/apps/vespa-query-analyzer/vespa-query-analyzer.cpp @@ -0,0 +1,361 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/data/simple_buffer.h> +#include <vespa/vespalib/data/slime/json_format.h> +#include <vespa/vespalib/data/slime/slime.h> +#include <vespa/vespalib/io/mapped_file_input.h> +#include <vespa/vespalib/util/overload.h> +#include <vespa/vespalib/util/signalhandler.h> +#include <vespa/vespalib/util/stringfmt.h> +#include <vespa/searchlib/queryeval/flow.h> +#include <variant> +#include <vector> +#include <map> + +using namespace vespalib::slime::convenience; +using vespalib::make_string_short::fmt; +using vespalib::slime::JsonFormat; +using vespalib::slime::ARRAY; +using vespalib::slime::OBJECT; +using vespalib::slime::STRING; +using vespalib::slime::DOUBLE; +using vespalib::slime::BOOL; +using search::queryeval::FlowStats; +using search::queryeval::InFlow; + +//----------------------------------------------------------------------------- + +using Path = std::vector<std::variant<size_t,vespalib::stringref>>; +using Paths = std::vector<Path>; + +template <typename F> +struct Matcher : vespalib::slime::ObjectTraverser { + Path path; + Paths result; + F match; + ~Matcher(); + Matcher(F match_in) noexcept : path(), result(), match(match_in) {} + void search(const Inspector &node) { + if (path.empty() && match(path, node)) { + result.push_back(path); + } + if (node.type() == OBJECT()) { + node.traverse(*this); + } + if (node.type() == ARRAY()) { + size_t size = node.entries(); + for (size_t i = 0; i < size; ++i) { + path.emplace_back(i); + if (match(path, node[i])) { + result.push_back(path); + } + search(node[i]); + path.pop_back(); + } + } + } + void field(const Memory &symbol, const Inspector &inspector) final { + path.emplace_back(symbol.make_stringref()); + if (match(path, inspector)) { + result.push_back(path); + } + search(inspector); + path.pop_back(); + } +}; +template <typename F> Matcher<F>::~Matcher() = default; + +std::vector<Path> find_field(const Inspector &root, const vespalib::string &name) { + auto matcher = Matcher([&](const Path &path, const Inspector &){ + return ((path.size() > 0) && + (std::holds_alternative<vespalib::stringref>(path.back())) && + (std::get<vespalib::stringref>(path.back()) == name)); + }); + matcher.search(root); + return matcher.result; +} + +std::vector<Path> find_tag(const Inspector &root, const vespalib::string &name) { + auto matcher = Matcher([&](const Path &path, const Inspector &value){ + return ((path.size() > 0) && + (std::holds_alternative<vespalib::stringref>(path.back())) && + (std::get<vespalib::stringref>(path.back()) == "tag") && + (value.asString().make_stringref() == name)); + }); + matcher.search(root); + return matcher.result; +} + +vespalib::string path_to_str(const Path &path) { + size_t cnt = 0; + vespalib::string str("["); + for (const auto &item: path) { + if (cnt++ > 0) { + str.append(","); + } + std::visit(vespalib::overload{ + [&str](size_t value)noexcept{ str.append(fmt("%zu", value)); }, + [&str](vespalib::stringref value)noexcept{ str.append(value); }}, item); + } + str.append("]"); + return str; +} + +vespalib::string strip_name(vespalib::stringref name) { + auto end = name.find("<"); + auto ns = name.rfind("::", end); + size_t begin = (ns > name.size()) ? 0 : ns + 2; + return name.substr(begin, end - begin); +} + +const Inspector &apply_path(const Inspector &node, const Path &path, size_t max = -1) { + size_t cnt = 0; + const Inspector *ptr = &node; + for (const auto &elem: path) { + if (cnt++ >= max) { + return *ptr; + } + if (std::holds_alternative<size_t>(elem)) { + ptr = &((*ptr)[std::get<size_t>(elem)]); + } + if (std::holds_alternative<vespalib::stringref>(elem)) { + auto ref = std::get<vespalib::stringref>(elem); + ptr = &((*ptr)[Memory(ref.data(), ref.size())]); + } + } + return *ptr; +} + +void extract(vespalib::string &value, const Inspector &data) { + if (data.valid() && data.type() == STRING()) { + value = data.asString().make_stringref(); + } +} + +struct Sample { + enum class Type { INVALID, INIT, SEEK, UNPACK, TERMWISE }; + Type type = Type::INVALID; + std::vector<size_t> path; + double self_time_ms = 0.0; + double total_time_ms = 0.0; + size_t count = 0; + Sample(const Inspector &sample) { + auto name = sample["name"].asString().make_stringref(); + if (ends_with(name, "/init")) { + type = Type::INIT; + } + if (ends_with(name, "/seek")) { + type = Type::SEEK; + } + if (ends_with(name, "/unpack")) { + type = Type::UNPACK; + } + if (ends_with(name, "/termwise")) { + type = Type::TERMWISE; + } + if (starts_with(name, "/")) { + size_t child = 0; + for (size_t pos = 1; pos < name.size(); ++pos) { + char c = name[pos]; + if (c == '/') { + path.push_back(child); + child = 0; + } else { + if (c < '0' || c > '9') { + break; + } + child = child * 10 + (c - '0'); + } + } + } + self_time_ms = sample["self_time_ms"].asDouble(); + total_time_ms = sample["total_time_ms"].asDouble(); + count = sample["count"].asLong(); + } + static vespalib::string type_to_str(Type type) { + switch(type) { + case Type::INVALID: return "<invalid>"; + case Type::INIT: return "init"; + case Type::SEEK: return "seek"; + case Type::UNPACK: return "unpack"; + case Type::TERMWISE: return "termwise"; + } + abort(); + } + static vespalib::string path_to_str(const std::vector<size_t> &path) { + vespalib::string result("/"); + for (size_t elem: path) { + result += fmt("%zu/", elem); + } + return result; + } + vespalib::string to_string() const { + return fmt("type: %s, path: %s, count: %zu, total_time_ms: %g\n", + type_to_str(type).c_str(), path_to_str(path).c_str(), count, total_time_ms); + } +}; + +struct Node { + vespalib::string type = "unknown"; + bool strict = false; + FlowStats flow_stats = FlowStats(0.0, 0.0, 0.0); + InFlow in_flow = InFlow(0.0); + size_t count = 0; + double self_time_ms = 0.0; + double total_time_ms = 0.0; + std::vector<Node> children; + Node(const Inspector &obj) { + extract(type, obj["[type]"]); + type = strip_name(type); + strict = obj["strict"].asBool(); + flow_stats.estimate = obj["relative_estimate"].asDouble(); + flow_stats.cost = obj["cost"].asDouble(); + flow_stats.strict_cost = obj["strict_cost"].asDouble(); + const Inspector &list = obj["children"]; + for (size_t i = 0; true; ++i) { + const Inspector &child = list[fmt("[%zu]", i)]; + if (child.valid()) { + children.emplace_back(child); + } else { + break; + } + } + } + ~Node(); + void add_sample(const Sample &sample) { + Node *node = this; + for (size_t child: sample.path) { + if (child < node->children.size()) { + node = &node->children[child]; + } else { + fprintf(stderr, "... ignoring bad sample: %s\n", sample.to_string().c_str()); + return; + } + } + node->count += sample.count; + node->self_time_ms += sample.self_time_ms; + node->total_time_ms += sample.total_time_ms; + } + void dump_line(size_t indent) const { + fprintf(stderr, "|%10zu ", count); + fprintf(stderr, "|%11.3f ", total_time_ms); + fprintf(stderr, "|%10.3f | ", self_time_ms); + for (size_t i = 0; i < indent; ++i) { + fprintf(stderr, " "); + } + fprintf(stderr, "%s\n", type.c_str()); + for (const Node &child: children) { + child.dump_line(indent + 1); + } + } + void dump() const { + fprintf(stderr, "| count | total_time | self_time | structure\n"); + fprintf(stderr, "+-----------+------------+-----------+-------------------------------\n"); + dump_line(0); + fprintf(stderr, "+-----------+------------+-----------+-------------------------------\n"); + } +}; +Node::~Node() = default; + +void each_sample_list(const Inspector &list, auto f) { + for (size_t i = 0; i < list.entries(); ++i) { + f(Sample(list[i])); + each_sample_list(list[i]["children"], f); + } +} + +void each_sample(const Inspector &prof, auto f) { + each_sample_list(prof["roots"], f); +} + +struct State { + void analyze(const Inspector &root) { + auto bp_list = find_field(root, "optimized"); + for (const Path &path: bp_list) { + const Inspector &node = apply_path(root, path, path.size()-3); + const Inspector &key_field = node["distribution-key"]; + if (key_field.valid()) { + int key = key_field.asLong(); + Node data(apply_path(root, path)); + auto prof_list = find_tag(node, "match_profiling"); + double total_ms = 0.0; + std::map<Sample::Type,double> time_map; + for (const Path &prof_path: prof_list) { + const Inspector &prof = apply_path(node, prof_path, prof_path.size()-1); + if (prof["profiler"].asString().make_stringref() == "tree") { + total_ms += prof["total_time_ms"].asDouble(); + each_sample(prof, [&](const Sample &sample) { + if (sample.type == Sample::Type::SEEK) { + data.add_sample(sample); + } + if (sample.path.empty()) { + time_map[sample.type] += sample.total_time_ms; + } + }); + } + } + data.dump(); + fprintf(stderr, "distribution key: %d, total_time_ms: %g\n", key, total_ms); + for (auto [type, time]: time_map) { + fprintf(stderr, "sample type %s used %g ms total\n", Sample::type_to_str(type).c_str(), time); + } + } + } + } +}; + +//----------------------------------------------------------------------------- + +void usage(const char *self) { + fprintf(stderr, "usage: %s <json query result file>\n", self); + fprintf(stderr, " analyze query cost (planning vs profiling)\n"); + fprintf(stderr, " query result must contain optimized blueprint dump\n"); + fprintf(stderr, " query result must contain match phase tree profiling\n\n"); +} + +struct MyApp { + vespalib::string file_name; + bool parse_params(int argc, char **argv); + int main(); +}; + +bool +MyApp::parse_params(int argc, char **argv) { + if (argc != 2) { + return false; + } + file_name = argv[1]; + return true; +} + +int +MyApp::main() +{ + vespalib::MappedFileInput file(file_name); + if (!file.valid()) { + fprintf(stderr, "could not read input file: '%s'\n", + file_name.c_str()); + return 1; + } + Slime slime; + if(JsonFormat::decode(file, slime) == 0) { + fprintf(stderr, "file contains invalid json: '%s'\n", + file_name.c_str()); + return 1; + } + State state; + state.analyze(slime.get()); + return 0; +} + +int main(int argc, char **argv) { + MyApp my_app; + vespalib::SignalHandler::PIPE.ignore(); + if (!my_app.parse_params(argc, argv)) { + usage(argv[0]); + return 1; + } + return my_app.main(); +} + +//----------------------------------------------------------------------------- diff --git a/searchlib/src/tests/attribute/bitvector_search_cache/bitvector_search_cache_test.cpp b/searchlib/src/tests/attribute/bitvector_search_cache/bitvector_search_cache_test.cpp index d51ec22a54a..1d66eefaff7 100644 --- a/searchlib/src/tests/attribute/bitvector_search_cache/bitvector_search_cache_test.cpp +++ b/searchlib/src/tests/attribute/bitvector_search_cache/bitvector_search_cache_test.cpp @@ -3,6 +3,7 @@ #include <vespa/vespalib/testkit/test_kit.h> #include <vespa/searchlib/attribute/bitvector_search_cache.h> #include <vespa/searchlib/common/bitvector.h> +#include <vespa/vespalib/util/memoryusage.h> using namespace search; using namespace search::attribute; @@ -31,9 +32,13 @@ struct Fixture { TEST_F("require that bit vectors can be inserted and retrieved", Fixture) { EXPECT_EQUAL(0u, f.cache.size()); + auto old_mem_usage = f.cache.get_memory_usage(); f.cache.insert("foo", f.entry1); f.cache.insert("bar", f.entry2); EXPECT_EQUAL(2u, f.cache.size()); + auto new_mem_usage = f.cache.get_memory_usage(); + EXPECT_LESS(old_mem_usage.usedBytes(), new_mem_usage.usedBytes()); + EXPECT_LESS(old_mem_usage.allocatedBytes(), new_mem_usage.allocatedBytes()); EXPECT_EQUAL(f.entry1, f.cache.find("foo")); EXPECT_EQUAL(f.entry2, f.cache.find("bar")); @@ -43,9 +48,13 @@ TEST_F("require that bit vectors can be inserted and retrieved", Fixture) TEST_F("require that insert() doesn't replace existing bit vector", Fixture) { f.cache.insert("foo", f.entry1); + auto old_mem_usage = f.cache.get_memory_usage(); f.cache.insert("foo", f.entry2); + auto new_mem_usage = f.cache.get_memory_usage(); EXPECT_EQUAL(1u, f.cache.size()); EXPECT_EQUAL(f.entry1, f.cache.find("foo")); + EXPECT_EQUAL(old_mem_usage.usedBytes(), new_mem_usage.usedBytes()); + EXPECT_EQUAL(old_mem_usage.allocatedBytes(), new_mem_usage.allocatedBytes()); } TEST_F("require that cache can be cleared", Fixture) @@ -53,11 +62,15 @@ TEST_F("require that cache can be cleared", Fixture) f.cache.insert("foo", f.entry1); f.cache.insert("bar", f.entry2); EXPECT_EQUAL(2u, f.cache.size()); + auto old_mem_usage = f.cache.get_memory_usage(); f.cache.clear(); + auto new_mem_usage = f.cache.get_memory_usage(); EXPECT_EQUAL(0u, f.cache.size()); EXPECT_TRUE(f.cache.find("foo").get() == nullptr); EXPECT_TRUE(f.cache.find("bar").get() == nullptr); + EXPECT_GREATER(old_mem_usage.usedBytes(), new_mem_usage.usedBytes()); + EXPECT_GREATER(old_mem_usage.allocatedBytes(), new_mem_usage.allocatedBytes()); } TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/tests/attribute/imported_search_context/imported_search_context_test.cpp b/searchlib/src/tests/attribute/imported_search_context/imported_search_context_test.cpp index 41ec377dece..7c38c322bc8 100644 --- a/searchlib/src/tests/attribute/imported_search_context/imported_search_context_test.cpp +++ b/searchlib/src/tests/attribute/imported_search_context/imported_search_context_test.cpp @@ -508,6 +508,7 @@ assertBitVector(const std::vector<uint32_t> &expDocIds, const BitVector &bitVect TEST_F("Entry is inserted into search cache if bit vector posting list is used", SearchCacheFixture) { EXPECT_EQUAL(0u, f.imported_attr->getSearchCache()->size()); + auto old_mem_usage = f.imported_attr->get_memory_usage(); auto ctx = f.create_context(word_term("5678")); ctx->fetchPostings(queryeval::ExecuteInfo::FULL, true); TermFieldMatchData match; @@ -515,6 +516,9 @@ TEST_F("Entry is inserted into search cache if bit vector posting list is used", TEST_DO(f.assertSearch({3, 5}, *iter)); EXPECT_EQUAL(1u, f.imported_attr->getSearchCache()->size()); + auto new_mem_usage = f.imported_attr->get_memory_usage(); + EXPECT_LESS(old_mem_usage.usedBytes(), new_mem_usage.usedBytes()); + EXPECT_LESS(old_mem_usage.allocatedBytes(), new_mem_usage.allocatedBytes()); auto cacheEntry = f.imported_attr->getSearchCache()->find("5678"); EXPECT_EQUAL(cacheEntry->docIdLimit, f.get_imported_attr()->getNumDocs()); TEST_DO(assertBitVector({3, 5}, *cacheEntry->bitVector)); diff --git a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp index b1b2235165f..cce72837dad 100644 --- a/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp +++ b/searchlib/src/tests/attribute/tensorattribute/tensorattribute_test.cpp @@ -210,8 +210,10 @@ public: } void expect_entry(uint32_t exp_docid, const DoubleVector& exp_vector, const EntryVector& entries) const { EXPECT_EQUAL(1u, entries.size()); - EXPECT_EQUAL(exp_docid, entries.back().first); - EXPECT_EQUAL(exp_vector, entries.back().second); + if (entries.size() >= 1u) { + EXPECT_EQUAL(exp_docid, entries.back().first); + EXPECT_EQUAL(exp_vector, entries.back().second); + } } void expect_add(uint32_t exp_docid, const DoubleVector& exp_vector) const { expect_entry(exp_docid, exp_vector, _adds); @@ -329,6 +331,10 @@ public: static search::tensor::DistanceFunctionFactory::UP my_dist_fun = search::tensor::make_distance_function_factory(search::attribute::DistanceMetric::Euclidean, vespalib::eval::CellType::DOUBLE); return *my_dist_fun; } + + uint32_t check_consistency(uint32_t) const noexcept override { + return 0; + } }; class MockNearestNeighborIndexFactory : public NearestNeighborIndexFactory { @@ -1077,8 +1083,22 @@ TEST_F("Populates address space usage in mixed tensor attribute with hnsw index" class DenseTensorAttributeMockIndex : public Fixture { public: DenseTensorAttributeMockIndex() : Fixture(vec_2d_spec, FixtureTraits().mock_hnsw()) {} + void add_vec_a(); }; +void +DenseTensorAttributeMockIndex::add_vec_a() +{ + auto& index = mock_index(); + auto vec_a = vec_2d(3, 5); + auto prepare_result = prepare_set_tensor(1, vec_a); + index.expect_prepare_add(1, {3, 5}); + complete_set_tensor(1, vec_a, std::move(prepare_result)); + assertGetTensor(vec_a, 1); + index.expect_complete_add(1, {3, 5}); + index.clear(); +} + TEST_F("setTensor() updates nearest neighbor index", DenseTensorAttributeMockIndex) { auto& index = f.mock_index(); @@ -1097,15 +1117,7 @@ TEST_F("setTensor() updates nearest neighbor index", DenseTensorAttributeMockInd TEST_F("nearest neighbor index can be updated in two phases", DenseTensorAttributeMockIndex) { auto& index = f.mock_index(); - { - auto vec_a = vec_2d(3, 5); - auto prepare_result = f.prepare_set_tensor(1, vec_a); - index.expect_prepare_add(1, {3, 5}); - f.complete_set_tensor(1, vec_a, std::move(prepare_result)); - f.assertGetTensor(vec_a, 1); - index.expect_complete_add(1, {3, 5}); - } - index.clear(); + f.add_vec_a(); { // Replaces previous value. auto vec_b = vec_2d(7, 9); @@ -1121,15 +1133,7 @@ TEST_F("nearest neighbor index can be updated in two phases", DenseTensorAttribu TEST_F("nearest neighbor index is NOT updated when tensor value is unchanged", DenseTensorAttributeMockIndex) { auto& index = f.mock_index(); - { - auto vec_a = vec_2d(3, 5); - auto prepare_result = f.prepare_set_tensor(1, vec_a); - index.expect_prepare_add(1, {3, 5}); - f.complete_set_tensor(1, vec_a, std::move(prepare_result)); - f.assertGetTensor(vec_a, 1); - index.expect_complete_add(1, {3, 5}); - } - index.clear(); + f.add_vec_a(); { // Replaces previous value with the same value auto vec_b = vec_2d(3, 5); @@ -1139,6 +1143,39 @@ TEST_F("nearest neighbor index is NOT updated when tensor value is unchanged", D f.complete_set_tensor(1, vec_b, std::move(prepare_result)); f.assertGetTensor(vec_b, 1); index.expect_empty_complete_add(); + index.expect_empty_add(); + } +} + +TEST_F("nearest neighbor index is updated when value changes from A to B to A", DenseTensorAttributeMockIndex) +{ + auto& index = f.mock_index(); + f.add_vec_a(); + { + // Prepare replace of A with B + auto vec_b = vec_2d(7, 9); + auto prepare_result_b = f.prepare_set_tensor(1, vec_b); + index.expect_prepare_add(1, {7, 9}); + index.clear(); + // Prepare replace of B with A, but prepare sees original A + auto vec_a = vec_2d(3, 5); + auto prepare_result_a = f.prepare_set_tensor(1, vec_a); + EXPECT_TRUE(prepare_result_a.get() == nullptr); + index.expect_empty_prepare_add(); + index.clear(); + // Complete set B + f.complete_set_tensor(1, vec_b, std::move(prepare_result_b)); + index.expect_remove(1, {3, 5}); + f.assertGetTensor(vec_b, 1); + index.expect_complete_add(1, {7, 9}); + index.expect_empty_add(); + index.clear(); + // Complete set A, no prepare result but tensor cells changed + f.complete_set_tensor(1, vec_a, std::move(prepare_result_a)); + index.expect_remove(1, {7, 9}); + index.expect_empty_complete_add(); + index.expect_add(1, {3, 5}); + f.assertGetTensor(vec_a, 1); } } diff --git a/searchlib/src/tests/features/first_phase_rank/CMakeLists.txt b/searchlib/src/tests/features/first_phase_rank/CMakeLists.txt new file mode 100644 index 00000000000..5aa83399d3d --- /dev/null +++ b/searchlib/src/tests/features/first_phase_rank/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +vespa_add_executable(searchlib_features_first_phase_rank_test_app TEST + SOURCES + first_phase_rank_test.cpp + DEPENDS + searchlib + searchlib_test + GTest::GTest +) +vespa_add_test(NAME searchlib_features_first_phase_rank_test_app COMMAND searchlib_features_first_phase_rank_test_app) diff --git a/searchlib/src/tests/features/first_phase_rank/first_phase_rank_test.cpp b/searchlib/src/tests/features/first_phase_rank/first_phase_rank_test.cpp new file mode 100644 index 00000000000..01ba6c36124 --- /dev/null +++ b/searchlib/src/tests/features/first_phase_rank/first_phase_rank_test.cpp @@ -0,0 +1,143 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/searchlib/features/first_phase_rank_feature.h> +#include <vespa/searchlib/features/setup.h> +#include <vespa/searchlib/fef/blueprintfactory.h> +#include <vespa/searchlib/fef/test/dummy_dependency_handler.h> +#include <vespa/searchlib/fef/test/indexenvironment.h> +#include <vespa/searchlib/fef/test/indexenvironmentbuilder.h> +#define ENABLE_GTEST_MIGRATION +#include <vespa/searchlib/test/ft_test_app_base.h> +#include <vespa/vespalib/gtest/gtest.h> + +using search::features::FirstPhaseRankBlueprint; +using search::features::FirstPhaseRankLookup; +using search::features::setup_search_features; +using search::fef::Blueprint; +using search::fef::BlueprintFactory; +using search::fef::ObjectStore; +using search::fef::test::IndexEnvironment; +using search::fef::test::DummyDependencyHandler; +using StringVector = std::vector<vespalib::string>; + +constexpr feature_t unranked = std::numeric_limits<feature_t>::max(); + +struct FirstPhaseRankBlueprintTest : public ::testing::Test { + BlueprintFactory factory; + IndexEnvironment index_env; + + FirstPhaseRankBlueprintTest() + : ::testing::Test(), + factory(), + index_env() + { + setup_search_features(factory); + } + + ~FirstPhaseRankBlueprintTest() override; + + std::shared_ptr<Blueprint> make_blueprint() const { + return factory.createBlueprint("firstPhaseRank"); + } + + void expect_setup_fail(const StringVector& params, const vespalib::string& exp_fail_msg) { + auto blueprint = make_blueprint(); + DummyDependencyHandler deps(*blueprint); + EXPECT_FALSE(blueprint->setup(index_env, params)); + EXPECT_EQ(exp_fail_msg, deps.fail_msg); + } + + std::shared_ptr<Blueprint> expect_setup_succeed(const StringVector& params) { + auto blueprint = make_blueprint(); + DummyDependencyHandler deps(*blueprint); + EXPECT_TRUE(blueprint->setup(index_env, params)); + EXPECT_EQ(0, deps.input.size()); + EXPECT_EQ(StringVector({"score"}), deps.output); + return blueprint; + } +}; + +FirstPhaseRankBlueprintTest::~FirstPhaseRankBlueprintTest() = default; + +TEST_F(FirstPhaseRankBlueprintTest, blueprint_can_be_created_from_factory) +{ + auto bp = make_blueprint(); + EXPECT_TRUE(bp); + EXPECT_TRUE(dynamic_pointer_cast<FirstPhaseRankBlueprint>(bp)); +} + +TEST_F(FirstPhaseRankBlueprintTest, blueprint_setup_fails_when_parameter_list_is_not_empty) +{ + expect_setup_fail({"is"}, + "The parameter list used for setting up rank feature firstPhaseRank is not valid: " + "Expected 0 parameter(s), but got 1"); +} + +TEST_F(FirstPhaseRankBlueprintTest, blueprint_setup_succeeds) +{ + expect_setup_succeed({}); +} + +TEST_F(FirstPhaseRankBlueprintTest, blueprint_can_prepare_shared_state) +{ + auto blueprint = expect_setup_succeed({}); + search::fef::test::QueryEnvironment query_env; + ObjectStore store; + EXPECT_EQ(nullptr, FirstPhaseRankLookup::get_mutable_shared_state(store)); + EXPECT_EQ(nullptr, FirstPhaseRankLookup::get_shared_state(store)); + blueprint->prepareSharedState(query_env, store); + EXPECT_NE(nullptr, FirstPhaseRankLookup::get_mutable_shared_state(store)); + EXPECT_NE(nullptr, FirstPhaseRankLookup::get_shared_state(store)); +} + +TEST_F(FirstPhaseRankBlueprintTest, dump_features) +{ + FtTestAppBase::FT_DUMP_EMPTY(factory, "firstPhaseRank", index_env); +} + +struct FirstPhaseRankExecutorTest : public ::testing::Test { + BlueprintFactory factory; + FtFeatureTest test; + + FirstPhaseRankExecutorTest() + : ::testing::Test(), + factory(), + test(factory, "firstPhaseRank") + { + setup_search_features(factory); + } + ~FirstPhaseRankExecutorTest() override; + void setup(std::vector<std::pair<uint32_t,uint32_t>> ranks) { + EXPECT_TRUE(test.setup()); + auto* lookup = FirstPhaseRankLookup::get_mutable_shared_state(test.getQueryEnv().getObjectStore()); + ASSERT_NE(nullptr, lookup); + for (auto& entry : ranks) { + lookup->add(entry.first, entry.second); + } + } + bool execute(feature_t exp_score, uint32_t docid) { + return test.execute(exp_score, 0.000001, docid); + } +}; + +FirstPhaseRankExecutorTest::~FirstPhaseRankExecutorTest() = default; + +TEST_F(FirstPhaseRankExecutorTest, unranked_docid_gives_huge_output) +{ + setup({}); + EXPECT_TRUE(execute(unranked, 1)); +} + +TEST_F(FirstPhaseRankExecutorTest, ranked_docid_gives_expected_output) +{ + setup({{3, 5}, {7, 4}}); + EXPECT_TRUE(execute(unranked, 2)); + EXPECT_TRUE(execute(5, 3)); + EXPECT_TRUE(execute(unranked, 4)); + EXPECT_TRUE(execute(unranked, 5)); + EXPECT_TRUE(execute(unranked, 6)); + EXPECT_TRUE(execute(4, 7)); + EXPECT_TRUE(execute(unranked, 8)); +} + +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/tests/features/prod_features_test.cpp b/searchlib/src/tests/features/prod_features_test.cpp index 22105533895..fb00b4ff5e6 100644 --- a/searchlib/src/tests/features/prod_features_test.cpp +++ b/searchlib/src/tests/features/prod_features_test.cpp @@ -33,6 +33,7 @@ #include <vespa/searchlib/features/random_normal_stable_feature.h> #include <vespa/searchlib/features/randomfeature.h> #include <vespa/searchlib/features/rankingexpressionfeature.h> +#include <vespa/searchlib/features/second_phase_feature.h> #include <vespa/searchlib/features/setup.h> #include <vespa/searchlib/features/termfeature.h> #include <vespa/searchlib/features/utils.h> @@ -614,6 +615,32 @@ TEST_F(ProdFeaturesTest, test_first_phase) } } +TEST_F(ProdFeaturesTest, test_second_phase) +{ + { // Test blueprint. + SecondPhaseBlueprint pt; + + EXPECT_TRUE(assertCreateInstance(pt, "secondPhase")); + + FtIndexEnvironment ie; + ie.getProperties().add(indexproperties::rank::SecondPhase::NAME, "random"); + + StringList params, in, out; + FT_SETUP_OK(pt, ie, params, in.add("random"), out.add("score")); + FT_SETUP_FAIL(pt, params.add("foo")); + params.clear(); + + FT_DUMP_EMPTY(_factory, "secondPhase", ie); + } + + { // Test executor. + FtFeatureTest ft(_factory, "secondPhase"); + ft.getIndexEnv().getProperties().add(indexproperties::rank::SecondPhase::NAME, "value(11)"); + ASSERT_TRUE(ft.setup()); + ASSERT_TRUE(ft.execute(11.0f)); + } +} + TEST_F(ProdFeaturesTest, test_foreach) { { // Test blueprint. diff --git a/searchlib/src/tests/hitcollector/CMakeLists.txt b/searchlib/src/tests/hitcollector/CMakeLists.txt index 5cedbcbd7e6..cc62dd82af4 100644 --- a/searchlib/src/tests/hitcollector/CMakeLists.txt +++ b/searchlib/src/tests/hitcollector/CMakeLists.txt @@ -4,6 +4,7 @@ vespa_add_executable(searchlib_hitcollector_test_app TEST hitcollector_test.cpp DEPENDS searchlib + GTest::gtest ) vespa_add_test(NAME searchlib_hitcollector_test_app COMMAND searchlib_hitcollector_test_app) vespa_add_executable(searchlib_sorted_hit_sequence_test_app TEST @@ -11,5 +12,6 @@ vespa_add_executable(searchlib_sorted_hit_sequence_test_app TEST sorted_hit_sequence_test.cpp DEPENDS searchlib + GTest::gtest ) vespa_add_test(NAME searchlib_sorted_hit_sequence_test_app COMMAND searchlib_sorted_hit_sequence_test_app) diff --git a/searchlib/src/tests/hitcollector/hitcollector_test.cpp b/searchlib/src/tests/hitcollector/hitcollector_test.cpp index e6e38181412..60daa571f1d 100644 --- a/searchlib/src/tests/hitcollector/hitcollector_test.cpp +++ b/searchlib/src/tests/hitcollector/hitcollector_test.cpp @@ -1,9 +1,9 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include <vespa/vespalib/testkit/testapp.h> #include <vespa/searchlib/common/bitvector.h> #include <vespa/searchlib/fef/fef.h> #include <vespa/searchlib/queryeval/hitcollector.h> +#include <vespa/vespalib/gtest/gtest.h> #include <vespa/log/log.h> LOG_SETUP("hitcollector_test"); @@ -13,6 +13,8 @@ using namespace search::fef; using namespace search::queryeval; using ScoreMap = std::map<uint32_t, feature_t>; +using DocidVector = std::vector<uint32_t>; +using RankedHitVector = std::vector<RankedHit>; using Ranges = std::pair<Scores, Scores>; @@ -67,11 +69,11 @@ void checkResult(const ResultSet & rs, const std::vector<RankedHit> & exp) if ( ! exp.empty()) { const RankedHit * rh = rs.getArray(); ASSERT_TRUE(rh != nullptr); - ASSERT_EQUAL(rs.getArrayUsed(), exp.size()); + ASSERT_EQ(rs.getArrayUsed(), exp.size()); for (uint32_t i = 0; i < exp.size(); ++i) { - EXPECT_EQUAL(rh[i].getDocId(), exp[i].getDocId()); - EXPECT_EQUAL(rh[i].getRank() + 1.0, exp[i].getRank() + 1.0); + EXPECT_EQ(rh[i].getDocId(), exp[i].getDocId()); + EXPECT_DOUBLE_EQ(rh[i].getRank() + 64.0, exp[i].getRank() + 64.0); } } else { ASSERT_TRUE(rs.getArray() == nullptr); @@ -93,21 +95,24 @@ void checkResult(ResultSet & rs, BitVector * exp) } } -void testAddHit(uint32_t numDocs, uint32_t maxHitsSize) +void testAddHit(uint32_t numDocs, uint32_t maxHitsSize, const vespalib::string& label) { + SCOPED_TRACE(label); LOG(info, "testAddHit: no hits"); - { // no hits + { + SCOPED_TRACE("no hits"); HitCollector hc(numDocs, maxHitsSize); std::vector<RankedHit> expRh; std::unique_ptr<ResultSet> rs = hc.getResultSet(); - TEST_DO(checkResult(*rs, expRh)); - TEST_DO(checkResult(*rs, nullptr)); + checkResult(*rs, expRh); + checkResult(*rs, nullptr); } LOG(info, "testAddHit: only ranked hits"); - { // only ranked hits + { + SCOPED_TRACE("only ranked hits"); HitCollector hc(numDocs, maxHitsSize); std::vector<RankedHit> expRh; @@ -121,12 +126,13 @@ void testAddHit(uint32_t numDocs, uint32_t maxHitsSize) } std::unique_ptr<ResultSet> rs = hc.getResultSet(); - TEST_DO(checkResult(*rs, expRh)); - TEST_DO(checkResult(*rs, nullptr)); + checkResult(*rs, expRh); + checkResult(*rs, nullptr); } LOG(info, "testAddHit: both ranked hits and bit vector hits"); - { // both ranked hits and bit vector hits + { + SCOPED_TRACE("both ranked hits and bitvector hits"); HitCollector hc(numDocs, maxHitsSize); std::vector<RankedHit> expRh; BitVector::UP expBv(BitVector::create(numDocs)); @@ -144,14 +150,15 @@ void testAddHit(uint32_t numDocs, uint32_t maxHitsSize) } std::unique_ptr<ResultSet> rs = hc.getResultSet(); - TEST_DO(checkResult(*rs, expRh)); - TEST_DO(checkResult(*rs, expBv.get())); + checkResult(*rs, expRh); + checkResult(*rs, expBv.get()); } } -TEST("testAddHit") { - TEST_DO(testAddHit(30, 10)); - TEST_DO(testAddHit(400, 10)); // 400/32 = 12 which is bigger than 10. +TEST(HitCollectorTest, testAddHit) +{ + testAddHit(30, 10, "numDocs==30"); + testAddHit(400, 10, "numDocs==400"); // 400/32 = 12 which is bigger than 10. } struct Fixture { @@ -197,14 +204,17 @@ struct DescendingScoreFixture : Fixture { DescendingScoreFixture::~DescendingScoreFixture() = default; -TEST_F("testReRank - empty", Fixture) { - EXPECT_EQUAL(0u, f.reRank()); +TEST(HitCollectorTest, rerank_empty) +{ + Fixture f; + EXPECT_EQ(0u, f.reRank()); } -TEST_F("testReRank - ascending", AscendingScoreFixture) +TEST(HitCollectorTest, rerank_ascending) { + AscendingScoreFixture f; f.addHits(); - EXPECT_EQUAL(5u, f.reRank()); + EXPECT_EQ(5u, f.reRank()); std::vector<RankedHit> expRh; for (uint32_t i = 10; i < 20; ++i) { // 10 last are the best @@ -213,17 +223,18 @@ TEST_F("testReRank - ascending", AscendingScoreFixture) expRh.back()._rankValue = i + 200; // after reranking } } - EXPECT_EQUAL(expRh.size(), 10u); + EXPECT_EQ(expRh.size(), 10u); std::unique_ptr<ResultSet> rs = f.hc.getResultSet(); - TEST_DO(checkResult(*rs, expRh)); - TEST_DO(checkResult(*rs, f.expBv.get())); + checkResult(*rs, expRh); + checkResult(*rs, f.expBv.get()); } -TEST_F("testReRank - descending", DescendingScoreFixture) +TEST(HitCollectorTest, rerank_descending) { + DescendingScoreFixture f; f.addHits(); - EXPECT_EQUAL(5u, f.reRank()); + EXPECT_EQ(5u, f.reRank()); std::vector<RankedHit> expRh; for (uint32_t i = 0; i < 10; ++i) { // 10 first are the best @@ -232,17 +243,18 @@ TEST_F("testReRank - descending", DescendingScoreFixture) expRh.back()._rankValue = i + 200; // after reranking } } - EXPECT_EQUAL(expRh.size(), 10u); + EXPECT_EQ(expRh.size(), 10u); std::unique_ptr<ResultSet> rs = f.hc.getResultSet(); - TEST_DO(checkResult(*rs, expRh)); - TEST_DO(checkResult(*rs, f.expBv.get())); + checkResult(*rs, expRh); + checkResult(*rs, f.expBv.get()); } -TEST_F("testReRank - partial", AscendingScoreFixture) +TEST(HitCollectorTest, rerank_partial) { + AscendingScoreFixture f; f.addHits(); - EXPECT_EQUAL(3u, f.reRank(3)); + EXPECT_EQ(3u, f.reRank(3)); std::vector<RankedHit> expRh; for (uint32_t i = 10; i < 20; ++i) { // 10 last are the best @@ -251,36 +263,39 @@ TEST_F("testReRank - partial", AscendingScoreFixture) expRh.back()._rankValue = i + 200; // after reranking } } - EXPECT_EQUAL(expRh.size(), 10u); + EXPECT_EQ(expRh.size(), 10u); std::unique_ptr<ResultSet> rs = f.hc.getResultSet(); - TEST_DO(checkResult(*rs, expRh)); - TEST_DO(checkResult(*rs, f.expBv.get())); + checkResult(*rs, expRh); + checkResult(*rs, f.expBv.get()); } -TEST_F("require that hits for 2nd phase candidates can be retrieved", DescendingScoreFixture) +TEST(HitCollectorTest, require_that_hits_for_2nd_phase_candidates_can_be_retrieved) { + DescendingScoreFixture f; f.addHits(); std::vector<HitCollector::Hit> scores = extract(f.hc.getSortedHitSequence(5)); - ASSERT_EQUAL(5u, scores.size()); - EXPECT_EQUAL(100, scores[0].second); - EXPECT_EQUAL(99, scores[1].second); - EXPECT_EQUAL(98, scores[2].second); - EXPECT_EQUAL(97, scores[3].second); - EXPECT_EQUAL(96, scores[4].second); + ASSERT_EQ(5u, scores.size()); + EXPECT_EQ(100, scores[0].second); + EXPECT_EQ(99, scores[1].second); + EXPECT_EQ(98, scores[2].second); + EXPECT_EQ(97, scores[3].second); + EXPECT_EQ(96, scores[4].second); } -TEST("require that score ranges can be read and set.") { +TEST(HitCollectorTest, require_that_score_ranges_can_be_read_and_set) +{ std::pair<Scores, Scores> ranges = std::make_pair(Scores(1.0, 2.0), Scores(3.0, 4.0)); HitCollector hc(20, 10); hc.setRanges(ranges); - EXPECT_EQUAL(ranges.first.low, hc.getRanges().first.low); - EXPECT_EQUAL(ranges.first.high, hc.getRanges().first.high); - EXPECT_EQUAL(ranges.second.low, hc.getRanges().second.low); - EXPECT_EQUAL(ranges.second.high, hc.getRanges().second.high); + EXPECT_EQ(ranges.first.low, hc.getRanges().first.low); + EXPECT_EQ(ranges.first.high, hc.getRanges().first.high); + EXPECT_EQ(ranges.second.low, hc.getRanges().second.low); + EXPECT_EQ(ranges.second.high, hc.getRanges().second.high); } -TEST("testNoHitsToReRank") { +TEST(HitCollectorTest, no_hits_to_rerank) +{ uint32_t numDocs = 20; uint32_t maxHitsSize = 10; @@ -299,8 +314,8 @@ TEST("testNoHitsToReRank") { } std::unique_ptr<ResultSet> rs = hc.getResultSet(); - TEST_DO(checkResult(*rs, expRh)); - TEST_DO(checkResult(*rs, nullptr)); + checkResult(*rs, expRh); + checkResult(*rs, nullptr); } } @@ -317,14 +332,15 @@ void testScaling(const std::vector<feature_t> &initScores, PredefinedScorer scorer(std::move(finalScores)); // perform second phase ranking - EXPECT_EQUAL(2u, do_reRank(scorer, hc, 2)); + EXPECT_EQ(2u, do_reRank(scorer, hc, 2)); // check results std::unique_ptr<ResultSet> rs = hc.getResultSet(); - TEST_DO(checkResult(*rs, expected)); + checkResult(*rs, expected); } -TEST("testScaling") { +TEST(HitCollectorTest, scaling) +{ std::vector<feature_t> initScores(5); initScores[0] = 1000; initScores[1] = 2000; @@ -338,7 +354,8 @@ TEST("testScaling") { exp[i]._docId = i; } - { // scale down and adjust down + { + SCOPED_TRACE("scale down and adjust down"); exp[0]._rankValue = 0; // scaled exp[1]._rankValue = 100; // scaled exp[2]._rankValue = 200; // scaled @@ -350,9 +367,10 @@ TEST("testScaling") { finalScores[3] = 300; finalScores[4] = 400; - TEST_DO(testScaling(initScores, std::move(finalScores), exp)); + testScaling(initScores, std::move(finalScores), exp); } - { // scale down and adjust up + { + SCOPED_TRACE("scale down and adjust up"); exp[0]._rankValue = 200; // scaled exp[1]._rankValue = 300; // scaled exp[2]._rankValue = 400; // scaled @@ -364,10 +382,10 @@ TEST("testScaling") { finalScores[3] = 500; finalScores[4] = 600; - TEST_DO(testScaling(initScores, std::move(finalScores), exp)); + testScaling(initScores, std::move(finalScores), exp); } - { // scale up and adjust down - + { + SCOPED_TRACE("scale up and adjust down"); exp[0]._rankValue = -500; // scaled (-500) exp[1]._rankValue = 750; // scaled exp[2]._rankValue = 2000; // scaled @@ -379,9 +397,10 @@ TEST("testScaling") { finalScores[3] = 3250; finalScores[4] = 4500; - TEST_DO(testScaling(initScores, std::move(finalScores), exp)); + testScaling(initScores, std::move(finalScores), exp); } - { // minimal scale (second phase range = 0 (4 - 4) -> 1) + { + SCOPED_TRACE("minimal scale (second phase range = 0 (4 - 4) -> 1)"); exp[0]._rankValue = 1; // scaled exp[1]._rankValue = 2; // scaled exp[2]._rankValue = 3; // scaled @@ -393,9 +412,10 @@ TEST("testScaling") { finalScores[3] = 4; finalScores[4] = 4; - TEST_DO(testScaling(initScores, std::move(finalScores), exp)); + testScaling(initScores, std::move(finalScores), exp); } - { // minimal scale (first phase range = 0 (4000 - 4000) -> 1) + { + SCOPED_TRACE("minimal scale (first phase range = 0 (4000 - 4000) -> 1)"); std::vector<feature_t> is(initScores); is[4] = 4000; exp[0]._rankValue = -299600; // scaled @@ -409,11 +429,12 @@ TEST("testScaling") { finalScores[3] = 400; finalScores[4] = 500; - TEST_DO(testScaling(is, std::move(finalScores), exp)); + testScaling(is, std::move(finalScores), exp); } } -TEST("testOnlyBitVector") { +TEST(HitCollectorTest, only_bitvector) +{ uint32_t numDocs = 20; LOG(info, "testOnlyBitVector: test it"); { @@ -428,8 +449,8 @@ TEST("testOnlyBitVector") { std::unique_ptr<ResultSet> rs = hc.getResultSet(); std::vector<RankedHit> expRh; - TEST_DO(checkResult(*rs, expRh)); // no ranked hits - TEST_DO(checkResult(*rs, expBv.get())); // only bit vector + checkResult(*rs, expRh); // no ranked hits + checkResult(*rs, expBv.get()); // only bit vector } } @@ -443,9 +464,9 @@ struct MergeResultSetFixture { {} }; -TEST_F("require that result set is merged correctly with first phase ranking", - MergeResultSetFixture) +TEST(HitCollectorTest, require_that_result_set_is_merged_correctly_with_first_phase_ranking) { + MergeResultSetFixture f; std::vector<RankedHit> expRh; for (uint32_t i = 0; i < f.numDocs; ++i) { f.hc.addHit(i, i + 1000); @@ -457,7 +478,7 @@ TEST_F("require that result set is merged correctly with first phase ranking", expRh.back()._rankValue = (i < f.numDocs - f.maxHitsSize) ? default_rank_value : i + 1000; } std::unique_ptr<ResultSet> rs = f.hc.getResultSet(); - TEST_DO(checkResult(*rs, expRh)); + checkResult(*rs, expRh); } void @@ -474,9 +495,9 @@ addExpectedHitForMergeTest(const MergeResultSetFixture &f, std::vector<RankedHit } } -TEST_F("require that result set is merged correctly with second phase ranking (document scorer)", - MergeResultSetFixture) +TEST(HitCollectorTest, require_that_result_set_is_merged_correctly_with_second_phase_ranking_using_document_scorer) { + MergeResultSetFixture f; // with second phase ranking that triggers rescoring / scaling BasicScorer scorer(500); // second phase ranking setting score to docId + 500 std::vector<RankedHit> expRh; @@ -484,12 +505,13 @@ TEST_F("require that result set is merged correctly with second phase ranking (d f.hc.addHit(i, i + 1000); addExpectedHitForMergeTest(f, expRh, i); } - EXPECT_EQUAL(f.maxHeapSize, do_reRank(scorer, f.hc, f.maxHeapSize)); + EXPECT_EQ(f.maxHeapSize, do_reRank(scorer, f.hc, f.maxHeapSize)); std::unique_ptr<ResultSet> rs = f.hc.getResultSet(); - TEST_DO(checkResult(*rs, expRh)); + checkResult(*rs, expRh); } -TEST("require that hits can be added out of order") { +TEST(HitCollectorTest, require_that_hits_can_be_added_out_of_order) +{ HitCollector hc(1000, 100); std::vector<RankedHit> expRh; // produce expected result in normal order @@ -503,11 +525,12 @@ TEST("require that hits can be added out of order") { hc.addHit(i, i + 100); } std::unique_ptr<ResultSet> rs = hc.getResultSet(); - TEST_DO(checkResult(*rs, expRh)); - TEST_DO(checkResult(*rs, nullptr)); + checkResult(*rs, expRh); + checkResult(*rs, nullptr); } -TEST("require that hits can be added out of order when passing array limit") { +TEST(HitCollectorTest, require_that_hits_can_be_added_out_of_order_when_passing_array_limit) +{ HitCollector hc(10000, 100); std::vector<RankedHit> expRh; // produce expected result in normal order @@ -525,11 +548,12 @@ TEST("require that hits can be added out of order when passing array limit") { hc.addHit(i, i + 100); } std::unique_ptr<ResultSet> rs = hc.getResultSet(); - TEST_DO(checkResult(*rs, expRh)); - TEST_DO(checkResult(*rs, nullptr)); + checkResult(*rs, expRh); + checkResult(*rs, nullptr); } -TEST("require that hits can be added out of order only after passing array limit") { +TEST(HitCollectorTest, require_that_hits_can_be_added_out_of_order_only_after_passing_array_limit) +{ HitCollector hc(10000, 100); std::vector<RankedHit> expRh; // produce expected result in normal order @@ -548,8 +572,87 @@ TEST("require that hits can be added out of order only after passing array limit hc.addHit(i, i + 100); } std::unique_ptr<ResultSet> rs = hc.getResultSet(); - TEST_DO(checkResult(*rs, expRh)); - TEST_DO(checkResult(*rs, nullptr)); + checkResult(*rs, expRh); + checkResult(*rs, nullptr); +} + +struct RankDropFixture { + uint32_t _docid_limit; + HitCollector _hc; + std::vector<uint32_t> _dropped; + RankDropFixture(uint32_t docid_limit, uint32_t max_hits_size) + : _docid_limit(docid_limit), + _hc(docid_limit, max_hits_size) + { + } + void add(std::vector<RankedHit> hits) { + for (const auto& hit : hits) { + _hc.addHit(hit.getDocId(), hit.getRank()); + } + } + void rerank(ScoreMap score_map, size_t count) { + PredefinedScorer scorer(score_map); + EXPECT_EQ(count, do_reRank(scorer, _hc, count)); + } + std::unique_ptr<BitVector> make_bv(DocidVector docids) { + auto bv = BitVector::create(_docid_limit); + for (auto& docid : docids) { + bv->setBit(docid); + } + return bv; + } + + void setup() { + // Initial 7 hits from first phase + add({{5, 1100},{10, 1200},{11, 1300},{12, 1400},{14, 500},{15, 900},{16,1000}}); + // Rerank two best hits, calculate old and new ranges for reranked + // hits that will cause hits not reranked to later be rescored by + // dividing by 100. + rerank({{11,14},{12,13}}, 2); + } + void check_result(std::optional<double> rank_drop_limit, RankedHitVector exp_array, + std::unique_ptr<BitVector> exp_bv, DocidVector exp_dropped) { + auto rs = _hc.get_result_set(rank_drop_limit, &_dropped); + checkResult(*rs, exp_array); + checkResult(*rs, exp_bv.get()); + EXPECT_EQ(exp_dropped, _dropped); + } +}; + +TEST(HitCollectorTest, require_that_second_phase_rank_drop_limit_is_enforced) +{ + // Track rank score for all 7 hits from first phase + RankDropFixture f(10000, 10); + f.setup(); + f.check_result(9.0, {{5,11},{10,12},{11,14},{12,13},{16,10}}, + {}, {14, 15}); +} + +TEST(HitCollectorTest, require_that_second_phase_rank_drop_limit_is_enforced_when_docid_vector_is_used) +{ + // Track rank score for 4 best hits from first phase, overflow to docid vector + RankDropFixture f(10000, 4); + f.setup(); + f.check_result(13.0, {{11,14}}, + {}, {5,10,12,14,15,16}); +} + +TEST(HitCollectorTest, require_that_bitvector_is_not_dropped_without_second_phase_rank_drop_limit) +{ + // Track rank score for 4 best hits from first phase, overflow to bitvector + RankDropFixture f(20, 4); + f.setup(); + f.check_result(std::nullopt, {{5,11},{10,12},{11,14},{12,13}}, + f.make_bv({5,10,11,12,14,15,16}), {}); +} + +TEST(HitCollectorTest, require_that_bitvector_is_dropped_with_second_phase_rank_drop_limit) +{ + // Track rank for 4 best hits from first phase, overflow to bitvector + RankDropFixture f(20, 4); + f.setup(); + f.check_result(9.0, {{5,11},{10,12},{11,14},{12,13}}, + {}, {14,15,16}); } -TEST_MAIN() { TEST_RUN_ALL(); } +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/tests/hitcollector/sorted_hit_sequence_test.cpp b/searchlib/src/tests/hitcollector/sorted_hit_sequence_test.cpp index c1c3a550d9b..4eefa5b5dfa 100644 --- a/searchlib/src/tests/hitcollector/sorted_hit_sequence_test.cpp +++ b/searchlib/src/tests/hitcollector/sorted_hit_sequence_test.cpp @@ -1,7 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include <vespa/vespalib/testkit/test_kit.h> #include <vespa/searchlib/queryeval/sorted_hit_sequence.h> +#include <vespa/vespalib/gtest/gtest.h> using search::queryeval::SortedHitSequence; using Hits = std::vector<SortedHitSequence::Hit>; @@ -10,20 +10,22 @@ using Refs = std::vector<SortedHitSequence::Ref>; Hits hits({{1,10.0},{2,30.0},{3,20.0}}); Refs refs({1,2,0}); -TEST("require that empty hit sequence is empty") { +TEST(SortedHitsSEquenceTest, require_that_empty_hit_sequence_is_empty) +{ EXPECT_TRUE(!SortedHitSequence(nullptr, nullptr, 0).valid()); EXPECT_TRUE(!SortedHitSequence(&hits[0], &refs[0], 0).valid()); } -TEST("require that sorted hit sequence can be iterated") { +TEST(SortedHitsSEquenceTest, require_that_sorted_hit_sequence_can_be_iterated) +{ SortedHitSequence seq(&hits[0], &refs[0], refs.size()); for (const auto &expect: Hits({{2,30.0},{3,20.0},{1,10.0}})) { ASSERT_TRUE(seq.valid()); - EXPECT_EQUAL(expect.first, seq.get().first); - EXPECT_EQUAL(expect.second, seq.get().second); + EXPECT_EQ(expect.first, seq.get().first); + EXPECT_EQ(expect.second, seq.get().second); seq.next(); } EXPECT_TRUE(!seq.valid()); } -TEST_MAIN() { TEST_RUN_ALL(); } +GTEST_MAIN_RUN_ALL_TESTS() diff --git a/searchlib/src/tests/queryeval/blueprint/blueprint_test.cpp b/searchlib/src/tests/queryeval/blueprint/blueprint_test.cpp index 485410e0eba..b0234010f77 100644 --- a/searchlib/src/tests/queryeval/blueprint/blueprint_test.cpp +++ b/searchlib/src/tests/queryeval/blueprint/blueprint_test.cpp @@ -13,7 +13,7 @@ LOG_SETUP("blueprint_test"); using namespace search::queryeval; -using namespace search::fef; +using MatchData = search::fef::MatchData; namespace { @@ -44,9 +44,7 @@ public: } SearchIterator::UP - createIntermediateSearch(MultiSearch::Children subSearches, - MatchData &md) const override - { + createIntermediateSearch(MultiSearch::Children subSearches, MatchData &md) const override { return std::make_unique<MySearch>("or", std::move(subSearches), &md, strict()); } SearchIteratorUP createFilterSearch(FilterConstraint constraint) const override { @@ -63,9 +61,7 @@ class OtherOr : public OrBlueprint private: public: SearchIterator::UP - createIntermediateSearch(MultiSearch::Children subSearches, - MatchData &md) const override - { + createIntermediateSearch(MultiSearch::Children subSearches, MatchData &md) const override { return std::make_unique<MySearch>("or", std::move(subSearches), &md, strict()); } @@ -89,9 +85,7 @@ public: } SearchIterator::UP - createIntermediateSearch(MultiSearch::Children subSearches, - MatchData &md) const override - { + createIntermediateSearch(MultiSearch::Children subSearches, MatchData &md) const override { return std::make_unique<MySearch>("and", std::move(subSearches), &md, strict()); } @@ -106,9 +100,7 @@ class OtherAnd : public AndBlueprint private: public: SearchIterator::UP - createIntermediateSearch(MultiSearch::Children subSearches, - MatchData &md) const override - { + createIntermediateSearch(MultiSearch::Children subSearches, MatchData &md) const override { return std::make_unique<MySearch>("and", std::move(subSearches), &md, strict()); } @@ -121,9 +113,7 @@ class OtherAndNot : public AndNotBlueprint { public: SearchIterator::UP - createIntermediateSearch(MultiSearch::Children subSearches, - MatchData &md) const override - { + createIntermediateSearch(MultiSearch::Children subSearches, MatchData &md) const override { return std::make_unique<MySearch>("andnot", std::move(subSearches), &md, strict()); } diff --git a/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp b/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp index bddc9f92111..490f221d1d8 100644 --- a/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp +++ b/searchlib/src/tests/queryeval/blueprint/intermediate_blueprints_test.cpp @@ -27,8 +27,9 @@ LOG_SETUP("blueprint_test"); using namespace search::queryeval; -using namespace search::fef; using namespace search::query; +using search::fef::MatchData; +using search::queryeval::Blueprint; using search::BitVector; using BlueprintVector = std::vector<std::unique_ptr<Blueprint>>; using vespalib::Slime; @@ -575,7 +576,9 @@ void compare(const Blueprint &bp1, const Blueprint &bp2, bool expect_eq) { bp1.asSlime(SlimeInserter(a)); bp2.asSlime(SlimeInserter(b)); if (expect_eq) { - EXPECT_TRUE(vespalib::slime::are_equal(a.get(), b.get(), cmp_hook)); + if(!EXPECT_TRUE(vespalib::slime::are_equal(a.get(), b.get(), cmp_hook))) { + fprintf(stderr, "a: %s\n\nb: %s\n\n", bp1.asString().c_str(), bp2.asString().c_str()); + } } else { EXPECT_FALSE(vespalib::slime::are_equal(a.get(), b.get(), cmp_hook)); } @@ -613,7 +616,6 @@ TEST_F("test SourceBlender below AND partial optimization", SourceBlenderTestFix auto expect = std::make_unique<AndBlueprint>(); addLeafs(*expect, {1,2,3}); - expect->addChild(addLeafsWithSourceId(std::make_unique<SourceBlenderBlueprint>(f.selector_2), {{10, 1}, {20, 2}})); auto blender = std::make_unique<SourceBlenderBlueprint>(f.selector_1); blender->addChild(addLeafsWithSourceId(3, std::make_unique<AndBlueprint>(), {{30, 3}, {300, 3}})); @@ -621,6 +623,8 @@ TEST_F("test SourceBlender below AND partial optimization", SourceBlenderTestFix blender->addChild(addLeafsWithSourceId(1, std::make_unique<AndBlueprint>(), {{10, 1}, {100, 1}, {1000, 1}})); expect->addChild(std::move(blender)); + expect->addChild(addLeafsWithSourceId(std::make_unique<SourceBlenderBlueprint>(f.selector_2), {{10, 1}, {20, 2}})); + optimize_and_compare(std::move(top), std::move(expect)); } @@ -1401,7 +1405,7 @@ TEST("cost for ANDNOT") { TEST("cost for SB") { InvalidSelector sel; - verify_cost(make::SB(sel), 1.3, 1.3); // max + verify_cost(make::SB(sel), 1.3+1.0, 1.3+(1.0-0.8*0.7*0.5)); // max, non_strict+1.0, strict+est } TEST("cost for NEAR") { diff --git a/searchlib/src/tests/queryeval/filter_search/filter_search_test.cpp b/searchlib/src/tests/queryeval/filter_search/filter_search_test.cpp index 16e78f77eec..9fdf1417a92 100644 --- a/searchlib/src/tests/queryeval/filter_search/filter_search_test.cpp +++ b/searchlib/src/tests/queryeval/filter_search/filter_search_test.cpp @@ -356,7 +356,7 @@ DotProductAdapter::~DotProductAdapter() = default; struct ParallelWeakAndAdapter { FieldSpec field; ParallelWeakAndBlueprint blueprint; - ParallelWeakAndAdapter() : field("foo", 3, 7), blueprint(field, 100, 0.0, 1.0) {} + ParallelWeakAndAdapter() : field("foo", 3, 7), blueprint(field, 100, 0.0, 1.0, true) {} void addChild(std::unique_ptr<Blueprint> child) { auto child_field = blueprint.getNextChildField(field); auto term = std::make_unique<LeafProxy>(child_field, std::move(child)); diff --git a/searchlib/src/tests/queryeval/flow/queryeval_flow_test.cpp b/searchlib/src/tests/queryeval/flow/queryeval_flow_test.cpp index 57fddb0a819..d6008136d73 100644 --- a/searchlib/src/tests/queryeval/flow/queryeval_flow_test.cpp +++ b/searchlib/src/tests/queryeval/flow/queryeval_flow_test.cpp @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include <vespa/searchlib/queryeval/flow.h> +#include <vespa/searchlib/queryeval/flow_tuning.h> #include <vespa/vespalib/gtest/gtest.h> #include <vector> #include <random> @@ -349,6 +350,35 @@ TEST(FlowTest, blender_flow_cost_accumulation_is_max) { } } +double my_non_strict_cost(double est, double adjust) { + return (1.0/adjust) * flow::forced_strict_cost(FlowStats(est, 0.0, est), adjust); +} + +TEST(FlowTest, non_strict_btree_cost) { + for (double est: {0.001, 0.01, 0.1, 0.2, 0.3, 0.5, 0.75, 1.0}) { + auto prev = FlowStats(est, 1.0, est); + auto base = FlowStats(est, flow::non_strict_cost_of_strict_iterator(est, est), est); + auto opt05 = FlowStats(est, my_non_strict_cost(est, 0.5), est); + auto opt02 = FlowStats(est, my_non_strict_cost(est, 0.2), est); + auto opt01 = FlowStats(est, my_non_strict_cost(est, 0.1), est); + auto opt005 = FlowStats(est, my_non_strict_cost(est, 0.05), est); + auto opt003 = FlowStats(est, my_non_strict_cost(est, 0.03), est); + EXPECT_NEAR(strict_crossover(opt05), 0.5, 1e-6); + EXPECT_NEAR(strict_crossover(opt02), 0.2, 1e-6); + EXPECT_NEAR(strict_crossover(opt01), 0.1, 1e-6); + EXPECT_NEAR(strict_crossover(opt005), 0.05, 1e-6); + EXPECT_NEAR(strict_crossover(opt003), 0.03, 1e-6); + fprintf(stderr, "est: %5.3f\n", est); + fprintf(stderr, " prev crossover: %6.4f (cost: %6.4f)\n", strict_crossover(prev), prev.cost); + fprintf(stderr, " base crossover: %6.4f (cost: %6.4f)\n", strict_crossover(base), base.cost); + fprintf(stderr, " 0.5 crossover: %6.4f (cost: %6.4f)\n", strict_crossover(opt05), opt05.cost); + fprintf(stderr, " 0.2 crossover: %6.4f (cost: %6.4f)\n", strict_crossover(opt02), opt02.cost); + fprintf(stderr, " 0.1 crossover: %6.4f (cost: %6.4f)\n", strict_crossover(opt01), opt01.cost); + fprintf(stderr, " 0.05 crossover: %6.4f (cost: %6.4f)\n", strict_crossover(opt005), opt005.cost); + fprintf(stderr, " 0.03 crossover: %6.4f (cost: %6.4f)\n", strict_crossover(opt003), opt003.cost); + } +} + TEST(FlowTest, optimal_and_flow) { for (size_t i = 0; i < loop_cnt; ++i) { for (bool strict: {false, true}) { diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/intermediate_blueprint_factory.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/intermediate_blueprint_factory.cpp index 8591ec1415d..51177850155 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/intermediate_blueprint_factory.cpp +++ b/searchlib/src/tests/queryeval/iterator_benchmark/intermediate_blueprint_factory.cpp @@ -2,14 +2,14 @@ #include "intermediate_blueprint_factory.h" #include <vespa/searchlib/queryeval/intermediate_blueprints.h> +#include <vespa/searchlib/attribute/singlenumericattribute.h> #include <iomanip> #include <sstream> namespace search::queryeval::test { -template <typename BlueprintType> char -IntermediateBlueprintFactory<BlueprintType>::child_name(void* blueprint) const +IntermediateBlueprintFactory::child_name(void* blueprint) const { auto itr = _child_names.find(blueprint); if (itr != _child_names.end()) { @@ -18,35 +18,33 @@ IntermediateBlueprintFactory<BlueprintType>::child_name(void* blueprint) const return '?'; } -template <typename BlueprintType> -IntermediateBlueprintFactory<BlueprintType>::IntermediateBlueprintFactory(vespalib::stringref name) +IntermediateBlueprintFactory::IntermediateBlueprintFactory(vespalib::stringref name) : _name(name), _children(), _child_names() { } -template <typename BlueprintType> -IntermediateBlueprintFactory<BlueprintType>::~IntermediateBlueprintFactory() = default; +IntermediateBlueprintFactory::~IntermediateBlueprintFactory() = default; -template <typename BlueprintType> std::unique_ptr<Blueprint> -IntermediateBlueprintFactory<BlueprintType>::make_blueprint() +IntermediateBlueprintFactory::make_blueprint() { - auto res = std::make_unique<BlueprintType>(); + auto res = make_self(); _child_names.clear(); char name = 'A'; + uint32_t source = 1; for (const auto& factory : _children) { auto child = factory->make_blueprint(); _child_names[child.get()] = name++; + child->setSourceId(source++); // ignored by non-source-blender blueprints res->addChild(std::move(child)); } return res; } -template <typename BlueprintType> vespalib::string -IntermediateBlueprintFactory<BlueprintType>::get_name(Blueprint& blueprint) const +IntermediateBlueprintFactory::get_name(Blueprint& blueprint) const { auto* intermediate = blueprint.asIntermediate(); if (intermediate != nullptr) { @@ -69,11 +67,29 @@ IntermediateBlueprintFactory<BlueprintType>::get_name(Blueprint& blueprint) cons return get_class_name(blueprint); } -template class IntermediateBlueprintFactory<AndBlueprint>; +//----------------------------------------------------------------------------- AndBlueprintFactory::AndBlueprintFactory() - : IntermediateBlueprintFactory<AndBlueprint>("AND") + : IntermediateBlueprintFactory("AND") {} +std::unique_ptr<IntermediateBlueprint> +AndBlueprintFactory::make_self() const +{ + return std::make_unique<AndBlueprint>(); +} + +//----------------------------------------------------------------------------- + +SourceBlenderBlueprintFactory::SourceBlenderBlueprintFactory() + : IntermediateBlueprintFactory("SB"), + _selector(250, "my_source_blender", 1000) +{} + +std::unique_ptr<IntermediateBlueprint> +SourceBlenderBlueprintFactory::make_self() const +{ + return std::make_unique<SourceBlenderBlueprint>(_selector); } +} diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/intermediate_blueprint_factory.h b/searchlib/src/tests/queryeval/iterator_benchmark/intermediate_blueprint_factory.h index 6f7fe4f9ee7..c791d866612 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/intermediate_blueprint_factory.h +++ b/searchlib/src/tests/queryeval/iterator_benchmark/intermediate_blueprint_factory.h @@ -4,6 +4,7 @@ #include "benchmark_blueprint_factory.h" #include <vespa/searchlib/queryeval/intermediate_blueprints.h> +#include <vespa/searchlib/attribute/fixedsourceselector.h> #include <unordered_map> namespace search::queryeval::test { @@ -11,7 +12,6 @@ namespace search::queryeval::test { /** * Factory that creates an IntermediateBlueprint (of the given type) with children created by the given factories. */ -template <typename BlueprintType> class IntermediateBlueprintFactory : public BenchmarkBlueprintFactory { private: vespalib::string _name; @@ -19,7 +19,8 @@ private: std::unordered_map<void*, char> _child_names; char child_name(void* blueprint) const; - +protected: + virtual std::unique_ptr<IntermediateBlueprint> make_self() const = 0; public: IntermediateBlueprintFactory(vespalib::stringref name); ~IntermediateBlueprintFactory(); @@ -30,10 +31,26 @@ public: vespalib::string get_name(Blueprint& blueprint) const override; }; -class AndBlueprintFactory : public IntermediateBlueprintFactory<AndBlueprint> { +class AndBlueprintFactory : public IntermediateBlueprintFactory { +protected: + std::unique_ptr<IntermediateBlueprint> make_self() const override; public: AndBlueprintFactory(); }; -} +class SourceBlenderBlueprintFactory : public IntermediateBlueprintFactory +{ +private: + FixedSourceSelector _selector; +protected: + std::unique_ptr<IntermediateBlueprint> make_self() const override; +public: + SourceBlenderBlueprintFactory(); + void init_selector(auto f, uint32_t limit) { + for (uint32_t i = 0; i < limit; ++i) { + _selector.setSource(i, f(i)); + } + } +}; +} diff --git a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp index f4a1ade8a66..e74fefac70e 100644 --- a/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp +++ b/searchlib/src/tests/queryeval/iterator_benchmark/iterator_benchmark_test.cpp @@ -13,19 +13,31 @@ #include <vector> using namespace search::attribute; -using namespace search::fef; using namespace search::queryeval::test; using namespace search::queryeval; using namespace search; using namespace vespalib; using search::index::Schema; +using search::fef::MatchData; using vespalib::make_string_short::fmt; const vespalib::string field_name = "myfield"; double budget_sec = 1.0; +double estimate_actual_cost(Blueprint &bp, InFlow in_flow) { + if (in_flow.strict()) { + assert(bp.strict()); + return bp.strict_cost(); + } else if (bp.strict()) { + auto stats = FlowStats::from(flow::DefaultAdapter(), &bp); + return flow::forced_strict_cost(stats, in_flow.rate()); + } else { + return bp.cost() * in_flow.rate(); + } +} + enum class PlanningAlgo { Order, Estimate, @@ -236,7 +248,8 @@ strict_search(BenchmarkBlueprintFactory& factory, uint32_t docid_limit, Planning timer.after(); } FlowStats flow(ctx.blueprint->estimate(), ctx.blueprint->cost(), ctx.blueprint->strict_cost()); - return {timer.min_time() * 1000.0, hits + 1, hits, flow, flow.strict_cost, get_class_name(*ctx.iterator), factory.get_name(*ctx.blueprint)}; + double actual_cost = estimate_actual_cost(*ctx.blueprint, InFlow(true)); + return {timer.min_time() * 1000.0, hits + 1, hits, flow, actual_cost, get_class_name(*ctx.iterator), factory.get_name(*ctx.blueprint)}; } template <bool do_unpack> @@ -269,7 +282,7 @@ non_strict_search(BenchmarkBlueprintFactory& factory, uint32_t docid_limit, doub timer.after(); } FlowStats flow(ctx.blueprint->estimate(), ctx.blueprint->cost(), ctx.blueprint->strict_cost()); - double actual_cost = flow.cost * filter_hit_ratio; + double actual_cost = estimate_actual_cost(*ctx.blueprint, InFlow(filter_hit_ratio)); return {timer.min_time() * 1000.0, seeks, hits, flow, actual_cost, get_class_name(*ctx.iterator), factory.get_name(*ctx.blueprint)}; } @@ -291,10 +304,6 @@ benchmark_search(BenchmarkBlueprintFactory& factory, uint32_t docid_limit, bool } } - - - - //----------------------------------------------------------------------------- double est_forced_strict_cost(double estimate, double strict_cost, double rate) { @@ -317,26 +326,26 @@ struct Sample { } }; -double find_crossover(const char *type, const auto &calculate_at, double delta) { +double find_crossover(const char *type, const char *a, const char *b, const auto &calculate_at, double delta) { double min = delta; double max = 1.0; fprintf(stderr, "looking for %s crossover in the range [%g, %g]...\n", type, min, max); auto at_min = calculate_at(min); auto at_max = calculate_at(max); - fprintf(stderr, " before: [%s, %s], after: [%s, %s]\n", - at_min.first.str().c_str(), at_max.first.str().c_str(), - at_min.second.str().c_str(), at_max.second.str().c_str()); - auto best_before = [](auto values) { return (values.first < values.second); }; - if (best_before(at_min) == best_before(at_max)) { + fprintf(stderr, " %s: [%s, %s], %s: [%s, %s]\n", + a, at_min.first.str().c_str(), at_max.first.str().c_str(), + b, at_min.second.str().c_str(), at_max.second.str().c_str()); + auto a_best = [](auto values) { return (values.first < values.second); }; + if (a_best(at_min) == a_best(at_max)) { fprintf(stderr, " NO %s CROSSOVER FOUND\n", type); return 0.0; } while (max > (min + delta)) { double x = (min + max) / 2.0; auto at_x = calculate_at(x); - fprintf(stderr, " best@%g: %s (%s vs %s)\n", x, best_before(at_x) ? "before" : "after", + fprintf(stderr, " best@%g: %s (%s vs %s)\n", x, a_best(at_x) ? a : b, at_x.first.str().c_str(), at_x.second.str().c_str()); - if (best_before(at_min) == best_before(at_x)) { + if (a_best(at_min) == a_best(at_x)) { min = x; at_min = at_x; } else { @@ -409,11 +418,11 @@ void analyze_crossover(BenchmarkBlueprintFactory &fixed, std::function<std::uniq std::vector<double> results; std::vector<const char *> names; names.push_back("time crossover"); - results.push_back(find_crossover("TIME", combine(estimate_AND_time_ms), delta)); + results.push_back(find_crossover("TIME", "before", "after", combine(estimate_AND_time_ms), delta)); names.push_back("cost crossover"); - results.push_back(find_crossover("COST", combine(calculate_AND_cost), delta)); + results.push_back(find_crossover("COST", "before", "after", combine(calculate_AND_cost), delta)); names.push_back("abs_est crossover"); - results.push_back(find_crossover("ABS_EST", combine(first_abs_est), delta)); + results.push_back(find_crossover("ABS_EST", "before", "after", combine(first_abs_est), delta)); sample_at("COST", combine(calculate_AND_cost), results, names); sample_at("TIME", combine(estimate_AND_time_ms), results, names); } @@ -429,21 +438,37 @@ to_string(bool val) void print_result_header() { - std::cout << "| chn | f_ratio | o_ratio | a_ratio | f.est | f.cost | f.scost | hits | seeks | time_ms | act_cost | ns_per_seek | ms_per_act_cost | iterator | blueprint |" << std::endl; + std::cout << "| in_flow | chn | o_ratio | a_ratio | f.est | f.cost | f.act_cost | f.scost | f.act_scost | hits | seeks | time_ms | act_cost | ns_per_seek | ms_per_act_cost | iterator | blueprint |" << std::endl; +} + +std::ostream &operator<<(std::ostream &dst, InFlow in_flow) { + auto old_w = dst.width(); + auto old_p = dst.precision(); + dst << std::setw(7) << std::setprecision(5); + if (in_flow.strict()) { + dst << " STRICT"; + } else { + dst << in_flow.rate(); + } + dst << std::setw(old_w); + dst << std::setprecision(old_p); + return dst; } void -print_result(const BenchmarkResult& res, uint32_t children, double op_hit_ratio, double filter_hit_ratio, uint32_t num_docs) +print_result(const BenchmarkResult& res, uint32_t children, double op_hit_ratio, InFlow in_flow, uint32_t num_docs) { std::cout << std::fixed << std::setprecision(5) - << "| " << std::setw(5) << children - << " | " << std::setw(7) << filter_hit_ratio + << "| " << in_flow + << " | " << std::setw(5) << children << " | " << std::setw(7) << op_hit_ratio << " | " << std::setw(7) << ((double) res.hits / (double) num_docs) << " | " << std::setw(6) << res.flow.estimate << std::setprecision(4) << " | " << std::setw(9) << res.flow.cost + << " | " << std::setw(10) << (res.flow.cost * in_flow.rate()) << " | " << std::setw(7) << res.flow.strict_cost + << " | " << std::setw(11) << (in_flow.strict() ? res.flow.strict_cost : flow::forced_strict_cost(res.flow, in_flow.rate())) << " | " << std::setw(8) << res.hits << " | " << std::setw(8) << res.seeks << std::setprecision(3) @@ -640,7 +665,7 @@ run_benchmark_case(const BenchmarkCaseSetup& setup) if (filter_hit_ratio * setup.filter_crossover_factor <= op_hit_ratio) { auto res = benchmark_search(*factory, setup.num_docs + 1, setup.bcase.strict_context, setup.bcase.force_strict, setup.bcase.unpack_iterator, filter_hit_ratio, PlanningAlgo::Cost); - print_result(res, children, op_hit_ratio, filter_hit_ratio, setup.num_docs); + print_result(res, children, op_hit_ratio, InFlow(setup.bcase.strict_context, filter_hit_ratio), setup.num_docs); result.add(res); } } @@ -681,23 +706,25 @@ run_benchmarks(const BenchmarkSetup& setup) void print_intermediate_blueprint_result_header(size_t children) { + std::cout << "| in_flow"; // This matches the naming scheme in IntermediateBlueprintFactory. char name = 'A'; for (size_t i = 0; i < children; ++i) { - std::cout << "| " << name++ << ".ratio "; + std::cout << " | " << name++ << ".ratio"; } - std::cout << "| flow.cost | flow.scost | flow.est | ratio | hits | seeks | ms_per_cost | time_ms | algo | blueprint |" << std::endl; + std::cout << " | flow.cost | flow.scost | flow.est | ratio | hits | seeks | ms_per_cost | time_ms | algo | blueprint |" << std::endl; } void -print_intermediate_blueprint_result(const BenchmarkResult& res, const std::vector<double>& children_ratios, PlanningAlgo algo, uint32_t num_docs) +print_intermediate_blueprint_result(const BenchmarkResult& res, const std::vector<double>& children_ratios, PlanningAlgo algo, InFlow in_flow, uint32_t num_docs) { - std::cout << std::fixed << std::setprecision(5); + std::cout << std::fixed << std::setprecision(5) + << "| " << in_flow; for (auto ratio : children_ratios) { - std::cout << "| " << std::setw(7) << ratio << " "; + std::cout << " | " << std::setw(7) << ratio; } std::cout << std::setprecision(5) - << "| " << std::setw(10) << res.flow.cost + << " | " << std::setw(10) << res.flow.cost << " | " << std::setw(10) << res.flow.strict_cost << " | " << std::setw(8) << res.flow.estimate << " | " << std::setw(7) << ((double) res.hits / (double) num_docs) @@ -745,9 +772,8 @@ struct BlueprintFactorySetup { BlueprintFactorySetup::~BlueprintFactorySetup() = default; -template <typename IntermediateBlueprintFactoryType> void -run_intermediate_blueprint_benchmark(const BlueprintFactorySetup& a, const BlueprintFactorySetup& b, size_t num_docs) +run_intermediate_blueprint_benchmark(auto factory_factory, std::vector<InFlow> in_flows, const BlueprintFactorySetup& a, const BlueprintFactorySetup& b, size_t num_docs) { print_intermediate_blueprint_result_header(2); double max_speedup = 0.0; @@ -755,26 +781,28 @@ run_intermediate_blueprint_benchmark(const BlueprintFactorySetup& a, const Bluep for (double b_hit_ratio: b.op_hit_ratios) { auto b_factory = b.make_factory_shared(num_docs, b_hit_ratio); for (double a_hit_ratio : a.op_hit_ratios) { - IntermediateBlueprintFactoryType factory; - factory.add_child(a.make_factory(num_docs, a_hit_ratio)); - factory.add_child(b_factory); + auto factory = factory_factory(); + factory->add_child(a.make_factory(num_docs, a_hit_ratio)); + factory->add_child(b_factory); double time_ms_esti = 0.0; - for (auto algo: {PlanningAlgo::Order, PlanningAlgo::Estimate, PlanningAlgo::Cost, - PlanningAlgo::CostForceStrict}) { - auto res = benchmark_search(factory, num_docs + 1, true, false, false, 1.0, algo); - print_intermediate_blueprint_result(res, {a_hit_ratio, b_hit_ratio}, algo, num_docs); - if (algo == PlanningAlgo::Estimate) { - time_ms_esti = res.time_ms; - } - if (algo == PlanningAlgo::CostForceStrict) { - double speedup = time_ms_esti / res.time_ms; - if (speedup > max_speedup) { - max_speedup = speedup; + for (InFlow in_flow: in_flows) { + for (auto algo: {PlanningAlgo::Order, PlanningAlgo::Estimate, PlanningAlgo::Cost, + PlanningAlgo::CostForceStrict}) { + auto res = benchmark_search(*factory, num_docs + 1, in_flow.strict(), false, false, in_flow.rate(), algo); + print_intermediate_blueprint_result(res, {a_hit_ratio, b_hit_ratio}, algo, in_flow, num_docs); + if (algo == PlanningAlgo::Estimate) { + time_ms_esti = res.time_ms; } - if (speedup < min_speedup) { - min_speedup = speedup; + if (algo == PlanningAlgo::CostForceStrict) { + double speedup = time_ms_esti / res.time_ms; + if (speedup > max_speedup) { + max_speedup = speedup; + } + if (speedup < min_speedup) { + min_speedup = speedup; + } + std::cout << "speedup (esti/forc)=" << std::setprecision(4) << speedup << std::endl; } - std::cout << "speedup (esti/forc)=" << std::setprecision(4) << speedup << std::endl; } } } @@ -786,7 +814,19 @@ void run_and_benchmark(const BlueprintFactorySetup& a, const BlueprintFactorySetup& b, size_t num_docs) { std::cout << "AND[A={" << a.to_string() << "},B={" << b.to_string() << "}]" << std::endl; - run_intermediate_blueprint_benchmark<AndBlueprintFactory>(a, b, num_docs); + run_intermediate_blueprint_benchmark([](){ return std::make_unique<AndBlueprintFactory>(); }, {true}, a, b, num_docs); +} + +void +run_source_blender_benchmark(const BlueprintFactorySetup& a, const BlueprintFactorySetup& b, size_t num_docs) +{ + std::cout << "SB[A={" << a.to_string() << "},B={" << b.to_string() << "}]" << std::endl; + auto factory_factory = [&](){ + auto factory = std::make_unique<SourceBlenderBlueprintFactory>(); + factory->init_selector([](uint32_t i){ return (i%10 == 0) ? 1 : 2; }, num_docs + 1); + return factory; + }; + run_intermediate_blueprint_benchmark(factory_factory, {true, 0.75, 0.5, 0.25, 0.1, 0.01, 0.001}, a, b, num_docs); } //------------------------------------------------------------------------------------- @@ -970,16 +1010,40 @@ TEST(IteratorBenchmark, analyze_AND_bitvector_vs_IN) } } +TEST(IteratorBenchmark, analyze_strict_SOURCEBLENDER_memory_and_disk) +{ + for (double small_ratio: {0.001, 0.005, 0.01, 0.05}) { + run_source_blender_benchmark({str_fs, QueryOperator::Term, {small_ratio}}, + {str_index, QueryOperator::Term, {small_ratio * 10}}, + num_docs); + } +} + TEST(IteratorBenchmark, analyze_OR_non_strict_fs) { for (auto or_hit_ratio : {0.01, 0.1, 0.5}) { BenchmarkSetup setup(num_docs, {int32_fs}, {QueryOperator::Or}, {false}, {or_hit_ratio}, {2, 4, 6, 8, 10, 100, 1000}); + //setup.force_strict = true; setup.filter_hit_ratios = gen_ratios(or_hit_ratio, 10.0, 13); run_benchmarks(setup); } } +TEST(IteratorBenchmark, analyze_OR_non_strict_fs_child_est_adjust) +{ + for (auto or_hit_ratio : {0.01, 0.1, 0.5}) { + for (uint32_t children : {2, 4, 6, 8, 10, 100, 1000}) { + double child_est = or_hit_ratio / children; + BenchmarkSetup setup(num_docs, {int32_fs}, {QueryOperator::Or}, {false}, {or_hit_ratio}, + {children}); + //setup.force_strict = true; + setup.filter_hit_ratios = gen_ratios(child_est, 10.0, 13); + run_benchmarks(setup); + } + } +} + TEST(IteratorBenchmark, analyze_OR_non_strict_non_fs) { BenchmarkSetup setup(num_docs, {int32}, {QueryOperator::Or}, {false}, {0.1}, {2, 4, 6, 8, 10}); @@ -1008,6 +1072,22 @@ TEST(IteratorBenchmark, analyze_btree_vs_bitvector_iterators_strict) run_benchmarks(setup); } +TEST(IteratorBenchmark, btree_vs_array_nonstrict_crossover) { + for (double hit_ratio: { 0.001, 0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, + 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, + 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, + 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1.0}) + { + auto btree = make_blueprint_factory(int32_array_fs, QueryOperator::Term, num_docs, 0, hit_ratio, 1, false); + auto array = make_blueprint_factory( int32_array, QueryOperator::Term, num_docs, 0, hit_ratio, 1, false); + auto time_ms = [&](auto &bpf, double in_flow) { + return Sample(benchmark_search(bpf, num_docs + 1, false, false, false, in_flow, PlanningAlgo::Cost).time_ms); + }; + auto calculate_at = [&](double in_flow) { return std::make_pair(time_ms(*btree, in_flow), time_ms(*array, in_flow)); }; + fprintf(stderr, "btree/array crossover@%5.3f: %8.6f\n", hit_ratio, find_crossover("TIME", "btree", "array", calculate_at, 0.0001)); + } +} + int main(int argc, char **argv) { ::testing::InitGoogleTest(&argc, argv); int res = RUN_ALL_TESTS(); diff --git a/searchlib/src/tests/queryeval/parallel_weak_and/parallel_weak_and_test.cpp b/searchlib/src/tests/queryeval/parallel_weak_and/parallel_weak_and_test.cpp index 2bd560637d2..996cd448f44 100644 --- a/searchlib/src/tests/queryeval/parallel_weak_and/parallel_weak_and_test.cpp +++ b/searchlib/src/tests/queryeval/parallel_weak_and/parallel_weak_and_test.cpp @@ -68,8 +68,8 @@ struct TestHeap : public WeakAndHeap { ScoresHistory history; - TestHeap(uint32_t scoresToTrack_) : WeakAndHeap(scoresToTrack_), history() {} - virtual void adjust(score_t *begin, score_t *end) override { + explicit TestHeap(uint32_t scoresToTrack_) : WeakAndHeap(scoresToTrack_), history() {} + void adjust(score_t *begin, score_t *end) override { Scores scores; for (score_t *itr = begin; itr != end; ++itr) { scores.add(*itr); @@ -87,8 +87,8 @@ struct WandTestSpec : public WandSpec TermFieldMatchData rootMatchData; MatchParams matchParams; - WandTestSpec(uint32_t scoresToTrack, uint32_t scoresAdjustFrequency = 1, - score_t scoreThreshold = 0, double thresholdBoostFactor = 1); + explicit WandTestSpec(uint32_t scoresToTrack, uint32_t scoresAdjustFrequency = 1, + score_t scoreThreshold = 0, double thresholdBoostFactor = 1); ~WandTestSpec(); SearchIterator::UP create() { MatchData::UP childrenMatchData = createMatchData(); @@ -114,7 +114,7 @@ WandTestSpec<HeapType>::WandTestSpec(uint32_t scoresToTrack, uint32_t scoresAdju {} template <typename HeapType> -WandTestSpec<HeapType>::~WandTestSpec() {} +WandTestSpec<HeapType>::~WandTestSpec() = default; using WandSpecWithTestHeap = WandTestSpec<TestHeap>; using WandSpecWithRealHeap = WandTestSpec<SharedWeakAndPriorityQueue>; @@ -137,8 +137,8 @@ SimpleResult asSimpleResult(const FakeResult &result) { SimpleResult retval; - for (size_t i = 0; i < result.inspect().size(); ++i) { - retval.addHit(result.inspect()[i].docId); + for (const auto & doc : result.inspect()) { + retval.addHit(doc.docId); } return retval; } @@ -152,26 +152,26 @@ struct WandBlueprintSpec FakeRequestContext requestContext; WandBlueprintSpec &add(const std::string &token, int32_t weight) { - tokens.push_back(std::make_pair(token, weight)); + tokens.emplace_back(token, weight); return *this; } Node::UP createNode(uint32_t scoresToTrack = 100, score_t scoreThreshold = 0, double thresholdBoostFactor = 1) const { - SimpleWandTerm *node = new SimpleWandTerm(tokens.size(), "view", 0, Weight(0), - scoresToTrack, scoreThreshold, thresholdBoostFactor); - for (size_t i = 0; i < tokens.size(); ++i) { - node->addTerm(tokens[i].first, Weight(tokens[i].second)); + auto node = std::make_unique<SimpleWandTerm>(tokens.size(), "view", 0, Weight(0), + scoresToTrack, scoreThreshold, thresholdBoostFactor); + for (const auto & token : tokens) { + node->addTerm(token.first, Weight(token.second)); } - return Node::UP(node); + return node; } Blueprint::UP blueprint(Searchable &searchable, const std::string &field, const search::query::Node &term) const { FieldSpecList fields; fields.add(FieldSpec(field, fieldId, handle)); Blueprint::UP bp = searchable.createBlueprint(requestContext, fields, term); - EXPECT_TRUE(dynamic_cast<ParallelWeakAndBlueprint*>(bp.get()) != 0); + EXPECT_TRUE(dynamic_cast<ParallelWeakAndBlueprint*>(bp.get()) != nullptr); return bp; } @@ -182,7 +182,7 @@ struct WandBlueprintSpec bp->basic_plan(true, docIdLimit); bp->fetchPostings(ExecuteInfo::FULL); SearchIterator::UP sb = bp->createSearch(*md); - EXPECT_TRUE(dynamic_cast<ParallelWeakAndSearch*>(sb.get()) != 0); + EXPECT_TRUE(dynamic_cast<ParallelWeakAndSearch*>(sb.get()) != nullptr); return sb; } @@ -197,7 +197,7 @@ struct WandBlueprintSpec bp->basic_plan(true, docIdLimit); bp->fetchPostings(ExecuteInfo::FULL); SearchIterator::UP sb = bp->createSearch(*md); - EXPECT_TRUE(dynamic_cast<ParallelWeakAndSearch*>(sb.get()) != 0); + EXPECT_TRUE(dynamic_cast<ParallelWeakAndSearch*>(sb.get()) != nullptr); return doSearch(*sb, *md->resolveTermField(handle)); } }; @@ -258,7 +258,7 @@ struct AlgoSameScoreFixture : public FixtureBase struct AlgoScoreThresholdFixture : public FixtureBase { - AlgoScoreThresholdFixture(score_t scoreThreshold) : FixtureBase(3, 1, scoreThreshold) { + explicit AlgoScoreThresholdFixture(score_t scoreThreshold) : FixtureBase(3, 1, scoreThreshold) { spec.leaf(LeafSpec("A", 1).doc(1, 10).doc(2, 30)); spec.leaf(LeafSpec("B", 2).doc(1, 20).doc(3, 40)); prepare(); @@ -267,7 +267,7 @@ struct AlgoScoreThresholdFixture : public FixtureBase struct AlgoLargeScoresFixture : public FixtureBase { - AlgoLargeScoresFixture(score_t scoreThreshold) : FixtureBase(3, 1, scoreThreshold) { + explicit AlgoLargeScoresFixture(score_t scoreThreshold) : FixtureBase(3, 1, scoreThreshold) { spec.leaf(LeafSpec("A", 60000).doc(1, 60000).doc(2, 70000)); spec.leaf(LeafSpec("B", 70000).doc(1, 80000).doc(3, 90000)); prepare(); @@ -276,7 +276,7 @@ struct AlgoLargeScoresFixture : public FixtureBase struct AlgoExhaustPastFixture : public FixtureBase { - AlgoExhaustPastFixture(score_t scoreThreshold) : FixtureBase(3, 1, scoreThreshold) { + explicit AlgoExhaustPastFixture(score_t scoreThreshold) : FixtureBase(3, 1, scoreThreshold) { spec.leaf(LeafSpec("A", 1).doc(1, 20).doc(3, 40).doc(5, 10)); spec.leaf(LeafSpec("B", 1).doc(5, 10)); spec.leaf(LeafSpec("C", 1).doc(5, 10)); @@ -449,11 +449,11 @@ struct BlueprintFixtureBase }; BlueprintFixtureBase::BlueprintFixtureBase() : spec(), searchable() {} -BlueprintFixtureBase::~BlueprintFixtureBase() {} +BlueprintFixtureBase::~BlueprintFixtureBase() = default; struct BlueprintHitsFixture : public BlueprintFixtureBase { - FakeResult createResult(size_t hits) { + static FakeResult createResult(size_t hits) { FakeResult result; for (size_t i = 0; i < hits; ++i) { result.doc(i + 1); @@ -479,7 +479,7 @@ struct BlueprintHitsFixture : public BlueprintFixtureBase struct ThresholdBoostFixture : public FixtureBase { FakeResult result; - ThresholdBoostFixture(double boost) : FixtureBase(1, 1, 800, boost) { + explicit ThresholdBoostFixture(double boost) : FixtureBase(1, 1, 800, boost) { spec.leaf(LeafSpec("A").doc(1, 10)); spec.leaf(LeafSpec("B").doc(2, 20)); spec.leaf(LeafSpec("C").doc(3, 30)); @@ -532,7 +532,7 @@ TEST(ParallelWeakAndTest, require_that_blueprint_picks_up_docid_limit) BlueprintFixture f; Node::UP term = f.spec.createNode(57, 67, 77.7); Blueprint::UP bp = f.blueprint(*term); - const ParallelWeakAndBlueprint * pbp = dynamic_cast<const ParallelWeakAndBlueprint *>(bp.get()); + const auto * pbp = dynamic_cast<const ParallelWeakAndBlueprint *>(bp.get()); EXPECT_EQ(0u, pbp->get_docid_limit()); bp->setDocIdLimit(1000); EXPECT_EQ(1000u, pbp->get_docid_limit()); @@ -543,7 +543,7 @@ TEST(ParallelWeakAndTest, require_that_scores_to_track_score_threshold_and_thres BlueprintFixture f; Node::UP term = f.spec.createNode(57, 67, 77.7); Blueprint::UP bp = f.blueprint(*term); - const ParallelWeakAndBlueprint * pbp = dynamic_cast<const ParallelWeakAndBlueprint *>(bp.get()); + const auto * pbp = dynamic_cast<const ParallelWeakAndBlueprint *>(bp.get()); EXPECT_EQ(57u, pbp->getScores().getScoresToTrack()); EXPECT_EQ(67u, pbp->getScoreThreshold()); EXPECT_EQ(77.7, pbp->getThresholdBoostFactor()); @@ -708,7 +708,7 @@ SearchIterator::UP create_wand(bool use_dww, class Verifier : public search::test::DwwIteratorChildrenVerifier { public: - Verifier(bool use_dww) : _use_dww(use_dww) { } + explicit Verifier(bool use_dww) : _use_dww(use_dww) { } private: SearchIterator::UP create(bool strict) const override { MatchParams match_params(_dummy_heap, _dummy_heap.getMinScore(), 1.0, 1); diff --git a/searchlib/src/tests/queryeval/sourceblender/sourceblender_test.cpp b/searchlib/src/tests/queryeval/sourceblender/sourceblender_test.cpp index b84cb02a357..b2a1f6a645a 100644 --- a/searchlib/src/tests/queryeval/sourceblender/sourceblender_test.cpp +++ b/searchlib/src/tests/queryeval/sourceblender/sourceblender_test.cpp @@ -7,15 +7,14 @@ #include <vespa/searchlib/queryeval/leaf_blueprints.h> #define ENABLE_GTEST_MIGRATION #include <vespa/searchlib/test/searchiteratorverifier.h> -#include <vespa/searchlib/common/bitvectoriterator.h> #include <vespa/searchlib/attribute/fixedsourceselector.h> #include <vespa/searchlib/fef/matchdata.h> #include <vespa/vespalib/gtest/gtest.h> using namespace search::queryeval; -using namespace search::fef; using namespace search; using std::make_unique; +using search::fef::MatchData; /** * Proxy search used to verify unpack pattern @@ -27,24 +26,24 @@ private: SimpleResult _unpacked; protected: - virtual void doSeek(uint32_t docid) override { + void doSeek(uint32_t docid) override { _search->seek(docid); setDocId(_search->getDocId()); } - virtual void doUnpack(uint32_t docid) override { + void doUnpack(uint32_t docid) override { _unpacked.addHit(docid); _search->unpack(docid); } public: - UnpackChecker(SearchIterator *search) : _search(search), _unpacked() {} + explicit UnpackChecker(SearchIterator *search) : _search(search), _unpacked() {} const SimpleResult &getUnpacked() const { return _unpacked; } }; class MySelector : public search::FixedSourceSelector { public: - MySelector(int defaultSource) : search::FixedSourceSelector(defaultSource, "fs") { } + explicit MySelector(int defaultSource) : search::FixedSourceSelector(defaultSource, "fs") { } MySelector & set(Source s, uint32_t docId) { setSource(s, docId); return *this; @@ -65,12 +64,12 @@ TEST(SourceBlenderTest, test_strictness) a.addHit(2).addHit(5).addHit(6).addHit(8); b.addHit(3).addHit(5).addHit(6).addHit(7); - MySelector *sel = new MySelector(5); + auto *sel = new MySelector(5); sel->set(2, 1).set(3, 2).set(5, 2).set(7, 1); - SourceBlenderBlueprint *blend_b = new SourceBlenderBlueprint(*sel); - Blueprint::UP a_b(new SimpleBlueprint(a)); - Blueprint::UP b_b(new SimpleBlueprint(b)); + auto *blend_b = new SourceBlenderBlueprint(*sel); + auto a_b = std::make_unique<SimpleBlueprint>(a); + auto b_b = std::make_unique<SimpleBlueprint>(b); a_b->setSourceId(1); b_b->setSourceId(2); blend_b->addChild(std::move(a_b)); @@ -111,16 +110,16 @@ TEST(SourceBlenderTest, test_full_sourceblender_search) c.addHit(4).addHit(11).addHit(21).addHit(32); // these are all handed over to the blender - UnpackChecker *ua = new UnpackChecker(new SimpleSearch(a)); - UnpackChecker *ub = new UnpackChecker(new SimpleSearch(b)); - UnpackChecker *uc = new UnpackChecker(new SimpleSearch(c)); + auto *ua = new UnpackChecker(new SimpleSearch(a)); + auto *ub = new UnpackChecker(new SimpleSearch(b)); + auto *uc = new UnpackChecker(new SimpleSearch(c)); auto sel = make_unique<MySelector>(5); sel->set(2, 1).set(3, 2).set(11, 2).set(21, 3).set(34, 1); SourceBlenderSearch::Children abc; - abc.push_back(SourceBlenderSearch::Child(ua, 1)); - abc.push_back(SourceBlenderSearch::Child(ub, 2)); - abc.push_back(SourceBlenderSearch::Child(uc, 3)); + abc.emplace_back(ua, 1); + abc.emplace_back(ub, 2); + abc.emplace_back(uc, 3); SearchIterator::UP blend(SourceBlenderSearch::create(sel->createIterator(), abc, true)); SimpleResult result; @@ -149,7 +148,7 @@ using search::test::SearchIteratorVerifier; class Verifier : public SearchIteratorVerifier { public: Verifier(); - ~Verifier(); + ~Verifier() override; SearchIterator::UP create(bool strict) const override { return SearchIterator::UP(SourceBlenderSearch::create(_selector.createIterator(), createChildren(strict), @@ -178,7 +177,7 @@ Verifier::Verifier() : _indexes[indexId].push_back(docId); } } -Verifier::~Verifier() {} +Verifier::~Verifier() = default; TEST(SourceBlenderTest, test_that_source_blender_iterator_adheres_to_search_terator_requirements) { diff --git a/searchlib/src/tests/queryeval/sparse_vector_benchmark/sparse_vector_benchmark_test.cpp b/searchlib/src/tests/queryeval/sparse_vector_benchmark/sparse_vector_benchmark_test.cpp index 94ecd8fa539..a7516226daf 100644 --- a/searchlib/src/tests/queryeval/sparse_vector_benchmark/sparse_vector_benchmark_test.cpp +++ b/searchlib/src/tests/queryeval/sparse_vector_benchmark/sparse_vector_benchmark_test.cpp @@ -13,6 +13,7 @@ #include <vespa/searchlib/queryeval/simpleresult.h> #include <vespa/searchlib/queryeval/wand/weak_and_search.h> #include <vespa/searchlib/queryeval/weighted_set_term_search.h> +#include <vespa/searchlib/queryeval/wand/weak_and_heap.h> #include <vespa/vespalib/util/box.h> #include <vespa/vespalib/util/stringfmt.h> @@ -135,7 +136,7 @@ constexpr vespalib::duration max_time = 1000s; //----------------------------------------------------------------------------- struct ChildFactory { - ChildFactory() {} + ChildFactory() = default; virtual std::string name() const = 0; virtual SearchIterator::UP createChild(uint32_t idx, uint32_t limit) const = 0; virtual ~ChildFactory() = default; @@ -190,8 +191,9 @@ struct ModSearchFactory : ChildFactory { //----------------------------------------------------------------------------- struct VespaWandFactory : SparseVectorFactory { + mutable SharedWeakAndPriorityQueue _scores; uint32_t n; - explicit VespaWandFactory(uint32_t n_in) noexcept : n(n_in) {} + explicit VespaWandFactory(uint32_t n_in) : _scores(n_in), n(n_in) {} std::string name() const override { return vespalib::make_string("VespaWand(%u)", n); } @@ -200,7 +202,7 @@ struct VespaWandFactory : SparseVectorFactory { for (size_t i = 0; i < childCnt; ++i) { terms.emplace_back(childFactory.createChild(i, limit), default_weight, limit / (i + 1)); } - return WeakAndSearch::create(terms, n, true); + return WeakAndSearch::create(terms, wand::MatchParams(_scores), n, true); } }; diff --git a/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp b/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp index 5e056eb6c0e..457f7133dc1 100644 --- a/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp +++ b/searchlib/src/tests/queryeval/weak_and/wand_bench_setup.hpp @@ -29,20 +29,20 @@ struct Stats { size_t unpackCnt; size_t skippedDocs; size_t skippedHits; - Stats() : hitCnt(0), seekCnt(0), unpackCnt(0), + Stats() noexcept : hitCnt(0), seekCnt(0), unpackCnt(0), skippedDocs(0), skippedHits(0) {} - void hit() { + void hit() noexcept { ++hitCnt; } - void seek(size_t docs, size_t hits) { + void seek(size_t docs, size_t hits) noexcept { ++seekCnt; skippedDocs += docs; skippedHits += hits; } - void unpack() { + void unpack() noexcept { ++unpackCnt; } - void print() { + void print() const { fprintf(stderr, "Stats: hits=%zu, seeks=%zu, unpacks=%zu, skippedDocs=%zu, skippedHits=%zu\n", hitCnt, seekCnt, unpackCnt, skippedDocs, skippedHits); } @@ -77,7 +77,7 @@ struct ModSearch : SearchIterator { } } void doUnpack(uint32_t docid) override { - if (tfmd != NULL) { + if (tfmd != nullptr) { tfmd->reset(docid); search::fef::TermFieldMatchDataPosition pos; pos.setElementWeight(info.getMaxWeight()); @@ -96,40 +96,52 @@ ModSearch::~ModSearch() = default; struct WandFactory { virtual std::string name() const = 0; virtual SearchIterator::UP create(const wand::Terms &terms) = 0; - virtual ~WandFactory() {} + virtual ~WandFactory() = default; }; struct VespaWandFactory : WandFactory { + mutable SharedWeakAndPriorityQueue _scores; uint32_t n; - VespaWandFactory(uint32_t n_in) : n(n_in) {} + explicit VespaWandFactory(uint32_t n_in) noexcept + : _scores(n_in), + n(n_in) + {} ~VespaWandFactory() override; - virtual std::string name() const override { return make_string("VESPA WAND (n=%u)", n); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(WeakAndSearch::create(terms, n, true)); + std::string name() const override { return make_string("VESPA WAND (n=%u)", n); } + SearchIterator::UP create(const wand::Terms &terms) override { + return WeakAndSearch::create(terms, wand::MatchParams(_scores, 1, 1), n, true); } }; VespaWandFactory::~VespaWandFactory() = default; struct VespaArrayWandFactory : WandFactory { + mutable SharedWeakAndPriorityQueue _scores; uint32_t n; - VespaArrayWandFactory(uint32_t n_in) : n(n_in) {} + explicit VespaArrayWandFactory(uint32_t n_in) + : _scores(n_in), + n(n_in) + {} ~VespaArrayWandFactory() override; - virtual std::string name() const override { return make_string("VESPA ARRAY WAND (n=%u)", n); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(WeakAndSearch::createArrayWand(terms, n, true)); + std::string name() const override { return make_string("VESPA ARRAY WAND (n=%u)", n); } + SearchIterator::UP create(const wand::Terms &terms) override { + return WeakAndSearch::createArrayWand(terms, wand::MatchParams(_scores, 1, 1), wand::TermFrequencyScorer(), n, true); } }; VespaArrayWandFactory::~VespaArrayWandFactory() = default; struct VespaHeapWandFactory : WandFactory { + mutable SharedWeakAndPriorityQueue _scores; uint32_t n; - VespaHeapWandFactory(uint32_t n_in) : n(n_in) {} + explicit VespaHeapWandFactory(uint32_t n_in) + : _scores(n_in), + n(n_in) + {} ~VespaHeapWandFactory() override; - virtual std::string name() const override { return make_string("VESPA HEAP WAND (n=%u)", n); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(WeakAndSearch::createHeapWand(terms, n, true)); + std::string name() const override { return make_string("VESPA HEAP WAND (n=%u)", n); } + SearchIterator::UP create(const wand::Terms &terms) override { + return WeakAndSearch::createHeapWand(terms, wand::MatchParams(_scores, 1, 1), wand::TermFrequencyScorer(), n, true); } }; @@ -138,39 +150,39 @@ VespaHeapWandFactory::~VespaHeapWandFactory() = default; struct VespaParallelWandFactory : public WandFactory { SharedWeakAndPriorityQueue scores; TermFieldMatchData rootMatchData; - VespaParallelWandFactory(uint32_t n) : scores(n), rootMatchData() {} + explicit VespaParallelWandFactory(uint32_t n) noexcept : scores(n), rootMatchData() {} ~VespaParallelWandFactory() override; - virtual std::string name() const override { return make_string("VESPA PWAND (n=%u)", scores.getScoresToTrack()); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(ParallelWeakAndSearch::create(terms, + std::string name() const override { return make_string("VESPA PWAND (n=%u)", scores.getScoresToTrack()); } + SearchIterator::UP create(const wand::Terms &terms) override { + return ParallelWeakAndSearch::create(terms, PWMatchParams(scores, 0, 1, 1), - PWRankParams(rootMatchData, MatchData::UP()), true)); + PWRankParams(rootMatchData, {}), true); } }; VespaParallelWandFactory::~VespaParallelWandFactory() = default; struct VespaParallelArrayWandFactory : public VespaParallelWandFactory { - VespaParallelArrayWandFactory(uint32_t n) : VespaParallelWandFactory(n) {} + explicit VespaParallelArrayWandFactory(uint32_t n) noexcept : VespaParallelWandFactory(n) {} ~VespaParallelArrayWandFactory() override; - virtual std::string name() const override { return make_string("VESPA ARRAY PWAND (n=%u)", scores.getScoresToTrack()); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(ParallelWeakAndSearch::createArrayWand(terms, + std::string name() const override { return make_string("VESPA ARRAY PWAND (n=%u)", scores.getScoresToTrack()); } + SearchIterator::UP create(const wand::Terms &terms) override { + return ParallelWeakAndSearch::createArrayWand(terms, PWMatchParams(scores, 0, 1, 1), - PWRankParams(rootMatchData, MatchData::UP()), true)); + PWRankParams(rootMatchData, {}), true); } }; VespaParallelArrayWandFactory::~VespaParallelArrayWandFactory() = default; struct VespaParallelHeapWandFactory : public VespaParallelWandFactory { - VespaParallelHeapWandFactory(uint32_t n) : VespaParallelWandFactory(n) {} + explicit VespaParallelHeapWandFactory(uint32_t n) noexcept : VespaParallelWandFactory(n) {} ~VespaParallelHeapWandFactory() override; - virtual std::string name() const override { return make_string("VESPA HEAP PWAND (n=%u)", scores.getScoresToTrack()); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(ParallelWeakAndSearch::createHeapWand(terms, + std::string name() const override { return make_string("VESPA HEAP PWAND (n=%u)", scores.getScoresToTrack()); } + SearchIterator::UP create(const wand::Terms &terms) override { + return ParallelWeakAndSearch::createHeapWand(terms, PWMatchParams(scores, 0, 1, 1), - PWRankParams(rootMatchData, MatchData::UP()), true)); + PWRankParams(rootMatchData, {}), true); } }; @@ -178,11 +190,11 @@ VespaParallelHeapWandFactory::~VespaParallelHeapWandFactory() = default; struct TermFrequencyRiseWandFactory : WandFactory { uint32_t n; - TermFrequencyRiseWandFactory(uint32_t n_in) : n(n_in) {} + explicit TermFrequencyRiseWandFactory(uint32_t n_in) noexcept : n(n_in) {} ~TermFrequencyRiseWandFactory() override; - virtual std::string name() const override { return make_string("RISE WAND TF (n=%u)", n); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(new rise::TermFrequencyRiseWand(terms, n)); + std::string name() const override { return make_string("RISE WAND TF (n=%u)", n); } + SearchIterator::UP create(const wand::Terms &terms) override { + return std::make_unique<rise::TermFrequencyRiseWand>(terms, n); } }; @@ -190,11 +202,11 @@ TermFrequencyRiseWandFactory::~TermFrequencyRiseWandFactory() = default; struct DotProductRiseWandFactory : WandFactory { uint32_t n; - DotProductRiseWandFactory(uint32_t n_in) : n(n_in) {} + explicit DotProductRiseWandFactory(uint32_t n_in) noexcept : n(n_in) {} ~DotProductRiseWandFactory() override; - virtual std::string name() const override { return make_string("RISE WAND DP (n=%u)", n); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { - return SearchIterator::UP(new rise::DotProductRiseWand(terms, n)); + std::string name() const override { return make_string("RISE WAND DP (n=%u)", n); } + SearchIterator::UP create(const wand::Terms &terms) override { + return std::make_unique<rise::DotProductRiseWand>(terms, n); } }; @@ -204,13 +216,13 @@ struct FilterFactory : WandFactory { WandFactory &factory; Stats stats; uint32_t n; - FilterFactory(WandFactory &f, uint32_t n_in) : factory(f), n(n_in) {} + FilterFactory(WandFactory &f, uint32_t n_in) noexcept : factory(f), n(n_in) {} ~FilterFactory() override; - virtual std::string name() const override { return make_string("Filter (mod=%u) [%s]", n, factory.name().c_str()); } - virtual SearchIterator::UP create(const wand::Terms &terms) override { + std::string name() const override { return make_string("Filter (mod=%u) [%s]", n, factory.name().c_str()); } + SearchIterator::UP create(const wand::Terms &terms) override { AndNotSearch::Children children; children.push_back(factory.create(terms)); - children.emplace_back(new ModSearch(stats, n, search::endDocId, n, NULL)); + children.emplace_back(new ModSearch(stats, n, search::endDocId, n, nullptr)); return AndNotSearch::create(std::move(children), true); } }; @@ -220,8 +232,8 @@ FilterFactory::~FilterFactory() = default; struct Setup { Stats stats; vespalib::duration minTime; - Setup() : stats(), minTime(10000s) {} - virtual ~Setup() {} + Setup() noexcept : stats(), minTime(10000s) {} + virtual ~Setup() = default; virtual std::string name() const = 0; virtual SearchIterator::UP create() = 0; void perform() { @@ -256,10 +268,10 @@ struct WandSetup : Setup { MatchData::UP matchData; WandSetup(WandFactory &f, uint32_t c, uint32_t l) : Setup(), factory(f), childCnt(c), limit(l), weight(100), matchData() {} ~WandSetup() override; - virtual std::string name() const override { + std::string name() const override { return make_string("Wand Setup (terms=%u,docs=%u) [%s]", childCnt, limit, factory.name().c_str()); } - virtual SearchIterator::UP create() override { + SearchIterator::UP create() override { MatchDataLayout layout; std::vector<TermFieldHandle> handles; for (size_t i = 0; i < childCnt; ++i) { diff --git a/searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp b/searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp index 689f9f085d0..4aab66f3cc9 100644 --- a/searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp +++ b/searchlib/src/tests/queryeval/weak_and/weak_and_test.cpp @@ -1,8 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -#include <vespa/searchlib/queryeval/fake_search.h> #include <vespa/searchlib/queryeval/wand/weak_and_search.h> +#include <vespa/searchlib/queryeval/wand/weak_and_heap.h> #include <vespa/searchlib/queryeval/simpleresult.h> -#include <vespa/searchlib/queryeval/simplesearch.h> #include <vespa/searchlib/queryeval/test/eagerchild.h> #include <vespa/searchlib/queryeval/test/leafspec.h> #include <vespa/searchlib/queryeval/test/wandspec.h> @@ -20,11 +19,13 @@ namespace { struct MyWandSpec : public WandSpec { + SharedWeakAndPriorityQueue scores; uint32_t n; - MyWandSpec(uint32_t n_) : WandSpec(), n(n_) {} + explicit MyWandSpec(uint32_t n_in) : WandSpec(), scores(n_in), n(n_in) {} SearchIterator *create() { - return new TrackedSearch("WAND", getHistory(), WeakAndSearch::create(getTerms(), n, true)); + return new TrackedSearch("WAND", getHistory(), + WeakAndSearch::create(getTerms(), wand::MatchParams(scores, 1, 1), n, true)); } }; @@ -104,7 +105,8 @@ TEST(WeakAndTest, require_that_initial_docid_for_subsearches_are_taken_into_acco wand::Terms terms; terms.push_back(wand::Term(new TrackedSearch("foo", history, new EagerChild(search::endDocId)), 100, 1)); terms.push_back(wand::Term(new TrackedSearch("bar", history, new EagerChild(10)), 100, 2)); - SearchIterator::UP search(new TrackedSearch("WAND", history, WeakAndSearch::create(terms, 2, true))); + SharedWeakAndPriorityQueue scores(2); + auto search = std::make_unique<TrackedSearch>("WAND", history, WeakAndSearch::create(terms, wand::MatchParams(scores), 2, true)); SimpleResult hits; hits.search(*search); EXPECT_EQ(SimpleResult().addHit(10), hits); @@ -114,17 +116,26 @@ TEST(WeakAndTest, require_that_initial_docid_for_subsearches_are_taken_into_acco } class IteratorChildrenVerifier : public search::test::IteratorChildrenVerifier { +public: + IteratorChildrenVerifier(); + ~IteratorChildrenVerifier() override; private: + mutable std::vector<std::unique_ptr<SharedWeakAndPriorityQueue>> _scores; SearchIterator::UP create(bool strict) const override { wand::Terms terms; for (size_t i = 0; i < _num_children; ++i) { terms.emplace_back(createIterator(_split_lists[i], strict).release(), 100, _split_lists[i].size()); } - return SearchIterator::UP(WeakAndSearch::create(terms, -1, strict)); + static constexpr size_t LARGE_ENOUGH_HEAP_FOR_ALL = 10000; + _scores.push_back(std::make_unique<SharedWeakAndPriorityQueue>(LARGE_ENOUGH_HEAP_FOR_ALL)); + return WeakAndSearch::create(terms, wand::MatchParams(*_scores.back(), 1, 1), -1, strict); } }; +IteratorChildrenVerifier::IteratorChildrenVerifier() : _scores() {} +IteratorChildrenVerifier::~IteratorChildrenVerifier() = default; + TEST(WeakAndTest, verify_search_iterator_conformance) { IteratorChildrenVerifier verifier; diff --git a/searchlib/src/tests/queryeval/weak_and/weak_and_test_expensive.cpp b/searchlib/src/tests/queryeval/weak_and/weak_and_test_expensive.cpp index 54bf1e92037..0573404a3b4 100644 --- a/searchlib/src/tests/queryeval/weak_and/weak_and_test_expensive.cpp +++ b/searchlib/src/tests/queryeval/weak_and/weak_and_test_expensive.cpp @@ -16,15 +16,16 @@ void checkWandHits(WandFactory &vespa, WandFactory &rise, uint32_t step, uint32_ s1->initFullRange(); SearchIterator::UP s2 = riseSetup.create(); s2->initFullRange(); - ASSERT_TRUE(dynamic_cast<WeakAndType*>(s1.get()) != 0); - ASSERT_TRUE(dynamic_cast<WeakAndType*>(s2.get()) == 0); - ASSERT_TRUE(dynamic_cast<RiseType*>(s2.get()) != 0); - ASSERT_TRUE(dynamic_cast<RiseType*>(s1.get()) == 0); + ASSERT_TRUE(dynamic_cast<WeakAndType*>(s1.get()) != nullptr); + ASSERT_TRUE(dynamic_cast<WeakAndType*>(s2.get()) == nullptr); + ASSERT_TRUE(dynamic_cast<RiseType*>(s2.get()) != nullptr); + ASSERT_TRUE(dynamic_cast<RiseType*>(s1.get()) == nullptr); s1->seek(1); s2->seek(1); while (!s1->isAtEnd() && !s2->isAtEnd()) { + if (s1->getDocId() != s2->getDocId()) assert(true); ASSERT_EQUAL(s1->getDocId(), s2->getDocId()); if ((filter == 0) || ((s1->getDocId() % filter) != 0)) { s1->unpack(s1->getDocId()); diff --git a/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp b/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp index e1f3f0805d9..8a0bc28f4dd 100644 --- a/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp +++ b/searchlib/src/tests/queryeval/weak_and_scorers/weak_and_scorers_test.cpp @@ -63,4 +63,27 @@ TEST("require that DotProductScorer calculates term score") EXPECT_EQUAL(11u, itr->_unpackDocId); } +TEST("test bm25 idf scorer for wand") +{ + wand::Bm25TermFrequencyScorer scorer(1000000, 1.0); + EXPECT_EQUAL(13410046, scorer.calculateMaxScore(1, 1)); + EXPECT_EQUAL(11464136, scorer.calculateMaxScore(10, 1)); + EXPECT_EQUAL(6907256, scorer.calculateMaxScore(1000, 1)); + EXPECT_EQUAL(4605121, scorer.calculateMaxScore(10000, 1)); + EXPECT_EQUAL(2302581, scorer.calculateMaxScore(100000, 1)); + EXPECT_EQUAL(693147, scorer.calculateMaxScore(500000, 1)); + EXPECT_EQUAL(105360, scorer.calculateMaxScore(900000, 1)); + EXPECT_EQUAL(10050, scorer.calculateMaxScore(990000, 1)); +} + +TEST("test limited range of bm25 idf scorer for wand") +{ + wand::Bm25TermFrequencyScorer scorer08(1000000, 0.8); + wand::Bm25TermFrequencyScorer scorer10(1000000, 1.0); + EXPECT_EQUAL(8207814, scorer08.calculateMaxScore(1000, 1)); + EXPECT_EQUAL(2690049, scorer08.calculateMaxScore(990000, 1)); + EXPECT_EQUAL(6907256, scorer10.calculateMaxScore(1000, 1)); + EXPECT_EQUAL(10050, scorer10.calculateMaxScore(990000, 1)); +} + TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/tests/ranksetup/ranksetup_test.cpp b/searchlib/src/tests/ranksetup/ranksetup_test.cpp index 53224425a04..348326c3936 100644 --- a/searchlib/src/tests/ranksetup/ranksetup_test.cpp +++ b/searchlib/src/tests/ranksetup/ranksetup_test.cpp @@ -26,6 +26,7 @@ #include <vespa/searchlib/fef/test/rankresult.h> #include <vespa/searchlib/features/rankingexpressionfeature.h> +#include <vespa/searchlib/features/second_phase_feature.h> #include <vespa/searchlib/features/setup.h> #include <vespa/searchlib/features/valuefeature.h> #include <vespa/searchlib/fef/test/plugin/chain.h> @@ -787,6 +788,19 @@ RankSetupTest::testFeatureDump() exp.addScore("test_cfgvalue(foo)", 1.0); EXPECT_EQUAL(exp, dumper.dump()); } + { // Dump secondPhase feature + IndexEnvironment indexEnv; + indexEnv.getProperties().add(indexproperties::rank::FirstPhase::NAME, "value(2)"); + indexEnv.getProperties().add(indexproperties::rank::SecondPhase::NAME, "value(4)"); + RankEnvironment rankEnv(_factory, indexEnv, _queryEnv); + FeatureDumper dumper(rankEnv); + dumper.configure(); + dumper.addDumpFeature("secondPhase"); + EXPECT_TRUE(dumper.setup()); + RankResult exp; + exp.addScore("secondPhase", 4.0); + EXPECT_EQUAL(exp, dumper.dump()); + } } void @@ -939,6 +953,7 @@ RankSetupTest::RankSetupTest() : setup_fef_test_plugin(_factory); _factory.addPrototype(Blueprint::SP(new ValueBlueprint())); _factory.addPrototype(Blueprint::SP(new RankingExpressionBlueprint())); + _factory.addPrototype(std::make_shared<SecondPhaseBlueprint>()); // setup an original attribute manager with two attributes search::attribute::Config cfg(search::attribute::BasicType::INT32, diff --git a/searchlib/src/tests/tensor/distance_calculator/distance_calculator_test.cpp b/searchlib/src/tests/tensor/distance_calculator/distance_calculator_test.cpp index 4ffc1fe366e..136878f0ea5 100644 --- a/searchlib/src/tests/tensor/distance_calculator/distance_calculator_test.cpp +++ b/searchlib/src/tests/tensor/distance_calculator/distance_calculator_test.cpp @@ -9,7 +9,6 @@ #include <vespa/searchlib/test/attribute_builder.h> #include <vespa/vespalib/gtest/gtest.h> #include <vespa/vespalib/util/exceptions.h> -#include <iostream> using namespace search::attribute::test; using namespace search::attribute; diff --git a/searchlib/src/tests/tensor/distance_functions/CMakeLists.txt b/searchlib/src/tests/tensor/distance_functions/CMakeLists.txt index e1a54f7883a..92ad9ae2648 100644 --- a/searchlib/src/tests/tensor/distance_functions/CMakeLists.txt +++ b/searchlib/src/tests/tensor/distance_functions/CMakeLists.txt @@ -7,3 +7,10 @@ vespa_add_executable(searchlib_distance_functions_test_app TEST GTest::GTest ) vespa_add_test(NAME searchlib_distance_functions_test_app COMMAND searchlib_distance_functions_test_app) + +vespa_add_executable(searchlib_distance_functions_benchmark_app TEST + SOURCES + distance_functions_benchmark.cpp + DEPENDS + searchlib +) diff --git a/searchlib/src/tests/tensor/distance_functions/distance_functions_benchmark.cpp b/searchlib/src/tests/tensor/distance_functions/distance_functions_benchmark.cpp new file mode 100644 index 00000000000..14a0adac651 --- /dev/null +++ b/searchlib/src/tests/tensor/distance_functions/distance_functions_benchmark.cpp @@ -0,0 +1,129 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/eval/eval/typed_cells.h> +#include <vespa/searchlib/common/geo_gcd.h> +#include <vespa/searchlib/tensor/distance_functions.h> +#include <vespa/searchlib/tensor/distance_function_factory.h> +#include <vespa/searchlib/tensor/mips_distance_transform.h> +#include <vespa/vespalib/util/benchmark_timer.h> +#include <vespa/vespalib/util/classname.h> + +using namespace search::tensor; +using vespalib::eval::Int8Float; +using vespalib::BFloat16; +using vespalib::eval::TypedCells; +using search::attribute::DistanceMetric; + +size_t npos = std::string::npos; + +double run_calc(size_t iterations, TypedCells b, const BoundDistanceFunction & df) __attribute__((noinline)); +double run_calc_with_limit(size_t iterations, TypedCells b, const BoundDistanceFunction & df) __attribute__((noinline)); + +double +run_calc(size_t iterations, TypedCells b, const BoundDistanceFunction & df) { + vespalib::BenchmarkTimer timer(1.0); + double min_result = std::numeric_limits<double>::max(); + while (timer.has_budget()) { + timer.before(); + for (size_t i(0); i < iterations; i++) { + min_result = std::min(df.calc(b), min_result); + } + timer.after(); + } + printf("%s::calc: Time used = %1.3f, min_result=%3.3f\n", + vespalib::getClassName(df).c_str(), timer.min_time(), min_result); + return min_result; +} + +double +run_calc_with_limit(size_t iterations, TypedCells b, const BoundDistanceFunction & df) { + vespalib::BenchmarkTimer timer(1.0); + double min_result = std::numeric_limits<double>::max(); + while (timer.has_budget()) { + timer.before(); + for (size_t i(0); i < iterations; i++) { + min_result = std::min(df.calc_with_limit(b, std::numeric_limits<double>::max()), min_result); + } + timer.after(); + } + + printf("%s::calc_with_limit: Time used = %1.3f, min_result=%3.3f\n", + vespalib::getClassName(df).c_str(), timer.min_time(), min_result); + return min_result; +} + +template<typename T> +void benchmark(size_t iterations, size_t elems) __attribute__((noinline)); + +template<typename T> +void benchmark(size_t iterations, size_t elems, const DistanceFunctionFactory & df) { + std::vector<T> av, bv; + srandom(7); + av.reserve(elems); + bv.reserve(elems); + for (size_t i(0); i < elems; i++) { + av.push_back(random()%128); + bv.push_back(random()%128); + } + TypedCells a_cells(av), b_cells(bv); + + double calc_result = run_calc(iterations, b_cells, *df.for_query_vector(a_cells)); + double calc_with_limit_result = run_calc_with_limit(iterations, b_cells, *df.for_query_vector(a_cells)); + assert(calc_result == calc_with_limit_result); +} + +template<typename T> +void benchmark(size_t iterations, size_t elems, const std::string & dist_functions) { + if (dist_functions.find("euclid") != npos) { + benchmark<T>(iterations, elems, EuclideanDistanceFunctionFactory<T>()); + } + if (dist_functions.find("angular") != npos) { + if constexpr ( ! std::is_same<T, BFloat16>()) { + benchmark<T>(iterations, elems, AngularDistanceFunctionFactory<T>()); + } + } + if (dist_functions.find("prenorm") != npos) { + if constexpr ( ! std::is_same<T, BFloat16>()) { + benchmark<T>(iterations, elems, PrenormalizedAngularDistanceFunctionFactory<T>()); + } + } + if (dist_functions.find("mips") != npos) { + if constexpr ( !std::is_same<T, BFloat16>()) { + benchmark<T>(iterations, elems, MipsDistanceFunctionFactory<T>()); + } + } +} + +void +benchmark(size_t iterations, size_t elems, const std::string & dist_functions, const std::string & data_types) { + if (data_types.find("double") != npos) { + benchmark<double>(iterations, elems, dist_functions); + } + if (data_types.find("float32") != npos) { + benchmark<float>(iterations, elems, dist_functions); + } + if (data_types.find("bfloat16") != npos) { + benchmark<BFloat16>(iterations, elems, dist_functions); + } + if (data_types.find("float8") != npos) { + benchmark<Int8Float>(iterations, elems, dist_functions); + } +} + +int +main(int argc, char *argv[]) { + size_t num_iterations = 10000000; + size_t num_elems = 1024; + std::string dist_functions = "angular euclid prenorm mips"; + std::string data_types = "double float32 bfloat16 float8"; + if (argc > 1) { num_iterations = atol(argv[1]); } + if (argc > 2) { num_elems = atol(argv[2]); } + if (argc > 3) { dist_functions = argv[3]; } + if (argc > 4) { data_types = argv[4]; } + + printf("Benchmarking %ld iterations with vector length %ld with distance functions '%s' for data types '%s'\n", + num_iterations, num_elems, dist_functions.c_str(), data_types.c_str()); + benchmark(num_iterations, num_elems, dist_functions, data_types); + + return 0; +} diff --git a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp index d50677314df..97b88bc787a 100644 --- a/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp +++ b/searchlib/src/tests/tensor/hnsw_index/hnsw_index_test.cpp @@ -111,7 +111,7 @@ class MyBoundDistanceFunction : public BoundDistanceFunction { std::unique_ptr<BoundDistanceFunction> _real; public: - MyBoundDistanceFunction(std::unique_ptr<BoundDistanceFunction> real) + explicit MyBoundDistanceFunction(std::unique_ptr<BoundDistanceFunction> real) : _real(std::move(real)) { } @@ -147,19 +147,19 @@ class MyDistanceFunctionFactory : public DistanceFunctionFactory { std::unique_ptr<DistanceFunctionFactory> _real; public: - MyDistanceFunctionFactory(std::unique_ptr<DistanceFunctionFactory> real) + explicit MyDistanceFunctionFactory(std::unique_ptr<DistanceFunctionFactory> real) : _real(std::move(real)) { } ~MyDistanceFunctionFactory() override; - std::unique_ptr<BoundDistanceFunction> for_query_vector(TypedCells lhs) override { + std::unique_ptr<BoundDistanceFunction> for_query_vector(TypedCells lhs) const override { EXPECT_FALSE(lhs.non_existing_attribute_value()); return std::make_unique<MyBoundDistanceFunction>(_real->for_query_vector(lhs)); } - std::unique_ptr<BoundDistanceFunction> for_insertion_vector(TypedCells lhs) override { + std::unique_ptr<BoundDistanceFunction> for_insertion_vector(TypedCells lhs) const override { EXPECT_FALSE(lhs.non_existing_attribute_value()); return std::make_unique<MyBoundDistanceFunction>(_real->for_insertion_vector(lhs)); } @@ -936,6 +936,36 @@ TYPED_TEST(HnswIndexTest, search_during_remove) this->expect_top_3_by_docid("{0, 0}", {0, 0}, {7}); } +TYPED_TEST(HnswIndexTest, inconsistent_index) +{ + this->init(false); + this->vectors.clear(); + this->vectors.set(1, {1, 3}).set(2, {7, 1}).set(3, {6, 5}).set(4, {8, 3}).set(5, {10, 3}); + this->add_document(1); + this->add_document(2); + this->add_document(3); + this->add_document(4); + this->add_document(5); + this->expect_entry_point(1, 0); + this->expect_level_0(1, {2, 3}); + this->expect_level_0(2, {1, 3, 4, 5}); + this->expect_level_0(3, {1, 2, 4}); + this->expect_level_0(4, {2, 3, 5}); + this->expect_level_0(5, {2, 4}); + EXPECT_EQ(0, this->index->check_consistency(6)); + // Remove vector for docid 5 but don't update index. + this->vectors.clear(5); + EXPECT_EQ(1, this->index->check_consistency(6)); + /* + * Removing document 2 causes mutual reconnect for nodes [1, 3, 4, 5] + * where nodes 1 and 5 are not previously connected. Distance from + * node 1 to node 5 cannot be calculated due to missing vector. + */ + this->remove_document(2); + // No reconnect for node without vector + this->expect_level_0(5, {4}); +} + using HnswMultiIndexTest = HnswIndexTest<HnswIndex<HnswIndexType::MULTI>>; namespace { diff --git a/searchlib/src/tests/util/token_extractor/token_extractor_test.cpp b/searchlib/src/tests/util/token_extractor/token_extractor_test.cpp index e6944e257e9..5eb42bb8ac4 100644 --- a/searchlib/src/tests/util/token_extractor/token_extractor_test.cpp +++ b/searchlib/src/tests/util/token_extractor/token_extractor_test.cpp @@ -118,7 +118,7 @@ TEST_F(TokenExtractorTest, empty_string) TEST_F(TokenExtractorTest, plain_string) { - EXPECT_EQ((Words{"Plain string"}), process(StringFieldValue("Plain string"))); + EXPECT_EQ((Words{}), process(StringFieldValue("Plain string"))); } TEST_F(TokenExtractorTest, normal_string) diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp index 5b17b491a20..635851f9f1d 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_factory.cpp @@ -11,7 +11,6 @@ #include "in_term_search.h" #include "multi_term_or_filter_search.h" #include "predicate_attribute.h" -#include <vespa/eval/eval/value.h> #include <vespa/searchcommon/attribute/config.h> #include <vespa/searchcommon/attribute/hit_estimate_flow_stats_adapter.h> #include <vespa/searchlib/common/location.h> @@ -94,6 +93,7 @@ using search::queryeval::StrictHeapOrSearch; using search::queryeval::WeightedSetTermBlueprint; using search::queryeval::flow::btree_cost; using search::queryeval::flow::btree_strict_cost; +using search::queryeval::flow::estimate_when_unknown; using search::queryeval::flow::get_num_indirections; using search::queryeval::flow::lookup_cost; using search::queryeval::flow::lookup_strict_cost; @@ -150,10 +150,9 @@ public: search::queryeval::FlowStats calculate_flow_stats(uint32_t docid_limit) const override { if (_hit_estimate.is_unknown()) { // E.g. attributes without fast-search are not able to provide a hit estimate. - // In this case we just assume matching half of the document corpus. // In addition, matching is lookup based, and we are not able to skip documents efficiently when being strict. size_t indirections = get_num_indirections(_attr.getBasicType(), _attr.getCollectionType()); - return {0.5, lookup_cost(indirections), lookup_strict_cost(indirections)}; + return {estimate_when_unknown(), lookup_cost(indirections), lookup_strict_cost(indirections)}; } else { double rel_est = abs_to_rel_est(_hit_estimate.est_hits(), docid_limit); return {rel_est, btree_cost(rel_est), btree_strict_cost(rel_est)}; @@ -443,7 +442,8 @@ private: class DirectWandBlueprint : public queryeval::ComplexLeafBlueprint { private: - mutable queryeval::SharedWeakAndPriorityQueue _scores; + using WeakAndPriorityQueue = queryeval::WeakAndPriorityQueue; + std::unique_ptr<WeakAndPriorityQueue> _scores; const queryeval::wand::score_t _scoreThreshold; double _thresholdBoostFactor; const uint32_t _scoresAdjustFrequency; @@ -452,14 +452,16 @@ private: const IDocidWithWeightPostingStore &_attr; vespalib::datastore::EntryRef _dictionary_snapshot; + public: DirectWandBlueprint(const FieldSpec &field, const IDocidWithWeightPostingStore &attr, uint32_t scoresToTrack, - queryeval::wand::score_t scoreThreshold, double thresholdBoostFactor, size_t size_hint) + queryeval::wand::score_t scoreThreshold, double thresholdBoostFactor, size_t size_hint, + bool thread_safe) : ComplexLeafBlueprint(field), - _scores(scoresToTrack), + _scores(WeakAndPriorityQueue::createHeap(scoresToTrack, thread_safe)), _scoreThreshold(scoreThreshold), _thresholdBoostFactor(thresholdBoostFactor), - _scoresAdjustFrequency(queryeval::DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY), + _scoresAdjustFrequency(queryeval::wand::DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY), _weights(), _terms(), _attr(attr), @@ -496,7 +498,7 @@ public: using OrFlow = search::queryeval::OrFlow; using MyAdapter = attribute::DirectPostingStoreFlowStatsAdapter; double child_est = OrFlow::estimate_of(MyAdapter(docid_limit), _terms); - double my_est = abs_to_rel_est(_scores.getScoresToTrack(), docid_limit); + double my_est = abs_to_rel_est(_scores->getScoresToTrack(), docid_limit); double est = (child_est + my_est) / 2.0; return {est, OrFlow::cost_of(MyAdapter(docid_limit), _terms, false), OrFlow::cost_of(MyAdapter(docid_limit), _terms, true) + queryeval::flow::heap_cost(est, _terms.size())}; @@ -508,9 +510,8 @@ public: return std::make_unique<queryeval::EmptySearch>(); } return queryeval::ParallelWeakAndSearch::create(*tfmda[0], - queryeval::ParallelWeakAndSearch::MatchParams(_scores, _scoreThreshold, - _thresholdBoostFactor, _scoresAdjustFrequency) - .setDocIdLimit(get_docid_limit()), + queryeval::ParallelWeakAndSearch::MatchParams(*_scores, _scoreThreshold, _thresholdBoostFactor, + _scoresAdjustFrequency, get_docid_limit()), _weights, _terms, _attr, strict()); } std::unique_ptr<SearchIterator> createFilterSearch(FilterConstraint constraint) const override; @@ -712,15 +713,12 @@ public: void visit(query::WandTerm &n) override { if (has_always_btree_iterators_with_docid_and_weight()) { - auto *bp = new DirectWandBlueprint(_field, *_dwwps, - n.getTargetNumHits(), n.getScoreThreshold(), n.getThresholdBoostFactor(), - n.getNumTerms()); + auto *bp = new DirectWandBlueprint(_field, *_dwwps, n.getTargetNumHits(), n.getScoreThreshold(), + n.getThresholdBoostFactor(), n.getNumTerms(), is_search_multi_threaded()); createDirectMultiTerm(bp, n); } else { - auto *bp = new ParallelWeakAndBlueprint(_field, - n.getTargetNumHits(), - n.getScoreThreshold(), - n.getThresholdBoostFactor()); + auto *bp = new ParallelWeakAndBlueprint(_field, n.getTargetNumHits(), n.getScoreThreshold(), + n.getThresholdBoostFactor(), is_search_multi_threaded()); createShallowWeightedSet(bp, n, _field, _attr.isIntegerType()); } } diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h index e2928710a32..ac6fc6f603a 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h +++ b/searchlib/src/vespa/searchlib/attribute/attribute_blueprint_params.h @@ -16,15 +16,18 @@ struct AttributeBlueprintParams double global_filter_upper_limit; double target_hits_max_adjustment_factor; vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm; + double weakand_range; AttributeBlueprintParams(double global_filter_lower_limit_in, double global_filter_upper_limit_in, double target_hits_max_adjustment_factor_in, - vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm_in) + vespalib::FuzzyMatchingAlgorithm fuzzy_matching_algorithm_in, + double weakand_range_in) : global_filter_lower_limit(global_filter_lower_limit_in), global_filter_upper_limit(global_filter_upper_limit_in), target_hits_max_adjustment_factor(target_hits_max_adjustment_factor_in), - fuzzy_matching_algorithm(fuzzy_matching_algorithm_in) + fuzzy_matching_algorithm(fuzzy_matching_algorithm_in), + weakand_range(weakand_range_in) { } @@ -32,7 +35,8 @@ struct AttributeBlueprintParams : AttributeBlueprintParams(fef::indexproperties::matching::GlobalFilterLowerLimit::DEFAULT_VALUE, fef::indexproperties::matching::GlobalFilterUpperLimit::DEFAULT_VALUE, fef::indexproperties::matching::TargetHitsMaxAdjustmentFactor::DEFAULT_VALUE, - fef::indexproperties::matching::FuzzyAlgorithm::DEFAULT_VALUE) + fef::indexproperties::matching::FuzzyAlgorithm::DEFAULT_VALUE, + fef::indexproperties::temporary::WeakAndRange::DEFAULT_VALUE) { } }; diff --git a/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp b/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp index 928023c0f94..e2d7f0fe312 100644 --- a/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/attribute/attribute_weighted_set_blueprint.cpp @@ -132,7 +132,7 @@ AttributeWeightedSetBlueprint::calculate_flow_stats(uint32_t docid_limit) const using MyAdapter = attribute::HitEstimateFlowStatsAdapter; size_t num_indirections = queryeval::flow::get_num_indirections(_attr.getBasicType(), _attr.getCollectionType()); double est = OrFlow::estimate_of(MyAdapter(docid_limit, num_indirections), _estimates); - return {est, OrFlow::cost_of(MyAdapter(docid_limit, num_indirections), _estimates, false), + return {est, queryeval::flow::reverse_hash_lookup(), OrFlow::cost_of(MyAdapter(docid_limit, num_indirections), _estimates, true) + queryeval::flow::heap_cost(est, _estimates.size())}; } diff --git a/searchlib/src/vespa/searchlib/attribute/bitvector_search_cache.cpp b/searchlib/src/vespa/searchlib/attribute/bitvector_search_cache.cpp index e20d02afe50..6762c0516b2 100644 --- a/searchlib/src/vespa/searchlib/attribute/bitvector_search_cache.cpp +++ b/searchlib/src/vespa/searchlib/attribute/bitvector_search_cache.cpp @@ -3,6 +3,7 @@ #include "bitvector_search_cache.h" #include <vespa/searchlib/common/bitvector.h> #include <vespa/vespalib/stllike/hash_map.hpp> +#include <vespa/vespalib/util/memoryusage.h> #include <mutex> namespace search::attribute { @@ -10,6 +11,7 @@ namespace search::attribute { BitVectorSearchCache::BitVectorSearchCache() : _mutex(), _size(0), + _entries_extra_memory_usage(0), _cache() {} @@ -18,9 +20,19 @@ BitVectorSearchCache::~BitVectorSearchCache() = default; void BitVectorSearchCache::insert(const vespalib::string &term, std::shared_ptr<Entry> entry) { + size_t entry_extra_memory_usage = 0; + if (entry) { + entry_extra_memory_usage = sizeof(Entry); + if (entry->bitVector) { + entry_extra_memory_usage += entry->bitVector->getFileBytes(); + } + } std::unique_lock guard(_mutex); - _cache.insert(std::make_pair(term, std::move(entry))); + auto ins_res = _cache.insert(std::make_pair(term, std::move(entry))); _size.store(_cache.size()); + if (ins_res.second) { + _entries_extra_memory_usage += entry_extra_memory_usage; + } } std::shared_ptr<BitVectorSearchCache::Entry> @@ -36,12 +48,25 @@ BitVectorSearchCache::find(const vespalib::string &term) const return {}; } +vespalib::MemoryUsage +BitVectorSearchCache::get_memory_usage() const +{ + std::lock_guard guard(_mutex); + size_t cache_memory_consumption = _cache.getMemoryConsumption(); + size_t cache_memory_used = _cache.getMemoryUsed(); + size_t self_memory_used = sizeof(BitVectorSearchCache) - sizeof(_cache); + size_t allocated = self_memory_used + cache_memory_consumption + _entries_extra_memory_usage; + size_t used = self_memory_used + cache_memory_used + _entries_extra_memory_usage; + return vespalib::MemoryUsage(allocated, used, 0, 0); +} + void BitVectorSearchCache::clear() { std::unique_lock guard(_mutex); _cache.clear(); _size.store(0ul, std::memory_order_relaxed); + _entries_extra_memory_usage = 0; } } diff --git a/searchlib/src/vespa/searchlib/attribute/bitvector_search_cache.h b/searchlib/src/vespa/searchlib/attribute/bitvector_search_cache.h index 233f8315aaf..3a38cdcea26 100644 --- a/searchlib/src/vespa/searchlib/attribute/bitvector_search_cache.h +++ b/searchlib/src/vespa/searchlib/attribute/bitvector_search_cache.h @@ -10,6 +10,8 @@ #include <atomic> namespace search { class BitVector; } +namespace vespalib { class MemoryUsage; } + namespace search::attribute { /** @@ -37,6 +39,7 @@ private: mutable std::shared_mutex _mutex; std::atomic<uint64_t> _size; + size_t _entries_extra_memory_usage; Cache _cache; public: @@ -45,6 +48,7 @@ public: void insert(const vespalib::string &term, std::shared_ptr<Entry> entry); std::shared_ptr<Entry> find(const vespalib::string &term) const; size_t size() const { return _size.load(std::memory_order_relaxed); } + vespalib::MemoryUsage get_memory_usage() const; void clear(); }; diff --git a/searchlib/src/vespa/searchlib/attribute/imported_attribute_vector.cpp b/searchlib/src/vespa/searchlib/attribute/imported_attribute_vector.cpp index 029dc155785..f6a33165f0c 100644 --- a/searchlib/src/vespa/searchlib/attribute/imported_attribute_vector.cpp +++ b/searchlib/src/vespa/searchlib/attribute/imported_attribute_vector.cpp @@ -3,6 +3,7 @@ #include "imported_attribute_vector.h" #include "imported_attribute_vector_read_guard.h" #include "imported_search_context.h" +#include <vespa/vespalib/util/memoryusage.h> namespace search::attribute { @@ -58,4 +59,15 @@ void ImportedAttributeVector::clearSearchCache() { } } +vespalib::MemoryUsage +ImportedAttributeVector::get_memory_usage() const +{ + constexpr auto self_memory_usage = sizeof(ImportedAttributeVector); + vespalib::MemoryUsage result(self_memory_usage, self_memory_usage, 0, 0); + if (_search_cache) { + result.merge(_search_cache->get_memory_usage()); + } + return result; +} + } diff --git a/searchlib/src/vespa/searchlib/attribute/imported_attribute_vector.h b/searchlib/src/vespa/searchlib/attribute/imported_attribute_vector.h index bd018df5273..5b68957b7f5 100644 --- a/searchlib/src/vespa/searchlib/attribute/imported_attribute_vector.h +++ b/searchlib/src/vespa/searchlib/attribute/imported_attribute_vector.h @@ -6,6 +6,8 @@ #include <vespa/searchcommon/attribute/i_document_meta_store_context.h> #include <vespa/vespalib/stllike/string.h> +namespace vespalib { class MemoryUsage; } + namespace search::attribute { class BitVectorSearchCache; @@ -62,6 +64,7 @@ public: std::unique_ptr<AttributeReadGuard> makeReadGuard(bool stableEnumGuard) const override; virtual std::unique_ptr<AttributeReadGuard> makeReadGuard(std::shared_ptr<MetaStoreReadGuard> targetMetaStoreReadGuard, bool stableEnumGuard) const; + vespalib::MemoryUsage get_memory_usage() const; protected: vespalib::string _name; diff --git a/searchlib/src/vespa/searchlib/features/CMakeLists.txt b/searchlib/src/vespa/searchlib/features/CMakeLists.txt index 0690801ee61..27c2b6d5e41 100644 --- a/searchlib/src/vespa/searchlib/features/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/features/CMakeLists.txt @@ -26,6 +26,8 @@ vespa_add_library(searchlib_features OBJECT fieldmatchfeature.cpp fieldtermmatchfeature.cpp firstphasefeature.cpp + first_phase_rank_feature.cpp + first_phase_rank_lookup.cpp flow_completeness_feature.cpp foreachfeature.cpp freshnessfeature.cpp @@ -57,6 +59,7 @@ vespa_add_library(searchlib_features OBJECT rankingexpressionfeature.cpp raw_score_feature.cpp reverseproximityfeature.cpp + second_phase_feature.cpp setup.cpp subqueries_feature.cpp tensor_attribute_executor.cpp diff --git a/searchlib/src/vespa/searchlib/features/bm25_feature.cpp b/searchlib/src/vespa/searchlib/features/bm25_feature.cpp index 505b8166ee7..03d2e94b5d0 100644 --- a/searchlib/src/vespa/searchlib/features/bm25_feature.cpp +++ b/searchlib/src/vespa/searchlib/features/bm25_feature.cpp @@ -68,7 +68,7 @@ Bm25Executor::Bm25Executor(const fef::FieldInfo& field, } double -Bm25Executor::calculate_inverse_document_frequency(uint32_t matching_doc_count, uint32_t total_doc_count) +Bm25Executor::calculate_inverse_document_frequency(uint32_t matching_doc_count, uint32_t total_doc_count) noexcept { return std::log(1 + (static_cast<double>(total_doc_count - matching_doc_count + 0.5) / static_cast<double>(matching_doc_count + 0.5))); diff --git a/searchlib/src/vespa/searchlib/features/bm25_feature.h b/searchlib/src/vespa/searchlib/features/bm25_feature.h index a1b45375285..637d656990b 100644 --- a/searchlib/src/vespa/searchlib/features/bm25_feature.h +++ b/searchlib/src/vespa/searchlib/features/bm25_feature.h @@ -39,7 +39,7 @@ public: double k1_param, double b_param); - double static calculate_inverse_document_frequency(uint32_t matching_doc_count, uint32_t total_doc_count); + double static calculate_inverse_document_frequency(uint32_t matching_doc_count, uint32_t total_doc_count) noexcept; void handle_bind_match_data(const fef::MatchData& match_data) override; void execute(uint32_t docId) override; diff --git a/searchlib/src/vespa/searchlib/features/first_phase_rank_feature.cpp b/searchlib/src/vespa/searchlib/features/first_phase_rank_feature.cpp new file mode 100644 index 00000000000..5c8a9a391ff --- /dev/null +++ b/searchlib/src/vespa/searchlib/features/first_phase_rank_feature.cpp @@ -0,0 +1,71 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "first_phase_rank_feature.h" +#include "valuefeature.h" +#include <vespa/vespalib/util/stash.h> + +namespace search::features { + +FirstPhaseRankExecutor::FirstPhaseRankExecutor(const FirstPhaseRankLookup& lookup) + : FeatureExecutor(), + _lookup(lookup) +{ +} +FirstPhaseRankExecutor::~FirstPhaseRankExecutor() = default; + +void +FirstPhaseRankExecutor::execute(uint32_t docid) +{ + outputs().set_number(0, _lookup.lookup(docid)); +} + +FirstPhaseRankBlueprint::FirstPhaseRankBlueprint() + : Blueprint("firstPhaseRank") +{ +} + +FirstPhaseRankBlueprint::~FirstPhaseRankBlueprint() = default; + +void +FirstPhaseRankBlueprint::visitDumpFeatures(const fef::IIndexEnvironment&, fef::IDumpFeatureVisitor&) const +{ +} + +std::unique_ptr<fef::Blueprint> +FirstPhaseRankBlueprint::createInstance() const +{ + return std::make_unique<FirstPhaseRankBlueprint>(); +} + +fef::ParameterDescriptions +FirstPhaseRankBlueprint::getDescriptions() const +{ + return fef::ParameterDescriptions().desc(); +} + +bool +FirstPhaseRankBlueprint::setup(const fef::IIndexEnvironment&, const fef::ParameterList&) +{ + describeOutput("score", "The first phase rank."); + return true; +} + +void +FirstPhaseRankBlueprint::prepareSharedState(const fef::IQueryEnvironment&, fef::IObjectStore& store) const +{ + FirstPhaseRankLookup::make_shared_state(store); +} + +fef::FeatureExecutor& +FirstPhaseRankBlueprint::createExecutor(const fef::IQueryEnvironment& env, vespalib::Stash& stash) const +{ + const auto* lookup = FirstPhaseRankLookup::get_shared_state(env.getObjectStore()); + if (lookup != nullptr) { + return stash.create<FirstPhaseRankExecutor>(*lookup); + } else { + std::vector<feature_t> values{std::numeric_limits<feature_t>::max()}; + return stash.create<ValueExecutor>(values); + } +} + +} diff --git a/searchlib/src/vespa/searchlib/features/first_phase_rank_feature.h b/searchlib/src/vespa/searchlib/features/first_phase_rank_feature.h new file mode 100644 index 00000000000..f90ea26f859 --- /dev/null +++ b/searchlib/src/vespa/searchlib/features/first_phase_rank_feature.h @@ -0,0 +1,40 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "first_phase_rank_lookup.h" +#include <vespa/searchlib/fef/blueprint.h> +#include <vespa/searchlib/fef/featureexecutor.h> + +namespace search::features { + +class FirstPhaseRankLookup; + +/* + * Executor for first phase rank feature that outputs the first phase rank + * for the given docid on this search node (1.0, 2.0, 3.0, etc.). + */ +class FirstPhaseRankExecutor : public fef::FeatureExecutor { + const FirstPhaseRankLookup& _lookup; +public: + FirstPhaseRankExecutor(const FirstPhaseRankLookup& lookup); + ~FirstPhaseRankExecutor() override; + void execute(uint32_t docid) override; +}; + +/* + * Blueprint for first phase rank feature. + */ +class FirstPhaseRankBlueprint : public fef::Blueprint { +public: + FirstPhaseRankBlueprint(); + ~FirstPhaseRankBlueprint() override; + void visitDumpFeatures(const fef::IIndexEnvironment& env, fef::IDumpFeatureVisitor& visitor) const override; + std::unique_ptr<fef::Blueprint> createInstance() const override; + fef::ParameterDescriptions getDescriptions() const override; + bool setup(const fef::IIndexEnvironment& env, const fef::ParameterList& params) override; + void prepareSharedState(const fef::IQueryEnvironment& env, fef::IObjectStore& store) const override; + fef::FeatureExecutor& createExecutor(const fef::IQueryEnvironment& env, vespalib::Stash& stash) const override; +}; + +} diff --git a/searchlib/src/vespa/searchlib/features/first_phase_rank_lookup.cpp b/searchlib/src/vespa/searchlib/features/first_phase_rank_lookup.cpp new file mode 100644 index 00000000000..2dfaabb8326 --- /dev/null +++ b/searchlib/src/vespa/searchlib/features/first_phase_rank_lookup.cpp @@ -0,0 +1,67 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "first_phase_rank_lookup.h" +#include <vespa/searchlib/fef/objectstore.h> +#include <cassert> +#include <limits> + +using search::fef::AnyWrapper; + +namespace search::features { + +namespace { + +const vespalib::string key = "firstPhaseRankLookup"; + +} + +FirstPhaseRankLookup::FirstPhaseRankLookup() + : _map() +{ +} + +FirstPhaseRankLookup::FirstPhaseRankLookup(FirstPhaseRankLookup&&) = default; + +FirstPhaseRankLookup::~FirstPhaseRankLookup() = default; + +feature_t +FirstPhaseRankLookup::lookup(uint32_t docid) const noexcept +{ + auto itr = _map.find(docid); + if (itr != _map.end()) [[likely]] { + return itr->second; + } else { + return std::numeric_limits<feature_t>::max(); + } +} + +void +FirstPhaseRankLookup::add(uint32_t docid, uint32_t rank) +{ + auto insres = _map.insert(std::make_pair(docid, rank)); + assert(insres.second); +} + +void +FirstPhaseRankLookup::make_shared_state(fef::IObjectStore& store) +{ + if (store.get(key) == nullptr) { + store.add(key, std::make_unique<AnyWrapper<FirstPhaseRankLookup>>(FirstPhaseRankLookup())); + } +} + +FirstPhaseRankLookup* +FirstPhaseRankLookup::get_mutable_shared_state(fef::IObjectStore& store) +{ + auto* wrapper = dynamic_cast<AnyWrapper<FirstPhaseRankLookup>*>(store.get_mutable(key)); + return (wrapper == nullptr) ? nullptr : &wrapper->getValue(); +} + +const FirstPhaseRankLookup* +FirstPhaseRankLookup::get_shared_state(const fef::IObjectStore& store) +{ + const auto* wrapper = dynamic_cast<const AnyWrapper<FirstPhaseRankLookup>*>(store.get(key)); + return (wrapper == nullptr) ? nullptr : &wrapper->getValue(); +} + +} diff --git a/searchlib/src/vespa/searchlib/features/first_phase_rank_lookup.h b/searchlib/src/vespa/searchlib/features/first_phase_rank_lookup.h new file mode 100644 index 00000000000..83d89ed2dd1 --- /dev/null +++ b/searchlib/src/vespa/searchlib/features/first_phase_rank_lookup.h @@ -0,0 +1,32 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchlib/common/feature.h> +#include <vespa/vespalib/stllike/hash_map.h> + +namespace search::fef { class IObjectStore; } + +namespace search::features { + +/* + * This class contains a mapping from docids used by second phase to + * first phase rank. + */ +class FirstPhaseRankLookup { + vespalib::hash_map<uint32_t, uint32_t> _map; +public: + FirstPhaseRankLookup(); + FirstPhaseRankLookup(const FirstPhaseRankLookup&) = delete; + FirstPhaseRankLookup(FirstPhaseRankLookup&&); + ~FirstPhaseRankLookup(); + FirstPhaseRankLookup& operator=(const FirstPhaseRankLookup&) = delete; + FirstPhaseRankLookup& operator=(FirstPhaseRankLookup&&) = delete; + feature_t lookup(uint32_t docid) const noexcept; + void add(uint32_t docid, uint32_t rank); + static void make_shared_state(fef::IObjectStore& store); + static FirstPhaseRankLookup* get_mutable_shared_state(fef::IObjectStore& store); + static const FirstPhaseRankLookup* get_shared_state(const fef::IObjectStore& store); +}; + +} diff --git a/searchlib/src/vespa/searchlib/features/second_phase_feature.cpp b/searchlib/src/vespa/searchlib/features/second_phase_feature.cpp new file mode 100644 index 00000000000..82ce36be859 --- /dev/null +++ b/searchlib/src/vespa/searchlib/features/second_phase_feature.cpp @@ -0,0 +1,57 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "second_phase_feature.h" +#include <vespa/searchlib/fef/featureexecutor.h> +#include <vespa/searchlib/fef/indexproperties.h> +#include <vespa/searchlib/fef/properties.h> +#include <vespa/vespalib/util/stash.h> + +using namespace search::fef; + +namespace search::features { + +void +SecondPhaseExecutor::execute(uint32_t) +{ + outputs().set_number(0, inputs().get_number(0)); +} + + +SecondPhaseBlueprint::SecondPhaseBlueprint() + : Blueprint("secondPhase") +{ +} + +void +SecondPhaseBlueprint::visitDumpFeatures(const IIndexEnvironment&, + IDumpFeatureVisitor&) const +{ +} + +Blueprint::UP +SecondPhaseBlueprint::createInstance() const +{ + return std::make_unique<SecondPhaseBlueprint>(); +} + +bool +SecondPhaseBlueprint::setup(const IIndexEnvironment& env, + const ParameterList&) +{ + if (auto maybe_input = defineInput(indexproperties::rank::SecondPhase::lookup(env.getProperties()), + AcceptInput::ANY)) + { + describeOutput("score", "The ranking score for second phase.", maybe_input.value()); + return true; + } else { + return false; + } +} + +FeatureExecutor & +SecondPhaseBlueprint::createExecutor(const IQueryEnvironment&, vespalib::Stash& stash) const +{ + return stash.create<SecondPhaseExecutor>(); +} + +} diff --git a/searchlib/src/vespa/searchlib/features/second_phase_feature.h b/searchlib/src/vespa/searchlib/features/second_phase_feature.h new file mode 100644 index 00000000000..61805186453 --- /dev/null +++ b/searchlib/src/vespa/searchlib/features/second_phase_feature.h @@ -0,0 +1,35 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchlib/fef/blueprint.h> + +namespace search::features { + +/** + * Implements the executor outputting the second phase ranking. + */ +class SecondPhaseExecutor : public fef::FeatureExecutor { +public: + bool isPure() override { return true; } + void execute(uint32_t docId) override; +}; + +/** + * Implements the blueprint for the second phase feature. + */ +class SecondPhaseBlueprint : public fef::Blueprint { +public: + SecondPhaseBlueprint(); + void visitDumpFeatures(const fef::IIndexEnvironment & env, fef::IDumpFeatureVisitor & visitor) const override; + fef::Blueprint::UP createInstance() const override; + + fef::ParameterDescriptions getDescriptions() const override { + return fef::ParameterDescriptions().desc(); + } + bool setup(const fef::IIndexEnvironment & env, const fef::ParameterList & params) override; + + fef::FeatureExecutor &createExecutor(const fef::IQueryEnvironment &env, vespalib::Stash &stash) const override; +}; + +} diff --git a/searchlib/src/vespa/searchlib/features/setup.cpp b/searchlib/src/vespa/searchlib/features/setup.cpp index 71e083e2326..d65459817f0 100644 --- a/searchlib/src/vespa/searchlib/features/setup.cpp +++ b/searchlib/src/vespa/searchlib/features/setup.cpp @@ -22,6 +22,7 @@ #include "fieldmatchfeature.h" #include "fieldtermmatchfeature.h" #include "firstphasefeature.h" +#include "first_phase_rank_feature.h" #include "flow_completeness_feature.h" #include "foreachfeature.h" #include "freshnessfeature.h" @@ -48,6 +49,7 @@ #include "rankingexpressionfeature.h" #include "raw_score_feature.h" #include "reverseproximityfeature.h" +#include "second_phase_feature.h" #include "subqueries_feature.h" #include "tensor_from_labels_feature.h" #include "tensor_from_weighted_set_feature.h" @@ -90,6 +92,7 @@ void setup_search_features(fef::IBlueprintRegistry & registry) registry.addPrototype(std::make_shared<FieldMatchBlueprint>()); registry.addPrototype(std::make_shared<FieldTermMatchBlueprint>()); registry.addPrototype(std::make_shared<FirstPhaseBlueprint>()); + registry.addPrototype(std::make_shared<FirstPhaseRankBlueprint>()); registry.addPrototype(std::make_shared<FlowCompletenessBlueprint>()); registry.addPrototype(std::make_shared<ForeachBlueprint>()); registry.addPrototype(std::make_shared<FreshnessBlueprint>()); @@ -109,6 +112,7 @@ void setup_search_features(fef::IBlueprintRegistry & registry) registry.addPrototype(std::make_shared<RandomNormalBlueprint>()); registry.addPrototype(std::make_shared<RandomNormalStableBlueprint>()); registry.addPrototype(std::make_shared<RawScoreBlueprint>()); + registry.addPrototype(std::make_shared<SecondPhaseBlueprint>()); registry.addPrototype(std::make_shared<SubqueriesBlueprint>()); registry.addPrototype(std::make_shared<TensorFromLabelsBlueprint>()); registry.addPrototype(std::make_shared<TensorFromWeightedSetBlueprint>()); diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp index 4637ad5a4e8..1f88c34bef3 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.cpp +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.cpp @@ -179,6 +179,21 @@ namespace onsummary { namespace temporary { +const vespalib::string WeakAndRange::NAME("vespa.weakand.range"); +const double WeakAndRange::DEFAULT_VALUE(0.0); + +double +WeakAndRange::lookup(const Properties &props) +{ + return lookup(props, DEFAULT_VALUE); +} + +double +WeakAndRange::lookup(const Properties &props, double defaultValue) +{ + return lookupDouble(props, NAME, defaultValue); +} + } namespace mutate { diff --git a/searchlib/src/vespa/searchlib/fef/indexproperties.h b/searchlib/src/vespa/searchlib/fef/indexproperties.h index db8de8209a9..d047eb13347 100644 --- a/searchlib/src/vespa/searchlib/fef/indexproperties.h +++ b/searchlib/src/vespa/searchlib/fef/indexproperties.h @@ -178,6 +178,18 @@ namespace mutate { // Add temporary flags used for safe rollout of new features here namespace temporary { +/** + * A number in the range [0,1] for the effective idf range for WeakAndOperator. + * 1.0 will give the complete range as used by default by bm25. + * scaled_idf = (1.0 - range) * max_idf + (range * idf) + * 0.0 which is default gives default legacy behavior. + **/ +struct WeakAndRange { + static const vespalib::string NAME; + static const double DEFAULT_VALUE; + static double lookup(const Properties &props); + static double lookup(const Properties &props, double defaultValue); +}; } namespace mutate::on_match { diff --git a/searchlib/src/vespa/searchlib/fef/objectstore.cpp b/searchlib/src/vespa/searchlib/fef/objectstore.cpp index 3e5baf49116..a90702a88a6 100644 --- a/searchlib/src/vespa/searchlib/fef/objectstore.cpp +++ b/searchlib/src/vespa/searchlib/fef/objectstore.cpp @@ -35,4 +35,11 @@ ObjectStore::get(const vespalib::string & key) const return (found != _objectMap.end()) ? found->second : NULL; } +Anything * +ObjectStore::get_mutable(const vespalib::string& key) +{ + auto found = _objectMap.find(key); + return (found != _objectMap.end()) ? found->second : nullptr; +} + } diff --git a/searchlib/src/vespa/searchlib/fef/objectstore.h b/searchlib/src/vespa/searchlib/fef/objectstore.h index 9d1671e521c..d2d768ee338 100644 --- a/searchlib/src/vespa/searchlib/fef/objectstore.h +++ b/searchlib/src/vespa/searchlib/fef/objectstore.h @@ -24,6 +24,7 @@ class AnyWrapper : public Anything public: explicit AnyWrapper(T value) : _value(std::move(value)) { } const T & getValue() const { return _value; } + T& getValue() { return _value; } static const T & getValue(const Anything & any) { return static_cast<const AnyWrapper &>(any).getValue(); } private: T _value; @@ -38,6 +39,7 @@ public: virtual ~IObjectStore() = default; virtual void add(const vespalib::string & key, Anything::UP value) = 0; virtual const Anything * get(const vespalib::string & key) const = 0; + virtual Anything* get_mutable(const vespalib::string& key) = 0; }; /** @@ -50,6 +52,7 @@ public: ~ObjectStore() override; void add(const vespalib::string & key, Anything::UP value) override; const Anything * get(const vespalib::string & key) const override; + Anything* get_mutable(const vespalib::string & key) override; private: using ObjectMap = vespalib::hash_map<vespalib::string, Anything *>; ObjectMap _objectMap; diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp index aadc5300ede..ba5abb35141 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.cpp +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.cpp @@ -71,6 +71,7 @@ RankSetup::RankSetup(const BlueprintFactory &factory, const IIndexEnvironment &i _global_filter_lower_limit(0.0), _global_filter_upper_limit(1.0), _target_hits_max_adjustment_factor(20.0), + _weakand_range(0.0), _fuzzy_matching_algorithm(vespalib::FuzzyMatchingAlgorithm::DfaTable), _mutateOnMatch(), _mutateOnFirstPhase(), @@ -126,6 +127,7 @@ RankSetup::configure() set_global_filter_upper_limit(matching::GlobalFilterUpperLimit::lookup(_indexEnv.getProperties())); set_target_hits_max_adjustment_factor(matching::TargetHitsMaxAdjustmentFactor::lookup(_indexEnv.getProperties())); set_fuzzy_matching_algorithm(matching::FuzzyAlgorithm::lookup(_indexEnv.getProperties())); + set_weakand_range(temporary::WeakAndRange::lookup(_indexEnv.getProperties())); _mutateOnMatch._attribute = mutate::on_match::Attribute::lookup(_indexEnv.getProperties()); _mutateOnMatch._operation = mutate::on_match::Operation::lookup(_indexEnv.getProperties()); _mutateOnFirstPhase._attribute = mutate::on_first_phase::Attribute::lookup(_indexEnv.getProperties()); @@ -193,7 +195,7 @@ RankSetup::compile() _firstPhaseRankFeature = parser.featureName(); _first_phase_resolver->addSeed(_firstPhaseRankFeature); } else { - vespalib::string e = fmt("invalid feature name for initial rank: '%s'", _firstPhaseRankFeature.c_str()); + vespalib::string e = fmt("invalid feature name for first phase rank: '%s'", _firstPhaseRankFeature.c_str()); _warnings.emplace_back(e); _compileError = true; } @@ -204,7 +206,7 @@ RankSetup::compile() _secondPhaseRankFeature = parser.featureName(); _second_phase_resolver->addSeed(_secondPhaseRankFeature); } else { - vespalib::string e = fmt("invalid feature name for final rank: '%s'", _secondPhaseRankFeature.c_str()); + vespalib::string e = fmt("invalid feature name for second phase rank: '%s'", _secondPhaseRankFeature.c_str()); _warnings.emplace_back(e); _compileError = true; } diff --git a/searchlib/src/vespa/searchlib/fef/ranksetup.h b/searchlib/src/vespa/searchlib/fef/ranksetup.h index d8b977a0331..f20ecd4b42b 100644 --- a/searchlib/src/vespa/searchlib/fef/ranksetup.h +++ b/searchlib/src/vespa/searchlib/fef/ranksetup.h @@ -80,6 +80,7 @@ private: double _global_filter_lower_limit; double _global_filter_upper_limit; double _target_hits_max_adjustment_factor; + double _weakand_range; vespalib::FuzzyMatchingAlgorithm _fuzzy_matching_algorithm; MutateOperation _mutateOnMatch; MutateOperation _mutateOnFirstPhase; @@ -402,6 +403,8 @@ public: double get_target_hits_max_adjustment_factor() const { return _target_hits_max_adjustment_factor; } void set_fuzzy_matching_algorithm(vespalib::FuzzyMatchingAlgorithm v) { _fuzzy_matching_algorithm = v; } vespalib::FuzzyMatchingAlgorithm get_fuzzy_matching_algorithm() const { return _fuzzy_matching_algorithm; } + void set_weakand_range(double v) { _weakand_range = v; } + double get_weakand_range() const { return _weakand_range; } /** * This method may be used to indicate that certain features diff --git a/searchlib/src/vespa/searchlib/fef/test/matchdatabuilder.cpp b/searchlib/src/vespa/searchlib/fef/test/matchdatabuilder.cpp index 3645496e4fb..41551ac1062 100644 --- a/searchlib/src/vespa/searchlib/fef/test/matchdatabuilder.cpp +++ b/searchlib/src/vespa/searchlib/fef/test/matchdatabuilder.cpp @@ -10,11 +10,11 @@ LOG_SETUP(".fef.matchdatabuilder"); namespace search::fef::test { -MatchDataBuilder::MatchDataBuilder(QueryEnvironment &queryEnv, MatchData &data) : - _queryEnv(queryEnv), - _data(data), - _index(), - _match() +MatchDataBuilder::MatchDataBuilder(QueryEnvironment &queryEnv, MatchData &data) + : _queryEnv(queryEnv), + _data(data), + _index(), + _match() { // reset all match data objects. for (TermFieldHandle handle = 0; handle < _data.getNumTermFields(); ++handle) { @@ -22,7 +22,7 @@ MatchDataBuilder::MatchDataBuilder(QueryEnvironment &queryEnv, MatchData &data) } } -MatchDataBuilder::~MatchDataBuilder() {} +MatchDataBuilder::~MatchDataBuilder() = default; TermFieldMatchData * MatchDataBuilder::getTermFieldMatchData(uint32_t termId, uint32_t fieldId) @@ -59,7 +59,7 @@ MatchDataBuilder::addElement(const vespalib::string &fieldName, int32_t weight, LOG(error, "Field '%s' does not exist.", fieldName.c_str()); return false; } - _index[info->id()].elements.push_back(MyElement(weight, length)); + _index[info->id()].elements.emplace_back(weight, length); return true; } @@ -77,8 +77,7 @@ MatchDataBuilder::addOccurence(const vespalib::string &fieldName, uint32_t termI } const ITermFieldData *tfd = _queryEnv.getTerm(termId)->lookupField(info->id()); if (tfd == nullptr) { - LOG(error, "Field '%s' is not searched by the given term.", - fieldName.c_str()); + LOG(error, "Field '%s' is not searched by the given term.", fieldName.c_str()); return false; } _match[termId][info->id()].insert(Position(pos, element)); @@ -99,14 +98,13 @@ MatchDataBuilder::setWeight(const vespalib::string &fieldName, uint32_t termId, } const ITermFieldData *tfd = _queryEnv.getTerm(termId)->lookupField(info->id()); if (tfd == nullptr) { - LOG(error, "Field '%s' is not searched by the given term.", - fieldName.c_str()); + LOG(error, "Field '%s' is not searched by the given term.", fieldName.c_str()); return false; } uint32_t eid = _index[info->id()].elements.size(); _match[termId][info->id()].clear(); _match[termId][info->id()].insert(Position(0, eid)); - _index[info->id()].elements.push_back(MyElement(weight, 1)); + _index[info->id()].elements.emplace_back(weight, 1); return true; } @@ -142,19 +140,13 @@ MatchDataBuilder::apply(uint32_t docId) // For each occurence of that term, in that field, do for (const auto& occ : field_elem.second) { // Append a term match position to the term match data. - match->appendPosition(TermFieldMatchDataPosition( - occ.eid, - occ.pos, - field.getWeight(occ.eid), - field.getLength(occ.eid))); - LOG(debug, - "Added occurence of term '%u' in field '%s'" - " at position '%u'.", + match->appendPosition(TermFieldMatchDataPosition(occ.eid, occ.pos, + field.getWeight(occ.eid), + field.getLength(occ.eid))); + LOG(debug, "Added occurence of term '%u' in field '%s' at position '%u'.", termId, name.c_str(), occ.pos); if (occ.pos >= field.getLength(occ.eid)) { - LOG(warning, - "Added occurence of term '%u' in field '%s'" - " at position '%u' >= fieldLen '%u'.", + LOG(warning, "Added occurence of term '%u' in field '%s' at position '%u' >= fieldLen '%u'.", termId, name.c_str(), occ.pos, field.getLength(occ.eid)); } } diff --git a/searchlib/src/vespa/searchlib/fef/test/matchdatabuilder.h b/searchlib/src/vespa/searchlib/fef/test/matchdatabuilder.h index 0e5025efd37..753e1596520 100644 --- a/searchlib/src/vespa/searchlib/fef/test/matchdatabuilder.h +++ b/searchlib/src/vespa/searchlib/fef/test/matchdatabuilder.h @@ -13,7 +13,7 @@ public: struct MyElement { int32_t weight; uint32_t length; - MyElement(int32_t w, uint32_t l) : weight(w), length(l) {} + MyElement(int32_t w, uint32_t l) noexcept : weight(w), length(l) {} }; struct MyField { uint32_t fieldLength; @@ -21,7 +21,7 @@ public: MyField() : fieldLength(0), elements() {} MyElement &getElement(uint32_t eid) { while (elements.size() <= eid) { - elements.push_back(MyElement(0, 0)); + elements.emplace_back(0, 0); } return elements[eid]; } @@ -68,6 +68,8 @@ public: * @param data The match data to build in. */ MatchDataBuilder(QueryEnvironment &queryEnv, MatchData &data); + MatchDataBuilder(const MatchDataBuilder &) = delete; + MatchDataBuilder & operator=(const MatchDataBuilder &) = delete; ~MatchDataBuilder(); /** @@ -133,10 +135,6 @@ public: bool apply(uint32_t docId); private: - MatchDataBuilder(const MatchDataBuilder &); // hide - MatchDataBuilder & operator=(const MatchDataBuilder &); // hide - -private: QueryEnvironment &_queryEnv; MatchData &_data; IndexData _index; diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp index 2bc94073c92..49a0f0621d2 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp @@ -213,6 +213,17 @@ FieldIndex<interleaved_features>::getMemoryUsage() const } template <bool interleaved_features> +void +FieldIndex<interleaved_features>::commit() +{ + _remover.flush(); + freeze(); + assign_generation(); + incGeneration(); + reclaim_memory(); +} + +template <bool interleaved_features> queryeval::SearchIterator::UP FieldIndex<interleaved_features>::make_search_iterator(const vespalib::string& term, uint32_t field_id, @@ -248,7 +259,7 @@ public: : SimpleLeafBlueprint(field), _guard(), _field(field), - _posting_itr(posting_itr), + _posting_itr(std::move(posting_itr)), _feature_store(feature_store), _field_id(field_id), _query_term(query_term), @@ -302,7 +313,7 @@ FieldIndex<interleaved_features>::make_term_blueprint(const vespalib::string& te auto posting_itr = findFrozen(term); bool use_bit_vector = field.isFilter(); return std::make_unique<MemoryTermBlueprint<interleaved_features>> - (std::move(guard), posting_itr, getFeatureStore(), field, field_id, term, use_bit_vector); + (std::move(guard), std::move(posting_itr), getFeatureStore(), field, field_id, term, use_bit_vector); } template class FieldIndex<false>; diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index.h b/searchlib/src/vespa/searchlib/memoryindex/field_index.h index 0b245300a7b..18e60cf2194 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_index.h +++ b/searchlib/src/vespa/searchlib/memoryindex/field_index.h @@ -87,13 +87,7 @@ public: vespalib::MemoryUsage getMemoryUsage() const override; PostingListStore &getPostingListStore() { return _postingListStore; } - void commit() override { - _remover.flush(); - freeze(); - assign_generation(); - incGeneration(); - reclaim_memory(); - } + void commit() override; /** * Should only by used by unit tests. diff --git a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp index 09fc443cf0e..9e46df57416 100644 --- a/searchlib/src/vespa/searchlib/query/query_term_simple.cpp +++ b/searchlib/src/vespa/searchlib/query/query_term_simple.cpp @@ -49,7 +49,7 @@ template <typename T> struct FloatDecoder { static T fromstr(const char * q, const char * qend, const char ** end) noexcept { T v(0); -#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 180000 +#if defined(_LIBCPP_VERSION) && _LIBCPP_VERSION < 190000 vespalib::string tmp(q, qend - q); char* tmp_end = nullptr; const char *tmp_cstring = tmp.c_str(); diff --git a/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt b/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt index 51fe2d12637..126ecd56dc2 100644 --- a/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt @@ -21,6 +21,7 @@ vespa_add_library(searchlib_queryeval OBJECT fake_searchable.cpp field_spec.cpp filter_wrapper.cpp + first_phase_rescorer.cpp flow.cpp full_search.cpp get_weight_from_node.cpp diff --git a/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp index 7334db4b716..c02990c5921 100644 --- a/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/blueprint.cpp @@ -1,14 +1,15 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "blueprint.h" -#include "leaf_blueprints.h" +#include "andnotsearch.h" +#include "andsearch.h" #include "emptysearch.h" -#include "full_search.h" #include "field_spec.hpp" -#include "andsearch.h" -#include "orsearch.h" -#include "andnotsearch.h" +#include "flow_tuning.h" +#include "full_search.h" +#include "leaf_blueprints.h" #include "matching_elements_search.h" +#include "orsearch.h" #include <vespa/searchlib/fef/termfieldmatchdataarray.h> #include <vespa/vespalib/objects/visit.hpp> #include <vespa/vespalib/objects/objectdumper.h> @@ -168,31 +169,6 @@ Blueprint::null_plan(InFlow in_flow, uint32_t docid_limit) sort(in_flow); } -double -Blueprint::estimate_actual_cost(InFlow in_flow) const noexcept -{ - double res = estimate_strict_cost_diff(in_flow); - if (in_flow.strict()) { - res += strict_cost(); - } else { - res += in_flow.rate() * cost(); - } - return res; -} - -double -Blueprint::estimate_strict_cost_diff(InFlow &in_flow) const noexcept -{ - if (in_flow.strict()) { - REQUIRE(strict()); - } else if (strict()) { - double rate = in_flow.rate(); - in_flow.force_strict(); - return flow::strict_cost_diff(estimate(), rate); - } - return 0.0; -} - Blueprint::UP Blueprint::optimize(Blueprint::UP bp) { Blueprint *root = bp.release(); @@ -238,7 +214,7 @@ Blueprint::default_flow_stats(uint32_t docid_limit, uint32_t abs_est, size_t chi FlowStats Blueprint::default_flow_stats(size_t child_cnt) { - return {0.5, 1.0 + child_cnt, 1.0 + child_cnt}; + return {flow::estimate_when_unknown(), 1.0 + child_cnt, 1.0 + child_cnt}; } std::unique_ptr<MatchingElementsSearch> @@ -278,8 +254,8 @@ create_op_filter(const Blueprint::Children &children, bool strict, Blueprint::Fi MultiSearch::Children list; std::unique_ptr<SearchIterator> spare; list.reserve(children.size()); - for (size_t i = 0; i < children.size(); ++i) { - auto filter = children[i]->createFilterSearch(constraint); + for (const auto & child : children) { + auto filter = child->createFilterSearch(constraint); auto matches_any = filter->matches_any(); if (should_short_circuit<Op>(matches_any)) { return filter; @@ -623,24 +599,6 @@ IntermediateBlueprint::should_do_termwise_eval(const UnpackInfo &unpack, double return (count_termwise_nodes(unpack) > 1); } -double -IntermediateBlueprint::estimate_self_cost(InFlow) const noexcept -{ - return 0.0; -} - -double -IntermediateBlueprint::estimate_actual_cost(InFlow in_flow) const noexcept -{ - double res = estimate_strict_cost_diff(in_flow); - auto cost_of = [](const auto &child, InFlow child_flow)noexcept{ - return child->estimate_actual_cost(child_flow); - }; - res += flow::actual_cost_of(flow::DefaultAdapter(), _children, my_flow(in_flow), cost_of); - res += estimate_self_cost(in_flow); - return res; -} - void IntermediateBlueprint::optimize(Blueprint* &self, OptimizePass pass) { @@ -665,9 +623,9 @@ IntermediateBlueprint::sort(InFlow in_flow) sort(_children, in_flow); } auto flow = my_flow(in_flow); - for (size_t i = 0; i < _children.size(); ++i) { - _children[i]->sort(InFlow(flow.strict(), flow.flow())); - flow.add(_children[i]->estimate()); + for (const auto & child : _children) { + child->sort(InFlow(flow.strict(), flow.flow())); + flow.add(child->estimate()); } } @@ -686,8 +644,8 @@ IntermediateBlueprint::createSearch(fef::MatchData &md) const { MultiSearch::Children subSearches; subSearches.reserve(_children.size()); - for (size_t i = 0; i < _children.size(); ++i) { - subSearches.push_back(_children[i]->createSearch(md)); + for (const auto & child : _children) { + subSearches.push_back(child->createSearch(md)); } return createIntermediateSearch(std::move(subSearches), md); } @@ -735,18 +693,17 @@ void IntermediateBlueprint::fetchPostings(const ExecuteInfo &execInfo) { auto flow = my_flow(InFlow(strict(), execInfo.hit_rate())); - for (size_t i = 0; i < _children.size(); ++i) { + for (const auto & child : _children) { double nextHitRate = flow.flow(); - Blueprint & child = *_children[i]; - child.fetchPostings(ExecuteInfo::create(nextHitRate, execInfo)); - flow.add(child.estimate()); + child->fetchPostings(ExecuteInfo::create(nextHitRate, execInfo)); + flow.add(child->estimate()); } } void IntermediateBlueprint::freeze() { - for (Blueprint::UP &child: _children) { + for (auto &child: _children) { child->freeze(); } freeze_self(); diff --git a/searchlib/src/vespa/searchlib/queryeval/blueprint.h b/searchlib/src/vespa/searchlib/queryeval/blueprint.h index a493c725407..a443f34f856 100644 --- a/searchlib/src/vespa/searchlib/queryeval/blueprint.h +++ b/searchlib/src/vespa/searchlib/queryeval/blueprint.h @@ -313,20 +313,6 @@ public: // optimal ordering. Used for testing. void null_plan(InFlow in_flow, uint32_t docid_limit); - // Estimate the actual cost of evaluating the (sub-)query - // represented by this blueprint with the given in-flow. This - // function should be called after query planning has been - // performed. This function could be useful to predict very - // expensive queries, but the initial use-case is to understand - // query cost better in micro-benchmarks to improve low-level cost - // tuning. - virtual double estimate_actual_cost(InFlow in_flow) const noexcept; - // Estimate the change in cost caused by having a strict iterator - // with a non-strict in-flow. Note that this function might force - // the in_flow to be strict in order to align it with the - // strictness of this blueprint. - double estimate_strict_cost_diff(InFlow &in_flow) const noexcept; - static Blueprint::UP optimize(Blueprint::UP bp); virtual void sort(InFlow in_flow) = 0; static Blueprint::UP optimize_and_sort(Blueprint::UP bp, InFlow in_flow, const Options &opts) { @@ -496,9 +482,6 @@ public: void setDocIdLimit(uint32_t limit) noexcept final; void each_node_post_order(const std::function<void(Blueprint&)> &f) override; - // additional cost not attributed to the children flow (heap merge/unpack/etc) - virtual double estimate_self_cost(InFlow in_flow) const noexcept; - double estimate_actual_cost(InFlow in_flow) const noexcept override; void optimize(Blueprint* &self, OptimizePass pass) final; void sort(InFlow in_flow) override; void set_global_filter(const GlobalFilter &global_filter, double estimated_hit_ratio) override; diff --git a/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.cpp b/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.cpp index 27ff0d235a3..c4aea7deae8 100644 --- a/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.cpp @@ -22,6 +22,11 @@ CreateBlueprintVisitorHelper::CreateBlueprintVisitorHelper(Searchable &searchabl CreateBlueprintVisitorHelper::~CreateBlueprintVisitorHelper() = default; +bool +CreateBlueprintVisitorHelper::is_search_multi_threaded() const noexcept { + return getRequestContext().thread_bundle().size() > 1; +} + attribute::SearchContextParams CreateBlueprintVisitorHelper::createContextParams() const { return attribute::SearchContextParams().metaStoreReadGuard(_requestContext.getMetaStoreReadGuard()); @@ -104,7 +109,8 @@ void CreateBlueprintVisitorHelper::visitWandTerm(query::WandTerm &n) { createWeightedSet(std::make_unique<ParallelWeakAndBlueprint>(_field, n.getTargetNumHits(), - n.getScoreThreshold(), n.getThresholdBoostFactor()), + n.getScoreThreshold(), n.getThresholdBoostFactor(), + is_search_multi_threaded()), n); } diff --git a/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h b/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h index 98f62fa3249..ec163260dc3 100644 --- a/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h +++ b/searchlib/src/vespa/searchlib/queryeval/create_blueprint_visitor_helper.h @@ -29,6 +29,7 @@ protected: const IRequestContext & getRequestContext() const { return _requestContext; } attribute::SearchContextParams createContextParams() const; attribute::SearchContextParams createContextParams(bool isFilter) const; + bool is_search_multi_threaded() const noexcept; public: CreateBlueprintVisitorHelper(Searchable &searchable, const FieldSpec &field, const IRequestContext & requestContext); ~CreateBlueprintVisitorHelper() override; diff --git a/searchlib/src/vespa/searchlib/queryeval/first_phase_rescorer.cpp b/searchlib/src/vespa/searchlib/queryeval/first_phase_rescorer.cpp new file mode 100644 index 00000000000..a7b1e3a7c92 --- /dev/null +++ b/searchlib/src/vespa/searchlib/queryeval/first_phase_rescorer.cpp @@ -0,0 +1,38 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "first_phase_rescorer.h" + +namespace search::queryeval { + +FirstPhaseRescorer::FirstPhaseRescorer(const std::pair<Scores,Scores>& ranges) + : _scale(1.0), + _adjust(0.0) +{ + if (need_rescore(ranges)) { + auto& first_phase_scores = ranges.first; + auto& second_phase_scores = ranges.second; + // scale and adjust the first phase score according to the + // first phase and second phase heap score values to avoid that + // a score from the first phase is larger than second_phase_scores.low + double first_phase_range = first_phase_scores.high - first_phase_scores.low; + if (first_phase_range < 1.0) { + first_phase_range = 1.0; + } + double second_phase_range = second_phase_scores.high - second_phase_scores.low; + if (second_phase_range < 1.0) { + second_phase_range = 1.0; + } + _scale = second_phase_range / first_phase_range; + _adjust = first_phase_scores.low * _scale - second_phase_scores.low; + } +} + +bool +FirstPhaseRescorer::need_rescore(const std::pair<Scores,Scores>& ranges) +{ + auto& first_phase_scores = ranges.first; + auto& second_phase_scores = ranges.second; + return (first_phase_scores.low > second_phase_scores.low); +} + +} diff --git a/searchlib/src/vespa/searchlib/queryeval/first_phase_rescorer.h b/searchlib/src/vespa/searchlib/queryeval/first_phase_rescorer.h new file mode 100644 index 00000000000..301e2aa78d0 --- /dev/null +++ b/searchlib/src/vespa/searchlib/queryeval/first_phase_rescorer.h @@ -0,0 +1,25 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "scores.h" +#include <cstdint> + +namespace search::queryeval { + +/* + * Rescore hits not selected for second phase to prevent them from getting + * a better score than hits selected for second phase ranking. + */ +class FirstPhaseRescorer { + double _scale; + double _adjust; +public: + FirstPhaseRescorer(const std::pair<Scores,Scores>& ranges); + static bool need_rescore(const std::pair<Scores,Scores>& ranges); + double rescore(uint32_t, double score) const noexcept { + return ((score * _scale) - _adjust); + } +}; + +} diff --git a/searchlib/src/vespa/searchlib/queryeval/flow.h b/searchlib/src/vespa/searchlib/queryeval/flow.h index be7b9031c00..b7841dc2017 100644 --- a/searchlib/src/vespa/searchlib/queryeval/flow.h +++ b/searchlib/src/vespa/searchlib/queryeval/flow.h @@ -204,16 +204,6 @@ double ordered_cost_of(ADAPTER adapter, const T &children, F flow, bool allow_fo return total_cost; } -static double actual_cost_of(auto adapter, const auto &children, auto flow, auto cost_of) noexcept { - double total_cost = 0.0; - for (const auto &child: children) { - double child_cost = cost_of(child, InFlow(flow.strict(), flow.flow())); - flow.update_cost(total_cost, child_cost); - flow.add(adapter.estimate(child)); - } - return total_cost; -} - auto select_strict_and_child(auto adapter, const auto &children, size_t first, double est, bool native_strict) { double cost = 0.0; size_t best_idx = first; diff --git a/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h b/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h index cf1d1a8c09f..5ed61ef9fc8 100644 --- a/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h +++ b/searchlib/src/vespa/searchlib/queryeval/flow_tuning.h @@ -60,6 +60,12 @@ inline size_t get_num_indirections(const attribute::BasicType& basic_type, return res; } +// Some blueprints are not able to provide a hit estimate (e.g. attributes without fast-search). +// In such cases the following estimate is used instead. In most cases this is an overestimate. +inline double estimate_when_unknown() { + return 0.1; +} + // Non-strict cost of lookup based matching in an attribute (not fast-search). // Test used: IteratorBenchmark::analyze_term_search_in_attributes_non_strict inline double lookup_cost(size_t num_indirections) { @@ -90,7 +96,7 @@ inline double lookup_strict_cost(size_t num_indirections) { * as the latency (time) penalty is higher if choosing wrong. */ inline double non_strict_cost_of_strict_iterator(double estimate, double strict_cost) { - return strict_cost + strict_cost_diff(estimate, 1.0); + return 2.0 * (strict_cost + strict_cost_diff(estimate, 0.5)); } // Strict cost of matching in a btree posting list (e.g. fast-search attribute or memory index field). diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp index bf7f44f0e7a..01587ef485a 100644 --- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.cpp @@ -1,6 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "hitcollector.h" +#include "first_phase_rescorer.h" #include <vespa/searchlib/common/bitvector.h> #include <vespa/searchlib/common/sort.h> #include <cassert> @@ -43,9 +44,7 @@ HitCollector::HitCollector(uint32_t numDocs, uint32_t maxHitsSize) _unordered(false), _docIdVector(), _bitVector(), - _reRankedHits(), - _scale(1.0), - _adjust(0) + _reRankedHits() { if (_maxHitsSize > 0) { _collector = std::make_unique<RankedHitCollector>(*this); @@ -71,7 +70,7 @@ HitCollector::RankedHitCollector::collect(uint32_t docId, feature_t score) } hc._hits.emplace_back(docId, score); } else { - collectAndChangeCollector(docId, score); + collectAndChangeCollector(docId, score); // note - self-destruct. } } @@ -101,11 +100,10 @@ HitCollector::RankedHitCollector::collectAndChangeCollector(uint32_t docId, feat if (hc._maxDocIdVectorSize > hc._maxHitsSize) { // start using docid vector hc._docIdVector.reserve(hc._maxDocIdVectorSize); - uint32_t iSize = hc._hits.size(); - for (uint32_t i = 0; i < iSize; ++i) { - hc._docIdVector.push_back(hc._hits[i].first); + for (const auto& hit : hc._hits) { + hc._docIdVector.push_back(hit.first); } - if ((iSize > 0) && (docId < hc._docIdVector.back())) { + if (!hc._docIdVector.empty() && (docId < hc._docIdVector.back())) { hc._unordered = true; } hc._docIdVector.push_back(docId); @@ -114,9 +112,8 @@ HitCollector::RankedHitCollector::collectAndChangeCollector(uint32_t docId, feat // start using bit vector hc._bitVector = BitVector::create(hc._numDocs); hc._bitVector->invalidateCachedCount(); - uint32_t iSize = hc._hits.size(); - for (uint32_t i = 0; i < iSize; ++i) { - hc._bitVector->setBit(hc._hits[i].first); + for (const auto& hit : _hc._hits) { + hc._bitVector->setBit(hit.first); } hc._bitVector->setBit(docId); newCollector = std::make_unique<BitVectorCollector<true>>(hc); @@ -125,7 +122,7 @@ HitCollector::RankedHitCollector::collectAndChangeCollector(uint32_t docId, feat std::make_heap(hc._hits.begin(), hc._hits.end(), ScoreComparator()); hc._hitsSortOrder = SortOrder::HEAP; this->considerForHitVector(docId, score); - hc._collector = std::move(newCollector); + hc._collector = std::move(newCollector); // note - self-destruct. } template<bool CollectRankedHit> @@ -145,7 +142,7 @@ HitCollector::DocIdCollector<CollectRankedHit>::collect(uint32_t docId, feature_ } hc._docIdVector.push_back(docId); } else { - collectAndChangeCollector(docId); + collectAndChangeCollector(docId); // note - self-destruct. } } @@ -157,9 +154,8 @@ HitCollector::DocIdCollector<CollectRankedHit>::collectAndChangeCollector(uint32 // start using bit vector instead of docid array. hc._bitVector = BitVector::create(hc._numDocs); hc._bitVector->invalidateCachedCount(); - uint32_t iSize = static_cast<uint32_t>(hc._docIdVector.size()); - for (uint32_t i = 0; i < iSize; ++i) { - hc._bitVector->setBit(hc._docIdVector[i]); + for (auto docid : hc._docIdVector) { + hc._bitVector->setBit(docid); } std::vector<uint32_t> emptyVector; emptyVector.swap(hc._docIdVector); @@ -191,91 +187,231 @@ HitCollector::setRanges(const std::pair<Scores, Scores> &ranges) namespace { +struct NoRescorer +{ + static double rescore(uint32_t, double score) noexcept { return score; } +}; + +template <typename Rescorer> +class RerankRescorer { + Rescorer _rescorer; + using HitVector = std::vector<HitCollector::Hit>; + using Iterator = typename HitVector::const_iterator; + Iterator _reranked_cur; + Iterator _reranked_end; +public: + RerankRescorer(const Rescorer& rescorer, + const HitVector& reranked_hits) + : _rescorer(rescorer), + _reranked_cur(reranked_hits.begin()), + _reranked_end(reranked_hits.end()) + { + } + + double rescore(uint32_t docid, double score) noexcept { + if (_reranked_cur != _reranked_end && _reranked_cur->first == docid) { + double result = _reranked_cur->second; + ++_reranked_cur; + return result; + } else { + return _rescorer.rescore(docid, score); + } + } +}; + +class SimpleHitAdder { +protected: + ResultSet& _rs; +public: + SimpleHitAdder(ResultSet& rs) + : _rs(rs) + { + } + void add(uint32_t docid, double rank_value) { + _rs.push_back({docid, rank_value}); + } +}; + +class ConditionalHitAdder : public SimpleHitAdder { +protected: + double _second_phase_rank_drop_limit; +public: + ConditionalHitAdder(ResultSet& rs, double second_phase_rank_drop_limit) + : SimpleHitAdder(rs), + _second_phase_rank_drop_limit(second_phase_rank_drop_limit) + { + } + void add(uint32_t docid, double rank_value) { + if (rank_value > _second_phase_rank_drop_limit) { + _rs.push_back({docid, rank_value}); + } + } +}; + +class TrackingConditionalHitAdder : public ConditionalHitAdder { + std::vector<uint32_t>& _dropped; +public: + TrackingConditionalHitAdder(ResultSet& rs, double second_phase_rank_drop_limit, std::vector<uint32_t>& dropped) + : ConditionalHitAdder(rs, second_phase_rank_drop_limit), + _dropped(dropped) + { + } + void add(uint32_t docid, double rank_value) { + if (rank_value > _second_phase_rank_drop_limit) { + _rs.push_back({docid, rank_value}); + } else { + _dropped.emplace_back(docid); + } + } +}; + +template <typename HitAdder, typename Rescorer> void -mergeHitsIntoResultSet(const std::vector<HitCollector::Hit> &hits, ResultSet &result) +add_rescored_hits(HitAdder hit_adder, const std::vector<HitCollector::Hit>& hits, Rescorer rescorer) { - uint32_t rhCur(0); - uint32_t rhEnd(result.getArrayUsed()); - for (const auto &hit : hits) { - while (rhCur != rhEnd && result[rhCur].getDocId() != hit.first) { - // just set the iterators right - ++rhCur; + for (auto& hit : hits) { + hit_adder.add(hit.first, rescorer.rescore(hit.first, hit.second)); + } +} + +template <typename HitAdder, typename Rescorer> +void +add_rescored_hits(HitAdder hit_adder, const std::vector<HitCollector::Hit>& hits, const std::vector<HitCollector::Hit>& reranked_hits, Rescorer rescorer) +{ + if (reranked_hits.empty()) { + add_rescored_hits(hit_adder, hits, rescorer); + } else { + add_rescored_hits(hit_adder, hits, RerankRescorer(rescorer, reranked_hits)); + } +} + +template <typename Rescorer> +void +add_rescored_hits(ResultSet& rs, const std::vector<HitCollector::Hit>& hits, const std::vector<HitCollector::Hit>& reranked_hits, std::optional<double> second_phase_rank_drop_limit, std::vector<uint32_t>* dropped, Rescorer rescorer) +{ + if (second_phase_rank_drop_limit.has_value()) { + if (dropped != nullptr) { + add_rescored_hits(TrackingConditionalHitAdder(rs, second_phase_rank_drop_limit.value(), *dropped), hits, reranked_hits, rescorer); + } else { + add_rescored_hits(ConditionalHitAdder(rs, second_phase_rank_drop_limit.value()), hits, reranked_hits, rescorer); } - assert(rhCur != rhEnd); // the hits should be a subset of the hits in ranked hit array. - result[rhCur]._rankValue = hit.second; + } else { + add_rescored_hits(SimpleHitAdder(rs), hits, reranked_hits, rescorer); + } +} + +template <typename HitAdder, typename Rescorer> +void +mixin_rescored_hits(HitAdder hit_adder, const std::vector<HitCollector::Hit>& hits, const std::vector<uint32_t>& docids, double default_value, Rescorer rescorer) +{ + auto hits_cur = hits.begin(); + auto hits_end = hits.end(); + for (auto docid : docids) { + if (hits_cur != hits_end && docid == hits_cur->first) { + hit_adder.add(docid, rescorer.rescore(docid, hits_cur->second)); + ++hits_cur; + } else { + hit_adder.add(docid, default_value); + } + } +} + +template <typename HitAdder, typename Rescorer> +void +mixin_rescored_hits(HitAdder hit_adder, const std::vector<HitCollector::Hit>& hits, const std::vector<uint32_t>& docids, double default_value, const std::vector<HitCollector::Hit>& reranked_hits, Rescorer rescorer) +{ + if (reranked_hits.empty()) { + mixin_rescored_hits(hit_adder, hits, docids, default_value, rescorer); + } else { + mixin_rescored_hits(hit_adder, hits, docids, default_value, RerankRescorer(rescorer, reranked_hits)); + } +} + +template <typename Rescorer> +void +mixin_rescored_hits(ResultSet& rs, const std::vector<HitCollector::Hit>& hits, const std::vector<uint32_t>& docids, double default_value, const std::vector<HitCollector::Hit>& reranked_hits, std::optional<double> second_phase_rank_drop_limit, std::vector<uint32_t>* dropped, Rescorer rescorer) +{ + if (second_phase_rank_drop_limit.has_value()) { + if (dropped != nullptr) { + mixin_rescored_hits(TrackingConditionalHitAdder(rs, second_phase_rank_drop_limit.value(), *dropped), hits, docids, default_value, reranked_hits, rescorer); + } else { + mixin_rescored_hits(ConditionalHitAdder(rs, second_phase_rank_drop_limit.value()), hits, docids, default_value, reranked_hits, rescorer); + } + } else { + mixin_rescored_hits(SimpleHitAdder(rs), hits, docids, default_value, reranked_hits, rescorer); + } +} + +void +add_bitvector_to_dropped(std::vector<uint32_t>& dropped, vespalib::ConstArrayRef<RankedHit> hits, const BitVector& bv) +{ + auto hits_cur = hits.begin(); + auto hits_end = hits.end(); + auto docid = bv.getFirstTrueBit(); + auto docid_limit = bv.size(); + while (docid < docid_limit) { + if (hits_cur != hits_end && hits_cur->getDocId() == docid) { + ++hits_cur; + } else { + dropped.emplace_back(docid); + } + docid = bv.getNextTrueBit(docid + 1); } } } std::unique_ptr<ResultSet> -HitCollector::getResultSet(HitRank default_value) +HitCollector::get_result_set(std::optional<double> second_phase_rank_drop_limit, std::vector<uint32_t>* dropped) { - bool needReScore = false; - Scores &initHeapScores = _ranges.first; - Scores &finalHeapScores = _ranges.second; - if (initHeapScores.low > finalHeapScores.low) { - // scale and adjust the score according to the range - // of the initial and final heap score values to avoid that - // a score from the first phase is larger than finalHeapScores.low - feature_t initRange = initHeapScores.high - initHeapScores.low; - if (initRange < 1.0) initRange = 1.0f; - feature_t finalRange = finalHeapScores.high - finalHeapScores.low; - if (finalRange < 1.0) finalRange = 1.0f; - _scale = finalRange / initRange; - _adjust = initHeapScores.low * _scale - finalHeapScores.low; - needReScore = true; + /* + * Use default_rank_value (i.e. -HUGE_VAL) when hit collector saves + * rank scores, otherwise use zero_rank_value (i.e. 0.0). + */ + auto default_value = save_rank_scores() ? search::default_rank_value : search::zero_rank_value; + + bool needReScore = FirstPhaseRescorer::need_rescore(_ranges); + FirstPhaseRescorer rescorer(_ranges); + + if (dropped != nullptr) { + dropped->clear(); } // destroys the heap property or score sort order sortHitsByDocId(); auto rs = std::make_unique<ResultSet>(); - if ( ! _collector->isDocIdCollector() ) { - unsigned int iSize = _hits.size(); - rs->allocArray(iSize); + if ( ! _collector->isDocIdCollector() || + (second_phase_rank_drop_limit.has_value() && + (_bitVector || dropped == nullptr))) { + rs->allocArray(_hits.size()); + auto* dropped_or_null = dropped; + if (second_phase_rank_drop_limit.has_value() && _bitVector) { + dropped_or_null = nullptr; + } if (needReScore) { - for (uint32_t i = 0; i < iSize; ++i) { - rs->push_back(RankedHit(_hits[i].first, getReScore(_hits[i].second))); - } + add_rescored_hits(*rs, _hits, _reRankedHits, second_phase_rank_drop_limit, dropped_or_null, rescorer); } else { - for (uint32_t i = 0; i < iSize; ++i) { - rs->push_back(RankedHit(_hits[i].first, _hits[i].second)); - } + add_rescored_hits(*rs, _hits, _reRankedHits, second_phase_rank_drop_limit, dropped_or_null, NoRescorer()); } } else { if (_unordered) { std::sort(_docIdVector.begin(), _docIdVector.end()); } - unsigned int iSize = _hits.size(); - unsigned int jSize = _docIdVector.size(); - rs->allocArray(jSize); - uint32_t i = 0; + rs->allocArray(_docIdVector.size()); if (needReScore) { - for (uint32_t j = 0; j < jSize; ++j) { - uint32_t docId = _docIdVector[j]; - if (i < iSize && docId == _hits[i].first) { - rs->push_back(RankedHit(docId, getReScore(_hits[i].second))); - ++i; - } else { - rs->push_back(RankedHit(docId, default_value)); - } - } + mixin_rescored_hits(*rs, _hits, _docIdVector, default_value, _reRankedHits, second_phase_rank_drop_limit, dropped, rescorer); } else { - for (uint32_t j = 0; j < jSize; ++j) { - uint32_t docId = _docIdVector[j]; - if (i < iSize && docId == _hits[i].first) { - rs->push_back(RankedHit(docId, _hits[i].second)); - ++i; - } else { - rs->push_back(RankedHit(docId, default_value)); - } - } + mixin_rescored_hits(*rs, _hits, _docIdVector, default_value, _reRankedHits, second_phase_rank_drop_limit, dropped, NoRescorer()); } } - if (!_reRankedHits.empty()) { - mergeHitsIntoResultSet(_reRankedHits, *rs); + if (second_phase_rank_drop_limit.has_value() && _bitVector) { + if (dropped != nullptr) { + assert(dropped->empty()); + add_bitvector_to_dropped(*dropped, {rs->getArray(), rs->getArrayUsed()}, *_bitVector); + } + _bitVector.reset(); } if (_bitVector) { @@ -285,4 +421,10 @@ HitCollector::getResultSet(HitRank default_value) return rs; } +std::unique_ptr<ResultSet> +HitCollector::getResultSet() +{ + return get_result_set(std::nullopt, nullptr); +} + } diff --git a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h index 94ffe619bab..c23fb0a6ef6 100644 --- a/searchlib/src/vespa/searchlib/queryeval/hitcollector.h +++ b/searchlib/src/vespa/searchlib/queryeval/hitcollector.h @@ -8,6 +8,7 @@ #include <vespa/searchlib/common/resultset.h> #include <vespa/vespalib/util/sort.h> #include <algorithm> +#include <optional> #include <vector> namespace search::queryeval { @@ -35,8 +36,6 @@ private: std::vector<Hit> _reRankedHits; std::pair<Scores, Scores> _ranges; - feature_t _scale; - feature_t _adjust; struct ScoreComparator { bool operator() (const Hit & lhs, const Hit & rhs) const noexcept { @@ -120,12 +119,11 @@ private: void collect(uint32_t docId, feature_t score) override; }; - HitRank getReScore(feature_t score) const { - return ((score * _scale) - _adjust); - } VESPA_DLL_LOCAL void sortHitsByScore(size_t topn); VESPA_DLL_LOCAL void sortHitsByDocId(); + bool save_rank_scores() const noexcept { return _maxHitsSize != 0; } + public: HitCollector(const HitCollector &) = delete; HitCollector &operator=(const HitCollector &) = delete; @@ -169,15 +167,17 @@ public: const std::pair<Scores, Scores> &getRanges() const { return _ranges; } void setRanges(const std::pair<Scores, Scores> &ranges); + std::unique_ptr<ResultSet> + get_result_set(std::optional<double> second_phase_rank_drop_limit, std::vector<uint32_t>* dropped); + /** * Returns a result set based on the content of this collector. * Invoking this method will destroy the heap property of the * ranked hits and the match data heap. * - * @param auto pointer to the result set - * @param default_value rank value to be used for results without rank value + * @return unique pointer to the result set **/ - std::unique_ptr<ResultSet> getResultSet(HitRank default_value = default_rank_value); + std::unique_ptr<ResultSet> getResultSet(); }; } diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp index 33b249572f0..93cb8d68c33 100644 --- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.cpp @@ -26,7 +26,7 @@ size_t lookup_create_source(std::vector<std::unique_ptr<CombineType> > &sources, return i; } } - sources.push_back(std::unique_ptr<CombineType>(new CombineType())); + sources.push_back(std::make_unique<CombineType>()); sources.back()->setSourceId(child_source); sources.back()->setDocIdLimit(docid_limit); return (sources.size() - 1); @@ -318,11 +318,6 @@ OrBlueprint::calculate_flow_stats(uint32_t) const { OrFlow::cost_of(get_children(), true) + flow::heap_cost(est, get_children().size())}; } -double -OrBlueprint::estimate_self_cost(InFlow in_flow) const noexcept { - return in_flow.strict() ? flow::heap_cost(estimate(), get_children().size()) : 0.0; -} - Blueprint::HitEstimate OrBlueprint::combine(const std::vector<HitEstimate> &data) const { @@ -424,6 +419,13 @@ WeakAndBlueprint::my_flow(InFlow in_flow) const return AnyFlow::create<OrFlow>(in_flow); } +WeakAndBlueprint::WeakAndBlueprint(uint32_t n, float idf_range, bool thread_safe) + : _scores(WeakAndPriorityQueue::createHeap(n, thread_safe)), + _n(n), + _idf_range(idf_range), + _weights() +{} + WeakAndBlueprint::~WeakAndBlueprint() = default; FlowStats @@ -436,11 +438,6 @@ WeakAndBlueprint::calculate_flow_stats(uint32_t docid_limit) const { OrFlow::cost_of(get_children(), true) + flow::heap_cost(est, get_children().size())}; } -double -WeakAndBlueprint::estimate_self_cost(InFlow in_flow) const noexcept { - return in_flow.strict() ? flow::heap_cost(estimate(), get_children().size()) : 0.0; -} - Blueprint::HitEstimate WeakAndBlueprint::combine(const std::vector<HitEstimate> &data) const { @@ -488,11 +485,12 @@ WeakAndBlueprint::createIntermediateSearch(MultiSearch::Children sub_searches, assert(_weights.size() == childCnt()); for (size_t i = 0; i < sub_searches.size(); ++i) { // TODO: pass ownership with unique_ptr - terms.emplace_back(sub_searches[i].release(), - _weights[i], + terms.emplace_back(sub_searches[i].release(), _weights[i], getChild(i).getState().estimate().estHits); } - return WeakAndSearch::create(terms, _n, strict()); + return (_idf_range == 0.0) + ? WeakAndSearch::create(terms, wand::MatchParams(*_scores), wand::TermFrequencyScorer(), _n, strict()) + : WeakAndSearch::create(terms, wand::MatchParams(*_scores), wand::Bm25TermFrequencyScorer(get_docid_limit(), _idf_range), _n, strict()); } SearchIterator::UP @@ -517,11 +515,6 @@ NearBlueprint::calculate_flow_stats(uint32_t) const { AndFlow::cost_of(get_children(), true) + childCnt() * est}; } -double -NearBlueprint::estimate_self_cost(InFlow) const noexcept { - return childCnt() * estimate(); -} - Blueprint::HitEstimate NearBlueprint::combine(const std::vector<HitEstimate> &data) const { @@ -562,7 +555,7 @@ NearBlueprint::createIntermediateSearch(MultiSearch::Children sub_searches, tfmda.add(cs.field(j).resolve(md)); } } - return SearchIterator::UP(new NearSearch(std::move(sub_searches), tfmda, _window, strict())); + return std::make_unique<NearSearch>(std::move(sub_searches), tfmda, _window, strict()); } SearchIterator::UP @@ -587,11 +580,6 @@ ONearBlueprint::calculate_flow_stats(uint32_t) const { AndFlow::cost_of(get_children(), true) + childCnt() * est}; } -double -ONearBlueprint::estimate_self_cost(InFlow) const noexcept { - return childCnt() * estimate(); -} - Blueprint::HitEstimate ONearBlueprint::combine(const std::vector<HitEstimate> &data) const { @@ -630,7 +618,7 @@ ONearBlueprint::createIntermediateSearch(MultiSearch::Children sub_searches, } // could sort sub_searches here // but then strictness inheritance would also need to be fixed - return SearchIterator::UP(new ONearSearch(std::move(sub_searches), tfmda, _window, strict())); + return std::make_unique<ONearSearch>(std::move(sub_searches), tfmda, _window, strict()); } SearchIterator::UP @@ -756,7 +744,8 @@ SourceBlenderBlueprint::calculate_flow_stats(uint32_t) const { my_cost = std::max(my_cost, child->cost()); my_strict_cost = std::max(my_strict_cost, child->strict_cost()); } - return {OrFlow::estimate_of(get_children()), my_cost, my_strict_cost}; + double my_est = OrFlow::estimate_of(get_children()); + return {my_est, my_cost + 1.0, my_strict_cost + my_est}; } Blueprint::HitEstimate diff --git a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h index ade4c9318e4..87331ca83c5 100644 --- a/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h +++ b/searchlib/src/vespa/searchlib/queryeval/intermediate_blueprints.h @@ -4,6 +4,7 @@ #include "blueprint.h" #include "multisearch.h" +#include <vespa/searchlib/queryeval/wand/weak_and_heap.h> namespace search::queryeval { @@ -67,7 +68,6 @@ public: ~OrBlueprint() override; bool supports_termwise_children() const override { return true; } FlowStats calculate_flow_stats(uint32_t docid_limit) const final; - double estimate_self_cost(InFlow in_flow) const noexcept override; HitEstimate combine(const std::vector<HitEstimate> &data) const override; FieldSpecBaseList exposeFields() const override; void optimize_self(OptimizePass pass) override; @@ -89,13 +89,14 @@ private: class WeakAndBlueprint : public IntermediateBlueprint { private: + std::unique_ptr<WeakAndPriorityQueue> _scores; uint32_t _n; + float _idf_range; std::vector<uint32_t> _weights; AnyFlow my_flow(InFlow in_flow) const override; public: FlowStats calculate_flow_stats(uint32_t docid_limit) const final; - double estimate_self_cost(InFlow in_flow) const noexcept override; HitEstimate combine(const std::vector<HitEstimate> &data) const override; FieldSpecBaseList exposeFields() const override; Blueprint::UP get_replacement() override; @@ -107,14 +108,15 @@ public: fef::MatchData &md) const override; SearchIterator::UP createFilterSearch(FilterConstraint constraint) const override; - explicit WeakAndBlueprint(uint32_t n) noexcept : _n(n) {} + explicit WeakAndBlueprint(uint32_t n) : WeakAndBlueprint(n, 0.0, true) {} + WeakAndBlueprint(uint32_t n, float idf_range, bool thread_safe); ~WeakAndBlueprint() override; void addTerm(Blueprint::UP bp, uint32_t weight) { addChild(std::move(bp)); _weights.push_back(weight); } - uint32_t getN() const { return _n; } - const std::vector<uint32_t> &getWeights() const { return _weights; } + uint32_t getN() const noexcept { return _n; } + const std::vector<uint32_t> &getWeights() const noexcept { return _weights; } }; //----------------------------------------------------------------------------- @@ -127,7 +129,6 @@ private: AnyFlow my_flow(InFlow in_flow) const override; public: FlowStats calculate_flow_stats(uint32_t docid_limit) const final; - double estimate_self_cost(InFlow in_flow) const noexcept override; HitEstimate combine(const std::vector<HitEstimate> &data) const override; FieldSpecBaseList exposeFields() const override; void sort(Children &children, InFlow in_flow) const override; @@ -150,7 +151,6 @@ private: AnyFlow my_flow(InFlow in_flow) const override; public: FlowStats calculate_flow_stats(uint32_t docid_limit) const final; - double estimate_self_cost(InFlow in_flow) const noexcept override; HitEstimate combine(const std::vector<HitEstimate> &data) const override; FieldSpecBaseList exposeFields() const override; void sort(Children &children, InFlow in_flow) const override; diff --git a/searchlib/src/vespa/searchlib/queryeval/leaf_blueprints.cpp b/searchlib/src/vespa/searchlib/queryeval/leaf_blueprints.cpp index d825c9e1a20..9d19ba87af7 100644 --- a/searchlib/src/vespa/searchlib/queryeval/leaf_blueprints.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/leaf_blueprints.cpp @@ -11,9 +11,9 @@ namespace search::queryeval { //----------------------------------------------------------------------------- FlowStats -EmptyBlueprint::calculate_flow_stats(uint32_t docid_limit) const +EmptyBlueprint::calculate_flow_stats(uint32_t) const { - return default_flow_stats(docid_limit, 0, 0); + return {0.0, 0.2, 0.0}; } SearchIterator::UP diff --git a/searchlib/src/vespa/searchlib/queryeval/simple_phrase_search.cpp b/searchlib/src/vespa/searchlib/queryeval/simple_phrase_search.cpp index 2b25aa29747..c5435b557b0 100644 --- a/searchlib/src/vespa/searchlib/queryeval/simple_phrase_search.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/simple_phrase_search.cpp @@ -191,16 +191,14 @@ SimplePhraseSearch::doSeek(uint32_t doc_id) { void SimplePhraseSearch::doStrictSeek(uint32_t doc_id) { uint32_t next_candidate = doc_id; - while (getDocId() < doc_id || getDocId() == beginId()) { - getChildren()[0]->seek(next_candidate + 1); - next_candidate = getChildren()[0]->getDocId(); + auto &best_child = *getChildren()[_eval_order[0]]; + while (getDocId() < doc_id) { + best_child.seek(next_candidate + 1); + next_candidate = best_child.getDocId(); if (isAtEnd(next_candidate)) { setAtEnd(); return; } - // child must behave as strict. - assert(next_candidate > doc_id && next_candidate != beginId()); - phraseSeek(next_candidate); } } diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.cpp index 4c55496822b..48bef125ec3 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.cpp @@ -1,42 +1,23 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "parallel_weak_and_blueprint.h" -#include "wand_parts.h" #include "parallel_weak_and_search.h" #include <vespa/searchlib/queryeval/field_spec.hpp> #include <vespa/searchlib/queryeval/searchiterator.h> #include <vespa/searchlib/queryeval/flow_tuning.h> -#include <vespa/searchlib/fef/termfieldmatchdata.h> #include <vespa/vespalib/objects/visit.hpp> #include <algorithm> namespace search::queryeval { -ParallelWeakAndBlueprint::ParallelWeakAndBlueprint(FieldSpecBase field, - uint32_t scoresToTrack, - score_t scoreThreshold, - double thresholdBoostFactor) +ParallelWeakAndBlueprint::ParallelWeakAndBlueprint(FieldSpecBase field, uint32_t scoresToTrack, + score_t scoreThreshold, double thresholdBoostFactor, + bool thread_safe) : ComplexLeafBlueprint(field), - _scores(scoresToTrack), + _scores(WeakAndPriorityQueue::createHeap(scoresToTrack, thread_safe)), _scoreThreshold(scoreThreshold), _thresholdBoostFactor(thresholdBoostFactor), - _scoresAdjustFrequency(DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY), - _layout(), - _weights(), - _terms() -{ -} - -ParallelWeakAndBlueprint::ParallelWeakAndBlueprint(FieldSpecBase field, - uint32_t scoresToTrack, - score_t scoreThreshold, - double thresholdBoostFactor, - uint32_t scoresAdjustFrequency) - : ComplexLeafBlueprint(field), - _scores(scoresToTrack), - _scoreThreshold(scoreThreshold), - _thresholdBoostFactor(thresholdBoostFactor), - _scoresAdjustFrequency(scoresAdjustFrequency), + _scoresAdjustFrequency(wand::DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY), _layout(), _weights(), _terms() @@ -84,7 +65,7 @@ ParallelWeakAndBlueprint::calculate_flow_stats(uint32_t docid_limit) const term->update_flow_stats(docid_limit); } double child_est = OrFlow::estimate_of(_terms); - double my_est = abs_to_rel_est(_scores.getScoresToTrack(), docid_limit); + double my_est = abs_to_rel_est(_scores->getScoresToTrack(), docid_limit); double est = (child_est + my_est) / 2.0; return {est, OrFlow::cost_of(_terms, false), OrFlow::cost_of(_terms, true) + flow::heap_cost(est, _terms.size())}; @@ -106,14 +87,11 @@ ParallelWeakAndBlueprint::createLeafSearch(const search::fef::TermFieldMatchData childState.estimate().estHits, childState.field(0).resolve(*childrenMatchData)); } - return SearchIterator::UP - (ParallelWeakAndSearch::create(terms, - ParallelWeakAndSearch::MatchParams(_scores, - _scoreThreshold, - _thresholdBoostFactor, - _scoresAdjustFrequency).setDocIdLimit(get_docid_limit()), - ParallelWeakAndSearch::RankParams(*tfmda[0], - std::move(childrenMatchData)), strict())); + return ParallelWeakAndSearch::create(terms, + ParallelWeakAndSearch::MatchParams(*_scores, _scoreThreshold, _thresholdBoostFactor, + _scoresAdjustFrequency, get_docid_limit()), + ParallelWeakAndSearch::RankParams(*tfmda[0],std::move(childrenMatchData)), + strict()); } std::unique_ptr<SearchIterator> diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.h b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.h index 4a55bf14095..c34d366120e 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.h +++ b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_blueprint.h @@ -11,8 +11,6 @@ namespace search::queryeval { -const uint32_t DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY = 4; - /** * Blueprint for the parallel weak and search operator. */ @@ -21,32 +19,24 @@ class ParallelWeakAndBlueprint : public ComplexLeafBlueprint private: using score_t = wand::score_t; - mutable SharedWeakAndPriorityQueue _scores; - const wand::score_t _scoreThreshold; - double _thresholdBoostFactor; - const uint32_t _scoresAdjustFrequency; - fef::MatchDataLayout _layout; - std::vector<int32_t> _weights; - std::vector<Blueprint::UP> _terms; + std::unique_ptr<WeakAndPriorityQueue> _scores; + const wand::score_t _scoreThreshold; + double _thresholdBoostFactor; + const uint32_t _scoresAdjustFrequency; + fef::MatchDataLayout _layout; + std::vector<int32_t> _weights; + std::vector<Blueprint::UP> _terms; public: ParallelWeakAndBlueprint(const ParallelWeakAndBlueprint &) = delete; ParallelWeakAndBlueprint &operator=(const ParallelWeakAndBlueprint &) = delete; - ParallelWeakAndBlueprint(FieldSpecBase field, - uint32_t scoresToTrack, - score_t scoreThreshold, - double thresholdBoostFactor); - ParallelWeakAndBlueprint(FieldSpecBase field, - uint32_t scoresToTrack, - score_t scoreThreshold, - double thresholdBoostFactor, - uint32_t scoresAdjustFrequency); + ParallelWeakAndBlueprint(FieldSpecBase field, uint32_t scoresToTrack, + score_t scoreThreshold, double thresholdBoostFactor, + bool thread_safe); ~ParallelWeakAndBlueprint() override; - const WeakAndHeap &getScores() const { return _scores; } - + const WeakAndHeap &getScores() const { return *_scores; } score_t getScoreThreshold() const { return _scoreThreshold; } - double getThresholdBoostFactor() const { return _thresholdBoostFactor; } // Used by create visitor diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.cpp b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.cpp index 9e887b9d0f7..78d97e8efa0 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.cpp @@ -77,6 +77,7 @@ public: _matchParams(matchParams), _localScores() { + _localScores.reserve(_matchParams.scoresAdjustFrequency); } size_t get_num_terms() const override { return _terms.size(); } int32_t get_term_weight(size_t idx) const override { return _terms.weight(idx); } @@ -199,17 +200,17 @@ namespace { template <typename VectorizedTerms, typename FutureHeap, typename PastHeap> SearchIterator::UP create_helper(search::fef::TermFieldMatchData &tfmd, VectorizedTerms &&terms, const MatchParams ¶ms, bool strict) { if (strict) { - return std::make_unique<wand::ParallelWeakAndSearchImpl<VectorizedTerms, FutureHeap, PastHeap, true>>(tfmd, std::move(terms), params); + return std::make_unique<wand::ParallelWeakAndSearchImpl<VectorizedTerms, FutureHeap, PastHeap, true>>(tfmd, std::forward<VectorizedTerms>(terms), params); } else { - return std::make_unique<wand::ParallelWeakAndSearchImpl<VectorizedTerms, FutureHeap, PastHeap, false>>(tfmd, std::move(terms), params); + return std::make_unique<wand::ParallelWeakAndSearchImpl<VectorizedTerms, FutureHeap, PastHeap, false>>(tfmd, std::forward<VectorizedTerms>(terms), params); } } template <typename VectorizedTerms> SearchIterator::UP create_helper(search::fef::TermFieldMatchData &tfmd, VectorizedTerms &&terms, const MatchParams ¶ms, bool strict, bool use_array) { return (use_array) - ? create_helper<VectorizedTerms, vespalib::LeftArrayHeap, vespalib::RightArrayHeap>(tfmd, std::move(terms), params, strict) - : create_helper<VectorizedTerms, vespalib::LeftHeap, vespalib::RightHeap>(tfmd, std::move(terms), params, strict); + ? create_helper<VectorizedTerms, vespalib::LeftArrayHeap, vespalib::RightArrayHeap>(tfmd, std::forward<VectorizedTerms>(terms), params, strict) + : create_helper<VectorizedTerms, vespalib::LeftHeap, vespalib::RightHeap>(tfmd, std::forward<VectorizedTerms>(terms), params, strict); } } // namespace search::queryeval::<unnamed> diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.h b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.h index bd173ab41eb..70520e267e6 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.h +++ b/searchlib/src/vespa/searchlib/queryeval/wand/parallel_weak_and_search.h @@ -20,27 +20,25 @@ struct ParallelWeakAndSearch : public SearchIterator /** * Params used to tweak the behavior of the WAND algorithm. */ - struct MatchParams + struct MatchParams : wand::MatchParams { - WeakAndHeap &scores; - score_t scoreThreshold; - double thresholdBoostFactor; - uint32_t scoresAdjustFrequency; - docid_t docIdLimit; - MatchParams(WeakAndHeap &scores_, - score_t scoreThreshold_, - double thresholdBoostFactor_, - uint32_t scoresAdjustFrequency_) - : scores(scores_), - scoreThreshold(scoreThreshold_), - thresholdBoostFactor(thresholdBoostFactor_), - scoresAdjustFrequency(scoresAdjustFrequency_), - docIdLimit(0) + const double thresholdBoostFactor; + const docid_t docIdLimit; + MatchParams(WeakAndHeap &scores_in, + score_t scoreThreshold_in, + double thresholdBoostFactor_in, + uint32_t scoresAdjustFrequency_in, + uint32_t docIdLimit_in) noexcept + : wand::MatchParams(scores_in, scoreThreshold_in, scoresAdjustFrequency_in), + thresholdBoostFactor(thresholdBoostFactor_in), + docIdLimit(docIdLimit_in) + {} + MatchParams(WeakAndHeap &scores_in, + score_t scoreThreshold_in, + double thresholdBoostFactor_in, + uint32_t scoresAdjustFrequency_in) noexcept + : MatchParams(scores_in, scoreThreshold_in, thresholdBoostFactor_in, scoresAdjustFrequency_in, 0) {} - MatchParams &setDocIdLimit(docid_t value) { - docIdLimit = value; - return *this; - } }; /** @@ -51,7 +49,7 @@ struct ParallelWeakAndSearch : public SearchIterator fef::TermFieldMatchData &rootMatchData; fef::MatchData::UP childrenMatchData; RankParams(fef::TermFieldMatchData &rootMatchData_, - fef::MatchData::UP &&childrenMatchData_) + fef::MatchData::UP &&childrenMatchData_) noexcept : rootMatchData(rootMatchData_), childrenMatchData(std::move(childrenMatchData_)) {} @@ -68,12 +66,10 @@ struct ParallelWeakAndSearch : public SearchIterator static SearchIterator::UP createHeapWand(const Terms &terms, const MatchParams &matchParams, RankParams &&rankParams, bool strict); static SearchIterator::UP create(const Terms &terms, const MatchParams &matchParams, RankParams &&rankParams, bool strict); - static SearchIterator::UP create(fef::TermFieldMatchData &tmd, - const MatchParams &matchParams, + static SearchIterator::UP create(fef::TermFieldMatchData &tmd, const MatchParams &matchParams, const std::vector<int32_t> &weights, const std::vector<IDirectPostingStore::LookupResult> &dict_entries, - const IDocidWithWeightPostingStore &attr, - bool strict); + const IDocidWithWeightPostingStore &attr, bool strict); }; } diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h b/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h index 4e781f8497b..9496090cca3 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h +++ b/searchlib/src/vespa/searchlib/queryeval/wand/wand_parts.h @@ -2,10 +2,9 @@ #pragma once -#include <algorithm> -#include <cmath> #include <vespa/searchlib/fef/matchdata.h> #include <vespa/searchlib/fef/termfieldmatchdata.h> +#include <vespa/searchlib/features/bm25_feature.h> #include <vespa/searchlib/queryeval/searchiterator.h> #include <vespa/searchlib/queryeval/iterator_pack.h> #include <vespa/searchlib/attribute/posting_iterator_pack.h> @@ -13,23 +12,40 @@ #include <vespa/vespalib/util/priority_queue.h> #include <vespa/searchlib/attribute/i_docid_with_weight_posting_store.h> #include <vespa/vespalib/util/stringfmt.h> +#include <cmath> +namespace search::queryeval { class WeakAndHeap; } namespace search::queryeval::wand { //----------------------------------------------------------------------------- -struct Term; -using Terms = std::vector<Term>; using score_t = int64_t; using docid_t = uint32_t; using ref_t = uint16_t; -using Attr = IDirectPostingStore; -using AttrDictEntry = Attr::LookupResult; +const uint32_t DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY = 4; //----------------------------------------------------------------------------- /** + * Params used to tweak the behavior of the WAND algorithm. + */ +struct MatchParams +{ + WeakAndHeap &scores; + score_t scoreThreshold; + const uint32_t scoresAdjustFrequency; + MatchParams(WeakAndHeap &scores_in) noexcept + : MatchParams(scores_in, 1, DEFAULT_PARALLEL_WAND_SCORES_ADJUST_FREQUENCY) + {} + MatchParams(WeakAndHeap &scores_in, score_t scoreThreshold_in, uint32_t scoresAdjustFrequency_in) noexcept + : scores(scores_in), + scoreThreshold(scoreThreshold_in), + scoresAdjustFrequency(scoresAdjustFrequency_in) + {} +}; + +/** * Wrapper used to specify underlying terms during setup **/ struct Term { @@ -46,7 +62,7 @@ struct Term { Term(SearchIterator *s, int32_t w, uint32_t e) noexcept : Term(s, w, e, nullptr) {} Term(SearchIterator::UP s, int32_t w, uint32_t e) noexcept : Term(s.release(), w, e, nullptr) {} }; - +using Terms = std::vector<Term>; //----------------------------------------------------------------------------- // input manipulation utilities @@ -75,7 +91,7 @@ auto assemble(const F &f, const Order &order)->std::vector<decltype(f(0))> { } int32_t get_max_weight(const SearchIterator &search) { - const MinMaxPostingInfo *minMax = dynamic_cast<const MinMaxPostingInfo *>(search.getPostingInfo()); + const auto *minMax = dynamic_cast<const MinMaxPostingInfo *>(search.getPostingInfo()); return (minMax != nullptr) ? minMax->getMaxWeight() : std::numeric_limits<int32_t>::max(); } @@ -291,7 +307,7 @@ struct VectorizedAttributeTerms : VectorizedState<DocidWithWeightIteratorPack> { **/ struct DocIdOrder { const docid_t *termPos; - explicit DocIdOrder(docid_t *pos) noexcept : termPos(pos) {} + explicit DocIdOrder(const docid_t *pos) noexcept : termPos(pos) {} bool at_end(ref_t ref) const noexcept { return termPos[ref] == search::endDocId; } docid_t get_pos(ref_t ref) const noexcept { return termPos[ref]; } bool operator()(ref_t a, ref_t b) const noexcept { @@ -389,7 +405,7 @@ DualHeap<FutureHeap, PastHeap>::stringify() const { } //----------------------------------------------------------------------------- -#define TermFrequencyScorer_TERM_SCORE_FACTOR 1000000.0 +constexpr double TermFrequencyScorer_TERM_SCORE_FACTOR = 1000000.0; /** * Scorer used with WeakAndAlgorithm that calculates a pseudo term frequency @@ -412,6 +428,38 @@ struct TermFrequencyScorer } }; +class Bm25TermFrequencyScorer +{ +public: + using Bm25Executor = features::Bm25Executor; + Bm25TermFrequencyScorer(uint32_t num_docs, float range) noexcept + : _num_docs(num_docs), + _range(range), + _max_idf(Bm25Executor::calculate_inverse_document_frequency(1, _num_docs)) + { } + double apply_range(double idf) const noexcept { + return (1.0 - _range)*_max_idf + _range * idf; + } + // weight * scaled_bm25_idf, scaled to fixedpoint + score_t calculateMaxScore(double estHits, double weight) const noexcept { + return score_t(TermFrequencyScorer_TERM_SCORE_FACTOR * weight * + apply_range(Bm25Executor::calculate_inverse_document_frequency(estHits, _num_docs))); + } + + score_t calculateMaxScore(const Term &term) const noexcept { + return calculateMaxScore(term.estHits, term.weight) + 1; + } + + template <typename Input> + score_t calculate_max_score(const Input &input, ref_t ref) const noexcept { + return calculateMaxScore(input.get_est_hits(ref), input.get_weight(ref)) + 1; + } +private: + uint32_t _num_docs; + float _range; + double _max_idf; +}; + //----------------------------------------------------------------------------- /** @@ -453,14 +501,14 @@ struct DotProductScorer // used with parallel wand where we can safely discard hits based on score struct GreaterThan { score_t threshold; - GreaterThan(score_t t) : threshold(t) {} + explicit GreaterThan(score_t t) noexcept : threshold(t) {} bool operator()(score_t score) const { return (score > threshold); } }; // used with old-style vespa wand to ensure at least AND'ish results struct GreaterThanEqual { score_t threshold; - GreaterThanEqual(score_t t) : threshold(t) {} + explicit GreaterThanEqual(score_t t) noexcept : threshold(t) {} bool operator()(score_t score) const { return (score >= threshold); } }; diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_heap.cpp b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_heap.cpp index d4b92fd67e6..53ebb33e1ea 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_heap.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_heap.cpp @@ -4,23 +4,28 @@ namespace search::queryeval { -SharedWeakAndPriorityQueue::SharedWeakAndPriorityQueue(uint32_t scoresToTrack) : +WeakAndPriorityQueue::WeakAndPriorityQueue(uint32_t scoresToTrack) : WeakAndHeap(scoresToTrack), - _bestScores(), - _lock() -{ - _bestScores.reserve(scoresToTrack); -} + _bestScores() +{ } -SharedWeakAndPriorityQueue::~SharedWeakAndPriorityQueue() = default; +WeakAndPriorityQueue::~WeakAndPriorityQueue() = default; + +std::unique_ptr<WeakAndPriorityQueue> +WeakAndPriorityQueue::createHeap(uint32_t scoresToTrack, bool thread_safe) { + if (thread_safe) { + return std::make_unique<queryeval::SharedWeakAndPriorityQueue>(scoresToTrack); + } + return std::make_unique<WeakAndPriorityQueue>(scoresToTrack); +} void -SharedWeakAndPriorityQueue::adjust(score_t *begin, score_t *end) +WeakAndPriorityQueue::adjust(score_t *begin, score_t *end) { if (getScoresToTrack() == 0) { return; } - std::lock_guard guard(_lock); + for (score_t *itr = begin; itr != end; ++itr) { score_t score = *itr; if (!is_full()) { @@ -35,4 +40,17 @@ SharedWeakAndPriorityQueue::adjust(score_t *begin, score_t *end) } } +SharedWeakAndPriorityQueue::SharedWeakAndPriorityQueue(uint32_t scoresToTrack) + : WeakAndPriorityQueue(scoresToTrack), + _lock() +{ } + +SharedWeakAndPriorityQueue::~SharedWeakAndPriorityQueue() = default; + +void +SharedWeakAndPriorityQueue::adjust(score_t *begin, score_t *end) { + std::lock_guard guard(_lock); + WeakAndPriorityQueue::adjust(begin, end); +} + } diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_heap.h b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_heap.h index f1c90f5e6ac..db3ddbc39d3 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_heap.h +++ b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_heap.h @@ -17,13 +17,13 @@ namespace search::queryeval { class WeakAndHeap { public: using score_t = wand::score_t; - WeakAndHeap(uint32_t scoresToTrack) : + explicit WeakAndHeap(uint32_t scoresToTrack) noexcept : _minScore((scoresToTrack == 0) ? std::numeric_limits<score_t>::max() : 0), _scoresToTrack(scoresToTrack) { } - virtual ~WeakAndHeap() {} + virtual ~WeakAndHeap() = default; /** * Consider the given scores for insertion into the underlying structure. * The implementation may change the given score array to speed up execution. @@ -33,11 +33,13 @@ public: /** * The number of scores this heap is tracking. **/ - uint32_t getScoresToTrack() const { return _scoresToTrack; } + uint32_t getScoresToTrack() const noexcept { return _scoresToTrack; } - score_t getMinScore() const { return _minScore.load(std::memory_order_relaxed); } + score_t getMinScore() const noexcept { return _minScore.load(std::memory_order_relaxed); } protected: - void setMinScore(score_t minScore) { _minScore.store(minScore, std::memory_order_relaxed); } + void setMinScore(score_t minScore) noexcept { + _minScore.store(minScore, std::memory_order_relaxed); + } private: std::atomic<score_t> _minScore; const uint32_t _scoresToTrack; @@ -47,19 +49,28 @@ private: * An implementation using an underlying priority queue to keep track of the N * best hits that can be shared among multiple search iterators. */ -class SharedWeakAndPriorityQueue : public WeakAndHeap +class WeakAndPriorityQueue : public WeakAndHeap { private: using Scores = vespalib::PriorityQueue<score_t>; Scores _bestScores; - std::mutex _lock; - bool is_full() const { return (_bestScores.size() >= getScoresToTrack()); } + bool is_full() const noexcept { return (_bestScores.size() >= getScoresToTrack()); } +public: + explicit WeakAndPriorityQueue(uint32_t scoresToTrack); + ~WeakAndPriorityQueue() override; + Scores &getScores() noexcept { return _bestScores; } + void adjust(score_t *begin, score_t *end) override; + static std::unique_ptr<WeakAndPriorityQueue> createHeap(uint32_t scoresToTrack, bool thread_safe); +}; +class SharedWeakAndPriorityQueue final : public WeakAndPriorityQueue +{ +private: + std::mutex _lock; public: - SharedWeakAndPriorityQueue(uint32_t scoresToTrack); - ~SharedWeakAndPriorityQueue(); - Scores &getScores() { return _bestScores; } + explicit SharedWeakAndPriorityQueue(uint32_t scoresToTrack); + ~SharedWeakAndPriorityQueue() override; void adjust(score_t *begin, score_t *end) override; }; diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp index 04b1cb75da4..33dd3e46fe5 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp +++ b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.cpp @@ -1,7 +1,7 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "weak_and_search.h" -#include "wand_parts.h" +#include "weak_and_heap.h" #include <vespa/searchlib/queryeval/orsearch.h> #include <vespa/vespalib/util/left_right_heap.h> #include <vespa/vespalib/util/priority_queue.h> @@ -20,7 +20,8 @@ private: DualHeap<FutureHeap, PastHeap> _heaps; Algorithm _algo; score_t _threshold; // current score threshold - Scores _scores; // best n scores + MatchParams _matchParams; + std::vector<score_t> _localScores; const uint32_t _n; void seek_strict(uint32_t docid) { @@ -40,16 +41,24 @@ private: } } } + void updateThreshold(score_t newThreshold) { + if (newThreshold > _threshold) { + _threshold = newThreshold; + } + } public: - WeakAndSearchLR(const Terms &terms, uint32_t n) - : _terms(terms, TermFrequencyScorer(), 0, {}), + template<typename Scorer> + WeakAndSearchLR(const Terms &terms, const MatchParams & matchParams, const Scorer & scorer, uint32_t n) + : _terms(terms, scorer, 0, {}), _heaps(DocIdOrder(_terms.docId()), _terms.size()), _algo(), - _threshold(1), - _scores(), + _threshold(matchParams.scoreThreshold), + _matchParams(matchParams), + _localScores(), _n(n) { + _localScores.reserve(_matchParams.scoresAdjustFrequency); } size_t get_num_terms() const override { return _terms.size(); } int32_t get_term_weight(size_t idx) const override { return _terms.weight(idx); } @@ -57,6 +66,7 @@ public: const Terms &getTerms() const override { return _terms.input_terms(); } uint32_t getN() const override { return _n; } void doSeek(uint32_t docid) override { + updateThreshold(_matchParams.scores.getMinScore()); if (IS_STRICT) { seek_strict(docid); } else { @@ -65,12 +75,11 @@ public: } void doUnpack(uint32_t docid) override { _algo.find_matching_terms(_terms, _heaps); - _scores.push(_algo.get_upper_bound()); - if (_scores.size() > _n) { - _scores.pop_front(); - } - if (_scores.size() == _n) { - _threshold = _scores.front(); + score_t score = _algo.get_upper_bound(); + _localScores.push_back(score); + if (_localScores.size() == _matchParams.scoresAdjustFrequency) { + _matchParams.scores.adjust(&_localScores[0], &_localScores[0] + _localScores.size()); + _localScores.clear(); } ref_t *end = _heaps.present_end(); for (ref_t *ref = _heaps.present_begin(); ref != end; ++ref) { @@ -102,36 +111,51 @@ WeakAndSearch::visitMembers(vespalib::ObjectVisitor &visitor) const //----------------------------------------------------------------------------- +template<typename Scorer> SearchIterator::UP -WeakAndSearch::createArrayWand(const Terms &terms, uint32_t n, bool strict) +WeakAndSearch::createArrayWand(const Terms &terms, const MatchParams & params, + const Scorer & scorer, uint32_t n, bool strict) { if (strict) { - return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftArrayHeap, vespalib::RightArrayHeap, true>>(terms, n); + return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftArrayHeap, vespalib::RightArrayHeap, true>>(terms, params, scorer, n); } else { - return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftArrayHeap, vespalib::RightArrayHeap, false>>(terms, n); + return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftArrayHeap, vespalib::RightArrayHeap, false>>(terms, params, scorer, n); } } +template<typename Scorer> SearchIterator::UP -WeakAndSearch::createHeapWand(const Terms &terms, uint32_t n, bool strict) +WeakAndSearch::createHeapWand(const Terms &terms, const MatchParams & params, const Scorer & scorer, uint32_t n, bool strict) { if (strict) { - return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftHeap, vespalib::RightHeap, true>>(terms, n); + return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftHeap, vespalib::RightHeap, true>>(terms, params, scorer, n); } else { - return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftHeap, vespalib::RightHeap, false>>(terms, n); + return std::make_unique<wand::WeakAndSearchLR<vespalib::LeftHeap, vespalib::RightHeap, false>>(terms, params, scorer, n); } } +template<typename Scorer> SearchIterator::UP -WeakAndSearch::create(const Terms &terms, uint32_t n, bool strict) +WeakAndSearch::create(const Terms &terms, const MatchParams & params, const Scorer & scorer, uint32_t n, bool strict) { if (terms.size() < 128) { - return createArrayWand(terms, n, strict); + return createArrayWand(terms, params, scorer, n, strict); } else { - return createHeapWand(terms, n, strict); + return createHeapWand(terms, params, scorer, n, strict); } } +SearchIterator::UP +WeakAndSearch::create(const Terms &terms, const MatchParams & params, uint32_t n, bool strict) +{ + return create(terms, params, wand::TermFrequencyScorer(), n, strict); +} + //----------------------------------------------------------------------------- +template SearchIterator::UP WeakAndSearch::create<wand::TermFrequencyScorer>(const Terms &terms, const MatchParams & params, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict); +template SearchIterator::UP WeakAndSearch::create<wand::Bm25TermFrequencyScorer>(const Terms &terms, const MatchParams & params, const wand::Bm25TermFrequencyScorer & scorer, uint32_t n, bool strict); +template SearchIterator::UP WeakAndSearch::createArrayWand<wand::TermFrequencyScorer>(const Terms &terms, const MatchParams & params, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict); +template SearchIterator::UP WeakAndSearch::createHeapWand<wand::TermFrequencyScorer>(const Terms &terms, const MatchParams & params, const wand::TermFrequencyScorer & scorer, uint32_t n, bool strict); + } diff --git a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.h b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.h index 6a56a04887c..30292af24ab 100644 --- a/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.h +++ b/searchlib/src/vespa/searchlib/queryeval/wand/weak_and_search.h @@ -9,15 +9,24 @@ namespace search::queryeval { struct WeakAndSearch : SearchIterator { using Terms = wand::Terms; + using MatchParams = wand::MatchParams; virtual size_t get_num_terms() const = 0; virtual int32_t get_term_weight(size_t idx) const = 0; virtual wand::score_t get_max_score(size_t idx) const = 0; virtual const Terms &getTerms() const = 0; virtual uint32_t getN() const = 0; void visitMembers(vespalib::ObjectVisitor &visitor) const override; - static SearchIterator::UP createArrayWand(const Terms &terms, uint32_t n, bool strict); - static SearchIterator::UP createHeapWand(const Terms &terms, uint32_t n, bool strict); - static SearchIterator::UP create(const Terms &terms, uint32_t n, bool strict); + template<typename Scorer> + static SearchIterator::UP createArrayWand(const Terms &terms, const MatchParams & matchParams, + const Scorer & scorer, uint32_t n, bool strict); + template<typename Scorer> + static SearchIterator::UP createHeapWand(const Terms &terms, const MatchParams & matchParams, + const Scorer & scorer, uint32_t n, bool strict); + template<typename Scorer> + static SearchIterator::UP create(const Terms &terms, const MatchParams & matchParams, + const Scorer & scorer, uint32_t n, bool strict); + static SearchIterator::UP create(const Terms &terms, const MatchParams & matchParams, + uint32_t n, bool strict); }; } diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp index af99260979d..d1ebb1f4e4e 100644 --- a/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp +++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.cpp @@ -9,6 +9,7 @@ using vespalib::typify_invoke; using vespalib::eval::TypifyCellType; using vespalib::eval::TypedCells; +using vespalib::eval::Int8Float; namespace search::tensor { @@ -26,16 +27,16 @@ public: _lhs(_tmpSpace.storeLhs(lhs)) { auto a = _lhs.data(); - _lhs_norm_sq = _computer.dotProduct(a, a, lhs.size); + _lhs_norm_sq = _computer.dotProduct(cast(a), cast(a), lhs.size); } double calc(TypedCells rhs) const noexcept override { size_t sz = _lhs.size(); vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs); auto a = _lhs.data(); auto b = rhs_vector.data(); - double b_norm_sq = _computer.dotProduct(b, b, sz); + double b_norm_sq = _computer.dotProduct(cast(b), cast(b), sz); double squared_norms = _lhs_norm_sq * b_norm_sq; - double dot_product = _computer.dotProduct(a, b, sz); + double dot_product = _computer.dotProduct(cast(a), cast(b), sz); double div = (squared_norms > 0) ? sqrt(squared_norms) : 1.0; double cosine_similarity = dot_product / div; double distance = 1.0 - cosine_similarity; // in range [0,2] @@ -70,19 +71,20 @@ template class BoundAngularDistance<double>; template <typename FloatType> BoundDistanceFunction::UP -AngularDistanceFunctionFactory<FloatType>::for_query_vector(TypedCells lhs) { +AngularDistanceFunctionFactory<FloatType>::for_query_vector(TypedCells lhs) const { using DFT = BoundAngularDistance<FloatType>; return std::make_unique<DFT>(lhs); } template <typename FloatType> BoundDistanceFunction::UP -AngularDistanceFunctionFactory<FloatType>::for_insertion_vector(TypedCells lhs) { +AngularDistanceFunctionFactory<FloatType>::for_insertion_vector(TypedCells lhs) const { using DFT = BoundAngularDistance<FloatType>; return std::make_unique<DFT>(lhs); } template class AngularDistanceFunctionFactory<float>; template class AngularDistanceFunctionFactory<double>; +template class AngularDistanceFunctionFactory<Int8Float>; } diff --git a/searchlib/src/vespa/searchlib/tensor/angular_distance.h b/searchlib/src/vespa/searchlib/tensor/angular_distance.h index 5e0a060e060..aa51f58b3cd 100644 --- a/searchlib/src/vespa/searchlib/tensor/angular_distance.h +++ b/searchlib/src/vespa/searchlib/tensor/angular_distance.h @@ -15,8 +15,8 @@ template <typename FloatType> class AngularDistanceFunctionFactory : public DistanceFunctionFactory { public: AngularDistanceFunctionFactory() = default; - BoundDistanceFunction::UP for_query_vector(TypedCells lhs) override; - BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) override; + BoundDistanceFunction::UP for_query_vector(TypedCells lhs) const override; + BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) const override; }; } diff --git a/searchlib/src/vespa/searchlib/tensor/bound_distance_function.h b/searchlib/src/vespa/searchlib/tensor/bound_distance_function.h index 85089196a7a..318271835ad 100644 --- a/searchlib/src/vespa/searchlib/tensor/bound_distance_function.h +++ b/searchlib/src/vespa/searchlib/tensor/bound_distance_function.h @@ -19,6 +19,7 @@ class BoundDistanceFunction : public DistanceConverter { public: using UP = std::unique_ptr<BoundDistanceFunction>; using TypedCells = vespalib::eval::TypedCells; + using Int8Float = vespalib::eval::Int8Float; BoundDistanceFunction() noexcept = default; @@ -29,6 +30,10 @@ public: // calculate internal distance, early return allowed if > limit virtual double calc_with_limit(TypedCells rhs, double limit) const noexcept = 0; +protected: + static const double *cast(const double * p) { return p; } + static const float *cast(const float * p) { return p; } + static const int8_t *cast(const Int8Float * p) { return reinterpret_cast<const int8_t *>(p); } }; } diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp index ed08df5866e..f39994dfdcf 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp +++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.cpp @@ -6,6 +6,7 @@ using search::attribute::DistanceMetric; using vespalib::eval::CellType; +using vespalib::eval::Int8Float; namespace search::tensor { @@ -16,25 +17,27 @@ make_distance_function_factory(DistanceMetric variant, CellType cell_type) case DistanceMetric::Angular: switch (cell_type) { case CellType::DOUBLE: return std::make_unique<AngularDistanceFunctionFactory<double>>(); + case CellType::INT8: return std::make_unique<AngularDistanceFunctionFactory<Int8Float>>(); default: return std::make_unique<AngularDistanceFunctionFactory<float>>(); } case DistanceMetric::Euclidean: switch (cell_type) { - case CellType::DOUBLE: return std::make_unique<EuclideanDistanceFunctionFactory<double>>(); - case CellType::INT8: return std::make_unique<EuclideanDistanceFunctionFactory<vespalib::eval::Int8Float>>(); + case CellType::DOUBLE: return std::make_unique<EuclideanDistanceFunctionFactory<double>>(); + case CellType::INT8: return std::make_unique<EuclideanDistanceFunctionFactory<Int8Float>>(); case CellType::BFLOAT16: return std::make_unique<EuclideanDistanceFunctionFactory<vespalib::BFloat16>>(); - default: return std::make_unique<EuclideanDistanceFunctionFactory<float>>(); + default: return std::make_unique<EuclideanDistanceFunctionFactory<float>>(); } case DistanceMetric::InnerProduct: case DistanceMetric::PrenormalizedAngular: switch (cell_type) { case CellType::DOUBLE: return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<double>>(); + case CellType::INT8: return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<Int8Float>>(); default: return std::make_unique<PrenormalizedAngularDistanceFunctionFactory<float>>(); } case DistanceMetric::Dotproduct: switch (cell_type) { case CellType::DOUBLE: return std::make_unique<MipsDistanceFunctionFactory<double>>(); - case CellType::INT8: return std::make_unique<MipsDistanceFunctionFactory<vespalib::eval::Int8Float>>(); + case CellType::INT8: return std::make_unique<MipsDistanceFunctionFactory<Int8Float>>(); default: return std::make_unique<MipsDistanceFunctionFactory<float>>(); } case DistanceMetric::GeoDegrees: @@ -42,7 +45,7 @@ make_distance_function_factory(DistanceMetric variant, CellType cell_type) case DistanceMetric::Hamming: switch (cell_type) { case CellType::DOUBLE: return std::make_unique<HammingDistanceFunctionFactory<double>>(); - case CellType::INT8: return std::make_unique<HammingDistanceFunctionFactory<vespalib::eval::Int8Float>>(); + case CellType::INT8: return std::make_unique<HammingDistanceFunctionFactory<Int8Float>>(); default: return std::make_unique<HammingDistanceFunctionFactory<float>>(); } } diff --git a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.h b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.h index 356366d6a77..3b0a0ac91fd 100644 --- a/searchlib/src/vespa/searchlib/tensor/distance_function_factory.h +++ b/searchlib/src/vespa/searchlib/tensor/distance_function_factory.h @@ -17,8 +17,8 @@ struct DistanceFunctionFactory { using TypedCells = vespalib::eval::TypedCells; DistanceFunctionFactory() noexcept = default; virtual ~DistanceFunctionFactory() = default; - virtual BoundDistanceFunction::UP for_query_vector(TypedCells lhs) = 0; - virtual BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) = 0; + virtual BoundDistanceFunction::UP for_query_vector(TypedCells lhs) const = 0; + virtual BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) const = 0; using UP = std::unique_ptr<DistanceFunctionFactory>; }; diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp index 3ab3a1123eb..62b92b43ad9 100644 --- a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp +++ b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.cpp @@ -16,14 +16,11 @@ using vespalib::BFloat16; template<typename AttributeCellType> class BoundEuclideanDistance final : public BoundDistanceFunction { - using FloatType = std::conditional_t<std::is_same<AttributeCellType,BFloat16>::value,float,AttributeCellType>; + using FloatType = std::conditional_t<std::is_same<AttributeCellType, BFloat16>::value, float, AttributeCellType>; private: const vespalib::hwaccelrated::IAccelrated & _computer; mutable TemporaryVectorStore<FloatType> _tmpSpace; const vespalib::ConstArrayRef<FloatType> _lhs_vector; - static const double *cast(const double * p) { return p; } - static const float *cast(const float * p) { return p; } - static const int8_t *cast(const Int8Float * p) { return reinterpret_cast<const int8_t *>(p); } public: explicit BoundEuclideanDistance(TypedCells lhs) : _computer(vespalib::hwaccelrated::IAccelrated::getAccelerator()), @@ -44,16 +41,8 @@ public: double score = 1.0 / (1.0 + d); return score; } - double calc_with_limit(TypedCells rhs, double limit) const noexcept override { - vespalib::ConstArrayRef<AttributeCellType> rhs_vector = rhs.typify<AttributeCellType>(); - double sum = 0.0; - size_t sz = _lhs_vector.size(); - assert(sz == rhs_vector.size()); - for (size_t i = 0; i < sz && sum <= limit; ++i) { - double diff = _lhs_vector[i] - rhs_vector[i]; - sum += diff*diff; - } - return sum; + double calc_with_limit(TypedCells rhs, double) const noexcept override { + return calc(rhs); } }; @@ -64,14 +53,14 @@ template class BoundEuclideanDistance<double>; template <typename FloatType> BoundDistanceFunction::UP -EuclideanDistanceFunctionFactory<FloatType>::for_query_vector(TypedCells lhs) { +EuclideanDistanceFunctionFactory<FloatType>::for_query_vector(TypedCells lhs) const { using DFT = BoundEuclideanDistance<FloatType>; return std::make_unique<DFT>(lhs); } template <typename FloatType> BoundDistanceFunction::UP -EuclideanDistanceFunctionFactory<FloatType>::for_insertion_vector(TypedCells lhs) { +EuclideanDistanceFunctionFactory<FloatType>::for_insertion_vector(TypedCells lhs) const { using DFT = BoundEuclideanDistance<FloatType>; return std::make_unique<DFT>(lhs); } diff --git a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h index 8c39a12bf86..78460c93307 100644 --- a/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h +++ b/searchlib/src/vespa/searchlib/tensor/euclidean_distance.h @@ -15,8 +15,8 @@ template <typename FloatType> class EuclideanDistanceFunctionFactory : public DistanceFunctionFactory { public: EuclideanDistanceFunctionFactory() noexcept = default; - BoundDistanceFunction::UP for_query_vector(TypedCells lhs) override; - BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) override; + BoundDistanceFunction::UP for_query_vector(TypedCells lhs) const override; + BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) const override; }; } diff --git a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp index f5484f40271..a8a48ae4116 100644 --- a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp +++ b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.cpp @@ -82,12 +82,12 @@ public: }; BoundDistanceFunction::UP -GeoDistanceFunctionFactory::for_query_vector(TypedCells lhs) { +GeoDistanceFunctionFactory::for_query_vector(TypedCells lhs) const { return std::make_unique<BoundGeoDistance>(lhs); } BoundDistanceFunction::UP -GeoDistanceFunctionFactory::for_insertion_vector(TypedCells lhs) { +GeoDistanceFunctionFactory::for_insertion_vector(TypedCells lhs) const { return std::make_unique<BoundGeoDistance>(lhs); } diff --git a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h index 1464898421b..a85e31e8ecc 100644 --- a/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h +++ b/searchlib/src/vespa/searchlib/tensor/geo_degrees_distance.h @@ -14,8 +14,8 @@ namespace search::tensor { class GeoDistanceFunctionFactory : public DistanceFunctionFactory { public: GeoDistanceFunctionFactory() = default; - BoundDistanceFunction::UP for_query_vector(TypedCells lhs) override; - BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) override; + BoundDistanceFunction::UP for_query_vector(TypedCells lhs) const override; + BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) const override; }; } diff --git a/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp b/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp index 7f29a100492..7ea2e440a51 100644 --- a/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hamming_distance.cpp @@ -49,14 +49,14 @@ public: template <typename FloatType> BoundDistanceFunction::UP -HammingDistanceFunctionFactory<FloatType>::for_query_vector(TypedCells lhs) { +HammingDistanceFunctionFactory<FloatType>::for_query_vector(TypedCells lhs) const { using DFT = BoundHammingDistance<FloatType>; return std::make_unique<DFT>(lhs); } template <typename FloatType> BoundDistanceFunction::UP -HammingDistanceFunctionFactory<FloatType>::for_insertion_vector(TypedCells lhs) { +HammingDistanceFunctionFactory<FloatType>::for_insertion_vector(TypedCells lhs) const { using DFT = BoundHammingDistance<FloatType>; return std::make_unique<DFT>(lhs); } diff --git a/searchlib/src/vespa/searchlib/tensor/hamming_distance.h b/searchlib/src/vespa/searchlib/tensor/hamming_distance.h index 6e7f96e1e2f..2e3b75cc61f 100644 --- a/searchlib/src/vespa/searchlib/tensor/hamming_distance.h +++ b/searchlib/src/vespa/searchlib/tensor/hamming_distance.h @@ -16,8 +16,8 @@ template <typename FloatType> class HammingDistanceFunctionFactory : public DistanceFunctionFactory { public: HammingDistanceFunctionFactory() = default; - BoundDistanceFunction::UP for_query_vector(TypedCells lhs) override; - BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) override; + BoundDistanceFunction::UP for_query_vector(TypedCells lhs) const override; + BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) const override; }; } diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp index 1db688156e0..b542c422f50 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.cpp @@ -672,15 +672,18 @@ HnswIndex<type>::mutual_reconnect(const LinkArrayRef &cluster, uint32_t level) std::vector<PairDist> pairs; for (uint32_t i = 0; i + 1 < cluster.size(); ++i) { uint32_t n_id_1 = cluster[i]; + TypedCells n_cells_1 = get_vector(n_id_1); + if (n_cells_1.non_existing_attribute_value()) [[unlikely]] continue; LinkArrayRef n_list_1 = _graph.get_link_array(n_id_1, level); - std::unique_ptr<BoundDistanceFunction> df; + std::unique_ptr<BoundDistanceFunction> df = _distance_ff->for_insertion_vector(n_cells_1); for (uint32_t j = i + 1; j < cluster.size(); ++j) { uint32_t n_id_2 = cluster[j]; - if (has_link_to(n_list_1, n_id_2)) continue; - if (!df) { - df = _distance_ff->for_insertion_vector(get_vector(n_id_1)); + if ( ! has_link_to(n_list_1, n_id_2)) { + auto n_cells_2 = get_vector(n_id_2); + if (!n_cells_2.non_existing_attribute_value()) { + pairs.emplace_back(n_id_1, n_id_2, df->calc(n_cells_2)); + } } - pairs.emplace_back(n_id_1, n_id_2, calc_distance(*df, n_id_2)); } } std::sort(pairs.begin(), pairs.end()); @@ -1120,6 +1123,32 @@ HnswIndex<type>::count_reachable_nodes() const return {found_cnt, true}; } +template <HnswIndexType type> +uint32_t +HnswIndex<type>::get_subspaces(uint32_t docid) const noexcept +{ + if constexpr (type == HnswIndexType::SINGLE) { + return (docid < _graph.nodes.get_size() && _graph.nodes.get_elem_ref(docid).levels_ref().load_relaxed().valid()) ? 1 : 0; + } else { + return _id_mapping.get_ids(docid).size(); + } +} + +template <HnswIndexType type> +uint32_t +HnswIndex<type>::check_consistency(uint32_t docid_limit) const noexcept +{ + uint32_t inconsistencies = 0; + for (uint32_t docid = 1; docid < docid_limit; ++docid) { + auto index_subspaces = get_subspaces(docid); + auto store_subspaces = get_vectors(docid).subspaces(); + if (index_subspaces != store_subspaces) { + ++inconsistencies; + } + } + return inconsistencies; +} + template class HnswIndex<HnswIndexType::SINGLE>; template class HnswIndex<HnswIndexType::MULTI>; diff --git a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h index 616140f426f..4d4440c1bcb 100644 --- a/searchlib/src/vespa/searchlib/tensor/hnsw_index.h +++ b/searchlib/src/vespa/searchlib/tensor/hnsw_index.h @@ -193,6 +193,9 @@ protected: LinkArray filter_valid_nodeids(uint32_t level, const internal::PreparedAddNode::Links &neighbors, uint32_t self_nodeid); void internal_complete_add(uint32_t docid, internal::PreparedAddDoc &op); void internal_complete_add_node(uint32_t nodeid, uint32_t docid, uint32_t subspace, internal::PreparedAddNode &prepared_node); + + // Called from writer only. + uint32_t get_subspaces(uint32_t docid) const noexcept; public: HnswIndex(const DocVectorAccess& vectors, DistanceFunctionFactory::UP distance_ff, RandomLevelGenerator::UP level_generator, const HnswIndexConfig& cfg); @@ -248,6 +251,9 @@ public: uint32_t get_active_nodes() const noexcept { return _graph.get_active_nodes(); } + // Called from writer only. + uint32_t check_consistency(uint32_t docid_limit) const noexcept override; + // Should only be used by unit tests. HnswTestNode get_node(uint32_t nodeid) const; void set_node(uint32_t nodeid, const HnswTestNode &node); diff --git a/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.cpp b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.cpp index c42242d8dc8..5bc727ebd97 100644 --- a/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.cpp +++ b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.cpp @@ -19,9 +19,6 @@ class BoundMipsDistanceFunction final : public BoundDistanceFunction { using ExtraDimT = std::conditional_t<extra_dim,double,std::monostate>; [[no_unique_address]] ExtraDimT _lhs_extra_dim; - static const double *cast(const double * p) { return p; } - static const float *cast(const float * p) { return p; } - static const int8_t *cast(const Int8Float * p) { return reinterpret_cast<const int8_t *>(p); } public: BoundMipsDistanceFunction(TypedCells lhs, MaximumSquaredNormStore& sq_norm_store) : BoundDistanceFunction(), @@ -76,13 +73,13 @@ public: template<typename FloatType> BoundDistanceFunction::UP -MipsDistanceFunctionFactory<FloatType>::for_query_vector(TypedCells lhs) { +MipsDistanceFunctionFactory<FloatType>::for_query_vector(TypedCells lhs) const { return std::make_unique<BoundMipsDistanceFunction<FloatType, false>>(lhs, *_sq_norm_store); } template<typename FloatType> BoundDistanceFunction::UP -MipsDistanceFunctionFactory<FloatType>::for_insertion_vector(TypedCells lhs) { +MipsDistanceFunctionFactory<FloatType>::for_insertion_vector(TypedCells lhs) const { return std::make_unique<BoundMipsDistanceFunction<FloatType, true>>(lhs, *_sq_norm_store); }; diff --git a/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h index 67a6eb58de0..336511ab78f 100644 --- a/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h +++ b/searchlib/src/vespa/searchlib/tensor/mips_distance_transform.h @@ -62,8 +62,8 @@ public: MipsDistanceFunctionFactory() noexcept = default; ~MipsDistanceFunctionFactory() override = default; - BoundDistanceFunction::UP for_query_vector(TypedCells lhs) override; - BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) override; + BoundDistanceFunction::UP for_query_vector(TypedCells lhs) const override; + BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) const override; }; } diff --git a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h index 8462ff05eca..c2bbd17ce63 100644 --- a/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h +++ b/searchlib/src/vespa/searchlib/tensor/nearest_neighbor_index.h @@ -114,6 +114,12 @@ public: double distance_threshold) const = 0; virtual DistanceFunctionFactory &distance_function_factory() const = 0; + + /* + * Used when checking consistency during load. + * Called from writer only. + */ + virtual uint32_t check_consistency(uint32_t docid_limit) const noexcept = 0; }; } diff --git a/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp index 4bc90001227..6f0966e7fb3 100644 --- a/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp +++ b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.cpp @@ -6,6 +6,7 @@ using vespalib::typify_invoke; using vespalib::eval::TypifyCellType; +using vespalib::eval::Int8Float; namespace search::tensor { @@ -23,7 +24,7 @@ public: _lhs(_tmpSpace.storeLhs(lhs)) { auto a = _lhs.data(); - _lhs_norm_sq = _computer.dotProduct(a, a, lhs.size); + _lhs_norm_sq = _computer.dotProduct(cast(a), cast(a), lhs.size); if (_lhs_norm_sq <= 0.0) { _lhs_norm_sq = 1.0; } @@ -32,7 +33,7 @@ public: vespalib::ConstArrayRef<FloatType> rhs_vector = _tmpSpace.convertRhs(rhs); auto a = _lhs.data(); auto b = rhs_vector.data(); - double dot_product = _computer.dotProduct(a, b, _lhs.size()); + double dot_product = _computer.dotProduct(cast(a), cast(b), _lhs.size()); double distance = _lhs_norm_sq - dot_product; return distance; } @@ -62,19 +63,20 @@ template class BoundPrenormalizedAngularDistance<double>; template <typename FloatType> BoundDistanceFunction::UP -PrenormalizedAngularDistanceFunctionFactory<FloatType>::for_query_vector(TypedCells lhs) { +PrenormalizedAngularDistanceFunctionFactory<FloatType>::for_query_vector(TypedCells lhs) const { using DFT = BoundPrenormalizedAngularDistance<FloatType>; return std::make_unique<DFT>(lhs); } template <typename FloatType> BoundDistanceFunction::UP -PrenormalizedAngularDistanceFunctionFactory<FloatType>::for_insertion_vector(TypedCells lhs) { +PrenormalizedAngularDistanceFunctionFactory<FloatType>::for_insertion_vector(TypedCells lhs) const { using DFT = BoundPrenormalizedAngularDistance<FloatType>; return std::make_unique<DFT>(lhs); } template class PrenormalizedAngularDistanceFunctionFactory<float>; template class PrenormalizedAngularDistanceFunctionFactory<double>; +template class PrenormalizedAngularDistanceFunctionFactory<Int8Float>; } diff --git a/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h index 7e3a8c2c676..6a791e0b6ec 100644 --- a/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h +++ b/searchlib/src/vespa/searchlib/tensor/prenormalized_angular_distance.h @@ -14,8 +14,8 @@ template <typename FloatType> class PrenormalizedAngularDistanceFunctionFactory : public DistanceFunctionFactory { public: PrenormalizedAngularDistanceFunctionFactory() = default; - BoundDistanceFunction::UP for_query_vector(TypedCells lhs) override; - BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) override; + BoundDistanceFunction::UP for_query_vector(TypedCells lhs) const override; + BoundDistanceFunction::UP for_insertion_vector(TypedCells lhs) const override; }; } diff --git a/searchlib/src/vespa/searchlib/tensor/temporary_vector_store.cpp b/searchlib/src/vespa/searchlib/tensor/temporary_vector_store.cpp index 4753e9d7c87..097ea67cc9e 100644 --- a/searchlib/src/vespa/searchlib/tensor/temporary_vector_store.cpp +++ b/searchlib/src/vespa/searchlib/tensor/temporary_vector_store.cpp @@ -1,11 +1,13 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "temporary_vector_store.h" +#include <vespa/vespalib/hwaccelrated/iaccelrated.h> using vespalib::ConstArrayRef; using vespalib::ArrayRef; using vespalib::eval::CellType; using vespalib::eval::TypedCells; +using vespalib::hwaccelrated::IAccelrated; namespace search::tensor { @@ -13,18 +15,29 @@ namespace { template<typename FromType, typename ToType> ConstArrayRef<ToType> +convert_cells(ArrayRef<ToType> space, TypedCells cells) noexcept __attribute__((noinline)); + +template<typename FromType, typename ToType> +ConstArrayRef<ToType> convert_cells(ArrayRef<ToType> space, TypedCells cells) noexcept { - assert(cells.size == space.size()); - auto old_cells = cells.typify<FromType>(); + auto old_cells = cells.unsafe_typify<FromType>(); ToType *p = space.data(); for (FromType value : old_cells) { - ToType conv(value); - *p++ = conv; + *p++ = static_cast<ToType>(value); } return space; } +template<> +ConstArrayRef<float> +convert_cells<vespalib::BFloat16, float>(ArrayRef<float> space, TypedCells cells) noexcept +{ + static const IAccelrated & accelerator = IAccelrated::getAccelerator(); + accelerator.convert_bfloat16_to_float(reinterpret_cast<const uint16_t *>(cells.data), space.data(), space.size()); + return space; +} + template <typename ToType> struct ConvertCellsSelector { diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp index a5d670096ab..9f551166a1d 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute.cpp @@ -418,18 +418,25 @@ TensorAttribute::complete_set_tensor(DocId docid, const vespalib::eval::Value& t std::unique_ptr<PrepareResult> prepare_result) { if (_index && !prepare_result) { - // The tensor cells are unchanged - if (!_is_dense) { - // but labels might have changed. - EntryRef ref = _tensorStore.store_tensor(tensor); - assert(ref.valid()); - setTensorRef(docid, ref); + VectorBundle vectors(tensor.cells().data, tensor.index().size(), _subspace_type); + if (tensor_cells_are_unchanged(docid, vectors)) { + // The tensor cells are unchanged + if (!_is_dense) { + // but labels might have changed. + EntryRef ref = _tensorStore.store_tensor(tensor); + assert(ref.valid()); + setTensorRef(docid, ref); + } + return; } - return; } internal_set_tensor(docid, tensor); if (_index) { - _index->complete_add_document(docid, std::move(prepare_result)); + if (prepare_result) { + _index->complete_add_document(docid, std::move(prepare_result)); + } else { + _index->add_document(docid); + } } } diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp index 28c4099c38b..223c9d7d1f2 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.cpp @@ -322,6 +322,9 @@ TensorAttributeLoader::on_load(vespalib::Executor* executor) if (!load_index()) { return false; } + if (dense_store == nullptr) { + check_consistency(docid_limit); + } } else { build_index(executor, docid_limit); } @@ -329,4 +332,15 @@ TensorAttributeLoader::on_load(vespalib::Executor* executor) return true; } +void +TensorAttributeLoader::check_consistency(uint32_t docid_limit) +{ + auto before = vespalib::steady_clock::now(); + uint32_t inconsistencies = _index->check_consistency(docid_limit); + auto after = vespalib::steady_clock::now(); + double elapsed = vespalib::to_s(after - before); + LOG(info, "%u inconsistencies detected after loading index for attribute %s, (check used %6.3fs)", + inconsistencies, _attr.getName().c_str(), elapsed); +} + } diff --git a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h index 6bf68957adc..59baaf0b6dc 100644 --- a/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h +++ b/searchlib/src/vespa/searchlib/tensor/tensor_attribute_loader.h @@ -34,6 +34,7 @@ class TensorAttributeLoader { void load_tensor_store(search::attribute::BlobSequenceReader& reader, uint32_t docid_limit); void build_index(vespalib::Executor* executor, uint32_t docid_limit); bool load_index(); + void check_consistency(uint32_t docid_limit); public: TensorAttributeLoader(TensorAttribute& attr, GenerationHandler& generation_handler, RefVector& ref_vector, TensorStore& store, NearestNeighborIndex* index); diff --git a/searchlib/src/vespa/searchlib/test/CMakeLists.txt b/searchlib/src/vespa/searchlib/test/CMakeLists.txt index 83e185dbfb6..4685ad07808 100644 --- a/searchlib/src/vespa/searchlib/test/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/test/CMakeLists.txt @@ -14,6 +14,7 @@ vespa_add_library(searchlib_test schema_builder.cpp string_field_builder.cpp vector_buffer_writer.cpp + weightedchildrenverifiers.cpp $<TARGET_OBJECTS:searchlib_test_fakedata> $<TARGET_OBJECTS:searchlib_searchlib_test_diskindex> $<TARGET_OBJECTS:searchlib_test_gtest_migration> diff --git a/searchlib/src/vespa/searchlib/test/weightedchildrenverifiers.cpp b/searchlib/src/vespa/searchlib/test/weightedchildrenverifiers.cpp new file mode 100644 index 00000000000..b22dd1a3aa9 --- /dev/null +++ b/searchlib/src/vespa/searchlib/test/weightedchildrenverifiers.cpp @@ -0,0 +1,71 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "weightedchildrenverifiers.h" + +using search::queryeval::SearchIterator; + +namespace search::test { + +WeightedChildrenVerifier::WeightedChildrenVerifier() + : _weights(_num_children, 1) +{ } +WeightedChildrenVerifier::~WeightedChildrenVerifier() = default; + + +IteratorChildrenVerifier::IteratorChildrenVerifier() + : WeightedChildrenVerifier(), + _split_lists(_num_children) +{ + auto full_list = getExpectedDocIds(); + for (size_t i = 0; i < full_list.size(); ++i) { + _split_lists[i % _num_children].push_back(full_list[i]); + } +} +IteratorChildrenVerifier::~IteratorChildrenVerifier() = default; + +SearchIterator::UP +IteratorChildrenVerifier::create(bool strict) const { + (void) strict; + std::vector<SearchIterator*> children; + for (size_t i = 0; i < _num_children; ++i) { + children.push_back(createIterator(_split_lists[i], true).release()); + } + return create(children); +} + +SearchIterator::UP +IteratorChildrenVerifier::create(const std::vector<SearchIterator*> &children) const { + (void) children; + return {}; +} + + +DwwIteratorChildrenVerifier::DwwIteratorChildrenVerifier() + : WeightedChildrenVerifier(), + _helper() +{ + _helper.add_docs(getDocIdLimit()); + auto full_list = getExpectedDocIds(); + for (size_t i = 0; i < full_list.size(); ++i) { + _helper.set_doc(full_list[i], i % _num_children, 1); + } +} +DwwIteratorChildrenVerifier::~DwwIteratorChildrenVerifier() = default; + +SearchIterator::UP +DwwIteratorChildrenVerifier::create(bool strict) const { + (void) strict; + std::vector<DocidWithWeightIterator> children; + for (size_t i = 0; i < _num_children; ++i) { + auto dict_entry = _helper.dww().lookup(vespalib::make_string("%zu", i).c_str(), _helper.dww().get_dictionary_snapshot()); + _helper.dww().create(dict_entry.posting_idx, children); + } + return create(std::move(children)); +} +SearchIterator::UP +DwwIteratorChildrenVerifier::create(std::vector<DocidWithWeightIterator> &&) const { + return {}; +} + + +} diff --git a/searchlib/src/vespa/searchlib/test/weightedchildrenverifiers.h b/searchlib/src/vespa/searchlib/test/weightedchildrenverifiers.h index 86d2fb9aa67..037d1086950 100644 --- a/searchlib/src/vespa/searchlib/test/weightedchildrenverifiers.h +++ b/searchlib/src/vespa/searchlib/test/weightedchildrenverifiers.h @@ -8,11 +8,8 @@ namespace search::test { class WeightedChildrenVerifier : public SearchIteratorVerifier { public: - WeightedChildrenVerifier() - : _weights(_num_children, 1) - { } - ~WeightedChildrenVerifier() override {} - + WeightedChildrenVerifier(); + ~WeightedChildrenVerifier() override; protected: static constexpr size_t _num_children = 7; mutable fef::TermFieldMatchData _tfmd; @@ -21,58 +18,21 @@ protected: class IteratorChildrenVerifier : public WeightedChildrenVerifier { public: - IteratorChildrenVerifier() - : WeightedChildrenVerifier(), - _split_lists(_num_children) - { - auto full_list = getExpectedDocIds(); - for (size_t i = 0; i < full_list.size(); ++i) { - _split_lists[i % _num_children].push_back(full_list[i]); - } - } - ~IteratorChildrenVerifier() override { } - SearchIterator::UP create(bool strict) const override { - (void) strict; - std::vector<SearchIterator*> children; - for (size_t i = 0; i < _num_children; ++i) { - children.push_back(createIterator(_split_lists[i], true).release()); - } - return create(children); - } + IteratorChildrenVerifier(); + ~IteratorChildrenVerifier() override; + SearchIterator::UP create(bool strict) const override; protected: - virtual SearchIterator::UP create(const std::vector<SearchIterator*> &children) const { - (void) children; - return SearchIterator::UP(); - } + virtual SearchIterator::UP create(const std::vector<SearchIterator*> &children) const; std::vector<DocIds> _split_lists; }; class DwwIteratorChildrenVerifier : public WeightedChildrenVerifier { public: - DwwIteratorChildrenVerifier() : - WeightedChildrenVerifier(), - _helper() - { - _helper.add_docs(getDocIdLimit()); - auto full_list = getExpectedDocIds(); - for (size_t i = 0; i < full_list.size(); ++i) { - _helper.set_doc(full_list[i], i % _num_children, 1); - } - } - ~DwwIteratorChildrenVerifier() override {} - SearchIterator::UP create(bool strict) const override { - (void) strict; - std::vector<DocidWithWeightIterator> children; - for (size_t i = 0; i < _num_children; ++i) { - auto dict_entry = _helper.dww().lookup(vespalib::make_string("%zu", i).c_str(), _helper.dww().get_dictionary_snapshot()); - _helper.dww().create(dict_entry.posting_idx, children); - } - return create(std::move(children)); - } + DwwIteratorChildrenVerifier(); + ~DwwIteratorChildrenVerifier() override; + SearchIterator::UP create(bool strict) const override; protected: - virtual SearchIterator::UP create(std::vector<DocidWithWeightIterator> &&) const { - return {}; - } + virtual SearchIterator::UP create(std::vector<DocidWithWeightIterator> &&) const; DocumentWeightAttributeHelper _helper; }; diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.cpp b/searchlib/src/vespa/searchlib/util/token_extractor.cpp index a78f30afe21..6e1573c4551 100644 --- a/searchlib/src/vespa/searchlib/util/token_extractor.cpp +++ b/searchlib/src/vespa/searchlib/util/token_extractor.cpp @@ -143,8 +143,6 @@ TokenExtractor::extract(std::vector<SpanTerm>& terms, const document::StringFiel { auto tree = StringFieldValue::findTree(trees, SPANTREE_NAME); if (tree == nullptr) { - /* field might not be annotated if match type is exact */ - consider_word(terms, text, Span(0, text.size()), nullptr, doc); return; } for (const Annotation & annotation : *tree) { |