From cbf48b1199f094e3643eb05cb44d81ccfc91e0b0 Mon Sep 17 00:00:00 2001 From: HÃ¥vard Pettersen Date: Fri, 25 May 2018 11:40:25 +0000 Subject: SameElement blueprint and search iterator --- searchlib/CMakeLists.txt | 1 + .../tests/queryeval/same_element/CMakeLists.txt | 8 ++ .../queryeval/same_element/same_element_test.cpp | 99 +++++++++++++++++ .../src/vespa/searchlib/queryeval/CMakeLists.txt | 2 + .../searchlib/queryeval/same_element_blueprint.cpp | 82 +++++++++++++++ .../searchlib/queryeval/same_element_blueprint.h | 43 ++++++++ .../searchlib/queryeval/same_element_search.cpp | 117 +++++++++++++++++++++ .../searchlib/queryeval/same_element_search.h | 44 ++++++++ 8 files changed, 396 insertions(+) create mode 100644 searchlib/src/tests/queryeval/same_element/CMakeLists.txt create mode 100644 searchlib/src/tests/queryeval/same_element/same_element_test.cpp create mode 100644 searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp create mode 100644 searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.h create mode 100644 searchlib/src/vespa/searchlib/queryeval/same_element_search.cpp create mode 100644 searchlib/src/vespa/searchlib/queryeval/same_element_search.h diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index 3b321f4a12f..6e4a3ef1e1f 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -194,6 +194,7 @@ vespa_define_module( src/tests/queryeval/multibitvectoriterator src/tests/queryeval/parallel_weak_and src/tests/queryeval/predicate + src/tests/queryeval/same_element src/tests/queryeval/simple_phrase src/tests/queryeval/sourceblender src/tests/queryeval/sparse_vector_benchmark diff --git a/searchlib/src/tests/queryeval/same_element/CMakeLists.txt b/searchlib/src/tests/queryeval/same_element/CMakeLists.txt new file mode 100644 index 00000000000..97034c53376 --- /dev/null +++ b/searchlib/src/tests/queryeval/same_element/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(searchlib_same_element_test_app TEST + SOURCES + same_element_test.cpp + DEPENDS + searchlib +) +vespa_add_test(NAME searchlib_same_element_test_app COMMAND searchlib_same_element_test_app) diff --git a/searchlib/src/tests/queryeval/same_element/same_element_test.cpp b/searchlib/src/tests/queryeval/same_element/same_element_test.cpp new file mode 100644 index 00000000000..e0c20949c8e --- /dev/null +++ b/searchlib/src/tests/queryeval/same_element/same_element_test.cpp @@ -0,0 +1,99 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include +#include +#include +#include +#include + +using namespace search::fef; +using namespace search::queryeval; + +std::unique_ptr make_blueprint(const std::vector &children) { + auto result = std::make_unique(); + for (size_t i = 0; i < children.size(); ++i) { + uint32_t field_id = i; + vespalib::string field_name = vespalib::make_string("f%u", field_id); + FieldSpec field = result->getNextChildField(field_name, field_id); + result->addTerm(std::make_unique(field, children[i])); + } + return result; +} + +Blueprint::UP finalize(Blueprint::UP bp, bool strict) { + Blueprint::UP result = Blueprint::optimize(std::move(bp)); + result->fetchPostings(strict); + result->freeze(); + return result; +} + +SimpleResult find_matches(const std::vector &children) { + auto md = MatchData::makeTestInstance(0, 0); + auto bp = finalize(make_blueprint(children), false); + auto search = bp->createSearch(*md, false); + return SimpleResult().search(*search, 1000); +} + +FakeResult make_result(const std::vector > > &match_data) { + FakeResult result; + uint32_t pos_should_be_ignored = 0; + for (const auto &doc: match_data) { + result.doc(doc.first); + for (const auto &elem: doc.second) { + result.elem(elem).pos(++pos_should_be_ignored); + } + } + return result; +} + +TEST("require that simple match can be found") { + auto a = make_result({{5, {1,3,7}}}); + auto b = make_result({{5, {3,5,10}}}); + SimpleResult result = find_matches({a, b}); + SimpleResult expect({5}); + EXPECT_EQUAL(result, expect); +} + +TEST("require that children must match within same element") { + auto a = make_result({{5, {1,3,7}}}); + auto b = make_result({{5, {2,5,10}}}); + SimpleResult result = find_matches({a, b}); + SimpleResult expect({}); + EXPECT_EQUAL(result, expect); +} + +TEST("require that strict iterator seeks to next hit") { + auto md = MatchData::makeTestInstance(0, 0); + auto a = make_result({{5, {1,2}}, {7, {1,2}}, {8, {1,2}}, {9, {1,2}}}); + auto b = make_result({{5, {3}}, {6, {1,2}}, {7, {2,4}}, {9, {1}}}); + auto bp = finalize(make_blueprint({a,b}), true); + auto search = bp->createSearch(*md, true); + search->initRange(1, 1000); + EXPECT_LESS(search->getDocId(), 1u); + EXPECT_TRUE(!search->seek(1)); + EXPECT_EQUAL(search->getDocId(), 7u); + EXPECT_TRUE(search->seek(9)); + EXPECT_EQUAL(search->getDocId(), 9u); + EXPECT_TRUE(!search->seek(10)); + EXPECT_TRUE(search->isAtEnd()); +} + +TEST("require that results are estimated appropriately") { + auto a = make_result({{5, {0}}, {5, {0}}, {5, {0}}}); + auto b = make_result({{5, {0}}, {5, {0}}}); + auto c = make_result({{5, {0}}, {5, {0}}, {5, {0}}, {5, {0}}}); + auto bp = finalize(make_blueprint({a,b,c}), true); + EXPECT_EQUAL(bp->getState().estimate().estHits, 2u); +} + +TEST("require that children are sorted") { + auto a = make_result({{5, {0}}, {5, {0}}, {5, {0}}}); + auto b = make_result({{5, {0}}, {5, {0}}}); + auto c = make_result({{5, {0}}, {5, {0}}, {5, {0}}, {5, {0}}}); + auto bp = finalize(make_blueprint({a,b,c}), true); + EXPECT_EQUAL(dynamic_cast(*bp).terms()[0]->getState().estimate().estHits, 2u); + EXPECT_EQUAL(dynamic_cast(*bp).terms()[1]->getState().estimate().estHits, 3u); + EXPECT_EQUAL(dynamic_cast(*bp).terms()[2]->getState().estimate().estHits, 4u); +} + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt b/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt index 683de780107..ecda6d6d6ef 100644 --- a/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/queryeval/CMakeLists.txt @@ -33,6 +33,8 @@ vespa_add_library(searchlib_queryeval OBJECT predicate_blueprint.cpp predicate_search.cpp ranksearch.cpp + same_element_blueprint.cpp + same_element_search.cpp searchable.cpp searchiterator.cpp simple_phrase_blueprint.cpp diff --git a/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp b/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp new file mode 100644 index 00000000000..a563a288396 --- /dev/null +++ b/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.cpp @@ -0,0 +1,82 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "same_element_blueprint.h" +#include "same_element_search.h" +#include +#include +#include +#include + +namespace search::queryeval { + +SameElementBlueprint::SameElementBlueprint() + : ComplexLeafBlueprint(FieldSpecBaseList()), + _estimate(), + _layout(), + _terms() +{ +} + +FieldSpec +SameElementBlueprint::getNextChildField(const vespalib::string &field_name, uint32_t field_id) +{ + return FieldSpec(field_name, field_id, _layout.allocTermField(field_id), false); +} + +void +SameElementBlueprint::addTerm(Blueprint::UP term) +{ + const State &childState = term->getState(); + assert(childState.numFields() == 1); + HitEstimate childEst = childState.estimate(); + if (_terms.empty() || (childEst < _estimate)) { + _estimate = childEst; + setEstimate(_estimate); + } + _terms.push_back(std::move(term)); +} + +void +SameElementBlueprint::optimize_self() +{ + std::sort(_terms.begin(), _terms.end(), + [](const auto &a, const auto &b) + { + return (a->getState().estimate() < b->getState().estimate()); + }); +} + +void +SameElementBlueprint::fetchPostings(bool strict) +{ + for (size_t i = 0; i < _terms.size(); ++i) { + _terms[i]->fetchPostings(strict && (i == 0)); + } +} + +SearchIterator::UP +SameElementBlueprint::createLeafSearch(const search::fef::TermFieldMatchDataArray &tfmda, + bool strict) const +{ + (void) tfmda; + assert(!tfmda.valid()); + fef::MatchData::UP md = _layout.createMatchData(); + search::fef::TermFieldMatchDataArray childMatch; + std::vector children(_terms.size()); + for (size_t i = 0; i < _terms.size(); ++i) { + const State &childState = _terms[i]->getState(); + assert(childState.numFields() == 1); + childMatch.add(childState.field(0).resolve(*md)); + children[i] = _terms[i]->createSearch(*md, (strict && (i == 0))); + } + return std::make_unique(std::move(md), std::move(children), childMatch, strict); +} + +void +SameElementBlueprint::visitMembers(vespalib::ObjectVisitor &visitor) const +{ + ComplexLeafBlueprint::visitMembers(visitor); + visit(visitor, "terms", _terms); +} + +} diff --git a/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.h b/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.h new file mode 100644 index 00000000000..050a2bc31d4 --- /dev/null +++ b/searchlib/src/vespa/searchlib/queryeval/same_element_blueprint.h @@ -0,0 +1,43 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "searchable.h" +#include + +namespace search::fef { class TermFieldMatchData; } + +namespace search::queryeval { + +class SameElementBlueprint : public ComplexLeafBlueprint +{ +private: + HitEstimate _estimate; + fef::MatchDataLayout _layout; + std::vector _terms; + +public: + SameElementBlueprint(); + SameElementBlueprint(const SameElementBlueprint &) = delete; + SameElementBlueprint &operator=(const SameElementBlueprint &) = delete; + ~SameElementBlueprint() = default; + + // no match data + bool isWhiteList() const override { return true; } + + // used by create visitor + FieldSpec getNextChildField(const vespalib::string &field_name, uint32_t field_id); + + // used by create visitor + void addTerm(Blueprint::UP term); + + void optimize_self() override; + void fetchPostings(bool strict) override; + + SearchIteratorUP createLeafSearch(const search::fef::TermFieldMatchDataArray &tfmda, + bool strict) const override; + void visitMembers(vespalib::ObjectVisitor &visitor) const override; + const std::vector &terms() const { return _terms; } +}; + +} diff --git a/searchlib/src/vespa/searchlib/queryeval/same_element_search.cpp b/searchlib/src/vespa/searchlib/queryeval/same_element_search.cpp new file mode 100644 index 00000000000..8f3fc9c350d --- /dev/null +++ b/searchlib/src/vespa/searchlib/queryeval/same_element_search.cpp @@ -0,0 +1,117 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "same_element_search.h" +#include +#include +#include +#include +#include + +using TFMD = search::fef::TermFieldMatchData; + +namespace search::queryeval { + +namespace { + +template +int32_t try_match(const fef::TermFieldMatchDataArray &match, std::vector &iterators, uint32_t cand) { + for (size_t i = 0; i < iterators.size(); ++i) { + while ((iterators[i] != match[i]->end()) && (iterators[i]->getElementId() < cand)) { + ++iterators[i]; + } + if (iterators[i] == match[i]->end()) { + return -1; + } + if (iterators[i]->getElementId() != cand) { + return iterators[i]->getElementId(); + } + } + return cand; +} + +} + +bool +SameElementSearch::check_docid_match(uint32_t docid) +{ + for (const auto &child: _children) { + if (!child->seek(docid)) { + return false; + } + } + return true; +} + +void +SameElementSearch::unpack_children(uint32_t docid) +{ + for (const auto &child: _children) { + child->doUnpack(docid); + } + for (size_t i = 0; i < _childMatch.size(); ++i) { + _iterators[i] = _childMatch[i]->begin(); + } +} + +bool +SameElementSearch::check_element_match(uint32_t docid) +{ + unpack_children(docid); + int32_t cand = 0; + int32_t next = try_match(_childMatch, _iterators, cand); + while (next > cand) { + cand = next; + next = try_match(_childMatch, _iterators, cand); + } + return (cand == next); +} + +SameElementSearch::SameElementSearch(fef::MatchData::UP md, + std::vector children, + const fef::TermFieldMatchDataArray &childMatch, + bool strict) + : _md(std::move(md)), + _children(std::move(children)), + _childMatch(childMatch), + _iterators(childMatch.size()), + _strict(strict) +{ + assert(!_children.empty()); + assert(_childMatch.valid()); +} + +void +SameElementSearch::initRange(uint32_t begin_id, uint32_t end_id) +{ + SearchIterator::initRange(begin_id, end_id); + for (const auto &child: _children) { + child->initRange(begin_id, end_id); + } +} + +void +SameElementSearch::doSeek(uint32_t docid) { + if (check_docid_match(docid) && check_element_match(docid)) { + setDocId(docid); + } else if (_strict) { + docid = std::max(docid + 1, _children[0]->getDocId()); + while (!isAtEnd(docid)) { + if (check_docid_match(docid) && check_element_match(docid)) { + setDocId(docid); + return; + } + docid = std::max(docid + 1, _children[0]->getDocId()); + } + setAtEnd(); + } +} + +void +SameElementSearch::visitMembers(vespalib::ObjectVisitor &visitor) const +{ + SearchIterator::visitMembers(visitor); + visit(visitor, "children", _children); + visit(visitor, "strict", _strict); +} + +} diff --git a/searchlib/src/vespa/searchlib/queryeval/same_element_search.h b/searchlib/src/vespa/searchlib/queryeval/same_element_search.h new file mode 100644 index 00000000000..6a116c76e73 --- /dev/null +++ b/searchlib/src/vespa/searchlib/queryeval/same_element_search.h @@ -0,0 +1,44 @@ +// Copyright 2018 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "searchiterator.h" +#include +#include +#include +#include +#include + +namespace search::queryeval { + +/** + * Search iterator for a collection of terms that need to match within + * the same element (array index). + */ +class SameElementSearch : public SearchIterator +{ +private: + using It = fef::TermFieldMatchData::PositionsIterator; + + fef::MatchData::UP _md; + std::vector _children; + fef::TermFieldMatchDataArray _childMatch; + std::vector _iterators; + bool _strict; + + void unpack_children(uint32_t docid); + bool check_docid_match(uint32_t docid); + bool check_element_match(uint32_t docid); + +public: + SameElementSearch(fef::MatchData::UP md, + std::vector children, + const fef::TermFieldMatchDataArray &childMatch, + bool strict); + void initRange(uint32_t begin_id, uint32_t end_id) override; + void doSeek(uint32_t docid) override; + void doUnpack(uint32_t) override {} + void visitMembers(vespalib::ObjectVisitor &visitor) const override; +}; + +} -- cgit v1.2.3