diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /streamingvisitors |
Publish
Diffstat (limited to 'streamingvisitors')
53 files changed, 4432 insertions, 0 deletions
diff --git a/streamingvisitors/.gitignore b/streamingvisitors/.gitignore new file mode 100644 index 00000000000..31d449f16ee --- /dev/null +++ b/streamingvisitors/.gitignore @@ -0,0 +1,3 @@ +testrun +Makefile +Testing diff --git a/streamingvisitors/CMakeLists.txt b/streamingvisitors/CMakeLists.txt new file mode 100644 index 00000000000..6623f743d3b --- /dev/null +++ b/streamingvisitors/CMakeLists.txt @@ -0,0 +1,25 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_define_module( + DEPENDS + fastos + fastlib_fast + vespalog + storage + storageapi + config_cloudconfig + document + vespalib + vdslib + vsm + + EXTERNAL_DEPENDS + cppunit + + LIBS + src/vespa/searchvisitor + + TESTS + src/tests/hitcollector + src/tests/querywrapper + src/tests/searchvisitor +) diff --git a/streamingvisitors/OWNERS b/streamingvisitors/OWNERS new file mode 100644 index 00000000000..f62763ab1da --- /dev/null +++ b/streamingvisitors/OWNERS @@ -0,0 +1,2 @@ +balder +geirst diff --git a/streamingvisitors/doc/SearchVisitorProtocol.html b/streamingvisitors/doc/SearchVisitorProtocol.html new file mode 100644 index 00000000000..55100929ac9 --- /dev/null +++ b/streamingvisitors/doc/SearchVisitorProtocol.html @@ -0,0 +1,93 @@ +<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"> +<!-- Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
--> +<HTML> +<HEAD> + <META HTTP-EQUIV="CONTENT-TYPE" CONTENT="text/html; charset=utf-8"> + <TITLE></TITLE> + <META NAME="GENERATOR" CONTENT="OpenOffice.org 2.3 (Unix)"> + <META NAME="AUTHOR" CONTENT="Henning Baldersheim"> + <META NAME="CREATED" CONTENT="20080312;9103800"> + <META NAME="CHANGEDBY" CONTENT="Henning Baldersheim"> + <META NAME="CHANGED" CONTENT="20080314;8394700"> + <META NAME="CHANGEDBY" CONTENT="Henning Baldersheim"> + <META NAME="CHANGEDBY" CONTENT="Henning Baldersheim"> + <STYLE TYPE="text/css"> + <!-- + @page { size: 8.5in 11in; margin: 0.79in } + P { margin-bottom: 0.08in } + H1 { margin-bottom: 0.08in } + H1.western { font-family: "Helvetica"; font-size: 16pt } + H1.cjk { font-family: "AR PL ShanHeiSun Uni"; font-size: 16pt } + H1.ctl { font-family: "Tahoma"; font-size: 16pt } + H2 { margin-bottom: 0.08in } + H2.western { font-family: "Helvetica"; font-size: 14pt; font-style: italic } + H2.cjk { font-family: "AR PL ShanHeiSun Uni"; font-size: 14pt; font-style: italic } + H2.ctl { font-family: "Arial Unicode MS"; font-size: 14pt; font-style: italic } + --> + </STYLE> +</HEAD> +<BODY LANG="en-US" DIR="LTR"> +<H1 CLASS="western">SearchVisitor design</H1> +<P>The SearchVisitor is a visitor plugin running in the storaged +binary. It processes queries and docsum requests and returns +SearchResult and DocumentSummary objects to the client. It uses Vespa +Streaming Matcher (VSM) to generate the search results and document +summaries.</P> +<P>Since the distributors in VDS do not have fdispatch capability, +that is implemented in the QRS(client). It must collect all messages +received, merge them and present them to its liking.</P> +<H2 CLASS="western">Initiation</H2> +<P>The client sends down a createVisitor command with the following +parameters set:</P> +<P>Timeout : This is the query timeout.</P> +<P>VisitorLibrary: "SearchVisitor". 
Tells the framework to +use the SearchVisitor visitor plugin, rather than the default +DumpVisitor.</P> +<P>VisitorParameters: Containing the following arguments for the +SearchVisitor:</P> +<UL> + <LI><P>"query": The raw encoded query stack from QRS. It + has the same format as the query parameter in the QueryPacket sent + to indexed search.</P> + <LI><P>"searchcluster": This identifies which + searchcluster is queried. The visitor uses this to choose the + correct config.</P> + <LI><P>"summaryclass": Which summaryclass is wanted.</P> + <LI><P>"summarycount": The number of summaries wanted.</P> + <LI><P>"aggregation": The aggregation specification as specified + by the fs4 protocol. The aggregation options are the same as for + indexed search. + </P> + <LI><P>"sort": The sort specification as specified by the fs4 + protocol.</P> + <LI><P>"unique": The field to do duplicate removal on. + </P> + <LI><P>"rankprofile": Which rank profile to use. The default is 0.</P> + <LI><P>"rankproperties": A set of properties to use in ranking for the backend.</P> +</UL> +<P>The backend will return all hits, but only the requested number of +summaries. It is the client's responsibility to handle "hits" +and "offset" query parameters.</P> +<P>Only singlephase has been implemented as all the data are +available anyway.</P> +<P>"aggregate", "sort" and "unique" are not limited to +attributes as they are in indexed search. Every field has attribute +semantics in streamed search.</P> +<H2 CLASS="western">SearchResult</H2> +<P>This is the message returned to the client after the search has +been conducted.</P> +<P>It contains a list of Hits each containing the documentid as known +by VDS, and a rank identifying the relevance of the document with +respect to the query. 
The list is sorted on descending rank.</P> +<H2 CLASS="western">DocumentSummary</H2> +<P>This is the message returned to the client after all the document +summaries have been generated.</P> +<P>It contains a list of Summary objects, each containing the +documentId as known by VDS and the summary blob. The list is sorted +on ascending docid. The summary blob is encoded as specified by the +fastserver4 protocol.</P> +<H2 CLASS="western">AggregationResult</H2> +<P>This is the message returned containing the aggregation data. They +follow the format of the aggregation packet used in the fs4 protocol.</P> +</BODY> +</HTML> diff --git a/streamingvisitors/src/.gitignore b/streamingvisitors/src/.gitignore new file mode 100644 index 00000000000..f7611c3f5a8 --- /dev/null +++ b/streamingvisitors/src/.gitignore @@ -0,0 +1,6 @@ +.cvsignore +Makefile.ini +config_command.sh +doc +project.dsw +/streamingvisitors.mak diff --git a/streamingvisitors/src/testlist.txt b/streamingvisitors/src/testlist.txt new file mode 100644 index 00000000000..c4ff46e690c --- /dev/null +++ b/streamingvisitors/src/testlist.txt @@ -0,0 +1,3 @@ +tests/hitcollector +tests/querywrapper +tests/searchvisitor diff --git a/streamingvisitors/src/tests/hitcollector/.gitignore b/streamingvisitors/src/tests/hitcollector/.gitignore new file mode 100644 index 00000000000..2ffc5acb4c1 --- /dev/null +++ b/streamingvisitors/src/tests/hitcollector/.gitignore @@ -0,0 +1,4 @@ +.depend +Makefile +hitcollector_test +streamingvisitors_hitcollector_test_app diff --git a/streamingvisitors/src/tests/hitcollector/CMakeLists.txt b/streamingvisitors/src/tests/hitcollector/CMakeLists.txt new file mode 100644 index 00000000000..62c481e13a7 --- /dev/null +++ b/streamingvisitors/src/tests/hitcollector/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_executable(streamingvisitors_hitcollector_test_app + SOURCES + hitcollector.cpp + DEPENDS + streamingvisitors_searchvisitor +) +vespa_add_test(NAME streamingvisitors_hitcollector_test_app COMMAND streamingvisitors_hitcollector_test_app) diff --git a/streamingvisitors/src/tests/hitcollector/DESC b/streamingvisitors/src/tests/hitcollector/DESC new file mode 100644 index 00000000000..4933144da80 --- /dev/null +++ b/streamingvisitors/src/tests/hitcollector/DESC @@ -0,0 +1 @@ +Test of the hit collector used by the streaming searcher. diff --git a/streamingvisitors/src/tests/hitcollector/FILES b/streamingvisitors/src/tests/hitcollector/FILES new file mode 100644 index 00000000000..88a0d4ba4b3 --- /dev/null +++ b/streamingvisitors/src/tests/hitcollector/FILES @@ -0,0 +1 @@ +hitcollector.cpp diff --git a/streamingvisitors/src/tests/hitcollector/hitcollector.cpp b/streamingvisitors/src/tests/hitcollector/hitcollector.cpp new file mode 100644 index 00000000000..4e008211223 --- /dev/null +++ b/streamingvisitors/src/tests/hitcollector/hitcollector.cpp @@ -0,0 +1,314 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <vespa/vespalib/testkit/testapp.h> + +#include <vespa/document/fieldvalue/fieldvalues.h> +#include <vespa/searchlib/fef/matchdata.h> +#include <vespa/searchvisitor/hitcollector.h> +#include <vespa/vdslib/container/searchresult.h> +#include <vespa/vsm/common/storagedocument.h> + +LOG_SETUP("hitcollector_test"); + +using namespace document; +using namespace search::fef; +using namespace vespalib; +using namespace vdslib; +using namespace vsm; + +namespace storage { + +class HitCollectorTest : public vespalib::TestApp +{ +private: + void assertHit(SearchResult::RankType expRank, uint32_t hitNo, SearchResult & rs); + void assertHit(SearchResult::RankType expRank, uint32_t expDocId, uint32_t hitNo, SearchResult & rs); + void addHit(HitCollector &hc, uint32_t docId, double score, + const char *sortData = nullptr, size_t sortDataSize = 0); + void testSimple(); + void testGapsInDocId(); + void testHeapProperty(); + void testHeapPropertyWithSorting(); + void testEmpty(); + void testFeatureSet(); + + DocumentType _docType; + +public: + HitCollectorTest(); + int Main(); +}; + +HitCollectorTest::HitCollectorTest() + : _docType("testdoc", 0) +{ +} + +void +HitCollectorTest::assertHit(SearchResult::RankType expRank, uint32_t hitNo, SearchResult & rs) +{ + assertHit(expRank, hitNo, hitNo, rs); +} + +void +HitCollectorTest::assertHit(SearchResult::RankType expRank, uint32_t expDocId, uint32_t hitNo, SearchResult & rs) +{ + //std::cout << "assertHit(" << expRank << ", " << expDocId << ")" << std::endl; + uint32_t lDocId; + const char * gDocId; + SearchResult::RankType rank; + lDocId = rs.getHit(hitNo, gDocId, rank); + EXPECT_EQUAL(rank, expRank); + EXPECT_EQUAL(lDocId, expDocId); +} + +void +HitCollectorTest::addHit(HitCollector &hc, uint32_t docId, double score, const char *sortData, size_t sortDataSize) +{ + document::Document::UP doc(new document::Document(_docType, DocumentId("doc::"))); + 
StorageDocument::SP sdoc(new StorageDocument(std::move(doc))); + ASSERT_TRUE(sdoc->valid()); + MatchData md(MatchData::params()); + md.setDocId(docId); + hc.addHit(sdoc, md, score, sortData, sortDataSize); +} + +void +HitCollectorTest::testSimple() +{ + HitCollector hc(5); + + // add hits to hit collector + for (uint32_t i = 0; i < 5; ++i) { + addHit(hc, i, 10 + i); + } + // merge from match data heap and fill search result + for (size_t i = 0; i < 2; ++i) { // try it twice + SearchResult sr; + hc.fillSearchResult(sr); + ASSERT_TRUE(sr.getHitCount() == 5); + assertHit(10, 0, sr); + assertHit(11, 1, sr); + assertHit(12, 2, sr); + assertHit(13, 3, sr); + assertHit(14, 4, sr); + } +} + +void +HitCollectorTest::testGapsInDocId() +{ + HitCollector hc(5); + + // add hits to hit collector + for (uint32_t i = 0; i < 5; ++i) { + addHit(hc, i * 2, i * 2 + 10); + } + + // merge from heap into search result + SearchResult sr; + hc.fillSearchResult(sr); + + ASSERT_TRUE(sr.getHitCount() == 5); + assertHit(10, 0, 0, sr); + assertHit(12, 2, 1, sr); + assertHit(14, 4, 2, sr); + assertHit(16, 6, 3, sr); + assertHit(18, 8, 4, sr); +} + +void +HitCollectorTest::testHeapProperty() +{ + { + HitCollector hc(3); + // add hits (low to high) + for (uint32_t i = 0; i < 6; ++i) { + addHit(hc, i, i + 10); + } + SearchResult sr; + hc.fillSearchResult(sr); + ASSERT_TRUE(sr.getHitCount() == 3); + assertHit(13, 3, 0, sr); + assertHit(14, 4, 1, sr); + assertHit(15, 5, 2, sr); + } + { + HitCollector hc(3); + // add hits (high to low) + for (uint32_t i = 0; i < 6; ++i) { + addHit(hc, i, 10 - i); + } + SearchResult sr; + hc.fillSearchResult(sr); + ASSERT_TRUE(sr.getHitCount() == 3); + assertHit(10, 0, 0, sr); + assertHit(9, 1, 1, sr); + assertHit(8, 2, 2, sr); + } + { + HitCollector hc(3); + // add hits (same rank score) + for (uint32_t i = 0; i < 6; ++i) { + addHit(hc, i, 10); + } + SearchResult sr; + hc.fillSearchResult(sr); + ASSERT_TRUE(sr.getHitCount() == 3); + assertHit(10, 0, 0, sr); + 
assertHit(10, 1, 1, sr); + assertHit(10, 2, 2, sr); + } +} + +void +HitCollectorTest::testHeapPropertyWithSorting() +{ + std::vector<char> sortData; + sortData.push_back('a'); + sortData.push_back('b'); + sortData.push_back('c'); + sortData.push_back('d'); + sortData.push_back('e'); + sortData.push_back('f'); + { + HitCollector hc(3); + // add hits ('a' is sorted/ranked better than 'b') + for (uint32_t i = 0; i < 6; ++i) { + addHit(hc, i, i + 10, &sortData[i], 1); + } + SearchResult sr; + hc.fillSearchResult(sr); + ASSERT_TRUE(sr.getHitCount() == 3); + assertHit(10, 0, 0, sr); + assertHit(11, 1, 1, sr); + assertHit(12, 2, 2, sr); + } + { + HitCollector hc(3); + // add hits ('a' is sorted/ranked better than 'b') + for (uint32_t i = 0; i < 6; ++i) { + addHit(hc, i, i + 10, &sortData[5 - i], 1); + } + SearchResult sr; + hc.fillSearchResult(sr); + ASSERT_TRUE(sr.getHitCount() == 3); + assertHit(13, 3, 0, sr); + assertHit(14, 4, 1, sr); + assertHit(15, 5, 2, sr); + } + { + HitCollector hc(3); + // add hits (same sort blob) + for (uint32_t i = 0; i < 6; ++i) { + addHit(hc, i, 10, &sortData[0], 1); + } + SearchResult sr; + hc.fillSearchResult(sr); + ASSERT_TRUE(sr.getHitCount() == 3); + assertHit(10, 0, 0, sr); + assertHit(10, 1, 1, sr); + assertHit(10, 2, 2, sr); + } +} + +void +HitCollectorTest::testEmpty() +{ + HitCollector hc(0); + addHit(hc, 0, 0); + SearchResult rs; + hc.fillSearchResult(rs); + ASSERT_TRUE(rs.getHitCount() == 0); +} + +class MyRankProgram : public HitCollector::IRankProgram +{ +private: + MatchData _matchData; + +public: + MyRankProgram() : _matchData(MatchData::params().numFeatures(3)) {} + virtual const search::fef::MatchData &run(uint32_t docid, const std::vector<search::fef::TermFieldMatchData> &) override { + _matchData.setDocId(docid); + *_matchData.resolveFeature(0) = docid + 10; + *_matchData.resolveFeature(1) = docid + 20; + *_matchData.resolveFeature(2) = docid + 30; + return _matchData; + } +}; + +void +HitCollectorTest::testFeatureSet() 
+{ + HitCollector hc(3); + + addHit(hc, 0, 10); + addHit(hc, 1, 50); // on heap + addHit(hc, 2, 20); + addHit(hc, 3, 40); // on heap + addHit(hc, 4, 30); // on heap + + std::vector<vespalib::string> names; + std::vector<FeatureHandle> handles; + names.push_back("foo"); + names.push_back("bar"); + handles.push_back(0); + handles.push_back(2); + + MyRankProgram rankProgram; + search::FeatureSet::SP sf = hc.getFeatureSet(rankProgram, names, handles); + + EXPECT_EQUAL(sf->getNames().size(), 2u); + EXPECT_EQUAL(sf->getNames()[0], "foo"); + EXPECT_EQUAL(sf->getNames()[1], "bar"); + EXPECT_EQUAL(sf->numFeatures(), 2u); + EXPECT_EQUAL(sf->numDocs(), 3u); + { + const search::feature_t * f = sf->getFeaturesByDocId(1); + ASSERT_TRUE(f != NULL); + EXPECT_EQUAL(f[0], 11); // 10 + docId + EXPECT_EQUAL(f[1], 31); // 30 + docId + } + { + const search::feature_t * f = sf->getFeaturesByDocId(3); + ASSERT_TRUE(f != NULL); + EXPECT_EQUAL(f[0], 13); + EXPECT_EQUAL(f[1], 33); + } + { + const search::feature_t * f = sf->getFeaturesByDocId(4); + ASSERT_TRUE(f != NULL); + EXPECT_EQUAL(f[0], 14); + EXPECT_EQUAL(f[1], 34); + } + ASSERT_TRUE(sf->getFeaturesByDocId(0) == NULL); + ASSERT_TRUE(sf->getFeaturesByDocId(2) == NULL); + + SearchResult sr; + hc.fillSearchResult(sr); + ASSERT_TRUE(sr.getHitCount() == 3); + assertHit(50, 1, 0, sr); + assertHit(40, 3, 1, sr); + assertHit(30, 4, 2, sr); +} + +int +HitCollectorTest::Main() +{ + TEST_INIT("hitcollector_test"); + + testSimple(); + testGapsInDocId(); + testHeapProperty(); + testHeapPropertyWithSorting(); + testEmpty(); + testFeatureSet(); + + TEST_DONE(); +} + +} // namespace storage + +TEST_APPHOOK(storage::HitCollectorTest) diff --git a/streamingvisitors/src/tests/querywrapper/.gitignore b/streamingvisitors/src/tests/querywrapper/.gitignore new file mode 100644 index 00000000000..3c8a4b1c9f8 --- /dev/null +++ b/streamingvisitors/src/tests/querywrapper/.gitignore @@ -0,0 +1,4 @@ +.depend +Makefile +querywrapper_test 
+streamingvisitors_querywrapper_test_app diff --git a/streamingvisitors/src/tests/querywrapper/CMakeLists.txt b/streamingvisitors/src/tests/querywrapper/CMakeLists.txt new file mode 100644 index 00000000000..501b1eee1a9 --- /dev/null +++ b/streamingvisitors/src/tests/querywrapper/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(streamingvisitors_querywrapper_test_app + SOURCES + querywrapper.cpp + DEPENDS + streamingvisitors_searchvisitor +) +vespa_add_test(NAME streamingvisitors_querywrapper_test_app COMMAND streamingvisitors_querywrapper_test_app) diff --git a/streamingvisitors/src/tests/querywrapper/DESC b/streamingvisitors/src/tests/querywrapper/DESC new file mode 100644 index 00000000000..dfdd9d55a8e --- /dev/null +++ b/streamingvisitors/src/tests/querywrapper/DESC @@ -0,0 +1 @@ +Test of the query wrapper used by the rank processor. diff --git a/streamingvisitors/src/tests/querywrapper/FILES b/streamingvisitors/src/tests/querywrapper/FILES new file mode 100644 index 00000000000..91138d25b2e --- /dev/null +++ b/streamingvisitors/src/tests/querywrapper/FILES @@ -0,0 +1 @@ +querywrapper.cpp diff --git a/streamingvisitors/src/tests/querywrapper/querywrapper.cpp b/streamingvisitors/src/tests/querywrapper/querywrapper.cpp new file mode 100644 index 00000000000..b717962d920 --- /dev/null +++ b/streamingvisitors/src/tests/querywrapper/querywrapper.cpp @@ -0,0 +1,108 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/fastos/fastos.h> +#include <vespa/vespalib/testkit/testapp.h> +#include <iostream> +#include <vespa/searchlib/query/tree/querybuilder.h> +#include <vespa/searchlib/query/tree/simplequery.h> +#include <vespa/searchlib/query/tree/stackdumpcreator.h> + +#include <vespa/searchvisitor/querywrapper.h> + +using namespace search; +using namespace search::query; + +namespace storage { + +class QueryWrapperTest : public vespalib::TestApp +{ +private: + void testQueryWrapper(); + +public: + int Main(); +}; + +void +QueryWrapperTest::testQueryWrapper() +{ + EmptyQueryNodeResult empty; + PhraseQueryNode * null = NULL; + { + QueryBuilder<SimpleQueryNodeTypes> builder; + builder.addAnd(3); + { + builder.addStringTerm("a", "", 0, Weight(0)); + builder.addPhrase(3, "", 0, Weight(0)); + { + builder.addStringTerm("b", "", 0, Weight(0)); + builder.addStringTerm("c", "", 0, Weight(0)); + builder.addStringTerm("d", "", 0, Weight(0)); + } + builder.addStringTerm("e", "", 0, Weight(0)); + } + Node::UP node = builder.build(); + vespalib::string stackDump = StackDumpCreator::create(*node); + Query q(empty, stackDump); + QueryWrapper wrap(q); + QueryWrapper::TermList & tl = wrap.getTermList(); + + QueryTermList terms; + q.getLeafs(terms); + ASSERT_TRUE(tl.size() == 5 && terms.size() == 5); + for (size_t i = 0; i < 5; ++i) { + EXPECT_EQUAL(tl[i].getTerm(), terms[i]); + std::cout << "t[" << i << "]:" << terms[i] << std::endl; + } + + QueryNodeRefList phrases; + q.getPhrases(phrases); + for (size_t i = 0; i < phrases.size(); ++i) { + std::cout << "p[" << i << "]:" << phrases[i] << std::endl; + } + EXPECT_EQUAL(phrases.size(), 1u); + ASSERT_TRUE(phrases.size() == 1); + EXPECT_EQUAL(tl[0].getParent(), null); + EXPECT_EQUAL(tl[1].getParent(), phrases[0]); + EXPECT_EQUAL(tl[2].getParent(), phrases[0]); + EXPECT_EQUAL(tl[3].getParent(), phrases[0]); + EXPECT_EQUAL(tl[4].getParent(), null); + + EXPECT_EQUAL(tl[0].getIndex(), 0u); + EXPECT_EQUAL(tl[1].getIndex(), 0u); + 
EXPECT_EQUAL(tl[2].getIndex(), 1u); + EXPECT_EQUAL(tl[3].getIndex(), 2u); + EXPECT_EQUAL(tl[4].getIndex(), 0u); + + EXPECT_TRUE(!tl[0].isFirstPhraseTerm()); + EXPECT_TRUE( tl[1].isFirstPhraseTerm()); + EXPECT_TRUE(!tl[2].isFirstPhraseTerm()); + EXPECT_TRUE(!tl[3].isFirstPhraseTerm()); + EXPECT_TRUE(!tl[4].isFirstPhraseTerm()); + + EXPECT_TRUE(!tl[0].isPhraseTerm()); + EXPECT_TRUE( tl[1].isPhraseTerm()); + EXPECT_TRUE( tl[2].isPhraseTerm()); + EXPECT_TRUE( tl[3].isPhraseTerm()); + EXPECT_TRUE(!tl[4].isPhraseTerm()); + + EXPECT_EQUAL(tl[0].getPosAdjust(), 0u); + EXPECT_EQUAL(tl[1].getPosAdjust(), 2u); + EXPECT_EQUAL(tl[2].getPosAdjust(), 2u); + EXPECT_EQUAL(tl[3].getPosAdjust(), 2u); + EXPECT_EQUAL(tl[4].getPosAdjust(), 0u); + } +} + +int +QueryWrapperTest::Main() +{ + TEST_INIT("querywrapper_test"); + + testQueryWrapper(); + + TEST_DONE(); +} + +} // namespace storage + +TEST_APPHOOK(storage::QueryWrapperTest) diff --git a/streamingvisitors/src/tests/searchvisitor/.gitignore b/streamingvisitors/src/tests/searchvisitor/.gitignore new file mode 100644 index 00000000000..543f6428e3c --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/.gitignore @@ -0,0 +1,4 @@ +/.depend +/Makefile +/searchvisitor_test +streamingvisitors_searchvisitor_test_app diff --git a/streamingvisitors/src/tests/searchvisitor/CMakeLists.txt b/streamingvisitors/src/tests/searchvisitor/CMakeLists.txt new file mode 100644 index 00000000000..83abfde144a --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_executable(streamingvisitors_searchvisitor_test_app + SOURCES + searchvisitor.cpp + DEPENDS + streamingvisitors_searchvisitor +) +vespa_add_test(NAME streamingvisitors_searchvisitor_test_app COMMAND streamingvisitors_searchvisitor_test_app) diff --git a/streamingvisitors/src/tests/searchvisitor/cfg/documenttypes.cfg b/streamingvisitors/src/tests/searchvisitor/cfg/documenttypes.cfg new file mode 100644 index 00000000000..7bb863dc8e2 --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/cfg/documenttypes.cfg @@ -0,0 +1,317 @@ +enablecompression false +documenttype[0].id 1843830320 +documenttype[0].name "maptest" +documenttype[0].version 0 +documenttype[0].headerstruct -91088113 +documenttype[0].bodystruct -1659731740 +documenttype[0].inherits[0].id 8 +documenttype[0].datatype[0].id 3474528 +documenttype[0].datatype[0].type STRUCT +documenttype[0].datatype[0].array.element.id 0 +documenttype[0].datatype[0].map.key.id 0 +documenttype[0].datatype[0].map.value.id 0 +documenttype[0].datatype[0].wset.key.id 0 +documenttype[0].datatype[0].wset.createifnonexistent false +documenttype[0].datatype[0].wset.removeifzero false +documenttype[0].datatype[0].annotationref.annotation.id 0 +documenttype[0].datatype[0].sstruct.name "s1" +documenttype[0].datatype[0].sstruct.version 0 +documenttype[0].datatype[0].sstruct.compression.type NONE +documenttype[0].datatype[0].sstruct.compression.level 0 +documenttype[0].datatype[0].sstruct.compression.threshold 95 +documenttype[0].datatype[0].sstruct.compression.minsize 200 +documenttype[0].datatype[0].sstruct.field[0].name "a" +documenttype[0].datatype[0].sstruct.field[0].id 493339625 +documenttype[0].datatype[0].sstruct.field[0].id_v6 703514648 +documenttype[0].datatype[0].sstruct.field[0].datatype 2 +documenttype[0].datatype[0].sstruct.field[1].name "b" +documenttype[0].datatype[0].sstruct.field[1].id 441632370 +documenttype[0].datatype[0].sstruct.field[1].id_v6 1420600727 
+documenttype[0].datatype[0].sstruct.field[1].datatype 2 +documenttype[0].datatype[1].id 339965458 +documenttype[0].datatype[1].type MAP +documenttype[0].datatype[1].array.element.id 0 +documenttype[0].datatype[1].map.key.id 2 +documenttype[0].datatype[1].map.value.id 2 +documenttype[0].datatype[1].wset.key.id 0 +documenttype[0].datatype[1].wset.createifnonexistent false +documenttype[0].datatype[1].wset.removeifzero false +documenttype[0].datatype[1].annotationref.annotation.id 0 +documenttype[0].datatype[1].sstruct.name "" +documenttype[0].datatype[1].sstruct.version 0 +documenttype[0].datatype[1].sstruct.compression.type NONE +documenttype[0].datatype[1].sstruct.compression.level 0 +documenttype[0].datatype[1].sstruct.compression.threshold 95 +documenttype[0].datatype[1].sstruct.compression.minsize 200 +documenttype[0].datatype[2].id 1888564261 +documenttype[0].datatype[2].type MAP +documenttype[0].datatype[2].array.element.id 0 +documenttype[0].datatype[2].map.key.id 2 +documenttype[0].datatype[2].map.value.id 3474528 +documenttype[0].datatype[2].wset.key.id 0 +documenttype[0].datatype[2].wset.createifnonexistent false +documenttype[0].datatype[2].wset.removeifzero false +documenttype[0].datatype[2].annotationref.annotation.id 0 +documenttype[0].datatype[2].sstruct.name "" +documenttype[0].datatype[2].sstruct.version 0 +documenttype[0].datatype[2].sstruct.compression.type NONE +documenttype[0].datatype[2].sstruct.compression.level 0 +documenttype[0].datatype[2].sstruct.compression.threshold 95 +documenttype[0].datatype[2].sstruct.compression.minsize 200 +documenttype[0].datatype[3].id -1486737430 +documenttype[0].datatype[3].type ARRAY +documenttype[0].datatype[3].array.element.id 2 +documenttype[0].datatype[3].map.key.id 0 +documenttype[0].datatype[3].map.value.id 0 +documenttype[0].datatype[3].wset.key.id 0 +documenttype[0].datatype[3].wset.createifnonexistent false +documenttype[0].datatype[3].wset.removeifzero false 
+documenttype[0].datatype[3].annotationref.annotation.id 0 +documenttype[0].datatype[3].sstruct.name "" +documenttype[0].datatype[3].sstruct.version 0 +documenttype[0].datatype[3].sstruct.compression.type NONE +documenttype[0].datatype[3].sstruct.compression.level 0 +documenttype[0].datatype[3].sstruct.compression.threshold 95 +documenttype[0].datatype[3].sstruct.compression.minsize 200 +documenttype[0].datatype[4].id -1220861393 +documenttype[0].datatype[4].type MAP +documenttype[0].datatype[4].array.element.id 0 +documenttype[0].datatype[4].map.key.id 2 +documenttype[0].datatype[4].map.value.id -1486737430 +documenttype[0].datatype[4].wset.key.id 0 +documenttype[0].datatype[4].wset.createifnonexistent false +documenttype[0].datatype[4].wset.removeifzero false +documenttype[0].datatype[4].annotationref.annotation.id 0 +documenttype[0].datatype[4].sstruct.name "" +documenttype[0].datatype[4].sstruct.version 0 +documenttype[0].datatype[4].sstruct.compression.type NONE +documenttype[0].datatype[4].sstruct.compression.level 0 +documenttype[0].datatype[4].sstruct.compression.threshold 95 +documenttype[0].datatype[4].sstruct.compression.minsize 200 +documenttype[0].datatype[5].id 1070047409 +documenttype[0].datatype[5].type MAP +documenttype[0].datatype[5].array.element.id 0 +documenttype[0].datatype[5].map.key.id 2 +documenttype[0].datatype[5].map.value.id 339965458 +documenttype[0].datatype[5].wset.key.id 0 +documenttype[0].datatype[5].wset.createifnonexistent false +documenttype[0].datatype[5].wset.removeifzero false +documenttype[0].datatype[5].annotationref.annotation.id 0 +documenttype[0].datatype[5].sstruct.name "" +documenttype[0].datatype[5].sstruct.version 0 +documenttype[0].datatype[5].sstruct.compression.type NONE +documenttype[0].datatype[5].sstruct.compression.level 0 +documenttype[0].datatype[5].sstruct.compression.threshold 95 +documenttype[0].datatype[5].sstruct.compression.minsize 200 +documenttype[0].datatype[6].id -91088113 
+documenttype[0].datatype[6].type STRUCT +documenttype[0].datatype[6].array.element.id 0 +documenttype[0].datatype[6].map.key.id 0 +documenttype[0].datatype[6].map.value.id 0 +documenttype[0].datatype[6].wset.key.id 0 +documenttype[0].datatype[6].wset.createifnonexistent false +documenttype[0].datatype[6].wset.removeifzero false +documenttype[0].datatype[6].annotationref.annotation.id 0 +documenttype[0].datatype[6].sstruct.name "maptest.header" +documenttype[0].datatype[6].sstruct.version 0 +documenttype[0].datatype[6].sstruct.compression.type NONE +documenttype[0].datatype[6].sstruct.compression.level 0 +documenttype[0].datatype[6].sstruct.compression.threshold 95 +documenttype[0].datatype[6].sstruct.compression.minsize 200 +documenttype[0].datatype[6].sstruct.field[0].name "name" +documenttype[0].datatype[6].sstruct.field[0].id 1160796772 +documenttype[0].datatype[6].sstruct.field[0].id_v6 774203930 +documenttype[0].datatype[6].sstruct.field[0].datatype 2 +documenttype[0].datatype[6].sstruct.field[1].name "m1" +documenttype[0].datatype[6].sstruct.field[1].id 656260193 +documenttype[0].datatype[6].sstruct.field[1].id_v6 1013611640 +documenttype[0].datatype[6].sstruct.field[1].datatype 339965458 +documenttype[0].datatype[6].sstruct.field[2].name "m2" +documenttype[0].datatype[6].sstruct.field[2].id 1105173090 +documenttype[0].datatype[6].sstruct.field[2].id_v6 1026497887 +documenttype[0].datatype[6].sstruct.field[2].datatype 1888564261 +documenttype[0].datatype[6].sstruct.field[3].name "m3" +documenttype[0].datatype[6].sstruct.field[3].id 1834987989 +documenttype[0].datatype[6].sstruct.field[3].id_v6 1365320273 +documenttype[0].datatype[6].sstruct.field[3].datatype -1220861393 +documenttype[0].datatype[6].sstruct.field[4].name "m4" +documenttype[0].datatype[6].sstruct.field[4].id 1696105521 +documenttype[0].datatype[6].sstruct.field[4].id_v6 1636310067 +documenttype[0].datatype[6].sstruct.field[4].datatype 1070047409 +documenttype[0].datatype[7].id -1659731740 
+documenttype[0].datatype[7].type STRUCT +documenttype[0].datatype[7].array.element.id 0 +documenttype[0].datatype[7].map.key.id 0 +documenttype[0].datatype[7].map.value.id 0 +documenttype[0].datatype[7].wset.key.id 0 +documenttype[0].datatype[7].wset.createifnonexistent false +documenttype[0].datatype[7].wset.removeifzero false +documenttype[0].datatype[7].annotationref.annotation.id 0 +documenttype[0].datatype[7].sstruct.name "maptest.body" +documenttype[0].datatype[7].sstruct.version 0 +documenttype[0].datatype[7].sstruct.compression.type NONE +documenttype[0].datatype[7].sstruct.compression.level 0 +documenttype[0].datatype[7].sstruct.compression.threshold 95 +documenttype[0].datatype[7].sstruct.compression.minsize 200 +documenttype[1].id -753106277 +documenttype[1].name "maptest_search" +documenttype[1].version 0 +documenttype[1].headerstruct 919697476 +documenttype[1].bodystruct -125720743 +documenttype[1].inherits[0].id 8 +documenttype[1].datatype[0].id 3474528 +documenttype[1].datatype[0].type STRUCT +documenttype[1].datatype[0].array.element.id 0 +documenttype[1].datatype[0].map.key.id 0 +documenttype[1].datatype[0].map.value.id 0 +documenttype[1].datatype[0].wset.key.id 0 +documenttype[1].datatype[0].wset.createifnonexistent false +documenttype[1].datatype[0].wset.removeifzero false +documenttype[1].datatype[0].annotationref.annotation.id 0 +documenttype[1].datatype[0].sstruct.name "s1" +documenttype[1].datatype[0].sstruct.version 0 +documenttype[1].datatype[0].sstruct.compression.type NONE +documenttype[1].datatype[0].sstruct.compression.level 0 +documenttype[1].datatype[0].sstruct.compression.threshold 95 +documenttype[1].datatype[0].sstruct.compression.minsize 200 +documenttype[1].datatype[0].sstruct.field[0].name "a" +documenttype[1].datatype[0].sstruct.field[0].id 493339625 +documenttype[1].datatype[0].sstruct.field[0].id_v6 703514648 +documenttype[1].datatype[0].sstruct.field[0].datatype 2 +documenttype[1].datatype[0].sstruct.field[1].name "b" 
+documenttype[1].datatype[0].sstruct.field[1].id 441632370 +documenttype[1].datatype[0].sstruct.field[1].id_v6 1420600727 +documenttype[1].datatype[0].sstruct.field[1].datatype 2 +documenttype[1].datatype[1].id 339965458 +documenttype[1].datatype[1].type MAP +documenttype[1].datatype[1].array.element.id 0 +documenttype[1].datatype[1].map.key.id 2 +documenttype[1].datatype[1].map.value.id 2 +documenttype[1].datatype[1].wset.key.id 0 +documenttype[1].datatype[1].wset.createifnonexistent false +documenttype[1].datatype[1].wset.removeifzero false +documenttype[1].datatype[1].annotationref.annotation.id 0 +documenttype[1].datatype[1].sstruct.name "" +documenttype[1].datatype[1].sstruct.version 0 +documenttype[1].datatype[1].sstruct.compression.type NONE +documenttype[1].datatype[1].sstruct.compression.level 0 +documenttype[1].datatype[1].sstruct.compression.threshold 95 +documenttype[1].datatype[1].sstruct.compression.minsize 200 +documenttype[1].datatype[2].id 1888564261 +documenttype[1].datatype[2].type MAP +documenttype[1].datatype[2].array.element.id 0 +documenttype[1].datatype[2].map.key.id 2 +documenttype[1].datatype[2].map.value.id 3474528 +documenttype[1].datatype[2].wset.key.id 0 +documenttype[1].datatype[2].wset.createifnonexistent false +documenttype[1].datatype[2].wset.removeifzero false +documenttype[1].datatype[2].annotationref.annotation.id 0 +documenttype[1].datatype[2].sstruct.name "" +documenttype[1].datatype[2].sstruct.version 0 +documenttype[1].datatype[2].sstruct.compression.type NONE +documenttype[1].datatype[2].sstruct.compression.level 0 +documenttype[1].datatype[2].sstruct.compression.threshold 95 +documenttype[1].datatype[2].sstruct.compression.minsize 200 +documenttype[1].datatype[3].id -1486737430 +documenttype[1].datatype[3].type ARRAY +documenttype[1].datatype[3].array.element.id 2 +documenttype[1].datatype[3].map.key.id 0 +documenttype[1].datatype[3].map.value.id 0 +documenttype[1].datatype[3].wset.key.id 0 
+documenttype[1].datatype[3].wset.createifnonexistent false +documenttype[1].datatype[3].wset.removeifzero false +documenttype[1].datatype[3].annotationref.annotation.id 0 +documenttype[1].datatype[3].sstruct.name "" +documenttype[1].datatype[3].sstruct.version 0 +documenttype[1].datatype[3].sstruct.compression.type NONE +documenttype[1].datatype[3].sstruct.compression.level 0 +documenttype[1].datatype[3].sstruct.compression.threshold 95 +documenttype[1].datatype[3].sstruct.compression.minsize 200 +documenttype[1].datatype[4].id -1220861393 +documenttype[1].datatype[4].type MAP +documenttype[1].datatype[4].array.element.id 0 +documenttype[1].datatype[4].map.key.id 2 +documenttype[1].datatype[4].map.value.id -1486737430 +documenttype[1].datatype[4].wset.key.id 0 +documenttype[1].datatype[4].wset.createifnonexistent false +documenttype[1].datatype[4].wset.removeifzero false +documenttype[1].datatype[4].annotationref.annotation.id 0 +documenttype[1].datatype[4].sstruct.name "" +documenttype[1].datatype[4].sstruct.version 0 +documenttype[1].datatype[4].sstruct.compression.type NONE +documenttype[1].datatype[4].sstruct.compression.level 0 +documenttype[1].datatype[4].sstruct.compression.threshold 95 +documenttype[1].datatype[4].sstruct.compression.minsize 200 +documenttype[1].datatype[5].id 1070047409 +documenttype[1].datatype[5].type MAP +documenttype[1].datatype[5].array.element.id 0 +documenttype[1].datatype[5].map.key.id 2 +documenttype[1].datatype[5].map.value.id 339965458 +documenttype[1].datatype[5].wset.key.id 0 +documenttype[1].datatype[5].wset.createifnonexistent false +documenttype[1].datatype[5].wset.removeifzero false +documenttype[1].datatype[5].annotationref.annotation.id 0 +documenttype[1].datatype[5].sstruct.name "" +documenttype[1].datatype[5].sstruct.version 0 +documenttype[1].datatype[5].sstruct.compression.type NONE +documenttype[1].datatype[5].sstruct.compression.level 0 +documenttype[1].datatype[5].sstruct.compression.threshold 95 
+documenttype[1].datatype[5].sstruct.compression.minsize 200 +documenttype[1].datatype[6].id 919697476 +documenttype[1].datatype[6].type STRUCT +documenttype[1].datatype[6].array.element.id 0 +documenttype[1].datatype[6].map.key.id 0 +documenttype[1].datatype[6].map.value.id 0 +documenttype[1].datatype[6].wset.key.id 0 +documenttype[1].datatype[6].wset.createifnonexistent false +documenttype[1].datatype[6].wset.removeifzero false +documenttype[1].datatype[6].annotationref.annotation.id 0 +documenttype[1].datatype[6].sstruct.name "maptest_search.header" +documenttype[1].datatype[6].sstruct.version 0 +documenttype[1].datatype[6].sstruct.compression.type NONE +documenttype[1].datatype[6].sstruct.compression.level 0 +documenttype[1].datatype[6].sstruct.compression.threshold 95 +documenttype[1].datatype[6].sstruct.compression.minsize 200 +documenttype[1].datatype[6].sstruct.field[0].name "name" +documenttype[1].datatype[6].sstruct.field[0].id 1160796772 +documenttype[1].datatype[6].sstruct.field[0].id_v6 774203930 +documenttype[1].datatype[6].sstruct.field[0].datatype 2 +documenttype[1].datatype[6].sstruct.field[1].name "m1" +documenttype[1].datatype[6].sstruct.field[1].id 656260193 +documenttype[1].datatype[6].sstruct.field[1].id_v6 1013611640 +documenttype[1].datatype[6].sstruct.field[1].datatype 339965458 +documenttype[1].datatype[6].sstruct.field[2].name "m2" +documenttype[1].datatype[6].sstruct.field[2].id 1105173090 +documenttype[1].datatype[6].sstruct.field[2].id_v6 1026497887 +documenttype[1].datatype[6].sstruct.field[2].datatype 1888564261 +documenttype[1].datatype[6].sstruct.field[3].name "m3" +documenttype[1].datatype[6].sstruct.field[3].id 1834987989 +documenttype[1].datatype[6].sstruct.field[3].id_v6 1365320273 +documenttype[1].datatype[6].sstruct.field[3].datatype -1220861393 +documenttype[1].datatype[6].sstruct.field[4].name "m4" +documenttype[1].datatype[6].sstruct.field[4].id 1696105521 +documenttype[1].datatype[6].sstruct.field[4].id_v6 1636310067 
+documenttype[1].datatype[6].sstruct.field[4].datatype 1070047409 +documenttype[1].datatype[6].sstruct.field[5].name "rankfeatures" +documenttype[1].datatype[6].sstruct.field[5].id 1883197392 +documenttype[1].datatype[6].sstruct.field[5].id_v6 699950698 +documenttype[1].datatype[6].sstruct.field[5].datatype 2 +documenttype[1].datatype[6].sstruct.field[6].name "summaryfeatures" +documenttype[1].datatype[6].sstruct.field[6].id 1840337115 +documenttype[1].datatype[6].sstruct.field[6].id_v6 1981648971 +documenttype[1].datatype[6].sstruct.field[6].datatype 2 +documenttype[1].datatype[7].id -125720743 +documenttype[1].datatype[7].type STRUCT +documenttype[1].datatype[7].array.element.id 0 +documenttype[1].datatype[7].map.key.id 0 +documenttype[1].datatype[7].map.value.id 0 +documenttype[1].datatype[7].wset.key.id 0 +documenttype[1].datatype[7].wset.createifnonexistent false +documenttype[1].datatype[7].wset.removeifzero false +documenttype[1].datatype[7].annotationref.annotation.id 0 +documenttype[1].datatype[7].sstruct.name "maptest_search.body" +documenttype[1].datatype[7].sstruct.version 0 +documenttype[1].datatype[7].sstruct.compression.type NONE +documenttype[1].datatype[7].sstruct.compression.level 0 +documenttype[1].datatype[7].sstruct.compression.threshold 95 +documenttype[1].datatype[7].sstruct.compression.minsize 200 diff --git a/streamingvisitors/src/tests/searchvisitor/cfg/juniperrc.aaa.cfg b/streamingvisitors/src/tests/searchvisitor/cfg/juniperrc.aaa.cfg new file mode 100644 index 00000000000..09e13ed0f8b --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/cfg/juniperrc.aaa.cfg @@ -0,0 +1,11 @@ +length 256 +max_match_candidates 1000 +max_matches 3 +min_length 128 +prefix true +stem_max_extend 3 +stem_min_length 5 +surround_max 128 +winsize 200 +winsize_fallback_multiplier 10.0 +override[0] diff --git a/streamingvisitors/src/tests/searchvisitor/cfg/juniperrc.simple.cfg b/streamingvisitors/src/tests/searchvisitor/cfg/juniperrc.simple.cfg new file 
mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/cfg/juniperrc.simple.cfg diff --git a/streamingvisitors/src/tests/searchvisitor/cfg/rank-profiles.aaa.cfg b/streamingvisitors/src/tests/searchvisitor/cfg/rank-profiles.aaa.cfg new file mode 100644 index 00000000000..cf3ee6a7179 --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/cfg/rank-profiles.aaa.cfg @@ -0,0 +1,43 @@ +rankprofile[2] +rankprofile[0].name "default" +rankprofile[0].fef.property[15] +rankprofile[0].fef.property[00].name "vespa.summary.feature" +rankprofile[0].fef.property[00].value "fieldTermMatch(m1.key,0).firstPosition" +rankprofile[0].fef.property[01].name "vespa.summary.feature" +rankprofile[0].fef.property[01].value "fieldTermMatch(m1.key,0).occurrences" +rankprofile[0].fef.property[02].name "vespa.summary.feature" +rankprofile[0].fef.property[02].value "fieldLength(m1.key)" +rankprofile[0].fef.property[03].name "vespa.summary.feature" +rankprofile[0].fef.property[03].value "fieldTermMatch(m1.value,0).firstPosition" +rankprofile[0].fef.property[04].name "vespa.summary.feature" +rankprofile[0].fef.property[04].value "fieldTermMatch(m1.value,0).occurrences" +rankprofile[0].fef.property[05].name "vespa.summary.feature" +rankprofile[0].fef.property[05].value "fieldLength(m1.value)" +rankprofile[0].fef.property[06].name "vespa.summary.feature" +rankprofile[0].fef.property[06].value "fieldTermMatch(m2.value.a,0).firstPosition" +rankprofile[0].fef.property[07].name "vespa.summary.feature" +rankprofile[0].fef.property[07].value "fieldTermMatch(m2.value.a,0).occurrences" +rankprofile[0].fef.property[08].name "vespa.summary.feature" +rankprofile[0].fef.property[08].value "fieldLength(m2.value.a)" +rankprofile[0].fef.property[09].name "vespa.summary.feature" +rankprofile[0].fef.property[09].value "fieldTermMatch(m3.value,0).firstPosition" +rankprofile[0].fef.property[10].name "vespa.summary.feature" +rankprofile[0].fef.property[10].value 
"fieldTermMatch(m3.value,0).occurrences" +rankprofile[0].fef.property[11].name "vespa.summary.feature" +rankprofile[0].fef.property[11].value "fieldLength(m3.value)" +rankprofile[0].fef.property[12].name "vespa.summary.feature" +rankprofile[0].fef.property[12].value "fieldTermMatch(m4.value.value,0).firstPosition" +rankprofile[0].fef.property[13].name "vespa.summary.feature" +rankprofile[0].fef.property[13].value "fieldTermMatch(m4.value.value,0).occurrences" +rankprofile[0].fef.property[14].name "vespa.summary.feature" +rankprofile[0].fef.property[14].value "fieldLength(m4.value.value)" +rankprofile[1].name "unranked" +rankprofile[1].fef.property[4] +rankprofile[1].fef.property[0].name "vespa.rank.firstphase" +rankprofile[1].fef.property[0].value "value(0)" +rankprofile[1].fef.property[1].name "vespa.hitcollector.heapsize" +rankprofile[1].fef.property[1].value "0" +rankprofile[1].fef.property[2].name "vespa.hitcollector.arraysize" +rankprofile[1].fef.property[2].value "0" +rankprofile[1].fef.property[3].name "vespa.dump.ignoredefaultfeatures" +rankprofile[1].fef.property[3].value "true" diff --git a/streamingvisitors/src/tests/searchvisitor/cfg/rank-profiles.simple.cfg b/streamingvisitors/src/tests/searchvisitor/cfg/rank-profiles.simple.cfg new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/cfg/rank-profiles.simple.cfg diff --git a/streamingvisitors/src/tests/searchvisitor/cfg/summary.aaa.cfg b/streamingvisitors/src/tests/searchvisitor/cfg/summary.aaa.cfg new file mode 100644 index 00000000000..8cc32ed5ea4 --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/cfg/summary.aaa.cfg @@ -0,0 +1,30 @@ +defaultsummaryid 197963550 +classes[2] +classes[0].id 190911431 +classes[0].name "attributeprefetch" +classes[0].fields[3] +classes[0].fields[0].name "name" +classes[0].fields[0].type "longstring" +classes[0].fields[1].name "rankfeatures" +classes[0].fields[1].type "longstring" 
+classes[0].fields[2].name "summaryfeatures" +classes[0].fields[2].type "longstring" +classes[1].id 197963550 +classes[1].name "maptest" +classes[1].fields[8] +classes[1].fields[0].name "documentid" +classes[1].fields[0].type "longstring" +classes[1].fields[1].name "m1" +classes[1].fields[1].type "jsonstring" +classes[1].fields[2].name "m2" +classes[1].fields[2].type "jsonstring" +classes[1].fields[3].name "m3" +classes[1].fields[3].type "jsonstring" +classes[1].fields[4].name "m4" +classes[1].fields[4].type "jsonstring" +classes[1].fields[5].name "name" +classes[1].fields[5].type "longstring" +classes[1].fields[6].name "rankfeatures" +classes[1].fields[6].type "longstring" +classes[1].fields[7].name "summaryfeatures" +classes[1].fields[7].type "longstring" diff --git a/streamingvisitors/src/tests/searchvisitor/cfg/summary.simple.cfg b/streamingvisitors/src/tests/searchvisitor/cfg/summary.simple.cfg new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/cfg/summary.simple.cfg diff --git a/streamingvisitors/src/tests/searchvisitor/cfg/summarymap.aaa.cfg b/streamingvisitors/src/tests/searchvisitor/cfg/summarymap.aaa.cfg new file mode 100644 index 00000000000..1c9567431ae --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/cfg/summarymap.aaa.cfg @@ -0,0 +1,11 @@ +defaultoutputclass -1 +override[3] +override[0].arguments "name" +override[0].command "attribute" +override[0].field "name" +override[1].arguments "" +override[1].command "rankfeatures" +override[1].field "rankfeatures" +override[2].arguments "" +override[2].command "summaryfeatures" +override[2].field "summaryfeatures" diff --git a/streamingvisitors/src/tests/searchvisitor/cfg/summarymap.simple.cfg b/streamingvisitors/src/tests/searchvisitor/cfg/summarymap.simple.cfg new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/cfg/summarymap.simple.cfg diff --git 
a/streamingvisitors/src/tests/searchvisitor/cfg/vsmfields.aaa.cfg b/streamingvisitors/src/tests/searchvisitor/cfg/vsmfields.aaa.cfg new file mode 100644 index 00000000000..4d976764bd9 --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/cfg/vsmfields.aaa.cfg @@ -0,0 +1,98 @@ +documentverificationlevel 0 +searchall 1 +documenttype[1] +documenttype[0].name "maptest" +documenttype[0].index[17] +documenttype[0].index[00].name "m1" +documenttype[0].index[00].field[2] +documenttype[0].index[00].field[0].name "m1.key" +documenttype[0].index[00].field[1].name "m1.value" +documenttype[0].index[01].name "m1.key" +documenttype[0].index[01].field[1] +documenttype[0].index[01].field[0].name "m1.key" +documenttype[0].index[02].name "m1.value" +documenttype[0].index[02].field[1] +documenttype[0].index[02].field[0].name "m1.value" +documenttype[0].index[03].name "m2" +documenttype[0].index[03].field[3] +documenttype[0].index[03].field[0].name "m2.key" +documenttype[0].index[03].field[1].name "m2.value.a" +documenttype[0].index[03].field[2].name "m2.value.b" +documenttype[0].index[04].name "m2.key" +documenttype[0].index[04].field[1] +documenttype[0].index[04].field[0].name "m2.key" +documenttype[0].index[05].name "m2.value" +documenttype[0].index[05].field[2] +documenttype[0].index[05].field[0].name "m2.value.a" +documenttype[0].index[05].field[1].name "m2.value.b" +documenttype[0].index[06].name "m2.value.a" +documenttype[0].index[06].field[1] +documenttype[0].index[06].field[0].name "m2.value.a" +documenttype[0].index[07].name "m2.value.b" +documenttype[0].index[07].field[1] +documenttype[0].index[07].field[0].name "m2.value.b" +documenttype[0].index[08].name "m3" +documenttype[0].index[08].field[2] +documenttype[0].index[08].field[0].name "m3.key" +documenttype[0].index[08].field[1].name "m3.value" +documenttype[0].index[09].name "m3.key" +documenttype[0].index[09].field[1] +documenttype[0].index[09].field[0].name "m3.key" +documenttype[0].index[10].name "m3.value" 
+documenttype[0].index[10].field[1] +documenttype[0].index[10].field[0].name "m3.value" +documenttype[0].index[11].name "m4" +documenttype[0].index[11].field[3] +documenttype[0].index[11].field[0].name "m4.key" +documenttype[0].index[11].field[1].name "m4.value.key" +documenttype[0].index[11].field[2].name "m4.value.value" +documenttype[0].index[12].name "m4.key" +documenttype[0].index[12].field[1] +documenttype[0].index[12].field[0].name "m4.key" +documenttype[0].index[13].name "m4.value" +documenttype[0].index[13].field[2] +documenttype[0].index[13].field[0].name "m4.value.key" +documenttype[0].index[13].field[1].name "m4.value.value" +documenttype[0].index[14].name "m4.value.key" +documenttype[0].index[14].field[1] +documenttype[0].index[14].field[0].name "m4.value.key" +documenttype[0].index[15].name "m4.value.value" +documenttype[0].index[15].field[1] +documenttype[0].index[15].field[0].name "m4.value.value" +documenttype[0].index[16].name "name" +documenttype[0].index[16].field[1] +documenttype[0].index[16].field[0].name "name" +fieldspec[11] +fieldspec[00].arg1 "" +fieldspec[00].name "m1.key" +fieldspec[00].searchmethod AUTOUTF8 +fieldspec[01].arg1 "" +fieldspec[01].name "m1.value" +fieldspec[01].searchmethod AUTOUTF8 +fieldspec[02].arg1 "" +fieldspec[02].name "m2.key" +fieldspec[02].searchmethod AUTOUTF8 +fieldspec[03].arg1 "" +fieldspec[03].name "m2.value.a" +fieldspec[03].searchmethod AUTOUTF8 +fieldspec[04].arg1 "" +fieldspec[04].name "m2.value.b" +fieldspec[04].searchmethod AUTOUTF8 +fieldspec[05].arg1 "" +fieldspec[05].name "m3.key" +fieldspec[05].searchmethod AUTOUTF8 +fieldspec[06].arg1 "" +fieldspec[06].name "m3.value" +fieldspec[06].searchmethod AUTOUTF8 +fieldspec[07].arg1 "" +fieldspec[07].name "m4.key" +fieldspec[07].searchmethod AUTOUTF8 +fieldspec[08].arg1 "" +fieldspec[08].name "m4.value.key" +fieldspec[08].searchmethod AUTOUTF8 +fieldspec[09].arg1 "" +fieldspec[09].name "m4.value.value" +fieldspec[09].searchmethod AUTOUTF8 
+fieldspec[10].arg1 "" +fieldspec[10].name "name" +fieldspec[10].searchmethod AUTOUTF8 diff --git a/streamingvisitors/src/tests/searchvisitor/cfg/vsmfields.simple.cfg b/streamingvisitors/src/tests/searchvisitor/cfg/vsmfields.simple.cfg new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/cfg/vsmfields.simple.cfg diff --git a/streamingvisitors/src/tests/searchvisitor/cfg/vsmsummary.aaa.cfg b/streamingvisitors/src/tests/searchvisitor/cfg/vsmsummary.aaa.cfg new file mode 100644 index 00000000000..664778d40a8 --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/cfg/vsmsummary.aaa.cfg @@ -0,0 +1,26 @@ +outputclass "" +fieldmap[6] +fieldmap[0].command NONE +fieldmap[0].summary "m1" +fieldmap[0].document[1] +fieldmap[0].document[0].field "m1" +fieldmap[1].command NONE +fieldmap[1].summary "m2" +fieldmap[1].document[3] +fieldmap[1].document[0].field "m2.key" +fieldmap[1].document[1].field "m2.value.a" +fieldmap[1].document[2].field "m2.value.b" +fieldmap[2].command NONE +fieldmap[2].summary "m3" +fieldmap[2].document[1] +fieldmap[2].document[0].field "m3" +fieldmap[3].command NONE +fieldmap[3].summary "m4" +fieldmap[3].document[1] +fieldmap[3].document[0].field "m4" +fieldmap[4].command NONE +fieldmap[4].summary "rankfeatures" +fieldmap[4].document[0] +fieldmap[5].command NONE +fieldmap[5].summary "summaryfeatures" +fieldmap[5].document[0] diff --git a/streamingvisitors/src/tests/searchvisitor/cfg/vsmsummary.simple.cfg b/streamingvisitors/src/tests/searchvisitor/cfg/vsmsummary.simple.cfg new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/cfg/vsmsummary.simple.cfg diff --git a/streamingvisitors/src/tests/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/tests/searchvisitor/searchvisitor.cpp new file mode 100644 index 00000000000..77df70ad256 --- /dev/null +++ b/streamingvisitors/src/tests/searchvisitor/searchvisitor.cpp @@ -0,0 +1,123 @@ 
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fastos/fastos.h> +#include <vespa/vespalib/testkit/testapp.h> +#include <vespa/searchlib/query/tree/querybuilder.h> +#include <vespa/searchlib/query/tree/simplequery.h> +#include <vespa/searchlib/query/tree/stackdumpcreator.h> +#include <vespa/searchvisitor/searchenvironment.h> +#include <vespa/searchvisitor/searchvisitor.h> +#include <vespa/storage/frameworkimpl/component/storagecomponentregisterimpl.h> +#include <vespa/storageframework/defaultimplementation/clock/fakeclock.h> + +using namespace search; +using namespace search::query; +using namespace document; + +namespace storage { + +class SearchVisitorTest : public vespalib::TestApp +{ +private: + framework::defaultimplementation::FakeClock _clock; + StorageComponentRegisterImpl _componentRegister; + std::unique_ptr<StorageComponent> _component; + SearchEnvironment _env; + void testSearchVisitor(); + void testSearchEnvironment(); + void testCreateSearchVisitor(const vespalib::string & dir, const vdslib::Parameters & parameters); + void testOnlyRequireWeakReadConsistency(); + +public: + SearchVisitorTest(); + int Main(); +}; + +SearchVisitorTest::SearchVisitorTest() : + vespalib::TestApp(), + _componentRegister(), + _env("dir:cfg") +{ + _componentRegister.setNodeInfo("mycluster", lib::NodeType::STORAGE, 1); + _componentRegister.setClock(_clock); + StorageComponent::DocumentTypeRepoSP repo(new DocumentTypeRepo(readDocumenttypesConfig("cfg/documenttypes.cfg"))); + _componentRegister.setDocumentTypeRepo(repo); + _component.reset(new StorageComponent(_componentRegister, "storage")); +}; + +std::vector<spi::DocEntry::LP> +createDocuments(const vespalib::string & dir) +{ + (void) dir; + std::vector<spi::DocEntry::LP> documents; + spi::Timestamp ts; + document::Document::UP doc(new document::Document()); + spi::DocEntry::LP e(new spi::DocEntry(ts, 0, std::move(doc))); + 
documents.push_back(e); + return documents; +} + +void +SearchVisitorTest::testCreateSearchVisitor(const vespalib::string & dir, const vdslib::Parameters & params) +{ + SearchVisitorFactory sFactory(dir); + VisitorFactory & factory(sFactory); + std::unique_ptr<Visitor> sv(static_cast<SearchVisitor *>(factory.makeVisitor(*_component, _env, params))); + document::OrderingSpecification orderSpec; + document::BucketId bucketId; + std::vector<spi::DocEntry::LP> documents(createDocuments(dir)); + Visitor::HitCounter hitCounter(&orderSpec); + sv->handleDocuments(bucketId, documents, hitCounter); +} + +void +SearchVisitorTest::testSearchEnvironment() +{ + EXPECT_TRUE(_env.getVSMAdapter("simple") != NULL); + EXPECT_TRUE(_env.getRankManager("simple") != NULL); +} + +void +SearchVisitorTest::testSearchVisitor() +{ + vdslib::Parameters params; + params.set("searchcluster", "aaa"); + params.set("queryflags", "0x40000"); + params.set("summarycount", "3"); + params.set("summaryclass", "petra"); + params.set("rankprofile", "default"); + + QueryBuilder<SimpleQueryNodeTypes> builder; + builder.addStringTerm("maptest", "sddocname", 0, Weight(0)); + Node::UP node = builder.build(); + vespalib::string stackDump = StackDumpCreator::create(*node); + + params.set("query", stackDump); + testCreateSearchVisitor("dir:cfg", params); +} + +void +SearchVisitorTest::testOnlyRequireWeakReadConsistency() +{ + SearchVisitorFactory factory("dir:cfg"); + VisitorFactory& factoryBase(factory); + vdslib::Parameters params; + std::unique_ptr<Visitor> sv( + factoryBase.makeVisitor(*_component, _env, params)); + EXPECT_TRUE(sv->getRequiredReadConsistency() == spi::ReadConsistency::WEAK); +} + +int +SearchVisitorTest::Main() +{ + TEST_INIT("searchvisitor_test"); + + testSearchVisitor(); TEST_FLUSH(); + testSearchEnvironment(); TEST_FLUSH(); + testOnlyRequireWeakReadConsistency(); TEST_FLUSH(); + + TEST_DONE(); +} + +} // namespace storage + +TEST_APPHOOK(storage::SearchVisitorTest) diff --git 
a/streamingvisitors/src/vespa/searchvisitor/.gitignore b/streamingvisitors/src/vespa/searchvisitor/.gitignore new file mode 100644 index 00000000000..a01e73125b3 --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/.gitignore @@ -0,0 +1,9 @@ +*.So +*.lo +.*.swp +.depend +.depend.NEW +.deps +.libs +Makefile +/libsearchvisitor.so.5.1 diff --git a/streamingvisitors/src/vespa/searchvisitor/CMakeLists.txt b/streamingvisitors/src/vespa/searchvisitor/CMakeLists.txt new file mode 100644 index 00000000000..f31bae302d1 --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/CMakeLists.txt @@ -0,0 +1,15 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_library(streamingvisitors_searchvisitor + SOURCES + hitcollector.cpp + indexenvironment.cpp + queryenvironment.cpp + querytermdata.cpp + querywrapper.cpp + rankmanager.cpp + rankprocessor.cpp + searchenvironment.cpp + searchvisitor.cpp + INSTALL lib64 + DEPENDS +) diff --git a/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp b/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp new file mode 100644 index 00000000000..10b752adf9e --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/hitcollector.cpp @@ -0,0 +1,149 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".searchvisitor.hitcollector"); +#include "hitcollector.h" +#include <stdexcept> + +using search::FeatureSet; +using search::fef::MatchData; +using vdslib::SearchResult; + +namespace storage { + +HitCollector::HitCollector(size_t wantedHits) : + _hits(), + _sortedByDocId(true) +{ + _hits.reserve(wantedHits); +} + +const vsm::Document & +HitCollector::getDocSum(const search::DocumentIdT & docId) const +{ + for (HitVector::const_iterator it(_hits.begin()), mt(_hits.end()); it < mt; it++) { + if (docId == it->getDocId()) { + return *it->getDocument(); + } + } + throw std::runtime_error(vespalib::make_string("Could not look up document id %d", docId)); +} + +bool +HitCollector::addHit(const vsm::StorageDocument::SP & doc, const search::fef::MatchData & data, double score) +{ + Hit h(doc, data, score); + return addHit(h); +} + +bool +HitCollector::addHit(const vsm::StorageDocument::SP & doc, const search::fef::MatchData & data, + double score, const void * sortData, size_t sortDataLen) +{ + Hit h(doc, data, score, sortData, sortDataLen); + return addHit(h); +} + +void +HitCollector::sortByDocId() +{ + if (!_sortedByDocId) { + std::sort(_hits.begin(), _hits.end()); // sort on docId + _sortedByDocId = true; + } +} + +bool +HitCollector::addHitToHeap(const Hit & hit) const +{ + // return true if the given hit is better than the current worst one. + return (hit.getSortBlob().empty()) + ? (hit.cmpRank(_hits[0]) < 0) + : (hit.cmpSort(_hits[0]) < 0); +} + +bool +HitCollector::addHit(const Hit & hit) +{ + bool amongTheBest(false); + ssize_t avail = (_hits.capacity() - _hits.size()); + bool useSortBlob( ! hit.getSortBlob().empty() ); + if (avail > 1) { + // No heap yet. 
+ _hits.push_back(hit); + amongTheBest = true; + } else if (_hits.capacity() == 0) { + // this happens when wantedHitCount = 0 + // in this case we shall not put anything on the heap (which is empty) + } else if ( avail == 0 && addHitToHeap(hit)) { // already a heap + if (useSortBlob) { + std::pop_heap(_hits.begin(), _hits.end(), Hit::SortComparator()); + } else { + std::pop_heap(_hits.begin(), _hits.end(), Hit::RankComparator()); + } + + _hits.back() = hit; + amongTheBest = true; + + if (useSortBlob) { + std::push_heap(_hits.begin(), _hits.end(), Hit::SortComparator()); + } else { + std::push_heap(_hits.begin(), _hits.end(), Hit::RankComparator()); + } + } else if (avail == 1) { // make a heap of the hit vector + _hits.push_back(hit); + amongTheBest = true; + if (useSortBlob) { + std::make_heap(_hits.begin(), _hits.end(), Hit::SortComparator()); + } else { + std::make_heap(_hits.begin(), _hits.end(), Hit::RankComparator()); + } + _sortedByDocId = false; // the hit vector is no longer sorted by docId + } + return amongTheBest; +} + +void +HitCollector::fillSearchResult(vdslib::SearchResult & searchResult) +{ + sortByDocId(); + for (HitVector::const_iterator it(_hits.begin()), mt(_hits.end()); it != mt; it++) { + vespalib::string documentId(it->getDocument()->docDoc().getId().toString()); + search::DocumentIdT docId = it->getDocId(); + SearchResult::RankType rank = it->getRankScore(); + + LOG(debug, "fillSearchResult: gDocId(%s), lDocId(%u), rank(%f)", documentId.c_str(), docId, (float)rank); + + if (it->getSortBlob().empty()) { + searchResult.addHit(docId, documentId.c_str(), rank); + } else { + searchResult.addHit(docId, documentId.c_str(), rank, it->getSortBlob().c_str(), it->getSortBlob().size()); + } + } +} + +FeatureSet::SP +HitCollector::getFeatureSet(IRankProgram &rankProgram, + const std::vector<vespalib::string> & names, + const std::vector<search::fef::FeatureHandle> & handles) +{ + if (names.empty() || _hits.empty()) { + return FeatureSet::SP(new 
FeatureSet()); + } + sortByDocId(); + FeatureSet::SP retval = FeatureSet::SP(new FeatureSet(names, _hits.size())); + for (HitVector::iterator it(_hits.begin()), mt(_hits.end()); it != mt; ++it) { + const MatchData &matchData = rankProgram.run(it->getDocId(), it->getMatchData()); + uint32_t docId = matchData.getDocId(); + search::feature_t * f = retval->getFeaturesByIndex(retval->addDocId(docId)); + for (uint32_t j = 0; j < names.size(); ++j) { + f[j] = *matchData.resolveFeature(handles[j]); + LOG(debug, "getFeatureSet: lDocId(%u), '%s': %f", docId, names[j].c_str(), f[j]); + } + } + return retval; +} + +} // namespace storage + diff --git a/streamingvisitors/src/vespa/searchvisitor/hitcollector.h b/streamingvisitors/src/vespa/searchvisitor/hitcollector.h new file mode 100644 index 00000000000..11c799f1c06 --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/hitcollector.h @@ -0,0 +1,145 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchlib/common/featureset.h> +#include <vespa/searchlib/fef/matchdata.h> +#include <vespa/vdslib/container/searchresult.h> +#include <vespa/vsm/common/docsum.h> +#include <vespa/vsm/common/storagedocument.h> +#include <vespa/vespalib/stllike/string.h> + +namespace storage { + +/** + * This class is used to store hits and MatchData objects for the m best hits. 
+ **/ +class HitCollector : public vsm::IDocSumCache +{ +private: + class Hit + { + public: + Hit(const vsm::StorageDocument::SP & doc, const search::fef::MatchData & matchData, + double score, const void * sortData, size_t sortDataLen) : + _docid(matchData.getDocId()), + _score(score), + _document(doc), + _matchData(), + _sortBlob(sortData, sortDataLen) + { + _matchData.reserve(matchData.getNumTermFields()); + for (search::fef::TermFieldHandle handle = 0; handle < matchData.getNumTermFields(); ++handle) { + _matchData.emplace_back(*matchData.resolveTermField(handle)); + } + } + Hit(const vsm::StorageDocument::SP & doc, const search::fef::MatchData & matchData, double score) + : Hit(doc, matchData, score, nullptr, 0) {} + search::DocumentIdT getDocId() const { return _docid; } + const vsm::StorageDocument::SP & getDocument() const { return _document; } + const std::vector<search::fef::TermFieldMatchData> &getMatchData() const { return _matchData; } + search::feature_t getRankScore() const { return _score; } + const vespalib::string & getSortBlob() const { return _sortBlob; } + bool operator < (const Hit & b) const { return getDocId() < b.getDocId(); } + int cmpDocId(const Hit & b) const { return getDocId() - b.getDocId(); } + int cmpRank(const Hit & b) const { + return (getRankScore() > b.getRankScore()) ? + -1 : ((getRankScore() < b.getRankScore()) ? 1 : cmpDocId(b)); + } + int cmpSort(const Hit & b) const { + int diff = _sortBlob.compare(b._sortBlob.c_str(), b._sortBlob.size()); + return (diff == 0) ? 
cmpDocId(b) : diff; + } + class RankComparator { + public: + RankComparator() {} + bool operator() (const Hit & lhs, const Hit & rhs) const { + return lhs.cmpRank(rhs) < 0; + } + }; + class SortComparator { + public: + SortComparator() {} + bool operator() (const Hit & lhs, const Hit & rhs) const { + return lhs.cmpSort(rhs) < 0; + } + }; + + private: + uint32_t _docid; + double _score; + vsm::StorageDocument::SP _document; + std::vector<search::fef::TermFieldMatchData> _matchData; + vespalib::string _sortBlob; + }; + typedef std::vector<Hit> HitVector; + HitVector _hits; + bool _sortedByDocId; // flag for whether the hit vector is sorted on docId + + void sortByDocId(); + bool addHitToHeap(const Hit & hit) const; + bool addHit(const Hit & hit); + +public: + typedef std::unique_ptr<HitCollector> UP; + + struct IRankProgram { + virtual ~IRankProgram() {} + virtual const search::fef::MatchData &run(uint32_t docid, const std::vector<search::fef::TermFieldMatchData> &matchData) = 0; + }; + + HitCollector(size_t wantedHits); + + virtual const vsm::Document & getDocSum(const search::DocumentIdT & docId) const; + + /** + * Adds a hit to this hit collector. + * Make sure that the hits are added in increasing local docId order. + * If you add a NULL document you should not use getDocSum() or fillSearchResult(), + * as these functions expect valid documents. + * + * @param doc The document that is a hit. + * @param data The match data for the hit. + * @return true if the document was added to the heap + **/ + bool addHit(const vsm::StorageDocument::SP & doc, const search::fef::MatchData & data, double score); + + /** + * Adds a hit to this hit collector. + * Make sure that the hits are added in increasing local docId order. + * If you add a NULL document you should not use getDocSum() or fillSearchResult(), + * as these functions expect valid documents. + * + * @param doc The document that is a hit. + * @param data The match data for the hit. 
+ * @param sortData The buffer of the sortdata. + * @param sortDataLen The length of the sortdata. + * @return true if the document was added to the heap + **/ + bool addHit(const vsm::StorageDocument::SP & doc, const search::fef::MatchData & data, + double score, const void * sortData, size_t sortDataLen); + + /** + * Fills the given search result with the m best hits from the hit heap. + * Invoking this method will destroy the heap property of the hit heap. + **/ + void fillSearchResult(vdslib::SearchResult & searchResult); + + /** + * Extract features from the hits stored in the hit heap. + * Invoking this method will destroy the heap property of the hit heap. + * Note that this method will calculate any additional features. + * + * @return features for all hits on the heap. + * @param rankProgram the rank program used to calculate all features. + * @param names names of all features. + * @param handles handles of all features. + **/ + search::FeatureSet::SP getFeatureSet(IRankProgram &rankProgram, + const std::vector<vespalib::string> & names, + const std::vector<search::fef::FeatureHandle> & handles); + +}; + +} // namespace storage + diff --git a/streamingvisitors/src/vespa/searchvisitor/indexenvironment.cpp b/streamingvisitors/src/vespa/searchvisitor/indexenvironment.cpp new file mode 100644 index 00000000000..1dc6a096f2e --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/indexenvironment.cpp @@ -0,0 +1,37 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".searchvisitor.indexenvironment"); +#include "indexenvironment.h" + +using namespace search::fef; + +namespace storage { + +IndexEnvironment::IndexEnvironment(const ITableManager & tableManager) : + _tableManager(&tableManager), + _properties(), + _fields(), + _fieldNames(), + _motivation(RANK), + _rankAttributes(), + _dumpAttributes() +{ +} + +bool +IndexEnvironment::addField(const vespalib::string & name, bool isAttribute) +{ + if (getFieldByName(name) != NULL) { + return false; + } + FieldInfo info(isAttribute ? FieldType::ATTRIBUTE : FieldType::INDEX, CollectionType::SINGLE, name, _fields.size()); + info.addAttribute(); // we are able to produce needed attributes at query time + _fields.push_back(info); + _fieldNames[info.name()] = info.id(); + return true; +} + +} // namespace storage + diff --git a/streamingvisitors/src/vespa/searchvisitor/indexenvironment.h b/streamingvisitors/src/vespa/searchvisitor/indexenvironment.h new file mode 100644 index 00000000000..5eabf0525e8 --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/indexenvironment.h @@ -0,0 +1,95 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchlib/fef/iindexenvironment.h> +#include <vespa/searchlib/fef/itablemanager.h> +#include <vespa/searchlib/fef/properties.h> +#include <vespa/searchlib/fef/fieldinfo.h> +#include <vespa/searchlib/fef/fieldtype.h> +#include <vespa/vespalib/stllike/string.h> +#include <vespa/vespalib/stllike/hash_map.h> +#include <set> + +namespace storage { + +/** + * Implementation of the feature execution framework + * index environment API for the search visitor. 
+ **/ +class IndexEnvironment : public search::fef::IIndexEnvironment +{ +private: + typedef vespalib::hash_map<vespalib::string, uint32_t> StringInt32Map; + const search::fef::ITableManager * _tableManager; + search::fef::Properties _properties; + std::vector<search::fef::FieldInfo> _fields; + StringInt32Map _fieldNames; + mutable FeatureMotivation _motivation; + mutable std::set<vespalib::string> _rankAttributes; + mutable std::set<vespalib::string> _dumpAttributes; + +public: + IndexEnvironment(const search::fef::ITableManager & tableManager); + + // inherit documentation + virtual const search::fef::Properties & getProperties() const { return _properties; } + + // inherit documentation + virtual uint32_t getNumFields() const { return _fields.size(); } + + // inherit documentation + virtual const search::fef::FieldInfo * getField(uint32_t id) const { + if (id >= _fields.size()) { + return NULL; + } + return &_fields[id]; + } + + // inherit documentation + virtual const search::fef::FieldInfo * getFieldByName(const string & name) const { + StringInt32Map::const_iterator itr = _fieldNames.find(name); + if (itr == _fieldNames.end()) { + return NULL; + } + return getField(itr->second); + } + + // inherit documentation + virtual const search::fef::ITableManager & getTableManager() const { return *_tableManager; } + + virtual FeatureMotivation getFeatureMotivation() const override { + return _motivation; + } + + // inherit documentation + virtual void hintFeatureMotivation(FeatureMotivation motivation) const { + _motivation = motivation; + } + + // inherit documentation + virtual void hintFieldAccess(uint32_t) const {} + + // inherit documentation + virtual void hintAttributeAccess(const string & name) const { + if (name.empty()) { + return; + } + if (_motivation == RANK) { + _rankAttributes.insert(name); + } else { + _dumpAttributes.insert(name); + } + } + + bool addField(const vespalib::string & name, bool isAttribute); + + search::fef::Properties & getProperties() 
{ return _properties; } + + const std::set<vespalib::string> & getHintedRankAttributes() const { return _rankAttributes; } + + const std::set<vespalib::string> & getHintedDumpAttributes() const { return _dumpAttributes; } +}; + +} // namespace storage + diff --git a/streamingvisitors/src/vespa/searchvisitor/queryenvironment.cpp b/streamingvisitors/src/vespa/searchvisitor/queryenvironment.cpp new file mode 100644 index 00000000000..ca90df395fd --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/queryenvironment.cpp @@ -0,0 +1,61 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".searchvisitor.queryenvironment"); +#include "queryenvironment.h" +#include <vespa/searchlib/common/location.h> + +using search::IAttributeManager; +using search::fef::Properties; +using vespalib::string; + +namespace storage { + +namespace { + +search::fef::Location parseLocation(const string & location_str) +{ + search::fef::Location fefLocation; + if (location_str.empty()) { + return fefLocation; + } + string::size_type pos = location_str.find(':'); + if (pos == string::npos) { + LOG(warning, "Location string lacks attribute vector specification. loc='%s'. Location ignored.", + location_str.c_str()); + return fefLocation; + } + string attr = location_str.substr(0, pos); + const string location = location_str.substr(pos + 1); + + search::common::Location locationSpec; + if (!locationSpec.parse(location)) { + LOG(warning, "Location parse error (location: '%s'): %s. 
Location ignored.", + location.c_str(), locationSpec.getParseError()); + return fefLocation; + } + fefLocation.setAttribute(attr); + fefLocation.setXPosition(locationSpec.getX()); + fefLocation.setYPosition(locationSpec.getY()); + fefLocation.setXAspect(locationSpec.getXAspect()); + fefLocation.setValid(true); + return fefLocation; +} + +} + +QueryEnvironment::QueryEnvironment(const string & location_str, + const IndexEnvironment & indexEnv, + const Properties & properties, + const IAttributeManager * attrMgr) : + _indexEnv(indexEnv), + _properties(properties), + _attrCtx(attrMgr->createContext()), + _queryTerms(), + _location(parseLocation(location_str)) +{ +} + +} // namespace storage + diff --git a/streamingvisitors/src/vespa/searchvisitor/queryenvironment.h b/streamingvisitors/src/vespa/searchvisitor/queryenvironment.h new file mode 100644 index 00000000000..48d0ef7645a --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/queryenvironment.h @@ -0,0 +1,63 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchcommon/attribute/iattributecontext.h> +#include <vespa/searchlib/attribute/iattributemanager.h> +#include <vespa/searchlib/fef/iindexenvironment.h> +#include <vespa/searchlib/fef/iqueryenvironment.h> +#include <vespa/searchlib/fef/location.h> +#include <vespa/searchlib/fef/properties.h> +#include "indexenvironment.h" + +namespace storage { + +/** + * Implementation of the feature execution framework + * query environment API for the search visitor. 
+ **/ +class QueryEnvironment : public search::fef::IQueryEnvironment +{ +private: + const IndexEnvironment &_indexEnv; + const search::fef::Properties &_properties; + search::attribute::IAttributeContext::UP _attrCtx; + std::vector<const search::fef::ITermData *> _queryTerms; + search::fef::Location _location; + +public: + typedef std::unique_ptr<QueryEnvironment> UP; + + QueryEnvironment(const vespalib::string & location, + const IndexEnvironment & indexEnv, + const search::fef::Properties & properties, + const search::IAttributeManager * attrMgr = NULL); + + // inherit documentation + virtual const search::fef::Properties & getProperties() const { return _properties; } + + // inherit documentation + virtual uint32_t getNumTerms() const { return _queryTerms.size(); } + + // inherit documentation + virtual const search::fef::ITermData *getTerm(uint32_t idx) const { + if (idx >= _queryTerms.size()) { + return NULL; + } + return _queryTerms[idx]; + } + + // inherit documentation + virtual const search::fef::Location & getLocation() const { return _location; } + + // inherit documentation + virtual const search::attribute::IAttributeContext & getAttributeContext() const { return *_attrCtx; } + + // inherit documentation + virtual const search::fef::IIndexEnvironment & getIndexEnvironment() const { return _indexEnv; } + + void addTerm(const search::fef::ITermData *term) { _queryTerms.push_back(term); } +}; + +} // namespace storage + diff --git a/streamingvisitors/src/vespa/searchvisitor/querytermdata.cpp b/streamingvisitors/src/vespa/searchvisitor/querytermdata.cpp new file mode 100644 index 00000000000..7b2bcd1e4d5 --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/querytermdata.cpp @@ -0,0 +1,15 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".searchvisitor.querytermdata"); +#include "querytermdata.h" + +using namespace search::fef; + +namespace storage { + +IMPLEMENT_DUPLICATE(QueryTermData); + +} // namespace storage + diff --git a/streamingvisitors/src/vespa/searchvisitor/querytermdata.h b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h new file mode 100644 index 00000000000..ef6d96e6e39 --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/querytermdata.h @@ -0,0 +1,30 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchlib/fef/matchdatalayout.h> +#include <vespa/searchlib/fef/simpletermdata.h> +#include <vespa/searchlib/query/querynoderesultbase.h> + +namespace storage { + +/** + * This class keeps data for a query term that is used by the ranking framework. + **/ +class QueryTermData : public search::QueryNodeResultBase +{ +private: + search::fef::SimpleTermData _termData; + +public: + DUPLICATE(QueryTermData); // create duplicate function + + virtual bool evaluate() const { return true; } + virtual void reset() {} + virtual bool getRewriteFloatTerms() const { return true; } + + search::fef::SimpleTermData &getTermData() { return _termData; } +}; + +} // namespace storage + diff --git a/streamingvisitors/src/vespa/searchvisitor/querywrapper.cpp b/streamingvisitors/src/vespa/searchvisitor/querywrapper.cpp new file mode 100644 index 00000000000..f375b532839 --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/querywrapper.cpp @@ -0,0 +1,51 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".searchvisitor.querywrapper"); +#include "querywrapper.h" + +using namespace search; + +namespace storage { + +QueryWrapper::PhraseList::PhraseList(Query & query) : + _phrases() +{ + QueryNodeRefList phrases; + query.getPhrases(phrases); + for (size_t i = 0; i < phrases.size(); ++i) { + _phrases.push_back(static_cast<PhraseQueryNode *>(phrases[i])); + } +} + +PhraseQueryNode * +QueryWrapper::PhraseList::findPhrase(QueryTerm * term, size_t & index) +{ + for (size_t i = 0; i < _phrases.size(); ++i) { + for (size_t j = 0; j < _phrases[i]->size(); ++j) { + if ((*_phrases[i])[j].get() == term) { + index = j; + return _phrases[i]; + } + } + } + return NULL; +} + +QueryWrapper::QueryWrapper(Query & query) : + _phraseList(query), + _termList() +{ + QueryTermList leafs; + query.getLeafs(leafs); + for (size_t i = 0; i < leafs.size(); ++i) { + size_t index = 0; + PhraseQueryNode * parent = _phraseList.findPhrase(leafs[i], index); + _termList.push_back(Term(leafs[i], parent, index)); + } +} + + +} // namespace storage + diff --git a/streamingvisitors/src/vespa/searchvisitor/querywrapper.h b/streamingvisitors/src/vespa/searchvisitor/querywrapper.h new file mode 100644 index 00000000000..beeda493197 --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/querywrapper.h @@ -0,0 +1,65 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchlib/query/query.h> +#include <vespa/searchlib/query/querynode.h> + +namespace storage { + +/** + * This class wraps a query and adds extra information to the list of leaf terms. 
+ **/ +class QueryWrapper +{ +public: + class PhraseList { + private: + std::vector<search::PhraseQueryNode *> _phrases; + + public: + PhraseList(search::Query & query); + search::PhraseQueryNode * findPhrase(search::QueryTerm * term, size_t & index); + }; + + class Term { + private: + search::QueryTerm * _term; + search::PhraseQueryNode * _parent; + size_t _index; + + public: + Term() : + _term(NULL), + _parent(NULL), + _index(0) + { + } + Term(search::QueryTerm * term, search::PhraseQueryNode * parent, size_t index) : + _term(term), + _parent(parent), + _index(index) + { + } + search::QueryTerm * getTerm() { return _term; } + search::PhraseQueryNode * getParent() { return _parent; } + size_t getIndex() const { return _index; } + bool isPhraseTerm() const { return _parent != NULL; } + bool isFirstPhraseTerm() const { return isPhraseTerm() && getIndex() == 0; } + size_t getPosAdjust() const { return _parent != NULL ? _parent->width() - 1 : 0; } + }; + + typedef std::vector<Term> TermList; + +private: + PhraseList _phraseList; + TermList _termList; + +public: + QueryWrapper(search::Query & query); + TermList & getTermList() { return _termList; } + const TermList & getTermList() const { return _termList; } +}; + +} // namespace storage + diff --git a/streamingvisitors/src/vespa/searchvisitor/rankmanager.cpp b/streamingvisitors/src/vespa/searchvisitor/rankmanager.cpp new file mode 100644 index 00000000000..b638b072d1d --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/rankmanager.cpp @@ -0,0 +1,200 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".searchvisitor.rankmanager"); +#include <vespa/searchlib/features/setup.h> +#include <vespa/searchlib/fef/functiontablefactory.h> +#include <vespa/vespalib/util/vstringfmt.h> +#include "rankmanager.h" + +using vespa::config::search::RankProfilesConfig; +using vespa::config::search::vsm::VsmfieldsConfig; +using search::fef::Blueprint; +using search::fef::BlueprintFactory; +using search::fef::FieldInfo; +using search::fef::Properties; +using search::fef::RankSetup; +using vsm::VsmfieldsHandle; +using vsm::VSMAdapter; + +namespace storage { + +void +RankManager::Snapshot::addProperties(const vespa::config::search::RankProfilesConfig & cfg) +{ + for (uint32_t i = 0; i < cfg.rankprofile.size(); ++i) { + const RankProfilesConfig::Rankprofile & curr = cfg.rankprofile[i]; + _properties.push_back(NamedPropertySet()); + _properties.back().first = curr.name; + Properties & p = _properties.back().second; + for (uint32_t j = 0; j < curr.fef.property.size(); ++j) { + p.add(vespalib::string(curr.fef.property[j].name.c_str()), + vespalib::string(curr.fef.property[j].value.c_str())); + } + } +} + +void +RankManager::Snapshot::detectFields(const VsmfieldsHandle & fields) +{ + for (uint32_t i = 0; i < fields->fieldspec.size(); ++i) { + const VsmfieldsConfig::Fieldspec & fs = fields->fieldspec[i]; + bool isAttribute = (fs.fieldtype == VsmfieldsConfig::Fieldspec::ATTRIBUTE); + LOG(debug, "Adding field of type '%s' and name '%s' with id '%u' the index environment.", + isAttribute ? 
"ATTRIBUTE" : "INDEX", fs.name.c_str(), i); + // This id must match the vsm specific field id + _protoEnv.addField(fs.name, isAttribute); + } +} + +void +RankManager::Snapshot::buildFieldMappings(const VsmfieldsHandle & fields) +{ + for (uint32_t i = 0; i < fields->documenttype.size(); ++i) { + const char * dname = fields->documenttype[i].name.c_str(); + LOG(debug, "Looking through indexes for documenttype '%s'", dname); + for (uint32_t j = 0; j < fields->documenttype[i].index.size(); ++j) { + const char * iname = fields->documenttype[i].index[j].name.c_str(); + LOG(debug, "Looking through fields for index '%s'", iname); + View view; + for (uint32_t k = 0; k < fields->documenttype[i].index[j].field.size(); ++k) { + const char * fname = fields->documenttype[i].index[j].field[k].name.c_str(); + const FieldInfo * info = _protoEnv.getFieldByName(vespalib::string(fname)); + if (info != NULL) { + LOG(debug, "Adding field '%s' to view in index '%s' (field id '%u')", + fname, iname, info->id()); + view.push_back(info->id()); + } else { + LOG(warning, "Field '%s' is not registred in the index environment. " + "Cannot add to index view.", fname); + } + } + if (_views.find(iname) == _views.end()) { + std::sort(view.begin(), view.end()); // lowest field id first + _views[iname] = view; + } else { + LOG(warning, "We already have a view for index '%s'. 
Drop the new view.", iname); + } + } + } +} + +bool +RankManager::Snapshot::initRankSetup(const BlueprintFactory & factory) +{ + // set up individual index environments per rank profile + for (uint32_t i = 0; i < _properties.size(); ++i) { + _indexEnv.push_back(_protoEnv); + IndexEnvironment & ie = _indexEnv.back(); + ie.getProperties().import(_properties[i].second); + } + + // set up individual rank setups per rank profile + for (uint32_t i = 0; i < _indexEnv.size(); ++i) { + IndexEnvironment & ie = _indexEnv[i]; + + RankSetup::SP rs(new RankSetup(factory, ie)); + rs->configure(); // reads config values from the property map + if (!rs->compile()) { + LOG(warning, "Could not compile rank setup for rank profile '%u'.", i); + return false; + } + _rankSetup.push_back(rs); + } + LOG_ASSERT(_indexEnv.size() == _rankSetup.size()); + LOG(debug, "Number of index environments and rank setups: %u", (uint32_t)_indexEnv.size()); + LOG_ASSERT(_properties.size() == _rankSetup.size()); + for (uint32_t i = 0; i < _properties.size(); ++i) { + vespalib::string number = vespalib::make_vespa_string("%u", i); + _rpmap[number] = i; + } + for (uint32_t i = 0; i < _properties.size(); ++i) { + const vespalib::string &name = _properties[i].first; + _rpmap[name] = i; + } + return true; +} + +RankManager::Snapshot::Snapshot() : + _tableManager(), + _protoEnv(_tableManager), + _properties(), + _indexEnv(), + _rankSetup(), + _rpmap(), + _views() +{ + _tableManager.addFactory(search::fef::ITableFactory::SP(new search::fef::FunctionTableFactory(256))); +} + +bool +RankManager::Snapshot::setup(const RankManager & rm, const std::vector<NamedPropertySet> & properties) +{ + _properties = properties; + return setup(rm); +} + +bool +RankManager::Snapshot::setup(const RankManager & rm) +{ + VsmfieldsHandle fields = rm._vsmAdapter->getFieldsConfig(); + detectFields(fields); + buildFieldMappings(fields); + if (!initRankSetup(rm._blueprintFactory)) { + return false; + } + return true; +} + +bool 
+RankManager::Snapshot::setup(const RankManager & rm, const RankProfilesConfig & cfg) +{ + addProperties(cfg); + return setup(rm); +} + +void RankManager::notify(const vsm::VSMConfigSnapshot & snap) +{ + configureRankProfiles(*snap.getConfig<RankProfilesConfig>()); +} + + +void +RankManager::configureRankProfiles(const RankProfilesConfig & cfg) +{ + LOG(debug, "configureRankProfiles(): Size of cfg rankprofiles: %zd", cfg.rankprofile.size()); + + std::unique_ptr<Snapshot> snapshot(new Snapshot()); + if (snapshot->setup(*this, cfg)) { + _snapshot.set(snapshot.release()); + _snapshot.latch(); // switch to the new config object + } else { + vespalib::string msg = "(re-)configuration of rank manager failed"; + LOG(error, "%s", msg.c_str()); + throw vespalib::Exception(msg, VESPA_STRLOC); + } +} + +RankManager::RankManager(VSMAdapter * const vsmAdapter) : + _blueprintFactory(), + _snapshot(), + _vsmAdapter(vsmAdapter) +{ + // init blueprint factory + search::features::setup_search_features(_blueprintFactory); +} + +RankManager::~RankManager() +{ +} + +void +RankManager::configure(const vsm::VSMConfigSnapshot & snap) +{ + notify(snap); +} + + +} // namespace storage + diff --git a/streamingvisitors/src/vespa/searchvisitor/rankmanager.h b/streamingvisitors/src/vespa/searchvisitor/rankmanager.h new file mode 100644 index 00000000000..d4fdafaba8a --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/rankmanager.h @@ -0,0 +1,92 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/config-rank-profiles.h> +#include <vespa/searchlib/fef/blueprintfactory.h> +#include <vespa/searchlib/fef/ranksetup.h> +#include <vespa/searchlib/fef/tablemanager.h> +#include <vespa/vsm/vsm/vsm-adapter.h> +#include "indexenvironment.h" + +namespace storage { + +/** + * This class subscribes to the rank-profiles config and keeps a setup per rank profile. 
+ **/ +class RankManager +{ +public: + /** collection of field ids for an index **/ + typedef std::vector<uint32_t> View; + + /** + * This class represents a snapshot of the rank-profiles config with associated setup per rank profile. + * A new instance of this class is created as part of reload config. + **/ + class Snapshot { + private: + typedef std::pair<vespalib::string, search::fef::Properties> NamedPropertySet; + typedef vespalib::hash_map<vespalib::string, View> ViewMap; + typedef vespalib::hash_map<vespalib::string, int> Map; + search::fef::TableManager _tableManager; + IndexEnvironment _protoEnv; + std::vector<NamedPropertySet> _properties; // property set per rank profile + std::vector<IndexEnvironment> _indexEnv; // index environment per rank profile + std::vector<search::fef::RankSetup::SP> _rankSetup; // rank setup per rank profile + Map _rpmap; + ViewMap _views; + + void addProperties(const vespa::config::search::RankProfilesConfig & cfg); + void detectFields(const vsm::VsmfieldsHandle & fields); + void buildFieldMappings(const vsm::VsmfieldsHandle & fields); + bool initRankSetup(const search::fef::BlueprintFactory & factory); + bool setup(const RankManager & manager); + int getIndex(const vespalib::string & key) const { + Map::const_iterator found(_rpmap.find(key)); + return (found != _rpmap.end()) ? 
found->second : 0; + } + + public: + typedef std::shared_ptr<Snapshot> SP; + Snapshot(); + const std::vector<NamedPropertySet> & getProperties() const { return _properties; } + bool setup(const RankManager & manager, const vespa::config::search::RankProfilesConfig & cfg); + bool setup(const RankManager & manager, const std::vector<NamedPropertySet> & properties); + const search::fef::RankSetup & getRankSetup(const vespalib::string &rankProfile) const { + return *(_rankSetup[getIndex(rankProfile)]); + } + const IndexEnvironment & getIndexEnvironment(const vespalib::string &rankProfile) const { + return _indexEnv[getIndex(rankProfile)]; + } + const View *getView(const vespalib::string & index) const { + ViewMap::const_iterator itr = _views.find(index); + if (itr != _views.end()) { + return &itr->second; + } + return NULL; + } + }; + +private: + search::fef::BlueprintFactory _blueprintFactory; + vespalib::PtrHolder<Snapshot> _snapshot; + const vsm::VSMAdapter * _vsmAdapter; + + void configureRankProfiles(const vespa::config::search::RankProfilesConfig & cfg); + virtual void notify(const vsm::VSMConfigSnapshot & snapshot); + +public: + RankManager(vsm::VSMAdapter * const vsmAdapter); + virtual ~RankManager(); + + void configure(const vsm::VSMConfigSnapshot & snap); + + /** + * Retrieves the current snapshot of the rank-profiles config. + **/ + Snapshot::SP getSnapshot() const { return _snapshot.get(); } +}; + +} // namespace storage + diff --git a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp new file mode 100644 index 00000000000..090479f9b90 --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.cpp @@ -0,0 +1,304 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +LOG_SETUP(".searchvisitor.rankprocessor"); +#include <vespa/searchlib/fef/handle.h> +#include <vespa/searchlib/fef/simpletermfielddata.h> +#include <vespa/vsm/vsm/fieldsearchspec.h> +#include "querytermdata.h" +#include "rankprocessor.h" + +using search::FeatureSet; +using search::HitList; +using search::Query; +using search::QueryTerm; +using search::QueryTermList; +using search::fef::FeatureHandle; +using search::fef::MatchData; +using search::fef::Properties; +using search::fef::RankProgram; +using search::fef::RankSetup; +using search::fef::IllegalHandle; +using search::fef::ITermData; +using search::fef::ITermFieldData; +using search::fef::TermFieldHandle; +using search::fef::TermFieldMatchData; +using search::fef::TermFieldMatchDataPosition; +using vdslib::SearchResult; + +namespace storage { + +namespace { + +vespalib::string +getIndexName(const vespalib::string & indexName, const vespalib::string & expandedIndexName) +{ + if (indexName == expandedIndexName) { + return indexName; + } + return indexName + "(" + expandedIndexName + ")"; +} + +FeatureHandle +getFeatureHandle(const RankProgram &rankProgram) { + std::vector<vespalib::string> featureNames; + std::vector<FeatureHandle> featureHandles; + rankProgram.get_seed_handles(featureNames, featureHandles); + assert(featureNames.size() == 1); + assert(featureHandles.size() == 1); + return featureHandles.front(); +} + +} + +void +RankProcessor::initQueryEnvironment() +{ + QueryWrapper::TermList & terms = _query.getTermList(); + + for (uint32_t i = 0; i < terms.size(); ++i) { + if (!terms[i].isPhraseTerm() || terms[i].isFirstPhraseTerm()) { // register 1 term data per phrase + QueryTermData & qtd = dynamic_cast<QueryTermData &>(terms[i].getTerm()->getQueryItem()); + + qtd.getTermData().setWeight(terms[i].getTerm()->weight()); + qtd.getTermData().setUniqueId(terms[i].getTerm()->uniqueId()); + if (terms[i].isFirstPhraseTerm()) { + 
qtd.getTermData().setPhraseLength(terms[i].getParent()->width()); + } else { + qtd.getTermData().setPhraseLength(1); + } + + vespalib::string expandedIndexName = vsm::FieldSearchSpecMap::stripNonFields(terms[i].getTerm()->index()); + const RankManager::View *view = _rankManagerSnapshot->getView(expandedIndexName); + if (view != NULL) { + RankManager::View::const_iterator iter = view->begin(); + RankManager::View::const_iterator endp = view->end(); + for (; iter != endp; ++iter) { + qtd.getTermData().addField(*iter).setHandle(_mdLayout.allocTermField(*iter)); + } + } else { + LOG(warning, "Could not find a view for index '%s'. Ranking no fields.", + getIndexName(terms[i].getTerm()->index(), expandedIndexName).c_str()); + } + + LOG(debug, "Setup query term '%s:%s' (%s)", + getIndexName(terms[i].getTerm()->index(), expandedIndexName).c_str(), + terms[i].getTerm()->getTerm(), + terms[i].isFirstPhraseTerm() ? "phrase" : "term"); + _queryEnv.addTerm(&qtd.getTermData()); + } else { + LOG(debug, "Ignore query term '%s:%s' (part of phrase)", + terms[i].getTerm()->index().c_str(), terms[i].getTerm()->getTerm()); + } + } +} + +void +RankProcessor::initHitCollector(size_t wantedHitCount) +{ + _hitCollector.reset(new HitCollector(wantedHitCount)); +} + +void +RankProcessor::setupRankProgram(RankProgram &program) +{ + program.setup(_mdLayout, _queryEnv, search::fef::Properties()); +} + +void +RankProcessor::init(bool forRanking, size_t wantedHitCount) +{ + initQueryEnvironment(); + if (forRanking) { + if (_rankSetup.getSecondPhaseRank().empty()) { + _rankProgram = _rankSetup.create_first_phase_program(); + } else { + // We calculate 2. phase ranking for all hits (no need calculating 1. 
phase ranking as well) + _rankProgram = _rankSetup.create_second_phase_program(); + } + setupRankProgram(*_rankProgram); + _rankScoreHandle = getFeatureHandle(*_rankProgram); + _summaryProgram = _rankSetup.create_summary_program(); + setupRankProgram(*_summaryProgram); + } else { + _rankProgram = _rankSetup.create_dump_program(); + setupRankProgram(*_rankProgram); + } + initHitCollector(wantedHitCount); +} + +RankProcessor::RankProcessor(RankManager::Snapshot::SP snapshot, + const vespalib::string &rankProfile, + search::Query & query, + const vespalib::string & location, + Properties & queryProperties, + const search::IAttributeManager * attrMgr) : + + _rankManagerSnapshot(snapshot), + _rankSetup(snapshot->getRankSetup(rankProfile)), + _query(query), + _queryEnv(location, snapshot->getIndexEnvironment(rankProfile), queryProperties, attrMgr), + _mdLayout(), + _rankProgram(), + _score(0.0), + _summaryProgram(), + _rankScoreHandle(IllegalHandle), + _hitCollector() +{ +} + +void +RankProcessor::initForRanking(size_t wantedHitCount) +{ + return init(true, wantedHitCount); +} + +void +RankProcessor::initForDumping(size_t wantedHitCount) +{ + return init(false, wantedHitCount); +} + +void +RankProcessor::runRankProgram(uint32_t docId) +{ + _rankProgram->run(docId); + if (_rankScoreHandle != IllegalHandle) { + MatchData &matchData = _rankProgram->match_data(); + _score = *(matchData.resolveFeature(_rankScoreHandle)); + if (isnan(_score) || isinf(_score)) { + _score = -HUGE_VAL; + } + } +} + +namespace { + +void +copyTermFieldMatchData(const std::vector<search::fef::TermFieldMatchData> &src, MatchData &dst) +{ + assert(src.size() == dst.getNumTermFields()); + for (search::fef::TermFieldHandle handle = 0; handle < dst.getNumTermFields(); ++handle) { + (*dst.resolveTermField(handle)) = src[handle]; + } +} + +class RankProgramWrapper : public HitCollector::IRankProgram +{ +private: + RankProgram &_rankProgram; + +public: + RankProgramWrapper(RankProgram &rankProgram) : 
_rankProgram(rankProgram) {} + virtual const MatchData &run(uint32_t docid, const std::vector<search::fef::TermFieldMatchData> &matchData) override { + // Prepare the match data object used by the rank program with earlier unpacked match data. + copyTermFieldMatchData(matchData, _rankProgram.match_data()); + _rankProgram.run(docid); + return _rankProgram.match_data(); + } +}; + +} + +FeatureSet::SP +RankProcessor::calculateFeatureSet() +{ + LOG(debug, "Calculate feature set"); + std::vector<vespalib::string> names; + std::vector<FeatureHandle> handles; + RankProgram &rankProgram = *(_summaryProgram.get() != nullptr ? _summaryProgram : _rankProgram); + rankProgram.get_seed_handles(names, handles); + LOG(debug, "Feature handles: numNames(%ld), numHandles(%ld)", names.size(), handles.size()); + RankProgramWrapper wrapper(rankProgram); + FeatureSet::SP sf = _hitCollector->getFeatureSet(wrapper, names, handles); + LOG(debug, "Feature set: numFeatures(%u), numDocs(%u)", sf->numFeatures(), sf->numDocs()); + return sf; +} + +void +RankProcessor::fillSearchResult(vdslib::SearchResult & searchResult) +{ + _hitCollector->fillSearchResult(searchResult); +} + +void +RankProcessor::unpackMatchData(uint32_t docId) +{ + MatchData &matchData = _rankProgram->match_data(); + matchData.setDocId(docId); + unpackMatchData(matchData); +} + +void +RankProcessor::unpackMatchData(MatchData &matchData) +{ + QueryWrapper::TermList & terms = _query.getTermList(); + for (uint32_t i = 0; i < terms.size(); ++i) { + if (!terms[i].isPhraseTerm() || terms[i].isFirstPhraseTerm()) { // consider 1 term data per phrase + bool isPhrase = terms[i].isFirstPhraseTerm(); + QueryTermData & qtd = static_cast<QueryTermData &>(terms[i].getTerm()->getQueryItem()); + const ITermData &td = qtd.getTermData(); + + HitList list; + const HitList & hitList = isPhrase ? 
+ terms[i].getParent()->evaluateHits(list) : terms[i].getTerm()->evaluateHits(list); + + if (hitList.size() > 0) { // only unpack if we have a hit + LOG(debug, "Unpack match data for query term '%s:%s' (%s)", + terms[i].getTerm()->index().c_str(), terms[i].getTerm()->getTerm(), + isPhrase ? "phrase" : "term"); + + uint32_t lastFieldId = -1; + TermFieldMatchData *tmd = 0; + uint32_t fieldLen = search::fef::FieldPositionsIterator::UNKNOWN_LENGTH; + + // optimize for hitlist giving all hits for a single field in one chunk + for (const search::Hit & hit : hitList) { + uint32_t fieldId = hit.context(); + + if (fieldId != lastFieldId) { + // reset to notfound/unknown values + tmd = 0; + fieldLen = search::fef::FieldPositionsIterator::UNKNOWN_LENGTH; + + // setup for new field that had a hit + const ITermFieldData *tfd = td.lookupField(fieldId); + if (tfd != 0) { + tmd = matchData.resolveTermField(tfd->getHandle()); + tmd->setFieldId(fieldId); + // reset field match data, but only once per docId + if (tmd->getDocId() != matchData.getDocId()) { + tmd->reset(matchData.getDocId()); + } + } + // find fieldLen for new field + if (isPhrase) { + if (fieldId < terms[i].getParent()->getFieldInfoSize()) { + const QueryTerm::FieldInfo & fi = terms[i].getParent()->getFieldInfo(fieldId); + fieldLen = fi.getFieldLength(); + } + } else { + if (fieldId < terms[i].getTerm()->getFieldInfoSize()) { + const QueryTerm::FieldInfo & fi = terms[i].getTerm()->getFieldInfo(fieldId); + fieldLen = fi.getFieldLength(); + } + } + lastFieldId = fieldId; + } + if (tmd != 0) { + // adjust so that the position for phrase terms equals the match for the first term + TermFieldMatchDataPosition pos(0, hit.wordpos() - terms[i].getPosAdjust(), + hit.weight(), fieldLen); + tmd->appendPosition(pos); + LOG(debug, "Append position(%u), weight(%d), tfmd.weight(%d)", + pos.getPosition(), pos.getElementWeight(), tmd->getWeight()); + } + } + } + } + } +} + +} // namespace storage + diff --git 
a/streamingvisitors/src/vespa/searchvisitor/rankprocessor.h b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.h new file mode 100644 index 00000000000..0596d0803f3 --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/rankprocessor.h @@ -0,0 +1,75 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchlib/fef/matchdata.h> +#include <vespa/searchlib/fef/matchdatalayout.h> +#include <vespa/searchlib/fef/properties.h> +#include <vespa/searchlib/fef/rank_program.h> +#include <vespa/searchlib/fef/ranksetup.h> +#include <vespa/searchlib/query/query.h> +#include <vespa/vdslib/container/searchresult.h> +#include "hitcollector.h" +#include "queryenvironment.h" +#include "querywrapper.h" +#include "rankmanager.h" + +namespace storage { + +/** + * This class is associated with a query and a rank profile and + * is used to calculate rank and feature set for matched documents. + **/ +class RankProcessor +{ +private: + RankManager::Snapshot::SP _rankManagerSnapshot; + const search::fef::RankSetup & _rankSetup; + QueryWrapper _query; + + QueryEnvironment _queryEnv; + search::fef::MatchDataLayout _mdLayout; + search::fef::RankProgram::UP _rankProgram; + double _score; + search::fef::RankProgram::UP _summaryProgram; + search::fef::FeatureHandle _rankScoreHandle; + HitCollector::UP _hitCollector; + + void initQueryEnvironment(); + void initHitCollector(size_t wantedHitCount); + void setupRankProgram(search::fef::RankProgram &program); + + /** + * Initializes this rank processor. + * @param forRanking whether this should be used for ranking or dumping. + * @param wantedHitCount the number of hits we want to return from the hit collector. + * @return whether the rank processor was initialized or not. 
+ **/ + void init(bool forRanking, size_t wantedHitCount); + + void unpackMatchData(search::fef::MatchData &matchData); + +public: + typedef std::unique_ptr<RankProcessor> UP; + + RankProcessor(RankManager::Snapshot::SP snapshot, + const vespalib::string &rankProfile, + search::Query & query, + const vespalib::string & location, + search::fef::Properties & queryProperties, + const search::IAttributeManager * attrMgr); + + void initForRanking(size_t wantedHitCount); + void initForDumping(size_t wantedHitCount); + void unpackMatchData(uint32_t docId); + void runRankProgram(uint32_t docId); + search::FeatureSet::SP calculateFeatureSet(); + void fillSearchResult(vdslib::SearchResult & searchResult); + const search::fef::MatchData &getMatchData() const { return _rankProgram->match_data(); } + void setRankScore(double score) { _score = score; } + double getRankScore() const { return _score; } + HitCollector & getHitCollector() { return *_hitCollector; } +}; + +} // namespace storage + diff --git a/streamingvisitors/src/vespa/searchvisitor/searchenvironment.cpp b/streamingvisitors/src/vespa/searchvisitor/searchenvironment.cpp new file mode 100644 index 00000000000..df40062ce07 --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/searchenvironment.cpp @@ -0,0 +1,90 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include "searchenvironment.h" + +LOG_SETUP(".visitor.instance.searchenvironment"); + +using search::docsummary::JuniperProperties; +using vsm::VSMAdapter; + +namespace storage { + +__thread SearchEnvironment::EnvMap * SearchEnvironment::_localEnvMap=0; + +SearchEnvironment::Env::Env(const vespalib::string & muffens, const config::ConfigUri & configUri, Fast_NormalizeWordFolder & wf) : + _configId(configUri.getConfigId()), + _configurer(config::SimpleConfigRetriever::UP( + new config::SimpleConfigRetriever(createKeySet(configUri.getConfigId()), configUri.getContext())), + this), + _vsmAdapter(new VSMAdapter(muffens, _configId, wf)), + _rankManager(new RankManager(_vsmAdapter.get())) +{ + + _configurer.start(); +} + +config::ConfigKeySet +SearchEnvironment::Env::createKeySet(const vespalib::string & configId) +{ + config::ConfigKeySet set; + set.add<vespa::config::search::vsm::VsmfieldsConfig, + vespa::config::search::SummaryConfig, + vespa::config::search::SummarymapConfig, + vespa::config::search::vsm::VsmsummaryConfig, + vespa::config::search::summary::JuniperrcConfig, + vespa::config::search::RankProfilesConfig>(configId); + return set; +} + +void +SearchEnvironment::Env::configure(const config::ConfigSnapshot & snapshot) +{ + vsm::VSMConfigSnapshot snap(_configId, snapshot); + _vsmAdapter->configure(snap); + _rankManager->configure(snap); +} + +SearchEnvironment::Env::~Env() +{ + _configurer.close(); +} + +SearchEnvironment::SearchEnvironment(const config::ConfigUri & configUri) : + VisitorEnvironment(), + _envMap(), + _configUri(configUri) +{ +} + +SearchEnvironment::~SearchEnvironment() +{ + vespalib::LockGuard guard(_lock); + _threadLocals.clear(); +} + +SearchEnvironment::Env & SearchEnvironment::getEnv(const vespalib::string & searchCluster) +{ + config::ConfigUri searchClusterUri(_configUri.createWithNewId(searchCluster)); + if (_localEnvMap == NULL) { + _localEnvMap = new EnvMap; + 
vespalib::LockGuard guard(_lock); + _threadLocals.push_back(EnvMapLP(_localEnvMap)); + } + EnvMap::iterator localFound = _localEnvMap->find(searchCluster); + if (localFound == _localEnvMap->end()) { + vespalib::LockGuard guard(_lock); + EnvMap::iterator found = _envMap.find(searchCluster); + if (found == _envMap.end()) { + LOG(debug, "Init VSMAdapter with config id = '%s'", searchCluster.c_str()); + _envMap[searchCluster].reset(new Env("*", searchClusterUri, _wordFolder)); + found = _envMap.find(searchCluster); + } + _localEnvMap->insert(*found); + localFound = _localEnvMap->find(searchCluster); + } + return *localFound->second; +} + +} diff --git a/streamingvisitors/src/vespa/searchvisitor/searchenvironment.h b/streamingvisitors/src/vespa/searchvisitor/searchenvironment.h new file mode 100644 index 00000000000..c67153b8dd2 --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/searchenvironment.h @@ -0,0 +1,53 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <vespa/searchsummary/docsummary/juniperproperties.h> +#include <vespa/storage/visiting/visitor.h> +#include <vespa/config/retriever/simpleconfigurer.h> +#include <vespa/vsm/vsm/vsm-adapter.h> +#include "rankmanager.h" + +namespace storage { + +class SearchEnvironment : public VisitorEnvironment +{ +private: + class Env : public config::SimpleConfigurable { + public: + typedef std::shared_ptr<Env> SP; + Env(const vespalib::string & muffens, const config::ConfigUri & configUri, Fast_NormalizeWordFolder & wf); + ~Env(); + const vsm::VSMAdapter * getVSMAdapter() const { return _vsmAdapter.get(); } + const RankManager * getRankManager() const { return _rankManager.get(); } + void configure(const config::ConfigSnapshot & snapshot); + + static config::ConfigKeySet createKeySet(const vespalib::string & configId); + private: + const vespalib::string _configId; + config::SimpleConfigurer _configurer; + std::unique_ptr<vsm::VSMAdapter> _vsmAdapter; + std::unique_ptr<RankManager> _rankManager; + }; + typedef vespalib::hash_map<vespalib::string, Env::SP> EnvMap; + typedef vespalib::LinkedPtr<EnvMap> EnvMapLP; + typedef std::vector< vespalib::LinkedPtr<EnvMap> > ThreadLocals; + + static __thread EnvMap * _localEnvMap; + EnvMap _envMap; + ThreadLocals _threadLocals; + vespalib::Lock _lock; + Fast_NormalizeWordFolder _wordFolder; + config::ConfigUri _configUri; + + Env & getEnv(const vespalib::string & searchcluster); + +public: + SearchEnvironment(const config::ConfigUri & configUri); + ~SearchEnvironment(); + const vsm::VSMAdapter * getVSMAdapter(const vespalib::string & searchcluster) { return getEnv(searchcluster).getVSMAdapter(); } + const RankManager * getRankManager(const vespalib::string & searchcluster) { return getEnv(searchcluster).getRankManager(); } +}; + +} + diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp new file mode 100644 index 
00000000000..c15062feb8a --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.cpp @@ -0,0 +1,1166 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/fastos/fastos.h> +#include <vespa/log/log.h> +#include <vespa/document/datatype/positiondatatype.h> +#include <vespa/document/fieldvalue/arrayfieldvalue.h> +#include <vespa/document/fieldvalue/bytefieldvalue.h> +#include <vespa/document/fieldvalue/document.h> +#include <vespa/document/fieldvalue/doublefieldvalue.h> +#include <vespa/document/fieldvalue/floatfieldvalue.h> +#include <vespa/document/fieldvalue/longfieldvalue.h> +#include <vespa/document/fieldvalue/weightedsetfieldvalue.h> +#include <vespa/searchlib/attribute/attributeguard.h> +#include <vespa/searchlib/attribute/extendableattributes.h> +#include <vespa/searchlib/aggregation/modifiers.h> +#include <vespa/searchlib/common/packets.h> +#include <vespa/searchlib/common/sortspec.h> +#include <vespa/searchlib/features/setup.h> +#include <vespa/searchlib/fef/fef.h> +#include <vespa/fastlib/text/wordfolder.h> +#include <vespa/vdslib/container/documentlist.h> +#include <vespa/vsm/config/vsm-cfif.h> +#include <vespa/vsm/vsm/docsumfilter.h> +#include <vespa/vsm/vsm/vsm-adapter.h> +#include "querytermdata.h" +#include "searchenvironment.h" +#include "searchvisitor.h" + +LOG_SETUP(".visitor.instance.searchvisitor"); + +namespace storage { + +using vsm::VSMAdapter; +using vsm::DocsumFilter; +using vsm::DocsumTools; +using vsm::DocsumToolsPtr; +using vsm::DocSumCache; +using vsm::FieldIdTSearcherMap; +using vsm::FieldPathMapT; +using vsm::FieldSearcher; +using vsm::FieldSearchSpecMap; +using vsm::VsmfieldsHandle; +using vsm::FieldPath; +using vsm::StorageDocument; +using vsm::StringFieldIdTMap; +using search::IAttributeManager; +using search::AttributeGuard; +using search::AttributeManager; +using search::AttributeVector; +using search::attribute::IAttributeVector; 
+using search::EmptyQueryNodeResult; +using search::Query; +using search::QueryPacketT; +using search::FeatureSet; +using search::fs4transport::FS4Packet_DOCSUM; +using search::fs4transport::FS4Packet_EOL; +using search::fs4transport::PacketArray; +using namespace search::docsummary; +using namespace search::aggregation; +using namespace search::expression; +using vdslib::Parameters; +using vdslib::DocumentList; + + +class ForceWordfolderInit +{ +public: + ForceWordfolderInit(); +}; + +ForceWordfolderInit::ForceWordfolderInit() +{ + Fast_NormalizeWordFolder::Setup(Fast_NormalizeWordFolder::DO_ACCENT_REMOVAL | + Fast_NormalizeWordFolder::DO_SHARP_S_SUBSTITUTION | + Fast_NormalizeWordFolder::DO_LIGATURE_SUBSTITUTION | + Fast_NormalizeWordFolder::DO_MULTICHAR_EXPANSION); +} + +static ForceWordfolderInit _G_forceNormWordFolderInit; + + +AttributeVector::SP +createMultiValueAttribute(const vespalib::string & name, const document::FieldValue & fv, bool arrayType) +{ + const document::DataType * ndt = fv.getDataType(); + if (ndt->inherits(document::CollectionDataType::classId)) { + ndt = &(static_cast<const document::CollectionDataType *>(ndt)) + ->getNestedType(); + } + LOG(debug, "Create %s attribute '%s' with data type '%s' (%s)", + arrayType ? "array" : "weighted set", name.c_str(), ndt->getName().c_str(), fv.getClass().name()); + AttributeVector::SP attr; + if (ndt->getId() == document::DataType::T_BYTE || + ndt->getId() == document::DataType::T_INT || + ndt->getId() == document::DataType::T_LONG) + { + attr.reset(arrayType ? static_cast<AttributeVector *>(new search::MultiIntegerExtAttribute(name)) + : static_cast<AttributeVector *>(new search::WeightedSetIntegerExtAttribute(name))); + } else if (ndt->getId() == document::DataType::T_DOUBLE || + ndt->getId() == document::DataType::T_FLOAT) + { + attr.reset(arrayType ? 
static_cast<AttributeVector *>(new search::MultiFloatExtAttribute(name)) + : static_cast<AttributeVector *>(new search::WeightedSetFloatExtAttribute(name))); + } else if (ndt->getId() == document::DataType::T_STRING) { + attr.reset(arrayType ? static_cast<AttributeVector *>(new search::MultiStringExtAttribute(name)) + : static_cast<AttributeVector *>(new search::WeightedSetStringExtAttribute(name))); + } else { + LOG(debug, "Can not make an multivalue attribute out of %s with data type '%s' (%s)", + name.c_str(), ndt->getName().c_str(), fv.getClass().name()); + } + return attr; +} + +AttributeVector::SP +createAttribute(const vespalib::string & name, const document::FieldValue & fv) +{ + LOG(debug, "Create single value attribute '%s' with value type '%s'", name.c_str(), fv.getClass().name()); + AttributeVector::SP attr; + + if (fv.inherits(document::ByteFieldValue::classId) || fv.inherits(document::IntFieldValue::classId) || fv.inherits(document::LongFieldValue::classId)) { + attr.reset(new search::SingleIntegerExtAttribute(name)); + } else if (fv.inherits(document::DoubleFieldValue::classId) || fv.inherits(document::FloatFieldValue::classId)) { + attr.reset(new search::SingleFloatExtAttribute(name)); + } else if (fv.inherits(document::StringFieldValue::classId)) { + attr.reset(new search::SingleStringExtAttribute(name)); + } else { + LOG(debug, "Can not make an attribute out of %s of type '%s'.", name.c_str(), fv.getClass().name()); + } + return attr; +} + +SearchVisitor::SummaryGenerator::SummaryGenerator() : + search::aggregation::HitsAggregationResult::SummaryGenerator(), + _callback(), + _docsumState(_callback), + _docsumFilter(), + _docsumWriter(NULL), + _rawBuf(4096) +{ +} + +vespalib::ConstBufferRef SearchVisitor::SummaryGenerator::fillSummary(search::AttributeVector::DocId lid, const search::aggregation::HitsAggregationResult::SummaryClassType & summaryClass) +{ + if (_docsumWriter != NULL) { + _rawBuf.reset(); + 
_docsumState._args.setResultClassName(summaryClass); + uint32_t docsumLen = _docsumWriter->WriteDocsum(lid, &_docsumState, _docsumFilter.get(), &_rawBuf); + return vespalib::ConstBufferRef(_rawBuf.GetDrainPos(), docsumLen); + } + return vespalib::ConstBufferRef(); +} + +void SearchVisitor::HitsResultPreparator::execute(vespalib::Identifiable & obj) +{ + search::aggregation::HitsAggregationResult & hitsAggr(static_cast<search::aggregation::HitsAggregationResult &>(obj)); + hitsAggr.setSummaryGenerator(_summaryGenerator); + _numHitsAggregators++; +} + +bool SearchVisitor::HitsResultPreparator::check(const vespalib::Identifiable & obj) const +{ + return obj.getClass().inherits(search::aggregation::HitsAggregationResult::classId); +} + +SearchVisitor::GroupingEntry::GroupingEntry(Grouping * grouping) : + _grouping(grouping), + _count(0), + _limit(grouping->getMaxN(std::numeric_limits<size_t>::max())) +{ +} + +void SearchVisitor::GroupingEntry::aggregate(const document::Document & doc, search::HitRank rank) +{ + if (_count < _limit) { + _grouping->aggregate(doc, rank); + _count++; + } +} + +SearchVisitor::~SearchVisitor() { + if (! 
isCompletedCalled()) { + document::OrderingSpecification orderSpec; + HitCounter hc(&orderSpec); + completedVisitingInternal(hc); + } +} + +SearchVisitor::SearchVisitor(StorageComponent& component, + VisitorEnvironment& vEnv, + const Parameters& params) : + Visitor(component), + _env(static_cast<SearchEnvironment &>(vEnv)), + _params(params), + _vsmAdapter(NULL), + _docSearchedCount(0), + _hitCount(0), + _hitsRejectedCount(0), + _query(), + _queryResult(new documentapi::QueryResultMessage()), + _fieldSearcherMap(), + _docTypeMapping(), + _fieldSearchSpecMap(), + _snippetModifierManager(), + _summaryGenerator(), + _summaryClass("default"), + _attrMan(), + _attrCtx(_attrMan.createContext()), + _groupingList(), + _attributeFields(), + _sortList(), + _docsumWriter(NULL), + _searchBuffer(new vsm::SearcherBuf()), + _tmpSortBuffer(256), + _documentIdAttributeBacking(new search::SingleStringExtAttribute("[docid]") ), + _rankAttributeBacking(new search::SingleFloatExtAttribute("[rank]") ), + _documentIdAttribute(dynamic_cast<search::SingleStringExtAttribute &>(*_documentIdAttributeBacking)), + _rankAttribute(dynamic_cast<search::SingleFloatExtAttribute &>(*_rankAttributeBacking)), + _shouldFillRankAttribute(false), + _syntheticFieldsController(), + _rankController() +{ + LOG(debug, "Created SearchVisitor"); +} + +void SearchVisitor::init(const Parameters & params) +{ + _attrMan.add(_documentIdAttributeBacking); + _attrMan.add(_rankAttributeBacking); + Parameters::ValueRef valueRef; + if ( params.get("summaryclass", valueRef) ) { + _summaryClass = vespalib::string(static_cast<const char *>(valueRef.data()), + static_cast<unsigned>(valueRef.size())); + LOG(debug, "Received summary class: %s", _summaryClass.c_str()); + } + + size_t wantedSummaryCount(10); + if (params.get("summarycount", valueRef) ) { + vespalib::string tmp(static_cast<const char *>(valueRef.data()), valueRef.size()); + wantedSummaryCount = strtoul(tmp.c_str(), NULL, 0); + LOG(debug, "Received summary count: 
%ld", wantedSummaryCount); + } + _queryResult->getSearchResult().setWantedHitCount(wantedSummaryCount); + + if (params.get("rankprofile", valueRef) ) { + vespalib::string tmp(static_cast<const char *>(valueRef.data()), valueRef.size()); + _rankController.setRankProfile(tmp); + LOG(debug, "Received rank profile: %s", _rankController.getRankProfile().c_str()); + } + + if (params.get("queryflags", valueRef) ) { + vespalib::string tmp(static_cast<const char *>(valueRef.data()), valueRef.size()); + LOG(debug, "Received query flags: 0x%lx", strtoul(tmp.c_str(), NULL, 0)); + uint32_t queryFlags = strtoul(tmp.c_str(), NULL, 0); + _rankController.setDumpFeatures((queryFlags & search::fs4transport::QFLAG_DUMP_FEATURES) != 0); + LOG(debug, "QFLAG_DUMP_FEATURES: %s", _rankController.getDumpFeatures() ? "true" : "false"); + } + + if (params.get("rankproperties", valueRef) && valueRef.size() > 0) { + LOG(spam, "Received rank properties of %zd bytes", valueRef.size()); + uint32_t len = static_cast<uint32_t>(valueRef.size()); + char * data = const_cast<char *>(static_cast<const char *>(valueRef.data())); + FNET_DataBuffer src(data, len); + uint32_t cnt = src.ReadInt32(); + len -= sizeof(uint32_t); + LOG(debug, "Properties count: '%u'", cnt); + for (uint32_t i = 0; i < cnt; ++i) { + search::fs4transport::FS4Properties prop; + if (!prop.decode(src, len)) { + LOG(warning, "Could not decode rank properties"); + } else { + LOG(debug, "Properties[%u]: name '%s', size '%u'", i, prop.getName(), prop.size()); + if (strcmp(prop.getName(), "rank") == 0) { // pick up rank properties + for (uint32_t j = 0; j < prop.size(); ++j) { + LOG(debug, "Properties[%u][%u]: key '%s' -> value '%s'", i, j, prop.getKey(j), prop.getValue(j)); + _rankController.getQueryProperties().add(vespalib::string(prop.getKey(j), prop.getKeyLen(j)), + vespalib::string(prop.getValue(j), prop.getValueLen(j))); + } + } + } + } + } else { + LOG(debug, "No rank properties received"); + } + + if (params.get("rankprofile", 
valueRef)) { + vespalib::string tmp(static_cast<const char *>(valueRef.data()), valueRef.size()); + _summaryGenerator.getDocsumState()._args.SetRankProfile(tmp); + } + + int queryFlags = 0; + if (params.get("queryflags", queryFlags)) { + _summaryGenerator.getDocsumState()._args.SetQueryFlags(queryFlags); + } + + vespalib::string location; + if (params.get("location", valueRef)) { + location = vespalib::string(static_cast<const char *>(valueRef.data()), valueRef.size()); + LOG(debug, "Location = '%s'", location.c_str()); + _summaryGenerator.getDocsumState()._args.SetLocation(valueRef.size(), (const char*)valueRef.data()); + } + + Parameters::ValueRef searchClusterBlob; + if (params.get("searchcluster", searchClusterBlob)) { + LOG(spam, "Received searchcluster blob of %zd bytes", searchClusterBlob.size()); + vespalib::string searchCluster(static_cast<const char *>(searchClusterBlob.data()), searchClusterBlob.size()); + _vsmAdapter = _env.getVSMAdapter(searchCluster); + + if ( params.get("sort", valueRef) ) { + _sortSpec = search::common::SortSpec(vespalib::string(static_cast<const char *>(valueRef.data()), + static_cast<unsigned>(valueRef.size()))); + LOG(debug, "Received sort specification: '%s'", _sortSpec.getSpec().c_str()); + } + + Parameters::ValueRef queryBlob; + if ( params.get("query", queryBlob) ) { + LOG(spam, "Received query blob of %zd bytes", queryBlob.size()); + QueryTermData resultAddOn; + _query = Query(resultAddOn, QueryPacketT(static_cast<const char *>(queryBlob.data()), queryBlob.size())); + LOG(debug, "Query tree: '%s'", _query.asString().c_str()); + _searchBuffer->reserve(0x10000); + + int stackCount = 0; + if (params.get("querystackcount", stackCount)) { + _summaryGenerator.getDocsumState()._args.SetStackDump(stackCount, queryBlob.size(), (const char*)queryBlob.data()); + } else { + LOG(warning, "Request without query stack count"); + } + + std::vector<vespalib::string> additionalFields; + 
registerAdditionalFields(_vsmAdapter->getDocsumTools()->getFieldSpecs(), additionalFields); + + StringFieldIdTMap fieldsInQuery; + setupFieldSearchers(additionalFields, fieldsInQuery); + + setupSnippetModifiers(); + + setupScratchDocument(fieldsInQuery); + + _syntheticFieldsController.setup(_fieldSearchSpecMap.nameIdMap(), fieldsInQuery); + + setupAttributeVectors(); + + setupAttributeVectorsForSorting(_sortSpec); + + const RankManager * rm = _env.getRankManager(searchCluster); + _rankController.setRankManagerSnapshot(rm->getSnapshot()); + _rankController.setupRankProcessors(_query, location, wantedSummaryCount, _attrMan, _attributeFields); + // Depends on hitCollector setup. + setupDocsumObjects(); + + } else { + LOG(warning, "No query received"); + } + + if (params.get("aggregation", valueRef) ) { + std::vector<char> newAggrBlob; + newAggrBlob.resize(valueRef.size()); + memcpy(&newAggrBlob[0], valueRef.data(), newAggrBlob.size()); + LOG(debug, "Received new aggregation blob of %zd bytes", newAggrBlob.size()); + setupGrouping(newAggrBlob); + } + + } else { + LOG(warning, "No searchcluster specified"); + } + + if ( params.get("unique", valueRef) ) { + LOG(spam, "Received unique specification of %zd bytes", valueRef.size()); + } else { + LOG(debug, "No unique specification received"); + } +} + +SearchVisitorFactory::SearchVisitorFactory(const config::ConfigUri & configUri) + : VisitorFactory(), + _configUri(configUri) +{} + +VisitorEnvironment::UP +SearchVisitorFactory::makeVisitorEnvironment(StorageComponent&) +{ + return VisitorEnvironment::UP(new SearchEnvironment(_configUri)); +} + +storage::Visitor* +SearchVisitorFactory::makeVisitor(StorageComponent& component, + storage::VisitorEnvironment& env, + const vdslib::Parameters& params) +{ + return new SearchVisitor(component, env, params); +} + +void +SearchVisitor::AttributeInserter::onPrimitive(const IteratorContent & c) +{ + const document::FieldValue & value = c.getValue(); + LOG(debug, "AttributeInserter: 
Adding value '%s'(%d) to attribute '%s' for docid '%d'", + value.toString().c_str(), c.getWeight(), _attribute.getName().c_str(), _docId); + search::IExtendAttribute & attr = *_attribute.getExtendInterface(); + const vespalib::Identifiable::RuntimeClass & aInfo = _attribute.getClass(); + if (aInfo.inherits(search::IntegerAttribute::classId)) { + attr.add(value.getAsLong(), c.getWeight()); + } else if (aInfo.inherits(search::FloatingPointAttribute::classId)) { + attr.add(value.getAsDouble(), c.getWeight()); + } else if (aInfo.inherits(search::StringAttribute::classId)) { + attr.add(value.getAsString().c_str(), c.getWeight()); + } else { + assert(false && "We got an attribute vector that is of an unknown type"); + } +} + +SearchVisitor::AttributeInserter::AttributeInserter(search::AttributeVector & attribute, search::AttributeVector::DocId docId) : + _attribute(attribute), + _docId(docId) +{ +} + +SearchVisitor::PositionInserter::PositionInserter(search::AttributeVector & attribute, search::AttributeVector::DocId docId) : + AttributeInserter(attribute, docId), + _fieldX(document::PositionDataType::getInstance().getField(document::PositionDataType::FIELD_X)), + _fieldY(document::PositionDataType::getInstance().getField(document::PositionDataType::FIELD_Y)) +{ +} + +void +SearchVisitor::PositionInserter::onPrimitive(const IteratorContent & c) +{ + (void) c; +} + +void +SearchVisitor::PositionInserter::onStructStart(const IteratorContent & c) +{ + const document::StructuredFieldValue & value = static_cast<const document::StructuredFieldValue &>(c.getValue()); + LOG(debug, "PositionInserter: Adding value '%s'(%d) to attribute '%s' for docid '%d'", + value.toString().c_str(), c.getWeight(), _attribute.getName().c_str(), _docId); + + value.getValue(_fieldX, _valueX); + value.getValue(_fieldY, _valueY); + int64_t zcurve = vespalib::geo::ZCurve::encode(_valueX.getValue(), _valueY.getValue()); + LOG(debug, "X=%d, Y=%d, zcurve=%ld", _valueX.getValue(), _valueY.getValue(), 
zcurve); + search::IExtendAttribute & attr = *_attribute.getExtendInterface(); + attr.add(zcurve, c.getWeight()); +} + +void +SearchVisitor::RankController::processHintedAttributes(const IndexEnvironment & indexEnv, bool rank, + const search::IAttributeManager & attrMan, + std::vector<AttrInfo> & attributeFields) +{ + const std::set<vespalib::string> & attributes = (rank ? indexEnv.getHintedRankAttributes() : indexEnv.getHintedDumpAttributes()); + for (const vespalib::string & name : attributes) { + LOG(debug, "Process attribute access hint (%s): '%s'", rank ? "rank" : "dump", name.c_str()); + const search::fef::FieldInfo * fieldInfo = indexEnv.getFieldByName(name); + if (fieldInfo != NULL) { + bool found = false; + uint32_t fid = fieldInfo->id(); + for (size_t j = 0; !found && (j < attributeFields.size()); ++j) { + found = (attributeFields[j]._field == fid); + } + if (!found) { + search::AttributeGuard::UP attr(attrMan.getAttribute(name)); + if (attr->valid()) { + LOG(debug, "Add attribute '%s' with field id '%u' to the list of needed attributes", name.c_str(), fid); + attributeFields.push_back(AttrInfo(fid, std::move(attr))); + } else { + LOG(warning, "Cannot locate attribute '%s' in the attribute manager. " + "Ignore access hint about this attribute", name.c_str()); + } + } + } else { + LOG(warning, "Cannot locate field '%s' in the index environment. 
Ignore access hint about this attribute", + name.c_str()); + } + } +} + +SearchVisitor::RankController::RankController() : + _rankProfile("default"), + _rankManagerSnapshot(NULL), + _rankSetup(NULL), + _queryProperties(), + _hasRanking(false), + _rankProcessor(), + _dumpFeatures(false), + _dumpProcessor() +{ +} + +void +SearchVisitor::RankController::setupRankProcessors(search::Query & query, + const vespalib::string & location, + size_t wantedHitCount, + const search::IAttributeManager & attrMan, + std::vector<AttrInfo> & attributeFields) +{ + _rankSetup = &_rankManagerSnapshot->getRankSetup(_rankProfile); + + // register attribute vectors needed for ranking + const IndexEnvironment & indexEnv = _rankManagerSnapshot->getIndexEnvironment(_rankProfile); + processHintedAttributes(indexEnv, true, attrMan, attributeFields); + + _rankProcessor.reset(new RankProcessor(_rankManagerSnapshot, _rankProfile, query, location, _queryProperties, &attrMan)); + LOG(debug, "Initialize rank processor"); + _rankProcessor->initForRanking(wantedHitCount); + + if (_dumpFeatures) { + // register attribute vectors needed for dumping + processHintedAttributes(indexEnv, false, attrMan, attributeFields); + + _dumpProcessor.reset(new RankProcessor(_rankManagerSnapshot, _rankProfile, query, location, _queryProperties, &attrMan)); + LOG(debug, "Initialize dump processor"); + _dumpProcessor->initForDumping(wantedHitCount); + } + + _hasRanking = true; +} + + +void +SearchVisitor::RankController::onDocumentMatch(uint32_t docId) +{ + // unpacking into match data + _rankProcessor->unpackMatchData(docId); + if (_dumpFeatures) { + _dumpProcessor->unpackMatchData(docId); + } +} + +void +SearchVisitor::RankController::rankMatchedDocument(uint32_t docId) +{ + _rankProcessor->runRankProgram(docId); + LOG(debug, "Rank score for matched document %u: %f", + _rankProcessor->getMatchData().getDocId(), + _rankProcessor->getRankScore()); + if (_dumpFeatures) { + _dumpProcessor->runRankProgram(docId); + // we 
must transfer the score to this match data to make sure that the same hits + // are kept on the hit collector used in the dump processor as the one used in the rank processor + _dumpProcessor->setRankScore(_rankProcessor->getRankScore()); + } +} + +bool +SearchVisitor::RankController::keepMatchedDocument() +{ + // also make sure that NaN scores are added + return (!(_rankProcessor->getRankScore() <= _rankSetup->getRankScoreDropLimit())); +} + +bool +SearchVisitor::RankController::collectMatchedDocument(bool hasSorting, + SearchVisitor & visitor, + const std::vector<char> & tmpSortBuffer, + const vsm::StorageDocument::SP & document) +{ + bool amongTheBest(false); + if (!hasSorting) { + amongTheBest = _rankProcessor->getHitCollector().addHit(document, _rankProcessor->getMatchData(), _rankProcessor->getRankScore()); + if (amongTheBest && _dumpFeatures) { + _dumpProcessor->getHitCollector().addHit(vsm::StorageDocument::SP(NULL), _dumpProcessor->getMatchData(), _dumpProcessor->getRankScore()); + } + } else { + size_t pos = visitor.fillSortBuffer(); + LOG(spam, "SortBlob is %ld bytes", pos); + amongTheBest = _rankProcessor->getHitCollector().addHit(document, _rankProcessor->getMatchData(), _rankProcessor->getRankScore(), + &tmpSortBuffer[0], pos); + if (amongTheBest && _dumpFeatures) { + _dumpProcessor->getHitCollector().addHit(vsm::StorageDocument::SP(NULL), _dumpProcessor->getMatchData(), _dumpProcessor->getRankScore(), + &tmpSortBuffer[0], pos); + } + } + return amongTheBest; +} + +void +SearchVisitor::RankController::onCompletedVisiting(vsm::GetDocsumsStateCallback & docsumsStateCallback, vdslib::SearchResult & searchResult) +{ + if (_hasRanking) { + // fill the search result with the hits from the hit collector + _rankProcessor->fillSearchResult(searchResult); + + // calculate summary features and set them on the callback object + if (!_rankSetup->getSummaryFeatures().empty()) { + LOG(debug, "Calculate summary features"); + search::FeatureSet::SP sf = 
_rankProcessor->calculateFeatureSet(); + docsumsStateCallback.setSummaryFeatures(sf); + } + + // calculate rank features and set them on the callback object + if (_dumpFeatures) { + LOG(debug, "Calculate rank features"); + search::FeatureSet::SP rf = _dumpProcessor->calculateFeatureSet(); + docsumsStateCallback.setRankFeatures(rf); + } + } +} + +SearchVisitor::SyntheticFieldsController::SyntheticFieldsController() : + _documentIdFId(StringFieldIdTMap::npos) +{ +} + +void +SearchVisitor::SyntheticFieldsController::setup(const StringFieldIdTMap & fieldRegistry, + const StringFieldIdTMap & /*fieldsInQuery*/) +{ + _documentIdFId = fieldRegistry.fieldNo("documentid"); + assert(_documentIdFId != StringFieldIdTMap::npos); +} + +void +SearchVisitor::SyntheticFieldsController::onDocument(vsm::StorageDocument & document) +{ + (void) document; +} + +void +SearchVisitor::SyntheticFieldsController::onDocumentMatch(vsm::StorageDocument & document, + const vespalib::string & documentId) +{ + document.setField(_documentIdFId, document::FieldValue::UP(new document::StringFieldValue(documentId))); +} + +void +SearchVisitor::registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec, + std::vector<vespalib::string> & fieldList) +{ + for (size_t i = 0; i < docsumSpec.size(); ++i) { + fieldList.push_back(docsumSpec[i].getOutputName()); + const std::vector<vespalib::string> & inputNames = docsumSpec[i].getInputNames(); + for (size_t j = 0; j < inputNames.size(); ++j) { + fieldList.push_back(inputNames[j]); + if (document::PositionDataType::isZCurveFieldName(inputNames[j])) { + fieldList.push_back(document::PositionDataType::cutZCurveFieldName(inputNames[j])); + } + } + } + // fields used during sorting + fieldList.push_back("[docid]"); + fieldList.push_back("[rank]"); + fieldList.push_back("documentid"); +} + +void +SearchVisitor::setupFieldSearchers(const std::vector<vespalib::string> & additionalFields, + StringFieldIdTMap & fieldsInQuery) +{ + // Create 
mapping from field name to field id, from field id to search spec, + // and from index name to list of field ids + _fieldSearchSpecMap.buildFromConfig(_vsmAdapter->getFieldsConfig()); + // Add extra elements to mapping from field name to field id + _fieldSearchSpecMap.buildFromConfig(additionalFields); + + // Reconfig field searchers based on the query + _fieldSearchSpecMap.reconfigFromQuery(_query); + + // Map field name to field id for all fields in the query + _fieldSearchSpecMap.buildFieldsInQuery(_query, fieldsInQuery); + // Connect field names in the query to field searchers + _fieldSearchSpecMap.buildSearcherMap(fieldsInQuery.map(), _fieldSearcherMap); + + // prepare the field searchers + _fieldSearcherMap.prepare(_fieldSearchSpecMap.documentTypeMap(), _searchBuffer, _query); +} + +void +SearchVisitor::setupSnippetModifiers() +{ + search::QueryTermList qtl; + _query.getLeafs(qtl); + _snippetModifierManager.setup(qtl, _fieldSearchSpecMap.specMap(), _fieldSearchSpecMap.documentTypeMap().begin()->second); +} + +void +SearchVisitor::setupScratchDocument(const StringFieldIdTMap & fieldsInQuery) +{ + if (_fieldSearchSpecMap.documentTypeMap().empty()) { + throw vespalib::IllegalStateException("Illegal config: There must be at least 1 document type in the 'vsmfields' config"); + } + // Setup document type mapping + if (_fieldSearchSpecMap.documentTypeMap().size() != 1) { + LOG(warning, "We have %zd document types in the vsmfields config when we expected 1. 
Using the first one", + _fieldSearchSpecMap.documentTypeMap().size()); + } + _fieldsUnion = fieldsInQuery.map(); + for(vsm::StringFieldIdTMapT::const_iterator it(_fieldSearchSpecMap.nameIdMap().map().begin()), + mt(_fieldSearchSpecMap.nameIdMap().map().end()); it != mt; it++) { + if (_fieldsUnion.find(it->first) == _fieldsUnion.end()) { + LOG(debug, "Adding field '%s' from _fieldSearchSpecMap", it->first.c_str()); + _fieldsUnion[it->first] = it->second; + } + } + // Init based on default document type and mapping from field name to field id + _docTypeMapping.init(_fieldSearchSpecMap.documentTypeMap().begin()->first, + _fieldsUnion, *_component.getTypeRepo()); + _docTypeMapping.prepareBaseDoc(_fieldPathMap); +} + +void +SearchVisitor::setupDocsumObjects() +{ + std::unique_ptr<DocsumFilter> docsumFilter(new DocsumFilter(_vsmAdapter->getDocsumTools(), _rankController.getRankProcessor()->getHitCollector())); + docsumFilter->init(_fieldSearchSpecMap.nameIdMap(), *_fieldPathMap); + docsumFilter->setSnippetModifiers(_snippetModifierManager.getModifiers()); + _summaryGenerator.setFilter(std::move(docsumFilter)); + if (_vsmAdapter->getDocsumTools().get()) { + GetDocsumsState * ds(&_summaryGenerator.getDocsumState()); + _vsmAdapter->getDocsumTools()->getDocsumWriter()->InitState(_attrMan, ds); + _summaryGenerator.setDocsumWriter(*_vsmAdapter->getDocsumTools()->getDocsumWriter()); + for (const IAttributeVector * v : ds->_attributes) { + if (v != NULL) { + vespalib::string name(v->getName()); + vsm::FieldIdT fid = _fieldSearchSpecMap.nameIdMap().fieldNo(name); + if ( fid != StringFieldIdTMap::npos ) { + AttributeGuard::UP attr(_attrMan.getAttribute(name)); + if (attr->valid()) { + size_t index(_attributeFields.size()); + for (size_t j(0); j < index; j++) { + if (_attributeFields[j]._field == fid) { + index = j; + } + } + if (index == _attributeFields.size()) { + _attributeFields.push_back(AttrInfo(fid, std::move(attr))); + } + } else { + LOG(warning, "Attribute '%s' is not 
valid", name.c_str()); + } + } else { + LOG(warning, "No field with name '%s'. Odd ....", name.c_str()); + } + } + } + } else { + LOG(warning, "No docsum tools available"); + } +} + +void +SearchVisitor::setupAttributeVectors() +{ + const FieldPathMapT & fm = *_fieldPathMap; + for (FieldPathMapT::const_iterator it(fm.begin()), mt(fm.end()); it != mt; it++) { + if ( ! it->empty() ) { + vespalib::string attrName(it->front().getName()); + for (FieldPath::const_iterator ft(it->begin()+1), fmt(it->end()); ft != fmt; ft++) { + attrName.append("."); + attrName.append(ft->getName()); + } + + enum FieldDataType { + OTHER = 0, + ARRAY, + WSET + }; + FieldDataType typeSeen = OTHER; + for (FieldPath::const_iterator ft(it->begin()), fmt(it->end()); ft != fmt; ft++) { + int dataTypeId(ft->getDataType().getClass().id()); + if (dataTypeId == document::ArrayDataType::classId) { + typeSeen = ARRAY; + } else if (dataTypeId == document::MapDataType::classId) { + typeSeen = ARRAY; + } else if (dataTypeId == document::WeightedSetDataType::classId) { + typeSeen = WSET; + } + } + const document::FieldValue & fv = it->back().getFieldValueToSet(); + AttributeVector::SP attr; + if (typeSeen == ARRAY) { + attr = createMultiValueAttribute(attrName, fv, true); + } else if (typeSeen == WSET) { + attr = createMultiValueAttribute (attrName, fv, false); + } else { + attr = createAttribute(attrName, fv); + } + + if (attr.get()) { + LOG(debug, "Adding attribute '%s' for field '%s' with data type '%s' (%s)", + attr->getName().c_str(), attrName.c_str(), fv.getDataType()->getName().c_str(), fv.getClass().name()); + if ( ! _attrMan.add(attr) ) { + LOG(warning, "Failed adding attribute '%s' for field '%s' with data type '%s' (%s)", + attr->getName().c_str(), attrName.c_str(), fv.getDataType()->getName().c_str(), fv.getClass().name()); + } + } else { + LOG(debug, "Cannot setup attribute for field '%s' with data type '%s' (%s). 
Aggregation and sorting will not work for this field", + attrName.c_str(), fv.getDataType()->getName().c_str(), fv.getClass().name()); + } + } + } +} + +void +SearchVisitor::setupAttributeVectorsForSorting(const search::common::SortSpec & sortList) +{ + if ( ! sortList.empty() ) { + for (size_t i(0), m(sortList.size()); i < m; i++) { + const search::common::SortInfo & sInfo(sortList[i]); + vsm::FieldIdT fid = _fieldSearchSpecMap.nameIdMap().fieldNo(sInfo._field); + if ( fid != StringFieldIdTMap::npos ) { + AttributeGuard::UP attr(_attrMan.getAttribute(sInfo._field)); + if (attr->valid()) { + if (!(*attr)->hasMultiValue()) { + size_t index(_attributeFields.size()); + for(size_t j(0); j < index; j++) { + if (_attributeFields[j]._field == fid) { + index = j; + _attributeFields[index]._ascending = sInfo._ascending; + _attributeFields[index]._converter = sInfo._converter.get(); + } + } + if (index == _attributeFields.size()) { + _attributeFields.push_back(AttrInfo(fid, std::move(attr), sInfo._ascending, sInfo._converter.get())); + } + _sortList.push_back(index); + } else { + LOG(warning, "Attribute '%s' is not sortable", sInfo._field.c_str()); + } + } else { + LOG(warning, "Attribute '%s' is not valid", sInfo._field.c_str()); + } + } else { + LOG(warning, "Cannot locate field '%s' in field name registry", sInfo._field.c_str()); + } + } + } else { + LOG(debug, "No sort specification received"); + } +} + +void +SearchVisitor::setupGrouping(const std::vector<char> & groupingBlob) +{ + vespalib::nbostream iss(&groupingBlob[0], groupingBlob.size()); + vespalib::NBOSerializer is(iss); + uint32_t numGroupings(0); + is >> numGroupings; + for(size_t i(0); i < numGroupings; i++) { + std::unique_ptr<Grouping> ag(new Grouping()); + ag->deserialize(is); + GroupingList::value_type groupingPtr(ag.release()); + Grouping & grouping = *groupingPtr; + Attribute2DocumentAccessor attr2Doc; + grouping.select(attr2Doc, attr2Doc); + LOG(debug, "Grouping # %ld with id(%d)", i, 
grouping.getId()); + try { + search::expression::ConfigureStaticParams stuff(_attrCtx.get(), &_docTypeMapping.getCurrentDocumentType()); + grouping.configureStaticStuff(stuff); + HitsResultPreparator preparator(_summaryGenerator); + grouping.select(preparator, preparator); + grouping.preAggregate(false); + if (!grouping.getAll() || (preparator.getNumHitsAggregators() == 0)) { + _groupingList.push_back(groupingPtr); + } else { + LOG(warning, "You can not collect hits with an all aggregator yet."); + } + } catch (const std::exception & e) { + LOG(error, "Could not locate attribute for grouping number %ld : %s", i, e.what()); + } + } +} + +class SingleDocumentStore : public vsm::IDocSumCache +{ +public: + SingleDocumentStore(const vsm::StorageDocument & doc) : _doc(doc) { } + virtual const vsm::Document & getDocSum(const search::DocumentIdT & docId) const { + (void) docId; + return _doc; + } +private: + const vsm::StorageDocument & _doc; +}; + +bool +SearchVisitor::compatibleDocumentTypes(const document::DocumentType& typeA, + const document::DocumentType& typeB) const +{ + if (&typeA == &typeB) { + return true; + } else { + return (typeA.getName() == typeB.getName()); + } +} + +void +SearchVisitor::handleDocuments(const document::BucketId&, + std::vector<spi::DocEntry::LP>& entries, + HitCounter& hitCounter) +{ + (void) hitCounter; + if (_vsmAdapter == NULL) { + init(_params); + } + if ( ! _rankController.valid() ) { + //Prevent continuing with bad config. 
+ return; + } + document::DocumentId emptyId; + LOG(debug, "SearchVisitor '%s' handling block of %zu documents", + _id.c_str(), entries.size()); + size_t highestFieldNo(_fieldSearchSpecMap.nameIdMap().highestFieldNo()); + + const document::DocumentType* defaultDocType = + _docTypeMapping.getDefaultDocumentType(); + assert(defaultDocType); + for (size_t i = 0; i< entries.size(); ++i) { + spi::DocEntry& entry(*entries[i]); + vsm::StorageDocument::SP document( + new StorageDocument(entry.releaseDocument())); + document->fieldPathMap(_fieldPathMap); + document->setFieldCount(highestFieldNo); + + try { + document->init(); + if (defaultDocType != NULL + && !compatibleDocumentTypes(*defaultDocType, + document->docDoc().getType())) + { + LOG(debug, "Skipping document of type '%s' when " + "handling only documents of type '%s'", + document->docDoc().getType().getName().c_str(), + defaultDocType->getName().c_str()); + } else { + if (handleDocument(document)) { + _backingDocuments.push_back(document); + } + } + } catch (const std::exception & e) { + LOG(warning, "Caught exception handling document '%s'. 
Exception='%s'", + document->docDoc().getId().getScheme().toString().c_str(), + e.what()); + } + } +} + +bool +SearchVisitor::handleDocument(const vsm::StorageDocument::SP & document) +{ + bool needToKeepDocument(false); + _syntheticFieldsController.onDocument(*document); + group(document->docDoc(), 0, true); + if (match(*document)) { + RankProcessor & rp = *_rankController.getRankProcessor(); + vespalib::string documentId(document->docDoc().getId().getScheme().toString()); + LOG(debug, "Matched document with id '%s'", documentId.c_str()); + + document->setDocId(rp.getMatchData().getDocId()); + + fillAttributeVectors(documentId, *document); + + _rankController.rankMatchedDocument(rp.getMatchData().getDocId()); + + if (_shouldFillRankAttribute) { + _rankAttribute.add(rp.getRankScore()); + } + + if (_rankController.keepMatchedDocument()) { + + bool amongTheBest = _rankController.collectMatchedDocument(!_sortList.empty(), *this, _tmpSortBuffer, document); + + _syntheticFieldsController.onDocumentMatch(*document, documentId); + + SingleDocumentStore single(*document); + _summaryGenerator.setDocsumCache(single); + group(document->docDoc(), rp.getRankScore(), false); + + if (amongTheBest) { + document->saveCachedFields(); + needToKeepDocument = true; + } + + } else { + _hitsRejectedCount++; + LOG(debug, "Do not keep document with id '%s' because rank score (%f) <= rank score drop limit (%f)", + documentId.c_str(), + rp.getRankScore(), + _rankController.getRankSetup()->getRankScoreDropLimit()); + } + } else { + LOG(debug, "Did not match document with id '%s'", document->docDoc().getId().getScheme().toString().c_str()); + } + return needToKeepDocument; +} + +void +SearchVisitor::group(const document::Document & doc, search::HitRank rank, bool all) +{ + LOG(spam, "Group all: %s", all ? 
"true" : "false"); + for(GroupingList::iterator it(_groupingList.begin()), mt(_groupingList.end()); it != mt; it++) { + GroupingEntry & grouping(*it); + if (all == grouping->getAll()) { + grouping.aggregate(doc, rank); + LOG(spam, "Actually group document with id '%s'", doc.getId().getScheme().toString().c_str()); + } + } +} + +bool +SearchVisitor::match(const vsm::StorageDocument & doc) +{ + for (FieldIdTSearcherMap::iterator it = _fieldSearcherMap.begin(), mt = _fieldSearcherMap.end(); it != mt; it++) { + FieldSearcher & fSearch = *(*it); + fSearch.search(doc); + } + bool hit(_query.evaluate()); + if (hit) { + _hitCount++; + LOG(spam, "Match in doc %d", doc.getDocId()); + + _rankController.onDocumentMatch(_hitCount - 1); // send in the local docId to use for this hit + } + _docSearchedCount++; + _query.reset(); + return hit; +} + +void +SearchVisitor::fillAttributeVectors(const vespalib::string & documentId, const StorageDocument & document) +{ + for (size_t i(0), im(_attributeFields.size()); i < im; i++) { + const AttrInfo & finfo = _attributeFields[i]; + const AttributeGuard &finfoGuard(*finfo._attr); + bool isPosition = finfoGuard->getClass().inherits(search::IntegerAttribute::classId) && document::PositionDataType::isZCurveFieldName(finfoGuard->getName()); + LOG(debug, "Filling attribute '%s', isPosition='%s'", finfoGuard->getName().c_str(), isPosition ? 
"true" : "false"); + uint32_t fieldId = finfo._field; + if (isPosition) { + vespalib::stringref org = document::PositionDataType::cutZCurveFieldName(finfoGuard->getName()); + fieldId = _fieldsUnion.find(org)->second; + } + const StorageDocument::SubDocument & subDoc = document.getComplexField(fieldId); + search::AttributeVector & attrV = const_cast<search::AttributeVector & >(*finfoGuard); + search::AttributeVector::DocId docId(0); + attrV.addDoc(docId); + if (subDoc.getFieldValue() != NULL) { + LOG(debug, "value = '%s'", subDoc.getFieldValue()->toString().c_str()); + if (isPosition) { + LOG(spam, "Position"); + PositionInserter pi(attrV, docId); + subDoc.getFieldValue()->iterateNested(subDoc.begin(), subDoc.end(), pi); + } else { + AttributeInserter ai(attrV, docId); + subDoc.getFieldValue()->iterateNested(subDoc.begin(), subDoc.end(), ai); + } + } else if (finfoGuard->getName() == "[docid]") { + _documentIdAttribute.add(documentId.c_str()); + // assert((_docsumCache.cache().size() + 1) == _documentIdAttribute.getNumDocs()); + } else if (finfoGuard->getName() == "[rank]") { + _shouldFillRankAttribute = true; + } + } +} + +size_t +SearchVisitor::fillSortBuffer() +{ + size_t pos(0); + for(size_t i(0), m(_sortList.size()); i != m; i++) { + int written(0); + const AttrInfo & finfo = _attributeFields[_sortList[i]]; + const AttributeGuard &finfoGuard(*finfo._attr); + LOG(debug, "Adding sortdata for document %d for attribute '%s'", + finfoGuard->getNumDocs() - 1, finfoGuard->getName().c_str()); +// assert((_docsumCache.cache().size() + 1) == finfo._attr->getNumDocs()); + do { + if (finfo._ascending) { + written = finfoGuard->serializeForAscendingSort(finfoGuard->getNumDocs()-1, &_tmpSortBuffer[0]+pos, _tmpSortBuffer.size() - pos, finfo._converter); + } else { + written = finfoGuard->serializeForDescendingSort(finfoGuard->getNumDocs()-1, &_tmpSortBuffer[0]+pos, _tmpSortBuffer.size() - pos, finfo._converter); + } + if (written == -1) { + 
_tmpSortBuffer.resize(_tmpSortBuffer.size()*2); + } + } while (written == -1); + pos += written; + } + return pos; +} + +void SearchVisitor::completedBucket(const document::BucketId&, HitCounter&) +{ + LOG(debug, "Completed bucket"); +} + +void SearchVisitor::completedVisitingInternal(HitCounter& hitCounter) +{ + if (_vsmAdapter == NULL) { + init(_params); + } + LOG(debug, "Completed visiting"); + vdslib::SearchResult & searchResult(_queryResult->getSearchResult()); + vdslib::DocumentSummary & documentSummary(_queryResult->getDocumentSummary()); + LOG(debug, "Hit count: %lu", searchResult.getHitCount()); + + _rankController.onCompletedVisiting(_summaryGenerator.getDocsumCallback(), searchResult); + LOG(debug, "Hit count: %lu", searchResult.getHitCount()); + + /// Now I can sort. No more documentid access order. + searchResult.sort(); + searchResult.setTotalHitCount(_hitCount - _hitsRejectedCount); + + const char* docId; + vdslib::SearchResult::RankType rank; + for (uint32_t i = 0; i < searchResult.getHitCount(); i++) { + searchResult.getHit(i, docId, rank); + hitCounter.addHit(document::DocumentId(docId), 0); + } + + generateGroupingResults(); + + generateDocumentSummaries(); + _backingDocuments.clear(); + + documentSummary.sort(); + LOG(debug, "Docsum count: %lu", documentSummary.getSummaryCount()); +} + +void SearchVisitor::completedVisiting(HitCounter& hitCounter) +{ + completedVisitingInternal(hitCounter); + sendMessage(documentapi::DocumentMessage::UP(_queryResult.release())); +} + +void +SearchVisitor::generateGroupingResults() +{ + vdslib::SearchResult & searchResult(_queryResult->getSearchResult()); + for (GroupingList::iterator it(_groupingList.begin()), mt(_groupingList.end()); it != mt; it++) { + Grouping & grouping(**it); + LOG(debug, "grouping before postAggregate: %s", grouping.asString().c_str()); + grouping.postAggregate(); + grouping.postMerge(); + grouping.sortById(); + LOG(debug, "grouping after postAggregate: %s", grouping.asString().c_str()); + 
vespalib::nbostream os; + vespalib::NBOSerializer nos(os); + grouping.serialize(nos); + vespalib::MallocPtr blob(os.size()); + memcpy(blob, os.c_str(), os.size()); + searchResult.getGroupingList().add(grouping.getId(), blob); + } +} + +void +SearchVisitor::generateDocumentSummaries() +{ + if ( ! _rankController.valid()) { + return; + } + _summaryGenerator.setDocsumCache(_rankController.getRankProcessor()->getHitCollector()); + vdslib::SearchResult & searchResult(_queryResult->getSearchResult()); + vdslib::DocumentSummary & documentSummary(_queryResult->getDocumentSummary()); + for (size_t i(0), m(searchResult.getHitCount()); (i < m) && (i < searchResult.getWantedHitCount()); i++ ) { + const char * docId(NULL); + vdslib::SearchResult::RankType rank(0); + uint32_t lid = searchResult.getHit(i, docId, rank); + vespalib::ConstBufferRef docsum = _summaryGenerator.fillSummary(lid, _summaryClass); + documentSummary.addSummary(docId, docsum.c_str(), docsum.size()); + LOG(debug, "Adding summary %ld: globalDocId(%s), localDocId(%u), rank(%f), bytes(%lu)", + i, docId, lid, rank, docsum.size()); + } +} + + +} diff --git a/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h new file mode 100644 index 00000000000..bc8d72c4177 --- /dev/null +++ b/streamingvisitors/src/vespa/searchvisitor/searchvisitor.h @@ -0,0 +1,464 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <vespa/vsm/common/docsum.h> +#include <vespa/vsm/common/documenttypemapping.h> +#include <vespa/vsm/common/storagedocument.h> +#include <vespa/vsm/searcher/fieldsearcher.h> +#include <vespa/vsm/vsm/docsumfilter.h> +#include <vespa/vsm/vsm/fieldsearchspec.h> +#include <vespa/vsm/vsm/snippetmodifier.h> +#include <vespa/vsm/vsm/vsm-adapter.h> +#include <vespa/vespalib/objects/objectoperation.h> +#include <vespa/vespalib/objects/objectpredicate.h> +#include <vespa/searchlib/query/query.h> +#include <vespa/searchlib/aggregation/aggregation.h> +#include <vespa/searchlib/attribute/attributemanager.h> +#include <vespa/searchlib/attribute/attributevector.h> +#include <vespa/searchlib/attribute/extendableattributes.h> +#include <vespa/searchlib/common/sortspec.h> +#include <vespa/storage/visiting/visitor.h> +#include <vespa/documentapi/messagebus/messages/queryresultmessage.h> +#include "hitcollector.h" +#include "indexenvironment.h" +#include "queryenvironment.h" +#include "rankmanager.h" +#include "rankprocessor.h" +#include "searchenvironment.h" + +using namespace search::aggregation; + +namespace storage { + +/** + * @class storage::SearchVisitor + * + * @brief Visitor that applies a search query to visitor data and + * converts them to a SearchResultCommand and a DocumentSummaryCommand. + **/ +class SearchVisitor : public Visitor { +public: + SearchVisitor(StorageComponent&, VisitorEnvironment& vEnv, + const vdslib::Parameters & params); + + ~SearchVisitor(); +private: + /** + * This struct wraps an attribute vector. + **/ + struct AttrInfo { + public: + /** + * Construct a new object. + * + * @param fid the field id of the attribute field. + * @param attr a guard to the attribute vector. + **/ + AttrInfo(vsm::FieldIdT fid, search::AttributeGuard::UP attr) : + _field(fid), + _ascending(true), + _converter(NULL), + _attr(std::move(attr)) + { + } + /** + * Construct a new object. + * + * @param fid the field id of the attribute field. 
+ * @param attr a guard to the attribute vector. + * @param ascending whether this attribute should be sorted ascending or not. + * @param converter is a converter to apply to the attribute before sorting. + **/ + AttrInfo(vsm::FieldIdT fid, search::AttributeGuard::UP attr, bool ascending, const search::common::BlobConverter * converter) : + _field(fid), + _ascending(ascending), + _converter(converter), + _attr(std::move(attr)) + { + } + vsm::FieldIdT _field; + bool _ascending; + const search::common::BlobConverter * _converter; + search::AttributeGuard::UP _attr; + }; + + /** + * This class gets callbacks when iterating through a field value and + * inserts the values into a given attribute vector. + **/ + class AttributeInserter : public document::FieldValue::IteratorHandler { + protected: + typedef document::FieldValue::IteratorHandler::Content IteratorContent; + search::AttributeVector & _attribute; + search::AttributeVector::DocId _docId; + + virtual void onPrimitive(const IteratorContent & c); + + public: + AttributeInserter(search::AttributeVector & attribute, search::AttributeVector::DocId docId); + }; + + class PositionInserter : public AttributeInserter { + public: + PositionInserter(search::AttributeVector & attribute, search::AttributeVector::DocId docId); + private: + virtual void onPrimitive(const IteratorContent & c); + virtual void onStructStart(const Content & fv); + document::Field _fieldX; + document::Field _fieldY; + document::IntFieldValue _valueX; + document::IntFieldValue _valueY; + }; + + /** + * This class controls all the ranking related objects. 
+ **/ + class RankController { + private: + vespalib::string _rankProfile; + RankManager::Snapshot::SP _rankManagerSnapshot; + const search::fef::RankSetup * _rankSetup; + search::fef::Properties _queryProperties; + bool _hasRanking; + RankProcessor::UP _rankProcessor; + bool _dumpFeatures; + RankProcessor::UP _dumpProcessor; + + /** + * Process attribute hints and add needed attributes to the given list. + **/ + void processHintedAttributes(const IndexEnvironment & indexEnv, bool rank, + const search::IAttributeManager & attrMan, + std::vector<AttrInfo> & attributeFields); + + public: + RankController(); + bool valid() const { return _rankProcessor.get() != NULL; } + void setRankProfile(const vespalib::string &rankProfile) { _rankProfile = rankProfile; } + const vespalib::string &getRankProfile() const { return _rankProfile; } + void setRankManagerSnapshot(const RankManager::Snapshot::SP & snapshot) { _rankManagerSnapshot = snapshot; } + search::fef::Properties & getQueryProperties() { return _queryProperties; } + RankProcessor * getRankProcessor() { return _rankProcessor.get(); } + void setDumpFeatures(bool dumpFeatures) { _dumpFeatures = dumpFeatures; } + bool getDumpFeatures() const { return _dumpFeatures; } + const search::fef::RankSetup * getRankSetup() const { return _rankSetup; } + + /** + * Setup rank processors used for ranking and dumping. + * + * @param query the query associated with the search visitor. + * @param location the location string passed along with the query + * (NOTE(review): its format and use are not visible in this header - confirm). + * @param wantedHitCount number of hits wanted. + * @param attrMan the attribute manager. + * @param attributeFields the list of attribute vectors needed. + **/ + void setupRankProcessors(search::Query & query, + const vespalib::string & location, + size_t wantedHitCount, + const search::IAttributeManager & attrMan, + std::vector<AttrInfo> & attributeFields); + /** + * Callback function that is called for each document that match. + * Unpack match data. 
+ * + * @param docId the docId to use for this hit + **/ + void onDocumentMatch(uint32_t docId); + + /** + * Calculate rank for a matched document. + * + * @param docId the docId of the matched document. + **/ + void rankMatchedDocument(uint32_t docId); + + /** + * Returns whether we should keep the matched document. + * Use the rank-score-drop-limit to decide this: a document is dropped when its rank score + * is less than or equal to the limit. A document with a NaN rank score is kept. + **/ + bool keepMatchedDocument(); + + /** + * Collect a matched document in the hit collector. + * Take sort spec into consideration if used. + * + * @param hasSorting whether the search result should be sorted. + * @param visitor the search visitor. + * @param tmpSortBuffer the sort buffer containing the sort data. + * @param documentId the matched document to collect (despite the name this is + * the document itself, not its document id). + * @return true if the document was added to the heap + **/ + bool collectMatchedDocument(bool hasSorting, + SearchVisitor & visitor, + const std::vector<char> & tmpSortBuffer, + const vsm::StorageDocument::SP & documentId); + /** + * Callback function that is called when visiting is completed. + * Perform second phase ranking and calculate summary features / rank features if asked for. + * + * @param docsumsStateCallback state object to store summary features and rank features. + * @param searchResult the search result that is filled with the hits from the hit collector. + **/ + void onCompletedVisiting(vsm::GetDocsumsStateCallback & docsumsStateCallback, vdslib::SearchResult & searchResult); + }; + + /** + * This class controls all the synthetic fields + **/ + class SyntheticFieldsController { + private: + vsm::FieldIdT _documentIdFId; + + public: + SyntheticFieldsController(); + + /** + * Setup synthetic fields. The implementation only resolves the field id of 'documentid'. + * + * @param fieldRegistry mapping from field name to field id for all known fields. + * @param fieldsInQuery mapping from field name to field id for fields mentioned in the query (unused by the implementation). + **/ + void setup(const vsm::StringFieldIdTMap & fieldRegistry, + const vsm::StringFieldIdTMap & fieldsInQuery); + + /** + * Callback function that is called for each document received. + * + * @param document the document received. 
+ **/ + void onDocument(vsm::StorageDocument & document); + + /** + * Callback function that is called for each document matched. + * + * @param document the document matched. + * @param documentId the document id of the matched document. + **/ + void onDocumentMatch(vsm::StorageDocument & document, + const vespalib::string & documentId); + }; + + /** + * Register field names from the given docsum spec into the given field name list. + * These field names are in addition to the field names found in the vsmfields config. + * Duplicates are removed when later building mapping from field name to field id. + * + * @param docsumSpec config with the field names used by the docsum setup. + * @param fieldList list of field names that are built. + **/ + void registerAdditionalFields(const std::vector<vsm::DocsumTools::FieldSpec> & docsumSpec, + std::vector<vespalib::string> & fieldList); + + /** + * Setup the field searchers used when matching the query with the stream of documents. + * This includes setting up various mappings in FieldSearchSpecMap and building mapping + * for fields used by the query. + * + * @param additionalFields list of additional field names used when setting up the mappings. + * @param fieldsInQuery mapping from field name to field id that are built based on the query. + **/ + void setupFieldSearchers(const std::vector<vespalib::string> & additionalFields, + vsm::StringFieldIdTMap & fieldsInQuery); + + /** + * Setup snippet modifiers for the fields where we have substring search. + * The modifiers will be used when generating docsum. + **/ + void setupSnippetModifiers(); + + /** + * Setup the scratch document that is used when receiving a stream of documents through the visitor api. + * Each document in this stream is serialized into the scratch document and passed to vsm for matching. + **/ + void setupScratchDocument(const vsm::StringFieldIdTMap & fieldsInQuery); + + /** + * Setup the objects used for document summary. 
+ **/ + void setupDocsumObjects(); + + /** + * Create and register an attribute vector in the attribute manager for each field value in the scratch document. + * If later needed during evaluation, these attribute vectors are filled with the actual + * value(s) from the scratch document. + **/ + void setupAttributeVectors(); + + /** + * Setup attribute vectors needed for sorting. + * + * @param sortList the list of attributes needed for sorting. + **/ + void setupAttributeVectorsForSorting(const search::common::SortSpec & sortList); + + /** + * Setup grouping based on the given grouping blob. + * + * @param groupingBlob the binary representation of the grouping specification. + **/ + void setupGrouping(const std::vector<char> & groupingBlob); + + // Inherit doc from Visitor + void handleDocuments(const document::BucketId&, + std::vector<spi::DocEntry::LP>& entries, + HitCounter& hitCounter) override; + + bool compatibleDocumentTypes(const document::DocumentType& typeA, + const document::DocumentType& typeB) const; + + /** + * Process one document + * @param document Document to process. + * @return true if the underlying buffer is needed later on, then it must be kept. + */ + bool handleDocument(const vsm::StorageDocument::SP & document); + + /** + * Collect the given document for grouping. + * + * @param doc the document used for grouping. + * @param all whether we should group all documents, not just hits. + **/ + void group(const document::Document & doc, search::HitRank rank, bool all); + + /** + * Check if the given document matches the query. + * + * @param doc the document to match. + * @return whether the document matched the query. + **/ + bool match(const vsm::StorageDocument & doc); + + /** + * Fill attribute vectors needed for aggregation and sorting with values from the scratch document. + * + * @param documentId the document id of the matched document. 
+ **/ + void fillAttributeVectors(const vespalib::string & documentId, const vsm::StorageDocument & document); + + /** + * Fill the sort buffer based on the attribute vectors needed for sorting. + * + * @return the position of the sort buffer. + **/ + size_t fillSortBuffer(); + + // Inherit doc from Visitor + void completedBucket(const document::BucketId&, HitCounter& counter) override; + + // Inherit doc from Visitor + void completedVisiting(HitCounter& counter) override; + + spi::ReadConsistency getRequiredReadConsistency() const override { + // Searches are not considered to require strong consistency. + return spi::ReadConsistency::WEAK; + } + + /** + * Required to be called at least once. + */ + void completedVisitingInternal(HitCounter& counter); + + /** + * Generate grouping results from the new grouping framework (if any) and add them to the search result. + **/ + void generateGroupingResults(); + + /** + * Generate document summaries for a specified subset of the hits. + **/ + void generateDocumentSummaries(); + + class GroupingEntry : std::shared_ptr<Grouping> { + public: + GroupingEntry(Grouping * grouping); + void aggregate(const document::Document & doc, search::HitRank rank); + const Grouping & operator * () const { return *_grouping; } + Grouping & operator * () { return *_grouping; } + const Grouping * operator -> () const { return _grouping.get(); } + private: + std::shared_ptr<Grouping> _grouping; + size_t _count; + size_t _limit; + }; + typedef std::vector< GroupingEntry > GroupingList; + typedef std::vector<vsm::StorageDocument::SP> DocumentVector; + + class SummaryGenerator : public HitsAggregationResult::SummaryGenerator + { + public: + SummaryGenerator(); + GetDocsumsState & getDocsumState() { return _docsumState; } + vsm::GetDocsumsStateCallback & getDocsumCallback() { return _callback; } + void setFilter(std::unique_ptr<vsm::DocsumFilter> filter) { _docsumFilter = std::move(filter); } + void setDocsumCache(const vsm::IDocSumCache & cache) { 
_docsumFilter->setDocSumStore(cache); } + void setDocsumWriter(IDocsumWriter & docsumWriter) { _docsumWriter = & docsumWriter; } + virtual vespalib::ConstBufferRef fillSummary(search::AttributeVector::DocId lid, const HitsAggregationResult::SummaryClassType & summaryClass); + private: + vsm::GetDocsumsStateCallback _callback; + GetDocsumsState _docsumState; + std::unique_ptr<vsm::DocsumFilter> _docsumFilter; + search::docsummary::IDocsumWriter * _docsumWriter; + search::RawBuf _rawBuf; + }; + + class HitsResultPreparator : public vespalib::ObjectOperation, public vespalib::ObjectPredicate + { + public: + HitsResultPreparator(SummaryGenerator & summaryGenerator) : + _summaryGenerator(summaryGenerator), + _numHitsAggregators(0) + { } + size_t getNumHitsAggregators() const { return _numHitsAggregators; } + private: + virtual void execute(vespalib::Identifiable &obj); + virtual bool check(const vespalib::Identifiable &obj) const; + SummaryGenerator & _summaryGenerator; + size_t _numHitsAggregators; + }; + + void init(const vdslib::Parameters & params); + SearchEnvironment & _env; + vdslib::Parameters _params; + const vsm::VSMAdapter * _vsmAdapter; + size_t _docSearchedCount; + size_t _hitCount; + size_t _hitsRejectedCount; + search::Query _query; + std::unique_ptr<documentapi::QueryResultMessage> _queryResult; + vsm::FieldIdTSearcherMap _fieldSearcherMap; + vsm::SharedFieldPathMap _fieldPathMap; + vsm::DocumentTypeMapping _docTypeMapping; + vsm::FieldSearchSpecMap _fieldSearchSpecMap; + vsm::SnippetModifierManager _snippetModifierManager; + SummaryGenerator _summaryGenerator; + vespalib::string _summaryClass; + search::AttributeManager _attrMan; + search::attribute::IAttributeContext::UP _attrCtx; + GroupingList _groupingList; + std::vector<AttrInfo> _attributeFields; + search::common::SortSpec _sortSpec; + std::vector<size_t> _sortList; + IDocsumWriter * _docsumWriter; + vsm::SharedSearcherBuf _searchBuffer; + std::vector<char> _tmpSortBuffer; + 
search::AttributeVector::SP _documentIdAttributeBacking; + search::AttributeVector::SP _rankAttributeBacking; + search::SingleStringExtAttribute & _documentIdAttribute; + search::SingleFloatExtAttribute & _rankAttribute; + bool _shouldFillRankAttribute; + SyntheticFieldsController _syntheticFieldsController; + RankController _rankController; + DocumentVector _backingDocuments; + vsm::StringFieldIdTMapT _fieldsUnion; +}; + +class SearchVisitorFactory : public VisitorFactory { + config::ConfigUri _configUri; + VisitorEnvironment::UP makeVisitorEnvironment(StorageComponent&); + + Visitor* makeVisitor(StorageComponent&, VisitorEnvironment&env, + const vdslib::Parameters& params); +public: + SearchVisitorFactory(const config::ConfigUri & configUri); +}; + +} + diff --git a/streamingvisitors/src/vespa/snippetvisitor/.gitignore b/streamingvisitors/src/vespa/snippetvisitor/.gitignore new file mode 100644 index 00000000000..e69de29bb2d --- /dev/null +++ b/streamingvisitors/src/vespa/snippetvisitor/.gitignore |