diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2022-05-15 00:41:35 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-05-15 00:41:35 +0200 |
commit | 4db8dcbf3395fd92b1348155142b85df5a754289 (patch) | |
tree | 912b02e614bc9889ea3543893cbeb699971e8156 /streamingvisitors/src | |
parent | 287a799b270200aca440cad376272328128a5054 (diff) |
Revert "Revert "Collapse vsm into streamingvisitors""
Diffstat (limited to 'streamingvisitors/src')
96 files changed, 7613 insertions, 5 deletions
diff --git a/streamingvisitors/src/tests/charbuffer/.gitignore b/streamingvisitors/src/tests/charbuffer/.gitignore new file mode 100644 index 00000000000..2c980038fb5 --- /dev/null +++ b/streamingvisitors/src/tests/charbuffer/.gitignore @@ -0,0 +1,4 @@ +.depend +Makefile +charbuffer_test +vsm_charbuffer_test_app diff --git a/streamingvisitors/src/tests/charbuffer/CMakeLists.txt b/streamingvisitors/src/tests/charbuffer/CMakeLists.txt new file mode 100644 index 00000000000..5d0c0068d37 --- /dev/null +++ b/streamingvisitors/src/tests/charbuffer/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vsm_charbuffer_test_app TEST + SOURCES + charbuffer.cpp + DEPENDS + streamingvisitors +) +vespa_add_test(NAME vsm_charbuffer_test_app COMMAND vsm_charbuffer_test_app) diff --git a/streamingvisitors/src/tests/charbuffer/charbuffer.cpp b/streamingvisitors/src/tests/charbuffer/charbuffer.cpp new file mode 100644 index 00000000000..736d35459cb --- /dev/null +++ b/streamingvisitors/src/tests/charbuffer/charbuffer.cpp @@ -0,0 +1,80 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/vespalib/testkit/testapp.h> + +#include <vespa/vsm/common/charbuffer.h> + +namespace vsm { + +class CharBufferTest : public vespalib::TestApp +{ +private: + void test(); +public: + int Main() override; +}; + +void +CharBufferTest::test() +{ + { // empty + CharBuffer buf; + EXPECT_EQUAL(buf.getLength(), 0u); + EXPECT_EQUAL(buf.getPos(), 0u); + EXPECT_EQUAL(buf.getRemaining(), 0u); + } + { // explicit length + CharBuffer buf(8); + EXPECT_EQUAL(buf.getLength(), 8u); + EXPECT_EQUAL(buf.getPos(), 0u); + EXPECT_EQUAL(buf.getRemaining(), 8u); + } + { // resize + CharBuffer buf(8); + EXPECT_EQUAL(buf.getLength(), 8u); + buf.resize(16); + EXPECT_EQUAL(buf.getLength(), 16u); + buf.resize(8); + EXPECT_EQUAL(buf.getLength(), 16u); + } + { // put with triggered resize + CharBuffer buf(8); + buf.put("123456", 6); + EXPECT_EQUAL(buf.getLength(), 8u); + EXPECT_EQUAL(buf.getPos(), 6u); + EXPECT_EQUAL(buf.getRemaining(), 2u); + EXPECT_EQUAL(std::string(buf.getBuffer(), buf.getPos()), "123456"); + buf.put("789", 3); + EXPECT_EQUAL(buf.getLength(), 12u); + EXPECT_EQUAL(buf.getPos(), 9u); + EXPECT_EQUAL(buf.getRemaining(), 3u); + EXPECT_EQUAL(std::string(buf.getBuffer(), buf.getPos()), "123456789"); + buf.put('a'); + EXPECT_EQUAL(buf.getLength(), 12u); + EXPECT_EQUAL(buf.getPos(), 10u); + EXPECT_EQUAL(buf.getRemaining(), 2u); + EXPECT_EQUAL(std::string(buf.getBuffer(), buf.getPos()), "123456789a"); + buf.reset(); + EXPECT_EQUAL(buf.getLength(), 12u); + EXPECT_EQUAL(buf.getPos(), 0u); + EXPECT_EQUAL(buf.getRemaining(), 12u); + buf.put("bcd", 3); + EXPECT_EQUAL(buf.getLength(), 12u); + EXPECT_EQUAL(buf.getPos(), 3u); + EXPECT_EQUAL(buf.getRemaining(), 9u); + EXPECT_EQUAL(std::string(buf.getBuffer(), buf.getPos()), "bcd"); + } +} + +int +CharBufferTest::Main() +{ + TEST_INIT("charbuffer_test"); + + test(); + + TEST_DONE(); +} + +} + +TEST_APPHOOK(vsm::CharBufferTest); diff --git a/streamingvisitors/src/tests/config/mail.cfg b/streamingvisitors/src/tests/config/mail.cfg 
new file mode 100644 index 00000000000..ce830beac23 --- /dev/null +++ b/streamingvisitors/src/tests/config/mail.cfg @@ -0,0 +1,116 @@ +datatype[2] +datatype[0].id 1012 +datatype[0].arraytype[1] +datatype[0].arraytype[0].datatype 12 +datatype[1].id 1013 +datatype[1].arraytype[1] +datatype[1].arraytype[0].datatype 13 +documenttype[1] +documenttype[0].name mail +documenttype[0].version 0 +documenttype[0].inherits[0] +documenttype[0].field[26] +documenttype[0].field[0].name mailid +documenttype[0].field[0].id 2 +documenttype[0].field[0].header true +documenttype[0].field[0].datatype 2 +documenttype[0].field[1].name date +documenttype[0].field[1].id 3 +documenttype[0].field[1].header true +documenttype[0].field[1].datatype 0 +documenttype[0].field[2].name from +documenttype[0].field[2].id 4 +documenttype[0].field[2].header true +documenttype[0].field[2].datatype 12 +documenttype[0].field[3].name replyto +documenttype[0].field[3].id 5 +documenttype[0].field[3].header true +documenttype[0].field[3].datatype 12 +documenttype[0].field[4].name to +documenttype[0].field[4].id 6 +documenttype[0].field[4].header true +documenttype[0].field[4].datatype 12 +documenttype[0].field[5].name cc +documenttype[0].field[5].id 7 +documenttype[0].field[5].header true +documenttype[0].field[5].datatype 12 +documenttype[0].field[6].name bcc +documenttype[0].field[6].id 8 +documenttype[0].field[6].header true +documenttype[0].field[6].datatype 12 +documenttype[0].field[7].name subject +documenttype[0].field[7].id 9 +documenttype[0].field[7].header true +documenttype[0].field[7].datatype 12 +documenttype[0].field[8].name body +documenttype[0].field[8].id 10 +documenttype[0].field[8].header false +documenttype[0].field[8].datatype 12 +documenttype[0].field[9].name attachmentcount +documenttype[0].field[9].id 11 +documenttype[0].field[9].header false +documenttype[0].field[9].datatype 0 +documenttype[0].field[10].name attachmentpartids +documenttype[0].field[10].id 12 
+documenttype[0].field[10].header false +documenttype[0].field[10].datatype 2 +documenttype[0].field[11].name attachmentsizes +documenttype[0].field[11].id 13 +documenttype[0].field[11].header false +documenttype[0].field[11].datatype 2 +documenttype[0].field[12].name attachmentnames +documenttype[0].field[12].id 14 +documenttype[0].field[12].header false +documenttype[0].field[12].datatype 2 +documenttype[0].field[13].name attachmenttypes +documenttype[0].field[13].id 15 +documenttype[0].field[13].header false +documenttype[0].field[13].datatype 2 +documenttype[0].field[14].name attachmentlanguages +documenttype[0].field[14].id 16 +documenttype[0].field[14].header false +documenttype[0].field[14].datatype 2 +documenttype[0].field[15].name attachmentcontent +documenttype[0].field[15].id 17 +documenttype[0].field[15].header false +documenttype[0].field[15].datatype 2 +documenttype[0].field[16].name bodylanguage +documenttype[0].field[16].id 18 +documenttype[0].field[16].header false +documenttype[0].field[16].datatype 2 +documenttype[0].field[17].name bodyencoding +documenttype[0].field[17].id 19 +documenttype[0].field[17].header false +documenttype[0].field[17].datatype 2 +documenttype[0].field[18].name collectionid +documenttype[0].field[18].id 20 +documenttype[0].field[18].header true +documenttype[0].field[18].datatype 4 +documenttype[0].field[19].name content +documenttype[0].field[19].id 21 +documenttype[0].field[19].header true +documenttype[0].field[19].datatype 12 +documenttype[0].field[20].name bodymeta +documenttype[0].field[20].id 50027053 +documenttype[0].field[20].header false +documenttype[0].field[20].datatype 13 +documenttype[0].field[21].name attachments +documenttype[0].field[21].id 1081629685 +documenttype[0].field[21].header false +documenttype[0].field[21].datatype 1012 +documenttype[0].field[22].name attachmentsmeta +documenttype[0].field[22].id 1203055625 +documenttype[0].field[22].header false +documenttype[0].field[22].datatype 1013 
+documenttype[0].field[23].name tolist +documenttype[0].field[23].id 1084918181 +documenttype[0].field[23].header false +documenttype[0].field[23].datatype 1012 +documenttype[0].field[24].name cclist +documenttype[0].field[24].id 1733332403 +documenttype[0].field[24].header false +documenttype[0].field[24].datatype 1012 +documenttype[0].field[25].name bcclist +documenttype[0].field[25].id 410546306 +documenttype[0].field[25].header false +documenttype[0].field[25].datatype 1012 diff --git a/streamingvisitors/src/tests/config/vsm.cfg b/streamingvisitors/src/tests/config/vsm.cfg new file mode 100644 index 00000000000..dc50447f623 --- /dev/null +++ b/streamingvisitors/src/tests/config/vsm.cfg @@ -0,0 +1,3 @@ +doctype file:../config/mail.cfg +storagecfg "" +vsmfields file:../config/vsmfields.cfg diff --git a/streamingvisitors/src/tests/config/vsmfields.cfg b/streamingvisitors/src/tests/config/vsmfields.cfg new file mode 100644 index 00000000000..30f1c8ed8b1 --- /dev/null +++ b/streamingvisitors/src/tests/config/vsmfields.cfg @@ -0,0 +1,297 @@ +threadsperquery 4 +documentverificationlevel=0 +searchall 1 +fieldspec[17] +fieldspec[0].name bcc +fieldspec[0].searchmethod AUTOUTF8 +fieldspec[0].arg1 "" +fieldspec[1].name cc +fieldspec[1].searchmethod AUTOUTF8 +fieldspec[1].arg1 "" +fieldspec[2].name from +fieldspec[2].searchmethod AUTOUTF8 +fieldspec[2].arg1 "" +fieldspec[3].name date +fieldspec[3].searchmethod INT32 +fieldspec[3].arg1 "" +fieldspec[4].name replyto +fieldspec[4].searchmethod AUTOUTF8 +fieldspec[4].arg1 "" +fieldspec[5].name subject +fieldspec[5].searchmethod AUTOUTF8 +fieldspec[5].arg1 "" +fieldspec[6].name to +fieldspec[6].searchmethod AUTOUTF8 +fieldspec[6].arg1 "" +fieldspec[7].name body +fieldspec[7].searchmethod AUTOUTF8 +fieldspec[7].arg1 "" +fieldspec[8].name bodymeta +fieldspec[8].searchmethod AUTOUTF8 +fieldspec[8].arg1 "" +fieldspec[9].name mailid +fieldspec[9].searchmethod AUTOUTF8 +fieldspec[9].arg1 "" +fieldspec[10].name attachmentcount 
+fieldspec[10].searchmethod INT32 +fieldspec[10].arg1 "" +fieldspec[11].name attachmentcontent +fieldspec[11].searchmethod AUTOUTF8 +fieldspec[11].arg1 "" +fieldspec[12].name attachmenttypes +fieldspec[12].searchmethod AUTOUTF8 +fieldspec[12].arg1 "" +fieldspec[13].name attachmentnames +fieldspec[13].searchmethod AUTOUTF8 +fieldspec[13].arg1 "" +fieldspec[14].name attachmentlanguages +fieldspec[14].searchmethod AUTOUTF8 +fieldspec[14].arg1 "" +fieldspec[15].name URI +fieldspec[15].searchmethod AUTOUTF8 +fieldspec[15].arg1 "" +fieldspec[16].name vsm_whichfieldmatched +fieldspec[16].searchmethod AUTOUTF8 +fieldspec[16].arg1 "" +index[26] +index[0].name default +index[0].field[10] +index[0].field[0].name from +index[0].field[1].name to +index[0].field[2].name cc +index[0].field[3].name bcc +index[0].field[4].name subject +index[0].field[5].name body +index[0].field[6].name attachmentcontent +index[0].field[7].name attachmentnames +index[0].field[8].name attachmenttypes +index[0].field[9].name date +index[1].name all +index[1].field[8] +index[1].field[0].name to +index[1].field[1].name cc +index[1].field[2].name bcc +index[1].field[3].name subject +index[1].field[4].name body +index[1].field[5].name attachmentcontent +index[1].field[6].name attachmentnames +index[1].field[7].name attachmenttypes +index[2].name header +index[2].field[6] +index[2].field[0].name from +index[2].field[1].name replyto +index[2].field[2].name to +index[2].field[3].name cc +index[2].field[4].name bcc +index[2].field[5].name subject +index[3].name senders +index[3].field[2] +index[3].field[0].name from +index[3].field[1].name replyto +index[4].name recipients +index[4].field[3] +index[4].field[0].name to +index[4].field[1].name cc +index[4].field[2].name bcc +index[5].name address +index[5].field[5] +index[5].field[0].name from +index[5].field[1].name replyto +index[5].field[2].name to +index[5].field[3].name cc +index[5].field[4].name bcc +index[6].name body +index[6].field[2] 
+index[6].field[0].name subject +index[6].field[1].name body +index[7].name meta +index[7].field[2] +index[7].field[0].name attachmentcontent +index[7].field[1].name attachmenttypes +index[8].name index1 +index[8].field[1] +index[8].field[0].name bcc +index[9].name index2 +index[9].field[2] +index[9].field[0].name bcc +index[9].field[1].name cc +index[10].name index3 +index[10].field[3] +index[10].field[0].name bcc +index[10].field[1].name cc +index[10].field[2].name from +index[11].name index4 +index[11].field[4] +index[11].field[0].name bcc +index[11].field[1].name cc +index[11].field[2].name from +index[11].field[3].name date +index[12].name index5 +index[12].field[5] +index[12].field[0].name bcc +index[12].field[1].name cc +index[12].field[2].name from +index[12].field[3].name date +index[12].field[4].name replyto +index[13].name index6 +index[13].field[6] +index[13].field[0].name bcc +index[13].field[1].name cc +index[13].field[2].name from +index[13].field[3].name date +index[13].field[4].name replyto +index[13].field[5].name subject +index[14].name index7 +index[14].field[7] +index[14].field[0].name bcc +index[14].field[1].name cc +index[14].field[2].name from +index[14].field[3].name date +index[14].field[4].name replyto +index[14].field[5].name subject +index[14].field[6].name to +index[15].name index8 +index[15].field[8] +index[15].field[0].name bcc +index[15].field[1].name cc +index[15].field[2].name from +index[15].field[3].name date +index[15].field[4].name replyto +index[15].field[5].name subject +index[15].field[6].name to +index[15].field[7].name body +index[16].name index9 +index[16].field[9] +index[16].field[0].name bcc +index[16].field[1].name cc +index[16].field[2].name from +index[16].field[3].name date +index[16].field[4].name replyto +index[16].field[5].name subject +index[16].field[6].name to +index[16].field[7].name body +index[16].field[8].name bodymeta +index[17].name index10 +index[17].field[10] +index[17].field[0].name bcc 
+index[17].field[1].name cc +index[17].field[2].name from +index[17].field[3].name date +index[17].field[4].name replyto +index[17].field[5].name subject +index[17].field[6].name to +index[17].field[7].name body +index[17].field[8].name bodymeta +index[17].field[9].name mailid +index[18].name index11 +index[18].field[11] +index[18].field[0].name bcc +index[18].field[1].name cc +index[18].field[2].name from +index[18].field[3].name date +index[18].field[4].name replyto +index[18].field[5].name subject +index[18].field[6].name to +index[18].field[7].name body +index[18].field[8].name bodymeta +index[18].field[9].name mailid +index[18].field[10].name attachmentcount +index[19].name index12 +index[19].field[12] +index[19].field[0].name bcc +index[19].field[1].name cc +index[19].field[2].name from +index[19].field[3].name date +index[19].field[4].name replyto +index[19].field[5].name subject +index[19].field[6].name to +index[19].field[7].name body +index[19].field[8].name bodymeta +index[19].field[9].name mailid +index[19].field[10].name attachmentcount +index[19].field[11].name attachmentcontent +index[20].name index13 +index[20].field[13] +index[20].field[0].name bcc +index[20].field[1].name cc +index[20].field[2].name from +index[20].field[3].name date +index[20].field[4].name replyto +index[20].field[5].name subject +index[20].field[6].name to +index[20].field[7].name body +index[20].field[8].name bodymeta +index[20].field[9].name mailid +index[20].field[10].name attachmentcount +index[20].field[11].name attachmentcontent +index[20].field[12].name attachmenttypes +index[21].name index14 +index[21].field[14] +index[21].field[0].name bcc +index[21].field[1].name cc +index[21].field[2].name from +index[21].field[3].name date +index[21].field[4].name replyto +index[21].field[5].name subject +index[21].field[6].name to +index[21].field[7].name body +index[21].field[8].name bodymeta +index[21].field[9].name mailid +index[21].field[10].name attachmentcount 
+index[21].field[11].name attachmentcontent +index[21].field[12].name attachmenttypes +index[21].field[13].name attachmentnames +index[22].name index15 +index[22].field[15] +index[22].field[0].name bcc +index[22].field[1].name cc +index[22].field[2].name from +index[22].field[3].name date +index[22].field[4].name replyto +index[22].field[5].name subject +index[22].field[6].name to +index[22].field[7].name body +index[22].field[8].name bodymeta +index[22].field[9].name mailid +index[22].field[10].name attachmentcount +index[22].field[11].name attachmentcontent +index[22].field[12].name attachmenttypes +index[22].field[13].name attachmentnames +index[22].field[14].name attachmentlanguages +index[23].name index16 +index[23].field[15] +index[23].field[0].name bcc +index[23].field[1].name cc +index[23].field[2].name from +index[23].field[3].name date +index[23].field[4].name replyto +index[23].field[5].name subject +index[23].field[6].name to +index[23].field[7].name body +index[23].field[8].name bodymeta +index[23].field[9].name mailid +index[23].field[10].name attachmentcount +index[23].field[11].name attachmentcontent +index[23].field[12].name attachmenttypes +index[23].field[13].name attachmentnames +index[23].field[14].name attachmentlanguages +index[24].name index17 +index[24].field[15] +index[24].field[0].name bcc +index[24].field[1].name cc +index[24].field[2].name from +index[24].field[3].name date +index[24].field[4].name replyto +index[24].field[5].name subject +index[24].field[6].name to +index[24].field[7].name body +index[24].field[8].name bodymeta +index[24].field[9].name mailid +index[24].field[10].name attachmentcount +index[24].field[11].name attachmentcontent +index[24].field[12].name attachmenttypes +index[24].field[13].name attachmentnames +index[24].field[14].name attachmentlanguages +index[25].name date +index[25].field[1] +index[25].field[0].name date diff --git a/streamingvisitors/src/tests/docsum/.gitignore 
b/streamingvisitors/src/tests/docsum/.gitignore new file mode 100644 index 00000000000..9a697a94de8 --- /dev/null +++ b/streamingvisitors/src/tests/docsum/.gitignore @@ -0,0 +1,4 @@ +.depend +Makefile +docsum_test +vsm_docsum_test_app diff --git a/streamingvisitors/src/tests/docsum/CMakeLists.txt b/streamingvisitors/src/tests/docsum/CMakeLists.txt new file mode 100644 index 00000000000..87c46409053 --- /dev/null +++ b/streamingvisitors/src/tests/docsum/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vsm_docsum_test_app TEST + SOURCES + docsum.cpp + DEPENDS + streamingvisitors +) +vespa_add_test(NAME vsm_docsum_test_app COMMAND vsm_docsum_test_app) diff --git a/streamingvisitors/src/tests/docsum/docsum.cpp b/streamingvisitors/src/tests/docsum/docsum.cpp new file mode 100644 index 00000000000..475489d2f5a --- /dev/null +++ b/streamingvisitors/src/tests/docsum/docsum.cpp @@ -0,0 +1,293 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/vespalib/testkit/testapp.h> +#include <vespa/document/fieldvalue/fieldvalues.h> +#include <vespa/document/datatype/structdatatype.h> +#include <vespa/document/datatype/weightedsetdatatype.h> +#include <vespa/document/datatype/mapdatatype.h> +#include <vespa/vsm/common/docsum.h> +#include <vespa/vsm/vsm/flattendocsumwriter.h> +#include <vespa/vsm/vsm/slimefieldwriter.h> + +using namespace document; + +namespace vsm { + +template <typename T> +class Vector : public std::vector<T> +{ +public: + Vector<T> & add(T v) { this->push_back(v); return *this; } +}; + +typedef Vector<std::string> StringList; +typedef Vector<std::pair<std::string, int32_t> > WeightedStringList; + + +class TestDocument : public vsm::Document +{ +private: + std::vector<FieldValueContainer> _fields; + +public: + TestDocument(const search::DocumentIdT & docId, size_t numFields) : vsm::Document(docId, numFields), _fields(numFields) {} + virtual bool setField(FieldIdT fId, document::FieldValue::UP fv) override { + if (fId < _fields.size()) { + _fields[fId].reset(fv.release()); + return true; + } + return false; + } + virtual const document::FieldValue * getField(FieldIdT fId) const override { + if (fId < _fields.size()) { + return _fields[fId].get(); + } + return NULL; + } +}; + + +class DocsumTest : public vespalib::TestApp +{ +private: + ArrayFieldValue createFieldValue(const StringList & fv); + WeightedSetFieldValue createFieldValue(const WeightedStringList & fv); + + void assertFlattenDocsumWriter(const FieldValue & fv, const std::string & exp) { + FlattenDocsumWriter fdw; + assertFlattenDocsumWriter(fdw, fv, exp); + } + void assertFlattenDocsumWriter(FlattenDocsumWriter & fdw, const FieldValue & fv, const std::string & exp); + void assertSlimeFieldWriter(const FieldValue & fv, const std::string & exp) { + SlimeFieldWriter sfw; + TEST_DO(assertSlimeFieldWriter(sfw, fv, exp)); + } + void assertSlimeFieldWriter(SlimeFieldWriter & sfw, const FieldValue & fv, const std::string & exp); 
+ + void testFlattenDocsumWriter(); + void testSlimeFieldWriter(); + void requireThatSlimeFieldWriterHandlesMap(); + void testDocSumCache(); + +public: + int Main() override; +}; + +ArrayFieldValue +DocsumTest::createFieldValue(const StringList & fv) +{ + + static ArrayDataType type(*DataType::STRING); + ArrayFieldValue afv(type); + for (size_t i = 0; i < fv.size(); ++i) { + afv.add(StringFieldValue(fv[i])); + } + return afv; +} + +WeightedSetFieldValue +DocsumTest::createFieldValue(const WeightedStringList & fv) +{ + static WeightedSetDataType type(*DataType::STRING, false, false); + WeightedSetFieldValue wsfv(type); + for (size_t i = 0; i < fv.size(); ++i) { + wsfv.add(StringFieldValue(fv[i].first), fv[i].second); + } + return wsfv; +} + +void +DocsumTest::assertFlattenDocsumWriter(FlattenDocsumWriter & fdw, const FieldValue & fv, const std::string & exp) +{ + FieldPath empty; + fv.iterateNested(empty.getFullRange(), fdw); + std::string actual(fdw.getResult().getBuffer(), fdw.getResult().getPos()); + EXPECT_EQUAL(actual, exp); +} + +void +DocsumTest::assertSlimeFieldWriter(SlimeFieldWriter & sfw, const FieldValue & fv, const std::string & exp) +{ + sfw.convert(fv); + + vespalib::Slime gotSlime; + vespalib::Memory serialized(sfw.out()); + size_t decodeRes = vespalib::slime::BinaryFormat::decode(serialized, gotSlime); + ASSERT_EQUAL(decodeRes, serialized.size); + + vespalib::Slime expSlime; + size_t used = vespalib::slime::JsonFormat::decode(exp, expSlime); + EXPECT_TRUE(used > 0); + EXPECT_EQUAL(expSlime, gotSlime); +} + +void +DocsumTest::testFlattenDocsumWriter() +{ + { // basic tests + TEST_DO(assertFlattenDocsumWriter(StringFieldValue("foo bar"), "foo bar")); + TEST_DO(assertFlattenDocsumWriter(RawFieldValue("foo bar"), "foo bar")); + TEST_DO(assertFlattenDocsumWriter(BoolFieldValue(true), "true")); + TEST_DO(assertFlattenDocsumWriter(BoolFieldValue(false), "false")); + TEST_DO(assertFlattenDocsumWriter(LongFieldValue(123456789), "123456789")); + 
 TEST_DO(assertFlattenDocsumWriter(createFieldValue(StringList().add("foo bar").add("baz").add(" qux ")), + "foo bar baz qux ")); + } + { // test multiple invocations + FlattenDocsumWriter fdw("#"); + TEST_DO(assertFlattenDocsumWriter(fdw, StringFieldValue("foo"), "foo")); + TEST_DO(assertFlattenDocsumWriter(fdw, StringFieldValue("bar"), "foo#bar")); + fdw.clear(); + TEST_DO(assertFlattenDocsumWriter(fdw, StringFieldValue("baz"), "baz")); + TEST_DO(assertFlattenDocsumWriter(fdw, StringFieldValue("qux"), "baz qux")); + } + { // test resizing + FlattenDocsumWriter fdw("#"); + EXPECT_EQUAL(fdw.getResult().getPos(), 0u); + EXPECT_EQUAL(fdw.getResult().getLength(), 32u); + TEST_DO(assertFlattenDocsumWriter(fdw, StringFieldValue("aaaabbbbccccddddeeeeffffgggghhhh"), + "aaaabbbbccccddddeeeeffffgggghhhh")); + EXPECT_EQUAL(fdw.getResult().getPos(), 32u); + EXPECT_EQUAL(fdw.getResult().getLength(), 32u); + TEST_DO(assertFlattenDocsumWriter(fdw, StringFieldValue("aaaa"), "aaaabbbbccccddddeeeeffffgggghhhh#aaaa")); + EXPECT_EQUAL(fdw.getResult().getPos(), 37u); + EXPECT_TRUE(fdw.getResult().getLength() >= 37u); + fdw.clear(); + EXPECT_EQUAL(fdw.getResult().getPos(), 0u); + EXPECT_TRUE(fdw.getResult().getLength() >= 37u); + } +} + +void +DocsumTest::testSlimeFieldWriter() +{ + { // basic types + assertSlimeFieldWriter(LongFieldValue(123456789), "123456789"); + assertSlimeFieldWriter(BoolFieldValue(true), "true"); + assertSlimeFieldWriter(BoolFieldValue(false), "false"); + assertSlimeFieldWriter(DoubleFieldValue(12.34), "12.34"); + assertSlimeFieldWriter(StringFieldValue("foo bar"), "\"foo bar\""); + } + { // collection field values + assertSlimeFieldWriter(createFieldValue(StringList().add("foo").add("bar").add("baz")), + "[\"foo\",\"bar\",\"baz\"]"); + assertSlimeFieldWriter(createFieldValue(WeightedStringList().add(std::make_pair("bar", 20)). + add(std::make_pair("baz", 30)). 
+ add(std::make_pair("foo", 10))), + "[{item:\"bar\",weight:20},{item:\"baz\",weight:30},{item:\"foo\",weight:10}]"); + } + { // struct field value + StructDataType subType("substruct"); + Field fd("d", 0, *DataType::STRING); + Field fe("e", 1, *DataType::STRING); + subType.addField(fd); + subType.addField(fe); + StructFieldValue subValue(subType); + subValue.setValue(fd, StringFieldValue("baz")); + subValue.setValue(fe, StringFieldValue("qux")); + + StructDataType type("struct"); + Field fa("a", 0, *DataType::STRING); + Field fb("b", 1, *DataType::STRING); + Field fc("c", 2, subType); + type.addField(fa); + type.addField(fb); + type.addField(fc); + StructFieldValue value(type); + value.setValue(fa, StringFieldValue("foo")); + value.setValue(fb, StringFieldValue("bar")); + value.setValue(fc, subValue); + + + { // select a subset and then all + SlimeFieldWriter sfw; + DocsumFieldSpec::FieldIdentifierVector fields; + { + FieldPath path; + type.buildFieldPath(path, "a"); + fields.push_back(DocsumFieldSpec::FieldIdentifier(0, std::move(path))); + } + { + FieldPath path; + type.buildFieldPath(path, "c.e"); + fields.push_back(DocsumFieldSpec::FieldIdentifier(0, std::move(path))); + } + sfw.setInputFields(fields); + TEST_DO(assertSlimeFieldWriter(sfw, value, "{\"a\":\"foo\",\"c\":{\"e\":\"qux\"}}")); + sfw.clear(); + TEST_DO(assertSlimeFieldWriter(sfw, value, "{\"a\":\"foo\",\"b\":\"bar\",\"c\":{\"d\":\"baz\",\"e\":\"qux\"}}")); + } + + { // multiple invocations + SlimeFieldWriter sfw; + TEST_DO(assertSlimeFieldWriter(sfw, StringFieldValue("foo"), "\"foo\"")); + sfw.clear(); + TEST_DO(assertSlimeFieldWriter(sfw, StringFieldValue("bar"), "\"bar\"")); + sfw.clear(); + TEST_DO(assertSlimeFieldWriter(sfw, StringFieldValue("baz"), "\"baz\"")); + } + + } +} + +void +DocsumTest::requireThatSlimeFieldWriterHandlesMap() +{ + { // map<string, string> + MapDataType mapType(*DataType::STRING, *DataType::STRING); + MapFieldValue mapfv(mapType); + 
EXPECT_TRUE(mapfv.put(StringFieldValue("k1"), StringFieldValue("v1"))); + EXPECT_TRUE(mapfv.put(StringFieldValue("k2"), StringFieldValue("v2"))); + assertSlimeFieldWriter(mapfv, "[{\"key\":\"k1\",\"value\":\"v1\"},{\"key\":\"k2\",\"value\":\"v2\"}]"); + } + { // map<string, struct> + StructDataType structType("struct"); + Field fa("a", 0, *DataType::STRING); + Field fb("b", 1, *DataType::STRING); + structType.addField(fa); + structType.addField(fb); + StructFieldValue structValue(structType); + structValue.setValue(fa, StringFieldValue("foo")); + structValue.setValue(fb, StringFieldValue("bar")); + MapDataType mapType(*DataType::STRING, structType); + MapFieldValue mapfv(mapType); + EXPECT_TRUE(mapfv.put(StringFieldValue("k1"), structValue)); + { // select a subset and then all + SlimeFieldWriter sfw; + DocsumFieldSpec::FieldIdentifierVector fields; + { + FieldPath path; + mapType.buildFieldPath(path, "value.b"); + fields.push_back(DocsumFieldSpec::FieldIdentifier(0, std::move(path))); + } + sfw.setInputFields(fields); + TEST_DO(assertSlimeFieldWriter(sfw, mapfv, "[{\"key\":\"k1\",\"value\":{\"b\":\"bar\"}}]")); + { + FieldPath path; + mapType.buildFieldPath(path, "{k1}.a"); + fields[0] = DocsumFieldSpec::FieldIdentifier(0, std::move(path)); + } + sfw.clear(); + sfw.setInputFields(fields); + TEST_DO(assertSlimeFieldWriter(sfw, mapfv, "[{\"key\":\"k1\",\"value\":{\"a\":\"foo\"}}]")); + sfw.clear(); // all fields implicit + TEST_DO(assertSlimeFieldWriter(sfw, mapfv, "[{\"key\":\"k1\",\"value\":{\"a\":\"foo\",\"b\":\"bar\"}}]")); + } + } +} + +int +DocsumTest::Main() +{ + TEST_INIT("docsum_test"); + + TEST_DO(testFlattenDocsumWriter()); + TEST_DO(testSlimeFieldWriter()); + TEST_DO(requireThatSlimeFieldWriterHandlesMap()); + + TEST_DONE(); +} + +} + +TEST_APPHOOK(vsm::DocsumTest); + diff --git a/streamingvisitors/src/tests/document/.gitignore b/streamingvisitors/src/tests/document/.gitignore new file mode 100644 index 00000000000..d47781eff63 --- /dev/null +++ 
b/streamingvisitors/src/tests/document/.gitignore @@ -0,0 +1,4 @@ +.depend +Makefile +document_test +vsm_document_test_app diff --git a/streamingvisitors/src/tests/document/CMakeLists.txt b/streamingvisitors/src/tests/document/CMakeLists.txt new file mode 100644 index 00000000000..5ea12dc5e2d --- /dev/null +++ b/streamingvisitors/src/tests/document/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vsm_document_test_app TEST + SOURCES + document.cpp + DEPENDS + streamingvisitors +) +vespa_add_test(NAME vsm_document_test_app COMMAND vsm_document_test_app) diff --git a/streamingvisitors/src/tests/document/document.cpp b/streamingvisitors/src/tests/document/document.cpp new file mode 100644 index 00000000000..1e97d232a64 --- /dev/null +++ b/streamingvisitors/src/tests/document/document.cpp @@ -0,0 +1,129 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include <vespa/vespalib/testkit/testapp.h> + +#include <vespa/document/fieldvalue/fieldvalues.h> +#include <vespa/document/datatype/documenttype.h> +#include <vespa/vsm/common/storagedocument.h> +#include <vespa/vespalib/stllike/asciistream.h> + +using namespace document; + +namespace vsm { + +class DocumentTest : public vespalib::TestApp +{ +private: + void testStorageDocument(); + void testStringFieldIdTMap(); +public: + int Main() override; +}; + +void +DocumentTest::testStorageDocument() +{ + DocumentType dt("testdoc", 0); + + Field fa("a", 0, *DataType::STRING); + Field fb("b", 1, *DataType::STRING); + dt.addField(fa); + dt.addField(fb); + + document::Document::UP doc(new document::Document(dt, DocumentId())); + doc->setValue(fa, StringFieldValue("foo")); + doc->setValue(fb, StringFieldValue("bar")); + + SharedFieldPathMap fpmap(new FieldPathMapT()); + fpmap->emplace_back(); + dt.buildFieldPath(fpmap->back(),"a"); + fpmap->emplace_back(); + dt.buildFieldPath(fpmap->back(), "b"); + fpmap->emplace_back(); + ASSERT_TRUE((*fpmap)[0].size() == 1); + ASSERT_TRUE((*fpmap)[1].size() == 1); + ASSERT_TRUE((*fpmap)[2].size() == 0); + + StorageDocument sdoc(std::move(doc), fpmap, 3); + ASSERT_TRUE(sdoc.valid()); + + EXPECT_EQUAL(std::string("foo"), sdoc.getField(0)->getAsString()); + EXPECT_EQUAL(std::string("bar"), sdoc.getField(1)->getAsString()); + EXPECT_TRUE(sdoc.getField(2) == nullptr); + // test caching + EXPECT_EQUAL(std::string("foo"), sdoc.getField(0)->getAsString()); + EXPECT_EQUAL(std::string("bar"), sdoc.getField(1)->getAsString()); + EXPECT_TRUE(sdoc.getField(2) == nullptr); + + // set new values + EXPECT_TRUE(sdoc.setField(0, FieldValue::UP(new StringFieldValue("baz")))); + EXPECT_EQUAL(std::string("baz"), sdoc.getField(0)->getAsString()); + EXPECT_EQUAL(std::string("bar"), sdoc.getField(1)->getAsString()); + EXPECT_TRUE(sdoc.getField(2) == nullptr); + EXPECT_TRUE(sdoc.setField(1, FieldValue::UP(new StringFieldValue("qux")))); + 
EXPECT_EQUAL(std::string("baz"), sdoc.getField(0)->getAsString()); + EXPECT_EQUAL(std::string("qux"), sdoc.getField(1)->getAsString()); + EXPECT_TRUE(sdoc.getField(2) == nullptr); + EXPECT_TRUE(sdoc.setField(2, FieldValue::UP(new StringFieldValue("quux")))); + EXPECT_EQUAL(std::string("baz"), sdoc.getField(0)->getAsString()); + EXPECT_EQUAL(std::string("qux"), sdoc.getField(1)->getAsString()); + EXPECT_EQUAL(std::string("quux"), sdoc.getField(2)->getAsString()); + + EXPECT_TRUE(!sdoc.setField(3, FieldValue::UP(new StringFieldValue("thud")))); + + SharedFieldPathMap fim; + StorageDocument s2(std::make_unique<document::Document>(), fim, 0); + EXPECT_EQUAL(IdString().toString(), s2.docDoc().getId().toString()); +} + +void DocumentTest::testStringFieldIdTMap() +{ + StringFieldIdTMap m; + EXPECT_EQUAL(0u, m.highestFieldNo()); + EXPECT_TRUE(StringFieldIdTMap::npos == m.fieldNo("unknown")); + m.add("f1"); + EXPECT_EQUAL(0u, m.fieldNo("f1")); + EXPECT_EQUAL(1u, m.highestFieldNo()); + m.add("f1"); + EXPECT_EQUAL(0u, m.fieldNo("f1")); + EXPECT_EQUAL(1u, m.highestFieldNo()); + m.add("f2"); + EXPECT_EQUAL(1u, m.fieldNo("f2")); + EXPECT_EQUAL(2u, m.highestFieldNo()); + m.add("f3", 7); + EXPECT_EQUAL(7u, m.fieldNo("f3")); + EXPECT_EQUAL(8u, m.highestFieldNo()); + m.add("f3"); + EXPECT_EQUAL(7u, m.fieldNo("f3")); + EXPECT_EQUAL(8u, m.highestFieldNo()); + m.add("f2", 13); + EXPECT_EQUAL(13u, m.fieldNo("f2")); + EXPECT_EQUAL(14u, m.highestFieldNo()); + m.add("f4"); + EXPECT_EQUAL(3u, m.fieldNo("f4")); + EXPECT_EQUAL(14u, m.highestFieldNo()); + { + vespalib::asciistream os; + StringFieldIdTMap t; + t.add("b"); + t.add("a"); + os << t; + EXPECT_EQUAL(vespalib::string("a = 1\nb = 0\n"), os.str()); + } + +} + +int +DocumentTest::Main() +{ + TEST_INIT("document_test"); + + testStorageDocument(); + testStringFieldIdTMap(); + + TEST_DONE(); +} + +} + +TEST_APPHOOK(vsm::DocumentTest); + diff --git a/streamingvisitors/src/tests/hitcollector/CMakeLists.txt 
b/streamingvisitors/src/tests/hitcollector/CMakeLists.txt index f25ab348265..dbec820a462 100644 --- a/streamingvisitors/src/tests/hitcollector/CMakeLists.txt +++ b/streamingvisitors/src/tests/hitcollector/CMakeLists.txt @@ -3,6 +3,6 @@ vespa_add_executable(streamingvisitors_hitcollector_test_app TEST SOURCES hitcollector_test.cpp DEPENDS - streamingvisitors_searchvisitor + streamingvisitors ) vespa_add_test(NAME streamingvisitors_hitcollector_test_app COMMAND streamingvisitors_hitcollector_test_app) diff --git a/streamingvisitors/src/tests/matching_elements_filler/CMakeLists.txt b/streamingvisitors/src/tests/matching_elements_filler/CMakeLists.txt index ef93d551912..5cc2977b3c3 100644 --- a/streamingvisitors/src/tests/matching_elements_filler/CMakeLists.txt +++ b/streamingvisitors/src/tests/matching_elements_filler/CMakeLists.txt @@ -3,7 +3,7 @@ vespa_add_executable(streamingvisitors_matching_elements_filler_test_app TEST SOURCES matching_elements_filler_test.cpp DEPENDS - streamingvisitors_searchvisitor + streamingvisitors GTest::GTest ) vespa_add_test(NAME streamingvisitors_matching_elements_filler_test_app COMMAND streamingvisitors_matching_elements_filler_test_app) diff --git a/streamingvisitors/src/tests/querywrapper/CMakeLists.txt b/streamingvisitors/src/tests/querywrapper/CMakeLists.txt index 7cae60e6a11..e0131d0c6cc 100644 --- a/streamingvisitors/src/tests/querywrapper/CMakeLists.txt +++ b/streamingvisitors/src/tests/querywrapper/CMakeLists.txt @@ -3,6 +3,6 @@ vespa_add_executable(streamingvisitors_querywrapper_test_app TEST SOURCES querywrapper_test.cpp DEPENDS - streamingvisitors_searchvisitor + streamingvisitors ) vespa_add_test(NAME streamingvisitors_querywrapper_test_app COMMAND streamingvisitors_querywrapper_test_app) diff --git a/streamingvisitors/src/tests/searcher/.gitignore b/streamingvisitors/src/tests/searcher/.gitignore new file mode 100644 index 00000000000..52a56dff405 --- /dev/null +++ b/streamingvisitors/src/tests/searcher/.gitignore @@ 
-0,0 +1,4 @@ +.depend +Makefile +searcher_test +vsm_searcher_test_app diff --git a/streamingvisitors/src/tests/searcher/CMakeLists.txt b/streamingvisitors/src/tests/searcher/CMakeLists.txt new file mode 100644 index 00000000000..2277f5ef55f --- /dev/null +++ b/streamingvisitors/src/tests/searcher/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(vsm_searcher_test_app TEST + SOURCES + searcher_test.cpp + DEPENDS + streamingvisitors +) +vespa_add_test(NAME vsm_searcher_test_app COMMAND vsm_searcher_test_app) diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp new file mode 100644 index 00000000000..34fa66eaa90 --- /dev/null +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -0,0 +1,864 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vespalib/testkit/testapp.h> + +#include <vespa/vsm/searcher/fieldsearcher.h> +#include <vespa/vsm/searcher/floatfieldsearcher.h> +#include <vespa/vsm/searcher/futf8strchrfieldsearcher.h> +#include <vespa/vsm/searcher/intfieldsearcher.h> +#include <vespa/vsm/searcher/boolfieldsearcher.h> +#include <vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h> +#include <vespa/vsm/searcher/utf8exactstringfieldsearcher.h> +#include <vespa/vsm/searcher/utf8substringsearcher.h> +#include <vespa/vsm/searcher/utf8substringsnippetmodifier.h> +#include <vespa/vsm/searcher/utf8suffixstringfieldsearcher.h> +#include <vespa/vsm/vsm/snippetmodifier.h> +#include <vespa/searchlib/query/streaming/queryterm.h> +#include <vespa/document/fieldvalue/fieldvalues.h> + +using namespace document; +using search::streaming::HitList; +using search::streaming::QueryNodeResultFactory; +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; +using TermType = QueryTerm::Type; +using 
namespace vsm; + +template <typename T> +class Vector : public std::vector<T> +{ +public: + Vector() : std::vector<T>() {} + Vector<T> & add(T v) { this->push_back(v); return *this; } +}; + +typedef Vector<size_t> Hits; +typedef Vector<std::string> StringList; +typedef Vector<Hits> HitsList; +typedef Vector<bool> BoolList; +typedef Vector<int64_t> LongList; +typedef Vector<float> FloatList; +typedef QueryTerm::FieldInfo QTFieldInfo; +typedef Vector<QTFieldInfo> FieldInfoList; + +class String +{ +private: + const std::string & _str; +public: + String(const std::string & str) : _str(str) {} + bool operator==(const String & rhs) const { + return _str == rhs._str; + } +}; + +class Query +{ +private: + void setupQuery(const StringList & terms) { + for (size_t i = 0; i < terms.size(); ++i) { + ParsedQueryTerm pqt = parseQueryTerm(terms[i]); + ParsedTerm pt = parseTerm(pqt.second); + qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second)); + } + for (size_t i = 0; i < qtv.size(); ++i) { + qtl.push_back(qtv[i].get()); + } + } +public: + typedef std::pair<std::string, std::string> ParsedQueryTerm; + typedef std::pair<std::string, TermType> ParsedTerm; + QueryNodeResultFactory eqnr; + std::vector<QueryTerm::UP> qtv; + QueryTermList qtl; + Query(const StringList & terms); + ~Query(); + static ParsedQueryTerm parseQueryTerm(const std::string & queryTerm) { + size_t i = queryTerm.find(':'); + if (i != std::string::npos) { + return ParsedQueryTerm(queryTerm.substr(0, i), queryTerm.substr(i + 1)); + } + return ParsedQueryTerm(std::string(), queryTerm); + } + static ParsedTerm parseTerm(const std::string & term) { + if (term[0] == '*' && term[term.size() - 1] == '*') { + return std::make_pair(term.substr(1, term.size() - 2), TermType::SUBSTRINGTERM); + } else if (term[0] == '*') { + return std::make_pair(term.substr(1, term.size() - 1), TermType::SUFFIXTERM); + } else if (term[term.size() - 1] == '*') { + return 
std::make_pair(term.substr(0, term.size() - 1), TermType::PREFIXTERM); + } else { + return std::make_pair(term, TermType::WORD); + } + } +}; + +Query::Query(const StringList & terms) : eqnr(), qtv(), qtl() { + setupQuery(terms); +} +Query::~Query() = default; + +struct SnippetModifierSetup +{ + Query query; + UTF8SubstringSnippetModifier::SP searcher; + SharedSearcherBuf buf; + SnippetModifier modifier; + explicit SnippetModifierSetup(const StringList & terms); + ~SnippetModifierSetup(); +}; + +SnippetModifierSetup::SnippetModifierSetup(const StringList & terms) + : query(terms), + searcher(new UTF8SubstringSnippetModifier()), + buf(new SearcherBuf(8)), + modifier(searcher) +{ + searcher->prepare(query.qtl, buf); +} +SnippetModifierSetup::~SnippetModifierSetup() = default; + +// helper functions +ArrayFieldValue getFieldValue(const StringList &fv); +ArrayFieldValue getFieldValue(const LongList &fv); +ArrayFieldValue getFieldValue(const FloatList &fv); + +bool assertMatchTermSuffix(const std::string &term, const std::string &word); +void assertSnippetModifier(const StringList &query, const std::string &fv, const std::string &exp); +void assertSnippetModifier(SnippetModifierSetup &setup, const FieldValue &fv, const std::string &exp); +void assertQueryTerms(const SnippetModifierManager &man, FieldIdT fId, const StringList &terms); +void assertNumeric(FieldSearcher &fs, const StringList &query, const FieldValue &fv, const BoolList &exp); +std::vector<QueryTerm::UP> performSearch(FieldSearcher &fs, const StringList &query, const FieldValue &fv); +void assertSearch(FieldSearcher &fs, const StringList &query, const FieldValue &fv, const HitsList &exp); +bool assertCountWords(size_t numWords, const std::string &field); +bool assertFieldInfo(FieldSearcher &fs, const StringList &query, const FieldValue &fv, const FieldInfoList &exp); + +void assertString(StrChrFieldSearcher &fs, const StringList &query, const std::string &field, const HitsList &exp) { + assertSearch(fs, 
query, StringFieldValue(field), exp); +} + +void assertString(StrChrFieldSearcher &fs, const StringList &query, const StringList &field, const HitsList &exp) { + assertSearch(fs, query, getFieldValue(field), exp); +} + +void assertString(StrChrFieldSearcher &fs, const std::string &term, const std::string &field, const Hits &exp) { + assertString(fs, StringList().add(term), field, HitsList().add(exp)); +} +void assertString(StrChrFieldSearcher &fs, const std::string &term, const StringList &field, const Hits &exp) { + assertString(fs, StringList().add(term), field, HitsList().add(exp)); +} + +void assertInt(IntFieldSearcher & fs, const StringList &query, int64_t field, const BoolList &exp) { + assertNumeric(fs, query, LongFieldValue(field), exp); +} + +void assertInt(IntFieldSearcher & fs, const std::string &term, int64_t field, bool exp) { + assertInt(fs, StringList().add(term), field, BoolList().add(exp)); +} + +void assertBool(BoolFieldSearcher & fs, const StringList &query, bool field, const BoolList &exp) { + assertNumeric(fs, query, BoolFieldValue(field), exp); +} +void assertBool(BoolFieldSearcher & fs, const std::string &term, bool field, bool exp) { + assertBool(fs, StringList().add(term), field, BoolList().add(exp)); +} + +void assertInt(IntFieldSearcher & fs, const StringList &query, const LongList &field, const HitsList &exp) { + assertSearch(fs, query, getFieldValue(field), exp); +} + +void assertInt(IntFieldSearcher & fs, const std::string &term, const LongList &field, const Hits &exp) { + assertInt(fs, StringList().add(term), field, HitsList().add(exp)); +} + +void assertFloat(FloatFieldSearcher & fs, const StringList &query, float field, const BoolList &exp) { + assertNumeric(fs, query, FloatFieldValue(field), exp); +} + +void assertFloat(FloatFieldSearcher & fs, const std::string &term, float field, bool exp) { + assertFloat(fs, StringList().add(term), field, BoolList().add(exp)); +} + +void assertFloat(FloatFieldSearcher & fs, const StringList 
&query, const FloatList &field, const HitsList &exp) { + assertSearch(fs, query, getFieldValue(field), exp); +} + +void assertFloat(FloatFieldSearcher & fs, const std::string &term, const FloatList &field, const Hits &exp) { + assertFloat(fs, StringList().add(term), field, HitsList().add(exp)); +} + +bool +assertFieldInfo(StrChrFieldSearcher &fs, const StringList &query, const std::string &fv, const FieldInfoList &exp) { + return assertFieldInfo(fs, query, StringFieldValue(fv), exp); +} + +bool +assertFieldInfo(StrChrFieldSearcher &fs, const StringList &query, const StringList &fv, const FieldInfoList &exp) { + return assertFieldInfo(fs, query, getFieldValue(fv), exp); +} +bool +assertFieldInfo(StrChrFieldSearcher &fs, const std::string &term, const StringList &fv, const QTFieldInfo &exp) { + return assertFieldInfo(fs, StringList().add(term), fv, FieldInfoList().add(exp)); +} + +bool +assertFieldInfo(StrChrFieldSearcher &fs, const std::string &term, const std::string &fv, const QTFieldInfo &exp) { + return assertFieldInfo(fs, StringList().add(term), fv, FieldInfoList().add(exp)); +} + +void assertFieldInfo(IntFieldSearcher & fs, const StringList &query, int64_t fv, const FieldInfoList &exp) { + assertFieldInfo(fs, query, LongFieldValue(fv), exp); +} + +void assertFieldInfo(IntFieldSearcher & fs, const StringList &query, const LongList &fv, const FieldInfoList &exp) { + assertFieldInfo(fs, query, getFieldValue(fv), exp); +} + +void assertFieldInfo(IntFieldSearcher & fs, const std::string &term, int64_t fv, const QTFieldInfo &exp) { + assertFieldInfo(fs, StringList().add(term), fv, FieldInfoList().add(exp)); +} + +void assertFieldInfo(IntFieldSearcher & fs, const std::string &term, const LongList &fv, const QTFieldInfo &exp) { + assertFieldInfo(fs, StringList().add(term), fv, FieldInfoList().add(exp)); +} + +void assertFieldInfo(FloatFieldSearcher & fs, const StringList &query, float fv, const FieldInfoList &exp) { + assertFieldInfo(fs, query, FloatFieldValue(fv), 
exp); +} + +void +assertFieldInfo(FloatFieldSearcher & fs, const StringList &query, const FloatList &fv, const FieldInfoList &exp) { + assertFieldInfo(fs, query, getFieldValue(fv), exp); +} + +/** float field searcher **/ +void assertFieldInfo(FloatFieldSearcher & fs, const std::string &term, float fv, const QTFieldInfo &exp) { + assertFieldInfo(fs, StringList().add(term), fv, FieldInfoList().add(exp)); +} + +void assertFieldInfo(FloatFieldSearcher & fs, const std::string &term, const FloatList &fv, const QTFieldInfo &exp) { + assertFieldInfo(fs, StringList().add(term), fv, FieldInfoList().add(exp)); +} + + +/** snippet modifer searcher **/ +void assertSnippetModifier(const std::string &term, const std::string &fv, const std::string &exp) { + assertSnippetModifier(StringList().add(term), fv, exp); +} + + +ArrayFieldValue +getFieldValue(const StringList & fv) +{ + + static ArrayDataType type(*DataType::STRING); + ArrayFieldValue afv(type); + for (size_t i = 0; i < fv.size(); ++i) { + afv.add(StringFieldValue(fv[i])); + } + return afv; +} + +ArrayFieldValue +getFieldValue(const LongList & fv) +{ + static ArrayDataType type(*DataType::LONG); + ArrayFieldValue afv(type); + for (size_t i = 0; i < fv.size(); ++i) { + afv.add(LongFieldValue(fv[i])); + } + return afv; +} + +ArrayFieldValue +getFieldValue(const FloatList & fv) +{ + static ArrayDataType type(*DataType::FLOAT); + ArrayFieldValue afv(type); + for (size_t i = 0; i < fv.size(); ++i) { + afv.add(FloatFieldValue(fv[i])); + } + return afv; +} + +bool +assertMatchTermSuffix(const std::string & term, const std::string & word) +{ + QueryNodeResultFactory eqnr; + QueryTerm qa(eqnr.create(), term, "index", TermType::WORD); + QueryTerm qb(eqnr.create(), word, "index", TermType::WORD); + const ucs4_t * a; + size_t alen = qa.term(a); + const ucs4_t * b; + size_t blen = qb.term(b); + return UTF8StringFieldSearcherBase::matchTermSuffix(a, alen, b, blen); +} + +void +assertNumeric(FieldSearcher & fs, const StringList & query, 
const FieldValue & fv, const BoolList & exp) +{ + HitsList hl; + for (size_t i = 0; i < exp.size(); ++i) { + hl.push_back(exp[i] ? Hits().add(0) : Hits()); + } + assertSearch(fs, query, fv, hl); +} + +std::vector<QueryTerm::UP> +performSearch(FieldSearcher & fs, const StringList & query, const FieldValue & fv) +{ + Query q(query); + + // prepare field searcher + SharedSearcherBuf ssb = SharedSearcherBuf(new SearcherBuf()); + fs.prepare(q.qtl, ssb); + + // setup document + SharedFieldPathMap sfim(new FieldPathMapT()); + sfim->push_back(FieldPath()); + StorageDocument doc(std::make_unique<document::Document>(), sfim, 1); + doc.setField(0, document::FieldValue::UP(fv.clone())); + + fs.search(doc); + return std::move(q.qtv); +} + +void +assertSearch(FieldSearcher & fs, const StringList & query, const FieldValue & fv, const HitsList & exp) +{ + auto qtv = performSearch(fs, query, fv); + EXPECT_EQUAL(qtv.size(), exp.size()); + ASSERT_TRUE(qtv.size() == exp.size()); + for (size_t i = 0; i < qtv.size(); ++i) { + const HitList & hl = qtv[i]->getHitList(); + EXPECT_EQUAL(hl.size(), exp[i].size()); + ASSERT_TRUE(hl.size() == exp[i].size()); + for (size_t j = 0; j < hl.size(); ++j) { + EXPECT_EQUAL((size_t)hl[j].pos(), exp[i][j]); + } + } +} + +bool +assertFieldInfo(FieldSearcher & fs, const StringList & query, + const FieldValue & fv, const FieldInfoList & exp) +{ + auto qtv = performSearch(fs, query, fv); + if (!EXPECT_EQUAL(qtv.size(), exp.size())) return false; + bool retval = true; + for (size_t i = 0; i < qtv.size(); ++i) { + if (!EXPECT_EQUAL(qtv[i]->getFieldInfo(0).getHitOffset(), exp[i].getHitOffset())) retval = false; + if (!EXPECT_EQUAL(qtv[i]->getFieldInfo(0).getHitCount(), exp[i].getHitCount())) retval = false; + if (!EXPECT_EQUAL(qtv[i]->getFieldInfo(0).getFieldLength(), exp[i].getFieldLength())) retval = false; + } + return retval; +} + +void +assertSnippetModifier(const StringList & query, const std::string & fv, const std::string & exp) +{ + 
UTF8SubstringSnippetModifier mod; + performSearch(mod, query, StringFieldValue(fv)); + EXPECT_EQUAL(mod.getModifiedBuf().getPos(), exp.size()); + std::string actual(mod.getModifiedBuf().getBuffer(), mod.getModifiedBuf().getPos()); + EXPECT_EQUAL(actual.size(), exp.size()); + EXPECT_EQUAL(actual, exp); +} + +void assertSnippetModifier(SnippetModifierSetup & setup, const FieldValue & fv, const std::string & exp) +{ + FieldValue::UP mfv = setup.modifier.modify(fv); + const document::LiteralFieldValueB & lfv = static_cast<const document::LiteralFieldValueB &>(*mfv.get()); + const std::string & actual = lfv.getValue(); + EXPECT_EQUAL(actual.size(), exp.size()); + EXPECT_EQUAL(actual, exp); +} + +void assertQueryTerms(const SnippetModifierManager & man, FieldIdT fId, const StringList & terms) +{ + if (terms.size() == 0) { + ASSERT_TRUE(man.getModifiers().getModifier(fId) == NULL); + return; + } + ASSERT_TRUE(man.getModifiers().getModifier(fId) != NULL); + UTF8SubstringSnippetModifier * searcher = + (static_cast<SnippetModifier *>(man.getModifiers().getModifier(fId)))->getSearcher().get(); + EXPECT_EQUAL(searcher->getQueryTerms().size(), terms.size()); + ASSERT_TRUE(searcher->getQueryTerms().size() == terms.size()); + for (size_t i = 0; i < terms.size(); ++i) { + EXPECT_EQUAL(std::string(searcher->getQueryTerms()[i]->getTerm()), terms[i]); + } +} + +bool assertCountWords(size_t numWords, const std::string & field) +{ + FieldRef ref(field.c_str(), field.size()); + return EXPECT_EQUAL(numWords, FieldSearcher::countWords(ref)); +} + +bool +testStringFieldInfo(StrChrFieldSearcher & fs) +{ + assertString(fs, "foo", StringList().add("foo bar baz").add("foo bar").add("baz foo"), Hits().add(0).add(3).add(6)); + assertString(fs, StringList().add("foo").add("bar"), StringList().add("foo bar baz").add("foo bar").add("baz foo"), + HitsList().add(Hits().add(0).add(3).add(6)).add(Hits().add(1).add(4))); + + bool retval = true; + if (!EXPECT_TRUE(assertFieldInfo(fs, "foo", "foo", 
QTFieldInfo(0, 1, 1)))) retval = false; + if (!EXPECT_TRUE(assertFieldInfo(fs, "bar", "foo", QTFieldInfo(0, 0, 1)))) retval = false; + if (!EXPECT_TRUE(assertFieldInfo(fs, "foo", "foo bar baz", QTFieldInfo(0, 1, 3)))) retval = false; + if (!EXPECT_TRUE(assertFieldInfo(fs, "bar", "foo bar baz", QTFieldInfo(0, 1, 3)))) retval = false; + if (!EXPECT_TRUE(assertFieldInfo(fs, "baz", "foo bar baz", QTFieldInfo(0, 1, 3)))) retval = false; + if (!EXPECT_TRUE(assertFieldInfo(fs, "qux", "foo bar baz", QTFieldInfo(0, 0, 3)))) retval = false; + if (!EXPECT_TRUE(assertFieldInfo(fs, "foo", "foo foo foo", QTFieldInfo(0, 3, 3)))) retval = false; + // query term size > last term size + if (!EXPECT_TRUE(assertFieldInfo(fs, "runner", "Road Runner Disco", QTFieldInfo(0, 1, 3)))) retval = false; + if (!EXPECT_TRUE(assertFieldInfo(fs, StringList().add("roadrun").add("runner"), "Road Runner Disco", + FieldInfoList().add(QTFieldInfo(0, 0, 3)).add(QTFieldInfo(0, 1, 3))))) retval = false; + // multiple terms + if (!EXPECT_TRUE(assertFieldInfo(fs, "foo", StringList().add("foo bar baz").add("foo bar"), + QTFieldInfo(0, 2, 5)))) retval = false; + if (!EXPECT_TRUE(assertFieldInfo(fs, StringList().add("foo").add("baz"), "foo bar baz", + FieldInfoList().add(QTFieldInfo(0, 1, 3)).add(QTFieldInfo(0, 1, 3))))) retval = false; + if (!EXPECT_TRUE(assertFieldInfo(fs, StringList().add("foo").add("baz"), StringList().add("foo bar baz").add("foo bar"), + FieldInfoList().add(QTFieldInfo(0, 2, 5)).add(QTFieldInfo(0, 1, 5))))) retval = false; + return retval; +} +bool +testStrChrFieldSearcher(StrChrFieldSearcher & fs) +{ + std::string field = "operators and operator overloading with utf8 char oe = \xc3\x98"; + assertString(fs, "oper", field, Hits()); + assertString(fs, "tor", field, Hits()); + assertString(fs, "oper*", field, Hits().add(0).add(2)); + assertString(fs, "and", field, Hits().add(1)); + + assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits()).add(Hits())); + 
assertString(fs, StringList().add("and").add("overloading"), field, HitsList().add(Hits().add(1)).add(Hits().add(3))); + + fs.setMatchType(FieldSearcher::PREFIX); + assertString(fs, "oper", field, Hits().add(0).add(2)); + assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits().add(0).add(2)).add(Hits())); + + fs.setMatchType(FieldSearcher::REGULAR); + if (!EXPECT_TRUE(testStringFieldInfo(fs))) return false; + + { // test handling of several underscores + StringList query = StringList().add("foo").add("bar"); + HitsList exp = HitsList().add(Hits().add(0)).add(Hits().add(1)); + assertString(fs, query, "foo_bar", exp); + assertString(fs, query, "foo__bar", exp); + assertString(fs, query, "foo___bar", exp); + assertString(fs, query, "foo________bar", exp); + assertString(fs, query, "foo____________________bar", exp); + assertString(fs, query, "________________________________________foo________________________________________bar________________________________________", exp); + query = StringList().add("foo").add("thisisaveryveryverylongword"); + assertString(fs, query, "foo____________________thisisaveryveryverylongword", exp); + + assertString(fs, "bar", "foo bar", Hits().add(1)); + assertString(fs, "bar", "foo____________________bar", Hits().add(1)); + assertString(fs, "bar", "foo____________________thisisaveryveryverylongword____________________bar", Hits().add(2)); + } + return true; +} + + TEST("verify correct term parsing") { + ASSERT_TRUE(Query::parseQueryTerm("index:term").first == "index"); + ASSERT_TRUE(Query::parseQueryTerm("index:term").second == "term"); + ASSERT_TRUE(Query::parseQueryTerm("term").first == ""); + ASSERT_TRUE(Query::parseQueryTerm("term").second == "term"); + ASSERT_TRUE(Query::parseTerm("*substr*").first == "substr"); + ASSERT_TRUE(Query::parseTerm("*substr*").second == TermType::SUBSTRINGTERM); + ASSERT_TRUE(Query::parseTerm("*suffix").first == "suffix"); + ASSERT_TRUE(Query::parseTerm("*suffix").second == 
TermType::SUFFIXTERM); + ASSERT_TRUE(Query::parseTerm("prefix*").first == "prefix"); + ASSERT_TRUE(Query::parseTerm("prefix*").second == TermType::PREFIXTERM); + ASSERT_TRUE(Query::parseTerm("term").first == "term"); + ASSERT_TRUE(Query::parseTerm("term").second == TermType::WORD); + } + + TEST("suffix matching") { + EXPECT_EQUAL(assertMatchTermSuffix("a", "vespa"), true); + EXPECT_EQUAL(assertMatchTermSuffix("spa", "vespa"), true); + EXPECT_EQUAL(assertMatchTermSuffix("vespa", "vespa"), true); + EXPECT_EQUAL(assertMatchTermSuffix("vvespa", "vespa"), false); + EXPECT_EQUAL(assertMatchTermSuffix("fspa", "vespa"), false); + EXPECT_EQUAL(assertMatchTermSuffix("v", "vespa"), false); + } + +TEST("Test basic strchrfield searchers") { + { + UTF8StrChrFieldSearcher fs(0); + EXPECT_TRUE(testStrChrFieldSearcher(fs)); + } + { + FUTF8StrChrFieldSearcher fs(0); + EXPECT_TRUE(testStrChrFieldSearcher(fs)); + } +} + +bool +testUTF8SubStringFieldSearcher(StrChrFieldSearcher & fs) +{ + std::string field = "operators and operator overloading"; + assertString(fs, "rsand", field, Hits()); + assertString(fs, "ove", field, Hits().add(3)); + assertString(fs, "ing", field, Hits().add(3)); + assertString(fs, "era", field, Hits().add(0).add(2)); + assertString(fs, "a", field, Hits().add(0).add(1).add(2).add(3)); + + assertString(fs, StringList().add("dn").add("gn"), field, HitsList().add(Hits()).add(Hits())); + assertString(fs, StringList().add("ato").add("load"), field, HitsList().add(Hits().add(0).add(2)).add(Hits().add(3))); + + assertString(fs, StringList().add("aa").add("ab"), "aaaab", + HitsList().add(Hits().add(0).add(0).add(0)).add(Hits().add(0))); + + if (!EXPECT_TRUE(testStringFieldInfo(fs))) return false; + return true; +} + +TEST("utf8 substring search") { + { + UTF8SubStringFieldSearcher fs(0); + EXPECT_TRUE(testUTF8SubStringFieldSearcher(fs)); + assertString(fs, "aa", "aaaa", Hits().add(0).add(0)); + } + { + UTF8SubStringFieldSearcher fs(0); + 
EXPECT_TRUE(testUTF8SubStringFieldSearcher(fs)); + assertString(fs, "abc", "abc bcd abc", Hits().add(0).add(2)); + fs.maxFieldLength(4); + assertString(fs, "abc", "abc bcd abc", Hits().add(0)); + } + { + UTF8SubstringSnippetModifier fs(0); + EXPECT_TRUE(testUTF8SubStringFieldSearcher(fs)); + // we don't have 1 term optimization + assertString(fs, "aa", "aaaa", Hits().add(0).add(0).add(0)); + } +} + +TEST("utf8 substring search with empty term") +{ + UTF8SubStringFieldSearcher fs(0); + EXPECT_TRUE(testUTF8SubStringFieldSearcher(fs)); + assertString(fs, "", "abc", Hits()); + assertFieldInfo(fs, "", "abc", QTFieldInfo().setFieldLength(0)); +} + +TEST("utf8 suffix search") { + UTF8SuffixStringFieldSearcher fs(0); + std::string field = "operators and operator overloading"; + assertString(fs, "rsand", field, Hits()); + assertString(fs, "tor", field, Hits().add(2)); + assertString(fs, "tors", field, Hits().add(0)); + + assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits())); + assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3))); + + EXPECT_TRUE(testStringFieldInfo(fs)); +} + +TEST("utf8 exact match") { + UTF8ExactStringFieldSearcher fs(0); + // regular + TEST_DO(assertString(fs, "vespa", "vespa", Hits().add(0))); + TEST_DO(assertString(fs, "vespar", "vespa", Hits())); + TEST_DO(assertString(fs, "vespa", "vespar", Hits())); + TEST_DO(assertString(fs, "vespa", "vespa vespa", Hits())); + TEST_DO(assertString(fs, "vesp", "vespa", Hits())); + TEST_DO(assertString(fs, "vesp*", "vespa", Hits().add(0))); + TEST_DO(assertString(fs, "hutte", "hutte", Hits().add(0))); + TEST_DO(assertString(fs, "hütte", "hütte", Hits().add(0))); + TEST_DO(assertString(fs, "hutte", "hütte", Hits())); + TEST_DO(assertString(fs, "hütte", "hutte", Hits())); + TEST_DO(assertString(fs, "hütter", "hütte", Hits())); + TEST_DO(assertString(fs, "hütte", "hütter", Hits())); +} + +TEST("utf8 flexible searcher"){ + 
UTF8FlexibleStringFieldSearcher fs(0); + // regular + assertString(fs, "vespa", "vespa", Hits().add(0)); + assertString(fs, "vesp", "vespa", Hits()); + assertString(fs, "esp", "vespa", Hits()); + assertString(fs, "espa", "vespa", Hits()); + + // prefix + assertString(fs, "vesp*", "vespa", Hits().add(0)); + fs.setMatchType(FieldSearcher::PREFIX); + assertString(fs, "vesp", "vespa", Hits().add(0)); + + // substring + fs.setMatchType(FieldSearcher::REGULAR); + assertString(fs, "*esp*", "vespa", Hits().add(0)); + fs.setMatchType(FieldSearcher::SUBSTRING); + assertString(fs, "esp", "vespa", Hits().add(0)); + + // suffix + fs.setMatchType(FieldSearcher::REGULAR); + assertString(fs, "*espa", "vespa", Hits().add(0)); + fs.setMatchType(FieldSearcher::SUFFIX); + assertString(fs, "espa", "vespa", Hits().add(0)); + + fs.setMatchType(FieldSearcher::REGULAR); + EXPECT_TRUE(testStringFieldInfo(fs)); +} + +TEST("bool search") { + BoolFieldSearcher fs(0); + TEST_DO(assertBool(fs, "true", true, true)); + TEST_DO(assertBool(fs, "true", false, false)); + TEST_DO(assertBool(fs, "1", true, true)); + TEST_DO(assertBool(fs, "1", false, false)); + TEST_DO(assertBool(fs, "false", true, false)); + TEST_DO(assertBool(fs, "false", false, true)); + TEST_DO(assertBool(fs, "0", true, false)); + TEST_DO(assertBool(fs, "0", false, true)); + TEST_DO(assertBool(fs, StringList().add("true").add("false").add("true"), true, BoolList().add(true).add(false).add(true))); + TEST_DO(assertBool(fs, StringList().add("true").add("false").add("true"), false, BoolList().add(false).add(true).add(false))); +} + +TEST("integer search") +{ + IntFieldSearcher fs(0); + TEST_DO(assertInt(fs, "10", 10, true)); + TEST_DO(assertInt(fs, "9", 10, false)); + TEST_DO(assertInt(fs, ">9", 10, true)); + TEST_DO(assertInt(fs, ">9", 9, false)); + TEST_DO(assertInt(fs, "<11", 10, true)); + TEST_DO(assertInt(fs, "<11", 11, false)); + TEST_DO(assertInt(fs, "-10", -10, true)); + TEST_DO(assertInt(fs, "-9", -10, false)); + 
TEST_DO(assertInt(fs, "a", 10, false)); + TEST_DO(assertInt(fs, "[-5;5]", -5, true)); + TEST_DO(assertInt(fs, "[-5;5]", 0, true)); + TEST_DO(assertInt(fs, "[-5;5]", 5, true)); + TEST_DO(assertInt(fs, "[-5;5]", -6, false)); + TEST_DO(assertInt(fs, "[-5;5]", 6, false)); + + TEST_DO(assertInt(fs, StringList().add("9").add("11"), 10, BoolList().add(false).add(false))); + TEST_DO(assertInt(fs, StringList().add("9").add("10"), 10, BoolList().add(false).add(true))); + TEST_DO(assertInt(fs, StringList().add("10").add(">9"), 10, BoolList().add(true).add(true))); + + TEST_DO(assertInt(fs, "10", LongList().add(10).add(20).add(10).add(30), Hits().add(0).add(2))); + TEST_DO(assertInt(fs, StringList().add("10").add("20"), LongList().add(10).add(20).add(10).add(30), + HitsList().add(Hits().add(0).add(2)).add(Hits().add(1)))); + + TEST_DO(assertFieldInfo(fs, "10", 10, QTFieldInfo(0, 1, 1))); + TEST_DO(assertFieldInfo(fs, "10", LongList().add(10).add(20).add(10).add(30), QTFieldInfo(0, 2, 4))); + TEST_DO(assertFieldInfo(fs, StringList().add("10").add("20"), 10, + FieldInfoList().add(QTFieldInfo(0, 1, 1)).add(QTFieldInfo(0, 0, 1)))); + TEST_DO(assertFieldInfo(fs, StringList().add("10").add("20"), LongList().add(10).add(20).add(10).add(30), + FieldInfoList().add(QTFieldInfo(0, 2, 4)).add(QTFieldInfo(0, 1, 4)))); +} + +TEST("floating point search") +{ + FloatFieldSearcher fs; + TEST_DO(assertFloat(fs, "10", 10, true)); + TEST_DO(assertFloat(fs, "10.5", 10.5, true)); + TEST_DO(assertFloat(fs, "-10.5", -10.5, true)); + TEST_DO(assertFloat(fs, ">10.5", 10.6, true)); + TEST_DO(assertFloat(fs, ">10.5", 10.5, false)); + TEST_DO(assertFloat(fs, "<10.5", 10.4, true)); + TEST_DO(assertFloat(fs, "<10.5", 10.5, false)); + TEST_DO(assertFloat(fs, "10.4", 10.5, false)); + TEST_DO(assertFloat(fs, "-10.4", -10.5, false)); + TEST_DO(assertFloat(fs, "a", 10.5, false)); + TEST_DO(assertFloat(fs, "[-5.5;5.5]", -5.5, true)); + TEST_DO(assertFloat(fs, "[-5.5;5.5]", 0, true)); + TEST_DO(assertFloat(fs, 
"[-5.5;5.5]", 5.5, true)); + TEST_DO(assertFloat(fs, "[-5.5;5.5]", -5.6, false)); + TEST_DO(assertFloat(fs, "[-5.5;5.5]", 5.6, false)); + + TEST_DO(assertFloat(fs, StringList().add("10").add("11"), 10.5, BoolList().add(false).add(false))); + TEST_DO(assertFloat(fs, StringList().add("10").add("10.5"), 10.5, BoolList().add(false).add(true))); + TEST_DO(assertFloat(fs, StringList().add(">10.4").add("10.5"), 10.5, BoolList().add(true).add(true))); + + TEST_DO(assertFloat(fs, "10.5", FloatList().add(10.5).add(20.5).add(10.5).add(30.5), Hits().add(0).add(2))); + TEST_DO(assertFloat(fs, StringList().add("10.5").add("20.5"), FloatList().add(10.5).add(20.5).add(10.5).add(30.5), + HitsList().add(Hits().add(0).add(2)).add(Hits().add(1)))); + + TEST_DO(assertFieldInfo(fs, "10.5", 10.5, QTFieldInfo(0, 1, 1))); + TEST_DO(assertFieldInfo(fs, "10.5", FloatList().add(10.5).add(20.5).add(10.5).add(30.5), QTFieldInfo(0, 2, 4))); + TEST_DO(assertFieldInfo(fs, StringList().add("10.5").add("20.5"), 10.5, + FieldInfoList().add(QTFieldInfo(0, 1, 1)).add(QTFieldInfo(0, 0, 1)))); + TEST_DO(assertFieldInfo(fs, StringList().add("10.5").add("20.5"), FloatList().add(10.5).add(20.5).add(10.5).add(30.5), + FieldInfoList().add(QTFieldInfo(0, 2, 4)).add(QTFieldInfo(0, 1, 4)))); +} + +TEST("Snippet modifier search") { + // ascii + assertSnippetModifier("f", "foo", "\x1F""f\x1Foo"); + assertSnippetModifier("o", "foo", "f\x1Fo\x1F\x1Fo\x1F"); + assertSnippetModifier("r", "bar", "ba\x1Fr\x1F"); + assertSnippetModifier("foo", "foo foo", "\x1F""foo\x1F \x1F""foo\x1F"); + assertSnippetModifier("aa", "aaaaaa", "\x1F""aa\x1F\x1F""aa\x1F\x1F""aa\x1F"); + assertSnippetModifier("ab", "abcd\x1F""efgh", "\x1F""ab\x1F""cd\x1F""efgh"); + assertSnippetModifier("ef", "abcd\x1F""efgh", "abcd\x1F\x1F""ef\x1Fgh"); + assertSnippetModifier("fg", "abcd\x1F""efgh", "abcd\x1F""e\x1F""fg\x1Fh"); + // the separator overlapping the match is skipped + assertSnippetModifier("cdef", "abcd\x1F""efgh", "ab\x1F""cdef\x1F""gh"); + // 
no hits + assertSnippetModifier("bb", "aaaaaa", "aaaaaa"); + + + // multiple query terms + assertSnippetModifier(StringList().add("ab").add("cd"), "abcd", "\x1F""ab\x1F\x1F""cd\x1F"); + // when we have overlap we only get the first match + assertSnippetModifier(StringList().add("ab").add("bc"), "abcd", "\x1F""ab\x1F""cd"); + assertSnippetModifier(StringList().add("bc").add("ab"), "abcd", "\x1F""ab\x1F""cd"); + // the separator overlapping the match is skipped + assertSnippetModifier(StringList().add("de").add("ef"), "abcd\x1F""efgh", "abc\x1F""de\x1F""fgh"); + + // cjk + assertSnippetModifier("\xe7\x9f\xb3", "\xe7\x9f\xb3\xe6\x98\x8e\xe5\x87\xb1\xe5\x9c\xa8", + "\x1f\xe7\x9f\xb3\x1f\xe6\x98\x8e\xe5\x87\xb1\xe5\x9c\xa8"); + assertSnippetModifier("\xe6\x98\x8e\xe5\x87\xb1", "\xe7\x9f\xb3\xe6\x98\x8e\xe5\x87\xb1\xe5\x9c\xa8", + "\xe7\x9f\xb3\x1f\xe6\x98\x8e\xe5\x87\xb1\x1f\xe5\x9c\xa8"); + // the separator overlapping the match is skipped + assertSnippetModifier("\xe6\x98\x8e\xe5\x87\xb1", "\xe7\x9f\xb3\xe6\x98\x8e\x1f\xe5\x87\xb1\xe5\x9c\xa8", + "\xe7\x9f\xb3\x1f\xe6\x98\x8e\xe5\x87\xb1\x1f\xe5\x9c\xa8"); + + { // check that resizing works + UTF8SubstringSnippetModifier mod; + EXPECT_EQUAL(mod.getModifiedBuf().getLength(), 32u); + EXPECT_EQUAL(mod.getModifiedBuf().getPos(), 0u); + performSearch(mod, StringList().add("a"), StringFieldValue("aaaaaaaaaaaaaaaa")); + EXPECT_EQUAL(mod.getModifiedBuf().getPos(), 16u + 2 * 16u); + EXPECT_TRUE(mod.getModifiedBuf().getLength() >= mod.getModifiedBuf().getPos()); + } +} + +TEST("snippet modifier") { + { // string field value + SnippetModifierSetup sms(StringList().add("ab")); + // multiple invokations + assertSnippetModifier(sms, StringFieldValue("ab"), "\x1F""ab\x1F"); + assertSnippetModifier(sms, StringFieldValue("xxxxabxxxxabxxxx"), "xxxx\x1F""ab\x1Fxxxx\x1F""ab\x1Fxxxx"); + assertSnippetModifier(sms, StringFieldValue("xxabxx"), "xx\x1F""ab\x1Fxx"); + } + { // collection field value + SnippetModifierSetup 
// Exercises both ways of constructing a FieldSearchSpec and checks the values
// reported by its accessors afterwards.
TEST("FieldSearchSpec constrution") {
    { // default constructed: not valid, id 0, empty name, max length 0x100000
        FieldSearchSpec f;
        EXPECT_FALSE(f.valid());
        EXPECT_EQUAL(0u, f.id());
        EXPECT_EQUAL("", f.name());
        EXPECT_EQUAL(0x100000u, f.maxLength());
    }
    { // explicitly constructed: id, name and max length are reported back as given,
      // and the max length (789) is also what the contained searcher reports
        FieldSearchSpec f(7, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 789);
        EXPECT_TRUE(f.valid());
        EXPECT_EQUAL(7u, f.id());
        EXPECT_EQUAL("f0", f.name());
        EXPECT_EQUAL(789u, f.maxLength());
        EXPECT_EQUAL(789u, f.searcher().maxFieldLength());
    }
}
0, StringList().add("foo")); + assertQueryTerms(man, 1, StringList()); + } + { + SnippetModifierManager man; + Query query(StringList().add("i1:foo")); + man.setup(query.qtl, specMap, indexMap); + assertQueryTerms(man, 0, StringList()); + assertQueryTerms(man, 1, StringList()); + } + { + SnippetModifierManager man; + Query query(StringList().add("i1:*foo*")); + man.setup(query.qtl, specMap, indexMap); + assertQueryTerms(man, 0, StringList()); + assertQueryTerms(man, 1, StringList().add("foo")); + } + { + SnippetModifierManager man; + Query query(StringList().add("i2:foo").add("i2:*bar*")); + man.setup(query.qtl, specMap, indexMap); + assertQueryTerms(man, 0, StringList().add("foo").add("bar")); + assertQueryTerms(man, 1, StringList().add("bar")); + } + { // check buffer sizes + SnippetModifierManager man; + Query query(StringList().add("i2:foo").add("i2:*bar*")); + man.setup(query.qtl, specMap, indexMap); + { + SnippetModifier * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(0)); + UTF8SubstringSnippetModifier * searcher = sm->getSearcher().get(); + EXPECT_EQUAL(sm->getValueBuf().getLength(), 128u); + EXPECT_EQUAL(searcher->getModifiedBuf().getLength(), 64u); + } + { + SnippetModifier * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(1)); + UTF8SubstringSnippetModifier * searcher = sm->getSearcher().get(); + EXPECT_EQUAL(sm->getValueBuf().getLength(), 128u); + EXPECT_EQUAL(searcher->getModifiedBuf().getLength(), 64u); + } + } +} + +TEST("Stripping of indexes") +{ + EXPECT_EQUAL("f", FieldSearchSpecMap::stripNonFields("f")); + EXPECT_EQUAL("f", FieldSearchSpecMap::stripNonFields("f[0]")); + EXPECT_EQUAL("f[a]", FieldSearchSpecMap::stripNonFields("f[a]")); + + EXPECT_EQUAL("f.value", FieldSearchSpecMap::stripNonFields("f{a}")); + EXPECT_EQUAL("f.value", FieldSearchSpecMap::stripNonFields("f{a0}")); + EXPECT_EQUAL("f{a 0}", FieldSearchSpecMap::stripNonFields("f{a 0}")); + EXPECT_EQUAL("f.value", 
// Checks how many words are counted in a piece of text, and that a
// single-character element still occupies one word position.
TEST("counting of words") {
    // A lone '?' adds nothing to the count, wherever it appears in the text.
    EXPECT_TRUE(assertCountWords(0, ""));
    EXPECT_TRUE(assertCountWords(0, "?"));
    EXPECT_TRUE(assertCountWords(1, "foo"));
    EXPECT_TRUE(assertCountWords(2, "foo bar"));
    EXPECT_TRUE(assertCountWords(2, "? foo bar"));
    EXPECT_TRUE(assertCountWords(2, "foo bar ?"));

    // check that 'a' is counted as 1 word
    // The field has the elements "a" and "aa bb cc"; "bb" is expected at hit
    // position 2, i.e. after "a" (0) and "aa" (1).
    // NOTE(review): presumably hit positions are word positions counted across
    // elements - confirm against assertString / Hits.
    UTF8StrChrFieldSearcher fs(0);
    StringList field = StringList().add("a").add("aa bb cc");
    assertString(fs, "bb", field, Hits().add(2));
    // Second query term "not" does not occur in the field, so its hit list is empty.
    assertString(fs, StringList().add("bb").add("not"), field, HitsList().add(Hits().add(2)).add(Hits()));
}
/**
 * std::vector with a chainable append, so that expected values can be
 * spelled inline, e.g. Vector<int>().a(1).a(2).
 */
template <typename T>
class Vector : public std::vector<T>
{
public:
    Vector() = default;

    /** Appends v at the end and returns this vector to allow chaining. */
    Vector<T> & a(T v) {
        std::vector<T>::push_back(v);
        return *this;
    }
};
test_sse2_foldua(); +#endif + +public: + int Main() override; +}; + +ucs4_t +TextUtilTest::getUTF8Char(const char * src) +{ + ucs4_t retval = Fast_UnicodeUtil::GetUTF8Char(src); + ASSERT_TRUE(retval != Fast_UnicodeUtil::_BadUTF8Char); + return retval; +} + +template <typename BW, bool OFF> +void +TextUtilTest::assertSkipSeparators(const char * input, size_t len, const UCS4V & expdstbuf, const SizeV & expoffsets) +{ + const byte * srcbuf = reinterpret_cast<const byte *>(input); + auto dstbuf = std::make_unique<ucs4_t[]>(len + 1); + auto offsets = std::make_unique<size_t[]>(len + 1); + UTF8StrChrFieldSearcher fs; + BW bw(dstbuf.get(), offsets.get()); + size_t dstlen = fs.skipSeparators(srcbuf, len, bw); + EXPECT_EQUAL(dstlen, expdstbuf.size()); + ASSERT_TRUE(dstlen == expdstbuf.size()); + for (size_t i = 0; i < dstlen; ++i) { + EXPECT_EQUAL(dstbuf[i], expdstbuf[i]); + if (OFF) { + EXPECT_EQUAL(offsets[i], expoffsets[i]); + } + } +} + +void +TextUtilTest::assertAnsiFold(const std::string & toFold, const std::string & exp) +{ + char folded[256]; + EXPECT_TRUE(FSFS::ansiFold(toFold.c_str(), toFold.size(), folded)); + EXPECT_EQUAL(std::string(folded, toFold.size()), exp); +} + +void +TextUtilTest::assertAnsiFold(char c, char exp) +{ + char folded; + EXPECT_TRUE(FSFS::ansiFold(&c, 1, &folded)); + EXPECT_EQUAL((int32_t)folded, (int32_t)exp); +} + +#ifdef __x86_64__ +void +TextUtilTest::assert_sse2_foldua(const std::string & toFold, size_t charFolded, const std::string & exp) +{ + char folded[256]; + size_t alignedStart = 0xF - (size_t(folded + 0xF) % 0x10); + const unsigned char * toFoldOrg = reinterpret_cast<const unsigned char *>(toFold.c_str()); + const unsigned char * retval = + sse2_foldua(toFoldOrg, toFold.size(), reinterpret_cast<unsigned char *>(folded + alignedStart)); + EXPECT_EQUAL((size_t)(retval - toFoldOrg), charFolded); + EXPECT_EQUAL(std::string(folded + alignedStart, charFolded), exp); +} + +void +TextUtilTest::assert_sse2_foldua(unsigned char c, unsigned 
char exp, size_t charFolded) +{ + unsigned char toFold[16]; + memset(toFold, c, 16); + unsigned char folded[32]; + size_t alignedStart = 0xF - (size_t(folded + 0xF) % 0x10); + const unsigned char * retval = sse2_foldua(toFold, 16, folded + alignedStart); + EXPECT_EQUAL((size_t)(retval - toFold), charFolded); + for (size_t i = 0; i < charFolded; ++i) { + EXPECT_EQUAL((int32_t)folded[i + alignedStart], (int32_t)exp); + } +} +#endif + +template <typename BW, bool OFF> +void +TextUtilTest::testSkipSeparators() +{ + // ascii characters + assertSkipSeparators<BW, OFF>("foo", 3, UCS4V().a('f').a('o').a('o'), SizeV().a(0).a(1).a(2)); + assertSkipSeparators<BW, OFF>("f\x1Fo", 3, UCS4V().a('f').a('o'), SizeV().a(0).a(2)); + assertSkipSeparators<BW, OFF>("f\no", 3, UCS4V().a('f').a('\n').a('o'), SizeV().a(0).a(1).a(2)); + assertSkipSeparators<BW, OFF>("f\to", 3, UCS4V().a('f').a('\t').a('o'), SizeV().a(0).a(1).a(2)); + + // utf8 char + assertSkipSeparators<BW, OFF>("\xC2\x80\x66", 3, UCS4V().a(getUTF8Char("\xC2\x80")).a('f'), + SizeV().a(0).a(2)); + assertSkipSeparators<BW, OFF>("\xE0\xA0\x80\x66", 4, UCS4V().a(getUTF8Char("\xE0\xA0\x80")).a('f'), + SizeV().a(0).a(3)); + assertSkipSeparators<BW, OFF>("\xF0\x90\x80\x80\x66", 5, UCS4V().a(getUTF8Char("\xF0\x90\x80\x80")).a('f'), + SizeV().a(0).a(4)); + + // replacement string (sharp s -> ss) + assertSkipSeparators<BW, OFF>("\xC3\x9F\x66\xC3\x9F", 5, UCS4V().a('s').a('s').a('f').a('s').a('s'), + SizeV().a(0).a(0).a(2).a(3).a(3)); +} + +void +TextUtilTest::testSkipSeparators() +{ + Fast_NormalizeWordFolder::Setup(Fast_NormalizeWordFolder::DO_SHARP_S_SUBSTITUTION); + + testSkipSeparators<SFSB::BufferWrapper, false>(); + testSkipSeparators<SFSB::OffsetWrapper, true>(); +} + +void +TextUtilTest::testSeparatorCharacter() +{ + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x00')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x01')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x02')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x03')); + 
EXPECT_TRUE(SFSB::isSeparatorCharacter('\x04')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x05')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x06')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x07')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x08')); + EXPECT_TRUE(! SFSB::isSeparatorCharacter('\x09')); // '\t' + EXPECT_TRUE(! SFSB::isSeparatorCharacter('\x0a')); // '\n' + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0b')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0c')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0d')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0e')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0f')); + + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x10')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x11')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x12')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x13')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x14')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x15')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x16')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x17')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x18')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x19')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1a')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1b')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1c')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1d')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1e')); + EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1f')); + + EXPECT_TRUE(! 
SFSB::isSeparatorCharacter('\x20')); // space +} + +void +TextUtilTest::testAnsiFold() +{ + FieldSearcher::init(); + assertAnsiFold("", ""); + assertAnsiFold("ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz"); + assertAnsiFold("abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz"); + assertAnsiFold("0123456789", "0123456789"); + for (int i = 0; i < 128; ++i) { + if ((i >= 'a' && i <= 'z') || (i >= '0' && i <= '9')) { + assertAnsiFold(i, i); + } else if (i >= 'A' && i <= 'Z') { + assertAnsiFold(i, i + 32); + } else { + assertAnsiFold(i, 0); + } + } + + // non-ascii is ignored + for (int i = 128; i < 256; ++i) { + char toFold = i; + char folded; + EXPECT_TRUE(!FSFS::ansiFold(&toFold, 1, &folded)); + } +} + +void +TextUtilTest::test_lfoldua() +{ + FieldSearcher::init(); + char folded[256]; + size_t alignedStart = 0; + const char * toFold = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"; + size_t len = strlen(toFold); + EXPECT_TRUE(FSFS::lfoldua(toFold, len, folded, alignedStart)); + EXPECT_EQUAL(std::string(folded + alignedStart, len), "abcdefghijklmnopqrstuvwxyz"); +} + +#ifdef __x86_64__ +void +TextUtilTest::test_sse2_foldua() +{ + assert_sse2_foldua("", 0, ""); + assert_sse2_foldua("ABCD", 0, ""); + assert_sse2_foldua("ABCDEFGHIJKLMNO", 0, ""); + assert_sse2_foldua("ABCDEFGHIJKLMNOP", 16, "abcdefghijklmnop"); + assert_sse2_foldua("ABCDEFGHIJKLMNOPQ", 16, "abcdefghijklmnop"); + assert_sse2_foldua("KLMNOPQRSTUVWXYZ", 16, "klmnopqrstuvwxyz"); + assert_sse2_foldua("abcdefghijklmnop", 16, "abcdefghijklmnop"); + assert_sse2_foldua("klmnopqrstuvwxyz", 16, "klmnopqrstuvwxyz"); + assert_sse2_foldua("0123456789abcdef", 16, "0123456789abcdef"); + + for (int i = 0; i < 128; ++i) { + if ((i >= 'a' && i <= 'z') || (i >= '0' && i <= '9')) { + assert_sse2_foldua(i, i); + } else if (i >= 'A' && i <= 'Z') { + assert_sse2_foldua(i, i + 32); + } else { + assert_sse2_foldua(i, 0); + } + } + + // non-ascii is ignored + for (int i = 128; i < 256; ++i) { + assert_sse2_foldua(i, '?', 0); + } 
/**
 * Test application entry point: runs every text utility test between
 * TEST_INIT and TEST_DONE.
 */
int
TextUtilTest::Main()
{
    TEST_INIT("textutil_test");

    testSkipSeparators();
    testSeparatorCharacter();
    testAnsiFold();
    test_lfoldua();
    // The SSE2 folding test is only compiled on x86_64.
#ifdef __x86_64__
    test_sse2_foldua();
#endif

    TEST_DONE();
}
b/streamingvisitors/src/vespa/vsm/common/CMakeLists.txt new file mode 100644 index 00000000000..4570a9b581e --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/common/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_library(vsm_vsmcommon OBJECT + SOURCES + charbuffer.cpp + document.cpp + documenttypemapping.cpp + fieldmodifier.cpp + storagedocument.cpp + DEPENDS +) diff --git a/streamingvisitors/src/vespa/vsm/common/charbuffer.cpp b/streamingvisitors/src/vespa/vsm/common/charbuffer.cpp new file mode 100644 index 00000000000..b8fbb5c8846 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/common/charbuffer.cpp @@ -0,0 +1,32 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "charbuffer.h" +#include <cstring> + +namespace vsm { + +CharBuffer::CharBuffer(size_t len) : + _buffer(len), + _pos(0) +{ } + +void +CharBuffer::put(const char * src, size_t n) +{ + if (n > getRemaining()) { + resize(_pos + (n * 2)); + } + char * dst = &_buffer[_pos]; + memcpy(dst, src, n); + _pos += n; +} + +void +CharBuffer::resize(size_t len) +{ + if (len > getLength()) { + _buffer.resize(len); + } +} + +} + diff --git a/streamingvisitors/src/vespa/vsm/common/charbuffer.h b/streamingvisitors/src/vespa/vsm/common/charbuffer.h new file mode 100644 index 00000000000..08618a9b973 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/common/charbuffer.h @@ -0,0 +1,52 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vector> +#include <memory> + +namespace vsm { + +/** + * Simple growable char buffer. + **/ +class CharBuffer +{ +private: + std::vector<char> _buffer; + size_t _pos; + +public: + typedef std::shared_ptr<CharBuffer> SP; + + /** + * Creates a char buffer with len bytes. 
+ **/ + CharBuffer(size_t len = 0); + + /** + * Copies n bytes from the src array into the underlying buffer at the + * current position, and updates the position accordingly. + * Resizing will occur if needed. + **/ + void put(const char * src, size_t n); + + /** + * Resizes the buffer so that the new length becomes len. + * Resizing will not occur if len < current length. + **/ + void resize(size_t len); + + /** + * Resets the position to the beginning of the buffer. + **/ + void reset() { _pos = 0; } + + const char * getBuffer() const { return &_buffer[0]; } + size_t getLength() const { return _buffer.size(); } + size_t getPos() const { return _pos; } + size_t getRemaining() const { return getLength() - getPos(); } + void put(char c) { put(&c, 1); } +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/common/docsum.h b/streamingvisitors/src/vespa/vsm/common/docsum.h new file mode 100644 index 00000000000..49b84cb0783 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/common/docsum.h @@ -0,0 +1,22 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "document.h" + +namespace vsm { + +/** + Will represent a cache of the document summaries. -> Actual docsums will be + generated on the fly when requested. A document summary is accessed by its + documentId. +*/ + +class IDocSumCache +{ +public: + virtual const Document & getDocSum(const search::DocumentIdT & docId) const = 0; + virtual ~IDocSumCache() { } +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/common/document.cpp b/streamingvisitors/src/vespa/vsm/common/document.cpp new file mode 100644 index 00000000000..a345c82ce2d --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/common/document.cpp @@ -0,0 +1,73 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include "document.h" +#include <vespa/vespalib/stllike/asciistream.h> +#include <vespa/vespalib/stllike/hash_map.hpp> + +using search::DocumentIdT; +using search::TimeT; +using document::FieldValue; + +namespace vsm +{ + +vespalib::asciistream & operator << (vespalib::asciistream & os, const FieldRef & f) +{ + const char *s = f.data(); + os << f.size(); + if (s) { + os << s; // Better hope it's null terminated! + } + os << " : "; + return os; +} + +vespalib::asciistream & operator << (vespalib::asciistream & os, const StringFieldIdTMap & f) +{ + for (StringFieldIdTMapT::const_iterator it=f._map.begin(), mt=f._map.end(); it != mt; it++) { + os << it->first << " = " << it->second << '\n'; + } + return os; +} + +StringFieldIdTMap::StringFieldIdTMap() : + _map() +{ +} + +void StringFieldIdTMap::add(const vespalib::string & s, FieldIdT fieldId) +{ + _map[s] = fieldId; +} + +void StringFieldIdTMap::add(const vespalib::string & s) +{ + if (_map.find(s) == _map.end()) { + FieldIdT fieldId = _map.size(); + _map[s] = fieldId; + } +} + +FieldIdT StringFieldIdTMap::fieldNo(const vespalib::string & fName) const +{ + StringFieldIdTMapT::const_iterator found = _map.find(fName); + FieldIdT fNo((found != _map.end()) ? found->second : npos); + return fNo; +} + +size_t StringFieldIdTMap::highestFieldNo() const +{ + size_t maxFNo(0); + for (const auto & field : _map) { + if (field.second >= maxFNo) { + maxFNo = field.second + 1; + } + } + return maxFNo; +} + +Document::~Document() { } + +} + +VESPALIB_HASH_MAP_INSTANTIATE(vespalib::string, vsm::FieldIdTList); +VESPALIB_HASH_MAP_INSTANTIATE(vespalib::string, vsm::IndexFieldMapT); diff --git a/streamingvisitors/src/vespa/vsm/common/document.h b/streamingvisitors/src/vespa/vsm/common/document.h new file mode 100644 index 00000000000..8c11d27072b --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/common/document.h @@ -0,0 +1,68 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
/**
 * Maps field names to field ids.
 */
class StringFieldIdTMap
{
 public:
    /// Returned by fieldNo() when the field name is not registered.
    enum { npos=0xFFFFFFFF };
    StringFieldIdTMap();
    /// Returns the id registered for fName, or npos if there is none.
    FieldIdT fieldNo(const vespalib::string & fName) const;
    /// Registers s using the current number of entries as its id; does nothing if s is already registered.
    void add(const vespalib::string & s);
    /// Registers s with id fNo, replacing any id previously registered for s.
    void add(const vespalib::string & s, FieldIdT fNo);
    /// Read access to the underlying name -> id map.
    const StringFieldIdTMapT & map() const { return _map; }
    /// Returns one more than the largest registered id, or 0 when nothing is registered.
    size_t highestFieldNo() const;
    /// Writes one "name = id" line per registered field.
    friend vespalib::asciistream & operator << (vespalib::asciistream & os, const StringFieldIdTMap & f);
 private:
    StringFieldIdTMapT _map;
};
/**
 * Abstract base for documents: holds a document id and a field count, and
 * leaves the field storage (getField / setField) to subclasses.
 */
class Document
{
 public:
    /// Creates a document with id 0 and the given field count.
    Document(size_t maxFieldCount) : _docId(0), _fieldCount(maxFieldCount) { }
    /// Creates a document with the given id and field count.
    Document(search::DocumentIdT doc, size_t maxFieldCount) : _docId(doc), _fieldCount(maxFieldCount) { }
    virtual ~Document();
    const search::DocumentIdT & getDocId() const { return _docId; }
    /// The field count given at construction; it cannot change afterwards.
    size_t getFieldCount() const { return _fieldCount; }
    void setDocId(const search::DocumentIdT & v) { _docId = v; }
    /// Returns the value of field fId.
    /// NOTE(review): presumably null when the field has no value - confirm in the implementations.
    virtual const document::FieldValue * getField(FieldIdT fId) const = 0;
    /**
       Returns true, if not possible to set.
    */
    virtual bool setField(FieldIdT fId, document::FieldValue::UP fv) = 0;
 private:
    search::DocumentIdT _docId;
    const size_t _fieldCount;
};
+ +#include "documenttypemapping.h" +#include <vespa/document/repo/documenttyperepo.h> +#include <vespa/document/datatype/documenttype.h> +#include <vespa/vespalib/stllike/hash_map.hpp> + +#include <vespa/log/log.h> +LOG_SETUP(".vsm.common.documenttypemapping"); + +namespace vsm { + +DocumentTypeMapping::DocumentTypeMapping() : + _fieldMap(), + _defaultDocumentTypeName(), + _defaultDocumentType(), + _documentTypeFreq() +{ } + +DocumentTypeMapping::~DocumentTypeMapping() { } + +namespace { + +vespalib::string getDocTypeId(const document::DocumentType & docType) +{ + vespalib::string typeId(docType.getName()); + typeId += "0"; // Hardcoded version (version not supported) + return typeId; +} + +} + +void DocumentTypeMapping::init(const vespalib::string & defaultDocumentType, + const StringFieldIdTMapT & fieldList, + const document::DocumentTypeRepo &repo) +{ + _defaultDocumentType = repo.getDocumentType(defaultDocumentType); + _defaultDocumentTypeName = getDocTypeId(*_defaultDocumentType); + LOG(debug, "Setting default document type to '%s'", + _defaultDocumentTypeName.c_str()); + buildFieldMap(_defaultDocumentType, fieldList, _defaultDocumentTypeName); +} + +bool DocumentTypeMapping::prepareBaseDoc(SharedFieldPathMap & map) const +{ + FieldPathMapMapT::const_iterator found = _fieldMap.find(_defaultDocumentTypeName); + if (found != _fieldMap.end()) { + map = std::make_shared<FieldPathMapT>(found->second); + LOG(debug, "Found FieldPathMap for default document type '%s' with %zd elements", + _defaultDocumentTypeName.c_str(), map->size()); + } else { + LOG(warning, "No FieldPathMap found for default document type '%s'. 
Using empty one", + _defaultDocumentTypeName.c_str()); + map = std::make_shared<FieldPathMapT>(); + } + return true; +} + +void DocumentTypeMapping::buildFieldMap( + const document::DocumentType *docTypePtr, + const StringFieldIdTMapT & fieldList, const vespalib::string & typeId) +{ + LOG(debug, "buildFieldMap: docType = '%s', fieldList.size = '%zd', typeId = '%s'", + docTypePtr->getName().c_str(), fieldList.size(), typeId.c_str()); + const document::DocumentType & docType = *docTypePtr; + size_t highestFNo(0); + for (StringFieldIdTMapT::const_iterator it = fieldList.begin(), mt = fieldList.end(); it != mt; it++) { + highestFNo = std::max(highestFNo, size_t(it->second)); + } + highestFNo++; + FieldPathMapT & fieldMap = _fieldMap[typeId]; + + fieldMap.resize(highestFNo); + + size_t validCount(0); + for (StringFieldIdTMapT::const_iterator it = fieldList.begin(), mt = fieldList.end(); it != mt; it++) { + vespalib::string fname = it->first; + LOG(debug, "Handling %s -> %d", fname.c_str(), it->second); + try { + if ((it->first[0] != '[') && (it->first != "summaryfeatures") && (it->first != "rankfeatures") && (it->first != "ranklog") && (it->first != "sddocname") && (it->first != "documentid")) { + FieldPath fieldPath; + docType.buildFieldPath(fieldPath, fname); + fieldMap[it->second] = std::move(fieldPath); + validCount++; + LOG(spam, "Found %s -> %d in document", fname.c_str(), it->second); + } + } catch (const std::exception & e) { + LOG(debug, "Could not get field info for '%s' in documenttype '%s' (id = '%s') : %s", + it->first.c_str(), docType.getName().c_str(), typeId.c_str(), e.what()); + } + } + _documentTypeFreq.insert(std::make_pair(validCount, docTypePtr)); +} + +const document::DocumentType & DocumentTypeMapping::getCurrentDocumentType() const +{ + if (_documentTypeFreq.empty()) { + throw std::runtime_error("No document type registered yet."); + } + return *_documentTypeFreq.rbegin()->second; +} + + +} diff --git 
a/streamingvisitors/src/vespa/vsm/common/documenttypemapping.h b/streamingvisitors/src/vespa/vsm/common/documenttypemapping.h new file mode 100644 index 00000000000..607b40cec47 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/common/documenttypemapping.h @@ -0,0 +1,54 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/vsm/common/storagedocument.h> + +namespace document { class DocumentTypeRepo; } + +namespace vsm +{ + +class DocumentTypeMapping +{ +public: + DocumentTypeMapping(); + ~DocumentTypeMapping(); + + /** + * Prepares the given document by sharing the field info map + * registered for that document type. + **/ + bool prepareBaseDoc(SharedFieldPathMap & doc) const; + + /** + * Builds a field info map for all registered document types. + **/ + void init(const vespalib::string & defaultDocumentType, + const StringFieldIdTMapT & fieldList, + const document::DocumentTypeRepo &repo); + + const document::DocumentType & getCurrentDocumentType() const; + const vespalib::string & getDefaultDocumentTypeName() const + { return _defaultDocumentTypeName; } + const document::DocumentType *getDefaultDocumentType() const + { return _defaultDocumentType; } + +private: + /** + * Builds a field info map for the given type id. This is a + * mapping from field id to field path and field value for all + * field names in the given list based on the given document type. 
+ **/ + void buildFieldMap(const document::DocumentType *docType, + const StringFieldIdTMapT & fieldList, + const vespalib::string & typeId); + typedef vespalib::hash_map<vespalib::string, FieldPathMapT> FieldPathMapMapT; + typedef std::multimap<size_t, const document::DocumentType *> DocumentTypeUsage; + FieldPathMapMapT _fieldMap; + vespalib::string _defaultDocumentTypeName; + const document::DocumentType *_defaultDocumentType; + DocumentTypeUsage _documentTypeFreq; +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/common/fieldmodifier.cpp b/streamingvisitors/src/vespa/vsm/common/fieldmodifier.cpp new file mode 100644 index 00000000000..b39afd83b5a --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/common/fieldmodifier.cpp @@ -0,0 +1,24 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "fieldmodifier.h" +#include <vespa/vespalib/stllike/hash_map.hpp> + +namespace vsm { + +FieldModifierMap::FieldModifierMap() : + _map() +{ } + +FieldModifierMap::~FieldModifierMap() { } + +FieldModifier * +FieldModifierMap::getModifier(FieldIdT fId) const +{ + FieldModifierMapT::const_iterator itr = _map.find(fId); + if (itr == _map.end()) { + return NULL; + } + return itr->second.get(); +} + +} diff --git a/streamingvisitors/src/vespa/vsm/common/fieldmodifier.h b/streamingvisitors/src/vespa/vsm/common/fieldmodifier.h new file mode 100644 index 00000000000..60e480fa237 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/common/fieldmodifier.h @@ -0,0 +1,58 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/document/fieldvalue/fieldvalue.h> +#include <vespa/vsm/common/document.h> + +namespace vsm { + +/** + * Interface for classes that want to modify a field value. + **/ +class FieldModifier +{ +public: + typedef std::unique_ptr<FieldModifier> UP; + + /** + * Modifies the given field value and returns a new one. 
+ **/ + virtual document::FieldValue::UP modify(const document::FieldValue & fv) = 0; + + /** + * Modifies the given field value and returns a new one. + * Use the given field path to iterate the field value. + **/ + virtual document::FieldValue::UP modify(const document::FieldValue & fv, + const document::FieldPath & path) = 0; + + virtual ~FieldModifier() { } +}; + +typedef vespalib::hash_map<FieldIdT, FieldModifier::UP> FieldModifierMapT; + +/** + * This class wraps a map from field id to field modifier. + **/ +class FieldModifierMap +{ +private: + FieldModifierMapT _map; + +public: + FieldModifierMap(); + ~FieldModifierMap(); + FieldModifierMapT & map() { return _map; } + const FieldModifierMapT & map() const { return _map; } + + /** + * Returns the modifier associated with the given field id or NULL if not found. + * + * @param fId the field id to look up. + * @return the field modifier or NULL if not found. + **/ + FieldModifier * getModifier(FieldIdT fId) const; +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/common/storagedocument.cpp b/streamingvisitors/src/vespa/vsm/common/storagedocument.cpp new file mode 100644 index 00000000000..a0d666268f5 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/common/storagedocument.cpp @@ -0,0 +1,81 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "storagedocument.h" +#include <vespa/document/fieldvalue/arrayfieldvalue.h> +#include <vespa/document/fieldvalue/weightedsetfieldvalue.h> + +#include <vespa/log/log.h> +LOG_SETUP(".vsm.storagedocument"); + +using NestedIterator = document::FieldValue::PathRange; + +namespace vsm { + +StorageDocument::StorageDocument(document::Document::UP doc, const SharedFieldPathMap & fim, size_t fieldNoLimit) : + Document(fieldNoLimit), + _doc(std::move(doc)), + _fieldMap(fim), + _cachedFields(getFieldCount()), + _backedFields() +{ } + +StorageDocument::~StorageDocument() { } + +namespace { + FieldPath _emptyFieldPath; + StorageDocument::SubDocument _emptySubDocument(nullptr, _emptyFieldPath.getFullRange()); // returned for fields that have no field path +} + +const StorageDocument::SubDocument & +StorageDocument::getComplexField(FieldIdT fId) const +{ + if (_cachedFields[fId].getFieldValue() == nullptr) { // nothing cached for this field yet + const FieldPath & fp = (*_fieldMap)[fId]; + if ( ! fp.empty() ) { + const document::StructuredFieldValue * sfv = _doc.get(); + NestedIterator nested = fp.getFullRange(); + const document::FieldPathEntry& fvInfo = nested.cur(); + bool ok = sfv->getValue(fvInfo.getFieldRef(), fvInfo.getFieldValueToSet()); + if (ok) { // cache the value together with the rest of the path below the first entry + SubDocument tmp(&fvInfo.getFieldValueToSet(), nested.next()); + _cachedFields[fId].swap(tmp); + } + } else { + LOG(debug, "Failed getting field fId %d.", fId); + return _emptySubDocument; + } + } + return _cachedFields[fId]; +} + +void StorageDocument::saveCachedFields() const +{ + size_t m(_cachedFields.size()); + _backedFields.reserve(m); + for (size_t i(0); i < m; i++) { + if (_cachedFields[i].getFieldValue() != nullptr) { // clone the value so this document owns it through _backedFields + _backedFields.emplace_back(document::FieldValue::UP(_cachedFields[i].getFieldValue()->clone())); + _cachedFields[i].setFieldValue(_backedFields.back().get()); + } + } +} + +const document::FieldValue * +StorageDocument::getField(FieldIdT fId) const +{ + return getComplexField(fId).getFieldValue(); +} + +bool StorageDocument::setField(FieldIdT fId, document::FieldValue::UP fv) +{ + bool 
ok(fId < _cachedFields.size()); + if (ok) { + const FieldPath & fp = (*_fieldMap)[fId]; + SubDocument tmp(fv.get(), NestedIterator(fp.end(), fp.end())); + _cachedFields[fId].swap(tmp); + _backedFields.emplace_back(std::move(fv)); + } + return ok; +} + +} diff --git a/streamingvisitors/src/vespa/vsm/common/storagedocument.h b/streamingvisitors/src/vespa/vsm/common/storagedocument.h new file mode 100644 index 00000000000..a7f21cb052f --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/common/storagedocument.h @@ -0,0 +1,59 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "document.h" +#include <vespa/document/fieldvalue/document.h> + +namespace vsm { + +typedef vespalib::CloneablePtr<document::FieldValue> FieldValueContainer; +typedef document::FieldPath FieldPath; // field path to navigate a field value +typedef std::vector<FieldPath> FieldPathMapT; // map from field id to field path +typedef std::shared_ptr<FieldPathMapT> SharedFieldPathMap; + +class StorageDocument : public Document { +public: + typedef std::unique_ptr<StorageDocument> UP; + + class SubDocument { + public: + SubDocument() : _fieldValue(nullptr) {} + SubDocument(document::FieldValue *fv, document::FieldValue::PathRange nested) : + _fieldValue(fv), + _range(nested) + { } + + const document::FieldValue *getFieldValue() const { return _fieldValue; } + void setFieldValue(document::FieldValue *fv) { _fieldValue = fv; } + const document::FieldValue::PathRange & getRange() const { return _range; } + void swap(SubDocument &rhs) { + std::swap(_fieldValue, rhs._fieldValue); + std::swap(_range, rhs._range); + } + private: + FieldPath::const_iterator begin() const; + FieldPath::const_iterator end() const; + document::FieldValue *_fieldValue; + document::FieldValue::PathRange _range; + }; +public: + StorageDocument(document::Document::UP doc, const SharedFieldPathMap &fim, size_t fieldNoLimit); + StorageDocument(const 
StorageDocument &) = delete; + StorageDocument & operator = (const StorageDocument &) = delete; + ~StorageDocument(); + + const document::Document &docDoc() const { return *_doc; } + bool valid() const { return _doc.get() != nullptr; } + const SubDocument &getComplexField(FieldIdT fId) const; + const document::FieldValue *getField(FieldIdT fId) const override; + bool setField(FieldIdT fId, document::FieldValue::UP fv) override ; + void saveCachedFields() const; +private: + document::Document::UP _doc; + SharedFieldPathMap _fieldMap; + mutable std::vector<SubDocument> _cachedFields; + mutable std::vector<document::FieldValue::UP> _backedFields; +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/config/.gitignore b/streamingvisitors/src/vespa/vsm/config/.gitignore new file mode 100644 index 00000000000..d58390943e2 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/config/.gitignore @@ -0,0 +1,4 @@ +.depend +Makefile +config-*.cpp +config-*.h diff --git a/streamingvisitors/src/vespa/vsm/config/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/config/CMakeLists.txt new file mode 100644 index 00000000000..fea0bafe6b2 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/config/CMakeLists.txt @@ -0,0 +1,11 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_library(vsm_vconfig OBJECT + SOURCES + DEPENDS +) +vespa_generate_config(vsm_vconfig vsmfields.def) +install_config_definition(vsmfields.def vespa.config.search.vsm.vsmfields.def) +vespa_generate_config(vsm_vconfig vsm.def) +install_config_definition(vsm.def vespa.config.search.vsm.vsm.def) +vespa_generate_config(vsm_vconfig vsmsummary.def) +install_config_definition(vsmsummary.def vespa.config.search.vsm.vsmsummary.def) diff --git a/streamingvisitors/src/vespa/vsm/config/vsm-cfif.h b/streamingvisitors/src/vespa/vsm/config/vsm-cfif.h new file mode 100644 index 00000000000..22033aee232 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/config/vsm-cfif.h @@ -0,0 +1,25 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/vsm/config/config-vsmfields.h> +#include <vespa/vsm/config/config-vsm.h> +#include <vespa/vsm/config/config-vsmsummary.h> +#include <vespa/vespalib/util/ptrholder.h> + +using vespa::config::search::vsm::VsmConfig; +using vespa::config::search::vsm::VsmsummaryConfig; +using vespa::config::search::vsm::VsmfieldsConfig; + +namespace vsm { + +typedef vespalib::PtrHolder<VsmfieldsConfig> VsmfieldsHolder; +typedef std::shared_ptr<VsmfieldsConfig> VsmfieldsHandle; + +typedef vespalib::PtrHolder<VsmConfig> VsmHolder; +typedef std::shared_ptr<VsmConfig> VsmHandle; + +typedef vespalib::PtrHolder<VsmsummaryConfig> FastS_VsmsummaryHolder; +typedef std::shared_ptr<VsmsummaryConfig> FastS_VsmsummaryHandle; + +} + diff --git a/streamingvisitors/src/vespa/vsm/config/vsm.def b/streamingvisitors/src/vespa/vsm/config/vsm.def new file mode 100644 index 00000000000..1971f9e9574 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/config/vsm.def @@ -0,0 +1,13 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+namespace=vespa.config.search.vsm + +## The document model for the documents used as input for the VSM +doctype reference + +## Configuration for storage client used by VSM +storagecfg reference + +## Config defining what search method should be applied to different +## fields in the documents. It also contains a mapping from index name +## to a set of fields making up that index. +vsmfields reference diff --git a/streamingvisitors/src/vespa/vsm/config/vsmfields.def b/streamingvisitors/src/vespa/vsm/config/vsmfields.def new file mode 100644 index 00000000000..5e943c9274d --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/config/vsmfields.def @@ -0,0 +1,31 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +namespace=vespa.config.search.vsm + +## Level of verification applied to the documents received. +documentverificationlevel int default=0 + +## Set if one should ignore limit hits. +searchall int default=1 + +## The name of a field for which we are assigning a search method. +## The field name refers directly to a field in the document model. +fieldspec[].name string + +## The search method for a given field. Note: a field with the same name in two different document types must have the same type in both; if not, the result is unpredictable. +fieldspec[].searchmethod enum { NONE, BOOL, AUTOUTF8, UTF8, SSE2UTF8, INT8, INT16, INT32, INT64, FLOAT16, FLOAT, DOUBLE, GEOPOS } default=AUTOUTF8 +fieldspec[].arg1 string default="" + +## Maximum number of chars to search per field. +fieldspec[].maxlength int default=1048576 + +## Type of the field +fieldspec[].fieldtype enum {ATTRIBUTE, INDEX} default=INDEX + +## The name of a documenttype for which we are assigning a set of indexes. +documenttype[].name string +## The name of an index of a documenttype for which we are assigning a set of fields. +documenttype[].index[].name string + +## The name of a field part of an index. 
+## The field name refers directly to a field in the document model. +documenttype[].index[].field[].name string diff --git a/streamingvisitors/src/vespa/vsm/config/vsmsummary.def b/streamingvisitors/src/vespa/vsm/config/vsmsummary.def new file mode 100644 index 00000000000..5eb96624826 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/config/vsmsummary.def @@ -0,0 +1,21 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +namespace=vespa.config.search.vsm + +## The name of the result class that should be generated for documents +## returned from the VSM. If this value is empty, the first found +## result class will be used. +outputclass string default="" + +## Mapping of field names between the result class and the document +## model. This value represents the name in the result class. Fields +## not mentioned here will get the identity mapping. +fieldmap[].summary string + +## Mapping of field names between the result class and the document +## model. This field vector represents the names in the document model +## that should be used as input when generating the summary field. +fieldmap[].document[].field string + +## This command specifies how the document fields should be combined +## when generating the summary field. +fieldmap[].command enum { NONE, FLATTENJUNIPER, FLATTENSPACE } default=NONE diff --git a/streamingvisitors/src/vespa/vsm/searcher/.gitignore b/streamingvisitors/src/vespa/vsm/searcher/.gitignore new file mode 100644 index 00000000000..95bc02923a9 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/.gitignore @@ -0,0 +1,5 @@ +*.exe +*.ilk +*.pdb +.depend* +Makefile diff --git a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt new file mode 100644 index 00000000000..0a2a9ec21d2 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt @@ -0,0 +1,28 @@ +# Copyright Yahoo. 
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") + set(SSE2_FILES "fold.cpp") +else() + unset(SSE2_FILES) +endif() + +vespa_add_library(vsm_vsmsearcher OBJECT + SOURCES + boolfieldsearcher.cpp + fieldsearcher.cpp + floatfieldsearcher.cpp + ${SSE2_FILES} + futf8strchrfieldsearcher.cpp + geo_pos_field_searcher.cpp + intfieldsearcher.cpp + strchrfieldsearcher.cpp + utf8flexiblestringfieldsearcher.cpp + utf8strchrfieldsearcher.cpp + utf8stringfieldsearcherbase.cpp + utf8substringsearcher.cpp + utf8substringsnippetmodifier.cpp + utf8suffixstringfieldsearcher.cpp + utf8exactstringfieldsearcher.cpp + DEPENDS + vsm_vconfig +) diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp new file mode 100644 index 00000000000..8c9b556e593 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp @@ -0,0 +1,56 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include "boolfieldsearcher.h" +#include <vespa/document/fieldvalue/boolfieldvalue.h> + +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; + +namespace vsm { + +namespace { +vespalib::stringref TRUE = "true"; +vespalib::stringref FALSE = "false"; +} + +std::unique_ptr<FieldSearcher> +BoolFieldSearcher::duplicate() const +{ + return std::make_unique<BoolFieldSearcher>(*this); +} + +BoolFieldSearcher::BoolFieldSearcher(FieldIdT fId) : + FieldSearcher(fId), + _terms() +{ } + +BoolFieldSearcher::~BoolFieldSearcher() = default; + +void BoolFieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf) +{ + _terms.clear(); + FieldSearcher::prepare(qtl, buf); + for (const QueryTerm * qt : qtl) { + if (TRUE == qt->getTerm()) { + _terms.push_back(true); + } else if (FALSE == qt->getTerm()) { + _terms.push_back(false); + } else { + int64_t low; + int64_t high; + bool valid = qt->getAsIntegerTerm(low, high); + _terms.push_back(valid && (low > 0)); + } + } +} + +void BoolFieldSearcher::onValue(const document::FieldValue & fv) +{ + for(size_t j=0, jm(_terms.size()); j < jm; j++) { + if (static_cast<const document::BoolFieldValue &>(fv).getValue() == _terms[j]) { + addHit(*_qtl[j], 0); + } + } + ++_words; +} + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h new file mode 100644 index 00000000000..f6afef9e507 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h @@ -0,0 +1,21 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include "fieldsearcher.h" + +namespace vsm { + +class BoolFieldSearcher : public FieldSearcher +{ +public: + std::unique_ptr<FieldSearcher> duplicate() const override; + BoolFieldSearcher(FieldIdT fId); + ~BoolFieldSearcher(); + void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override; + void onValue(const document::FieldValue & fv) override; +private: + std::vector<bool> _terms; +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp new file mode 100644 index 00000000000..e69999b160e --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp @@ -0,0 +1,301 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "fieldsearcher.h" +#include <vespa/vsm/vsm/fieldsearchspec.h> +#include <vespa/document/fieldvalue/arrayfieldvalue.h> +#include <vespa/document/fieldvalue/weightedsetfieldvalue.h> + +#include <vespa/log/log.h> +LOG_SETUP(".vsm.searcher.fieldsearcher"); + +using search::byte; +using search::streaming::Query; +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; +using search::v16qi; + +namespace vsm { + +class force +{ + public: + force() { FieldSearcher::init(); } +}; + +static force __forceInit; + +byte FieldSearcher::_foldLowCase[256]; +byte FieldSearcher::_wordChar[256]; + +FieldSearcherBase::FieldSearcherBase() : + _qtl(), + _qtlFastBuffer(), + _qtlFastSize(0), + _qtlFast(nullptr) +{ +} + +FieldSearcherBase::FieldSearcherBase(const FieldSearcherBase & org) : + _qtl(), + _qtlFastBuffer(), + _qtlFastSize(0), + _qtlFast(nullptr) +{ + prepare(org._qtl); +} + +FieldSearcherBase::~FieldSearcherBase() +{ +} + +FieldSearcherBase & FieldSearcherBase::operator = (const FieldSearcherBase & org) +{ + if (this != &org) { + prepare(org._qtl); + } + return *this; +} + +void FieldSearcherBase::prepare(const QueryTermList & 
qtl) +{ + _qtl = qtl; + _qtlFastBuffer.resize(sizeof(*_qtlFast)*(_qtl.size()+1), 0x13); + _qtlFast = reinterpret_cast<v16qi *>(reinterpret_cast<unsigned long>(&_qtlFastBuffer[0]+15) & ~0xf); + _qtlFastSize = 0; + for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { + const QueryTerm & qt = **it; + memcpy(&_qtlFast[_qtlFastSize++], qt.getTerm(), std::min(size_t(16), qt.termLen())); + } +} + +FieldSearcher::FieldSearcher(const FieldIdT & fId, bool defaultPrefix) : + FieldSearcherBase(), + _field(fId), + _matchType(defaultPrefix ? PREFIX : REGULAR), + _maxFieldLength(0x100000), + _currentElementId(0), + _currentElementWeight(1), + _pureUsAsciiCount(0), + _pureUsAsciiFieldCount(0), + _anyUtf8Count(0), + _anyUtf8FieldCount(0), + _words(0), + _badUtf8Count(0), + _zeroCount(0) +{ + zeroStat(); +} + +FieldSearcher::~FieldSearcher() = default; + +bool FieldSearcher::search(const StorageDocument & doc) +{ + for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { + QueryTerm & qt = **it; + QueryTerm::FieldInfo & fInfo = qt.getFieldInfo(field()); + fInfo.setHitOffset(qt.getHitList().size()); + } + onSearch(doc); + for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { + QueryTerm & qt = **it; + QueryTerm::FieldInfo & fInfo = qt.getFieldInfo(field()); + fInfo.setHitCount(qt.getHitList().size() - fInfo.getHitOffset()); + fInfo.setFieldLength(_words); + } + _words = 0; + return true; +} + +void FieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & UNUSED_PARAM(buf)) +{ + FieldSearcherBase::prepare(qtl); + prepareFieldId(); +} + +size_t FieldSearcher::countWords(const FieldRef & f) +{ + size_t words = 0; // number of maximal runs of word characters found in f + const char * n = f.data(); + const char * e = n + f.size(); + while (n < e) { // each pass consumes at least one byte, since isspace() is the negation of iswordchar() + for (; (n < e) && isspace(*n); ++n); // skip separators; n < e is tested first so *e is never read + const char * m = n; // start of the next candidate word + for (; (n < e) && iswordchar(*n); ++n); // consume the word; n stops at e and never moves past it + if (n > m) { + words++; + } + } + return words; +} + +void FieldSearcher::prepareFieldId() +{ + 
for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { + QueryTerm & qt = **it; + qt.resizeFieldId(field()); + } +} + +void FieldSearcher::addStat(const FieldSearcher & toAdd) +{ + _pureUsAsciiCount += toAdd._pureUsAsciiCount; + _pureUsAsciiFieldCount += toAdd._pureUsAsciiFieldCount; + _anyUtf8Count += toAdd._anyUtf8Count; + _anyUtf8FieldCount += toAdd._anyUtf8FieldCount; + _badUtf8Count += toAdd._badUtf8Count; + _zeroCount += toAdd._zeroCount; + for (size_t i=0; i<NELEMS(_utf8Count); i++) { _utf8Count[i] += toAdd._utf8Count[i]; } +} + +void FieldSearcher::zeroStat() +{ + _pureUsAsciiCount = 0; + _pureUsAsciiFieldCount = 0; + _anyUtf8Count = 0; + _anyUtf8FieldCount = 0; + _badUtf8Count = 0; + _zeroCount = 0; + for (size_t i=0; i<NELEMS(_utf8Count); i++) { _utf8Count[i] = 0; } +} + +void FieldSearcher::init() +{ + for (unsigned i = 0; i < NELEMS(_foldLowCase); i++) { + _foldLowCase[i] = 0; + _wordChar[i] = 0; + } + for (int i = 'A'; i <= 'Z'; i++) { + _wordChar[i] = 0xFF; + _foldLowCase[i] = i | 0x20; + } + for (int i = 'a'; i <= 'z'; i++) { + _wordChar[i] = 0xFF; + _foldLowCase[i] = i; + } + for (int i = '0'; i <= '9'; i++) { + _wordChar[i] = 0xFF; + _foldLowCase[i] = i; + } + for (int i = 0xC0; i <= 0xFF; i++) { + _wordChar[i] = 0xFF; + } + _wordChar[0xd7] = 0; + _wordChar[0xf7] = 0; + + if (1) /* _doAccentRemoval */ { + _foldLowCase[0xc0] = 'a'; + _foldLowCase[0xc1] = 'a'; + _foldLowCase[0xc2] = 'a'; + _foldLowCase[0xc3] = 'a'; // A tilde + _foldLowCase[0xc7] = 'c'; + _foldLowCase[0xc8] = 'e'; + _foldLowCase[0xc9] = 'e'; + _foldLowCase[0xca] = 'e'; + _foldLowCase[0xcb] = 'e'; + _foldLowCase[0xcc] = 'i'; // I grave + _foldLowCase[0xcd] = 'i'; + _foldLowCase[0xce] = 'i'; + _foldLowCase[0xcf] = 'i'; + _foldLowCase[0xd3] = 'o'; + _foldLowCase[0xd4] = 'o'; + _foldLowCase[0xda] = 'u'; + _foldLowCase[0xdb] = 'u'; + + _foldLowCase[0xe0] = 'a'; + _foldLowCase[0xe1] = 'a'; + _foldLowCase[0xe2] = 'a'; + _foldLowCase[0xe3] = 'a'; // a tilde + 
_foldLowCase[0xe7] = 'c'; + _foldLowCase[0xe8] = 'e'; + _foldLowCase[0xe9] = 'e'; + _foldLowCase[0xea] = 'e'; + _foldLowCase[0xeb] = 'e'; + _foldLowCase[0xec] = 'i'; // i grave + _foldLowCase[0xed] = 'i'; + _foldLowCase[0xee] = 'i'; + _foldLowCase[0xef] = 'i'; + _foldLowCase[0xf3] = 'o'; + _foldLowCase[0xf4] = 'o'; + _foldLowCase[0xfa] = 'u'; + _foldLowCase[0xfb] = 'u'; + } +} + +void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT & difm, const SharedSearcherBuf & searcherBuf, Query & query) +{ + QueryTermList qtl; + query.getLeafs(qtl); + vespalib::string tmp; + for (FieldIdTSearcherMap::iterator it = begin(), mt = end(); it != mt; it++) { + QueryTermList onlyInIndex; + FieldIdT fid = (*it)->field(); + for (QueryTermList::iterator qt = qtl.begin(), mqt = qtl.end(); qt != mqt; qt++) { + QueryTerm * q = *qt; + for (DocumentTypeIndexFieldMapT::const_iterator dt(difm.begin()), dmt(difm.end()); dt != dmt; dt++) { + const IndexFieldMapT & fim = dt->second; + IndexFieldMapT::const_iterator found = fim.find(FieldSearchSpecMap::stripNonFields(q->index())); + if (found != fim.end()) { + const FieldIdTList & index = found->second; + if ((find(index.begin(), index.end(), fid) != index.end()) && (find(onlyInIndex.begin(), onlyInIndex.end(), q) == onlyInIndex.end())) { + onlyInIndex.push_back(q); + } + } else { + LOG(debug, "Could not find the requested index=%s in the index config map. 
Query does not fit search definition.", q->index().c_str()); + } + } + } + /// Should perhaps do a unique on onlyInIndex + (*it)->prepare(onlyInIndex, searcherBuf); + if (logger.wants(ns_log::Logger::spam)) { + char tmpBuf[16]; + sprintf(tmpBuf,"%d", fid); + tmp += tmpBuf; + tmp += ", "; + } + } + LOG(debug, "Will search in %s", tmp.c_str()); +} + +bool FieldSearcher::onSearch(const StorageDocument & doc) +{ + bool retval(true); + size_t fNo(field()); + const StorageDocument::SubDocument & sub = doc.getComplexField(fNo); + if (sub.getFieldValue() != nullptr) { + IteratorHandler ih(*this); + sub.getFieldValue()->iterateNested(sub.getRange(), ih); + } + return retval; +} + +void +FieldSearcher::IteratorHandler::onPrimitive(uint32_t, const Content & c) +{ + LOG(spam, "onPrimitive: field value '%s'", c.getValue().toString().c_str()); + _searcher.setCurrentWeight(c.getWeight()); + _searcher.setCurrentElementId(getArrayIndex()); + _searcher.onValue(c.getValue()); +} + +void +FieldSearcher::IteratorHandler::onCollectionStart(const Content & c) +{ + const document::FieldValue & fv = c.getValue(); + LOG(spam, "onCollectionStart: field value '%s'", fv.toString().c_str()); + if (fv.isA(document::FieldValue::Type::ARRAY)) { + const document::ArrayFieldValue & afv = static_cast<const document::ArrayFieldValue &>(fv); + LOG(spam, "onCollectionStart: Array size = '%zu'", afv.size()); + } else if (fv.isA(document::FieldValue::Type::WSET)) { + const document::WeightedSetFieldValue & wsfv = static_cast<const document::WeightedSetFieldValue &>(fv); + LOG(spam, "onCollectionStart: WeightedSet size = '%zu'", wsfv.size()); + } +} + +void +FieldSearcher::IteratorHandler::onStructStart(const Content & c) +{ + LOG(spam, "onStructStart: field value '%s'", c.getValue().toString().c_str()); + _searcher.onStructValue(static_cast<const document::StructFieldValue &>(c.getValue())); +} + + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h 
b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h new file mode 100644 index 00000000000..5c2ef8fec28 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h @@ -0,0 +1,147 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/document/fieldvalue/iteratorhandler.h> +#include <vespa/searchlib/query/streaming/query.h> +#include <vespa/vsm/common/document.h> +#include <vespa/vsm/common/storagedocument.h> + +namespace vsm { + +typedef size_t termcount_t; +typedef size_t termsize_t; + +#if defined(COLLECT_CHAR_STAT) + #define NEED_CHAR_STAT(a) { a; } +#else + #define NEED_CHAR_STAT(a) +#endif + +typedef ucs4_t cmptype_t; +typedef vespalib::Array<cmptype_t> SearcherBuf; +typedef std::shared_ptr<SearcherBuf> SharedSearcherBuf; +typedef std::vector<char> CharVector; + +class FieldSearcherBase +{ +protected: + search::streaming::QueryTermList _qtl; +private: + CharVector _qtlFastBuffer; +protected: + FieldSearcherBase(); + FieldSearcherBase(const FieldSearcherBase & org); + virtual ~FieldSearcherBase(void); + FieldSearcherBase & operator = (const FieldSearcherBase & org); + void prepare(const search::streaming::QueryTermList & qtl); + size_t _qtlFastSize; + search::v16qi *_qtlFast; +}; + +class FieldSearcher : public FieldSearcherBase +{ +public: + enum MatchType { + REGULAR, + PREFIX, + SUBSTRING, + SUFFIX, + EXACT + }; + + FieldSearcher(const FieldIdT & fId, bool defaultPrefix=false); + ~FieldSearcher() override; + virtual std::unique_ptr<FieldSearcher> duplicate() const = 0; + bool search(const StorageDocument & doc); + virtual void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf); + const FieldIdT & field() const { return _field; } + void field(const FieldIdT & v) { _field = v; prepareFieldId(); } + bool prefix() const { return _matchType == PREFIX; } + bool substring() const { return _matchType == SUBSTRING; } + bool 
suffix() const { return _matchType == SUFFIX; } + bool exact() const { return _matchType == EXACT; } + void setMatchType(MatchType mt) { _matchType = mt; } + static void init(); + static search::byte fold(search::byte c) { return _foldLowCase[c]; } + static search::byte iswordchar(search::byte c) { return _wordChar[c]; } + static search::byte isspace(search::byte c) { return ! iswordchar(c); } + static size_t countWords(const FieldRef & f); + unsigned pureUsAsciiCount() const { return _pureUsAsciiCount; } + unsigned pureUsAsciiFieldCount() const { return _pureUsAsciiFieldCount; } + unsigned anyUtf8Count() const { return _anyUtf8Count; } + unsigned anyUtf8FieldCount() const { return _anyUtf8FieldCount; } + unsigned badUtf8Count() const { return _badUtf8Count; } + unsigned zeroCount() const { return _zeroCount; } + unsigned utf8Count(size_t sz) const { return _utf8Count[1+sz]; } + const unsigned * utf8Count() const { return _utf8Count; } + int32_t getCurrentWeight() const { return _currentElementWeight; } + void addStat(const FieldSearcher & toAdd); + void zeroStat(); + FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; } + size_t maxFieldLength() const { return _maxFieldLength; } + +private: + class IteratorHandler : public document::fieldvalue::IteratorHandler { + private: + FieldSearcher & _searcher; + + void onPrimitive(uint32_t fid, const Content & c) override; + void onCollectionStart(const Content & c) override; + void onStructStart(const Content & c) override; + + public: + IteratorHandler(FieldSearcher & searcher) : _searcher(searcher) {} + }; + friend class IteratorHandler; // to allow calls to onValue(); + + void prepareFieldId(); + void setCurrentWeight(int32_t weight) { _currentElementWeight = weight; } + void setCurrentElementId(int32_t weight) { _currentElementId = weight; } + bool onSearch(const StorageDocument & doc); + virtual void onValue(const document::FieldValue & fv) = 0; + virtual void 
onStructValue(const document::StructFieldValue &) { } + FieldIdT _field; + MatchType _matchType; + unsigned _maxFieldLength; + uint32_t _currentElementId; + int32_t _currentElementWeight; // Contains the weight of the current item being evaluated. + /// Number of bytes in blocks containing pure us-ascii + unsigned _pureUsAsciiCount; + /// Number of blocks containing pure us-ascii + unsigned _pureUsAsciiFieldCount; + /// Number of bytes in blocks containing any non us-ascii + unsigned _anyUtf8Count; + /// Number of blocks containing any non us-ascii + unsigned _anyUtf8FieldCount; +protected: + /// Number of terms searched. + unsigned _words; + /// Number of utf8 bytes by utf8 size. + unsigned _utf8Count[6]; + unsigned _badUtf8Count; + unsigned _zeroCount; +protected: + void addPureUsAsciiField(size_t sz) { _pureUsAsciiCount += sz; _pureUsAsciiFieldCount++;; } + void addAnyUtf8Field(size_t sz) { _anyUtf8Count += sz; _anyUtf8FieldCount++; } + /** + * Adds a hit to the given query term. + * For each call to onValue() a batch of words are processed, and the position is local to this batch. 
+ **/ + void addHit(search::streaming::QueryTerm & qt, uint32_t pos) const { + qt.add(_words + pos, field(), _currentElementId, getCurrentWeight()); + } +public: + static search::byte _foldLowCase[256]; + static search::byte _wordChar[256]; +}; + +typedef std::unique_ptr<FieldSearcher> FieldSearcherContainer; +typedef std::vector<FieldSearcherContainer> FieldIdTSearcherMapT; + +class FieldIdTSearcherMap : public FieldIdTSearcherMapT +{ +public: + void prepare(const DocumentTypeIndexFieldMapT & difm, const SharedSearcherBuf & searcherBuf, search::streaming::Query & query); +}; + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp new file mode 100644 index 00000000000..02d8bd8c12a --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp @@ -0,0 +1,70 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "floatfieldsearcher.h" + +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; + +namespace vsm { + +std::unique_ptr<FieldSearcher> +FloatFieldSearcher::duplicate() const +{ + return std::make_unique<FloatFieldSearcher>(*this); +} + +std::unique_ptr<FieldSearcher> +DoubleFieldSearcher::duplicate() const +{ + return std::make_unique<DoubleFieldSearcher>(*this); +} + +template<typename T> +FloatFieldSearcherT<T>::FloatFieldSearcherT(FieldIdT fId) : + FieldSearcher(fId), + _floatTerm() +{} + +template<typename T> +FloatFieldSearcherT<T>::~FloatFieldSearcherT() {} + +template<typename T> +void FloatFieldSearcherT<T>::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf) +{ + _floatTerm.clear(); + FieldSearcher::prepare(qtl, buf); + // onValue() pairs _floatTerm[j] with _qtl[j], so every query term must get + // exactly one entry here. An empty term gets an invalid entry instead of being + // skipped; skipping it would shift all later entries onto the wrong query term. + for (const QueryTerm * qt : qtl) { + double low = 0.0; + double high = 0.0; + const bool hasTerm = (qt->termLen() != 0); + bool valid = hasTerm && qt->getAsDoubleTerm(low, high); + 
_floatTerm.push_back(FloatInfo(low, high, valid)); + } +} + + +template<typename T> +void FloatFieldSearcherT<T>::onValue(const document::FieldValue & fv) +{ + for(size_t j=0, jm(_floatTerm.size()); j < jm; j++) { + const FloatInfo & ii = _floatTerm[j]; + if (ii.valid() && (ii.cmp(fv.getAsDouble()))) { + addHit(*_qtl[j], 0); + } + } + ++_words; +} + +template<typename T> +bool FloatFieldSearcherT<T>::FloatInfo::cmp(T key) const +{ + return (_lower <= key) && (key <= _upper); +} + +template class FloatFieldSearcherT<float>; +template class FloatFieldSearcherT<double>; + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h new file mode 100644 index 00000000000..98018fbf4a3 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h @@ -0,0 +1,53 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "fieldsearcher.h" + +namespace vsm { + +template <typename T> +class FloatFieldSearcherT : public FieldSearcher +{ +public: + FloatFieldSearcherT(FieldIdT fId=0); + ~FloatFieldSearcherT(); + void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override; + void onValue(const document::FieldValue & fv) override; +protected: + class FloatInfo + { + public: + FloatInfo(T low, T high, bool v) : _lower(low), _upper(high), _valid(v) { if (low > high) { _lower = high; _upper = low; } } + bool cmp(T key) const; + bool valid() const { return _valid; } + void setValid(bool v) { _valid = v; } + T getLow() const { return _lower; } + T getHigh() const { return _upper; } + private: + T _lower; + T _upper; + bool _valid; + }; + typedef std::vector<FloatInfo> FloatInfoListT; + FloatInfoListT _floatTerm; +}; + +typedef FloatFieldSearcherT<float> FloatFieldSearcherTF; +typedef FloatFieldSearcherT<double> FloatFieldSearcherTD; + +class FloatFieldSearcher : public 
FloatFieldSearcherTF +{ +public: + std::unique_ptr<FieldSearcher> duplicate() const override; + FloatFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTF(fId) { } +}; + +class DoubleFieldSearcher : public FloatFieldSearcherTD +{ +public: + std::unique_ptr<FieldSearcher> duplicate() const override; + DoubleFieldSearcher(FieldIdT fId=0) : FloatFieldSearcherTD(fId) { } +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/searcher/fold.cpp b/streamingvisitors/src/vespa/vsm/searcher/fold.cpp new file mode 100644 index 00000000000..bd2392d3ad6 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/fold.cpp @@ -0,0 +1,153 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +// +#include "fold.h" + +namespace vsm { + +const unsigned char * sse2_foldaa(const unsigned char * toFoldOrg, size_t sz, unsigned char * foldedOrg) +{ + typedef char v16qi __attribute__ ((__vector_size__(16))); + typedef long long v2di __attribute__ ((__vector_size__(16))); + static v16qi _G_0 = { '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1 }; + static v16qi _G_9 = { '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9' }; + static v16qi _G_a = { 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1 }; + static v16qi _G_z = { 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z' }; + static v16qi _G_8bit = { (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, + (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4 }; + static v2di _G_lowCase = { 0x2020202020202020ULL, 0x2020202020202020ULL }; + const v16qi *toFold = reinterpret_cast<const v16qi *>(toFoldOrg); + v2di * folded = reinterpret_cast<v2di *>(foldedOrg); + size_t i=0; + for (size_t m=sz/16; i < m; i++) + { +#ifndef 
__INTEL_COMPILER + int nonAscii = __builtin_ia32_pmovmskb128(toFold[i]); + if (nonAscii) + { +#ifdef __clang__ + v16qi non8Mask = _G_8bit > toFold[i]; +#else + v16qi non8Mask = __builtin_ia32_pcmpgtb128(_G_8bit, toFold[i]); +#endif + int non8bit = __builtin_ia32_pmovmskb128(non8Mask); + if (non8bit) + { + break; + } + break; + } +#ifdef __clang__ + v16qi _0 = toFold[i] > _G_0; + v16qi _z = toFold[i] > _G_z; + v2di _0_z = v2di(_0) ^ v2di(_z); + v2di toLow = _0_z & v2di(toFold[i]); + v16qi low = v16qi(toLow | _G_lowCase); + _0 = low > _G_0; + v16qi _9 = low > _G_9; + v16qi _a = low > _G_a; + _z = low > _G_z; + v2di _0_9_m = v2di(_0) ^ v2di(_9); + v2di _a_z_m = v2di(_a) ^ v2di(_z); + v2di _0_9 = _0_9_m & v2di(low); + v2di _a_z = _a_z_m & v2di(low); + folded[i] = _0_9 | _a_z; +#else + v16qi _0 = __builtin_ia32_pcmpgtb128(toFold[i], _G_0); + v16qi _z = __builtin_ia32_pcmpgtb128(toFold[i], _G_z); + v2di _0_z = __builtin_ia32_pxor128(v2di(_0), v2di(_z)); + v2di toLow = __builtin_ia32_pand128(_0_z, v2di(toFold[i])); + v16qi low = v16qi(__builtin_ia32_por128(toLow, _G_lowCase)); + _0 = __builtin_ia32_pcmpgtb128(low, _G_0); + v16qi _9 = __builtin_ia32_pcmpgtb128(low, _G_9); + v16qi _a = __builtin_ia32_pcmpgtb128(low, _G_a); + _z = __builtin_ia32_pcmpgtb128(low, _G_z); + v2di _0_9_m = __builtin_ia32_pxor128(v2di(_0), v2di(_9)); + v2di _a_z_m = __builtin_ia32_pxor128(v2di(_a), v2di(_z)); + v2di _0_9 = __builtin_ia32_pand128(_0_9_m, v2di(low)); + v2di _a_z = __builtin_ia32_pand128(_a_z_m, v2di(low)); + folded[i] = __builtin_ia32_por128(_0_9, _a_z); +#endif +#else +# warning "Intel's icc compiler does not like __builtin_ia32_pxor128" + LOG_ABORT("should not be reached"); +#endif + } + return toFoldOrg+i*16; +} + +const unsigned char * sse2_foldua(const unsigned char * toFoldOrg, size_t sz, unsigned char * foldedOrg) +{ + typedef char v16qi __attribute__ ((__vector_size__(16))); + typedef long long v2di __attribute__ ((__vector_size__(16))); + static v16qi _G_0 = { '0'-1, '0'-1, 
'0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1 }; + static v16qi _G_9 = { '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9' }; + static v16qi _G_a = { 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1 }; + static v16qi _G_z = { 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z' }; + static v16qi _G_8bit = { (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, + (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4, (char)0xc4 }; + static v2di _G_lowCase = { 0x2020202020202020ULL, 0x2020202020202020ULL }; + v2di * folded = reinterpret_cast<v2di *>(foldedOrg); + size_t i=0; + for (size_t m=sz/16; i < m; i++) + { +#ifndef __INTEL_COMPILER +#ifdef __clang__ + v16qi current = __builtin_ia32_lddqu(reinterpret_cast<const char *>(&toFoldOrg[i*16])); +#else + v16qi current = __builtin_ia32_loaddqu(reinterpret_cast<const char *>(&toFoldOrg[i*16])); +#endif + int nonAscii = __builtin_ia32_pmovmskb128(current); + if (nonAscii) + { +#ifdef __clang__ + v16qi non8Mask = _G_8bit > current; +#else + v16qi non8Mask = __builtin_ia32_pcmpgtb128(_G_8bit, current); +#endif + int non8bit = __builtin_ia32_pmovmskb128(non8Mask); + if (non8bit) + { + break; + } + break; + } +#ifdef __clang__ + v16qi _0 = current > _G_0; + v16qi _z = current > _G_z; + v2di _0_z = v2di(_0) ^ v2di(_z); + v2di toLow = _0_z & v2di(current); + v16qi low = v16qi(toLow | _G_lowCase); + _0 = low > _G_0; + v16qi _9 = low > _G_9; + v16qi _a = low > _G_a; + _z = low > _G_z; + v2di _0_9_m = v2di(_0) ^ v2di(_9); + v2di _a_z_m = v2di(_a) ^ v2di(_z); + v2di _0_9 = _0_9_m & v2di(low); + v2di _a_z = _a_z_m & v2di(low); + folded[i] = _0_9 | _a_z; +#else + v16qi _0 = __builtin_ia32_pcmpgtb128(current, _G_0); + v16qi _z = __builtin_ia32_pcmpgtb128(current, _G_z); + v2di _0_z = 
__builtin_ia32_pxor128(v2di(_0), v2di(_z)); + v2di toLow = __builtin_ia32_pand128(_0_z, v2di(current)); + v16qi low = v16qi(__builtin_ia32_por128(toLow, _G_lowCase)); + _0 = __builtin_ia32_pcmpgtb128(low, _G_0); + v16qi _9 = __builtin_ia32_pcmpgtb128(low, _G_9); + v16qi _a = __builtin_ia32_pcmpgtb128(low, _G_a); + _z = __builtin_ia32_pcmpgtb128(low, _G_z); + v2di _0_9_m = __builtin_ia32_pxor128(v2di(_0), v2di(_9)); + v2di _a_z_m = __builtin_ia32_pxor128(v2di(_a), v2di(_z)); + v2di _0_9 = __builtin_ia32_pand128(_0_9_m, v2di(low)); + v2di _a_z = __builtin_ia32_pand128(_a_z_m, v2di(low)); + folded[i] = __builtin_ia32_por128(_0_9, _a_z); +#endif +#else +# warning "Intel's icc compiler does not like __builtin_ia32_pxor128" + LOG_ABORT("should not be reached"); +#endif + } + return toFoldOrg+i*16; +} + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/fold.h b/streamingvisitors/src/vespa/vsm/searcher/fold.h new file mode 100644 index 00000000000..578b883484f --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/fold.h @@ -0,0 +1,12 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/vsm/common/document.h> + +namespace vsm { + +const search::byte * sse2_foldaa(const search::byte * toFoldOrg, size_t sz, search::byte * foldedOrg); +const search::byte * sse2_foldua(const search::byte * toFoldOrg, size_t sz, search::byte * foldedOrg); + +} + diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp new file mode 100644 index 00000000000..fc5d77de419 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp @@ -0,0 +1,310 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "futf8strchrfieldsearcher.h" +#ifdef __x86_64__ +#include "fold.h" +#endif +#include <vespa/vespalib/util/size_literals.h> + +using search::byte; +using search::streaming::QueryTerm; +using search::v16qi; +using vespalib::Optimized; + +namespace vsm { + +std::unique_ptr<FieldSearcher> +FUTF8StrChrFieldSearcher::duplicate() const +{ + return std::make_unique<FUTF8StrChrFieldSearcher>(*this); +} + +FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher() + : UTF8StrChrFieldSearcher(), + _folded(4_Ki) +{ } +FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher(FieldIdT fId) + : UTF8StrChrFieldSearcher(fId), + _folded(4_Ki) +{ } +FUTF8StrChrFieldSearcher::~FUTF8StrChrFieldSearcher() {} + +bool +FUTF8StrChrFieldSearcher::ansiFold(const char * toFold, size_t sz, char * folded) +{ + bool retval(true); + for(size_t i=0; i < sz; i++) { + byte c = toFold[i]; + if (c>=128) { retval = false; break; } + folded[i] = FieldSearcher::_foldLowCase[c]; + } + return retval; +} + +bool +FUTF8StrChrFieldSearcher::lfoldaa(const char * toFold, size_t sz, char * folded, size_t & unalignedStart) +{ + unalignedStart = (size_t(toFold) & 0xF); +#ifdef __x86_64__ + bool retval(true); + size_t unalignedsz = std::min(sz, (16 - unalignedStart) & 0xF); + + size_t foldedUnaligned = (size_t(folded) & 0xF); + unalignedStart = (foldedUnaligned < unalignedStart) ? 
(unalignedStart-foldedUnaligned) : unalignedStart + 16 - foldedUnaligned; + size_t alignedStart = unalignedStart+unalignedsz; + + size_t alignedsz = sz - unalignedsz; + size_t alignsz16 = alignedsz & 0xFFFFFFF0; + size_t rest = alignedsz - alignsz16; + + if (unalignedStart) { + retval = ansiFold(toFold, unalignedsz, folded + unalignedStart); + } + if (alignsz16 && retval) { + const byte * end = sse2_foldaa(reinterpret_cast<const byte *>(toFold+unalignedsz), alignsz16, reinterpret_cast<byte *>(folded+alignedStart)); + retval = (end == reinterpret_cast<const byte *>(toFold+unalignedsz+alignsz16)); + } + if(rest && retval) { + retval = ansiFold(toFold + unalignedsz + alignsz16, rest, folded+alignedStart+alignsz16); + } + return retval; +#else + return ansiFold(toFold, sz, folded + unalignedStart); +#endif +} + +bool +FUTF8StrChrFieldSearcher::lfoldua(const char * toFold, size_t sz, char * folded, size_t & alignedStart) +{ + alignedStart = 0xF - (size_t(folded + 0xF) % 0x10); +#ifdef __x86_64__ + bool retval(true); + + size_t alignsz16 = sz & 0xFFFFFFF0; + size_t rest = sz - alignsz16; + + if (alignsz16) { + const byte * end = sse2_foldua(reinterpret_cast<const byte *>(toFold), alignsz16, reinterpret_cast<byte *>(folded+alignedStart)); + retval = (end == reinterpret_cast<const byte *>(toFold+alignsz16)); + } + if(rest && retval) { + retval = ansiFold(toFold + alignsz16, rest, folded+alignedStart+alignsz16); + } + return retval; +#else + return ansiFold(toFold, sz, folded + alignedStart); +#endif +} + +namespace { + +#ifdef __x86_64__ +inline const char * advance(const char * n, const v16qi zero) +{ + uint32_t charMap = 0; + unsigned zeroCountSum = 0; + do { // find first '\0' character (the end of the word) +#ifndef __INTEL_COMPILER +#ifdef __clang__ + v16qi tmpCurrent = __builtin_ia32_lddqu(n+zeroCountSum); + v16qi tmp0 = tmpCurrent == zero; +#else + v16qi tmpCurrent = __builtin_ia32_loaddqu(n+zeroCountSum); + v16qi tmp0 = __builtin_ia32_pcmpeqb128(tmpCurrent, 
reinterpret_cast<v16qi>(zero)); +#endif + charMap = __builtin_ia32_pmovmskb128(tmp0); // 1 in charMap equals to '\0' in input buffer +#else +# warning "Intel's icc compiler does not like __builtin_ia32_xxxxx" + LOG_ABORT("should not be reached"); +#endif + zeroCountSum += 16; + } while (!charMap); + int charCount = Optimized::lsbIdx(charMap); // number of word characters in last 16 bytes + uint32_t zeroMap = ((~charMap) & 0xffff) >> charCount; + + int zeroCounter = Optimized::lsbIdx(zeroMap); // number of non-characters ('\0') in last 16 bytes + int sum = zeroCountSum - 16 + charCount + zeroCounter; + if (!zeroMap) { // only '\0' in last 16 bytes (no new word found) + do { // find first word character (the next word) +#ifndef __INTEL_COMPILER +#ifdef __clang__ + v16qi tmpCurrent = __builtin_ia32_lddqu(n+zeroCountSum); + tmpCurrent = tmpCurrent > zero; +#else + v16qi tmpCurrent = __builtin_ia32_loaddqu(n+zeroCountSum); + tmpCurrent = __builtin_ia32_pcmpgtb128(tmpCurrent, reinterpret_cast<v16qi>(zero)); +#endif + zeroMap = __builtin_ia32_pmovmskb128(tmpCurrent); // 1 in zeroMap equals to word character in input buffer +#else +# warning "Intel's icc compiler does not like __builtin_ia32_xxxxx" + LOG_ABORT("should not be reached"); +#endif + zeroCountSum += 16; + } while(!zeroMap); + zeroCounter = Optimized::lsbIdx(zeroMap); + sum = zeroCountSum - 16 + zeroCounter; + } + return n + sum; +} +#else +inline const char* advance(const char* n) +{ + const char* p = n; + const char* zero = static_cast<const char *>(memchr(p, 0, 64_Ki)); + while (zero == nullptr) { + p += 64_Ki; + zero = static_cast<const char *>(memchr(p, 0, 64_Ki)); + } + p = zero; + while (*p == '\0') { + ++p; + } + return p; +} +#endif + +} + +size_t FUTF8StrChrFieldSearcher::match(const char *folded, size_t sz, QueryTerm & qt) +{ +#ifdef __x86_64__ + const v16qi _G_zero = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +#endif + termcount_t words(0); + const char * term; + termsize_t tsz = 
qt.term(term); + const char *et=term+tsz; + const char * n = folded; + const char *e = n + sz; + + while (!*n) n++; + while (true) { + if (n>=e) break; + +#if 0 + v16qi current = __builtin_ia32_loaddqu(n); + current = __builtin_ia32_pcmpeqb128(current, _qtlFast[0]); + unsigned eqMap = __builtin_ia32_pmovmskb128(current); + unsigned neqMap = ~eqMap; + unsigned numEq = Optimized::lsbIdx(neqMap); + /* if (eqMap)*/ { + if (numEq >= 16) { + const char *tt = term+16; + const char *p = n+16; + while ( (*tt == *p) && (tt < et)) { tt++; p++; numEq++; } + } + if ((numEq >= tsz) && (prefix() || qt.isPrefix() || !n[tsz])) { + addHit(qt, words); + } + } +#else + const char *tt = term; + while ((tt < et) && (*tt == *n)) { tt++; n++; } + if ((tt == et) && (prefix() || qt.isPrefix() || !*n)) { + addHit(qt, words); + } +#endif + words++; +#ifdef __x86_64__ + n = advance(n, _G_zero); +#else + n = advance(n); +#endif + } + return words; +} + +size_t FUTF8StrChrFieldSearcher::match(const char *folded, size_t sz, size_t mintsz, QueryTerm ** qtl, size_t qtlSize) +{ + (void) mintsz; +#ifdef __x86_64__ + const v16qi _G_zero = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; +#endif + termcount_t words(0); + const char * n = folded; + const char *e = n + sz; + while (!*n) n++; + for( ; ; ) { + if (n>=e) break; +#if 0 + v16qi current = __builtin_ia32_loaddqu(n); + for(size_t i=0; i < qtlSize; i++) { + v16qi tmpEq = __builtin_ia32_pcmpeqb128(current, _qtlFast[i]); + unsigned eqMap = __builtin_ia32_pmovmskb128(tmpEq); + /* if (eqMap) */ { + QueryTerm & qt = *qtl[i]; + unsigned neqMap = ~eqMap; + unsigned numEq = Optimized::lsbIdx(neqMap); + termsize_t tsz = qt.termLen(); + if (numEq >= 16) { + const char *tt = qt.term() + 16; + const char *et=tt+tsz; + const char *p = n+16; + while ( (*tt == *p) && (tt < et)) { tt++; p++; numEq++; } + } + if ((numEq >= tsz) && (prefix() || qt.isPrefix() || !n[tsz])) { + addHit(qt, words); + } + } + } +#else + for(QueryTerm ** it=qtl, ** mt=qtl+qtlSize; it 
!= mt; it++) { + QueryTerm & qt = **it; + const char * term; + termsize_t tsz = qt.term(term); + + const char *et=term+tsz; + const char *fnt; + for (fnt = n; (term < et) && (*term == *fnt); term++, fnt++); + if ((term == et) && (prefix() || qt.isPrefix() || !*fnt)) { + addHit(qt, words); + } + } +#endif + words++; +#ifdef __x86_64__ + n = advance(n, _G_zero); +#else + n = advance(n); +#endif + } + return words; +} + +size_t FUTF8StrChrFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) +{ + _folded.reserve(f.size()+16*3); //Enable fulle xmm0 store + size_t unalignedStart(0); + bool ascii7Bit = lfoldua(f.data(), f.size(), &_folded[0], unalignedStart); + if (ascii7Bit) { + char * folded = &_folded[unalignedStart]; + /// Add the pattern 00 01 00 to avoid multiple eof tests of falling off the edge. + folded[f.size()] = 0; + folded[f.size()+1] = 0x01; + memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values + return match(folded, f.size(), qt); + NEED_CHAR_STAT(addPureUsAsciiField(f.size())); + } else { + return UTF8StrChrFieldSearcher::matchTerm(f, qt); + } +} + +size_t FUTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +{ + _folded.reserve(f.size()+16*3); //Enable fulle xmm0 store + size_t unalignedStart(0); + bool ascii7Bit = lfoldua(f.data(), f.size(), &_folded[0], unalignedStart); + if (ascii7Bit) { + char * folded = &_folded[unalignedStart]; + /// Add the pattern 00 01 00 to avoid multiple eof tests of falling off the edge. 
+ folded[f.size()] = 0; + folded[f.size()+1] = 0x01; + memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values + return match(folded, f.size(), mintsz, &_qtl[0], _qtl.size()); + NEED_CHAR_STAT(addPureUsAsciiField(f.size())); + } else { + return UTF8StrChrFieldSearcher::matchTerms(f, mintsz); + } +} + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h new file mode 100644 index 00000000000..900ab4c9120 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h @@ -0,0 +1,26 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "utf8strchrfieldsearcher.h" + +namespace vsm { + +class FUTF8StrChrFieldSearcher : public UTF8StrChrFieldSearcher +{ +public: + std::unique_ptr<FieldSearcher> duplicate() const override; + FUTF8StrChrFieldSearcher(); + FUTF8StrChrFieldSearcher(FieldIdT fId); + ~FUTF8StrChrFieldSearcher(); + static bool ansiFold(const char * toFold, size_t sz, char * folded); + static bool lfoldaa(const char * toFold, size_t sz, char * folded, size_t & unalignedStart); + static bool lfoldua(const char * toFold, size_t sz, char * folded, size_t & alignedStart); + private: + size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerms(const FieldRef&, const size_t shortestTerm) override; + virtual size_t match(const char *folded, size_t sz, search::streaming::QueryTerm & qt); + size_t match(const char *folded, size_t sz, size_t mintsz, search::streaming::QueryTerm ** qtl, size_t qtlSize); + std::vector<char> _folded; +}; + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp new file mode 100644 index 00000000000..db93bda7778 --- /dev/null +++ 
b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp @@ -0,0 +1,78 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "geo_pos_field_searcher.h" +#include <vespa/document/fieldvalue/arrayfieldvalue.h> +#include <vespa/document/fieldvalue/structfieldvalue.h> +#include <vespa/searchlib/common/geo_location_parser.h> +#include <vespa/vespalib/util/issue.h> +#include <vespa/vespalib/util/exception.h> + +#include <vespa/log/log.h> +LOG_SETUP(".vsm.searcher.geo_pos_field_searcher"); + +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; +using search::common::GeoLocation; +using search::common::GeoLocationParser; + +namespace vsm { + +std::unique_ptr<FieldSearcher> GeoPosFieldSearcher::duplicate() const { + return std::make_unique<GeoPosFieldSearcher>(*this); +} + +GeoPosFieldSearcher::GeoPosFieldSearcher(FieldIdT fId) : + FieldSearcher(fId), + _geoPosTerm() +{} + +GeoPosFieldSearcher::~GeoPosFieldSearcher() {} + +void GeoPosFieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf) { + _geoPosTerm.clear(); + FieldSearcher::prepare(qtl, buf); + for (const QueryTerm * qt : qtl) { + const vespalib::string & str = qt->getTermString(); + GeoLocationParser parser; + bool valid = parser.parseNoField(str); + if (! 
valid) { + vespalib::Issue::report("invalid position in term: %s", str.c_str()); + } + _geoPosTerm.emplace_back(parser.getGeoLocation()); + } +} + +void GeoPosFieldSearcher::onValue(const document::FieldValue & fv) { + LOG(spam, "ignore field value '%s'", fv.toString().c_str()); +} + +void GeoPosFieldSearcher::onStructValue(const document::StructFieldValue & fv) { + size_t num_terms = _geoPosTerm.size(); + for (size_t j = 0; j < num_terms; ++j) { + const GeoPosInfo & gpi = _geoPosTerm[j]; + if (gpi.valid() && gpi.cmp(fv)) { + addHit(*_qtl[j], 0); + } + } + ++_words; +} + +bool GeoPosFieldSearcher::GeoPosInfo::cmp(const document::StructFieldValue & sfv) const { + try { + auto xv = sfv.getValue("x"); + auto yv = sfv.getValue("y"); + if (xv && yv) { + int32_t x = xv->getAsInt(); + int32_t y = yv->getAsInt(); + GeoLocation::Point p{x,y}; + if (inside_limit(p)) { + return true; + } + } + } catch (const vespalib::Exception &e) { + vespalib::Issue::report("bad fieldvalue for GeoPosFieldSearcher: %s", e.getMessage().c_str()); + } + return false; +} + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h new file mode 100644 index 00000000000..ef1c5b5a1c4 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h @@ -0,0 +1,28 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include "fieldsearcher.h" +#include <vespa/searchlib/common/geo_location.h> + +namespace vsm { + +class GeoPosFieldSearcher : public FieldSearcher { +public: + GeoPosFieldSearcher(FieldIdT fId=0); + ~GeoPosFieldSearcher(); + void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override; + void onValue(const document::FieldValue & fv) override; + void onStructValue(const document::StructFieldValue & fv) override; + std::unique_ptr<FieldSearcher> duplicate() const override; +protected: + using GeoLocation = search::common::GeoLocation; + class GeoPosInfo : public GeoLocation { + public: + GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {} + bool cmp(const document::StructFieldValue & fv) const; + }; + typedef std::vector<GeoPosInfo> GeoPosInfoListT; + GeoPosInfoListT _geoPosTerm; +}; + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp new file mode 100644 index 00000000000..8cfb8e6df14 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp @@ -0,0 +1,49 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include "intfieldsearcher.h" + +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; + +namespace vsm { + +std::unique_ptr<FieldSearcher> +IntFieldSearcher::duplicate() const +{ + return std::make_unique<IntFieldSearcher>(*this); +} + +IntFieldSearcher::IntFieldSearcher(FieldIdT fId) : + FieldSearcher(fId), + _intTerm() +{ } + +IntFieldSearcher::~IntFieldSearcher() = default; + +void IntFieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf) +{ + _intTerm.clear(); + FieldSearcher::prepare(qtl, buf); + for (QueryTermList::const_iterator it=qtl.begin(); it < qtl.end(); it++) { + const QueryTerm * qt = *it; + size_t sz(qt->termLen()); + if (sz) { + int64_t low; + int64_t high; + bool valid = qt->getAsIntegerTerm(low, high); + _intTerm.push_back(IntInfo(low, high, valid)); + } + } +} + +void IntFieldSearcher::onValue(const document::FieldValue & fv) +{ + for(size_t j=0, jm(_intTerm.size()); j < jm; j++) { + const IntInfo & ii = _intTerm[j]; + if (ii.valid() && (ii.cmp(fv.getAsLong()))) { + addHit(*_qtl[j], 0); + } + } + ++_words; +} + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h new file mode 100644 index 00000000000..a2b17a87f4b --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h @@ -0,0 +1,33 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include "fieldsearcher.h" + +namespace vsm { + +class IntFieldSearcher : public FieldSearcher +{ +public: + std::unique_ptr<FieldSearcher> duplicate() const override; + IntFieldSearcher(FieldIdT fId=0); + ~IntFieldSearcher(); + void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override; + void onValue(const document::FieldValue & fv) override; +protected: + class IntInfo + { + public: + IntInfo(int64_t low, int64_t high, bool v) : _lower(low), _upper(high), _valid(v) { if (low > high) { _lower = high; _upper = low; } } + bool cmp(int64_t key) const { return (_lower <= key) && (key <= _upper); } + bool valid() const { return _valid; } + private: + int64_t _lower; + int64_t _upper; + bool _valid; + }; + typedef std::vector<IntInfo> IntInfoListT; + IntInfoListT _intTerm; +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp new file mode 100644 index 00000000000..1c4ff78ff4a --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp @@ -0,0 +1,56 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include "strchrfieldsearcher.h" +#include <vespa/document/fieldvalue/stringfieldvalue.h> + +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; + +namespace vsm { + +void StrChrFieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf) +{ + FieldSearcher::prepare(qtl, buf); +} + +void StrChrFieldSearcher::onValue(const document::FieldValue & fv) +{ + const document::LiteralFieldValueB & sfv = static_cast<const document::LiteralFieldValueB &>(fv); + vespalib::stringref val = sfv.getValueRef(); + FieldRef fr(val.data(), std::min(maxFieldLength(), val.size())); + matchDoc(fr); +} + +bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef) +{ + bool retval(true); + if (_qtl.size() > 1) { + size_t mintsz = shortestTerm(); + if (fieldRef.size() >= mintsz) { + _words += matchTerms(fieldRef, mintsz); + } else { + _words += countWords(fieldRef); + } + } else { + for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { + QueryTerm & qt = **it; + if (fieldRef.size() >= qt.termLen()) { + _words += matchTerm(fieldRef, qt); + } else { + _words += countWords(fieldRef); + } + } + } + return retval; +} + +size_t StrChrFieldSearcher::shortestTerm() const +{ + size_t mintsz(_qtl.front()->termLen()); + for(QueryTermList::const_iterator it=_qtl.begin()+1, mt=_qtl.end(); it != mt; it++) { + const QueryTerm & qt = **it; + mintsz = std::min(mintsz, qt.termLen()); + } + return mintsz; +} + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h new file mode 100644 index 00000000000..0155c79cddf --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h @@ -0,0 +1,22 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include "fieldsearcher.h" + +namespace vsm { + +class StrChrFieldSearcher : public FieldSearcher +{ +public: + StrChrFieldSearcher() : FieldSearcher(0) { } + StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { } + void onValue(const document::FieldValue & fv) override; + void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override; +private: + size_t shortestTerm() const; + bool matchDoc(const FieldRef & field); + virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) = 0; + virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) = 0; +}; + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp new file mode 100644 index 00000000000..977602a691c --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp @@ -0,0 +1,33 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include "utf8exactstringfieldsearcher.h" + +using search::byte; +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; + +namespace vsm { + +std::unique_ptr<FieldSearcher> +UTF8ExactStringFieldSearcher::duplicate() const +{ + return std::make_unique<UTF8ExactStringFieldSearcher>(*this); +} + +size_t +UTF8ExactStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +{ + (void) mintsz; + for (QueryTermList::iterator it = _qtl.begin(), mt = _qtl.end(); it != mt; ++it) { + QueryTerm & qt = **it; + matchTermExact(f, qt); + } + return 1; +} + +size_t +UTF8ExactStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) +{ + return matchTermExact(f, qt); +} + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h new file mode 100644 index 00000000000..744974a6cf6 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h @@ -0,0 +1,25 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> + +namespace vsm +{ + +/** + * This class does suffix utf8 searches. 
+ **/ +class UTF8ExactStringFieldSearcher : public UTF8StringFieldSearcherBase +{ +protected: + virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + +public: + std::unique_ptr<FieldSearcher> duplicate() const override; + UTF8ExactStringFieldSearcher() : UTF8StringFieldSearcherBase() { } + UTF8ExactStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp new file mode 100644 index 00000000000..9aef99f9fa1 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp @@ -0,0 +1,69 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "utf8flexiblestringfieldsearcher.h" + +#include <vespa/log/log.h> +LOG_SETUP(".vsm.searcher.utf8flexiblestringfieldsearcher"); + +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; + +namespace vsm { + +std::unique_ptr<FieldSearcher> +UTF8FlexibleStringFieldSearcher::duplicate() const +{ + return std::make_unique<UTF8FlexibleStringFieldSearcher>(*this); +} + +size_t +UTF8FlexibleStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +{ + (void) mintsz; + size_t words = 0; + for (QueryTermList::iterator it = _qtl.begin(); it != _qtl.end(); ++it) { + words = matchTerm(f, **it); + } + return words; +} + +size_t +UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) +{ + if (qt.isPrefix()) { + LOG(debug, "Use prefix match for prefix term '%s:%s'", qt.index().c_str(), qt.getTerm()); + return matchTermRegular(f, qt); + } else if (qt.isSubstring()) { + LOG(debug, "Use substring match for substring term '%s:%s'", qt.index().c_str(), qt.getTerm()); + return 
matchTermSubstring(f, qt); + } else if (qt.isSuffix()) { + LOG(debug, "Use suffix match for suffix term '%s:%s'", qt.index().c_str(), qt.getTerm()); + return matchTermSuffix(f, qt); + } else if (qt.isExactstring()) { + LOG(debug, "Use exact match for exact term '%s:%s'", qt.index().c_str(), qt.getTerm()); + return matchTermExact(f, qt); + } else { + if (substring()) { + LOG(debug, "Use substring match for term '%s:%s'", qt.index().c_str(), qt.getTerm()); + return matchTermSubstring(f, qt); + } else if (suffix()) { + LOG(debug, "Use suffix match for term '%s:%s'", qt.index().c_str(), qt.getTerm()); + return matchTermSuffix(f, qt); + } else if (exact()) { + LOG(debug, "Use exact match for term '%s:%s'", qt.index().c_str(), qt.getTerm()); + return matchTermExact(f, qt); + } else { + LOG(debug, "Use regular/prefix match for term '%s:%s'", qt.index().c_str(), qt.getTerm()); + return matchTermRegular(f, qt); + } + } +} + +UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher() : + UTF8StringFieldSearcherBase() +{ } + +UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher(FieldIdT fId) : + UTF8StringFieldSearcherBase(fId) +{ } + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h new file mode 100644 index 00000000000..63931af0036 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h @@ -0,0 +1,35 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> + +namespace vsm +{ + +/** + * This class does utf8 searches based on the query term type. + * It will choose between regular search strategy (including prefix) and substring search strategy. 
+ **/ +class UTF8FlexibleStringFieldSearcher : public UTF8StringFieldSearcherBase +{ +private: + /** + * Tries to match the given query term against the content of the given field reference. + * The search strategy is chosen based on the query term type, falling back to the match type of this searcher. + **/ + virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + + /** + * Tries to match each query term in the underlying query against the content of the given field reference. + * The search strategy is chosen per term, based on the query term type. + **/ + virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + +public: + std::unique_ptr<FieldSearcher> duplicate() const override; + UTF8FlexibleStringFieldSearcher(); + UTF8FlexibleStringFieldSearcher(FieldIdT fId); +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp new file mode 100644 index 00000000000..0d93009655c --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp @@ -0,0 +1,56 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include "utf8strchrfieldsearcher.h" + +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; +using search::byte; + +namespace vsm { + +std::unique_ptr<FieldSearcher> +UTF8StrChrFieldSearcher::duplicate() const +{ + return std::make_unique<UTF8StrChrFieldSearcher>(*this); +} + +size_t +UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +{ + (void) mintsz; /* the shortest-term hint is not used by this searcher */ + termcount_t words(0); + const byte * n = reinterpret_cast<const byte *> (f.data()); + const byte * e = n + f.size(); + if (f.size() >= _buf->size()) { + _buf->reserve(f.size() + 1); + } + cmptype_t * fn = &(*_buf.get())[0]; + size_t fl(0); + + for( ; n < e; ) { + if (!*n) { _zeroCount++; n++; } /* count and step over a zero byte */ + n = tokenize(n, _buf->capacity(), fn, fl); /* fn[0..fl) now holds the next folded token */ + for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { + QueryTerm & qt = **it; + const cmptype_t * term; + termsize_t tsz = qt.term(term); + if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) { /* lengths must be equal unless prefix matching applies */ + const cmptype_t *tt=term, *et=term+tsz; + for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++); + if (tt == et) { /* the whole term was matched */ + addHit(qt, words); /* words is the index of the current token */ + } + } + } + words++; + } + NEED_CHAR_STAT(addAnyUtf8Field(f.size())); + return words; +} + +size_t +UTF8StrChrFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) +{ + return matchTermRegular(f, qt); +} + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h new file mode 100644 index 00000000000..1687a1a18c0 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h @@ -0,0 +1,25 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "utf8stringfieldsearcherbase.h" + +namespace vsm { + +/** + * This class does normal utf8 searches. + * This class uses a highly optimized version of the tokenize method in fastlib. 
+ **/ +class UTF8StrChrFieldSearcher : public UTF8StringFieldSearcherBase +{ +public: + std::unique_ptr<FieldSearcher> duplicate() const override; + UTF8StrChrFieldSearcher() : UTF8StringFieldSearcherBase() { } + UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } + +protected: + size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp new file mode 100644 index 00000000000..148cdf2c0c3 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp @@ -0,0 +1,320 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "utf8stringfieldsearcherbase.h" +#include <cassert> + +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; +using search::byte; + +namespace vsm { + +const byte * +UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen) +{ + if (maxSz > 0) { + maxSz--; + } + ucs4_t c(*p); + ucs4_t *q(dstbuf); + const byte * end(p+maxSz); + + // Skip non-word characters between words + for (; p < end; ) { + if (c < 128) { + if (!c) { break; } + p++; + if (__builtin_expect(_isWord[c], false)) { + *q++ = _foldCase[c]; + c = 0; + } else { + c = *p; + } + } else { + const byte * oldP(p); + c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); + if (Fast_UnicodeUtil::IsWordChar(c)) { + _utf8Count[p-oldP-1]++; + const char *repl = ReplacementString(c); + if (repl != NULL) { + size_t repllen = strlen(repl); + if (repllen > 0) { + q = Fast_UnicodeUtil::ucs4copy(q,repl); + } + } else { + c = ToFold(c); + *q++ = c; + } + break; + } else { + if (c == _BadUTF8Char) { + _badUtf8Count++; + } else { + _utf8Count[p-oldP-1]++; + } + 
c = *p; + } + } + } + + c = *p; // Next char + for (; p < end;) { + if (c < 128) { // Common case, ASCII + if (!c) { break; } + p++; + if (__builtin_expect(!_isWord[c], false)) { + c = 0; + } else { + *q++ = _foldCase[c]; + c = *p; + } + } else { + const byte * oldP(p); + c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); + if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) { + _utf8Count[p-oldP-1]++; + const char *repl = ReplacementString(c); + if (repl != NULL) { + size_t repllen = strlen(repl); + if (repllen > 0) { + q = Fast_UnicodeUtil::ucs4copy(q,repl); + } + } else { + c = ToFold(c); + *q++ = c; + } + + c = *p; + } else { + if (c == _BadUTF8Char) { + _badUtf8Count++; + } else { + _utf8Count[p-oldP-1]++; + } + break; + } + } + } + *q = 0; + tokenlen = q - dstbuf; + return p; +} + +size_t +UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt) +{ + termcount_t words(0); + const byte * n = reinterpret_cast<const byte *> (f.data()); + // __builtin_prefetch(n, 0, 0); + const cmptype_t * term; + termsize_t tsz = qt.term(term); + const byte * e = n + f.size(); + if ( f.size() >= _buf->size()) { + _buf->reserve(f.size() + 1); + } + cmptype_t * fn = &(*_buf.get())[0]; + size_t fl(0); + + for( ; n < e; ) { + if (!*n) { _zeroCount++; n++; } + n = tokenize(n, _buf->capacity(), fn, fl); + if ((tsz <= fl) && (prefix() || qt.isPrefix() || (tsz == fl))) { + const cmptype_t *tt=term, *et=term+tsz; + for (const cmptype_t *fnt=fn; (tt < et) && (*tt == *fnt); tt++, fnt++); + if (tt == et) { + addHit(qt, words); + } + } + words++; + } + NEED_CHAR_STAT(addAnyUtf8Field(f.size())); + return words; +} + +size_t +UTF8StringFieldSearcherBase::matchTermExact(const FieldRef & f, QueryTerm & qt) +{ + const byte * n = reinterpret_cast<const byte *> (f.data()); + const cmptype_t * term; + termsize_t tsz = qt.term(term); + const cmptype_t * eterm = term+tsz; + const byte * e = n + f.size(); + if (tsz <= f.size()) { + bool equal(true); + for (; equal && (n < 
e) && (term < eterm); term++) { + if (*term < 0x80) { + equal = (*term == _foldCase[*n++]); + } else { + cmptype_t c = ToFold(Fast_UnicodeUtil::GetUTF8CharNonAscii(n)); + equal = (*term == c); + } + } + if (equal && (term == eterm) && (qt.isPrefix() || (n == e))) { + addHit(qt,0); + } + } + NEED_CHAR_STAT(addAnyUtf8Field(f.size())); + return 1; +} + +size_t +UTF8StringFieldSearcherBase::matchTermSubstring(const FieldRef & f, QueryTerm & qt) +{ + if (qt.termLen() == 0) { return 0; } + const byte * n = reinterpret_cast<const byte *> (f.data()); + const cmptype_t * term; + termsize_t tsz = qt.term(term); + if ( f.size() >= _buf->size()) { + _buf->reserve(f.size() + 1); + } + cmptype_t * fntemp = &(*_buf.get())[0]; + BufferWrapper wrapper(fntemp); + size_t fl = skipSeparators(n, f.size(), wrapper); + const cmptype_t * fn(fntemp); + const cmptype_t * fe = fn + fl; + const cmptype_t * fre = fe - tsz; + termcount_t words(0); + for(words = 0; fn <= fre; ) { + const cmptype_t *tt=term, *et=term+tsz, *fnt=fn; + for (; (tt < et) && (*tt == *fnt); tt++, fnt++); + if (tt == et) { + fn = fnt; + addHit(qt, words); + } else { + if ( ! Fast_UnicodeUtil::IsWordChar(*fn++) ) { + words++; + for(; (fn < fre) && ! 
Fast_UnicodeUtil::IsWordChar(*fn) ; fn++ ); + } + } + } + NEED_CHAR_STAT(addAnyUtf8Field(f.size())); + return words + 1; // we must also count the last word +} + +size_t +UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt) +{ + termcount_t words = 0; + const byte * srcbuf = reinterpret_cast<const byte *> (f.data()); + const byte * srcend = srcbuf + f.size(); + const cmptype_t * term; + termsize_t tsz = qt.term(term); + if (f.size() >= _buf->size()) { + _buf->reserve(f.size() + 1); + } + cmptype_t * dstbuf = &(*_buf.get())[0]; + size_t tokenlen = 0; + + for( ; srcbuf < srcend; ) { + if (*srcbuf == 0) { + ++_zeroCount; + ++srcbuf; + } + srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen); + if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) { + addHit(qt, words); + } + words++; + } + return words; +} + +UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase() : + StrChrFieldSearcher(), + Fast_NormalizeWordFolder(), + Fast_UnicodeUtil() +{ +} + +UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase(FieldIdT fId) : + StrChrFieldSearcher(fId), + Fast_NormalizeWordFolder(), + Fast_UnicodeUtil() +{ +} + +UTF8StringFieldSearcherBase::~UTF8StringFieldSearcherBase() {} + +void +UTF8StringFieldSearcherBase::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf) +{ + StrChrFieldSearcher::prepare(qtl, buf); + _buf = buf; +} + +bool +UTF8StringFieldSearcherBase::matchTermSuffix(const cmptype_t * term, size_t termlen, + const cmptype_t * word, size_t wordlen) +{ + if ((termlen <= wordlen)) { + const cmptype_t * titr = term + termlen - 1; + const cmptype_t * witr = word + wordlen - 1; + bool hit = true; + // traverse the term and the word back to front + for (; titr >= term; --titr, --witr) { + if (*titr != *witr) { + hit = false; + break; + } + } + return hit; + } + return false; +} + +bool +UTF8StringFieldSearcherBase::isSeparatorCharacter(ucs4_t c) +{ + return ((c < 0x20) && (c != '\n') && (c != '\t')); +} + +template <typename T> 
+size_t +UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T & dstbuf) { + const search::byte * e(p+sz); + const search::byte * b(p); + + for(; p < e; ) { + ucs4_t c(*p); + const search::byte * oldP(p); + if (c < 128) { + p++; + if (!isSeparatorCharacter(c)) { + dstbuf.onCharacter(_foldCase[c], (oldP - b)); + } + } else { + c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p); + const char *repl = ReplacementString(c); + if (repl != NULL) { + size_t repllen = strlen(repl); + if (repllen > 0) { + ucs4_t * buf = dstbuf.getBuf(); + ucs4_t * newBuf = Fast_UnicodeUtil::ucs4copy(buf, repl); + if (dstbuf.hasOffsets()) { + for (; buf < newBuf; ++buf) { + dstbuf.incBuf(1); + dstbuf.onOffset(oldP - b); + } + } else { + dstbuf.incBuf(newBuf - buf); + } + } + } else { + c = ToFold(c); + dstbuf.onCharacter(c, (oldP - b)); + } + if (c == _BadUTF8Char) { + _badUtf8Count++; + } else { + _utf8Count[p-oldP-1]++; + } + } + } + assert(dstbuf.valid()); + return dstbuf.size(); +} + +template unsigned long UTF8StringFieldSearcherBase::skipSeparators<UTF8StringFieldSearcherBase::BufferWrapper>(unsigned char const*, unsigned long, UTF8StringFieldSearcherBase::BufferWrapper&); +template unsigned long UTF8StringFieldSearcherBase::skipSeparators<UTF8StringFieldSearcherBase::OffsetWrapper>(unsigned char const*, unsigned long, UTF8StringFieldSearcherBase::OffsetWrapper&); + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h new file mode 100644 index 00000000000..f540a7ac457 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h @@ -0,0 +1,138 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include "strchrfieldsearcher.h" +#include <vespa/fastlib/text/normwordfolder.h> + +namespace vsm { + +/** + * This class is the base class for all utf8 string searchers. + * It contains utility functions used by the other searchers. + * As usual, the prepare method is called + * after the query is built. A SharedSearcherBuf is given to it. This is a + * buffer that is shared among all searchers that are run in the same context. + * Reuse of this buffer ensures a better cache hit ratio because this is just a + * scratchpad for tokenizing. It will grow to the max size and stay there. + **/ +class UTF8StringFieldSearcherBase : public StrChrFieldSearcher, protected Fast_NormalizeWordFolder, public Fast_UnicodeUtil +{ +public: + /** + * Class that wraps a ucs4 buffer. + * Used when invoking skipSeparators() during substring matching. + **/ + class BufferWrapper + { + protected: + ucs4_t * _bbuf; /* start of the wrapped buffer */ + ucs4_t * _cbuf; /* current write position */ + + public: + BufferWrapper(ucs4_t * buf) : _bbuf(buf), _cbuf(buf) { } + BufferWrapper(ucs4_t * buf, size_t *) : _bbuf(buf), _cbuf(buf) { } /* the second argument is ignored */ + void onCharacter(ucs4_t ch, size_t) { *_cbuf++ = ch; } + void onOffset(size_t) { } /* offsets are not recorded by this wrapper */ + void incBuf(size_t inc) { _cbuf += inc; } + ucs4_t * getBuf() { return _cbuf; } + bool valid() { return true; } + size_t size() { return (_cbuf - _bbuf); } /* number of ucs4 slots consumed so far */ + bool hasOffsets() { return false; } + }; + + /** + * Class that wraps an offset buffer in addition to a ucs4 buffer. + * The offset buffer contains offsets into the original utf8 buffer. 
+ **/ + class OffsetWrapper : public BufferWrapper + { + private: + size_t * _boff; /* start of the offset buffer */ + size_t * _coff; /* current write position in the offset buffer */ + + public: + OffsetWrapper(ucs4_t * buf, size_t * offsets) : BufferWrapper(buf), _boff(offsets), _coff(offsets) {} + void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; } + void onOffset(size_t of) { *_coff++ = of; } + bool valid() { return (size() == (size_t)(_coff - _boff)); } /* exactly one offset per ucs4 slot */ + bool hasOffsets() { return true; } + }; + +protected: + SharedSearcherBuf _buf; /* scratch buffer, assigned in prepare() */ + + const search::byte * tokenize(const search::byte * buf, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen); /* writes the next folded token to dstbuf, sets tokenlen, returns the new read position */ + + /** + * Matches the given query term against the words in the given field reference + * using exact or prefix match strategy. + * + * @param f the field reference to match against. + * @param qt the query term to match. + * @return the number of words in the field ref. + **/ + size_t matchTermRegular(const FieldRef & f, search::streaming::QueryTerm & qt); + + /** + * Matches the given query term against the characters in the given field reference + * using substring match strategy. + * + * @param f the field reference to match against. + * @param qt the query term to match. + * @return the number of words in the field ref, or 0 if the term is empty. + **/ + size_t matchTermSubstring(const FieldRef & f, search::streaming::QueryTerm & qt); + + /** + * Matches the given query term against the words in the given field reference + * using suffix match strategy. + * + * @param f the field reference to match against. + * @param qt the query term to match. + * @return the number of words in the field ref. + **/ + size_t matchTermSuffix(const FieldRef & f, search::streaming::QueryTerm & qt); + + /** + * Matches the given query term against the content of the given field reference, read from its start, + * using exact match strategy (a prefix term only has to match the beginning of the field). + * + * @param f the field reference to match against. + * @param qt the query term to match. + * @return always 1. 
+ **/ + size_t matchTermExact(const FieldRef & f, search::streaming::QueryTerm & qt); + +public: + UTF8StringFieldSearcherBase(); + UTF8StringFieldSearcherBase(FieldIdT fId); + ~UTF8StringFieldSearcherBase(); + void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override; + /** + * Matches the given term against the given word using suffix match strategy. + * + * @param term the buffer with the term. + * @param termlen the length of the term. + * @param word the buffer with the word. + * @param wordlen the length of the word. + * @return true if the word ends with the term. + **/ + static bool matchTermSuffix(const cmptype_t * term, size_t termlen, + const cmptype_t * word, size_t wordlen); + + /** + * Checks whether the given character is a separator character (a value below 0x20 other than newline and tab). + **/ + static bool isSeparatorCharacter(ucs4_t); + + /** + * Transforms the given utf8 array into an array of ucs4 characters and returns the number of characters written. + * Folding is performed. Separator characters are skipped. + **/ + template <typename T> + size_t skipSeparators(const search::byte * p, size_t sz, T & dstbuf); + +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp new file mode 100644 index 00000000000..fd327d3a3df --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp @@ -0,0 +1,59 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include <vespa/vsm/searcher/utf8substringsearcher.h> + +using search::byte; +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; + +namespace vsm { + +std::unique_ptr<FieldSearcher> +UTF8SubStringFieldSearcher::duplicate() const +{ + return std::make_unique<UTF8SubStringFieldSearcher>(*this); +} + +size_t +UTF8SubStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +{ + const byte * n = reinterpret_cast<const byte *> (f.data()); + if ( f.size() >= _buf->size()) { + _buf->reserve(f.size() + 1); + } + cmptype_t * fntemp = &(*_buf.get())[0]; + BufferWrapper wrapper(fntemp); + size_t fl = skipSeparators(n, f.size(), wrapper); + const cmptype_t * fn(fntemp); + const cmptype_t * fe = fn + fl; + const cmptype_t * fre = fe - mintsz; + termcount_t words(0); + for(words = 0; fn <= fre; ) { + for(QueryTermList::iterator it=_qtl.begin(), mt=_qtl.end(); it != mt; it++) { + QueryTerm & qt = **it; + const cmptype_t * term; + termsize_t tsz = qt.term(term); + + const cmptype_t *tt=term, *et=term+tsz, *fnt=fn; + for (; (tt < et) && (*tt == *fnt); tt++, fnt++); + if (tt == et) { + addHit(qt, words); + } + } + if ( ! Fast_UnicodeUtil::IsWordChar(*fn++) ) { + words++; + for(; (fn < fre) && ! Fast_UnicodeUtil::IsWordChar(*fn); fn++ ); + } + } + + NEED_CHAR_STAT(addAnyUtf8Field(f.size())); + return words + 1; // we must also count the last word +} + +size_t +UTF8SubStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) +{ + return matchTermSubstring(f, qt); +} + +} diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h new file mode 100644 index 00000000000..1c463c28847 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h @@ -0,0 +1,23 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#pragma once + +#include <vespa/vsm/searcher/utf8strchrfieldsearcher.h> + +namespace vsm { + +/** + * This class does substring utf8 searches. + **/ +class UTF8SubStringFieldSearcher : public UTF8StringFieldSearcherBase +{ +public: + std::unique_ptr<FieldSearcher> duplicate() const override; + UTF8SubStringFieldSearcher() : UTF8StringFieldSearcherBase() { } + UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } +protected: + size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp new file mode 100644 index 00000000000..be02a58cfda --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp @@ -0,0 +1,144 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include "utf8substringsnippetmodifier.h" +#include <cassert> + +using search::byte; +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; + +namespace vsm { + +std::unique_ptr<FieldSearcher> +UTF8SubstringSnippetModifier::duplicate() const +{ + return std::make_unique<UTF8SubstringSnippetModifier>(*this); +} + +size_t +UTF8SubstringSnippetModifier::matchTerms(const FieldRef & f, const size_t mintsz) +{ + _modified->reset(); + _readPtr = f.data(); + const byte * src = reinterpret_cast<const byte *> (f.data()); + // resize ucs4 buffer + if (f.size() >= _buf->size()) { + _buf->resize(f.size() + 1); + } + // resize offset buffers + if (f.size() >= _offsets->size()) { + _offsets->resize(f.size() + 1); + } + // resize modified buffer + if (f.size() + 16 > _modified->getLength()) { + _modified->resize(f.size() + 16); // make room for some unit separators + } + cmptype_t * dbegin = &(*_buf.get())[0]; + OffsetWrapper wrapper(dbegin, &(*_offsets)[0]); + size_t numchars = skipSeparators(src, f.size(), wrapper); + const cmptype_t * ditr = dbegin; + const cmptype_t * dend = ditr + numchars; + const cmptype_t * drend = dend - mintsz; + termcount_t words = 0; + for(; ditr <= drend; ) { + for (QueryTermList::iterator itr = _qtl.begin(); itr != _qtl.end(); ++itr) { + QueryTerm & qt = **itr; + const cmptype_t * term; + termsize_t tsz = qt.term(term); + + const cmptype_t * titr = term; + const cmptype_t * tend = term + tsz; + const cmptype_t * dtmp = ditr; + for (; (titr < tend) && (*titr == *dtmp); ++titr, ++dtmp); + if (titr == tend) { + const char * mbegin = f.data() + (*_offsets)[ditr - dbegin]; + const char * mend = f.data() + ((dtmp < dend) ? ((*_offsets)[dtmp - dbegin]) : f.size()); + if (_readPtr <= mbegin) { + // We will only copy from the field ref once. + // If we have overlapping matches only the first one will be considered. + insertSeparators(mbegin, mend); + } + addHit(qt, words); + } + } + if ( ! 
Fast_UnicodeUtil::IsWordChar(*ditr++) ) { + words++; + for(; (ditr < drend) && ! Fast_UnicodeUtil::IsWordChar(*ditr) ; ++ditr ); + } + } + assert(_readPtr <= (f.data() + f.size())); + // copy the remaining part of the field reference + size_t toCopy = f.size() - (_readPtr - f.data()); + copyToModified(toCopy); + + return words + 1; // we must also count the last word +} + +size_t +UTF8SubstringSnippetModifier::matchTerm(const FieldRef & f, QueryTerm & qt) +{ + const cmptype_t * term; + termsize_t tsz = qt.term(term); + return matchTerms(f, tsz); /* reuses matchTerms with this term's length as the shortest term */ +} + +void +UTF8SubstringSnippetModifier::copyToModified(size_t n, bool skipSep) +{ + if (n == 0) { + return; + } + if (skipSep) { + for (const char * readEnd = _readPtr + n; _readPtr < readEnd; ++_readPtr) { + if (!isSeparatorCharacter(*_readPtr)) { /* separators are dropped, but the read pointer still advances */ + _modified->put(*_readPtr); + } + } + } else { + _modified->put(_readPtr, n); + _readPtr += n; + } +} + +void +UTF8SubstringSnippetModifier::insertSeparators(const char * mbegin, const char * mend) +{ + copyToModified(mbegin - _readPtr); + _modified->put(_unitSep); + // skip separator characters so that the match is not split. 
+ copyToModified((mend - mbegin), true); + _modified->put(_unitSep); +} + +UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier() : + UTF8StringFieldSearcherBase(), + _modified(new CharBuffer(32)), + _offsets(new std::vector<size_t>(32)), + _readPtr(NULL), + _unitSep('\x1F') +{ +} + +UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId) : + UTF8StringFieldSearcherBase(fId), + _modified(new CharBuffer(32)), + _offsets(new std::vector<size_t>(32)), + _readPtr(NULL), + _unitSep('\x1F') +{ +} + +UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId, + const CharBuffer::SP & modBuf, + const SharedOffsetBuffer & offBuf) : + UTF8StringFieldSearcherBase(fId), + _modified(modBuf), + _offsets(offBuf), + _readPtr(NULL), + _unitSep('\x1F') +{ +} + +UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() {} + +} + diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h new file mode 100644 index 00000000000..0127a7f2827 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h @@ -0,0 +1,72 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "utf8stringfieldsearcherbase.h" +#include <vespa/vsm/common/charbuffer.h> + +namespace vsm { + +typedef std::shared_ptr<std::vector<size_t> > SharedOffsetBuffer; + +/** + * This class does substring searches the same way as UTF8SubStringFieldSearcher. + * While matching the query term(s) against the field reference it builds a modified + * buffer based on the field reference where the only difference is that unit separators + * are inserted before and after a match. These extra unit separators make it possible + * to highlight a substring match when later generating snippets. 
+ **/ +class UTF8SubstringSnippetModifier : public UTF8StringFieldSearcherBase +{ +private: + CharBuffer::SP _modified; // buffer to write the modified field value + SharedOffsetBuffer _offsets; // for each character in _buf we have an offset into the utf8 buffer (field reference) + const char * _readPtr; // current read position in the field reference + char _unitSep; // the unit separator character to use + + virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + + /** + * Copies n bytes from the field reference to the modified buffer and updates the read pointer. + * Separator characters from the field reference can be skipped. + * This prevents a match from being split by separator characters from the original field reference. + * + * @param n the number of bytes to copy. + * @param skipSep whether we should skip separator characters from the field reference. + **/ + void copyToModified(size_t n, bool skipSep = false); + + /** + * Copies from the field reference to the modified buffer and inserts unit separators for a match + * starting at mbegin (in the field reference) and ending at mend (in the field reference). + * A unit separator is inserted before and after the match. + * + * @param mbegin the beginning of the match. + * @param mend the end of the match. + **/ + void insertSeparators(const char * mbegin, const char * mend); + +public: + typedef std::shared_ptr<UTF8SubstringSnippetModifier> SP; + + std::unique_ptr<FieldSearcher> duplicate() const override; + + UTF8SubstringSnippetModifier(); + UTF8SubstringSnippetModifier(FieldIdT fId); + ~UTF8SubstringSnippetModifier(); + + /** + * Creates a new instance. + * + * @param fId the field id to operate on. + * @param modBuf the shared buffer used to store the modified field value. + * @param offBuf the shared buffer used to store the offsets into the field reference. 
+ **/ + UTF8SubstringSnippetModifier(FieldIdT fId, const CharBuffer::SP & modBuf, const SharedOffsetBuffer & offBuf); + + const CharBuffer & getModifiedBuf() const { return *_modified; } + const search::streaming::QueryTermList & getQueryTerms() const { return _qtl; } +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp new file mode 100644 index 00000000000..3495d46b85b --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp @@ -0,0 +1,54 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include "utf8suffixstringfieldsearcher.h" + +using search::byte; +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; + +namespace vsm { + +std::unique_ptr<FieldSearcher> +UTF8SuffixStringFieldSearcher::duplicate() const +{ + return std::make_unique<UTF8SuffixStringFieldSearcher>(*this); +} + +size_t +UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz) +{ + (void) mintsz; + termcount_t words = 0; + const byte * srcbuf = reinterpret_cast<const byte *> (f.data()); + const byte * srcend = srcbuf + f.size(); + if (f.size() >= _buf->size()) { + _buf->reserve(f.size() + 1); + } + cmptype_t * dstbuf = &(*_buf.get())[0]; + size_t tokenlen = 0; + + for( ; srcbuf < srcend; ) { + if (*srcbuf == 0) { + ++_zeroCount; + ++srcbuf; + } + srcbuf = tokenize(srcbuf, _buf->capacity(), dstbuf, tokenlen); + for (QueryTermList::iterator it = _qtl.begin(), mt = _qtl.end(); it != mt; ++it) { + QueryTerm & qt = **it; + const cmptype_t * term; + termsize_t tsz = qt.term(term); + if (matchTermSuffix(term, tsz, dstbuf, tokenlen)) { + addHit(qt, words); + } + } + words++; + } + return words; +} + +size_t +UTF8SuffixStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt) +{ + return matchTermSuffix(f, qt); +} + +} diff --git 
a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h new file mode 100644 index 00000000000..0640ac22da5 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h @@ -0,0 +1,25 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h> + +namespace vsm +{ + +/** + * This class does suffix utf8 searches. + **/ +class UTF8SuffixStringFieldSearcher : public UTF8StringFieldSearcherBase +{ +protected: + virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override; + virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override; + +public: + std::unique_ptr<FieldSearcher> duplicate() const override; + UTF8SuffixStringFieldSearcher() : UTF8StringFieldSearcherBase() { } + UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { } +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/vsm/.gitignore b/streamingvisitors/src/vespa/vsm/vsm/.gitignore new file mode 100644 index 00000000000..95bc02923a9 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/.gitignore @@ -0,0 +1,5 @@ +*.exe +*.ilk +*.pdb +.depend* +Makefile diff --git a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt new file mode 100644 index 00000000000..adc00b341a3 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt @@ -0,0 +1,14 @@ +# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+vespa_add_library(vsm_vsmbase OBJECT + SOURCES + docsumfieldspec.cpp + docsumfilter.cpp + fieldsearchspec.cpp + flattendocsumwriter.cpp + slimefieldwriter.cpp + snippetmodifier.cpp + vsm-adapter.cpp + docsumconfig.cpp + DEPENDS + vsm_vconfig +) diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.cpp new file mode 100644 index 00000000000..656e9eed132 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.cpp @@ -0,0 +1,75 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include <vespa/vsm/vsm/docsumconfig.h> +#include <vespa/searchsummary/docsummary/docsumfieldwriter.h> +#include <vespa/searchsummary/docsummary/matched_elements_filter_dfw.h> +#include <vespa/searchlib/common/matching_elements_fields.h> +#include <vespa/vsm/config/config-vsmfields.h> +#include <vespa/vsm/config/config-vsmsummary.h> + +using search::MatchingElementsFields; +using search::docsummary::IDocsumFieldWriter; +using search::docsummary::EmptyDFW; +using search::docsummary::MatchedElementsFilterDFW; +using search::docsummary::ResultConfig; +using vespa::config::search::vsm::VsmfieldsConfig; +using vespa::config::search::vsm::VsmsummaryConfig; + +namespace vsm { + +namespace { + +void populate_fields(MatchingElementsFields& fields, VsmfieldsConfig& fields_config, const vespalib::string& field_name) +{ + vespalib::string prefix = field_name + "."; + for (const auto& spec : fields_config.fieldspec) { + if (spec.name.substr(0, prefix.size()) == prefix) { + fields.add_mapping(field_name, spec.name); + } + if (spec.name == field_name) { + fields.add_field(field_name); + } + } +} + +} + +DynamicDocsumConfig::DynamicDocsumConfig(search::docsummary::IDocsumEnvironment* env, search::docsummary::DynamicDocsumWriter* writer, std::shared_ptr<VsmfieldsConfig> vsm_fields_config) + : Parent(env, writer), + _vsm_fields_config(std::move(vsm_fields_config)) +{ +} + 
+IDocsumFieldWriter::UP +DynamicDocsumConfig::createFieldWriter(const string & fieldName, const string & overrideName, const string & argument, bool & rc, std::shared_ptr<search::MatchingElementsFields> matching_elems_fields) +{ + IDocsumFieldWriter::UP fieldWriter; + if ((overrideName == "staticrank") || + (overrideName == "ranklog") || + (overrideName == "label") || + (overrideName == "project") || + (overrideName == "positions") || + (overrideName == "absdist") || + (overrideName == "subproject")) + { + fieldWriter = std::make_unique<EmptyDFW>(); + rc = true; + } else if ((overrideName == "attribute") || + (overrideName == "attributecombiner") || + (overrideName == "geopos")) { + rc = true; + } else if ((overrideName == "matchedattributeelementsfilter") || + (overrideName == "matchedelementsfilter")) { + string source_field = argument.empty() ? fieldName : argument; + const ResultConfig& resultConfig = getResultConfig(); + int source_field_enum = resultConfig.GetFieldNameEnum().Lookup(source_field.c_str()); + populate_fields(*matching_elems_fields, *_vsm_fields_config, source_field); + fieldWriter = MatchedElementsFilterDFW::create(source_field, source_field_enum, matching_elems_fields); + rc = static_cast<bool>(fieldWriter); + } else { + fieldWriter = search::docsummary::DynamicDocsumConfig::createFieldWriter(fieldName, overrideName, argument, rc, matching_elems_fields); + } + return fieldWriter; +} + +} diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.h b/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.h new file mode 100644 index 00000000000..11010c04e90 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.h @@ -0,0 +1,29 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <vespa/searchsummary/docsummary/docsumconfig.h> + +namespace vespa::config::search::vsm { +namespace internal { class InternalVsmfieldsType; } +typedef const internal::InternalVsmfieldsType VsmfieldsConfig; +} +namespace vsm { + +class DynamicDocsumConfig : public search::docsummary::DynamicDocsumConfig +{ +public: + using Parent = search::docsummary::DynamicDocsumConfig; + using VsmfieldsConfig = vespa::config::search::vsm::VsmfieldsConfig; +private: + std::shared_ptr<VsmfieldsConfig> _vsm_fields_config; +public: + DynamicDocsumConfig(search::docsummary::IDocsumEnvironment* env, search::docsummary::DynamicDocsumWriter* writer, std::shared_ptr<VsmfieldsConfig> vsm_fields_config); +private: + std::unique_ptr<search::docsummary::IDocsumFieldWriter> + createFieldWriter(const string & fieldName, const string & overrideName, + const string & cf, bool & rc, std::shared_ptr<search::MatchingElementsFields> matching_elems_fields) override; +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.cpp new file mode 100644 index 00000000000..936aaaa2091 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.cpp @@ -0,0 +1,35 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+#include "docsumfieldspec.h" + +namespace vsm { + +DocsumFieldSpec::FieldIdentifier::FieldIdentifier() : + _id(StringFieldIdTMap::npos), + _path() +{ } + +DocsumFieldSpec::FieldIdentifier::FieldIdentifier(FieldIdT id, FieldPath path) : + _id(id), + _path(std::move(path)) +{ } + +DocsumFieldSpec::FieldIdentifier::FieldIdentifier(FieldIdentifier &&) noexcept = default; +DocsumFieldSpec::FieldIdentifier & DocsumFieldSpec::FieldIdentifier::operator=(FieldIdentifier &&) noexcept = default; +DocsumFieldSpec::FieldIdentifier::~FieldIdentifier() = default; + +DocsumFieldSpec::DocsumFieldSpec() : + _resultType(search::docsummary::RES_INT), + _command(VsmsummaryConfig::Fieldmap::Command::NONE), + _outputField(), + _inputFields() +{ } + +DocsumFieldSpec::DocsumFieldSpec(search::docsummary::ResType resultType, + VsmsummaryConfig::Fieldmap::Command command) : + _resultType(resultType), + _command(command), + _outputField(), + _inputFields() +{ } + +} diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.h b/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.h new file mode 100644 index 00000000000..db6ee9fa223 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.h @@ -0,0 +1,72 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/searchsummary/docsummary/resultclass.h> +#include <vespa/vsm/common/document.h> +#include <vespa/vsm/common/storagedocument.h> +#include <vespa/vsm/config/vsm-cfif.h> + +namespace vsm { + +/** + * This class contains the specifications for how to generate a summary field. + **/ +class DocsumFieldSpec { +public: + /** + * This class contains a field id and a field path (to navigate a field value). 
+ **/ + class FieldIdentifier { + private: + FieldIdT _id; + FieldPath _path; + + public: + FieldIdentifier(); + FieldIdentifier(FieldIdT id, FieldPath path); + FieldIdentifier(FieldIdentifier &&) noexcept; + FieldIdentifier & operator=(FieldIdentifier &&) noexcept; + FieldIdentifier(const FieldIdentifier &) = delete; + FieldIdentifier & operator=(const FieldIdentifier &) = delete; + ~FieldIdentifier(); + FieldIdT getId() const { return _id; } + const FieldPath & getPath() const { return _path; } + }; + + typedef std::vector<FieldIdentifier> FieldIdentifierVector; + +private: + search::docsummary::ResType _resultType; + VsmsummaryConfig::Fieldmap::Command _command; + FieldIdentifier _outputField; + FieldIdentifierVector _inputFields; + +public: + DocsumFieldSpec(); + DocsumFieldSpec(search::docsummary::ResType resultType, VsmsummaryConfig::Fieldmap::Command command); + + /** + * Returns the result type for the summary field. + **/ + search::docsummary::ResType getResultType() const { return _resultType; } + + /** + * Returns the command specifying how to transform input fields into output summary field. + **/ + VsmsummaryConfig::Fieldmap::Command getCommand() const { return _command; } + + /** + * Returns whether the input field and output field are identical. 
+ **/ + bool hasIdentityMapping() const { + return _inputFields.size() == 1 && _outputField.getId() == _inputFields[0].getId(); + } + + const FieldIdentifier & getOutputField() const { return _outputField; } + void setOutputField(FieldIdentifier outputField) { _outputField = std::move(outputField); } + const FieldIdentifierVector & getInputFields() const { return _inputFields; } + FieldIdentifierVector & getInputFields() { return _inputFields; } +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp new file mode 100644 index 00000000000..70759feb41c --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp @@ -0,0 +1,477 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "docsumfilter.h" +#include "slimefieldwriter.h" +#include <vespa/searchsummary/docsummary/summaryfieldconverter.h> +#include <vespa/document/base/exceptions.h> +#include <vespa/document/fieldvalue/iteratorhandler.h> + +#include <vespa/log/log.h> +LOG_SETUP(".vsm.docsumfilter"); + +using namespace search::docsummary; + + +namespace { + +class Handler : public document::fieldvalue::IteratorHandler { +public: +}; + +struct IntResultHandler : public Handler { + int32_t value; + IntResultHandler() : value(0) {} + void onPrimitive(uint32_t, const Content & c) override { + value = c.getValue().getAsInt(); + } +}; + +struct LongResultHandler : public Handler { + int64_t value; + LongResultHandler() : value(0) {} + void onPrimitive(uint32_t, const Content & c) override { + value = c.getValue().getAsLong(); + } +}; + +struct FloatResultHandler : public Handler { + float value; + FloatResultHandler() : value(0) {} + void onPrimitive(uint32_t, const Content & c) override { + value = c.getValue().getAsFloat(); + } +}; + +struct DoubleResultHandler : public Handler { + double value; + DoubleResultHandler() : value(0) {} + void onPrimitive(uint32_t, 
const Content & c) override { + value = c.getValue().getAsDouble(); + } +}; + +class StringResultHandler : public Handler { +private: + ResType _type; + ResultPacker & _packer; + void addToPacker(const char * buf, size_t len) { + switch (_type) { + case RES_STRING: + _packer.AddString(buf, len); + break; + case RES_LONG_STRING: + _packer.AddLongString(buf, len); + break; + default: + break; + } + } + +public: + StringResultHandler(ResType t, ResultPacker & p) : _type(t), _packer(p) {} + void onPrimitive(uint32_t, const Content & c) override { + const document::FieldValue & fv = c.getValue(); + if (fv.isLiteral()) { + const document::LiteralFieldValueB & lfv = static_cast<const document::LiteralFieldValueB &>(fv); + vespalib::stringref s = lfv.getValueRef(); + addToPacker(s.data(), s.size()); + } else { + vespalib::string s = fv.toString(); + addToPacker(s.c_str(), s.size()); + } + } +}; + +class RawResultHandler : public Handler { +private: + ResType _type; + ResultPacker & _packer; + +public: + RawResultHandler(ResType t, ResultPacker & p) : _type(t), _packer(p) {} + void onPrimitive(uint32_t, const Content & c) override { + const document::FieldValue & fv = c.getValue(); + try { + std::pair<const char *, size_t> buf = fv.getAsRaw(); + if (buf.first != nullptr) { + switch (_type) { + case RES_DATA: + _packer.AddData(buf.first, buf.second); + break; + case RES_LONG_DATA: + _packer.AddLongData(buf.first, buf.second); + break; + default: + break; + } + } + } catch (document::InvalidDataTypeConversionException & e) { + LOG(warning, "RawResultHandler: Could not get field value '%s' as raw. Skipping writing this field", fv.toString().c_str()); + _packer.AddEmpty(); + } + } +}; + + +} + + +namespace vsm { + +FieldPath +copyPathButFirst(const FieldPath & rhs) { + // skip the element that correspond to the start field value + FieldPath path; + if ( ! 
rhs.empty()) { + for (auto it = rhs.begin() + 1; it != rhs.end(); ++it) { + path.push_back(std::make_unique<document::FieldPathEntry>(**it)); + } + } + return path; +} + +void +DocsumFilter::prepareFieldSpec(DocsumFieldSpec & spec, const DocsumTools::FieldSpec & toolsSpec, + const FieldMap & fieldMap, const FieldPathMapT & fieldPathMap) +{ + { // setup output field + const vespalib::string & name = toolsSpec.getOutputName(); + LOG(debug, "prepareFieldSpec: output field name '%s'", name.c_str()); + FieldIdT field = fieldMap.fieldNo(name); + if (field != FieldMap::npos) { + if (field < fieldPathMap.size()) { + spec.setOutputField(DocsumFieldSpec::FieldIdentifier(field, copyPathButFirst(fieldPathMap[field]))); + } else { + LOG(warning, "Could not find a field path for field '%s' with id '%d'", name.c_str(), field); + spec.setOutputField(DocsumFieldSpec::FieldIdentifier(field, FieldPath())); + } + } else { + LOG(warning, "Could not find output summary field '%s'", name.c_str()); + } + } + // setup input fields + for (size_t i = 0; i < toolsSpec.getInputNames().size(); ++i) { + const vespalib::string & name = toolsSpec.getInputNames()[i]; + LOG(debug, "prepareFieldSpec: input field name '%s'", name.c_str()); + FieldIdT field = fieldMap.fieldNo(name); + if (field != FieldMap::npos) { + if (field < fieldPathMap.size()) { + LOG(debug, "field %u < map size %zu", field, fieldPathMap.size()); + spec.getInputFields().push_back(DocsumFieldSpec::FieldIdentifier(field, copyPathButFirst(fieldPathMap[field]))); + } else { + LOG(warning, "Could not find a field path for field '%s' with id '%d'", name.c_str(), field); + spec.getInputFields().push_back(DocsumFieldSpec::FieldIdentifier(field, FieldPath())); + } + if (_highestFieldNo <= field) { + _highestFieldNo = field + 1; + } + } else { + LOG(warning, "Could not find input summary field '%s'", name.c_str()); + } + } +} + +const document::FieldValue * +DocsumFilter::getFieldValue(const DocsumFieldSpec::FieldIdentifier & fieldId, + 
VsmsummaryConfig::Fieldmap::Command command, + const Document & docsum, bool & modified) +{ + FieldIdT fId = fieldId.getId(); + const document::FieldValue * fv = docsum.getField(fId); + if (fv == nullptr) { + return nullptr; + } + switch (command) { + case VsmsummaryConfig::Fieldmap::Command::FLATTENJUNIPER: + if (_snippetModifiers != nullptr) { + FieldModifier * mod = _snippetModifiers->getModifier(fId); + if (mod != nullptr) { + _cachedValue = mod->modify(*fv, fieldId.getPath()); + modified = true; + return _cachedValue.get(); + } + } + [[fallthrough]]; + default: + return fv; + } +} + + +DocsumFilter::DocsumFilter(const DocsumToolsPtr &tools, const IDocSumCache & docsumCache) : + _docsumCache(&docsumCache), + _tools(tools), + _fields(), + _highestFieldNo(0), + _packer(tools ? tools->getResultConfig() : nullptr), + _flattenWriter(), + _snippetModifiers(nullptr), + _cachedValue(), + _emptyFieldPath() +{ } + +DocsumFilter::~DocsumFilter() =default; + +void DocsumFilter::init(const FieldMap & fieldMap, const FieldPathMapT & fieldPathMap) +{ + if (_tools.get()) { + const ResultClass *resClass = _tools->getResultClass(); + const std::vector<DocsumTools::FieldSpec> & inputSpecs = _tools->getFieldSpecs(); + if (resClass != nullptr) { + uint32_t entryCnt = resClass->GetNumEntries(); + assert(entryCnt == inputSpecs.size()); + for (uint32_t i = 0; i < entryCnt; ++i) { + const ResConfigEntry &entry = *resClass->GetEntry(i); + const DocsumTools::FieldSpec & toolsSpec = inputSpecs[i]; + _fields.push_back(DocsumFieldSpec(entry._type, toolsSpec.getCommand())); + LOG(debug, "About to prepare field spec for summary field '%s'", entry._bindname.c_str()); + prepareFieldSpec(_fields.back(), toolsSpec, fieldMap, fieldPathMap); + } + assert(entryCnt == _fields.size()); + } + } +} + +uint32_t +DocsumFilter::getNumDocs() const +{ + return std::numeric_limits<uint32_t>::max(); +} + +void +DocsumFilter::writeField(const document::FieldValue & fv, const FieldPath & path, ResType type, 
ResultPacker & packer) +{ + switch (type) { + case RES_INT: { + IntResultHandler rh; + fv.iterateNested(path, rh); + uint32_t val = rh.value; + packer.AddInteger(val); + break; } + case RES_SHORT: { + IntResultHandler rh; + fv.iterateNested(path, rh); + uint16_t val = rh.value; + packer.AddShort(val); + break; } + case RES_BYTE: { + IntResultHandler rh; + fv.iterateNested(path, rh); + uint8_t val = rh.value; + packer.AddByte(val); + break; } + case RES_BOOL: { + IntResultHandler rh; + fv.iterateNested(path, rh); + uint8_t val = rh.value; + packer.AddByte(val); + break; } + case RES_FLOAT: { + FloatResultHandler rh; + fv.iterateNested(path, rh); + float val = rh.value; + packer.AddFloat(val); + break; } + case RES_DOUBLE: { + DoubleResultHandler rh; + fv.iterateNested(path, rh); + double val = rh.value; + packer.AddDouble(val); + break; } + case RES_INT64: { + LongResultHandler rh; + fv.iterateNested(path, rh); + uint64_t val = rh.value; + packer.AddInt64(val); + break; } + case RES_STRING: + case RES_LONG_STRING: + { + StringResultHandler rh(type, packer); + // the string result handler adds the result to the packer + fv.iterateNested(path, rh); + } + break; + case RES_DATA: + case RES_LONG_DATA: + { + RawResultHandler rh(type, packer); + // the raw result handler adds the result to the packer + fv.iterateNested(path, rh); + } + break; + default: + LOG(warning, "Unknown docsum field type: %s", ResultConfig::GetResTypeName(type)); + packer.AddEmpty(); // unhandled output type + break; + } +} + + +void +DocsumFilter::writeSlimeField(const DocsumFieldSpec & fieldSpec, + const Document & docsum, + ResultPacker & packer) +{ + if (fieldSpec.getCommand() == VsmsummaryConfig::Fieldmap::Command::NONE) { + const DocsumFieldSpec::FieldIdentifier & fieldId = fieldSpec.getOutputField(); + const document::FieldValue * fv = docsum.getField(fieldId.getId()); + if (fv != nullptr) { + LOG(debug, "writeSlimeField: About to write field '%d' as Slime: field value = '%s'", + 
fieldId.getId(), fv->toString().c_str()); + SlimeFieldWriter writer; + if (! fieldSpec.hasIdentityMapping()) { + writer.setInputFields(fieldSpec.getInputFields()); + } + writer.convert(*fv); + const vespalib::stringref out = writer.out(); + packer.AddLongString(out.data(), out.size()); + } else { + LOG(debug, "writeSlimeField: Field value not set for field '%d'", fieldId.getId()); + packer.AddEmpty(); + } + } else { + LOG(debug, "writeSlimeField: Cannot handle this command"); + packer.AddEmpty(); + } +} + +void +DocsumFilter::writeFlattenField(const DocsumFieldSpec & fieldSpec, + const Document & docsum, + ResultPacker & packer) +{ + if (fieldSpec.getCommand() == VsmsummaryConfig::Fieldmap::Command::NONE) { + LOG(debug, "writeFlattenField: Cannot handle command NONE"); + packer.AddEmpty(); + return; + } + + if (fieldSpec.getResultType() != RES_LONG_STRING && + fieldSpec.getResultType() != RES_STRING) + { + LOG(debug, "writeFlattenField: Can only handle result types STRING and LONG_STRING"); + packer.AddEmpty(); + return; + } + + switch (fieldSpec.getCommand()) { + case VsmsummaryConfig::Fieldmap::Command::FLATTENJUNIPER: + _flattenWriter.setSeparator("\x1E"); // record separator (same as juniper uses) + break; + default: + break; + } + const DocsumFieldSpec::FieldIdentifierVector & inputFields = fieldSpec.getInputFields(); + for (size_t i = 0; i < inputFields.size(); ++i) { + const DocsumFieldSpec::FieldIdentifier & fieldId = inputFields[i]; + bool modified = false; + const document::FieldValue * fv = getFieldValue(fieldId, fieldSpec.getCommand(), docsum, modified); + if (fv != nullptr) { + LOG(debug, "writeFlattenField: About to flatten field '%d' with field value (%s) '%s'", + fieldId.getId(), modified ? 
"modified" : "original", fv->toString().c_str()); + if (modified) { + fv->iterateNested(_emptyFieldPath, _flattenWriter); + } else { + fv->iterateNested(fieldId.getPath(), _flattenWriter); + } + } else { + LOG(debug, "writeFlattenField: Field value not set for field '%d'", fieldId.getId()); + } + } + + const CharBuffer & buf = _flattenWriter.getResult(); + switch (fieldSpec.getResultType()) { + case RES_STRING: + packer.AddString(buf.getBuffer(), buf.getPos()); + break; + case RES_LONG_STRING: + packer.AddLongString(buf.getBuffer(), buf.getPos()); + break; + default: + break; + } + _flattenWriter.clear(); +} + + +void +DocsumFilter::writeEmpty(ResType type, ResultPacker & packer) +{ + // use the 'notdefined' values when writing numeric values + switch (type) { + case RES_INT: + packer.AddInteger(std::numeric_limits<int32_t>::min()); + break; + case RES_SHORT: + packer.AddShort(std::numeric_limits<int16_t>::min()); + break; + case RES_BYTE: + packer.AddByte(0); // byte fields are unsigned so we have no 'notdefined' value. + break; + case RES_FLOAT: + packer.AddFloat(std::numeric_limits<float>::quiet_NaN()); + break; + case RES_DOUBLE: + packer.AddDouble(std::numeric_limits<double>::quiet_NaN()); + break; + case RES_INT64: + packer.AddInt64(std::numeric_limits<int64_t>::min()); + break; + default: + packer.AddEmpty(); + break; + } +} + +uint32_t +DocsumFilter::getSummaryClassId() const +{ + return _tools->getResultClass() ? 
_tools->getResultClass()->GetClassID() : ResultConfig::NoClassID(); +} + +DocsumStoreValue +DocsumFilter::getMappedDocsum(uint32_t id) +{ + const ResultClass *resClass = _tools->getResultClass(); + if (resClass == nullptr) { + return DocsumStoreValue(nullptr, 0); + } + + const Document & doc = _docsumCache->getDocSum(id); + + _packer.Init(resClass->GetClassID()); + for (FieldSpecList::iterator it(_fields.begin()), end = _fields.end(); it != end; ++it) { + ResType type = it->getResultType(); + if (type == RES_JSONSTRING) { + // this really means 'structured data' + writeSlimeField(*it, doc, _packer); + } else { + if (it->getInputFields().size() == 1 && it->getCommand() == VsmsummaryConfig::Fieldmap::Command::NONE) { + const DocsumFieldSpec::FieldIdentifier & fieldId = it->getInputFields()[0]; + const document::FieldValue * field = doc.getField(fieldId.getId()); + if (field != nullptr) { + writeField(*field, fieldId.getPath(), type, _packer); + } else { + writeEmpty(type, _packer); // void input + } + } else if (it->getInputFields().size() == 0 && it->getCommand() == VsmsummaryConfig::Fieldmap::Command::NONE) { + LOG(spam, "0 inputfields for output field %u", it->getOutputField().getId()); + writeEmpty(type, _packer); // no input + } else { + writeFlattenField(*it, doc, _packer); + } + } + } + + const char *buf; + uint32_t buflen; + bool ok = _packer.GetDocsumBlob(&buf, &buflen); + if (ok) { + return DocsumStoreValue(buf, buflen); + } else { + return DocsumStoreValue(nullptr, 0); + } +} + +} diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.h b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.h new file mode 100644 index 00000000000..e6f7ae3e6fe --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.h @@ -0,0 +1,90 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#pragma once + +#include <vespa/vsm/common/docsum.h> +#include <vespa/vsm/common/fieldmodifier.h> +#include <vespa/vsm/vsm/docsumfieldspec.h> +#include <vespa/vsm/vsm/fieldsearchspec.h> +#include <vespa/vsm/vsm/flattendocsumwriter.h> +#include <vespa/vsm/vsm/vsm-adapter.h> +#include <vespa/searchsummary/docsummary/resultpacker.h> +#include <vespa/searchsummary/docsummary/docsumstore.h> + +using search::docsummary::IDocsumStore; +using search::docsummary::DocsumStoreValue; +using search::docsummary::ResType; +using search::docsummary::ResultPacker; + +namespace vsm { + +/** + * This class implements the IDocsumStore interface such that docsum blobs + * can be fetched based on local document id. The docsum blobs are generated + * on the fly when requested. + **/ +class DocsumFilter : public IDocsumStore +{ +private: + typedef std::vector<DocsumFieldSpec> FieldSpecList; // list of summary field specs + typedef std::vector<vespalib::string> StringList; + typedef StringFieldIdTMap FieldMap; + + const IDocSumCache * _docsumCache; + DocsumToolsPtr _tools; + FieldSpecList _fields; // list of summary fields to generate + size_t _highestFieldNo; + ResultPacker _packer; + FlattenDocsumWriter _flattenWriter; + const FieldModifierMap * _snippetModifiers; + document::FieldValue::UP _cachedValue; + document::FieldPath _emptyFieldPath; + + DocsumFilter(const DocsumFilter &); + DocsumFilter &operator=(const DocsumFilter &); + void prepareFieldSpec(DocsumFieldSpec & spec, const DocsumTools::FieldSpec & toolsSpec, + const FieldMap & fieldMap, const FieldPathMapT & fieldPathMap); + const document::FieldValue * getFieldValue(const DocsumFieldSpec::FieldIdentifier & fieldId, + VsmsummaryConfig::Fieldmap::Command command, + const Document & docsum, bool & modified); + void writeField(const document::FieldValue & fv, const FieldPath & path, ResType type, ResultPacker & packer); + void writeSlimeField(const DocsumFieldSpec & fieldSpec, const Document & docsum, ResultPacker & packer); + 
void writeFlattenField(const DocsumFieldSpec & fieldSpec, const Document & docsum, ResultPacker & packer); + void writeEmpty(ResType type, ResultPacker & packer); + +public: + DocsumFilter(const DocsumToolsPtr & tools, const IDocSumCache & docsumCache); + ~DocsumFilter() override; + const DocsumToolsPtr & getTools() const { return _tools; } + + /** + * Initializes this docsum filter using the given field map and field path map. + * The field map is used to map from field name to field id. + * The field path map is used to retrieve the field path for each input field. + * + * @param fieldMap maps from field name -> field id + * @param fieldPathMap maps from field id -> field path + **/ + void init(const FieldMap & fieldMap, const FieldPathMapT & fieldPathMap); + + /** + * Sets the snippet modifiers to use when writing string fields used as input to snippet generation. + **/ + void setSnippetModifiers(const FieldModifierMap & modifiers) { _snippetModifiers = &modifiers; } + + /** + * Returns the highest field id + 1 among all fields in the field spec list. + **/ + size_t getHighestFieldNo() const { return _highestFieldNo; } + + + void setDocSumStore(const IDocSumCache & docsumCache) { _docsumCache = &docsumCache; } + + // Inherit doc from IDocsumStore + DocsumStoreValue getMappedDocsum(uint32_t id) override; + uint32_t getNumDocs() const override; + uint32_t getSummaryClassId() const override; +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp new file mode 100644 index 00000000000..7043e63ec87 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp @@ -0,0 +1,334 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "fieldsearchspec.h" +#include <vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h> +#include <vespa/vsm/searcher/utf8strchrfieldsearcher.h> +#include <vespa/vsm/searcher/utf8substringsearcher.h> +#include <vespa/vsm/searcher/utf8suffixstringfieldsearcher.h> +#include <vespa/vsm/searcher/utf8exactstringfieldsearcher.h> +#include <vespa/vsm/searcher/futf8strchrfieldsearcher.h> +#include <vespa/vsm/searcher/intfieldsearcher.h> +#include <vespa/vsm/searcher/boolfieldsearcher.h> +#include <vespa/vsm/searcher/floatfieldsearcher.h> +#include <vespa/vsm/searcher/geo_pos_field_searcher.h> +#include <vespa/vespalib/stllike/asciistream.h> +#include <regex> + +#include <vespa/log/log.h> +LOG_SETUP(".vsm.fieldsearchspec"); + +#define DEBUGMASK 0x01 + +using search::streaming::ConstQueryTermList; +using search::streaming::Query; +using search::streaming::QueryTerm; + +namespace vsm { + +namespace { + +void setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) { + if (arg1 == "prefix") { + searcher->setMatchType(FieldSearcher::PREFIX); + } else if (arg1 == "substring") { + searcher->setMatchType(FieldSearcher::SUBSTRING); + } else if (arg1 == "suffix") { + searcher->setMatchType(FieldSearcher::SUFFIX); + } else if (arg1 == "exact") { + searcher->setMatchType(FieldSearcher::EXACT); + } else if (arg1 == "word") { + searcher->setMatchType(FieldSearcher::EXACT); + } +} + +} + +FieldSearchSpec::FieldSearchSpec() : + _id(0), + _name(), + _maxLength(0x100000), + _searcher(), + _searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE), + _arg1(), + _reconfigured(false) +{ +} +FieldSearchSpec::~FieldSearchSpec() = default; + +FieldSearchSpec::FieldSearchSpec(FieldSearchSpec&& rhs) noexcept = default; +FieldSearchSpec& FieldSearchSpec::operator=(FieldSearchSpec&& rhs) noexcept = default; + +FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname, + VsmfieldsConfig::Fieldspec::Searchmethod searchDef, + const 
vespalib::string & arg1, size_t maxLength_) : + _id(fid), + _name(fname), + _maxLength(maxLength_), + _searcher(), + _searchMethod(searchDef), + _arg1(arg1), + _reconfigured(false) +{ + switch(searchDef) { + default: + LOG(warning, "Unknown searchdef = %d. Defaulting to AUTOUTF8", static_cast<int>(searchDef)); + [[fallthrough]]; + case VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8: + case VsmfieldsConfig::Fieldspec::Searchmethod::NONE: + case VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8: + case VsmfieldsConfig::Fieldspec::Searchmethod::UTF8: + if (arg1 == "substring") { + _searcher = std::make_unique<UTF8SubStringFieldSearcher>(fid); + } else if (arg1 == "suffix") { + _searcher = std::make_unique<UTF8SuffixStringFieldSearcher>(fid); + } else if (arg1 == "exact") { + _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid); + } else if (arg1 == "word") { + _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid); + } else if (searchDef == VsmfieldsConfig::Fieldspec::Searchmethod::UTF8) { + _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid); + } else { + _searcher = std::make_unique<FUTF8StrChrFieldSearcher>(fid); + } + break; + case VsmfieldsConfig::Fieldspec::Searchmethod::BOOL: + _searcher = std::make_unique<BoolFieldSearcher>(fid); + break; + case VsmfieldsConfig::Fieldspec::Searchmethod::INT8: + case VsmfieldsConfig::Fieldspec::Searchmethod::INT16: + case VsmfieldsConfig::Fieldspec::Searchmethod::INT32: + case VsmfieldsConfig::Fieldspec::Searchmethod::INT64: + _searcher = std::make_unique<IntFieldSearcher>(fid); + break; + case VsmfieldsConfig::Fieldspec::Searchmethod::FLOAT: + _searcher = std::make_unique<FloatFieldSearcher>(fid); + break; + case VsmfieldsConfig::Fieldspec::Searchmethod::DOUBLE: + _searcher = std::make_unique<DoubleFieldSearcher>(fid); + break; + case VsmfieldsConfig::Fieldspec::Searchmethod::GEOPOS: + _searcher = std::make_unique<GeoPosFieldSearcher>(fid); + break; + } + if (_searcher) { + 
setMatchType(_searcher, arg1); + _searcher->maxFieldLength(maxLength()); + } +} + +void +FieldSearchSpec::reconfig(const QueryTerm & term) +{ + if (_reconfigured) { + return; + } + switch (_searchMethod) { + case VsmfieldsConfig::Fieldspec::Searchmethod::NONE: + case VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8: + case VsmfieldsConfig::Fieldspec::Searchmethod::UTF8: + case VsmfieldsConfig::Fieldspec::Searchmethod::SSE2UTF8: + if ((term.isSubstring() && _arg1 != "substring") || + (term.isSuffix() && _arg1 != "suffix") || + (term.isExactstring() && _arg1 != "exact") || + (term.isPrefix() && _arg1 == "suffix")) + { + _searcher = std::make_unique<UTF8FlexibleStringFieldSearcher>(id()); + // preserve the basic match property of the searcher + setMatchType(_searcher, _arg1); + LOG(debug, "Reconfigured to use UTF8FlexibleStringFieldSearcher (%s) for field '%s' with id '%d'", + _searcher->prefix() ? "prefix" : "regular", name().c_str(), id()); + _reconfigured = true; + } + break; + default: + break; + } +} + +vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpec & f) +{ + os << f._id << ' ' << f._name << ' '; + if ( ! 
f._searcher) { + os << " No searcher defined.\n"; + } + return os; +} + +FieldSearchSpecMap::FieldSearchSpecMap() = default; + +FieldSearchSpecMap::~FieldSearchSpecMap() = default; + +namespace { + const std::string _G_empty(""); + const std::string _G_value(".value"); + const std::regex _G_map1("\\{[a-zA-Z0-9]+\\}"); + const std::regex _G_map2("\\{\".*\"\\}"); + const std::regex _G_array("\\[[0-9]+\\]"); +} + +vespalib::string FieldSearchSpecMap::stripNonFields(const vespalib::string & rawIndex) +{ + if ((rawIndex.find('[') != vespalib::string::npos) || (rawIndex.find('{') != vespalib::string::npos)) { + std::string index = std::regex_replace(std::string(rawIndex), _G_map1, _G_value); + index = std::regex_replace(index, _G_map2, _G_value); + index = std::regex_replace(index, _G_array, _G_empty); + return index; + } + return rawIndex; +} + +bool FieldSearchSpecMap::buildFieldsInQuery(const Query & query, StringFieldIdTMap & fieldsInQuery) const +{ + bool retval(true); + ConstQueryTermList qtl; + query.getLeafs(qtl); + + for (const auto & term : qtl) { + for (const auto & dtm : documentTypeMap()) { + const IndexFieldMapT & fim = dtm.second; + vespalib::string rawIndex(term->index()); + vespalib::string index(stripNonFields(rawIndex)); + IndexFieldMapT::const_iterator fIt = fim.find(index); + if (fIt != fim.end()) { + for(FieldIdT fid : fIt->second) { + const FieldSearchSpec & spec = specMap().find(fid)->second; + LOG(debug, "buildFieldsInQuery = rawIndex='%s', index='%s'", rawIndex.c_str(), index.c_str()); + if ((rawIndex != index) && (spec.name().find(index) == 0)) { + vespalib::string modIndex(rawIndex); + modIndex.append(spec.name().substr(index.size())); + fieldsInQuery.add(modIndex, spec.id()); + } else { + fieldsInQuery.add(spec.name(),spec.id()); + } + } + } else { + LOG(warning, "No valid indexes registered for index %s", term->index().c_str()); + retval = false; + } + } + } + return retval; +} + +void FieldSearchSpecMap::buildFromConfig(const 
std::vector<vespalib::string> & otherFieldsNeeded) +{ + for(size_t i(0), m(otherFieldsNeeded.size()); i < m; i++) { + LOG(debug, "otherFieldsNeeded[%zd] = '%s'", i, otherFieldsNeeded[i].c_str()); + _nameIdMap.add(otherFieldsNeeded[i]); + } +} + +namespace { + +FieldIdTList +buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearchSpecMapT & specMap, + const VsmfieldsConfig::Documenttype::IndexVector & indexes) +{ + LOG(spam, "Index %s with %zd fields", ci.name.c_str(), ci.field.size()); + FieldIdTList ifm; + for (const VsmfieldsConfig::Documenttype::Index::Field & cf : ci.field) { + LOG(spam, "Parsing field %s", cf.name.c_str()); + auto foundIndex = std::find_if(indexes.begin(), indexes.end(), + [&cf](const auto & v) { return v.name == cf.name;}); + if ((foundIndex != indexes.end()) && (cf.name != ci.name)) { + FieldIdTList sub = buildFieldSet(*foundIndex, specMap, indexes); + ifm.insert(ifm.end(), sub.begin(), sub.end()); + } else { + auto foundField = std::find_if(specMap.begin(), specMap.end(), + [&cf](const auto & v) { return v.second.name() == cf.name;} ); + if (foundField != specMap.end()) { + ifm.push_back(foundField->second.id()); + } else { + LOG(warning, "Field %s not defined. 
Ignoring....", cf.name.c_str()); + } + } + } + return ifm; +} + +} + +bool FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf) +{ + bool retval(true); + LOG(spam, "Parsing %zd fields", conf->fieldspec.size()); + for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) { + LOG(spam, "Parsing %s", cfs.name.c_str()); + FieldIdT fieldId = specMap().size(); + FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, cfs.arg1.c_str(), cfs.maxlength); + _specMap[fieldId] = std::move(fss); + _nameIdMap.add(cfs.name, fieldId); + LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str()); + } + + LOG(spam, "Parsing %zd document types", conf->documenttype.size()); + for(const VsmfieldsConfig::Documenttype & di : conf->documenttype) { + IndexFieldMapT indexMapp; + LOG(spam, "Parsing document type %s with %zd indexes", di.name.c_str(), di.index.size()); + for(const VsmfieldsConfig::Documenttype::Index & ci : di.index) { + indexMapp[ci.name] = buildFieldSet(ci, specMap(), di.index); + } + _documentTypeMap[di.name] = indexMapp; + } + return retval; +} + +void +FieldSearchSpecMap::reconfigFromQuery(const Query & query) +{ + ConstQueryTermList qtl; + query.getLeafs(qtl); + + for (const auto & termA : qtl) { + for (const auto & ifm : documentTypeMap()) { + IndexFieldMapT::const_iterator itc = ifm.second.find(termA->index()); + if (itc != ifm.second.end()) { + for (FieldIdT fid : itc->second) { + FieldSearchSpec & spec = _specMap.find(fid)->second; + spec.reconfig(*termA); + } + } + } + } +} + +bool lesserField(const FieldSearcherContainer & a, const FieldSearcherContainer & b) +{ + return a->field() < b->field(); +} + +void FieldSearchSpecMap::buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap) +{ + fieldSearcherMap.clear(); + for (const auto & entry : fieldsInQuery) { + FieldIdT fId = entry.second; + const FieldSearchSpec & spec = specMap().find(fId)->second; + fieldSearcherMap.emplace_back(spec.searcher().duplicate()); + } 
+ std::sort(fieldSearcherMap.begin(), fieldSearcherMap.end(), lesserField); +} + + +vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & df) +{ + os << "DocumentTypeMap = \n"; + for (const auto & dtm : df.documentTypeMap()) { + os << "DocType = " << dtm.first << "\n"; + os << "IndexMap = \n"; + for (const auto &index : dtm.second) { + os << index.first << ": "; + for (FieldIdT fid : index.second) { + os << fid << ' '; + } + os << '\n'; + } + } + os << "SpecMap = \n"; + for (const auto & entry : df.specMap()) { + os << entry.first << " = " << entry.second << '\n'; + } + os << "NameIdMap = \n" << df.nameIdMap(); + return os; +} + +} diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h new file mode 100644 index 00000000000..7b78a8634e0 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h @@ -0,0 +1,98 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/vsm/searcher/fieldsearcher.h> +#include <vespa/vsm/config/vsm-cfif.h> + +namespace vsm { + +class FieldSearchSpec +{ +public: + FieldSearchSpec(); + FieldSearchSpec(const FieldIdT & id, const vespalib::string & name, + VsmfieldsConfig::Fieldspec::Searchmethod searchMethod, + const vespalib::string & arg1, size_t maxLength); + ~FieldSearchSpec(); + FieldSearchSpec(FieldSearchSpec&& rhs) noexcept; + FieldSearchSpec& operator=(FieldSearchSpec&& rhs) noexcept; + const FieldSearcher & searcher() const { return *_searcher; } + const vespalib::string & name() const { return _name; } + FieldIdT id() const { return _id; } + bool valid() const { return static_cast<bool>(_searcher); } + size_t maxLength() const { return _maxLength; } + + /** + * Reconfigures the field searcher based on information in the given query term. 
+ **/ + void reconfig(const search::streaming::QueryTerm & term); + + friend vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpec & f); + +private: + FieldIdT _id; + vespalib::string _name; + size_t _maxLength; + FieldSearcherContainer _searcher; + VsmfieldsConfig::Fieldspec::Searchmethod _searchMethod; + vespalib::string _arg1; + bool _reconfigured; +}; + +typedef std::map<FieldIdT, FieldSearchSpec> FieldSearchSpecMapT; + +class FieldSearchSpecMap +{ +public: + FieldSearchSpecMap(); + ~FieldSearchSpecMap(); + + /** + * Iterates over all fields in the vsmfields config and creates a mapping from field id to FieldSearchSpec objects + * and a mapping from field name to field id. It then iterates over all document types and index names + * and creates a mapping from index name to list of field ids for each document type. + **/ + bool buildFromConfig(const VsmfieldsHandle & conf); + + /** + * Iterates over the given field name vector adding extra elements to the mapping from field name to field id. + **/ + void buildFromConfig(const std::vector<vespalib::string> & otherFieldsNeeded); + + /** + * Reconfigures some of the field searchers based on information in the given query. + **/ + void reconfigFromQuery(const search::streaming::Query & query); + + /** + * Adds a [field name, field id] entry to the given mapping for each field name used in the given query. + * This is achieved by mapping from query term index name -> list of field ids -> [field name, field id] pairs. + **/ + bool buildFieldsInQuery(const search::streaming::Query & query, StringFieldIdTMap & fieldsInQuery) const; + + /** + * Adds a [field name, field id] entry to the given mapping for each field name in the given vector. + **/ + void buildFieldsInQuery(const std::vector<vespalib::string> & otherFieldsNeeded, StringFieldIdTMap & fieldsInQuery) const; + + /** + * Adds a FieldSearcher object to the given field searcher map for each field name in the other map. 
+ **/ + void buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap); + + const FieldSearchSpecMapT & specMap() const { return _specMap; } + //const IndexFieldMapT & indexMap() const { return _documentTypeMap.begin()->second; } + const DocumentTypeIndexFieldMapT & documentTypeMap() const { return _documentTypeMap; } + const StringFieldIdTMap & nameIdMap() const { return _nameIdMap; } + friend vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & f); + + static vespalib::string stripNonFields(const vespalib::string & rawIndex); + +private: + FieldSearchSpecMapT _specMap; // mapping from field id to field search spec + DocumentTypeIndexFieldMapT _documentTypeMap; // mapping from index name to field id list for each document type + StringFieldIdTMap _nameIdMap; // mapping from field name to field id +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.cpp b/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.cpp new file mode 100644 index 00000000000..06b652d85e6 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.cpp @@ -0,0 +1,45 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "flattendocsumwriter.h" +#include <vespa/document/fieldvalue/fieldvalues.h> + +namespace vsm { + +void +FlattenDocsumWriter::considerSeparator() +{ + if (_useSeparator) { + _output.put(_separator.c_str(), _separator.size()); + } +} + +void +FlattenDocsumWriter::onPrimitive(uint32_t, const Content & c) +{ + considerSeparator(); + const document::FieldValue & fv = c.getValue(); + if (fv.isLiteral()) { + const document::LiteralFieldValueB & lfv = static_cast<const document::LiteralFieldValueB &>(fv); + vespalib::stringref value = lfv.getValueRef(); + _output.put(value.data(), value.size()); + } else if (fv.isNumeric() || + fv.isA(document::FieldValue::Type::BOOL)) + { + vespalib::string value = fv.getAsString(); + _output.put(value.data(), value.size()); + } else { + vespalib::string value = fv.toString(); + _output.put(value.data(), value.size()); + } + _useSeparator = true; +} + +FlattenDocsumWriter::FlattenDocsumWriter(const vespalib::string & separator) : + _output(32), + _separator(separator), + _useSeparator(false) +{ } + +FlattenDocsumWriter::~FlattenDocsumWriter() = default; + +} diff --git a/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.h b/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.h new file mode 100644 index 00000000000..47c6f1e75d0 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.h @@ -0,0 +1,36 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include <vespa/document/fieldvalue/fieldvalue.h> +#include <vespa/document/fieldvalue/iteratorhandler.h> +#include <vespa/vsm/common/charbuffer.h> + +namespace vsm { + +/** + * This class is used to flatten out and write a complex field value. + * A separator string is inserted between primitive field values. 
+ **/ +class FlattenDocsumWriter : public document::fieldvalue::IteratorHandler { +private: + CharBuffer _output; + vespalib::string _separator; + bool _useSeparator; + + void considerSeparator(); + void onPrimitive(uint32_t, const Content & c) override; + +public: + FlattenDocsumWriter(const vespalib::string & separator = " "); + ~FlattenDocsumWriter(); + void setSeparator(const vespalib::string & separator) { _separator = separator; } + const CharBuffer & getResult() const { return _output; } + void clear() { + _output.reset(); + _separator = " "; + _useSeparator = false; + } +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/vsm/i_matching_elements_filler.h b/streamingvisitors/src/vespa/vsm/vsm/i_matching_elements_filler.h new file mode 100644 index 00000000000..a35cea40cec --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/i_matching_elements_filler.h @@ -0,0 +1,24 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <memory> + +namespace search { +class MatchingElements; +class MatchingElementsFields; +} + +namespace vsm { + +/* + * Interface class for filling matching elements structure for + * streaming search. + */ +class IMatchingElementsFiller { +public: + virtual std::unique_ptr<search::MatchingElements> fill_matching_elements(const search::MatchingElementsFields& fields) = 0; + virtual ~IMatchingElementsFiller() = default; +}; + +} diff --git a/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.cpp b/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.cpp new file mode 100644 index 00000000000..5bc5798fb9d --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.cpp @@ -0,0 +1,220 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+ +#include "slimefieldwriter.h" +#include <vespa/searchlib/util/slime_output_raw_buf_adapter.h> +#include <vespa/vespalib/stllike/asciistream.h> +#include <vespa/vespalib/util/size_literals.h> +#include <vespa/searchsummary/docsummary/resultconfig.h> +#include <vespa/document/datatype/positiondatatype.h> + +#include <vespa/log/log.h> +LOG_SETUP(".vsm.slimefieldwriter"); + +namespace { + +vespalib::string +toString(const vsm::FieldPath & fieldPath) +{ + vespalib::asciistream oss; + for (size_t i = 0; i < fieldPath.size(); ++i) { + if (i > 0) { + oss << "."; + } + oss << fieldPath[i].getName(); + } + return oss.str(); +} + +vespalib::string +toString(const std::vector<vespalib::string> & fieldPath) +{ + vespalib::asciistream oss; + for (size_t i = 0; i < fieldPath.size(); ++i) { + if (i > 0) { + oss << "."; + } + oss << fieldPath[i]; + } + return oss.str(); +} + +} // namespace <unnamed> + +using namespace vespalib::slime::convenience; + + +namespace vsm { + +void +SlimeFieldWriter::traverseRecursive(const document::FieldValue & fv, Inserter &inserter) +{ + LOG(debug, "traverseRecursive: class(%s), fieldValue(%s), currentPath(%s)", + fv.className(), fv.toString().c_str(), toString(_currPath).c_str()); + + if (fv.isCollection()) { + const document::CollectionFieldValue & cfv = static_cast<const document::CollectionFieldValue &>(fv); + if (cfv.isA(document::FieldValue::Type::ARRAY)) { + const document::ArrayFieldValue & afv = static_cast<const document::ArrayFieldValue &>(cfv); + Cursor &a = inserter.insertArray(); + for (size_t i = 0; i < afv.size(); ++i) { + const document::FieldValue & nfv = afv[i]; + ArrayInserter ai(a); + traverseRecursive(nfv, ai); + } + } else { + assert(cfv.isA(document::FieldValue::Type::WSET)); + const document::WeightedSetFieldValue & wsfv = static_cast<const document::WeightedSetFieldValue &>(cfv); + Cursor &a = inserter.insertArray(); + Symbol isym = a.resolve("item"); + Symbol wsym = a.resolve("weight"); + for (const auto &entry : wsfv) 
{ + Cursor &o = a.addObject(); + const document::FieldValue & nfv = *entry.first; + ObjectSymbolInserter oi(o, isym); + traverseRecursive(nfv, oi); + int weight = static_cast<const document::IntFieldValue &>(*entry.second).getValue(); + o.setLong(wsym, weight); + } + } + } else if (fv.isA(document::FieldValue::Type::MAP)) { + const document::MapFieldValue & mfv = static_cast<const document::MapFieldValue &>(fv); + Cursor &a = inserter.insertArray(); + Symbol keysym = a.resolve("key"); + Symbol valsym = a.resolve("value"); + for (const auto &entry : mfv) { + Cursor &o = a.addObject(); + ObjectSymbolInserter ki(o, keysym); + traverseRecursive(*entry.first, ki); + _currPath.push_back("value"); + ObjectSymbolInserter vi(o, valsym); + traverseRecursive(*entry.second, vi); + _currPath.pop_back(); + } + } else if (fv.isStructured()) { + const document::StructuredFieldValue & sfv = static_cast<const document::StructuredFieldValue &>(fv); + Cursor &o = inserter.insertObject(); + if (sfv.getDataType() == &document::PositionDataType::getInstance() + && search::docsummary::ResultConfig::wantedV8geoPositions()) + { + bool ok = true; + try { + int x = std::numeric_limits<int>::min(); + int y = std::numeric_limits<int>::min(); + for (const document::Field & entry : sfv) { + document::FieldValue::UP fval(sfv.getValue(entry)); + if (entry.getName() == "x") { + x = fval->getAsInt(); + } else if (entry.getName() == "y") { + y = fval->getAsInt(); + } else { + ok = false; + } + } + if (x == std::numeric_limits<int>::min()) ok = false; + if (y == std::numeric_limits<int>::min()) ok = false; + if (ok) { + o.setDouble("lat", double(y) / 1.0e6); + o.setDouble("lng", double(x) / 1.0e6); + return; + } + } catch (std::exception &e) { + (void)e; + // fallback to code below + } + } + for (const document::Field & entry : sfv) { + if (explorePath(entry.getName())) { + _currPath.push_back(entry.getName()); + Memory keymem(entry.getName()); + ObjectInserter oi(o, keymem); + document::FieldValue::UP 
fval(sfv.getValue(entry)); + traverseRecursive(*fval, oi); + _currPath.pop_back(); + } + } + } else { + if (fv.isLiteral()) { + const document::LiteralFieldValueB & lfv = static_cast<const document::LiteralFieldValueB &>(fv); + inserter.insertString(lfv.getValueRef()); + } else if (fv.isNumeric()) { + switch (fv.getDataType()->getId()) { + case document::DataType::T_BYTE: + case document::DataType::T_SHORT: + case document::DataType::T_INT: + case document::DataType::T_LONG: + inserter.insertLong(fv.getAsLong()); + break; + case document::DataType::T_DOUBLE: + inserter.insertDouble(fv.getAsDouble()); + break; + case document::DataType::T_FLOAT: + inserter.insertDouble(fv.getAsFloat()); + break; + default: + inserter.insertString(fv.getAsString()); + } + } else if (fv.isA(document::FieldValue::Type::BOOL)) { + const auto & bfv = static_cast<const document::BoolFieldValue &>(fv); + inserter.insertBool(bfv.getValue()); + } else { + inserter.insertString(fv.toString()); + } + } +} + +bool +SlimeFieldWriter::explorePath(vespalib::stringref candidate) +{ + if (_inputFields == nullptr) { + return true; + } + // find out if we should explore the current path + for (size_t i = 0; i < _inputFields->size(); ++i) { + const FieldPath & fp = (*_inputFields)[i].getPath(); + if (_currPath.size() <= fp.size()) { + bool equal = true; + for (size_t j = 0; j < _currPath.size() && equal; ++j) { + equal = (fp[j].getName() == _currPath[j]); + } + if (equal) { + if (_currPath.size() == fp.size()) { + return true; + } else if (fp[_currPath.size()].getName() == candidate) { + // the current path matches one of the input field paths + return true; + } + } + } + } + return false; +} + +SlimeFieldWriter::SlimeFieldWriter() : + _rbuf(4_Ki), + _slime(), + _inputFields(nullptr), + _currPath() +{ +} + +SlimeFieldWriter::~SlimeFieldWriter() = default; + +void +SlimeFieldWriter::convert(const document::FieldValue & fv) +{ + if (LOG_WOULD_LOG(debug)) { + if (_inputFields != nullptr) { + for (size_t i 
= 0; i < _inputFields->size(); ++i) { + LOG(debug, "write: input field path [%zd] '%s'", i, toString((*_inputFields)[i].getPath()).c_str()); + } + } else { + LOG(debug, "write: no input fields"); + } + } + SlimeInserter inserter(_slime); + traverseRecursive(fv, inserter); + search::SlimeOutputRawBufAdapter adapter(_rbuf); + vespalib::slime::BinaryFormat::encode(_slime, adapter); +} + +} diff --git a/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.h b/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.h new file mode 100644 index 00000000000..b5adac8985f --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.h @@ -0,0 +1,57 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "docsumfieldspec.h" +#include <vespa/vsm/common/storagedocument.h> +#include <vespa/document/fieldvalue/fieldvalues.h> +#include <vespa/vespalib/data/slime/slime.h> +#include <vespa/searchlib/util/rawbuf.h> + +namespace vsm { + +/** + * This class is used to write a field value as slime binary data. + * If only a subset of the field value should be written this subset + * is specified using the setInputFields() function. + **/ +class SlimeFieldWriter +{ +private: + search::RawBuf _rbuf; + vespalib::Slime _slime; + const DocsumFieldSpec::FieldIdentifierVector * _inputFields; + std::vector<vespalib::string> _currPath; + + void traverseRecursive(const document::FieldValue & fv, vespalib::slime::Inserter & inserter); + bool explorePath(vespalib::stringref candidate); + +public: + SlimeFieldWriter(); + ~SlimeFieldWriter(); + + + /** + * Specifies the subset of the field value that should be written. 
+ **/ + void setInputFields(const DocsumFieldSpec::FieldIdentifierVector & inputFields) { _inputFields = &inputFields; } + + /** + * Convert the given field value + **/ + void convert(const document::FieldValue & fv); + + /** + * Return a reference to the output binary data + **/ + vespalib::stringref out() const { + return vespalib::stringref(_rbuf.GetDrainPos(), _rbuf.GetUsedLen()); + } + + void clear() { + _rbuf.Reuse(); + _inputFields = nullptr; + _currPath.clear(); + } +}; + +} diff --git a/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.cpp b/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.cpp new file mode 100644 index 00000000000..127302311f9 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.cpp @@ -0,0 +1,136 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "snippetmodifier.h" +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/vespalib/stllike/hash_map.hpp> + +#include <vespa/log/log.h> +LOG_SETUP(".vsm.snippetmodifier"); + +using namespace document; +using search::streaming::QueryTerm; +using search::streaming::QueryTermList; +typedef vespalib::hash_map<vsm::FieldIdT, QueryTermList> FieldQueryTermMap; + +namespace { + +void +addIfNotPresent(FieldQueryTermMap & map, vsm::FieldIdT fId, QueryTerm * qt) +{ + FieldQueryTermMap::iterator itr = map.find(fId); + if (itr != map.end()) { + QueryTermList & qtl = itr->second; + if (std::find(qtl.begin(), qtl.end(), qt) == qtl.end()) { + qtl.push_back(qt); + } + } else { + map[fId].push_back(qt); + } +} + +} + +namespace vsm { + +void +SnippetModifier::considerSeparator() +{ + if (_useSep) { + _valueBuf->put(_groupSep); + } +} + +void +SnippetModifier::onPrimitive(uint32_t, const Content & c) +{ + considerSeparator(); + _searcher->onValue(c.getValue()); + _valueBuf->put(_searcher->getModifiedBuf().getBuffer(), _searcher->getModifiedBuf().getPos()); + _useSep = true; +} + +void 
+SnippetModifier::reset() +{ + _valueBuf->reset(); + _useSep = false; +} + + +SnippetModifier::SnippetModifier(const UTF8SubstringSnippetModifier::SP & searcher) : + _searcher(searcher), + _valueBuf(new CharBuffer(32)), + _groupSep('\x1E'), + _useSep(false), + _empty() +{ +} + +SnippetModifier::SnippetModifier(const UTF8SubstringSnippetModifier::SP & searcher, const CharBuffer::SP & valueBuf) : + _searcher(searcher), + _valueBuf(valueBuf), + _groupSep('\x1E'), + _useSep(false), + _empty() +{ +} + +SnippetModifier::~SnippetModifier() {} + +FieldValue::UP +SnippetModifier::modify(const FieldValue & fv, const document::FieldPath & path) +{ + reset(); + fv.iterateNested(path, *this); + return FieldValue::UP(new StringFieldValue(vespalib::string(_valueBuf->getBuffer(), _valueBuf->getPos()))); +} + + +SnippetModifierManager::SnippetModifierManager() : + _modifiers(), + _searchBuf(new SearcherBuf(64)), + _searchModifyBuf(new CharBuffer(64)), + _searchOffsetBuf(new std::vector<size_t>(64)), + _modifierBuf(new CharBuffer(128)) +{ +} + +SnippetModifierManager::~SnippetModifierManager() {} + +void +SnippetModifierManager::setup(const QueryTermList & queryTerms, + const FieldSearchSpecMapT & specMap, + const IndexFieldMapT & indexMap) +{ + FieldQueryTermMap fqtm; + + // setup modifiers + for (QueryTermList::const_iterator i = queryTerms.begin(); i != queryTerms.end(); ++i) { + QueryTerm * qt = *i; + IndexFieldMapT::const_iterator j = indexMap.find(qt->index()); + if (j != indexMap.end()) { + for (FieldIdTList::const_iterator k = j->second.begin(); k != j->second.end(); ++k) { + FieldIdT fId = *k; + const FieldSearchSpec & spec = specMap.find(fId)->second; + if (spec.searcher().substring() || qt->isSubstring()) { // we need a modifier for this field id + addIfNotPresent(fqtm, fId, qt); + if (_modifiers.getModifier(fId) == NULL) { + LOG(debug, "Create snippet modifier for field id '%u'", fId); + UTF8SubstringSnippetModifier::SP searcher + (new UTF8SubstringSnippetModifier(fId, 
_searchModifyBuf, _searchOffsetBuf)); + _modifiers.map()[fId] = std::make_unique<SnippetModifier>(searcher, _modifierBuf); + } + } + } + } + } + + // prepare modifiers + for (auto & entry : _modifiers.map()) { + FieldIdT fId = entry.first; + SnippetModifier & smod = static_cast<SnippetModifier &>(*entry.second); + smod.getSearcher()->prepare(fqtm[fId], _searchBuf); + } +} + +} diff --git a/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.h b/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.h new file mode 100644 index 00000000000..4718ab8783a --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.h @@ -0,0 +1,110 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#pragma once + +#include "fieldsearchspec.h" +#include <vespa/vsm/common/charbuffer.h> +#include <vespa/vsm/common/document.h> +#include <vespa/vsm/common/fieldmodifier.h> +#include <vespa/vsm/searcher/utf8substringsnippetmodifier.h> +#include <vespa/document/fieldvalue/fieldvalue.h> +#include <vespa/document/fieldvalue/iteratorhandler.h> + +namespace vsm { + +/** + * This class is responsible for modifying field values where we have substring search and that are used + * as input to snippet generation. + * + * The class implements the FieldModifier interface to modify field values, and the IteratorHandler interface + * to traverse complex field values. Primitive field values are passed to the underlying searcher that is + * responsible for modifying the field value by inserting unit separators before and after matches. + * A group separator is inserted between primitive field values the same way as done by FlattenDocsumWriter. 
+ **/ +class SnippetModifier : public FieldModifier, public document::fieldvalue::IteratorHandler +{ +private: + UTF8SubstringSnippetModifier::SP _searcher; + CharBuffer::SP _valueBuf; // buffer to store the final modified field value + char _groupSep; + bool _useSep; + document::FieldPath _empty; + + void considerSeparator(); + // Inherrit doc from document::FieldValue::IteratorHandler + void onPrimitive(uint32_t, const Content & c) override; + void reset(); + +public: + /** + * Creates a new instance. + * + * @param searcher the searcher used to modify primitive field values. + **/ + SnippetModifier(const UTF8SubstringSnippetModifier::SP & searcher); + + /** + * Creates a new instance. + * + * @param searcher the searcher used to modify primitive field values. + * @param valueBuf the shared buffer used to store the final modified field value. + **/ + SnippetModifier(const UTF8SubstringSnippetModifier::SP & searcher, const CharBuffer::SP & valueBuf); + + ~SnippetModifier(); + + /** + * Modifies the complete given field value. + **/ + document::FieldValue::UP modify(const document::FieldValue & fv) override { + return modify(fv, _empty); + } + + /** + * Modifies the given field value by passing all primitive field values to the searcher and + * inserting group separators between them. A string field value is returned. + * The iterating of the field value is limited by the given field path. + * + * @param fv the field value to modify. + * @param path the field path used to iterate the field value. + * @return the new modified field value. + **/ + document::FieldValue::UP modify(const document::FieldValue & fv, + const document::FieldPath & path) override; + + const CharBuffer & getValueBuf() const { return *_valueBuf; } + const UTF8SubstringSnippetModifier::SP & getSearcher() const { return _searcher; } +}; + +/** + * This class manages a set of snippet modifiers. + * The modifiers are instantiated and prepared in the setup function. 
+ * This class also holds shared buffers that are used by the modifiers. + **/ +class SnippetModifierManager +{ +private: + FieldModifierMap _modifiers; + SharedSearcherBuf _searchBuf; + CharBuffer::SP _searchModifyBuf; + SharedOffsetBuffer _searchOffsetBuf; + CharBuffer::SP _modifierBuf; + +public: + SnippetModifierManager(); + ~SnippetModifierManager(); + + /** + * Setups snippet modifiers for all fields where we have substring search. + * + * @param queryTerms the query terms to take into consideration. + * @param specMap mapping from field id to search spec objects. + * @param fieldMap mapping from index (used in the query) to a list of field ids. + **/ + void setup(const search::streaming::QueryTermList & queryTerms, + const FieldSearchSpecMapT & specMap, const IndexFieldMapT & fieldMap); + + const FieldModifierMap & getModifiers() const { return _modifiers; } +}; + +} + diff --git a/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.cpp b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.cpp new file mode 100644 index 00000000000..5507532d4f3 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.cpp @@ -0,0 +1,194 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+
+#include "vsm-adapter.hpp"
+#include "docsumconfig.h"
+#include "i_matching_elements_filler.h"
+#include <vespa/searchlib/common/matching_elements.h>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".vsm.vsm-adapter");
+
+using search::docsummary::ResConfigEntry;
+using search::docsummary::KeywordExtractor;
+using search::MatchingElements;
+using config::ConfigSnapshot;
+
+namespace vsm {
+
+GetDocsumsStateCallback::GetDocsumsStateCallback() :
+    _summaryFeatures(),
+    _rankFeatures(),
+    _matching_elements_filler()
+{ }
+
+/**
+ * Copies the stored summary features (if any) into the docsum state and
+ * marks them as cached. The environment argument is not used.
+ */
+void GetDocsumsStateCallback::FillSummaryFeatures(GetDocsumsState * state, IDocsumEnvironment *)
+{
+    if (_summaryFeatures) { // set the summary features to write to the docsum
+        state->_summaryFeatures = _summaryFeatures;
+        state->_summaryFeaturesCached = true;
+    }
+}
+
+/**
+ * Copies the stored rank features (if any) into the docsum state.
+ * The environment argument is not used.
+ */
+void GetDocsumsStateCallback::FillRankFeatures(GetDocsumsState * state, IDocsumEnvironment *)
+{
+    if (_rankFeatures) { // set the rank features to write to the docsum
+        state->_rankFeatures = _rankFeatures;
+    }
+}
+
+/** Does nothing; both arguments are ignored. */
+void GetDocsumsStateCallback::FillDocumentLocations(GetDocsumsState *, IDocsumEnvironment *)
+{
+}
+
+/**
+ * Delegates to the installed matching elements filler. When no filler has
+ * been installed an empty MatchingElements object is returned instead.
+ */
+std::unique_ptr<MatchingElements>
+GetDocsumsStateCallback::fill_matching_elements(const search::MatchingElementsFields& fields)
+{
+    if (_matching_elements_filler) {
+        return _matching_elements_filler->fill_matching_elements(fields);
+    }
+    return std::make_unique<MatchingElements>();
+}
+
+/** Takes ownership of the filler used by fill_matching_elements(). */
+void
+GetDocsumsStateCallback::set_matching_elements_filler(std::unique_ptr<IMatchingElementsFiller> matching_elements_filler)
+{
+    _matching_elements_filler = std::move(matching_elements_filler);
+}
+
+GetDocsumsStateCallback::~GetDocsumsStateCallback() = default;
+
+DocsumTools::FieldSpec::FieldSpec() :
+    _outputName(),
+    _inputNames(),
+    _command(VsmsummaryConfig::Fieldmap::Command::NONE)
+{ }
+
+DocsumTools::FieldSpec::~FieldSpec() = default;
+
+DocsumTools::DocsumTools(std::unique_ptr<DynamicDocsumWriter> writer) :
+    _writer(std::move(writer)),
+    _juniper(),
+    _resultClass(),
+    _fieldSpecs()
+{ }
+
+
+DocsumTools::~DocsumTools() = default;
+
+/**
+ * Builds one FieldSpec per entry of the result class named by cfg->outputclass.
+ *
+ * For each entry, the first fieldmap item in cfg whose summary name equals the
+ * entry's bind name supplies the input (document) field names and the command.
+ * Entries without such an item use their own bind name as the only input.
+ *
+ * @param cfg vsmsummary config handle.
+ * @return false if cfg is empty; true otherwise. A missing result class is
+ *         only logged as a warning and still yields true, with no field
+ *         specs added.
+ */
+bool
+DocsumTools::obtainFieldNames(const FastS_VsmsummaryHandle &cfg)
+{
+    // cfg is dereferenced below, so it must be checked before anything else.
+    if (!cfg) {
+        LOG(warning, "no vsmsummary config available, cannot obtain field names");
+        return false;
+    }
+    uint32_t defaultSummaryId = getResultConfig()->LookupResultClassId(cfg->outputclass);
+    _resultClass = getResultConfig()->LookupResultClass(defaultSummaryId);
+    if (_resultClass == nullptr) {
+        LOG(warning, "could not locate result class: '%s'", cfg->outputclass.c_str());
+        return true;
+    }
+    for (uint32_t i = 0; i < _resultClass->GetNumEntries(); ++i) {
+        const ResConfigEntry * entry = _resultClass->GetEntry(i);
+        FieldSpec & spec = _fieldSpecs.emplace_back();
+        spec.setOutputName(entry->_bindname);
+        bool found = false;
+        // check if we have this summary field in the vsmsummary config
+        for (uint32_t j = 0; j < cfg->fieldmap.size() && !found; ++j) {
+            const auto & mapping = cfg->fieldmap[j];
+            if (entry->_bindname == mapping.summary.c_str()) {
+                for (uint32_t k = 0; k < mapping.document.size(); ++k) {
+                    spec.getInputNames().push_back(mapping.document[k].field);
+                }
+                spec.setCommand(mapping.command);
+                found = true;
+            }
+        }
+        if (!found) {
+            // use yourself as input
+            spec.getInputNames().push_back(entry->_bindname);
+        }
+    }
+    return true;
+}
+
+/**
+ * (Re-)configures the adapter from the given config snapshot while holding
+ * the adapter lock: latches a new fields config, builds a new docsum writer,
+ * juniper instance and DocsumTools, and latches the new tools.
+ *
+ * @throws std::runtime_error if the summary config cannot be read or the
+ *         field names cannot be obtained.
+ */
+void
+VSMAdapter::configure(const VSMConfigSnapshot & snapshot)
+{
+    std::lock_guard guard(_lock);
+    LOG(debug, "(re-)configure VSM (docsum tools)");
+
+    std::shared_ptr<SummaryConfig> summary(snapshot.getConfig<SummaryConfig>());
+    std::shared_ptr<SummarymapConfig> summaryMap(snapshot.getConfig<SummarymapConfig>());
+    std::shared_ptr<VsmsummaryConfig> vsmSummary(snapshot.getConfig<VsmsummaryConfig>());
+    std::shared_ptr<JuniperrcConfig> juniperrc(snapshot.getConfig<JuniperrcConfig>());
+
+    _fieldsCfg.set(snapshot.getConfig<VsmfieldsConfig>().release());
+    _fieldsCfg.latch();
+
+    // size() results are unsigned, hence %zu
+    LOG(debug, "configureFields(): Size of cfg fieldspec: %zu", _fieldsCfg.get()->fieldspec.size()); // UlfC: debugging
+    LOG(debug, "configureFields(): Size of cfg documenttype: %zu", _fieldsCfg.get()->documenttype.size()); // UlfC: debugging
+    LOG(debug, "configureSummary(): Size of cfg classes: %zu", summary->classes.size()); // UlfC: debugging
+    LOG(debug, "configureSummaryMap(): Size of cfg override: %zu", summaryMap->override.size()); // UlfC: debugging
+    LOG(debug, "configureVsmSummary(): Size of cfg fieldmap: %zu", vsmSummary->fieldmap.size()); // UlfC: debugging
+    LOG(debug, "configureVsmSummary(): outputclass='%s'", vsmSummary->outputclass.c_str()); // UlfC: debugging
+
+    // init result config
+    auto resCfg = std::make_unique<ResultConfig>();
+    if ( ! resCfg->ReadConfig(*summary, _configId.c_str())) {
+        throw std::runtime_error("(re-)configuration of VSM (docsum tools) failed due to bad summary config");
+    }
+
+    // init keyword extractor
+    auto kwExtractor = std::make_unique<KeywordExtractor>(nullptr);
+    kwExtractor->AddLegalIndexSpec(_highlightindexes.c_str());
+    vespalib::string spec = kwExtractor->GetLegalIndexSpec();
+    LOG(debug, "index highlight spec: '%s'", spec.c_str());
+
+    // create dynamic docsum writer (raw pointers are handed over to the writer)
+    auto writer = std::make_unique<DynamicDocsumWriter>(resCfg.release(), kwExtractor.release());
+
+    // configure juniper (used when configuring DynamicDocsumConfig)
+    // NOTE(review): this assignment destroys the previous JuniperProperties, while a
+    // previously latched DocsumTools (whose Juniper was constructed with a pointer to
+    // it) may still be handed out by getDocsumTools() - confirm that Juniper does not
+    // keep using that pointer after construction.
+    _juniperProps = std::make_unique<JuniperProperties>(*juniperrc);
+    auto juniper = std::make_unique<juniper::Juniper>(_juniperProps.get(), &_wordFolder);
+
+    // create new docsum tools
+    auto docsumTools = std::make_unique<DocsumTools>(std::move(writer));
+    docsumTools->setJuniper(std::move(juniper));
+
+    // configure dynamic docsum writer
+    DynamicDocsumConfig dynDocsumConfig(docsumTools.get(), docsumTools->getDocsumWriter(), _fieldsCfg.get());
+    dynDocsumConfig.configure(*summaryMap);
+
+    // configure new docsum tools
+    if (docsumTools->obtainFieldNames(vsmSummary)) {
+        // latch new docsum tools into production
+        _docsumTools.set(docsumTools.release());
+        _docsumTools.latch();
+    } else {
+        throw std::runtime_error("(re-)configuration of VSM (docsum tools) failed");
+    }
+}
+
+VSMConfigSnapshot::VSMConfigSnapshot(const vespalib::string & configId, const config::ConfigSnapshot & snapshot)
+    : _configId(configId),
+      _snapshot(std::make_unique<config::ConfigSnapshot>(snapshot))
+{ }
+VSMConfigSnapshot::~VSMConfigSnapshot() = default;
+
+VSMAdapter::VSMAdapter(const vespalib::string & highlightindexes, const vespalib::string & configId, Fast_WordFolder & wordFolder)
+    : _highlightindexes(highlightindexes),
+      _configId(configId),
+      _wordFolder(wordFolder),
+      _fieldsCfg(),
+      _docsumTools(),
+      _juniperProps(),
+      _lock()
+{
+}
+
+
+VSMAdapter::~VSMAdapter() = default;
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.h b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.h
new file mode 100644
index 00000000000..6484269353b
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.h
@@ -0,0 +1,132 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchlib/query/base.h>
+#include <vespa/vsm/config/vsm-cfif.h>
+#include <vespa/config-summary.h>
+#include <vespa/config-summarymap.h>
+#include <vespa/searchlib/common/featureset.h>
+#include <vespa/searchsummary/docsummary/docsumwriter.h>
+#include <vespa/searchsummary/docsummary/docsumstate.h>
+#include <vespa/searchsummary/docsummary/idocsumenvironment.h>
+#include <vespa/juniper/rpinterface.h>
+#include <memory>
+#include <mutex>
+#include <vector>
+
+using search::docsummary::ResultConfig;
+using search::docsummary::ResultClass;
+using search::docsummary::IDocsumWriter;
+using search::docsummary::DynamicDocsumWriter;
+using search::docsummary::GetDocsumsState;
+using search::docsummary::IDocsumEnvironment;
+using search::docsummary::JuniperProperties;
+
+using vespa::config::search::SummaryConfig;
+using vespa::config::search::SummarymapConfig;
+using vespa::config::search::summary::JuniperrcConfig;
+
+namespace config { class ConfigSnapshot; }
+namespace vsm {
+
+class IMatchingElementsFiller;
+
+/**
+ * Docsum state callback that hands out feature sets stored on it and
+ * delegates matching elements filling to an optional filler object.
+ */
+class GetDocsumsStateCallback : public search::docsummary::GetDocsumsStateCallback
+{
+private:
+    search::FeatureSet::SP                   _summaryFeatures;
+    search::FeatureSet::SP                   _rankFeatures;
+    std::unique_ptr<IMatchingElementsFiller> _matching_elements_filler;
+
+public:
+    GetDocsumsStateCallback();
+    void FillSummaryFeatures(GetDocsumsState * state, IDocsumEnvironment * env) override;
+    void FillRankFeatures(GetDocsumsState * state, IDocsumEnvironment * env) override;
+    // NOTE(review): declared virtual without override, unlike its siblings -
+    // confirm whether the base class still declares this function.
+    virtual void FillDocumentLocations(GetDocsumsState * state, IDocsumEnvironment * env);
+    std::unique_ptr<search::MatchingElements> fill_matching_elements(const search::MatchingElementsFields& fields) override;
+    void setSummaryFeatures(const search::FeatureSet::SP & sf) { _summaryFeatures = sf; }
+    void setRankFeatures(const search::FeatureSet::SP & rf) { _rankFeatures = rf; }
+    void set_matching_elements_filler(std::unique_ptr<IMatchingElementsFiller> matching_elements_filler);
+    ~GetDocsumsStateCallback();
+};
+
+/**
+ * Bundles the docsum writer, the juniper instance and the per-field
+ * specifications derived from the vsmsummary config. Not copyable.
+ */
+class DocsumTools : public IDocsumEnvironment
+{
+public:
+    /** Output (summary) field name with its input field names and command. */
+    class FieldSpec {
+    private:
+        vespalib::string                    _outputName;
+        std::vector<vespalib::string>       _inputNames;
+        VsmsummaryConfig::Fieldmap::Command _command;
+
+    public:
+        FieldSpec();
+        ~FieldSpec();
+        const vespalib::string & getOutputName() const { return _outputName; }
+        void setOutputName(const vespalib::string & name) { _outputName = name; }
+        const std::vector<vespalib::string> & getInputNames() const { return _inputNames; }
+        std::vector<vespalib::string> & getInputNames() { return _inputNames; }
+        VsmsummaryConfig::Fieldmap::Command getCommand() const { return _command; }
+        void setCommand(VsmsummaryConfig::Fieldmap::Command command) { _command = command; }
+    };
+
+private:
+    std::unique_ptr<DynamicDocsumWriter> _writer;
+    std::unique_ptr<juniper::Juniper>    _juniper;
+    const ResultClass                  * _resultClass;
+    std::vector<FieldSpec>               _fieldSpecs;
+
+public:
+    DocsumTools(std::unique_ptr<DynamicDocsumWriter> writer);
+    DocsumTools(const DocsumTools &) = delete;
+    DocsumTools &operator=(const DocsumTools &) = delete;
+    ~DocsumTools();
+    void setJuniper(std::unique_ptr<juniper::Juniper> juniper) { _juniper = std::move(juniper); }
+    ResultConfig *getResultConfig() const { return _writer->GetResultConfig(); }
+    DynamicDocsumWriter *getDocsumWriter() const { return _writer.get(); }
+    const ResultClass *getResultClass() const { return _resultClass; }
+    const std::vector<FieldSpec> & getFieldSpecs() const { return _fieldSpecs; }
+    bool obtainFieldNames(const FastS_VsmsummaryHandle &cfg);
+
+    // inherit doc from IDocsumEnvironment
+    search::IAttributeManager * getAttributeManager() override { return nullptr; }
+    vespalib::string lookupIndex(const vespalib::string&) const override { return ""; }
+    juniper::Juniper * getJuniper() override { return _juniper.get(); }
+};
+
+using DocsumToolsPtr = std::shared_ptr<DocsumTools>;
+
+/** Owns a copy of a config snapshot together with the config id used for lookups in it. */
+class VSMConfigSnapshot {
+private:
+    const vespalib::string _configId;
+    std::unique_ptr<const config::ConfigSnapshot> _snapshot;
+public:
+    VSMConfigSnapshot(const vespalib::string & configId, const config::ConfigSnapshot & snapshot);
+    ~VSMConfigSnapshot();
+    /** Returns the config of type ConfigType from the snapshot, looked up by the stored config id. */
+    template <typename ConfigType>
+    std::unique_ptr<ConfigType> getConfig() const;
+};
+
+/**
+ * Holds the currently latched fields config and docsum tools, and rebuilds
+ * them from a config snapshot in configure(). Not copyable.
+ */
+class VSMAdapter
+{
+public:
+    VSMAdapter(const vespalib::string & highlightindexes, const vespalib::string & configId, Fast_WordFolder & wordFolder);
+    VSMAdapter(const VSMAdapter &) = delete;
+    VSMAdapter &operator=(const VSMAdapter &) = delete;
+    virtual ~VSMAdapter();
+
+    VsmfieldsHandle getFieldsConfig() const { return _fieldsCfg.get(); }
+    DocsumToolsPtr getDocsumTools() const { return _docsumTools.get(); }
+    void configure(const VSMConfigSnapshot & snapshot);
+private:
+    vespalib::string                     _highlightindexes;
+    const vespalib::string               _configId;
+    Fast_WordFolder                    & _wordFolder;
+    vespalib::PtrHolder<VsmfieldsConfig> _fieldsCfg;
+    vespalib::PtrHolder<DocsumTools>     _docsumTools;
+    std::unique_ptr<JuniperProperties>   _juniperProps;
+
+    std::mutex                           _lock; // held for the whole of configure()
+};
+
+} // namespace vsm
+
diff --git a/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.hpp b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.hpp
new file mode 100644
index 00000000000..f071dbb2015
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.hpp
@@ -0,0 +1,18 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "vsm-adapter.h"
+#include <vespa/config/retriever/configsnapshot.hpp>
+
+namespace vsm {
+
+template <typename ConfigType>
+std::unique_ptr<ConfigType>
+VSMConfigSnapshot::getConfig() const
+{
+    return _snapshot->getConfig<ConfigType>(_configId);
+}
+
+} // namespace vsm
+
|