about summary refs log tree commit diff stats
path: root/streamingvisitors
diff options
context:
space:
mode:
author Henning Baldersheim <balder@yahoo-inc.com> 2022-05-15 00:41:35 +0200
committer GitHub <noreply@github.com> 2022-05-15 00:41:35 +0200
commit 4db8dcbf3395fd92b1348155142b85df5a754289 (patch)
tree 912b02e614bc9889ea3543893cbeb699971e8156 /streamingvisitors
parent 287a799b270200aca440cad376272328128a5054 (diff)
Revert "Revert "Collapse vsm into streamingvisitors""
Diffstat (limited to 'streamingvisitors')
-rw-r--r--streamingvisitors/CMakeLists.txt14
-rw-r--r--streamingvisitors/pom.xml44
-rw-r--r--streamingvisitors/src/tests/charbuffer/.gitignore4
-rw-r--r--streamingvisitors/src/tests/charbuffer/CMakeLists.txt8
-rw-r--r--streamingvisitors/src/tests/charbuffer/charbuffer.cpp80
-rw-r--r--streamingvisitors/src/tests/config/mail.cfg116
-rw-r--r--streamingvisitors/src/tests/config/vsm.cfg3
-rw-r--r--streamingvisitors/src/tests/config/vsmfields.cfg297
-rw-r--r--streamingvisitors/src/tests/docsum/.gitignore4
-rw-r--r--streamingvisitors/src/tests/docsum/CMakeLists.txt8
-rw-r--r--streamingvisitors/src/tests/docsum/docsum.cpp293
-rw-r--r--streamingvisitors/src/tests/document/.gitignore4
-rw-r--r--streamingvisitors/src/tests/document/CMakeLists.txt8
-rw-r--r--streamingvisitors/src/tests/document/document.cpp129
-rw-r--r--streamingvisitors/src/tests/hitcollector/CMakeLists.txt2
-rw-r--r--streamingvisitors/src/tests/matching_elements_filler/CMakeLists.txt2
-rw-r--r--streamingvisitors/src/tests/querywrapper/CMakeLists.txt2
-rw-r--r--streamingvisitors/src/tests/searcher/.gitignore4
-rw-r--r--streamingvisitors/src/tests/searcher/CMakeLists.txt8
-rw-r--r--streamingvisitors/src/tests/searcher/searcher_test.cpp864
-rw-r--r--streamingvisitors/src/tests/searchvisitor/CMakeLists.txt2
-rw-r--r--streamingvisitors/src/tests/textutil/.gitignore4
-rw-r--r--streamingvisitors/src/tests/textutil/CMakeLists.txt8
-rw-r--r--streamingvisitors/src/tests/textutil/textutil.cpp285
-rw-r--r--streamingvisitors/src/tests/utilapps/.gitignore0
-rw-r--r--streamingvisitors/src/vespa/searchvisitor/CMakeLists.txt6
-rw-r--r--streamingvisitors/src/vespa/vsm/.gitignore3
-rw-r--r--streamingvisitors/src/vespa/vsm/common/.gitignore5
-rw-r--r--streamingvisitors/src/vespa/vsm/common/CMakeLists.txt10
-rw-r--r--streamingvisitors/src/vespa/vsm/common/charbuffer.cpp32
-rw-r--r--streamingvisitors/src/vespa/vsm/common/charbuffer.h52
-rw-r--r--streamingvisitors/src/vespa/vsm/common/docsum.h22
-rw-r--r--streamingvisitors/src/vespa/vsm/common/document.cpp73
-rw-r--r--streamingvisitors/src/vespa/vsm/common/document.h68
-rw-r--r--streamingvisitors/src/vespa/vsm/common/documenttypemapping.cpp104
-rw-r--r--streamingvisitors/src/vespa/vsm/common/documenttypemapping.h54
-rw-r--r--streamingvisitors/src/vespa/vsm/common/fieldmodifier.cpp24
-rw-r--r--streamingvisitors/src/vespa/vsm/common/fieldmodifier.h58
-rw-r--r--streamingvisitors/src/vespa/vsm/common/storagedocument.cpp81
-rw-r--r--streamingvisitors/src/vespa/vsm/common/storagedocument.h59
-rw-r--r--streamingvisitors/src/vespa/vsm/config/.gitignore4
-rw-r--r--streamingvisitors/src/vespa/vsm/config/CMakeLists.txt11
-rw-r--r--streamingvisitors/src/vespa/vsm/config/vsm-cfif.h25
-rw-r--r--streamingvisitors/src/vespa/vsm/config/vsm.def13
-rw-r--r--streamingvisitors/src/vespa/vsm/config/vsmfields.def31
-rw-r--r--streamingvisitors/src/vespa/vsm/config/vsmsummary.def21
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/.gitignore5
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt28
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp56
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h21
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp301
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h147
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp70
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h53
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/fold.cpp153
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/fold.h12
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp310
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h26
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp78
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h28
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp49
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h33
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp56
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h22
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp33
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h25
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp69
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h35
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp56
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h25
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp320
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h138
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp59
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h23
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp144
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h72
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp54
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h25
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/.gitignore5
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt14
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/docsumconfig.cpp75
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/docsumconfig.h29
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.cpp35
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.h72
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp477
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/docsumfilter.h90
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp334
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h98
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.cpp45
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.h36
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/i_matching_elements_filler.h24
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.cpp220
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.h57
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.cpp136
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.h110
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.cpp194
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.h132
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.hpp18
98 files changed, 7669 insertions, 7 deletions
diff --git a/streamingvisitors/CMakeLists.txt b/streamingvisitors/CMakeLists.txt
index 5f5e16fc6c3..77ce4b471c0 100644
--- a/streamingvisitors/CMakeLists.txt
+++ b/streamingvisitors/CMakeLists.txt
@@ -7,17 +7,27 @@ vespa_define_module(
storage
storageapi
config_cloudconfig
- document
vespalib
+ document
vdslib
- vsm
+ searchlib
+ searchsummary
LIBS
src/vespa/searchvisitor
+ src/vespa/vsm/common
+ src/vespa/vsm/config
+ src/vespa/vsm/searcher
+ src/vespa/vsm/vsm
TESTS
src/tests/hitcollector
src/tests/matching_elements_filler
src/tests/querywrapper
src/tests/searchvisitor
+ src/tests/charbuffer
+ src/tests/docsum
+ src/tests/document
+ src/tests/searcher
+ src/tests/textutil
)
diff --git a/streamingvisitors/pom.xml b/streamingvisitors/pom.xml
new file mode 100644
index 00000000000..2cc777be593
--- /dev/null
+++ b/streamingvisitors/pom.xml
@@ -0,0 +1,44 @@
+<!-- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+ xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
+ http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>parent</artifactId>
+ <version>7-SNAPSHOT</version>
+ <relativePath>../parent/pom.xml</relativePath>
+ </parent>
+ <artifactId>vsm</artifactId>
+ <version>7-SNAPSHOT</version>
+ <packaging>jar</packaging>
+ <name>${project.artifactId}</name>
+ <dependencies>
+ <dependency>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>config-lib</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>com.yahoo.vespa</groupId>
+ <artifactId>config-class-plugin</artifactId>
+ <version>${project.version}</version>
+ <configuration>
+ <defFilesDirectories>src/vespa/vsm/config/</defFilesDirectories>
+ </configuration>
+ <executions>
+ <execution>
+ <id>config-gen</id>
+ <goals>
+ <goal>config-gen</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+</project>
diff --git a/streamingvisitors/src/tests/charbuffer/.gitignore b/streamingvisitors/src/tests/charbuffer/.gitignore
new file mode 100644
index 00000000000..2c980038fb5
--- /dev/null
+++ b/streamingvisitors/src/tests/charbuffer/.gitignore
@@ -0,0 +1,4 @@
+.depend
+Makefile
+charbuffer_test
+vsm_charbuffer_test_app
diff --git a/streamingvisitors/src/tests/charbuffer/CMakeLists.txt b/streamingvisitors/src/tests/charbuffer/CMakeLists.txt
new file mode 100644
index 00000000000..5d0c0068d37
--- /dev/null
+++ b/streamingvisitors/src/tests/charbuffer/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(vsm_charbuffer_test_app TEST
+ SOURCES
+ charbuffer.cpp
+ DEPENDS
+ streamingvisitors
+)
+vespa_add_test(NAME vsm_charbuffer_test_app COMMAND vsm_charbuffer_test_app)
diff --git a/streamingvisitors/src/tests/charbuffer/charbuffer.cpp b/streamingvisitors/src/tests/charbuffer/charbuffer.cpp
new file mode 100644
index 00000000000..736d35459cb
--- /dev/null
+++ b/streamingvisitors/src/tests/charbuffer/charbuffer.cpp
@@ -0,0 +1,80 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/vespalib/testkit/testapp.h>
+
+#include <vespa/vsm/common/charbuffer.h>
+
+namespace vsm {
+
+class CharBufferTest : public vespalib::TestApp // test application exercising vsm::CharBuffer
+{
+private:
+ void test(); // runs every CharBuffer expectation in this file
+public:
+ int Main() override; // test entry point; the class is registered with TEST_APPHOOK at the end of the file
+};
+
+void
+CharBufferTest::test() // checks length/position/remaining bookkeeping of CharBuffer across construction, resize, put and reset
+{
+ { // default-constructed buffer: length, position and remaining space are all expected to be zero
+ CharBuffer buf;
+ EXPECT_EQUAL(buf.getLength(), 0u);
+ EXPECT_EQUAL(buf.getPos(), 0u);
+ EXPECT_EQUAL(buf.getRemaining(), 0u);
+ }
+ { // buffer constructed with an explicit length of 8: nothing written yet, so all 8 bytes remain
+ CharBuffer buf(8);
+ EXPECT_EQUAL(buf.getLength(), 8u);
+ EXPECT_EQUAL(buf.getPos(), 0u);
+ EXPECT_EQUAL(buf.getRemaining(), 8u);
+ }
+ { // resize: growing is honoured, a request for a smaller size leaves the length unchanged
+ CharBuffer buf(8);
+ EXPECT_EQUAL(buf.getLength(), 8u);
+ buf.resize(16);
+ EXPECT_EQUAL(buf.getLength(), 16u);
+ buf.resize(8); // asking for less than the current length ...
+ EXPECT_EQUAL(buf.getLength(), 16u); // ... is expected to keep the length at 16
+ }
+ { // put: writes advance the position and a write that does not fit is expected to grow the buffer
+ CharBuffer buf(8);
+ buf.put("123456", 6); // 6 bytes fit within the initial 8
+ EXPECT_EQUAL(buf.getLength(), 8u);
+ EXPECT_EQUAL(buf.getPos(), 6u);
+ EXPECT_EQUAL(buf.getRemaining(), 2u);
+ EXPECT_EQUAL(std::string(buf.getBuffer(), buf.getPos()), "123456");
+ buf.put("789", 3); // 3 bytes exceed the 2 remaining, so the length is expected to grow (to 12)
+ EXPECT_EQUAL(buf.getLength(), 12u);
+ EXPECT_EQUAL(buf.getPos(), 9u);
+ EXPECT_EQUAL(buf.getRemaining(), 3u);
+ EXPECT_EQUAL(std::string(buf.getBuffer(), buf.getPos()), "123456789"); // earlier content must survive the growth
+ buf.put('a'); // single-character overload; fits without growing
+ EXPECT_EQUAL(buf.getLength(), 12u);
+ EXPECT_EQUAL(buf.getPos(), 10u);
+ EXPECT_EQUAL(buf.getRemaining(), 2u);
+ EXPECT_EQUAL(std::string(buf.getBuffer(), buf.getPos()), "123456789a");
+ buf.reset(); // expected to rewind the position to 0 while keeping the length at 12
+ EXPECT_EQUAL(buf.getLength(), 12u);
+ EXPECT_EQUAL(buf.getPos(), 0u);
+ EXPECT_EQUAL(buf.getRemaining(), 12u);
+ buf.put("bcd", 3); // writing after reset starts again from the beginning of the buffer
+ EXPECT_EQUAL(buf.getLength(), 12u);
+ EXPECT_EQUAL(buf.getPos(), 3u);
+ EXPECT_EQUAL(buf.getRemaining(), 9u);
+ EXPECT_EQUAL(std::string(buf.getBuffer(), buf.getPos()), "bcd");
+ }
+}
+
+int
+CharBufferTest::Main() // runs the single test() method between the testkit init/done macros
+{
+ TEST_INIT("charbuffer_test"); // passes "charbuffer_test" to the testkit as this run's name
+
+ test();
+
+ TEST_DONE(); // NOTE(review): presumably supplies Main()'s return value - confirm against the testkit macro
+}
+
+}
+
+TEST_APPHOOK(vsm::CharBufferTest);
diff --git a/streamingvisitors/src/tests/config/mail.cfg b/streamingvisitors/src/tests/config/mail.cfg
new file mode 100644
index 00000000000..ce830beac23
--- /dev/null
+++ b/streamingvisitors/src/tests/config/mail.cfg
@@ -0,0 +1,116 @@
+datatype[2]
+datatype[0].id 1012
+datatype[0].arraytype[1]
+datatype[0].arraytype[0].datatype 12
+datatype[1].id 1013
+datatype[1].arraytype[1]
+datatype[1].arraytype[0].datatype 13
+documenttype[1]
+documenttype[0].name mail
+documenttype[0].version 0
+documenttype[0].inherits[0]
+documenttype[0].field[26]
+documenttype[0].field[0].name mailid
+documenttype[0].field[0].id 2
+documenttype[0].field[0].header true
+documenttype[0].field[0].datatype 2
+documenttype[0].field[1].name date
+documenttype[0].field[1].id 3
+documenttype[0].field[1].header true
+documenttype[0].field[1].datatype 0
+documenttype[0].field[2].name from
+documenttype[0].field[2].id 4
+documenttype[0].field[2].header true
+documenttype[0].field[2].datatype 12
+documenttype[0].field[3].name replyto
+documenttype[0].field[3].id 5
+documenttype[0].field[3].header true
+documenttype[0].field[3].datatype 12
+documenttype[0].field[4].name to
+documenttype[0].field[4].id 6
+documenttype[0].field[4].header true
+documenttype[0].field[4].datatype 12
+documenttype[0].field[5].name cc
+documenttype[0].field[5].id 7
+documenttype[0].field[5].header true
+documenttype[0].field[5].datatype 12
+documenttype[0].field[6].name bcc
+documenttype[0].field[6].id 8
+documenttype[0].field[6].header true
+documenttype[0].field[6].datatype 12
+documenttype[0].field[7].name subject
+documenttype[0].field[7].id 9
+documenttype[0].field[7].header true
+documenttype[0].field[7].datatype 12
+documenttype[0].field[8].name body
+documenttype[0].field[8].id 10
+documenttype[0].field[8].header false
+documenttype[0].field[8].datatype 12
+documenttype[0].field[9].name attachmentcount
+documenttype[0].field[9].id 11
+documenttype[0].field[9].header false
+documenttype[0].field[9].datatype 0
+documenttype[0].field[10].name attachmentpartids
+documenttype[0].field[10].id 12
+documenttype[0].field[10].header false
+documenttype[0].field[10].datatype 2
+documenttype[0].field[11].name attachmentsizes
+documenttype[0].field[11].id 13
+documenttype[0].field[11].header false
+documenttype[0].field[11].datatype 2
+documenttype[0].field[12].name attachmentnames
+documenttype[0].field[12].id 14
+documenttype[0].field[12].header false
+documenttype[0].field[12].datatype 2
+documenttype[0].field[13].name attachmenttypes
+documenttype[0].field[13].id 15
+documenttype[0].field[13].header false
+documenttype[0].field[13].datatype 2
+documenttype[0].field[14].name attachmentlanguages
+documenttype[0].field[14].id 16
+documenttype[0].field[14].header false
+documenttype[0].field[14].datatype 2
+documenttype[0].field[15].name attachmentcontent
+documenttype[0].field[15].id 17
+documenttype[0].field[15].header false
+documenttype[0].field[15].datatype 2
+documenttype[0].field[16].name bodylanguage
+documenttype[0].field[16].id 18
+documenttype[0].field[16].header false
+documenttype[0].field[16].datatype 2
+documenttype[0].field[17].name bodyencoding
+documenttype[0].field[17].id 19
+documenttype[0].field[17].header false
+documenttype[0].field[17].datatype 2
+documenttype[0].field[18].name collectionid
+documenttype[0].field[18].id 20
+documenttype[0].field[18].header true
+documenttype[0].field[18].datatype 4
+documenttype[0].field[19].name content
+documenttype[0].field[19].id 21
+documenttype[0].field[19].header true
+documenttype[0].field[19].datatype 12
+documenttype[0].field[20].name bodymeta
+documenttype[0].field[20].id 50027053
+documenttype[0].field[20].header false
+documenttype[0].field[20].datatype 13
+documenttype[0].field[21].name attachments
+documenttype[0].field[21].id 1081629685
+documenttype[0].field[21].header false
+documenttype[0].field[21].datatype 1012
+documenttype[0].field[22].name attachmentsmeta
+documenttype[0].field[22].id 1203055625
+documenttype[0].field[22].header false
+documenttype[0].field[22].datatype 1013
+documenttype[0].field[23].name tolist
+documenttype[0].field[23].id 1084918181
+documenttype[0].field[23].header false
+documenttype[0].field[23].datatype 1012
+documenttype[0].field[24].name cclist
+documenttype[0].field[24].id 1733332403
+documenttype[0].field[24].header false
+documenttype[0].field[24].datatype 1012
+documenttype[0].field[25].name bcclist
+documenttype[0].field[25].id 410546306
+documenttype[0].field[25].header false
+documenttype[0].field[25].datatype 1012
diff --git a/streamingvisitors/src/tests/config/vsm.cfg b/streamingvisitors/src/tests/config/vsm.cfg
new file mode 100644
index 00000000000..dc50447f623
--- /dev/null
+++ b/streamingvisitors/src/tests/config/vsm.cfg
@@ -0,0 +1,3 @@
+doctype file:../config/mail.cfg
+storagecfg ""
+vsmfields file:../config/vsmfields.cfg
diff --git a/streamingvisitors/src/tests/config/vsmfields.cfg b/streamingvisitors/src/tests/config/vsmfields.cfg
new file mode 100644
index 00000000000..30f1c8ed8b1
--- /dev/null
+++ b/streamingvisitors/src/tests/config/vsmfields.cfg
@@ -0,0 +1,297 @@
+threadsperquery 4
+documentverificationlevel=0
+searchall 1
+fieldspec[17]
+fieldspec[0].name bcc
+fieldspec[0].searchmethod AUTOUTF8
+fieldspec[0].arg1 ""
+fieldspec[1].name cc
+fieldspec[1].searchmethod AUTOUTF8
+fieldspec[1].arg1 ""
+fieldspec[2].name from
+fieldspec[2].searchmethod AUTOUTF8
+fieldspec[2].arg1 ""
+fieldspec[3].name date
+fieldspec[3].searchmethod INT32
+fieldspec[3].arg1 ""
+fieldspec[4].name replyto
+fieldspec[4].searchmethod AUTOUTF8
+fieldspec[4].arg1 ""
+fieldspec[5].name subject
+fieldspec[5].searchmethod AUTOUTF8
+fieldspec[5].arg1 ""
+fieldspec[6].name to
+fieldspec[6].searchmethod AUTOUTF8
+fieldspec[6].arg1 ""
+fieldspec[7].name body
+fieldspec[7].searchmethod AUTOUTF8
+fieldspec[7].arg1 ""
+fieldspec[8].name bodymeta
+fieldspec[8].searchmethod AUTOUTF8
+fieldspec[8].arg1 ""
+fieldspec[9].name mailid
+fieldspec[9].searchmethod AUTOUTF8
+fieldspec[9].arg1 ""
+fieldspec[10].name attachmentcount
+fieldspec[10].searchmethod INT32
+fieldspec[10].arg1 ""
+fieldspec[11].name attachmentcontent
+fieldspec[11].searchmethod AUTOUTF8
+fieldspec[11].arg1 ""
+fieldspec[12].name attachmenttypes
+fieldspec[12].searchmethod AUTOUTF8
+fieldspec[12].arg1 ""
+fieldspec[13].name attachmentnames
+fieldspec[13].searchmethod AUTOUTF8
+fieldspec[13].arg1 ""
+fieldspec[14].name attachmentlanguages
+fieldspec[14].searchmethod AUTOUTF8
+fieldspec[14].arg1 ""
+fieldspec[15].name URI
+fieldspec[15].searchmethod AUTOUTF8
+fieldspec[15].arg1 ""
+fieldspec[16].name vsm_whichfieldmatched
+fieldspec[16].searchmethod AUTOUTF8
+fieldspec[16].arg1 ""
+index[26]
+index[0].name default
+index[0].field[10]
+index[0].field[0].name from
+index[0].field[1].name to
+index[0].field[2].name cc
+index[0].field[3].name bcc
+index[0].field[4].name subject
+index[0].field[5].name body
+index[0].field[6].name attachmentcontent
+index[0].field[7].name attachmentnames
+index[0].field[8].name attachmenttypes
+index[0].field[9].name date
+index[1].name all
+index[1].field[8]
+index[1].field[0].name to
+index[1].field[1].name cc
+index[1].field[2].name bcc
+index[1].field[3].name subject
+index[1].field[4].name body
+index[1].field[5].name attachmentcontent
+index[1].field[6].name attachmentnames
+index[1].field[7].name attachmenttypes
+index[2].name header
+index[2].field[6]
+index[2].field[0].name from
+index[2].field[1].name replyto
+index[2].field[2].name to
+index[2].field[3].name cc
+index[2].field[4].name bcc
+index[2].field[5].name subject
+index[3].name senders
+index[3].field[2]
+index[3].field[0].name from
+index[3].field[1].name replyto
+index[4].name recipients
+index[4].field[3]
+index[4].field[0].name to
+index[4].field[1].name cc
+index[4].field[2].name bcc
+index[5].name address
+index[5].field[5]
+index[5].field[0].name from
+index[5].field[1].name replyto
+index[5].field[2].name to
+index[5].field[3].name cc
+index[5].field[4].name bcc
+index[6].name body
+index[6].field[2]
+index[6].field[0].name subject
+index[6].field[1].name body
+index[7].name meta
+index[7].field[2]
+index[7].field[0].name attachmentcontent
+index[7].field[1].name attachmenttypes
+index[8].name index1
+index[8].field[1]
+index[8].field[0].name bcc
+index[9].name index2
+index[9].field[2]
+index[9].field[0].name bcc
+index[9].field[1].name cc
+index[10].name index3
+index[10].field[3]
+index[10].field[0].name bcc
+index[10].field[1].name cc
+index[10].field[2].name from
+index[11].name index4
+index[11].field[4]
+index[11].field[0].name bcc
+index[11].field[1].name cc
+index[11].field[2].name from
+index[11].field[3].name date
+index[12].name index5
+index[12].field[5]
+index[12].field[0].name bcc
+index[12].field[1].name cc
+index[12].field[2].name from
+index[12].field[3].name date
+index[12].field[4].name replyto
+index[13].name index6
+index[13].field[6]
+index[13].field[0].name bcc
+index[13].field[1].name cc
+index[13].field[2].name from
+index[13].field[3].name date
+index[13].field[4].name replyto
+index[13].field[5].name subject
+index[14].name index7
+index[14].field[7]
+index[14].field[0].name bcc
+index[14].field[1].name cc
+index[14].field[2].name from
+index[14].field[3].name date
+index[14].field[4].name replyto
+index[14].field[5].name subject
+index[14].field[6].name to
+index[15].name index8
+index[15].field[8]
+index[15].field[0].name bcc
+index[15].field[1].name cc
+index[15].field[2].name from
+index[15].field[3].name date
+index[15].field[4].name replyto
+index[15].field[5].name subject
+index[15].field[6].name to
+index[15].field[7].name body
+index[16].name index9
+index[16].field[9]
+index[16].field[0].name bcc
+index[16].field[1].name cc
+index[16].field[2].name from
+index[16].field[3].name date
+index[16].field[4].name replyto
+index[16].field[5].name subject
+index[16].field[6].name to
+index[16].field[7].name body
+index[16].field[8].name bodymeta
+index[17].name index10
+index[17].field[10]
+index[17].field[0].name bcc
+index[17].field[1].name cc
+index[17].field[2].name from
+index[17].field[3].name date
+index[17].field[4].name replyto
+index[17].field[5].name subject
+index[17].field[6].name to
+index[17].field[7].name body
+index[17].field[8].name bodymeta
+index[17].field[9].name mailid
+index[18].name index11
+index[18].field[11]
+index[18].field[0].name bcc
+index[18].field[1].name cc
+index[18].field[2].name from
+index[18].field[3].name date
+index[18].field[4].name replyto
+index[18].field[5].name subject
+index[18].field[6].name to
+index[18].field[7].name body
+index[18].field[8].name bodymeta
+index[18].field[9].name mailid
+index[18].field[10].name attachmentcount
+index[19].name index12
+index[19].field[12]
+index[19].field[0].name bcc
+index[19].field[1].name cc
+index[19].field[2].name from
+index[19].field[3].name date
+index[19].field[4].name replyto
+index[19].field[5].name subject
+index[19].field[6].name to
+index[19].field[7].name body
+index[19].field[8].name bodymeta
+index[19].field[9].name mailid
+index[19].field[10].name attachmentcount
+index[19].field[11].name attachmentcontent
+index[20].name index13
+index[20].field[13]
+index[20].field[0].name bcc
+index[20].field[1].name cc
+index[20].field[2].name from
+index[20].field[3].name date
+index[20].field[4].name replyto
+index[20].field[5].name subject
+index[20].field[6].name to
+index[20].field[7].name body
+index[20].field[8].name bodymeta
+index[20].field[9].name mailid
+index[20].field[10].name attachmentcount
+index[20].field[11].name attachmentcontent
+index[20].field[12].name attachmenttypes
+index[21].name index14
+index[21].field[14]
+index[21].field[0].name bcc
+index[21].field[1].name cc
+index[21].field[2].name from
+index[21].field[3].name date
+index[21].field[4].name replyto
+index[21].field[5].name subject
+index[21].field[6].name to
+index[21].field[7].name body
+index[21].field[8].name bodymeta
+index[21].field[9].name mailid
+index[21].field[10].name attachmentcount
+index[21].field[11].name attachmentcontent
+index[21].field[12].name attachmenttypes
+index[21].field[13].name attachmentnames
+index[22].name index15
+index[22].field[15]
+index[22].field[0].name bcc
+index[22].field[1].name cc
+index[22].field[2].name from
+index[22].field[3].name date
+index[22].field[4].name replyto
+index[22].field[5].name subject
+index[22].field[6].name to
+index[22].field[7].name body
+index[22].field[8].name bodymeta
+index[22].field[9].name mailid
+index[22].field[10].name attachmentcount
+index[22].field[11].name attachmentcontent
+index[22].field[12].name attachmenttypes
+index[22].field[13].name attachmentnames
+index[22].field[14].name attachmentlanguages
+index[23].name index16
+index[23].field[15]
+index[23].field[0].name bcc
+index[23].field[1].name cc
+index[23].field[2].name from
+index[23].field[3].name date
+index[23].field[4].name replyto
+index[23].field[5].name subject
+index[23].field[6].name to
+index[23].field[7].name body
+index[23].field[8].name bodymeta
+index[23].field[9].name mailid
+index[23].field[10].name attachmentcount
+index[23].field[11].name attachmentcontent
+index[23].field[12].name attachmenttypes
+index[23].field[13].name attachmentnames
+index[23].field[14].name attachmentlanguages
+index[24].name index17
+index[24].field[15]
+index[24].field[0].name bcc
+index[24].field[1].name cc
+index[24].field[2].name from
+index[24].field[3].name date
+index[24].field[4].name replyto
+index[24].field[5].name subject
+index[24].field[6].name to
+index[24].field[7].name body
+index[24].field[8].name bodymeta
+index[24].field[9].name mailid
+index[24].field[10].name attachmentcount
+index[24].field[11].name attachmentcontent
+index[24].field[12].name attachmenttypes
+index[24].field[13].name attachmentnames
+index[24].field[14].name attachmentlanguages
+index[25].name date
+index[25].field[1]
+index[25].field[0].name date
diff --git a/streamingvisitors/src/tests/docsum/.gitignore b/streamingvisitors/src/tests/docsum/.gitignore
new file mode 100644
index 00000000000..9a697a94de8
--- /dev/null
+++ b/streamingvisitors/src/tests/docsum/.gitignore
@@ -0,0 +1,4 @@
+.depend
+Makefile
+docsum_test
+vsm_docsum_test_app
diff --git a/streamingvisitors/src/tests/docsum/CMakeLists.txt b/streamingvisitors/src/tests/docsum/CMakeLists.txt
new file mode 100644
index 00000000000..87c46409053
--- /dev/null
+++ b/streamingvisitors/src/tests/docsum/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(vsm_docsum_test_app TEST
+ SOURCES
+ docsum.cpp
+ DEPENDS
+ streamingvisitors
+)
+vespa_add_test(NAME vsm_docsum_test_app COMMAND vsm_docsum_test_app)
diff --git a/streamingvisitors/src/tests/docsum/docsum.cpp b/streamingvisitors/src/tests/docsum/docsum.cpp
new file mode 100644
index 00000000000..475489d2f5a
--- /dev/null
+++ b/streamingvisitors/src/tests/docsum/docsum.cpp
@@ -0,0 +1,293 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/vespalib/testkit/testapp.h>
+#include <vespa/document/fieldvalue/fieldvalues.h>
+#include <vespa/document/datatype/structdatatype.h>
+#include <vespa/document/datatype/weightedsetdatatype.h>
+#include <vespa/document/datatype/mapdatatype.h>
+#include <vespa/vsm/common/docsum.h>
+#include <vespa/vsm/vsm/flattendocsumwriter.h>
+#include <vespa/vsm/vsm/slimefieldwriter.h>
+
+using namespace document;
+
+namespace vsm {
+
+/**
+ * std::vector with a chainable add(), so that fixtures can be written as
+ * Vector<T>().add(a).add(b).
+ */
+template <typename T>
+class Vector : public std::vector<T>
+{
+public:
+    /** Appends a copy of v and returns this container so calls can be chained. */
+    Vector & add(T v) {
+        std::vector<T>::push_back(v);
+        return *this;
+    }
+};
+
+// Fixture shorthands: a list of strings, and a list of (string, weight) pairs.
+using StringList = Vector<std::string>;
+using WeightedStringList = Vector<std::pair<std::string, int32_t>>;
+
+
+/**
+ * Minimal vsm::Document implementation that keeps its field values in a
+ * vector indexed by field id.
+ */
+class TestDocument : public vsm::Document
+{
+private:
+    std::vector<FieldValueContainer> _fields;
+
+public:
+    TestDocument(const search::DocumentIdT & docId, size_t numFields)
+        : vsm::Document(docId, numFields),
+          _fields(numFields)
+    {}
+
+    /** Stores fv in slot fId. Returns false, storing nothing, when fId is out of range. */
+    bool setField(FieldIdT fId, document::FieldValue::UP fv) override {
+        if (fId >= _fields.size()) {
+            return false;
+        }
+        _fields[fId].reset(fv.release());
+        return true;
+    }
+
+    /** Returns the value held in slot fId, or nullptr when fId is out of range. */
+    const document::FieldValue * getField(FieldIdT fId) const override {
+        if (fId >= _fields.size()) {
+            return nullptr;
+        }
+        return _fields[fId].get();
+    }
+};
+
+
+/**
+ * Test application exercising FlattenDocsumWriter and SlimeFieldWriter.
+ */
+class DocsumTest : public vespalib::TestApp
+{
+private:
+    // Fixture builders: turn plain lists into document field values.
+    ArrayFieldValue createFieldValue(const StringList & fv);
+    WeightedSetFieldValue createFieldValue(const WeightedStringList & fv);
+
+    // Checks fv against exp using a freshly constructed FlattenDocsumWriter.
+    void assertFlattenDocsumWriter(const FieldValue & fv, const std::string & exp) {
+        FlattenDocsumWriter fdw;
+        assertFlattenDocsumWriter(fdw, fv, exp);
+    }
+    // Checks fv against exp using the caller's writer, which keeps its state between calls.
+    void assertFlattenDocsumWriter(FlattenDocsumWriter & fdw, const FieldValue & fv, const std::string & exp);
+
+    // Checks fv against the JSON in exp using a freshly constructed SlimeFieldWriter.
+    void assertSlimeFieldWriter(const FieldValue & fv, const std::string & exp) {
+        SlimeFieldWriter sfw;
+        TEST_DO(assertSlimeFieldWriter(sfw, fv, exp));
+    }
+    // Checks fv against the JSON in exp using the caller's writer.
+    void assertSlimeFieldWriter(SlimeFieldWriter & sfw, const FieldValue & fv, const std::string & exp);
+
+    // Test cases, all invoked from Main().
+    void testFlattenDocsumWriter();
+    void testSlimeFieldWriter();
+    void requireThatSlimeFieldWriterHandlesMap();
+
+public:
+    int Main() override;
+};
+
+// Builds an array-of-string field value holding the entries of fv, added in list order.
+ArrayFieldValue
+DocsumTest::createFieldValue(const StringList & fv)
+{
+    static ArrayDataType type(*DataType::STRING);
+    ArrayFieldValue result(type);
+    for (const std::string & item : fv) {
+        result.add(StringFieldValue(item));
+    }
+    return result;
+}
+
+// Builds a weighted-set-of-string field value from (string, weight) pairs, added in list order.
+WeightedSetFieldValue
+DocsumTest::createFieldValue(const WeightedStringList & fv)
+{
+    static WeightedSetDataType type(*DataType::STRING, false, false);
+    WeightedSetFieldValue result(type);
+    for (const auto & entry : fv) {
+        result.add(StringFieldValue(entry.first), entry.second);
+    }
+    return result;
+}
+
+// Feeds fv through fdw (iterating with an empty field path) and expects the
+// bytes written so far, i.e. buffer[0, pos), to equal exp.
+void
+DocsumTest::assertFlattenDocsumWriter(FlattenDocsumWriter & fdw, const FieldValue & fv, const std::string & exp)
+{
+    FieldPath emptyPath;
+    fv.iterateNested(emptyPath.getFullRange(), fdw);
+    const auto & written = fdw.getResult();
+    std::string actual(written.getBuffer(), written.getPos());
+    EXPECT_EQUAL(actual, exp);
+}
+
+// Converts fv with sfw and expects the writer's output to equal exp.
+// The writer output is decoded with BinaryFormat, the expectation with JsonFormat,
+// and the two resulting Slime objects are compared.
+void
+DocsumTest::assertSlimeFieldWriter(SlimeFieldWriter & sfw, const FieldValue & fv, const std::string & exp)
+{
+ sfw.convert(fv);
+
+ vespalib::Slime gotSlime;
+ vespalib::Memory serialized(sfw.out());
+ size_t decodeRes = vespalib::slime::BinaryFormat::decode(serialized, gotSlime);
+ // the decoder must have consumed the complete writer output
+ ASSERT_EQUAL(decodeRes, serialized.size);
+
+ vespalib::Slime expSlime;
+ size_t used = vespalib::slime::JsonFormat::decode(exp, expSlime);
+ // guards against a malformed expectation string in the test itself
+ EXPECT_TRUE(used > 0);
+ EXPECT_EQUAL(expSlime, gotSlime);
+}
+
+// Covers FlattenDocsumWriter for single values, for repeated use of one writer,
+// and for growth of its output buffer.
+void
+DocsumTest::testFlattenDocsumWriter()
+{
+ { // basic tests
+ TEST_DO(assertFlattenDocsumWriter(StringFieldValue("foo bar"), "foo bar"));
+ TEST_DO(assertFlattenDocsumWriter(RawFieldValue("foo bar"), "foo bar"));
+ TEST_DO(assertFlattenDocsumWriter(BoolFieldValue(true), "true"));
+ TEST_DO(assertFlattenDocsumWriter(BoolFieldValue(false), "false"));
+ TEST_DO(assertFlattenDocsumWriter(LongFieldValue(123456789), "123456789"));
+ // array elements are expected to be joined by a single space, keeping their own whitespace
+ TEST_DO(assertFlattenDocsumWriter(createFieldValue(StringList().add("foo bar").add("baz").add(" qux ")),
+ "foo bar baz qux "));
+ }
+ { // test multiple invocations
+ // output accumulates across calls, joined by the separator passed to the constructor
+ FlattenDocsumWriter fdw("#");
+ TEST_DO(assertFlattenDocsumWriter(fdw, StringFieldValue("foo"), "foo"));
+ TEST_DO(assertFlattenDocsumWriter(fdw, StringFieldValue("bar"), "foo#bar"));
+ fdw.clear();
+ // after clear() the expected separator is a single space, no longer "#"
+ TEST_DO(assertFlattenDocsumWriter(fdw, StringFieldValue("baz"), "baz"));
+ TEST_DO(assertFlattenDocsumWriter(fdw, StringFieldValue("qux"), "baz qux"));
+ }
+ { // test resizing
+ FlattenDocsumWriter fdw("#");
+ // a new writer is expected to start empty with a buffer length of 32
+ EXPECT_EQUAL(fdw.getResult().getPos(), 0u);
+ EXPECT_EQUAL(fdw.getResult().getLength(), 32u);
+ // exactly 32 characters: fills the buffer without growing it
+ TEST_DO(assertFlattenDocsumWriter(fdw, StringFieldValue("aaaabbbbccccddddeeeeffffgggghhhh"),
+ "aaaabbbbccccddddeeeeffffgggghhhh"));
+ EXPECT_EQUAL(fdw.getResult().getPos(), 32u);
+ EXPECT_EQUAL(fdw.getResult().getLength(), 32u);
+ // separator plus four more characters (37 in total) must make the buffer grow
+ TEST_DO(assertFlattenDocsumWriter(fdw, StringFieldValue("aaaa"), "aaaabbbbccccddddeeeeffffgggghhhh#aaaa"));
+ EXPECT_EQUAL(fdw.getResult().getPos(), 37u);
+ EXPECT_TRUE(fdw.getResult().getLength() >= 37u);
+ fdw.clear();
+ // clear() rewinds the position; the buffer length is expected not to shrink below 37
+ EXPECT_EQUAL(fdw.getResult().getPos(), 0u);
+ EXPECT_TRUE(fdw.getResult().getLength() >= 37u);
+ }
+}
+
+// Covers SlimeFieldWriter for primitive values, collections, nested structs,
+// restriction to selected input fields, and reuse of one writer.
+void
+DocsumTest::testSlimeFieldWriter()
+{
+ { // basic types
+ assertSlimeFieldWriter(LongFieldValue(123456789), "123456789");
+ assertSlimeFieldWriter(BoolFieldValue(true), "true");
+ assertSlimeFieldWriter(BoolFieldValue(false), "false");
+ assertSlimeFieldWriter(DoubleFieldValue(12.34), "12.34");
+ assertSlimeFieldWriter(StringFieldValue("foo bar"), "\"foo bar\"");
+ }
+ { // collection field values
+ // an array is expected as a plain array of its elements
+ assertSlimeFieldWriter(createFieldValue(StringList().add("foo").add("bar").add("baz")),
+ "[\"foo\",\"bar\",\"baz\"]");
+ // a weighted set is expected as an array of {item, weight} objects
+ assertSlimeFieldWriter(createFieldValue(WeightedStringList().add(std::make_pair("bar", 20)).
+ add(std::make_pair("baz", 30)).
+ add(std::make_pair("foo", 10))),
+ "[{item:\"bar\",weight:20},{item:\"baz\",weight:30},{item:\"foo\",weight:10}]");
+ }
+ { // struct field value
+ // inner struct {d, e}
+ StructDataType subType("substruct");
+ Field fd("d", 0, *DataType::STRING);
+ Field fe("e", 1, *DataType::STRING);
+ subType.addField(fd);
+ subType.addField(fe);
+ StructFieldValue subValue(subType);
+ subValue.setValue(fd, StringFieldValue("baz"));
+ subValue.setValue(fe, StringFieldValue("qux"));
+
+ // outer struct {a, b, c} where c holds the inner struct
+ StructDataType type("struct");
+ Field fa("a", 0, *DataType::STRING);
+ Field fb("b", 1, *DataType::STRING);
+ Field fc("c", 2, subType);
+ type.addField(fa);
+ type.addField(fb);
+ type.addField(fc);
+ StructFieldValue value(type);
+ value.setValue(fa, StringFieldValue("foo"));
+ value.setValue(fb, StringFieldValue("bar"));
+ value.setValue(fc, subValue);
+
+
+ { // select a subset and then all
+ SlimeFieldWriter sfw;
+ DocsumFieldSpec::FieldIdentifierVector fields;
+ {
+ FieldPath path;
+ type.buildFieldPath(path, "a");
+ fields.push_back(DocsumFieldSpec::FieldIdentifier(0, std::move(path)));
+ }
+ {
+ FieldPath path;
+ type.buildFieldPath(path, "c.e");
+ fields.push_back(DocsumFieldSpec::FieldIdentifier(0, std::move(path)));
+ }
+ sfw.setInputFields(fields);
+ // only the selected paths "a" and "c.e" are expected in the output
+ TEST_DO(assertSlimeFieldWriter(sfw, value, "{\"a\":\"foo\",\"c\":{\"e\":\"qux\"}}"));
+ sfw.clear();
+ // after clear() every field is expected again
+ TEST_DO(assertSlimeFieldWriter(sfw, value, "{\"a\":\"foo\",\"b\":\"bar\",\"c\":{\"d\":\"baz\",\"e\":\"qux\"}}"));
+ }
+
+ { // multiple invocations
+ // with clear() between conversions, each output holds only the latest value
+ SlimeFieldWriter sfw;
+ TEST_DO(assertSlimeFieldWriter(sfw, StringFieldValue("foo"), "\"foo\""));
+ sfw.clear();
+ TEST_DO(assertSlimeFieldWriter(sfw, StringFieldValue("bar"), "\"bar\""));
+ sfw.clear();
+ TEST_DO(assertSlimeFieldWriter(sfw, StringFieldValue("baz"), "\"baz\""));
+ }
+
+ }
+}
+
+// Covers SlimeFieldWriter for map values. A map is expected to be written as
+// an array of {key, value} objects.
+void
+DocsumTest::requireThatSlimeFieldWriterHandlesMap()
+{
+ { // map<string, string>
+ MapDataType mapType(*DataType::STRING, *DataType::STRING);
+ MapFieldValue mapfv(mapType);
+ EXPECT_TRUE(mapfv.put(StringFieldValue("k1"), StringFieldValue("v1")));
+ EXPECT_TRUE(mapfv.put(StringFieldValue("k2"), StringFieldValue("v2")));
+ assertSlimeFieldWriter(mapfv, "[{\"key\":\"k1\",\"value\":\"v1\"},{\"key\":\"k2\",\"value\":\"v2\"}]");
+ }
+ { // map<string, struct>
+ StructDataType structType("struct");
+ Field fa("a", 0, *DataType::STRING);
+ Field fb("b", 1, *DataType::STRING);
+ structType.addField(fa);
+ structType.addField(fb);
+ StructFieldValue structValue(structType);
+ structValue.setValue(fa, StringFieldValue("foo"));
+ structValue.setValue(fb, StringFieldValue("bar"));
+ MapDataType mapType(*DataType::STRING, structType);
+ MapFieldValue mapfv(mapType);
+ EXPECT_TRUE(mapfv.put(StringFieldValue("k1"), structValue));
+ { // select a subset and then all
+ SlimeFieldWriter sfw;
+ DocsumFieldSpec::FieldIdentifierVector fields;
+ {
+ // select struct field b through the "value.b" path
+ FieldPath path;
+ mapType.buildFieldPath(path, "value.b");
+ fields.push_back(DocsumFieldSpec::FieldIdentifier(0, std::move(path)));
+ }
+ sfw.setInputFields(fields);
+ TEST_DO(assertSlimeFieldWriter(sfw, mapfv, "[{\"key\":\"k1\",\"value\":{\"b\":\"bar\"}}]"));
+ {
+ // replace the selection with struct field a, addressed through the key path "{k1}.a"
+ FieldPath path;
+ mapType.buildFieldPath(path, "{k1}.a");
+ fields[0] = DocsumFieldSpec::FieldIdentifier(0, std::move(path));
+ }
+ sfw.clear();
+ sfw.setInputFields(fields);
+ TEST_DO(assertSlimeFieldWriter(sfw, mapfv, "[{\"key\":\"k1\",\"value\":{\"a\":\"foo\"}}]"));
+ sfw.clear(); // all fields implicit
+ TEST_DO(assertSlimeFieldWriter(sfw, mapfv, "[{\"key\":\"k1\",\"value\":{\"a\":\"foo\",\"b\":\"bar\"}}]"));
+ }
+ }
+}
+
+// Test application entry point: runs the three docsum test cases in order
+// under the test name "docsum_test".
+int
+DocsumTest::Main()
+{
+ TEST_INIT("docsum_test");
+
+ TEST_DO(testFlattenDocsumWriter());
+ TEST_DO(testSlimeFieldWriter());
+ TEST_DO(requireThatSlimeFieldWriterHandlesMap());
+
+ TEST_DONE();
+}
+
+}
+
+TEST_APPHOOK(vsm::DocsumTest);
+
diff --git a/streamingvisitors/src/tests/document/.gitignore b/streamingvisitors/src/tests/document/.gitignore
new file mode 100644
index 00000000000..d47781eff63
--- /dev/null
+++ b/streamingvisitors/src/tests/document/.gitignore
@@ -0,0 +1,4 @@
+.depend
+Makefile
+document_test
+vsm_document_test_app
diff --git a/streamingvisitors/src/tests/document/CMakeLists.txt b/streamingvisitors/src/tests/document/CMakeLists.txt
new file mode 100644
index 00000000000..5ea12dc5e2d
--- /dev/null
+++ b/streamingvisitors/src/tests/document/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+# Test executable for the document tests: built from document.cpp, depends on streamingvisitors.
+vespa_add_executable(vsm_document_test_app TEST
+ SOURCES
+ document.cpp
+ DEPENDS
+ streamingvisitors
+)
+# Register the executable as a test; the test command is the executable itself, without arguments.
+vespa_add_test(NAME vsm_document_test_app COMMAND vsm_document_test_app)
diff --git a/streamingvisitors/src/tests/document/document.cpp b/streamingvisitors/src/tests/document/document.cpp
new file mode 100644
index 00000000000..1e97d232a64
--- /dev/null
+++ b/streamingvisitors/src/tests/document/document.cpp
@@ -0,0 +1,129 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/vespalib/testkit/testapp.h>
+
+#include <vespa/document/fieldvalue/fieldvalues.h>
+#include <vespa/document/datatype/documenttype.h>
+#include <vespa/vsm/common/storagedocument.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+
+using namespace document;
+
+namespace vsm {
+
+/** Test application covering StorageDocument and StringFieldIdTMap. */
+class DocumentTest : public vespalib::TestApp
+{
+    // Test cases, both invoked from Main(); private by class default.
+    void testStorageDocument();
+    void testStringFieldIdTMap();
+
+public:
+    int Main() override;
+};
+
+// Covers StorageDocument: reading fields through field paths, repeated reads,
+// overriding values with setField(), and rejection of an out-of-range field id.
+void
+DocumentTest::testStorageDocument()
+{
+ DocumentType dt("testdoc", 0);
+
+ Field fa("a", 0, *DataType::STRING);
+ Field fb("b", 1, *DataType::STRING);
+ dt.addField(fa);
+ dt.addField(fb);
+
+ document::Document::UP doc(new document::Document(dt, DocumentId()));
+ doc->setValue(fa, StringFieldValue("foo"));
+ doc->setValue(fb, StringFieldValue("bar"));
+
+ // Field path map with three entries: paths to "a" and "b", plus an empty
+ // path for a third field id that has no field in the document type.
+ SharedFieldPathMap fpmap(new FieldPathMapT());
+ fpmap->emplace_back();
+ dt.buildFieldPath(fpmap->back(),"a");
+ fpmap->emplace_back();
+ dt.buildFieldPath(fpmap->back(), "b");
+ fpmap->emplace_back();
+ ASSERT_TRUE((*fpmap)[0].size() == 1);
+ ASSERT_TRUE((*fpmap)[1].size() == 1);
+ ASSERT_TRUE((*fpmap)[2].size() == 0);
+
+ StorageDocument sdoc(std::move(doc), fpmap, 3);
+ ASSERT_TRUE(sdoc.valid());
+
+ // first read: fields 0 and 1 come from the document, field 2 has no value
+ EXPECT_EQUAL(std::string("foo"), sdoc.getField(0)->getAsString());
+ EXPECT_EQUAL(std::string("bar"), sdoc.getField(1)->getAsString());
+ EXPECT_TRUE(sdoc.getField(2) == nullptr);
+ // test caching
+ // (a second read must give the same answers as the first)
+ EXPECT_EQUAL(std::string("foo"), sdoc.getField(0)->getAsString());
+ EXPECT_EQUAL(std::string("bar"), sdoc.getField(1)->getAsString());
+ EXPECT_TRUE(sdoc.getField(2) == nullptr);
+
+ // set new values
+ // (one field at a time, checking after each step that only that field changed)
+ EXPECT_TRUE(sdoc.setField(0, FieldValue::UP(new StringFieldValue("baz"))));
+ EXPECT_EQUAL(std::string("baz"), sdoc.getField(0)->getAsString());
+ EXPECT_EQUAL(std::string("bar"), sdoc.getField(1)->getAsString());
+ EXPECT_TRUE(sdoc.getField(2) == nullptr);
+ EXPECT_TRUE(sdoc.setField(1, FieldValue::UP(new StringFieldValue("qux"))));
+ EXPECT_EQUAL(std::string("baz"), sdoc.getField(0)->getAsString());
+ EXPECT_EQUAL(std::string("qux"), sdoc.getField(1)->getAsString());
+ EXPECT_TRUE(sdoc.getField(2) == nullptr);
+ EXPECT_TRUE(sdoc.setField(2, FieldValue::UP(new StringFieldValue("quux"))));
+ EXPECT_EQUAL(std::string("baz"), sdoc.getField(0)->getAsString());
+ EXPECT_EQUAL(std::string("qux"), sdoc.getField(1)->getAsString());
+ EXPECT_EQUAL(std::string("quux"), sdoc.getField(2)->getAsString());
+
+ // field id 3 is outside the 3 fields the document was created with: must be rejected
+ EXPECT_TRUE(!sdoc.setField(3, FieldValue::UP(new StringFieldValue("thud"))));
+
+ // a StorageDocument around a default-constructed document and a default-constructed
+ // path map is expected to report the id of a default IdString
+ SharedFieldPathMap fim;
+ StorageDocument s2(std::make_unique<document::Document>(), fim, 0);
+ EXPECT_EQUAL(IdString().toString(), s2.docDoc().getId().toString());
+}
+
+// Covers StringFieldIdTMap: id assignment on add(), explicit ids, re-adding
+// existing names, highestFieldNo() bookkeeping and stream output.
+void DocumentTest::testStringFieldIdTMap()
+{
+ StringFieldIdTMap m;
+ // empty map: nothing registered, unknown names map to npos
+ EXPECT_EQUAL(0u, m.highestFieldNo());
+ EXPECT_TRUE(StringFieldIdTMap::npos == m.fieldNo("unknown"));
+ m.add("f1");
+ EXPECT_EQUAL(0u, m.fieldNo("f1"));
+ EXPECT_EQUAL(1u, m.highestFieldNo());
+ // adding the same name again must not change anything
+ m.add("f1");
+ EXPECT_EQUAL(0u, m.fieldNo("f1"));
+ EXPECT_EQUAL(1u, m.highestFieldNo());
+ m.add("f2");
+ EXPECT_EQUAL(1u, m.fieldNo("f2"));
+ EXPECT_EQUAL(2u, m.highestFieldNo());
+ // explicit id 7: highestFieldNo() is expected to become 8
+ m.add("f3", 7);
+ EXPECT_EQUAL(7u, m.fieldNo("f3"));
+ EXPECT_EQUAL(8u, m.highestFieldNo());
+ // re-adding f3 without an id keeps its explicit id
+ m.add("f3");
+ EXPECT_EQUAL(7u, m.fieldNo("f3"));
+ EXPECT_EQUAL(8u, m.highestFieldNo());
+ // re-adding f2 with an explicit id moves it from 1 to 13
+ m.add("f2", 13);
+ EXPECT_EQUAL(13u, m.fieldNo("f2"));
+ EXPECT_EQUAL(14u, m.highestFieldNo());
+ // a new name added without an id is expected to get id 3 here; highestFieldNo() stays 14
+ m.add("f4");
+ EXPECT_EQUAL(3u, m.fieldNo("f4"));
+ EXPECT_EQUAL(14u, m.highestFieldNo());
+ {
+ // stream output: one "name = id" line per entry; "a" is expected before "b"
+ // although "b" was added first
+ vespalib::asciistream os;
+ StringFieldIdTMap t;
+ t.add("b");
+ t.add("a");
+ os << t;
+ EXPECT_EQUAL(vespalib::string("a = 1\nb = 0\n"), os.str());
+ }
+
+}
+
+// Test application entry point: runs both document test cases in order
+// under the test name "document_test".
+int
+DocumentTest::Main()
+{
+ TEST_INIT("document_test");
+
+ testStorageDocument();
+ testStringFieldIdTMap();
+
+ TEST_DONE();
+}
+
+}
+
+TEST_APPHOOK(vsm::DocumentTest);
+
diff --git a/streamingvisitors/src/tests/hitcollector/CMakeLists.txt b/streamingvisitors/src/tests/hitcollector/CMakeLists.txt
index f25ab348265..dbec820a462 100644
--- a/streamingvisitors/src/tests/hitcollector/CMakeLists.txt
+++ b/streamingvisitors/src/tests/hitcollector/CMakeLists.txt
@@ -3,6 +3,6 @@ vespa_add_executable(streamingvisitors_hitcollector_test_app TEST
SOURCES
hitcollector_test.cpp
DEPENDS
- streamingvisitors_searchvisitor
+ streamingvisitors
)
vespa_add_test(NAME streamingvisitors_hitcollector_test_app COMMAND streamingvisitors_hitcollector_test_app)
diff --git a/streamingvisitors/src/tests/matching_elements_filler/CMakeLists.txt b/streamingvisitors/src/tests/matching_elements_filler/CMakeLists.txt
index ef93d551912..5cc2977b3c3 100644
--- a/streamingvisitors/src/tests/matching_elements_filler/CMakeLists.txt
+++ b/streamingvisitors/src/tests/matching_elements_filler/CMakeLists.txt
@@ -3,7 +3,7 @@ vespa_add_executable(streamingvisitors_matching_elements_filler_test_app TEST
SOURCES
matching_elements_filler_test.cpp
DEPENDS
- streamingvisitors_searchvisitor
+ streamingvisitors
GTest::GTest
)
vespa_add_test(NAME streamingvisitors_matching_elements_filler_test_app COMMAND streamingvisitors_matching_elements_filler_test_app)
diff --git a/streamingvisitors/src/tests/querywrapper/CMakeLists.txt b/streamingvisitors/src/tests/querywrapper/CMakeLists.txt
index 7cae60e6a11..e0131d0c6cc 100644
--- a/streamingvisitors/src/tests/querywrapper/CMakeLists.txt
+++ b/streamingvisitors/src/tests/querywrapper/CMakeLists.txt
@@ -3,6 +3,6 @@ vespa_add_executable(streamingvisitors_querywrapper_test_app TEST
SOURCES
querywrapper_test.cpp
DEPENDS
- streamingvisitors_searchvisitor
+ streamingvisitors
)
vespa_add_test(NAME streamingvisitors_querywrapper_test_app COMMAND streamingvisitors_querywrapper_test_app)
diff --git a/streamingvisitors/src/tests/searcher/.gitignore b/streamingvisitors/src/tests/searcher/.gitignore
new file mode 100644
index 00000000000..52a56dff405
--- /dev/null
+++ b/streamingvisitors/src/tests/searcher/.gitignore
@@ -0,0 +1,4 @@
+.depend
+Makefile
+searcher_test
+vsm_searcher_test_app
diff --git a/streamingvisitors/src/tests/searcher/CMakeLists.txt b/streamingvisitors/src/tests/searcher/CMakeLists.txt
new file mode 100644
index 00000000000..2277f5ef55f
--- /dev/null
+++ b/streamingvisitors/src/tests/searcher/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+# Test executable for the field searcher tests: built from searcher_test.cpp, depends on streamingvisitors.
+vespa_add_executable(vsm_searcher_test_app TEST
+ SOURCES
+ searcher_test.cpp
+ DEPENDS
+ streamingvisitors
+)
+# Register the executable as a test; the test command is the executable itself, without arguments.
+vespa_add_test(NAME vsm_searcher_test_app COMMAND vsm_searcher_test_app)
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
new file mode 100644
index 00000000000..34fa66eaa90
--- /dev/null
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -0,0 +1,864 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vespalib/testkit/testapp.h>
+
+#include <vespa/vsm/searcher/fieldsearcher.h>
+#include <vespa/vsm/searcher/floatfieldsearcher.h>
+#include <vespa/vsm/searcher/futf8strchrfieldsearcher.h>
+#include <vespa/vsm/searcher/intfieldsearcher.h>
+#include <vespa/vsm/searcher/boolfieldsearcher.h>
+#include <vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h>
+#include <vespa/vsm/searcher/utf8exactstringfieldsearcher.h>
+#include <vespa/vsm/searcher/utf8substringsearcher.h>
+#include <vespa/vsm/searcher/utf8substringsnippetmodifier.h>
+#include <vespa/vsm/searcher/utf8suffixstringfieldsearcher.h>
+#include <vespa/vsm/vsm/snippetmodifier.h>
+#include <vespa/searchlib/query/streaming/queryterm.h>
+#include <vespa/document/fieldvalue/fieldvalues.h>
+
+using namespace document;
+using search::streaming::HitList;
+using search::streaming::QueryNodeResultFactory;
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+using TermType = QueryTerm::Type;
+using namespace vsm;
+
+/**
+ * std::vector with a chainable add(), so that test inputs and expectations
+ * can be written inline as Vector<T>().add(a).add(b).
+ */
+template <typename T>
+class Vector : public std::vector<T>
+{
+public:
+    Vector() {}
+
+    /** Appends a copy of v and returns this container so calls can be chained. */
+    Vector & add(T v) {
+        std::vector<T>::push_back(v);
+        return *this;
+    }
+};
+
+// Shorthands for test inputs and expectations.
+using Hits = Vector<size_t>;             // hit positions of one query term
+using StringList = Vector<std::string>;
+using HitsList = Vector<Hits>;           // one Hits entry per query term
+using BoolList = Vector<bool>;
+using LongList = Vector<int64_t>;
+using FloatList = Vector<float>;
+using QTFieldInfo = QueryTerm::FieldInfo;
+using FieldInfoList = Vector<QTFieldInfo>;
+
+/**
+ * Equality-comparable wrapper around a string.
+ *
+ * Owns a copy of the text. Holding a reference instead would dangle whenever
+ * a String is constructed from a temporary (for example a string literal that
+ * is converted to std::string) and then used after that expression.
+ * The constructor stays implicit because code outside this excerpt may rely
+ * on the conversion.
+ */
+class String
+{
+private:
+    std::string _str;
+public:
+    String(const std::string & str) : _str(str) {}
+    /** True when both wrappers hold equal text. */
+    bool operator==(const String & rhs) const {
+        return _str == rhs._str;
+    }
+};
+
+/**
+ * Builds a list of streaming QueryTerm objects from compact test strings.
+ *
+ * Each string has the form "[index:]term". A leading and/or trailing '*' on
+ * the term selects suffix, prefix or substring matching; the index name
+ * defaults to "index" when no "index:" part is given.
+ */
+class Query
+{
+private:
+    // Parses every string in terms into an owned QueryTerm (qtv), then fills
+    // qtl with raw pointers to those terms in the same order.
+    void setupQuery(const StringList & terms) {
+        for (const std::string & rawTerm : terms) {
+            ParsedQueryTerm pqt = parseQueryTerm(rawTerm);
+            ParsedTerm pt = parseTerm(pqt.second);
+            qtv.push_back(std::make_unique<QueryTerm>(eqnr.create(), pt.first, pqt.first.empty() ? "index" : pqt.first, pt.second));
+        }
+        for (const auto & ownedTerm : qtv) {
+            qtl.push_back(ownedTerm.get());
+        }
+    }
+public:
+    using ParsedQueryTerm = std::pair<std::string, std::string>;  // (index name, term)
+    using ParsedTerm = std::pair<std::string, TermType>;          // (term without '*', match type)
+    QueryNodeResultFactory eqnr;
+    std::vector<QueryTerm::UP> qtv;   // owns the terms
+    QueryTermList qtl;                // non-owning view of qtv
+    Query(const StringList & terms);
+    ~Query();
+
+    /** Splits "index:term" at the first ':'; the index part is empty when there is no ':'. */
+    static ParsedQueryTerm parseQueryTerm(const std::string & queryTerm) {
+        const size_t colon = queryTerm.find(':');
+        if (colon == std::string::npos) {
+            return ParsedQueryTerm(std::string(), queryTerm);
+        }
+        return ParsedQueryTerm(queryTerm.substr(0, colon), queryTerm.substr(colon + 1));
+    }
+
+    /**
+     * Strips '*' markers from term and returns the matching term type:
+     * "*x*" is a substring term, "*x" a suffix term, "x*" a prefix term, and
+     * anything else a plain word. An empty term is returned as an empty word
+     * without touching any character of it.
+     */
+    static ParsedTerm parseTerm(const std::string & term) {
+        if (term.empty()) {
+            return std::make_pair(term, TermType::WORD);
+        }
+        const bool leadingStar = (term.front() == '*');
+        const bool trailingStar = (term.back() == '*');
+        if (leadingStar && trailingStar) {
+            // A lone "*" is both the first and the last character; it yields an
+            // empty substring term. Spelled out so size() - 2 cannot wrap around.
+            const size_t innerLen = (term.size() >= 2) ? term.size() - 2 : 0;
+            return std::make_pair(term.substr(1, innerLen), TermType::SUBSTRINGTERM);
+        }
+        if (leadingStar) {
+            return std::make_pair(term.substr(1, term.size() - 1), TermType::SUFFIXTERM);
+        }
+        if (trailingStar) {
+            return std::make_pair(term.substr(0, term.size() - 1), TermType::PREFIXTERM);
+        }
+        return std::make_pair(term, TermType::WORD);
+    }
+};
+
+// Defined out of line: builds the owned query terms and the raw-pointer list
+// from the given term strings.
+Query::Query(const StringList & terms)
+    : eqnr(),
+      qtv(),
+      qtl()
+{
+    setupQuery(terms);
+}
+
+Query::~Query() = default;
+
+// Bundles everything needed to exercise a SnippetModifier: the query, the
+// substring searcher prepared with that query, its buffer, and the modifier
+// wrapping the searcher.
+// Member order matters: the constructor initializes modifier from searcher,
+// so searcher must be declared (and therefore initialized) before modifier.
+struct SnippetModifierSetup
+{
+ Query query;
+ UTF8SubstringSnippetModifier::SP searcher;
+ SharedSearcherBuf buf;
+ SnippetModifier modifier;
+ explicit SnippetModifierSetup(const StringList & terms);
+ ~SnippetModifierSetup();
+};
+
+// Builds the query from terms, creates the searcher and an 8-sized SearcherBuf,
+// wraps the searcher in the modifier, and finally prepares the searcher with
+// the query terms and the buffer.
+SnippetModifierSetup::SnippetModifierSetup(const StringList & terms)
+ : query(terms),
+ searcher(new UTF8SubstringSnippetModifier()),
+ buf(new SearcherBuf(8)),
+ modifier(searcher)
+{
+ searcher->prepare(query.qtl, buf);
+}
+SnippetModifierSetup::~SnippetModifierSetup() = default;
+
+// Forward declarations of the generic helpers, so that the typed convenience
+// overloads defined below can call them before their definitions appear.
+
+// builders turning plain value lists into array field values
+ArrayFieldValue getFieldValue(const StringList &fv);
+ArrayFieldValue getFieldValue(const LongList &fv);
+ArrayFieldValue getFieldValue(const FloatList &fv);
+
+// assertions and the shared search driver
+bool assertMatchTermSuffix(const std::string &term, const std::string &word);
+void assertSnippetModifier(const StringList &query, const std::string &fv, const std::string &exp);
+void assertSnippetModifier(SnippetModifierSetup &setup, const FieldValue &fv, const std::string &exp);
+void assertQueryTerms(const SnippetModifierManager &man, FieldIdT fId, const StringList &terms);
+void assertNumeric(FieldSearcher &fs, const StringList &query, const FieldValue &fv, const BoolList &exp);
+std::vector<QueryTerm::UP> performSearch(FieldSearcher &fs, const StringList &query, const FieldValue &fv);
+void assertSearch(FieldSearcher &fs, const StringList &query, const FieldValue &fv, const HitsList &exp);
+bool assertCountWords(size_t numWords, const std::string &field);
+bool assertFieldInfo(FieldSearcher &fs, const StringList &query, const FieldValue &fv, const FieldInfoList &exp);
+
+// String searcher assertions. The list-based overloads wrap the field content
+// in a field value and delegate to assertSearch(); the single-term overloads
+// wrap their term and expected hits in one-element lists first.
+void assertString(StrChrFieldSearcher &fs, const StringList &query, const std::string &field, const HitsList &exp) {
+    StringFieldValue value(field);
+    assertSearch(fs, query, value, exp);
+}
+
+void assertString(StrChrFieldSearcher &fs, const StringList &query, const StringList &field, const HitsList &exp) {
+    ArrayFieldValue values = getFieldValue(field);
+    assertSearch(fs, query, values, exp);
+}
+
+void assertString(StrChrFieldSearcher &fs, const std::string &term, const std::string &field, const Hits &exp) {
+    StringList query;
+    query.add(term);
+    HitsList expected;
+    expected.add(exp);
+    assertString(fs, query, field, expected);
+}
+
+void assertString(StrChrFieldSearcher &fs, const std::string &term, const StringList &field, const Hits &exp) {
+    StringList query;
+    query.add(term);
+    HitsList expected;
+    expected.add(exp);
+    assertString(fs, query, field, expected);
+}
+
+// Integer and bool searcher assertions. Single values are checked through
+// assertNumeric() (match / no match per term), arrays through assertSearch()
+// (hit positions per term). The single-term overloads wrap their arguments
+// in one-element lists and delegate to the list-based overload.
+void assertInt(IntFieldSearcher & fs, const StringList &query, int64_t field, const BoolList &exp) {
+    LongFieldValue value(field);
+    assertNumeric(fs, query, value, exp);
+}
+
+void assertInt(IntFieldSearcher & fs, const std::string &term, int64_t field, bool exp) {
+    StringList query;
+    query.add(term);
+    BoolList expected;
+    expected.add(exp);
+    assertInt(fs, query, field, expected);
+}
+
+void assertBool(BoolFieldSearcher & fs, const StringList &query, bool field, const BoolList &exp) {
+    BoolFieldValue value(field);
+    assertNumeric(fs, query, value, exp);
+}
+
+void assertBool(BoolFieldSearcher & fs, const std::string &term, bool field, bool exp) {
+    StringList query;
+    query.add(term);
+    BoolList expected;
+    expected.add(exp);
+    assertBool(fs, query, field, expected);
+}
+
+void assertInt(IntFieldSearcher & fs, const StringList &query, const LongList &field, const HitsList &exp) {
+    ArrayFieldValue values = getFieldValue(field);
+    assertSearch(fs, query, values, exp);
+}
+
+void assertInt(IntFieldSearcher & fs, const std::string &term, const LongList &field, const Hits &exp) {
+    StringList query;
+    query.add(term);
+    HitsList expected;
+    expected.add(exp);
+    assertInt(fs, query, field, expected);
+}
+
+// Float searcher assertions, mirroring the integer ones: single values go
+// through assertNumeric(), arrays through assertSearch(), and the single-term
+// overloads wrap their arguments in one-element lists.
+void assertFloat(FloatFieldSearcher & fs, const StringList &query, float field, const BoolList &exp) {
+    FloatFieldValue value(field);
+    assertNumeric(fs, query, value, exp);
+}
+
+void assertFloat(FloatFieldSearcher & fs, const std::string &term, float field, bool exp) {
+    StringList query;
+    query.add(term);
+    BoolList expected;
+    expected.add(exp);
+    assertFloat(fs, query, field, expected);
+}
+
+void assertFloat(FloatFieldSearcher & fs, const StringList &query, const FloatList &field, const HitsList &exp) {
+    ArrayFieldValue values = getFieldValue(field);
+    assertSearch(fs, query, values, exp);
+}
+
+void assertFloat(FloatFieldSearcher & fs, const std::string &term, const FloatList &field, const Hits &exp) {
+    StringList query;
+    query.add(term);
+    HitsList expected;
+    expected.add(exp);
+    assertFloat(fs, query, field, expected);
+}
+
+// Field info assertions for string searchers. Each overload returns the
+// result of the generic assertFieldInfo(); the single-term overloads wrap
+// their term and expected info in one-element lists first.
+bool
+assertFieldInfo(StrChrFieldSearcher &fs, const StringList &query, const std::string &fv, const FieldInfoList &exp) {
+    StringFieldValue value(fv);
+    return assertFieldInfo(fs, query, value, exp);
+}
+
+bool
+assertFieldInfo(StrChrFieldSearcher &fs, const StringList &query, const StringList &fv, const FieldInfoList &exp) {
+    ArrayFieldValue values = getFieldValue(fv);
+    return assertFieldInfo(fs, query, values, exp);
+}
+
+bool
+assertFieldInfo(StrChrFieldSearcher &fs, const std::string &term, const StringList &fv, const QTFieldInfo &exp) {
+    StringList query;
+    query.add(term);
+    FieldInfoList expected;
+    expected.add(exp);
+    return assertFieldInfo(fs, query, fv, expected);
+}
+
+bool
+assertFieldInfo(StrChrFieldSearcher &fs, const std::string &term, const std::string &fv, const QTFieldInfo &exp) {
+    StringList query;
+    query.add(term);
+    FieldInfoList expected;
+    expected.add(exp);
+    return assertFieldInfo(fs, query, fv, expected);
+}
+
+// Field info assertions for the integer searcher. Unlike the string variants
+// these return nothing: the result of the generic check is not passed on.
+void assertFieldInfo(IntFieldSearcher & fs, const StringList &query, int64_t fv, const FieldInfoList &exp) {
+    LongFieldValue value(fv);
+    assertFieldInfo(fs, query, value, exp);
+}
+
+void assertFieldInfo(IntFieldSearcher & fs, const StringList &query, const LongList &fv, const FieldInfoList &exp) {
+    ArrayFieldValue values = getFieldValue(fv);
+    assertFieldInfo(fs, query, values, exp);
+}
+
+void assertFieldInfo(IntFieldSearcher & fs, const std::string &term, int64_t fv, const QTFieldInfo &exp) {
+    StringList query;
+    query.add(term);
+    FieldInfoList expected;
+    expected.add(exp);
+    assertFieldInfo(fs, query, fv, expected);
+}
+
+void assertFieldInfo(IntFieldSearcher & fs, const std::string &term, const LongList &fv, const QTFieldInfo &exp) {
+    StringList query;
+    query.add(term);
+    FieldInfoList expected;
+    expected.add(exp);
+    assertFieldInfo(fs, query, fv, expected);
+}
+
+/** float field searcher **/
+// Field info assertions for the float searcher; like the integer variants
+// they do not pass on the result of the generic check.
+void assertFieldInfo(FloatFieldSearcher & fs, const StringList &query, float fv, const FieldInfoList &exp) {
+    FloatFieldValue value(fv);
+    assertFieldInfo(fs, query, value, exp);
+}
+
+void
+assertFieldInfo(FloatFieldSearcher & fs, const StringList &query, const FloatList &fv, const FieldInfoList &exp) {
+    ArrayFieldValue values = getFieldValue(fv);
+    assertFieldInfo(fs, query, values, exp);
+}
+
+void assertFieldInfo(FloatFieldSearcher & fs, const std::string &term, float fv, const QTFieldInfo &exp) {
+    StringList query;
+    query.add(term);
+    FieldInfoList expected;
+    expected.add(exp);
+    assertFieldInfo(fs, query, fv, expected);
+}
+
+void assertFieldInfo(FloatFieldSearcher & fs, const std::string &term, const FloatList &fv, const QTFieldInfo &exp) {
+    StringList query;
+    query.add(term);
+    FieldInfoList expected;
+    expected.add(exp);
+    assertFieldInfo(fs, query, fv, expected);
+}
+
+
+/** snippet modifier searcher **/
+// Single-term convenience overload: wraps term in a one-element query list.
+void assertSnippetModifier(const std::string &term, const std::string &fv, const std::string &exp) {
+    StringList query;
+    query.add(term);
+    assertSnippetModifier(query, fv, exp);
+}
+
+
+// Builds an array-of-string field value holding the entries of fv, added in list order.
+ArrayFieldValue
+getFieldValue(const StringList & fv)
+{
+    static ArrayDataType type(*DataType::STRING);
+    ArrayFieldValue result(type);
+    for (const std::string & item : fv) {
+        result.add(StringFieldValue(item));
+    }
+    return result;
+}
+
+// Builds an array-of-long field value holding the entries of fv, added in list order.
+ArrayFieldValue
+getFieldValue(const LongList & fv)
+{
+    static ArrayDataType type(*DataType::LONG);
+    ArrayFieldValue result(type);
+    for (int64_t item : fv) {
+        result.add(LongFieldValue(item));
+    }
+    return result;
+}
+
+// Builds an array-of-float field value holding the entries of fv, added in list order.
+ArrayFieldValue
+getFieldValue(const FloatList & fv)
+{
+    static ArrayDataType type(*DataType::FLOAT);
+    ArrayFieldValue result(type);
+    for (float item : fv) {
+        result.add(FloatFieldValue(item));
+    }
+    return result;
+}
+
+// Wraps term and word in word-type QueryTerms on index "index" and returns
+// what UTF8StringFieldSearcherBase::matchTermSuffix() reports for their
+// UCS-4 representations.
+bool
+assertMatchTermSuffix(const std::string & term, const std::string & word)
+{
+    QueryNodeResultFactory factory;
+    QueryTerm termQuery(factory.create(), term, "index", TermType::WORD);
+    QueryTerm wordQuery(factory.create(), word, "index", TermType::WORD);
+
+    // term() hands back the character pointer through its argument and returns the length
+    const ucs4_t * termChars;
+    const size_t termLen = termQuery.term(termChars);
+    const ucs4_t * wordChars;
+    const size_t wordLen = wordQuery.term(wordChars);
+
+    return UTF8StringFieldSearcherBase::matchTermSuffix(termChars, termLen, wordChars, wordLen);
+}
+
+// Match / no-match check for single-valued fields: each expected flag becomes
+// either one hit at position 0 (true) or no hits (false), and the resulting
+// hit lists are verified with assertSearch().
+void
+assertNumeric(FieldSearcher & fs, const StringList & query, const FieldValue & fv, const BoolList & exp)
+{
+    HitsList expectedHits;
+    for (bool shouldMatch : exp) {
+        Hits hits;
+        if (shouldMatch) {
+            hits.add(0);
+        }
+        expectedHits.push_back(hits);
+    }
+    assertSearch(fs, query, fv, expectedHits);
+}
+
+// Runs fs over a one-field document holding a clone of fv, using the terms in
+// query, and returns the query terms so callers can inspect what the search
+// recorded on them.
+std::vector<QueryTerm::UP>
+performSearch(FieldSearcher & fs, const StringList & query, const FieldValue & fv)
+{
+    Query q(query);
+
+    // hand the query terms and a new SearcherBuf to the field searcher
+    SharedSearcherBuf ssb(new SearcherBuf());
+    fs.prepare(q.qtl, ssb);
+
+    // single-field document: field 0 has an empty field path and is set explicitly
+    SharedFieldPathMap sfim(new FieldPathMapT());
+    sfim->emplace_back();
+    StorageDocument doc(std::make_unique<document::Document>(), sfim, 1);
+    doc.setField(0, document::FieldValue::UP(fv.clone()));
+
+    fs.search(doc);
+
+    // qtv is a member of the local q, not a local itself, so it has to be
+    // moved out explicitly (the elements are move-only)
+    return std::move(q.qtv);
+}
+
+// Runs the search and compares the hit positions recorded on each query term
+// with exp, which holds one list of expected positions per query term.
+void
+assertSearch(FieldSearcher & fs, const StringList & query, const FieldValue & fv, const HitsList & exp)
+{
+ auto qtv = performSearch(fs, query, fv);
+ // EXPECT reports both values on mismatch; ASSERT then stops before exp is indexed out of range
+ EXPECT_EQUAL(qtv.size(), exp.size());
+ ASSERT_TRUE(qtv.size() == exp.size());
+ for (size_t i = 0; i < qtv.size(); ++i) {
+ const HitList & hl = qtv[i]->getHitList();
+ // same report-then-stop pattern for the number of hits of term i
+ EXPECT_EQUAL(hl.size(), exp[i].size());
+ ASSERT_TRUE(hl.size() == exp[i].size());
+ for (size_t j = 0; j < hl.size(); ++j) {
+ EXPECT_EQUAL((size_t)hl[j].pos(), exp[i][j]);
+ }
+ }
+}
+
+// Runs the search and compares, for every query term, the field info of
+// field 0 (hit offset, hit count, field length) with the matching entry of
+// exp. All three values are always checked; returns false when the number of
+// terms differs or any comparison fails.
+bool
+assertFieldInfo(FieldSearcher & fs, const StringList & query,
+                const FieldValue & fv, const FieldInfoList & exp)
+{
+    auto qtv = performSearch(fs, query, fv);
+    if (!EXPECT_EQUAL(qtv.size(), exp.size())) {
+        return false;
+    }
+    bool retval = true;
+    for (size_t i = 0; i < qtv.size(); ++i) {
+        const bool offsetOk = EXPECT_EQUAL(qtv[i]->getFieldInfo(0).getHitOffset(), exp[i].getHitOffset());
+        const bool countOk = EXPECT_EQUAL(qtv[i]->getFieldInfo(0).getHitCount(), exp[i].getHitCount());
+        const bool lengthOk = EXPECT_EQUAL(qtv[i]->getFieldInfo(0).getFieldLength(), exp[i].getFieldLength());
+        if (!(offsetOk && countOk && lengthOk)) {
+            retval = false;
+        }
+    }
+    return retval;
+}
+
+// Runs a UTF8SubstringSnippetModifier as the field searcher over fv and
+// expects its modified buffer to hold exactly exp.
+void
+assertSnippetModifier(const StringList & query, const std::string & fv, const std::string & exp)
+{
+ UTF8SubstringSnippetModifier mod;
+ // the returned query terms are not needed here; only the modifier's buffer is inspected
+ performSearch(mod, query, StringFieldValue(fv));
+ EXPECT_EQUAL(mod.getModifiedBuf().getPos(), exp.size());
+ std::string actual(mod.getModifiedBuf().getBuffer(), mod.getModifiedBuf().getPos());
+ // size is compared separately from content
+ EXPECT_EQUAL(actual.size(), exp.size());
+ EXPECT_EQUAL(actual, exp);
+}
+
+// Applies the modifier of setup to fv and expects the resulting literal field
+// value to hold exactly exp. The result is cast to LiteralFieldValueB without
+// a runtime check.
+void assertSnippetModifier(SnippetModifierSetup & setup, const FieldValue & fv, const std::string & exp)
+{
+    FieldValue::UP modified = setup.modifier.modify(fv);
+    const auto & literal = static_cast<const document::LiteralFieldValueB &>(*modified);
+    const std::string & actual = literal.getValue();
+    EXPECT_EQUAL(actual.size(), exp.size());
+    EXPECT_EQUAL(actual, exp);
+}
+
+// Checks which query terms the snippet modifier registered for field fId is
+// searching for. An empty terms list means that no modifier is expected for
+// the field at all; otherwise the searcher's query terms must equal terms,
+// in the same order.
+void assertQueryTerms(const SnippetModifierManager & man, FieldIdT fId, const StringList & terms)
+{
+    if (terms.empty()) {
+        ASSERT_TRUE(man.getModifiers().getModifier(fId) == nullptr);
+        return;
+    }
+    ASSERT_TRUE(man.getModifiers().getModifier(fId) != nullptr);
+    // the modifier is cast to SnippetModifier without a runtime check
+    UTF8SubstringSnippetModifier * searcher =
+        (static_cast<SnippetModifier *>(man.getModifiers().getModifier(fId)))->getSearcher().get();
+    // EXPECT reports both sizes on mismatch; ASSERT then stops before indexing out of range
+    EXPECT_EQUAL(searcher->getQueryTerms().size(), terms.size());
+    ASSERT_TRUE(searcher->getQueryTerms().size() == terms.size());
+    for (size_t i = 0; i < terms.size(); ++i) {
+        EXPECT_EQUAL(std::string(searcher->getQueryTerms()[i]->getTerm()), terms[i]);
+    }
+}
+
+ /**
+  * Expects FieldSearcher::countWords() to report 'numWords' words for the text in
+  * 'field'. Returns the outcome of that expectation.
+  */
+ bool assertCountWords(size_t numWords, const std::string & field)
+ {
+     FieldRef ref(field.c_str(), field.size());
+     const bool countMatches = EXPECT_EQUAL(numWords, FieldSearcher::countWords(ref));
+     return countMatches;
+ }
+
+ // Shared checks of hit positions and per-term field info for a string searcher.
+ // The two leading assertString() calls are not folded into the return value;
+ // the function returns false if any of the assertFieldInfo() checks fails.
+ // NOTE(review): QTFieldInfo(a, b, c) looks like (hit offset, hit count, field length),
+ // matching the three getters compared by assertFieldInfo() - confirm against QTFieldInfo.
+ bool
+ testStringFieldInfo(StrChrFieldSearcher & fs)
+ {
+ assertString(fs, "foo", StringList().add("foo bar baz").add("foo bar").add("baz foo"), Hits().add(0).add(3).add(6));
+ assertString(fs, StringList().add("foo").add("bar"), StringList().add("foo bar baz").add("foo bar").add("baz foo"),
+ HitsList().add(Hits().add(0).add(3).add(6)).add(Hits().add(1).add(4)));
+
+ bool retval = true;
+ if (!EXPECT_TRUE(assertFieldInfo(fs, "foo", "foo", QTFieldInfo(0, 1, 1)))) retval = false;
+ if (!EXPECT_TRUE(assertFieldInfo(fs, "bar", "foo", QTFieldInfo(0, 0, 1)))) retval = false;
+ if (!EXPECT_TRUE(assertFieldInfo(fs, "foo", "foo bar baz", QTFieldInfo(0, 1, 3)))) retval = false;
+ if (!EXPECT_TRUE(assertFieldInfo(fs, "bar", "foo bar baz", QTFieldInfo(0, 1, 3)))) retval = false;
+ if (!EXPECT_TRUE(assertFieldInfo(fs, "baz", "foo bar baz", QTFieldInfo(0, 1, 3)))) retval = false;
+ if (!EXPECT_TRUE(assertFieldInfo(fs, "qux", "foo bar baz", QTFieldInfo(0, 0, 3)))) retval = false;
+ if (!EXPECT_TRUE(assertFieldInfo(fs, "foo", "foo foo foo", QTFieldInfo(0, 3, 3)))) retval = false;
+ // query term size > last term size
+ if (!EXPECT_TRUE(assertFieldInfo(fs, "runner", "Road Runner Disco", QTFieldInfo(0, 1, 3)))) retval = false;
+ if (!EXPECT_TRUE(assertFieldInfo(fs, StringList().add("roadrun").add("runner"), "Road Runner Disco",
+ FieldInfoList().add(QTFieldInfo(0, 0, 3)).add(QTFieldInfo(0, 1, 3))))) retval = false;
+ // multiple terms
+ if (!EXPECT_TRUE(assertFieldInfo(fs, "foo", StringList().add("foo bar baz").add("foo bar"),
+ QTFieldInfo(0, 2, 5)))) retval = false;
+ if (!EXPECT_TRUE(assertFieldInfo(fs, StringList().add("foo").add("baz"), "foo bar baz",
+ FieldInfoList().add(QTFieldInfo(0, 1, 3)).add(QTFieldInfo(0, 1, 3))))) retval = false;
+ if (!EXPECT_TRUE(assertFieldInfo(fs, StringList().add("foo").add("baz"), StringList().add("foo bar baz").add("foo bar"),
+ FieldInfoList().add(QTFieldInfo(0, 2, 5)).add(QTFieldInfo(0, 1, 5))))) retval = false;
+ return retval;
+ }
+ // Shared checks for searchers derived from StrChrFieldSearcher: whole-word matching,
+ // prefix matching (via a trailing '*' and via setMatchType(PREFIX)), the common
+ // field-info checks and the handling of underscore runs between words.
+ // Returns false only if testStringFieldInfo() fails; the statement order matters
+ // because the match type set on 'fs' carries over to the calls that follow.
+ bool
+ testStrChrFieldSearcher(StrChrFieldSearcher & fs)
+ {
+ std::string field = "operators and operator overloading with utf8 char oe = \xc3\x98";
+ assertString(fs, "oper", field, Hits());
+ assertString(fs, "tor", field, Hits());
+ assertString(fs, "oper*", field, Hits().add(0).add(2));
+ assertString(fs, "and", field, Hits().add(1));
+
+ assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits()).add(Hits()));
+ assertString(fs, StringList().add("and").add("overloading"), field, HitsList().add(Hits().add(1)).add(Hits().add(3)));
+
+ fs.setMatchType(FieldSearcher::PREFIX);
+ assertString(fs, "oper", field, Hits().add(0).add(2));
+ assertString(fs, StringList().add("oper").add("tor"), field, HitsList().add(Hits().add(0).add(2)).add(Hits()));
+
+ fs.setMatchType(FieldSearcher::REGULAR);
+ if (!EXPECT_TRUE(testStringFieldInfo(fs))) return false;
+
+ { // test handling of several underscores
+ // Regardless of the number of underscores, "foo" is expected at position 0 and the second term at position 1.
+ StringList query = StringList().add("foo").add("bar");
+ HitsList exp = HitsList().add(Hits().add(0)).add(Hits().add(1));
+ assertString(fs, query, "foo_bar", exp);
+ assertString(fs, query, "foo__bar", exp);
+ assertString(fs, query, "foo___bar", exp);
+ assertString(fs, query, "foo________bar", exp);
+ assertString(fs, query, "foo____________________bar", exp);
+ assertString(fs, query, "________________________________________foo________________________________________bar________________________________________", exp);
+ query = StringList().add("foo").add("thisisaveryveryverylongword");
+ assertString(fs, query, "foo____________________thisisaveryveryverylongword", exp);
+
+ assertString(fs, "bar", "foo bar", Hits().add(1));
+ assertString(fs, "bar", "foo____________________bar", Hits().add(1));
+ assertString(fs, "bar", "foo____________________thisisaveryveryverylongword____________________bar", Hits().add(2));
+ }
+ return true;
+ }
+
+ // parseQueryTerm() splits "index:term" into (index, term), with an empty index when
+ // there is no ':'. parseTerm() strips leading/trailing '*' and reports the term type.
+ TEST("verify correct term parsing") {
+ ASSERT_TRUE(Query::parseQueryTerm("index:term").first == "index");
+ ASSERT_TRUE(Query::parseQueryTerm("index:term").second == "term");
+ ASSERT_TRUE(Query::parseQueryTerm("term").first == "");
+ ASSERT_TRUE(Query::parseQueryTerm("term").second == "term");
+ ASSERT_TRUE(Query::parseTerm("*substr*").first == "substr");
+ ASSERT_TRUE(Query::parseTerm("*substr*").second == TermType::SUBSTRINGTERM);
+ ASSERT_TRUE(Query::parseTerm("*suffix").first == "suffix");
+ ASSERT_TRUE(Query::parseTerm("*suffix").second == TermType::SUFFIXTERM);
+ ASSERT_TRUE(Query::parseTerm("prefix*").first == "prefix");
+ ASSERT_TRUE(Query::parseTerm("prefix*").second == TermType::PREFIXTERM);
+ ASSERT_TRUE(Query::parseTerm("term").first == "term");
+ ASSERT_TRUE(Query::parseTerm("term").second == TermType::WORD);
+ }
+
+ // A term matches as suffix when the word ends with it (including the whole word);
+ // terms longer than the word, differing in a character, or matching only the start do not.
+ TEST("suffix matching") {
+ EXPECT_EQUAL(assertMatchTermSuffix("a", "vespa"), true);
+ EXPECT_EQUAL(assertMatchTermSuffix("spa", "vespa"), true);
+ EXPECT_EQUAL(assertMatchTermSuffix("vespa", "vespa"), true);
+ EXPECT_EQUAL(assertMatchTermSuffix("vvespa", "vespa"), false);
+ EXPECT_EQUAL(assertMatchTermSuffix("fspa", "vespa"), false);
+ EXPECT_EQUAL(assertMatchTermSuffix("v", "vespa"), false);
+ }
+
+ // Runs the shared StrChrFieldSearcher checks against both the UTF8 and the FUTF8
+ // searcher, each on a freshly constructed instance.
+ TEST("Test basic strchrfield searchers") {
+ {
+ UTF8StrChrFieldSearcher fs(0);
+ EXPECT_TRUE(testStrChrFieldSearcher(fs));
+ }
+ {
+ FUTF8StrChrFieldSearcher fs(0);
+ EXPECT_TRUE(testStrChrFieldSearcher(fs));
+ }
+ }
+
+ // Shared checks for substring searchers: a term hits every word that contains it
+ // (hit positions are word positions), and a term spanning a word boundary ("rsand")
+ // gives no hits. Returns false only if testStringFieldInfo() fails.
+ bool
+ testUTF8SubStringFieldSearcher(StrChrFieldSearcher & fs)
+ {
+ std::string field = "operators and operator overloading";
+ assertString(fs, "rsand", field, Hits());
+ assertString(fs, "ove", field, Hits().add(3));
+ assertString(fs, "ing", field, Hits().add(3));
+ assertString(fs, "era", field, Hits().add(0).add(2));
+ assertString(fs, "a", field, Hits().add(0).add(1).add(2).add(3));
+
+ assertString(fs, StringList().add("dn").add("gn"), field, HitsList().add(Hits()).add(Hits()));
+ assertString(fs, StringList().add("ato").add("load"), field, HitsList().add(Hits().add(0).add(2)).add(Hits().add(3)));
+
+ // Overlapping occurrences inside one word: "aa" is reported three times for "aaaab".
+ assertString(fs, StringList().add("aa").add("ab"), "aaaab",
+ HitsList().add(Hits().add(0).add(0).add(0)).add(Hits().add(0)));
+
+ if (!EXPECT_TRUE(testStringFieldInfo(fs))) return false;
+ return true;
+ }
+
+ // Substring search with the plain searcher (including a maxFieldLength() cut-off)
+ // and with the snippet modifier, which reports overlapping single-term hits differently.
+ TEST("utf8 substring search") {
+ {
+ UTF8SubStringFieldSearcher fs(0);
+ EXPECT_TRUE(testUTF8SubStringFieldSearcher(fs));
+ assertString(fs, "aa", "aaaa", Hits().add(0).add(0));
+ }
+ {
+ UTF8SubStringFieldSearcher fs(0);
+ EXPECT_TRUE(testUTF8SubStringFieldSearcher(fs));
+ assertString(fs, "abc", "abc bcd abc", Hits().add(0).add(2));
+ // With the field length capped at 4 only the first occurrence is expected.
+ fs.maxFieldLength(4);
+ assertString(fs, "abc", "abc bcd abc", Hits().add(0));
+ }
+ {
+ UTF8SubstringSnippetModifier fs(0);
+ EXPECT_TRUE(testUTF8SubStringFieldSearcher(fs));
+ // we don't have 1 term optimization
+ assertString(fs, "aa", "aaaa", Hits().add(0).add(0).add(0));
+ }
+ }
+
+ // An empty query term must not produce hits, and the field length recorded for
+ // it is expected to stay 0.
+ TEST("utf8 substring search with empty term")
+ {
+     UTF8SubStringFieldSearcher fs(0);
+     EXPECT_TRUE(testUTF8SubStringFieldSearcher(fs));
+     assertString(fs, "", "abc", Hits());
+     // assertFieldInfo() reports its verdict as a bool; check it like the sibling tests do
+     // instead of silently dropping it.
+     EXPECT_TRUE(assertFieldInfo(fs, "", "abc", QTFieldInfo().setFieldLength(0)));
+ }
+
+ // Suffix search: a term hits only the words that end with it ("tor" hits "operator"
+ // but not "operators"), then the shared field-info checks are run.
+ TEST("utf8 suffix search") {
+ UTF8SuffixStringFieldSearcher fs(0);
+ std::string field = "operators and operator overloading";
+ assertString(fs, "rsand", field, Hits());
+ assertString(fs, "tor", field, Hits().add(2));
+ assertString(fs, "tors", field, Hits().add(0));
+
+ assertString(fs, StringList().add("an").add("din"), field, HitsList().add(Hits()).add(Hits()));
+ assertString(fs, StringList().add("nd").add("g"), field, HitsList().add(Hits().add(1)).add(Hits().add(3)));
+
+ EXPECT_TRUE(testStringFieldInfo(fs));
+ }
+
+ // Exact matching compares the term with the whole field: extra characters or words on
+ // either side give no hit, a trailing '*' turns the term into a prefix, and 'u' and
+ // 'ü' are expected to be treated as different characters.
+ TEST("utf8 exact match") {
+ UTF8ExactStringFieldSearcher fs(0);
+ // regular
+ TEST_DO(assertString(fs, "vespa", "vespa", Hits().add(0)));
+ TEST_DO(assertString(fs, "vespar", "vespa", Hits()));
+ TEST_DO(assertString(fs, "vespa", "vespar", Hits()));
+ TEST_DO(assertString(fs, "vespa", "vespa vespa", Hits()));
+ TEST_DO(assertString(fs, "vesp", "vespa", Hits()));
+ TEST_DO(assertString(fs, "vesp*", "vespa", Hits().add(0)));
+ TEST_DO(assertString(fs, "hutte", "hutte", Hits().add(0)));
+ TEST_DO(assertString(fs, "hütte", "hütte", Hits().add(0)));
+ TEST_DO(assertString(fs, "hutte", "hütte", Hits()));
+ TEST_DO(assertString(fs, "hütte", "hutte", Hits()));
+ TEST_DO(assertString(fs, "hütter", "hütte", Hits()));
+ TEST_DO(assertString(fs, "hütte", "hütter", Hits()));
+ }
+
+ // The flexible searcher picks the match mode either from '*' markers in the term or
+ // from the match type set on the searcher. The match type is reset to REGULAR before
+ // each wildcard-based check, so the statement order must be kept.
+ TEST("utf8 flexible searcher"){
+ UTF8FlexibleStringFieldSearcher fs(0);
+ // regular
+ assertString(fs, "vespa", "vespa", Hits().add(0));
+ assertString(fs, "vesp", "vespa", Hits());
+ assertString(fs, "esp", "vespa", Hits());
+ assertString(fs, "espa", "vespa", Hits());
+
+ // prefix
+ assertString(fs, "vesp*", "vespa", Hits().add(0));
+ fs.setMatchType(FieldSearcher::PREFIX);
+ assertString(fs, "vesp", "vespa", Hits().add(0));
+
+ // substring
+ fs.setMatchType(FieldSearcher::REGULAR);
+ assertString(fs, "*esp*", "vespa", Hits().add(0));
+ fs.setMatchType(FieldSearcher::SUBSTRING);
+ assertString(fs, "esp", "vespa", Hits().add(0));
+
+ // suffix
+ fs.setMatchType(FieldSearcher::REGULAR);
+ assertString(fs, "*espa", "vespa", Hits().add(0));
+ fs.setMatchType(FieldSearcher::SUFFIX);
+ assertString(fs, "espa", "vespa", Hits().add(0));
+
+ fs.setMatchType(FieldSearcher::REGULAR);
+ EXPECT_TRUE(testStringFieldInfo(fs));
+ }
+
+ // Bool search: the terms "true"/"1" match a true field value and "false"/"0" match a
+ // false one; the last two checks evaluate several terms against one value.
+ TEST("bool search") {
+ BoolFieldSearcher fs(0);
+ TEST_DO(assertBool(fs, "true", true, true));
+ TEST_DO(assertBool(fs, "true", false, false));
+ TEST_DO(assertBool(fs, "1", true, true));
+ TEST_DO(assertBool(fs, "1", false, false));
+ TEST_DO(assertBool(fs, "false", true, false));
+ TEST_DO(assertBool(fs, "false", false, true));
+ TEST_DO(assertBool(fs, "0", true, false));
+ TEST_DO(assertBool(fs, "0", false, true));
+ TEST_DO(assertBool(fs, StringList().add("true").add("false").add("true"), true, BoolList().add(true).add(false).add(true)));
+ TEST_DO(assertBool(fs, StringList().add("true").add("false").add("true"), false, BoolList().add(false).add(true).add(false)));
+ }
+
+ // Integer search: exact values, '<' / '>' comparisons, inclusive "[low;high]" ranges,
+ // a non-numeric term, multiple terms, multi-value fields and the resulting field info.
+ TEST("integer search")
+ {
+ IntFieldSearcher fs(0);
+ TEST_DO(assertInt(fs, "10", 10, true));
+ TEST_DO(assertInt(fs, "9", 10, false));
+ TEST_DO(assertInt(fs, ">9", 10, true));
+ TEST_DO(assertInt(fs, ">9", 9, false));
+ TEST_DO(assertInt(fs, "<11", 10, true));
+ TEST_DO(assertInt(fs, "<11", 11, false));
+ TEST_DO(assertInt(fs, "-10", -10, true));
+ TEST_DO(assertInt(fs, "-9", -10, false));
+ TEST_DO(assertInt(fs, "a", 10, false));
+ TEST_DO(assertInt(fs, "[-5;5]", -5, true));
+ TEST_DO(assertInt(fs, "[-5;5]", 0, true));
+ TEST_DO(assertInt(fs, "[-5;5]", 5, true));
+ TEST_DO(assertInt(fs, "[-5;5]", -6, false));
+ TEST_DO(assertInt(fs, "[-5;5]", 6, false));
+
+ TEST_DO(assertInt(fs, StringList().add("9").add("11"), 10, BoolList().add(false).add(false)));
+ TEST_DO(assertInt(fs, StringList().add("9").add("10"), 10, BoolList().add(false).add(true)));
+ TEST_DO(assertInt(fs, StringList().add("10").add(">9"), 10, BoolList().add(true).add(true)));
+
+ // Multi-value fields: hit positions are the indexes of the matching elements.
+ TEST_DO(assertInt(fs, "10", LongList().add(10).add(20).add(10).add(30), Hits().add(0).add(2)));
+ TEST_DO(assertInt(fs, StringList().add("10").add("20"), LongList().add(10).add(20).add(10).add(30),
+ HitsList().add(Hits().add(0).add(2)).add(Hits().add(1))));
+
+ TEST_DO(assertFieldInfo(fs, "10", 10, QTFieldInfo(0, 1, 1)));
+ TEST_DO(assertFieldInfo(fs, "10", LongList().add(10).add(20).add(10).add(30), QTFieldInfo(0, 2, 4)));
+ TEST_DO(assertFieldInfo(fs, StringList().add("10").add("20"), 10,
+ FieldInfoList().add(QTFieldInfo(0, 1, 1)).add(QTFieldInfo(0, 0, 1))));
+ TEST_DO(assertFieldInfo(fs, StringList().add("10").add("20"), LongList().add(10).add(20).add(10).add(30),
+ FieldInfoList().add(QTFieldInfo(0, 2, 4)).add(QTFieldInfo(0, 1, 4))));
+ }
+
+ // Floating point search: mirrors the integer search checks with fractional values -
+ // exact values, strict '<' / '>' comparisons, inclusive ranges, multiple terms,
+ // multi-value fields and the resulting field info.
+ TEST("floating point search")
+ {
+ FloatFieldSearcher fs;
+ TEST_DO(assertFloat(fs, "10", 10, true));
+ TEST_DO(assertFloat(fs, "10.5", 10.5, true));
+ TEST_DO(assertFloat(fs, "-10.5", -10.5, true));
+ TEST_DO(assertFloat(fs, ">10.5", 10.6, true));
+ TEST_DO(assertFloat(fs, ">10.5", 10.5, false));
+ TEST_DO(assertFloat(fs, "<10.5", 10.4, true));
+ TEST_DO(assertFloat(fs, "<10.5", 10.5, false));
+ TEST_DO(assertFloat(fs, "10.4", 10.5, false));
+ TEST_DO(assertFloat(fs, "-10.4", -10.5, false));
+ TEST_DO(assertFloat(fs, "a", 10.5, false));
+ TEST_DO(assertFloat(fs, "[-5.5;5.5]", -5.5, true));
+ TEST_DO(assertFloat(fs, "[-5.5;5.5]", 0, true));
+ TEST_DO(assertFloat(fs, "[-5.5;5.5]", 5.5, true));
+ TEST_DO(assertFloat(fs, "[-5.5;5.5]", -5.6, false));
+ TEST_DO(assertFloat(fs, "[-5.5;5.5]", 5.6, false));
+
+ TEST_DO(assertFloat(fs, StringList().add("10").add("11"), 10.5, BoolList().add(false).add(false)));
+ TEST_DO(assertFloat(fs, StringList().add("10").add("10.5"), 10.5, BoolList().add(false).add(true)));
+ TEST_DO(assertFloat(fs, StringList().add(">10.4").add("10.5"), 10.5, BoolList().add(true).add(true)));
+
+ TEST_DO(assertFloat(fs, "10.5", FloatList().add(10.5).add(20.5).add(10.5).add(30.5), Hits().add(0).add(2)));
+ TEST_DO(assertFloat(fs, StringList().add("10.5").add("20.5"), FloatList().add(10.5).add(20.5).add(10.5).add(30.5),
+ HitsList().add(Hits().add(0).add(2)).add(Hits().add(1))));
+
+ TEST_DO(assertFieldInfo(fs, "10.5", 10.5, QTFieldInfo(0, 1, 1)));
+ TEST_DO(assertFieldInfo(fs, "10.5", FloatList().add(10.5).add(20.5).add(10.5).add(30.5), QTFieldInfo(0, 2, 4)));
+ TEST_DO(assertFieldInfo(fs, StringList().add("10.5").add("20.5"), 10.5,
+ FieldInfoList().add(QTFieldInfo(0, 1, 1)).add(QTFieldInfo(0, 0, 1))));
+ TEST_DO(assertFieldInfo(fs, StringList().add("10.5").add("20.5"), FloatList().add(10.5).add(20.5).add(10.5).add(30.5),
+ FieldInfoList().add(QTFieldInfo(0, 2, 4)).add(QTFieldInfo(0, 1, 4))));
+ }
+
+ // The snippet modifier copies the field text and wraps every match in a pair of
+ // 0x1F characters. A 0x1F already present in the input is kept unless it falls
+ // inside a match, in which case it is dropped from the output.
+ TEST("Snippet modifier search") {
+ // ascii
+ assertSnippetModifier("f", "foo", "\x1F""f\x1Foo");
+ assertSnippetModifier("o", "foo", "f\x1Fo\x1F\x1Fo\x1F");
+ assertSnippetModifier("r", "bar", "ba\x1Fr\x1F");
+ assertSnippetModifier("foo", "foo foo", "\x1F""foo\x1F \x1F""foo\x1F");
+ assertSnippetModifier("aa", "aaaaaa", "\x1F""aa\x1F\x1F""aa\x1F\x1F""aa\x1F");
+ assertSnippetModifier("ab", "abcd\x1F""efgh", "\x1F""ab\x1F""cd\x1F""efgh");
+ assertSnippetModifier("ef", "abcd\x1F""efgh", "abcd\x1F\x1F""ef\x1Fgh");
+ assertSnippetModifier("fg", "abcd\x1F""efgh", "abcd\x1F""e\x1F""fg\x1Fh");
+ // the separator overlapping the match is skipped
+ assertSnippetModifier("cdef", "abcd\x1F""efgh", "ab\x1F""cdef\x1F""gh");
+ // no hits
+ assertSnippetModifier("bb", "aaaaaa", "aaaaaa");
+
+
+ // multiple query terms
+ assertSnippetModifier(StringList().add("ab").add("cd"), "abcd", "\x1F""ab\x1F\x1F""cd\x1F");
+ // when we have overlap we only get the first match
+ assertSnippetModifier(StringList().add("ab").add("bc"), "abcd", "\x1F""ab\x1F""cd");
+ assertSnippetModifier(StringList().add("bc").add("ab"), "abcd", "\x1F""ab\x1F""cd");
+ // the separator overlapping the match is skipped
+ assertSnippetModifier(StringList().add("de").add("ef"), "abcd\x1F""efgh", "abc\x1F""de\x1F""fgh");
+
+ // cjk
+ assertSnippetModifier("\xe7\x9f\xb3", "\xe7\x9f\xb3\xe6\x98\x8e\xe5\x87\xb1\xe5\x9c\xa8",
+ "\x1f\xe7\x9f\xb3\x1f\xe6\x98\x8e\xe5\x87\xb1\xe5\x9c\xa8");
+ assertSnippetModifier("\xe6\x98\x8e\xe5\x87\xb1", "\xe7\x9f\xb3\xe6\x98\x8e\xe5\x87\xb1\xe5\x9c\xa8",
+ "\xe7\x9f\xb3\x1f\xe6\x98\x8e\xe5\x87\xb1\x1f\xe5\x9c\xa8");
+ // the separator overlapping the match is skipped
+ assertSnippetModifier("\xe6\x98\x8e\xe5\x87\xb1", "\xe7\x9f\xb3\xe6\x98\x8e\x1f\xe5\x87\xb1\xe5\x9c\xa8",
+ "\xe7\x9f\xb3\x1f\xe6\x98\x8e\xe5\x87\xb1\x1f\xe5\x9c\xa8");
+
+ { // check that resizing works
+ UTF8SubstringSnippetModifier mod;
+ EXPECT_EQUAL(mod.getModifiedBuf().getLength(), 32u);
+ EXPECT_EQUAL(mod.getModifiedBuf().getPos(), 0u);
+ performSearch(mod, StringList().add("a"), StringFieldValue("aaaaaaaaaaaaaaaa"));
+ // 16 input characters, each wrapped in two markers, exceed the initial length of 32.
+ EXPECT_EQUAL(mod.getModifiedBuf().getPos(), 16u + 2 * 16u);
+ EXPECT_TRUE(mod.getModifiedBuf().getLength() >= mod.getModifiedBuf().getPos());
+ }
+ }
+
+ // SnippetModifier on whole field values: matches are wrapped in 0x1F as in the search
+ // test, and the elements of a collection value are joined with 0x1E in the output.
+ TEST("snippet modifier") {
+ { // string field value
+ SnippetModifierSetup sms(StringList().add("ab"));
+ // multiple invocations
+ assertSnippetModifier(sms, StringFieldValue("ab"), "\x1F""ab\x1F");
+ assertSnippetModifier(sms, StringFieldValue("xxxxabxxxxabxxxx"), "xxxx\x1F""ab\x1Fxxxx\x1F""ab\x1Fxxxx");
+ assertSnippetModifier(sms, StringFieldValue("xxabxx"), "xx\x1F""ab\x1Fxx");
+ }
+ { // collection field value
+ SnippetModifierSetup sms(StringList().add("ab"));
+ // multiple invocations
+ assertSnippetModifier(sms, getFieldValue(StringList().add("ab")), "\x1F""ab\x1F");
+ assertSnippetModifier(sms, getFieldValue(StringList().add("xxabxx")), "xx\x1F""ab\x1Fxx");
+ assertSnippetModifier(sms, getFieldValue(StringList().add("ab").add("xxabxx").add("xxxxxx")),
+ "\x1F""ab\x1F\x1E""xx\x1F""ab\x1F""xx\x1E""xxxxxx");
+ assertSnippetModifier(sms, getFieldValue(StringList().add("cd").add("ef").add("gh")),
+ "cd\x1E""ef\x1E""gh");
+ }
+ { // check that resizing works
+ SnippetModifierSetup sms(StringList().add("a"));
+ EXPECT_EQUAL(sms.modifier.getValueBuf().getLength(), 32u);
+ EXPECT_EQUAL(sms.modifier.getValueBuf().getPos(), 0u);
+ sms.modifier.modify(StringFieldValue("aaaaaaaaaaaaaaaa"));
+ // 16 input characters plus two markers per character.
+ EXPECT_EQUAL(sms.modifier.getValueBuf().getPos(), 16u + 2 * 16u);
+ EXPECT_TRUE(sms.modifier.getValueBuf().getLength() >= sms.modifier.getValueBuf().getPos());
+ }
+ }
+
+ // A default-constructed FieldSearchSpec is invalid with id 0, an empty name and a
+ // max length of 0x100000; a fully specified one is valid and hands its max length
+ // on to the searcher it creates.
+ TEST("FieldSearchSpec construction") {
+     {
+         FieldSearchSpec f;
+         EXPECT_FALSE(f.valid());
+         EXPECT_EQUAL(0u, f.id());
+         EXPECT_EQUAL("", f.name());
+         EXPECT_EQUAL(0x100000u, f.maxLength());
+     }
+     {
+         FieldSearchSpec f(7, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 789);
+         EXPECT_TRUE(f.valid());
+         EXPECT_EQUAL(7u, f.id());
+         EXPECT_EQUAL("f0", f.name());
+         EXPECT_EQUAL(789u, f.maxLength());
+         EXPECT_EQUAL(789u, f.searcher().maxFieldLength());
+     }
+ }
+
+ // SnippetModifierManager::setup() is expected to create modifiers only for fields that
+ // are searched with substring semantics: field 0 is configured as "substring", field 1
+ // is not and only gets a modifier for terms written as *term*.
+ // Index i0 maps to field 0, i1 to field 1 and i2 to both fields.
+ TEST("snippet modifier manager") {
+ FieldSearchSpecMapT specMap;
+ specMap[0] = FieldSearchSpec(0, "f0", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "substring", 1000);
+ specMap[1] = FieldSearchSpec(1, "f1", VsmfieldsConfig::Fieldspec::Searchmethod::AUTOUTF8, "", 1000);
+ IndexFieldMapT indexMap;
+ indexMap["i0"].push_back(0);
+ indexMap["i1"].push_back(1);
+ indexMap["i2"].push_back(0);
+ indexMap["i2"].push_back(1);
+
+ {
+ SnippetModifierManager man;
+ Query query(StringList().add("i0:foo"));
+ man.setup(query.qtl, specMap, indexMap);
+ assertQueryTerms(man, 0, StringList().add("foo"));
+ assertQueryTerms(man, 1, StringList());
+ }
+ {
+ SnippetModifierManager man;
+ Query query(StringList().add("i1:foo"));
+ man.setup(query.qtl, specMap, indexMap);
+ assertQueryTerms(man, 0, StringList());
+ assertQueryTerms(man, 1, StringList());
+ }
+ {
+ SnippetModifierManager man;
+ Query query(StringList().add("i1:*foo*"));
+ man.setup(query.qtl, specMap, indexMap);
+ assertQueryTerms(man, 0, StringList());
+ assertQueryTerms(man, 1, StringList().add("foo"));
+ }
+ {
+ SnippetModifierManager man;
+ Query query(StringList().add("i2:foo").add("i2:*bar*"));
+ man.setup(query.qtl, specMap, indexMap);
+ assertQueryTerms(man, 0, StringList().add("foo").add("bar"));
+ assertQueryTerms(man, 1, StringList().add("bar"));
+ }
+ { // check buffer sizes
+ // NOTE(review): the expected initial lengths (128 / 64) are implementation constants
+ // not visible in this file - confirm against SnippetModifierManager::setup().
+ SnippetModifierManager man;
+ Query query(StringList().add("i2:foo").add("i2:*bar*"));
+ man.setup(query.qtl, specMap, indexMap);
+ {
+ SnippetModifier * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(0));
+ UTF8SubstringSnippetModifier * searcher = sm->getSearcher().get();
+ EXPECT_EQUAL(sm->getValueBuf().getLength(), 128u);
+ EXPECT_EQUAL(searcher->getModifiedBuf().getLength(), 64u);
+ }
+ {
+ SnippetModifier * sm = static_cast<SnippetModifier *>(man.getModifiers().getModifier(1));
+ UTF8SubstringSnippetModifier * searcher = sm->getSearcher().get();
+ EXPECT_EQUAL(sm->getValueBuf().getLength(), 128u);
+ EXPECT_EQUAL(searcher->getModifiedBuf().getLength(), 64u);
+ }
+ }
+ }
+
+ // stripNonFields() removes a numeric array index ("f[0]" -> "f") and rewrites a map
+ // key lookup to ".value" ("f{a}" -> "f.value"). A non-numeric index and an unquoted
+ // key containing a space are left unchanged.
+ TEST("Stripping of indexes")
+ {
+ EXPECT_EQUAL("f", FieldSearchSpecMap::stripNonFields("f"));
+ EXPECT_EQUAL("f", FieldSearchSpecMap::stripNonFields("f[0]"));
+ EXPECT_EQUAL("f[a]", FieldSearchSpecMap::stripNonFields("f[a]"));
+
+ EXPECT_EQUAL("f.value", FieldSearchSpecMap::stripNonFields("f{a}"));
+ EXPECT_EQUAL("f.value", FieldSearchSpecMap::stripNonFields("f{a0}"));
+ EXPECT_EQUAL("f{a 0}", FieldSearchSpecMap::stripNonFields("f{a 0}"));
+ EXPECT_EQUAL("f.value", FieldSearchSpecMap::stripNonFields("f{\"a 0\"}"));
+ }
+
+ // countWords() counts words only: an empty string and a lone "?" give 0, and a "?"
+ // before or after real words does not add to the count.
+ TEST("counting of words") {
+ EXPECT_TRUE(assertCountWords(0, ""));
+ EXPECT_TRUE(assertCountWords(0, "?"));
+ EXPECT_TRUE(assertCountWords(1, "foo"));
+ EXPECT_TRUE(assertCountWords(2, "foo bar"));
+ EXPECT_TRUE(assertCountWords(2, "? foo bar"));
+ EXPECT_TRUE(assertCountWords(2, "foo bar ?"));
+
+ // check that 'a' is counted as 1 word
+ // ("bb" is therefore expected at position 2: "a" = 0, "aa" = 1, "bb" = 2)
+ UTF8StrChrFieldSearcher fs(0);
+ StringList field = StringList().add("a").add("aa bb cc");
+ assertString(fs, "bb", field, Hits().add(2));
+ assertString(fs, StringList().add("bb").add("not"), field, HitsList().add(Hits().add(2)).add(Hits()));
+ }
+
+ // Test program entry point; TEST_RUN_ALL() presumably runs every TEST() defined above.
+ TEST_MAIN() { TEST_RUN_ALL(); }
diff --git a/streamingvisitors/src/tests/searchvisitor/CMakeLists.txt b/streamingvisitors/src/tests/searchvisitor/CMakeLists.txt
index fdbd60ce30a..01b625b6b3b 100644
--- a/streamingvisitors/src/tests/searchvisitor/CMakeLists.txt
+++ b/streamingvisitors/src/tests/searchvisitor/CMakeLists.txt
@@ -3,6 +3,6 @@ vespa_add_executable(streamingvisitors_searchvisitor_test_app TEST
SOURCES
searchvisitor_test.cpp
DEPENDS
- streamingvisitors_searchvisitor
+ streamingvisitors
)
vespa_add_test(NAME streamingvisitors_searchvisitor_test_app COMMAND streamingvisitors_searchvisitor_test_app)
diff --git a/streamingvisitors/src/tests/textutil/.gitignore b/streamingvisitors/src/tests/textutil/.gitignore
new file mode 100644
index 00000000000..1103f79800a
--- /dev/null
+++ b/streamingvisitors/src/tests/textutil/.gitignore
@@ -0,0 +1,4 @@
+.depend
+Makefile
+textutil_test
+vsm_textutil_test_app
diff --git a/streamingvisitors/src/tests/textutil/CMakeLists.txt b/streamingvisitors/src/tests/textutil/CMakeLists.txt
new file mode 100644
index 00000000000..59817d01137
--- /dev/null
+++ b/streamingvisitors/src/tests/textutil/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(vsm_textutil_test_app TEST
+ SOURCES
+ textutil.cpp
+ DEPENDS
+ streamingvisitors
+)
+vespa_add_test(NAME vsm_textutil_test_app COMMAND vsm_textutil_test_app)
diff --git a/streamingvisitors/src/tests/textutil/textutil.cpp b/streamingvisitors/src/tests/textutil/textutil.cpp
new file mode 100644
index 00000000000..2a1390eaa01
--- /dev/null
+++ b/streamingvisitors/src/tests/textutil/textutil.cpp
@@ -0,0 +1,285 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/vespalib/testkit/testapp.h>
+
+#include <vespa/fastlib/text/unicodeutil.h>
+#include <vespa/searchlib/query/base.h>
+#include <vespa/vsm/searcher/fold.h>
+#include <vespa/vsm/searcher/futf8strchrfieldsearcher.h>
+#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
+
+using search::byte; // unsigned char
+
+namespace vsm {
+
+ /**
+  * std::vector with a chainable append, so expected values can be written inline
+  * as Vector<T>().a(x).a(y).
+  */
+ template <typename T>
+ class Vector : public std::vector<T>
+ {
+ public:
+     Vector() = default;
+
+     /** Appends 'v' and returns this vector to allow further chained calls. */
+     Vector<T> & a(T v) {
+         std::vector<T>::push_back(v);
+         return *this;
+     }
+ };
+
+ // Short aliases used throughout the tests below.
+ using UCS4V = Vector<ucs4_t>;               // expected UCS-4 output characters
+ using SizeV = Vector<size_t>;               // expected source offsets
+ using SFSB = UTF8StringFieldSearcherBase;
+ using FSFS = FUTF8StrChrFieldSearcher;
+
+ // Test application for the text utilities used by the string field searchers:
+ // separator skipping, separator classification and the ASCII folding routines.
+ // The sse2 folding checks are only compiled on x86_64.
+ class TextUtilTest : public vespalib::TestApp
+ {
+ private:
+ // Helpers that run one operation and compare the outcome with the expected values.
+ ucs4_t getUTF8Char(const char * src);
+ template <typename BW, bool OFF>
+ void assertSkipSeparators(const char * input, size_t len, const UCS4V & expdstbuf, const SizeV & expoffsets);
+ void assertAnsiFold(const std::string & toFold, const std::string & exp);
+ void assertAnsiFold(char c, char exp);
+ #ifdef __x86_64__
+ void assert_sse2_foldua(const std::string & toFold, size_t charFolded, const std::string & exp);
+ void assert_sse2_foldua(unsigned char c, unsigned char exp, size_t charFolded = 16);
+ #endif
+
+ // The test cases, invoked from Main().
+ template <typename BW, bool OFF>
+ void testSkipSeparators();
+ void testSkipSeparators();
+ void testSeparatorCharacter();
+ void testAnsiFold();
+ void test_lfoldua();
+ #ifdef __x86_64__
+ void test_sse2_foldua();
+ #endif
+
+ public:
+ int Main() override;
+ };
+
+ /**
+  * Decodes the UTF-8 sequence starting at 'src' into one UCS-4 character and
+  * asserts that the decoder did not report a bad character.
+  */
+ ucs4_t
+ TextUtilTest::getUTF8Char(const char * src)
+ {
+     const ucs4_t retval = Fast_UnicodeUtil::GetUTF8Char(src);
+     ASSERT_TRUE(retval != Fast_UnicodeUtil::_BadUTF8Char);
+     return retval;
+ }
+
+ /**
+  * Runs skipSeparators() on the first 'len' bytes of 'input' through a buffer wrapper
+  * of type BW and compares the produced characters with 'expdstbuf'. When OFF is true
+  * the recorded source offsets are compared with 'expoffsets' as well.
+  */
+ template <typename BW, bool OFF>
+ void
+ TextUtilTest::assertSkipSeparators(const char * input, size_t len, const UCS4V & expdstbuf, const SizeV & expoffsets)
+ {
+     // One extra slot beyond 'len' in both output arrays, as in the original setup.
+     auto dstbuf = std::make_unique<ucs4_t[]>(len + 1);
+     auto offsets = std::make_unique<size_t[]>(len + 1);
+     const byte * srcbuf = reinterpret_cast<const byte *>(input);
+
+     UTF8StrChrFieldSearcher fs;
+     BW bw(dstbuf.get(), offsets.get());
+     const size_t dstlen = fs.skipSeparators(srcbuf, len, bw);
+
+     EXPECT_EQUAL(dstlen, expdstbuf.size());
+     ASSERT_TRUE(dstlen == expdstbuf.size());
+     for (size_t i = 0; i < dstlen; ++i) {
+         EXPECT_EQUAL(dstbuf[i], expdstbuf[i]);
+         if (OFF) {
+             EXPECT_EQUAL(offsets[i], expoffsets[i]);
+         }
+     }
+ }
+
+ /**
+  * Folds 'toFold' with FSFS::ansiFold() and expects the call to succeed and the
+  * folded output to equal 'exp'.
+  */
+ void
+ TextUtilTest::assertAnsiFold(const std::string & toFold, const std::string & exp)
+ {
+     char folded[256];
+     // The output buffer has a fixed size; refuse inputs that would overflow it.
+     ASSERT_TRUE(toFold.size() <= sizeof(folded));
+     EXPECT_TRUE(FSFS::ansiFold(toFold.c_str(), toFold.size(), folded));
+     EXPECT_EQUAL(std::string(folded, toFold.size()), exp);
+ }
+
+ /**
+  * Folds the single character 'c' with FSFS::ansiFold() and expects the call to
+  * succeed and produce 'exp'.
+  */
+ void
+ TextUtilTest::assertAnsiFold(char c, char exp)
+ {
+     char folded;
+     const char * const src = &c;
+     EXPECT_TRUE(FSFS::ansiFold(src, 1, &folded));
+     // Compared as integers, as in the original check.
+     EXPECT_EQUAL((int32_t)folded, (int32_t)exp);
+ }
+
+#ifdef __x86_64__
+ /**
+  * Folds 'toFold' with sse2_foldua() into a 16-byte aligned destination and expects
+  * that 'charFolded' input bytes were consumed and that the folded output equals 'exp'.
+  */
+ void
+ TextUtilTest::assert_sse2_foldua(const std::string & toFold, size_t charFolded, const std::string & exp)
+ {
+     char folded[256];
+     // Offset of the first 16-byte aligned address inside 'folded' (0..15).
+     size_t alignedStart = 0xF - (size_t(folded + 0xF) % 0x10);
+     // The output buffer has a fixed size; refuse inputs that could overflow it.
+     ASSERT_TRUE(alignedStart + toFold.size() <= sizeof(folded));
+     const unsigned char * toFoldOrg = reinterpret_cast<const unsigned char *>(toFold.c_str());
+     const unsigned char * retval =
+         sse2_foldua(toFoldOrg, toFold.size(), reinterpret_cast<unsigned char *>(folded + alignedStart));
+     // The returned pointer marks how far into the input the folding got.
+     EXPECT_EQUAL((size_t)(retval - toFoldOrg), charFolded);
+     EXPECT_EQUAL(std::string(folded + alignedStart, charFolded), exp);
+ }
+
+ /**
+  * Folds a block of 16 copies of 'c' with sse2_foldua() and expects that 'charFolded'
+  * input bytes were consumed and that each of them was folded to 'exp'.
+  */
+ void
+ TextUtilTest::assert_sse2_foldua(unsigned char c, unsigned char exp, size_t charFolded)
+ {
+     unsigned char toFold[16];
+     memset(toFold, c, sizeof(toFold));
+
+     // 32 bytes leave room for 16 output bytes behind an alignment offset of up to 15.
+     unsigned char folded[32];
+     size_t alignedStart = 0xF - (size_t(folded + 0xF) % 0x10);
+     unsigned char * const dst = folded + alignedStart;
+
+     const unsigned char * retval = sse2_foldua(toFold, sizeof(toFold), dst);
+     EXPECT_EQUAL((size_t)(retval - toFold), charFolded);
+     for (size_t i = 0; i < charFolded; ++i) {
+         EXPECT_EQUAL((int32_t)folded[i + alignedStart], (int32_t)exp);
+     }
+ }
+#endif
+
+ // Checks skipSeparators() for one buffer wrapper type: 0x1F is dropped while '\n' and
+ // '\t' are kept, multi-byte UTF-8 characters yield a single output character, and
+ // sharp s is replaced by "ss" with both output characters mapped to the same source offset.
+ template <typename BW, bool OFF>
+ void
+ TextUtilTest::testSkipSeparators()
+ {
+ // ascii characters
+ assertSkipSeparators<BW, OFF>("foo", 3, UCS4V().a('f').a('o').a('o'), SizeV().a(0).a(1).a(2));
+ assertSkipSeparators<BW, OFF>("f\x1Fo", 3, UCS4V().a('f').a('o'), SizeV().a(0).a(2));
+ assertSkipSeparators<BW, OFF>("f\no", 3, UCS4V().a('f').a('\n').a('o'), SizeV().a(0).a(1).a(2));
+ assertSkipSeparators<BW, OFF>("f\to", 3, UCS4V().a('f').a('\t').a('o'), SizeV().a(0).a(1).a(2));
+
+ // utf8 char
+ assertSkipSeparators<BW, OFF>("\xC2\x80\x66", 3, UCS4V().a(getUTF8Char("\xC2\x80")).a('f'),
+ SizeV().a(0).a(2));
+ assertSkipSeparators<BW, OFF>("\xE0\xA0\x80\x66", 4, UCS4V().a(getUTF8Char("\xE0\xA0\x80")).a('f'),
+ SizeV().a(0).a(3));
+ assertSkipSeparators<BW, OFF>("\xF0\x90\x80\x80\x66", 5, UCS4V().a(getUTF8Char("\xF0\x90\x80\x80")).a('f'),
+ SizeV().a(0).a(4));
+
+ // replacement string (sharp s -> ss)
+ assertSkipSeparators<BW, OFF>("\xC3\x9F\x66\xC3\x9F", 5, UCS4V().a('s').a('s').a('f').a('s').a('s'),
+ SizeV().a(0).a(0).a(2).a(3).a(3));
+ }
+
+ // Enables sharp s substitution in the word folder first (the templated checks expect
+ // it), then runs them once without and once with offset tracking.
+ void
+ TextUtilTest::testSkipSeparators()
+ {
+ Fast_NormalizeWordFolder::Setup(Fast_NormalizeWordFolder::DO_SHARP_S_SUBSTITUTION);
+
+ testSkipSeparators<SFSB::BufferWrapper, false>();
+ testSkipSeparators<SFSB::OffsetWrapper, true>();
+ }
+
+ // All characters 0x00-0x1f are expected to be separator characters except tab (0x09)
+ // and newline (0x0a); space (0x20) is not a separator either.
+ // Every character is checked on its own line so a failure names the offending value.
+ void
+ TextUtilTest::testSeparatorCharacter()
+ {
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x00'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x01'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x02'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x03'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x04'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x05'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x06'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x07'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x08'));
+ EXPECT_TRUE(! SFSB::isSeparatorCharacter('\x09')); // '\t'
+ EXPECT_TRUE(! SFSB::isSeparatorCharacter('\x0a')); // '\n'
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0b'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0c'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0d'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0e'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0f'));
+
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x10'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x11'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x12'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x13'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x14'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x15'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x16'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x17'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x18'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x19'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1a'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1b'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1c'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1d'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1e'));
+ EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1f'));
+
+ EXPECT_TRUE(! SFSB::isSeparatorCharacter('\x20')); // space
+ }
+
+ // ansiFold() expectations for 7-bit input: lowercase letters and digits are kept,
+ // uppercase letters are lowercased (+32) and every other character folds to 0.
+ // For input bytes 128-255 the call is expected to return false.
+ void
+ TextUtilTest::testAnsiFold()
+ {
+ FieldSearcher::init();
+ assertAnsiFold("", "");
+ assertAnsiFold("ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz");
+ assertAnsiFold("abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz");
+ assertAnsiFold("0123456789", "0123456789");
+ for (int i = 0; i < 128; ++i) {
+ if ((i >= 'a' && i <= 'z') || (i >= '0' && i <= '9')) {
+ assertAnsiFold(i, i);
+ } else if (i >= 'A' && i <= 'Z') {
+ assertAnsiFold(i, i + 32);
+ } else {
+ assertAnsiFold(i, 0);
+ }
+ }
+
+ // non-ascii is ignored
+ for (int i = 128; i < 256; ++i) {
+ char toFold = i;
+ char folded;
+ EXPECT_TRUE(!FSFS::ansiFold(&toFold, 1, &folded));
+ }
+ }
+
+ // lfoldua() is expected to lowercase the uppercase alphabet. 'alignedStart' is passed
+ // in as 0 and then used as the offset of the folded text inside 'folded'.
+ // NOTE(review): lfoldua presumably updates alignedStart to an aligned offset - confirm.
+ void
+ TextUtilTest::test_lfoldua()
+ {
+ FieldSearcher::init();
+ char folded[256];
+ size_t alignedStart = 0;
+ const char * toFold = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
+ size_t len = strlen(toFold);
+ EXPECT_TRUE(FSFS::lfoldua(toFold, len, folded, alignedStart));
+ EXPECT_EQUAL(std::string(folded + alignedStart, len), "abcdefghijklmnopqrstuvwxyz");
+ }
+
+#ifdef __x86_64__
+ // sse2_foldua() expectations: input is consumed in whole blocks of 16 bytes (fewer
+ // than 16 bytes fold nothing, 17 bytes fold 16), with the same per-character folding
+ // as ansiFold for 7-bit input. A block of bytes >= 128 is expected to fold nothing.
+ void
+ TextUtilTest::test_sse2_foldua()
+ {
+ assert_sse2_foldua("", 0, "");
+ assert_sse2_foldua("ABCD", 0, "");
+ assert_sse2_foldua("ABCDEFGHIJKLMNO", 0, "");
+ assert_sse2_foldua("ABCDEFGHIJKLMNOP", 16, "abcdefghijklmnop");
+ assert_sse2_foldua("ABCDEFGHIJKLMNOPQ", 16, "abcdefghijklmnop");
+ assert_sse2_foldua("KLMNOPQRSTUVWXYZ", 16, "klmnopqrstuvwxyz");
+ assert_sse2_foldua("abcdefghijklmnop", 16, "abcdefghijklmnop");
+ assert_sse2_foldua("klmnopqrstuvwxyz", 16, "klmnopqrstuvwxyz");
+ assert_sse2_foldua("0123456789abcdef", 16, "0123456789abcdef");
+
+ for (int i = 0; i < 128; ++i) {
+ if ((i >= 'a' && i <= 'z') || (i >= '0' && i <= '9')) {
+ assert_sse2_foldua(i, i);
+ } else if (i >= 'A' && i <= 'Z') {
+ assert_sse2_foldua(i, i + 32);
+ } else {
+ assert_sse2_foldua(i, 0);
+ }
+ }
+
+ // non-ascii is ignored
+ // (charFolded is 0 here, so the expected character '?' is never compared)
+ for (int i = 128; i < 256; ++i) {
+ assert_sse2_foldua(i, '?', 0);
+ }
+ }
+#endif
+
+ // Test application entry point: runs all test cases in sequence between
+ // TEST_INIT and TEST_DONE. The sse2 test is only included on x86_64.
+ int
+ TextUtilTest::Main()
+ {
+ TEST_INIT("textutil_test");
+
+ testSkipSeparators();
+ testSeparatorCharacter();
+ testAnsiFold();
+ test_lfoldua();
+ #ifdef __x86_64__
+ test_sse2_foldua();
+ #endif
+
+ TEST_DONE();
+ }
+
+}
+
+TEST_APPHOOK(vsm::TextUtilTest);
diff --git a/streamingvisitors/src/tests/utilapps/.gitignore b/streamingvisitors/src/tests/utilapps/.gitignore
new file mode 100644
index 00000000000..e69de29bb2d
--- /dev/null
+++ b/streamingvisitors/src/tests/utilapps/.gitignore
diff --git a/streamingvisitors/src/vespa/searchvisitor/CMakeLists.txt b/streamingvisitors/src/vespa/searchvisitor/CMakeLists.txt
index e8f85fc987e..ff629462f9e 100644
--- a/streamingvisitors/src/vespa/searchvisitor/CMakeLists.txt
+++ b/streamingvisitors/src/vespa/searchvisitor/CMakeLists.txt
@@ -1,5 +1,5 @@
# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-vespa_add_library(streamingvisitors_searchvisitor
+vespa_add_library(streamingvisitors
SOURCES
hitcollector.cpp
indexenvironment.cpp
@@ -11,6 +11,10 @@ vespa_add_library(streamingvisitors_searchvisitor
rankprocessor.cpp
searchenvironment.cpp
searchvisitor.cpp
+ $<TARGET_OBJECTS:vsm_vconfig>
+ $<TARGET_OBJECTS:vsm_vsmbase>
+ $<TARGET_OBJECTS:vsm_vsmcommon>
+ $<TARGET_OBJECTS:vsm_vsmsearcher>
INSTALL lib64
DEPENDS
searchlib_searchlib_uca
diff --git a/streamingvisitors/src/vespa/vsm/.gitignore b/streamingvisitors/src/vespa/vsm/.gitignore
new file mode 100644
index 00000000000..4c5f5d9ef7a
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/.gitignore
@@ -0,0 +1,3 @@
+.depend
+Makefile
+/libvsm.so.5.1
diff --git a/streamingvisitors/src/vespa/vsm/common/.gitignore b/streamingvisitors/src/vespa/vsm/common/.gitignore
new file mode 100644
index 00000000000..95bc02923a9
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/common/.gitignore
@@ -0,0 +1,5 @@
+*.exe
+*.ilk
+*.pdb
+.depend*
+Makefile
diff --git a/streamingvisitors/src/vespa/vsm/common/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/common/CMakeLists.txt
new file mode 100644
index 00000000000..4570a9b581e
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/common/CMakeLists.txt
@@ -0,0 +1,10 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(vsm_vsmcommon OBJECT
+ SOURCES
+ charbuffer.cpp
+ document.cpp
+ documenttypemapping.cpp
+ fieldmodifier.cpp
+ storagedocument.cpp
+ DEPENDS
+)
diff --git a/streamingvisitors/src/vespa/vsm/common/charbuffer.cpp b/streamingvisitors/src/vespa/vsm/common/charbuffer.cpp
new file mode 100644
index 00000000000..b8fbb5c8846
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/common/charbuffer.cpp
@@ -0,0 +1,32 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "charbuffer.h"
+#include <cstring>
+
+namespace vsm {
+
+// Creates the backing vector with len elements and starts writing at offset 0.
+CharBuffer::CharBuffer(size_t len) :
+ _buffer(len),
+ _pos(0)
+{ }
+
+/**
+ * Copies n bytes from src to the current position and advances the position
+ * by n. When fewer than n bytes remain, the buffer is first grown to
+ * (position + 2 * n) bytes. A zero-length put does nothing.
+ **/
+void
+CharBuffer::put(const char * src, size_t n)
+{
+    if (n == 0) {
+        // Nothing to copy. Returning early also avoids indexing into an empty
+        // vector and calling memcpy with a pointer that may not be valid.
+        return;
+    }
+    if (n > getRemaining()) {
+        resize(_pos + (n * 2));
+    }
+    memcpy(_buffer.data() + _pos, src, n);
+    _pos += n;
+}
+
+/**
+ * Grows the buffer to len bytes. Requests that are not larger than the
+ * current length are ignored, so the buffer never shrinks.
+ **/
+void
+CharBuffer::resize(size_t len)
+{
+    const bool mustGrow = (len > getLength());
+    if ( ! mustGrow) {
+        return;
+    }
+    _buffer.resize(len);
+}
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/common/charbuffer.h b/streamingvisitors/src/vespa/vsm/common/charbuffer.h
new file mode 100644
index 00000000000..08618a9b973
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/common/charbuffer.h
@@ -0,0 +1,52 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vector>
+#include <memory>
+
+namespace vsm {
+
+/**
+ * Simple growable char buffer with a write position.
+ * The buffer only grows; reset() rewinds the write position and leaves the
+ * storage untouched.
+ **/
+class CharBuffer
+{
+private:
+    std::vector<char> _buffer; // backing storage, its size is the buffer length
+    size_t            _pos;    // offset where the next put() will write
+
+public:
+    using SP = std::shared_ptr<CharBuffer>;
+
+    /**
+     * Creates a char buffer with len bytes.
+     **/
+    CharBuffer(size_t len = 0);
+
+    /**
+     * Copies n bytes from the src array into the underlying buffer at the
+     * current position, and updates the position accordingly.
+     * Resizing will occur if needed.
+     **/
+    void put(const char * src, size_t n);
+
+    /**
+     * Resizes the buffer so that the new length becomes len.
+     * Resizing will not occur if len < current length.
+     **/
+    void resize(size_t len);
+
+    /**
+     * Resets the position to the beginning of the buffer.
+     **/
+    void reset() { _pos = 0; }
+
+    /**
+     * Returns the start of the underlying storage.
+     * data() is used instead of &_buffer[0] because indexing element 0 of an
+     * empty vector is undefined, and the default constructed buffer is empty.
+     **/
+    const char * getBuffer() const { return _buffer.data(); }
+    /** Total number of bytes in the buffer. */
+    size_t getLength() const { return _buffer.size(); }
+    /** Offset of the next byte to be written. */
+    size_t getPos() const { return _pos; }
+    /** Number of bytes between the current position and the end of the buffer. */
+    size_t getRemaining() const { return getLength() - getPos(); }
+    /** Appends a single character. */
+    void put(char c) { put(&c, 1); }
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/common/docsum.h b/streamingvisitors/src/vespa/vsm/common/docsum.h
new file mode 100644
index 00000000000..49b84cb0783
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/common/docsum.h
@@ -0,0 +1,22 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "document.h"
+
+namespace vsm {
+
+/**
+ * Interface to a cache of document summaries.
+ * The actual docsums are meant to be generated on the fly when requested;
+ * this interface only hands out the document that belongs to a document id.
+ */
+class IDocSumCache
+{
+public:
+    /// Returns the document to build the summary from for the given document id.
+    virtual const Document & getDocSum(const search::DocumentIdT & docId) const = 0;
+    virtual ~IDocSumCache() = default;
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/common/document.cpp b/streamingvisitors/src/vespa/vsm/common/document.cpp
new file mode 100644
index 00000000000..a345c82ce2d
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/common/document.cpp
@@ -0,0 +1,73 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "document.h"
+#include <vespa/vespalib/stllike/asciistream.h>
+#include <vespa/vespalib/stllike/hash_map.hpp>
+
+using search::DocumentIdT;
+using search::TimeT;
+using document::FieldValue;
+
+namespace vsm
+{
+
+/**
+ * Writes a field reference as its size, followed by its bytes and " : ".
+ * Exactly f.size() bytes are written. The data is not streamed as a C string,
+ * since a string reference carries its own length and the bytes it points to
+ * are not known to end with a NUL.
+ */
+vespalib::asciistream & operator << (vespalib::asciistream & os, const FieldRef & f)
+{
+    os << f.size();
+    if (const char * bytes = f.data()) {
+        os.write(bytes, f.size());
+    }
+    os << " : ";
+    return os;
+}
+
+/// Dumps the map as one "name = id" line per entry, in map order.
+vespalib::asciistream & operator << (vespalib::asciistream & os, const StringFieldIdTMap & f)
+{
+    for (const auto & entry : f._map) {
+        os << entry.first << " = " << entry.second << '\n';
+    }
+    return os;
+}
+
+/// Starts out with an empty name to id map.
+StringFieldIdTMap::StringFieldIdTMap()
+    : _map()
+{ }
+
+/// Maps field name s to fieldId, replacing any id registered for s before.
+void StringFieldIdTMap::add(const vespalib::string & s, FieldIdT fieldId)
+{
+    FieldIdT & slot = _map[s];
+    slot = fieldId;
+}
+
+/// Registers field name s with the current map size as its id.
+/// A name that is already present keeps the id it has.
+void StringFieldIdTMap::add(const vespalib::string & s)
+{
+    if (_map.count(s) != 0) {
+        return;
+    }
+    FieldIdT nextId = _map.size();
+    _map[s] = nextId;
+}
+
+/// Returns the id registered for fName, or npos when the name is unknown.
+FieldIdT StringFieldIdTMap::fieldNo(const vespalib::string & fName) const
+{
+    const auto pos = _map.find(fName);
+    if (pos == _map.end()) {
+        return npos;
+    }
+    return pos->second;
+}
+
+/// Returns one more than the largest registered field id, i.e. the number of
+/// slots needed to index by field id. Returns 0 for an empty map.
+size_t StringFieldIdTMap::highestFieldNo() const
+{
+    size_t limit = 0;
+    for (const auto & entry : _map) {
+        const FieldIdT id = entry.second;
+        if (id < limit) {
+            continue;
+        }
+        limit = id + 1;
+    }
+    return limit;
+}
+
+// Defined out of line; there is nothing to release here.
+Document::~Document() = default;
+
+}
+
+VESPALIB_HASH_MAP_INSTANTIATE(vespalib::string, vsm::FieldIdTList);
+VESPALIB_HASH_MAP_INSTANTIATE(vespalib::string, vsm::IndexFieldMapT);
diff --git a/streamingvisitors/src/vespa/vsm/common/document.h b/streamingvisitors/src/vespa/vsm/common/document.h
new file mode 100644
index 00000000000..8c11d27072b
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/common/document.h
@@ -0,0 +1,68 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/searchlib/query/base.h>
+#include <vespa/document/fieldvalue/fieldvalue.h>
+#include <vespa/vespalib/stllike/hash_map.h>
+#include <map>
+
+namespace vespalib {
+ class asciistream;
+}
+
+namespace vsm {
+
+/// Type to identify fields in documents.
+typedef unsigned int FieldIdT;
+/// A type to represent a list of FieldIds.
+typedef std::vector<FieldIdT> FieldIdTList;
+/// A type to represent all the fields contained in all the indexes.
+typedef vespalib::hash_map<vespalib::string, FieldIdTList> IndexFieldMapT;
+/// A type to represent all the fields contained in all the indexes in all the document types.
+typedef vespalib::hash_map<vespalib::string, IndexFieldMapT> DocumentTypeIndexFieldMapT;
+/// A type to represent a map from fieldname to fieldid.
+typedef std::map<vespalib::string, FieldIdT> StringFieldIdTMapT;
+
+/// Wraps a map from field name to field id.
+class StringFieldIdTMap
+{
+ public:
+ /// Value returned by fieldNo() for a name that is not in the map.
+ enum { npos=0xFFFFFFFF };
+ StringFieldIdTMap();
+ /// Looks up the id of the named field; npos if the name is unknown.
+ FieldIdT fieldNo(const vespalib::string & fName) const;
+ /// Adds the name with the current map size as id, unless the name is already present.
+ void add(const vespalib::string & s);
+ /// Maps the name to the given id, overwriting an existing entry.
+ void add(const vespalib::string & s, FieldIdT fNo);
+ const StringFieldIdTMapT & map() const { return _map; }
+ /// Returns the largest id in the map plus one (0 for an empty map).
+ size_t highestFieldNo() const;
+ friend vespalib::asciistream & operator << (vespalib::asciistream & os, const StringFieldIdTMap & f);
+ private:
+ StringFieldIdTMapT _map;
+};
+
+typedef vespalib::stringref FieldRef;
+
+/**
+ This is the base class representing a document. It gives a document some
+ basic properties: a document id and the number of field slots it was
+ created with. Access to the fields themselves is left to subclasses.
+*/
+class Document
+{
+ public:
+ /// Creates a document with id 0 and room for maxFieldCount fields.
+ Document(size_t maxFieldCount) : _docId(0), _fieldCount(maxFieldCount) { }
+ Document(search::DocumentIdT doc, size_t maxFieldCount) : _docId(doc), _fieldCount(maxFieldCount) { }
+ virtual ~Document();
+ const search::DocumentIdT & getDocId() const { return _docId; }
+ /// The field count given at construction; it cannot change afterwards.
+ size_t getFieldCount() const { return _fieldCount; }
+ void setDocId(const search::DocumentIdT & v) { _docId = v; }
+ /// Returns the value of the field with the given id.
+ virtual const document::FieldValue * getField(FieldIdT fId) const = 0;
+ /**
+ Sets the value of the given field, taking ownership of the value.
+ NOTE(review): this comment used to say "returns true, if not possible to
+ set", but StorageDocument::setField returns true when the value was stored
+ and false when fId is out of range. Confirm other implementations agree.
+ */
+ virtual bool setField(FieldIdT fId, document::FieldValue::UP fv) = 0;
+ private:
+ search::DocumentIdT _docId;
+ const size_t _fieldCount;
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/common/documenttypemapping.cpp b/streamingvisitors/src/vespa/vsm/common/documenttypemapping.cpp
new file mode 100644
index 00000000000..7886c44b2e0
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/common/documenttypemapping.cpp
@@ -0,0 +1,104 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "documenttypemapping.h"
+#include <vespa/document/repo/documenttyperepo.h>
+#include <vespa/document/datatype/documenttype.h>
+#include <vespa/vespalib/stllike/hash_map.hpp>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".vsm.common.documenttypemapping");
+
+namespace vsm {
+
+// Starts with no field maps, no default document type and no usage statistics.
+DocumentTypeMapping::DocumentTypeMapping()
+    : _fieldMap(),
+      _defaultDocumentTypeName(),
+      _defaultDocumentType(nullptr),
+      _documentTypeFreq()
+{
+}
+
+DocumentTypeMapping::~DocumentTypeMapping() = default;
+
+namespace {
+
+// Builds the key used for the field map: the document type name followed by
+// a fixed "0", since document type versions are not supported.
+vespalib::string getDocTypeId(const document::DocumentType & docType)
+{
+    vespalib::string id(docType.getName());
+    id += "0"; // Hardcoded version (version not supported)
+    return id;
+}
+
+}
+
+/**
+ * Looks up the default document type in the repo, remembers it together with
+ * its type id, and builds the field path map for it from fieldList.
+ *
+ * @throws std::runtime_error if the repo lookup yields no document type.
+ *         The mapping is left unchanged in that case. Before this check the
+ *         result of the lookup was dereferenced unconditionally.
+ */
+void DocumentTypeMapping::init(const vespalib::string & defaultDocumentType,
+                               const StringFieldIdTMapT & fieldList,
+                               const document::DocumentTypeRepo &repo)
+{
+    const document::DocumentType * docType = repo.getDocumentType(defaultDocumentType);
+    if (docType == nullptr) {
+        throw std::runtime_error(std::string("Unknown default document type '") +
+                                 defaultDocumentType.c_str() + "'");
+    }
+    _defaultDocumentType = docType;
+    _defaultDocumentTypeName = getDocTypeId(*_defaultDocumentType);
+    LOG(debug, "Setting default document type to '%s'",
+        _defaultDocumentTypeName.c_str());
+    buildFieldMap(_defaultDocumentType, fieldList, _defaultDocumentTypeName);
+}
+
+/**
+ * Hands out a fresh shared copy of the field path map built for the default
+ * document type, or an empty map (with a warning) when none has been built.
+ *
+ * @param map receives the new shared field path map.
+ * @return always true.
+ */
+bool DocumentTypeMapping::prepareBaseDoc(SharedFieldPathMap & map) const
+{
+    FieldPathMapMapT::const_iterator found = _fieldMap.find(_defaultDocumentTypeName);
+    if (found != _fieldMap.end()) {
+        map = std::make_shared<FieldPathMapT>(found->second);
+        // size() is a size_t, so it is printed with %zu (not the signed %zd).
+        LOG(debug, "Found FieldPathMap for default document type '%s' with %zu elements",
+            _defaultDocumentTypeName.c_str(), map->size());
+    } else {
+        LOG(warning, "No FieldPathMap found for default document type '%s'. Using empty one",
+            _defaultDocumentTypeName.c_str());
+        map = std::make_shared<FieldPathMapT>();
+    }
+    return true;
+}
+
+/**
+ * Builds the field id -> field path vector stored under typeId.
+ *
+ * The vector is sized to the largest field id in fieldList plus one, so every
+ * id in the list is a valid index. Names starting with '[' and a fixed set of
+ * names (summaryfeatures, rankfeatures, ranklog, sddocname, documentid) are
+ * skipped and keep an empty path. Names for which the document type cannot
+ * build a path are logged and likewise left empty. The number of resolved
+ * paths is recorded for the document type in _documentTypeFreq.
+ */
+void DocumentTypeMapping::buildFieldMap(
+        const document::DocumentType *docTypePtr,
+        const StringFieldIdTMapT & fieldList, const vespalib::string & typeId)
+{
+    // size_t and the unsigned FieldIdT are logged with %zu / %u.
+    LOG(debug, "buildFieldMap: docType = '%s', fieldList.size = '%zu', typeId = '%s'",
+        docTypePtr->getName().c_str(), fieldList.size(), typeId.c_str());
+    const document::DocumentType & docType = *docTypePtr;
+    size_t highestFNo(0);
+    for (const auto & field : fieldList) {
+        highestFNo = std::max(highestFNo, size_t(field.second));
+    }
+    highestFNo++;
+    FieldPathMapT & fieldMap = _fieldMap[typeId];
+
+    fieldMap.resize(highestFNo);
+
+    size_t validCount(0);
+    for (const auto & field : fieldList) {
+        // Bound by reference: no need to copy the name for every field.
+        const vespalib::string & fname = field.first;
+        const FieldIdT fieldId = field.second;
+        LOG(debug, "Handling %s -> %u", fname.c_str(), fieldId);
+        try {
+            if ((fname[0] != '[') && (fname != "summaryfeatures") && (fname != "rankfeatures") && (fname != "ranklog") && (fname != "sddocname") && (fname != "documentid")) {
+                FieldPath fieldPath;
+                docType.buildFieldPath(fieldPath, fname);
+                fieldMap[fieldId] = std::move(fieldPath);
+                validCount++;
+                LOG(spam, "Found %s -> %u in document", fname.c_str(), fieldId);
+            }
+        } catch (const std::exception & e) {
+            LOG(debug, "Could not get field info for '%s' in documenttype '%s' (id = '%s') : %s",
+                fname.c_str(), docType.getName().c_str(), typeId.c_str(), e.what());
+        }
+    }
+    _documentTypeFreq.insert(std::make_pair(validCount, docTypePtr));
+}
+
+/**
+ * Returns the document type stored under the largest key of the usage map.
+ *
+ * @throws std::runtime_error if no document type has been registered.
+ */
+const document::DocumentType & DocumentTypeMapping::getCurrentDocumentType() const
+{
+    const auto last = _documentTypeFreq.rbegin();
+    if (last == _documentTypeFreq.rend()) {
+        throw std::runtime_error("No document type registered yet.");
+    }
+    return *last->second;
+}
+
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/common/documenttypemapping.h b/streamingvisitors/src/vespa/vsm/common/documenttypemapping.h
new file mode 100644
index 00000000000..607b40cec47
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/common/documenttypemapping.h
@@ -0,0 +1,54 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/vsm/common/storagedocument.h>
+
+namespace document { class DocumentTypeRepo; }
+
+namespace vsm
+{
+
+/**
+ * Keeps the field path map built for the default document type and hands out
+ * copies of it.
+ **/
+class DocumentTypeMapping
+{
+public:
+ DocumentTypeMapping();
+ ~DocumentTypeMapping();
+
+ /**
+ * Sets the given shared map to a new copy of the field path map
+ * registered for the default document type, or to an empty map if
+ * none is registered. The implementation always returns true.
+ **/
+ bool prepareBaseDoc(SharedFieldPathMap & doc) const;
+
+ /**
+ * Looks up the default document type in the repo and builds the
+ * field path map for it from the given field list.
+ **/
+ void init(const vespalib::string & defaultDocumentType,
+ const StringFieldIdTMapT & fieldList,
+ const document::DocumentTypeRepo &repo);
+
+ /**
+ * Returns the document type with the highest count in the usage map.
+ * Throws std::runtime_error if no type has been registered.
+ **/
+ const document::DocumentType & getCurrentDocumentType() const;
+ /// The type id of the default document type (its name followed by "0").
+ const vespalib::string & getDefaultDocumentTypeName() const
+ { return _defaultDocumentTypeName; }
+ const document::DocumentType *getDefaultDocumentType() const
+ { return _defaultDocumentType; }
+
+private:
+ /**
+ * Builds a field info map for the given type id. This is a
+ * mapping from field id to field path and field value for all
+ * field names in the given list based on the given document type.
+ **/
+ void buildFieldMap(const document::DocumentType *docType,
+ const StringFieldIdTMapT & fieldList,
+ const vespalib::string & typeId);
+ typedef vespalib::hash_map<vespalib::string, FieldPathMapT> FieldPathMapMapT;
+ // Keyed by the number of field paths resolved for the document type.
+ typedef std::multimap<size_t, const document::DocumentType *> DocumentTypeUsage;
+ FieldPathMapMapT _fieldMap;
+ vespalib::string _defaultDocumentTypeName;
+ const document::DocumentType *_defaultDocumentType;
+ DocumentTypeUsage _documentTypeFreq;
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/common/fieldmodifier.cpp b/streamingvisitors/src/vespa/vsm/common/fieldmodifier.cpp
new file mode 100644
index 00000000000..b39afd83b5a
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/common/fieldmodifier.cpp
@@ -0,0 +1,24 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "fieldmodifier.h"
+#include <vespa/vespalib/stllike/hash_map.hpp>
+
+namespace vsm {
+
+// Starts with no modifiers registered.
+FieldModifierMap::FieldModifierMap()
+    : _map()
+{
+}
+
+FieldModifierMap::~FieldModifierMap() = default;
+
+// Returns the modifier registered for fId, or a null pointer if there is
+// none. The map keeps ownership of the modifier.
+FieldModifier *
+FieldModifierMap::getModifier(FieldIdT fId) const
+{
+    const auto found = _map.find(fId);
+    return (found != _map.end()) ? found->second.get() : nullptr;
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/common/fieldmodifier.h b/streamingvisitors/src/vespa/vsm/common/fieldmodifier.h
new file mode 100644
index 00000000000..60e480fa237
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/common/fieldmodifier.h
@@ -0,0 +1,58 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/document/fieldvalue/fieldvalue.h>
+#include <vespa/vsm/common/document.h>
+
+namespace vsm {
+
+/**
+ * Interface implemented by classes that turn a field value into a new,
+ * modified field value.
+ **/
+class FieldModifier
+{
+public:
+    using UP = std::unique_ptr<FieldModifier>;
+
+    /**
+     * Produces a modified copy of the given field value.
+     **/
+    virtual document::FieldValue::UP modify(const document::FieldValue & fv) = 0;
+
+    /**
+     * Produces a modified copy of the given field value, using the given
+     * field path to iterate the field value.
+     **/
+    virtual document::FieldValue::UP modify(const document::FieldValue & fv,
+                                            const document::FieldPath & path) = 0;
+
+    virtual ~FieldModifier() = default;
+};
+
+typedef vespalib::hash_map<FieldIdT, FieldModifier::UP> FieldModifierMapT;
+
+/**
+ * This class wraps a map from field id to field modifier.
+ * The map owns the modifiers (they are stored as unique pointers).
+ **/
+class FieldModifierMap
+{
+private:
+ FieldModifierMapT _map;
+
+public:
+ FieldModifierMap();
+ ~FieldModifierMap();
+ // Direct access to the wrapped map; the non-const overload is how entries are added.
+ FieldModifierMapT & map() { return _map; }
+ const FieldModifierMapT & map() const { return _map; }
+
+ /**
+ * Returns the modifier associated with the given field id or NULL if not found.
+ *
+ * @param fId the field id to look up.
+ * @return the field modifier or NULL if not found.
+ **/
+ FieldModifier * getModifier(FieldIdT fId) const;
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/common/storagedocument.cpp b/streamingvisitors/src/vespa/vsm/common/storagedocument.cpp
new file mode 100644
index 00000000000..a0d666268f5
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/common/storagedocument.cpp
@@ -0,0 +1,81 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "storagedocument.h"
+#include <vespa/document/fieldvalue/arrayfieldvalue.h>
+#include <vespa/document/fieldvalue/weightedsetfieldvalue.h>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".vsm.storagedocument");
+
+using NestedIterator = document::FieldValue::PathRange;
+
+namespace vsm {
+
+// Takes ownership of the document, shares the field path map, and creates one
+// (initially empty) cache slot per field.
+StorageDocument::StorageDocument(document::Document::UP doc, const SharedFieldPathMap & fim, size_t fieldNoLimit)
+    : Document(fieldNoLimit),
+      _doc(std::move(doc)),
+      _fieldMap(fim),
+      _cachedFields(getFieldCount()),
+      _backedFields()
+{
+}
+
+StorageDocument::~StorageDocument() = default;
+
+namespace {
+ // Shared sentinels: an empty field path and a sub document without a field
+ // value. getComplexField() returns the latter for a field id whose field
+ // path is empty.
+ // NOTE(review): "_empySubDocument" looks like a typo for "_emptySubDocument".
+ FieldPath _emptyFieldPath;
+ StorageDocument::SubDocument _empySubDocument(NULL, _emptyFieldPath.getFullRange());
+}
+
+/**
+ * Returns the cached sub document for the given field id, resolving it from
+ * the wrapped document on first access.
+ *
+ * The shared empty sub document is returned when fId is outside the cache
+ * (the same range check setField() applies) or when no field path is
+ * registered for the field. If the document holds no value for the field,
+ * the returned cache slot has a null field value.
+ */
+const StorageDocument::SubDocument &
+StorageDocument::getComplexField(FieldIdT fId) const
+{
+    if (fId >= _cachedFields.size()) {
+        LOG(debug, "Field id %u is outside the %zu cached fields.", fId, _cachedFields.size());
+        return _empySubDocument;
+    }
+    if (_cachedFields[fId].getFieldValue() == nullptr) {
+        const FieldPath & fp = (*_fieldMap)[fId];
+        if ( ! fp.empty() ) {
+            const document::StructuredFieldValue * sfv = _doc.get();
+            NestedIterator nested = fp.getFullRange();
+            // The first path entry names the top level field. Its value is
+            // read into the entry's own value holder, and the rest of the
+            // path is kept as the nested range.
+            const document::FieldPathEntry& fvInfo = nested.cur();
+            bool ok = sfv->getValue(fvInfo.getFieldRef(), fvInfo.getFieldValueToSet());
+            if (ok) {
+                SubDocument tmp(&fvInfo.getFieldValueToSet(), nested.next());
+                _cachedFields[fId].swap(tmp);
+            }
+        } else {
+            // fId is unsigned, hence %u.
+            LOG(debug, "Failed getting field fId %u.", fId);
+            return _empySubDocument;
+        }
+    }
+    return _cachedFields[fId];
+}
+
+/**
+ * Replaces every cached field value with a clone owned by this object, so
+ * the cache no longer points at the values it was resolved from.
+ */
+void StorageDocument::saveCachedFields() const
+{
+    _backedFields.reserve(_cachedFields.size());
+    for (SubDocument & cached : _cachedFields) {
+        const document::FieldValue * current = cached.getFieldValue();
+        if (current == nullptr) {
+            continue; // field was never resolved, nothing to save
+        }
+        _backedFields.emplace_back(document::FieldValue::UP(current->clone()));
+        cached.setFieldValue(_backedFields.back().get());
+    }
+}
+
+// Returns the top level value of the field; may be null.
+const document::FieldValue *
+StorageDocument::getField(FieldIdT fId) const
+{
+    const SubDocument & sub = getComplexField(fId);
+    return sub.getFieldValue();
+}
+
+/**
+ * Stores fv as the value of the given field and takes ownership of it.
+ *
+ * @return true if the value was stored, false if fId is outside the cache.
+ */
+bool StorageDocument::setField(FieldIdT fId, document::FieldValue::UP fv)
+{
+    if (fId >= _cachedFields.size()) {
+        return false;
+    }
+    const FieldPath & path = (*_fieldMap)[fId];
+    // An explicitly set value has no nested path left to follow.
+    SubDocument replacement(fv.get(), NestedIterator(path.end(), path.end()));
+    _cachedFields[fId].swap(replacement);
+    _backedFields.emplace_back(std::move(fv));
+    return true;
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/common/storagedocument.h b/streamingvisitors/src/vespa/vsm/common/storagedocument.h
new file mode 100644
index 00000000000..a7f21cb052f
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/common/storagedocument.h
@@ -0,0 +1,59 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "document.h"
+#include <vespa/document/fieldvalue/document.h>
+
+namespace vsm {
+
+typedef vespalib::CloneablePtr<document::FieldValue> FieldValueContainer;
+typedef document::FieldPath FieldPath; // field path to navigate a field value
+typedef std::vector<FieldPath> FieldPathMapT; // map from field id to field path
+typedef std::shared_ptr<FieldPathMapT> SharedFieldPathMap;
+
+/**
+ * Document implementation backed by a document::Document. Field values are
+ * looked up through a shared field id -> field path map and cached per field.
+ **/
+class StorageDocument : public Document {
+public:
+ typedef std::unique_ptr<StorageDocument> UP;
+
+ /**
+ * A field value together with the part of the field path that remains
+ * to be followed inside that value. Does not own the field value.
+ **/
+ class SubDocument {
+ public:
+ SubDocument() : _fieldValue(nullptr) {}
+ SubDocument(document::FieldValue *fv, document::FieldValue::PathRange nested) :
+ _fieldValue(fv),
+ _range(nested)
+ { }
+
+ const document::FieldValue *getFieldValue() const { return _fieldValue; }
+ void setFieldValue(document::FieldValue *fv) { _fieldValue = fv; }
+ const document::FieldValue::PathRange & getRange() const { return _range; }
+ void swap(SubDocument &rhs) {
+ std::swap(_fieldValue, rhs._fieldValue);
+ std::swap(_range, rhs._range);
+ }
+ private:
+ // NOTE(review): begin()/end() are declared here but no definition is
+ // visible in storagedocument.cpp; they look unused - confirm.
+ FieldPath::const_iterator begin() const;
+ FieldPath::const_iterator end() const;
+ document::FieldValue *_fieldValue;
+ document::FieldValue::PathRange _range;
+ };
+public:
+ /// Takes ownership of doc; fieldNoLimit is the number of field slots to cache.
+ StorageDocument(document::Document::UP doc, const SharedFieldPathMap &fim, size_t fieldNoLimit);
+ StorageDocument(const StorageDocument &) = delete;
+ StorageDocument & operator = (const StorageDocument &) = delete;
+ ~StorageDocument();
+
+ const document::Document &docDoc() const { return *_doc; }
+ bool valid() const { return _doc.get() != nullptr; }
+ /// Returns the cached sub document for the field, resolving it on first use.
+ const SubDocument &getComplexField(FieldIdT fId) const;
+ const document::FieldValue *getField(FieldIdT fId) const override;
+ /// Returns true if the value was stored, false if fId is out of range.
+ bool setField(FieldIdT fId, document::FieldValue::UP fv) override ;
+ /// Clones all cached field values into storage owned by this object.
+ void saveCachedFields() const;
+private:
+ document::Document::UP _doc;
+ SharedFieldPathMap _fieldMap;
+ // Mutable: both are updated by const members (lazy lookup and saving).
+ mutable std::vector<SubDocument> _cachedFields;
+ mutable std::vector<document::FieldValue::UP> _backedFields;
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/config/.gitignore b/streamingvisitors/src/vespa/vsm/config/.gitignore
new file mode 100644
index 00000000000..d58390943e2
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/config/.gitignore
@@ -0,0 +1,4 @@
+.depend
+Makefile
+config-*.cpp
+config-*.h
diff --git a/streamingvisitors/src/vespa/vsm/config/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/config/CMakeLists.txt
new file mode 100644
index 00000000000..fea0bafe6b2
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/config/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_library(vsm_vconfig OBJECT
+ SOURCES
+ DEPENDS
+)
+vespa_generate_config(vsm_vconfig vsmfields.def)
+install_config_definition(vsmfields.def vespa.config.search.vsm.vsmfields.def)
+vespa_generate_config(vsm_vconfig vsm.def)
+install_config_definition(vsm.def vespa.config.search.vsm.vsm.def)
+vespa_generate_config(vsm_vconfig vsmsummary.def)
+install_config_definition(vsmsummary.def vespa.config.search.vsm.vsmsummary.def)
diff --git a/streamingvisitors/src/vespa/vsm/config/vsm-cfif.h b/streamingvisitors/src/vespa/vsm/config/vsm-cfif.h
new file mode 100644
index 00000000000..22033aee232
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/config/vsm-cfif.h
@@ -0,0 +1,25 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/vsm/config/config-vsmfields.h>
+#include <vespa/vsm/config/config-vsm.h>
+#include <vespa/vsm/config/config-vsmsummary.h>
+#include <vespa/vespalib/util/ptrholder.h>
+
+using vespa::config::search::vsm::VsmConfig;
+using vespa::config::search::vsm::VsmsummaryConfig;
+using vespa::config::search::vsm::VsmfieldsConfig;
+
+namespace vsm {
+
+typedef vespalib::PtrHolder<VsmfieldsConfig> VsmfieldsHolder;
+typedef std::shared_ptr<VsmfieldsConfig> VsmfieldsHandle;
+
+typedef vespalib::PtrHolder<VsmConfig> VsmHolder;
+typedef std::shared_ptr<VsmConfig> VsmHandle;
+
+typedef vespalib::PtrHolder<VsmsummaryConfig> FastS_VsmsummaryHolder;
+typedef std::shared_ptr<VsmsummaryConfig> FastS_VsmsummaryHandle;
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/config/vsm.def b/streamingvisitors/src/vespa/vsm/config/vsm.def
new file mode 100644
index 00000000000..1971f9e9574
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/config/vsm.def
@@ -0,0 +1,13 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.search.vsm
+
+## The document model for the documents used as input for the VSM
+doctype reference
+
+## Configuration for storage client used by VSM
+storagecfg reference
+
+## Config defining what search method should be applied to different
+## fields in the documents. It also contains a mapping from index name
+## to a set of fields making up that index.
+vsmfields reference
diff --git a/streamingvisitors/src/vespa/vsm/config/vsmfields.def b/streamingvisitors/src/vespa/vsm/config/vsmfields.def
new file mode 100644
index 00000000000..5e943c9274d
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/config/vsmfields.def
@@ -0,0 +1,31 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.search.vsm
+
+## Level of verification applied to the documents received.
+documentverificationlevel int default=0
+
+## Set if one should ignore limit hits.
+searchall int default=1
+
+## The name of a field for which we are assigning a search method.
+## The field name refers directly to a field in the document model.
+fieldspec[].name string
+
+## The search method for a given field. Note: the same field in two different document types must have the same type; otherwise the result is unpredictable.
+fieldspec[].searchmethod enum { NONE, BOOL, AUTOUTF8, UTF8, SSE2UTF8, INT8, INT16, INT32, INT64, FLOAT16, FLOAT, DOUBLE, GEOPOS } default=AUTOUTF8
+fieldspec[].arg1 string default=""
+
+## Maximum number of chars to search per field.
+fieldspec[].maxlength int default=1048576
+
+## Type of the field
+fieldspec[].fieldtype enum {ATTRIBUTE, INDEX} default=INDEX
+
+## The name of a documenttype for which we are assigning a set of indexes.
+documenttype[].name string
+## The name of an index of a documenttype for which we are assigning a set of fields.
+documenttype[].index[].name string
+
+## The name of a field part of an index.
+## The field name refers directly to a field in the document model.
+documenttype[].index[].field[].name string
diff --git a/streamingvisitors/src/vespa/vsm/config/vsmsummary.def b/streamingvisitors/src/vespa/vsm/config/vsmsummary.def
new file mode 100644
index 00000000000..5eb96624826
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/config/vsmsummary.def
@@ -0,0 +1,21 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+namespace=vespa.config.search.vsm
+
+## The name of the result class that should be generated for documents
+## returned from the VSM. If this value is empty, the first found
+## result class will be used.
+outputclass string default=""
+
+## Mapping of field names between the result class and the document
+## model. This value represents the name in the result class. Fields
+## not mentioned here will get the identity mapping.
+fieldmap[].summary string
+
+## Mapping of field names between the result class and the document
+## model. This field vector represents the names in the document model
+## that should be used as input when generating the summary field.
+fieldmap[].document[].field string
+
+## This command specifies how the document fields should be combined
+## when generating the summary field.
+fieldmap[].command enum { NONE, FLATTENJUNIPER, FLATTENSPACE } default=NONE
diff --git a/streamingvisitors/src/vespa/vsm/searcher/.gitignore b/streamingvisitors/src/vespa/vsm/searcher/.gitignore
new file mode 100644
index 00000000000..95bc02923a9
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/.gitignore
@@ -0,0 +1,5 @@
+*.exe
+*.ilk
+*.pdb
+.depend*
+Makefile
diff --git a/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
new file mode 100644
index 00000000000..0a2a9ec21d2
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/CMakeLists.txt
@@ -0,0 +1,28 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64")
+ set(SSE2_FILES "fold.cpp")
+else()
+ unset(SSE2_FILES)
+endif()
+
+vespa_add_library(vsm_vsmsearcher OBJECT
+ SOURCES
+ boolfieldsearcher.cpp
+ fieldsearcher.cpp
+ floatfieldsearcher.cpp
+ ${SSE2_FILES}
+ futf8strchrfieldsearcher.cpp
+ geo_pos_field_searcher.cpp
+ intfieldsearcher.cpp
+ strchrfieldsearcher.cpp
+ utf8flexiblestringfieldsearcher.cpp
+ utf8strchrfieldsearcher.cpp
+ utf8stringfieldsearcherbase.cpp
+ utf8substringsearcher.cpp
+ utf8substringsnippetmodifier.cpp
+ utf8suffixstringfieldsearcher.cpp
+ utf8exactstringfieldsearcher.cpp
+ DEPENDS
+ vsm_vconfig
+)
diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp
new file mode 100644
index 00000000000..8c9b556e593
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp
@@ -0,0 +1,56 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "boolfieldsearcher.h"
+#include <vespa/document/fieldvalue/boolfieldvalue.h>
+
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+
+namespace vsm {
+
+namespace {
+// Query term spellings that prepare() maps directly to true / false.
+// NOTE(review): TRUE and FALSE are common macro names; these would break if
+// an included header defined them - confirm, or consider renaming.
+vespalib::stringref TRUE = "true";
+vespalib::stringref FALSE = "false";
+}
+
+// Returns a copy of this searcher, made with the copy constructor.
+std::unique_ptr<FieldSearcher>
+BoolFieldSearcher::duplicate() const
+{
+ return std::make_unique<BoolFieldSearcher>(*this);
+}
+
+// Creates a searcher for the given field id with no terms prepared yet.
+BoolFieldSearcher::BoolFieldSearcher(FieldIdT fId) :
+ FieldSearcher(fId),
+ _terms()
+{ }
+
+BoolFieldSearcher::~BoolFieldSearcher() = default;
+
+/**
+ * Converts every query term into the bool it should match, in term order:
+ * "true" and "false" map directly; any other term counts as true only if it
+ * parses as an integer term whose lower bound is above zero.
+ */
+void BoolFieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf)
+{
+    _terms.clear();
+    FieldSearcher::prepare(qtl, buf);
+    for (const QueryTerm * term : qtl) {
+        bool wanted;
+        if (TRUE == term->getTerm()) {
+            wanted = true;
+        } else if (FALSE == term->getTerm()) {
+            wanted = false;
+        } else {
+            int64_t low;
+            int64_t high;
+            // low is only read when the term parsed as an integer.
+            wanted = term->getAsIntegerTerm(low, high) && (low > 0);
+        }
+        _terms.push_back(wanted);
+    }
+}
+
+/**
+ * Adds a hit at position 0 for every prepared term whose bool equals the
+ * field value, then counts the value as one word.
+ * The value is cast to a BoolFieldValue without a type check.
+ */
+void BoolFieldSearcher::onValue(const document::FieldValue & fv)
+{
+    const auto & boolValue = static_cast<const document::BoolFieldValue &>(fv);
+    size_t termIdx = 0;
+    for (bool wanted : _terms) {
+        if (boolValue.getValue() == wanted) {
+            addHit(*_qtl[termIdx], 0);
+        }
+        ++termIdx;
+    }
+    ++_words;
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h
new file mode 100644
index 00000000000..f6afef9e507
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.h
@@ -0,0 +1,21 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "fieldsearcher.h"
+
+namespace vsm {
+
+ /**
+  * Field searcher for boolean fields.
+  * prepare() turns each query term into the boolean it asks for, and onValue()
+  * reports a hit for every term whose boolean equals the field value.
+  **/
+ class BoolFieldSearcher : public FieldSearcher
+ {
+ public:
+     std::unique_ptr<FieldSearcher> duplicate() const override;
+     BoolFieldSearcher(FieldIdT fId);
+     // Marked override for consistency with ~FieldSearcher() override.
+     ~BoolFieldSearcher() override;
+     void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override;
+     void onValue(const document::FieldValue & fv) override;
+ private:
+     // Wanted boolean per query term, filled in by prepare().
+     std::vector<bool> _terms;
+ };
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
new file mode 100644
index 00000000000..e69999b160e
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
@@ -0,0 +1,301 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "fieldsearcher.h"
+#include <vespa/vsm/vsm/fieldsearchspec.h>
+#include <vespa/document/fieldvalue/arrayfieldvalue.h>
+#include <vespa/document/fieldvalue/weightedsetfieldvalue.h>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".vsm.searcher.fieldsearcher");
+
+using search::byte;
+using search::streaming::Query;
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+using search::v16qi;
+
+namespace vsm {
+
+ // Helper whose constructor fills the static character tables by calling FieldSearcher::init().
+ class force
+ {
+ public:
+     force() { FieldSearcher::init(); }
+ };
+
+ // File-local instance; constructing it during static initialization runs init().
+ // (Named without a double underscore: such identifiers are reserved for the implementation.)
+ static force forceTableInit;
+
+ // Lookup tables indexed by byte value; populated by FieldSearcher::init().
+ byte FieldSearcher::_foldLowCase[256];
+ byte FieldSearcher::_wordChar[256];
+
+ // Starts with no query terms and no fast-compare buffer; prepare() sets both up.
+ FieldSearcherBase::FieldSearcherBase()
+     : _qtl(),
+       _qtlFastBuffer(),
+       _qtlFastSize(0),
+       _qtlFast(nullptr)
+ { }
+
+ // Copy construction starts from an empty state and then rebuilds the term list
+ // and the buffer from org's term list via prepare(); the buffer itself is not copied.
+ FieldSearcherBase::FieldSearcherBase(const FieldSearcherBase & org)
+     : _qtl(), _qtlFastBuffer(), _qtlFastSize(0), _qtlFast(nullptr)
+ {
+     prepare(org._qtl);
+ }
+
+ // Nothing to release explicitly; the members clean up after themselves.
+ FieldSearcherBase::~FieldSearcherBase() = default;
+
+ // Assignment re-runs prepare() on org's term list; self-assignment is a no-op.
+ FieldSearcherBase & FieldSearcherBase::operator = (const FieldSearcherBase & org)
+ {
+     if (this == &org) {
+         return *this;
+     }
+     prepare(org._qtl);
+     return *this;
+ }
+
+ // Stores a copy of the term list and builds a 16-byte aligned array (_qtlFast)
+ // holding the first (at most) 16 bytes of each term, one 16-byte slot per term.
+ void FieldSearcherBase::prepare(const QueryTermList & qtl)
+ {
+     _qtl = qtl;
+     // One slot per term plus one spare slot, so an aligned start always fits in the buffer.
+     // assign() refills the whole buffer with the 0x13 pad byte on every call, so a slot
+     // never keeps bytes from an earlier, longer term.
+     _qtlFastBuffer.assign(sizeof(*_qtlFast)*(_qtl.size()+1), 0x13);
+     // Round the buffer start up to the next 16-byte boundary. uintptr_t is used because
+     // unsigned long is not guaranteed to be as wide as a pointer.
+     const uintptr_t alignedAddr = (reinterpret_cast<uintptr_t>(_qtlFastBuffer.data()) + 15) & ~uintptr_t(0xf);
+     _qtlFast = reinterpret_cast<v16qi *>(alignedAddr);
+     _qtlFastSize = 0;
+     for (const QueryTerm * term : _qtl) {
+         memcpy(&_qtlFast[_qtlFastSize++], term->getTerm(), std::min(size_t(16), term->termLen()));
+     }
+ }
+
+ // Sets up a searcher for field fId. The match type defaults to PREFIX when
+ // defaultPrefix is set and to REGULAR otherwise; the maximum field length starts at
+ // 0x100000, the element weight at 1, and all statistics counters at zero.
+ FieldSearcher::FieldSearcher(const FieldIdT & fId, bool defaultPrefix)
+     : FieldSearcherBase(),
+       _field(fId),
+       _matchType(defaultPrefix ? PREFIX : REGULAR),
+       _maxFieldLength(0x100000),
+       _currentElementId(0),
+       _currentElementWeight(1),
+       _pureUsAsciiCount(0),
+       _pureUsAsciiFieldCount(0),
+       _anyUtf8Count(0),
+       _anyUtf8FieldCount(0),
+       _words(0),
+       _badUtf8Count(0),
+       _zeroCount(0)
+ {
+     // Also clears _utf8Count, which has no entry in the initializer list.
+     zeroStat();
+ }
+
+ FieldSearcher::~FieldSearcher() = default;
+
+ // Searches this searcher's field in doc.
+ // Before the search each term's hit offset for the field is set to the current size of
+ // its hit list; afterwards the hit count is the number of hits added since then and the
+ // field length is the number of words seen. Always returns true.
+ bool FieldSearcher::search(const StorageDocument & doc)
+ {
+     for (QueryTerm * term : _qtl) {
+         QueryTerm::FieldInfo & info = term->getFieldInfo(field());
+         info.setHitOffset(term->getHitList().size());
+     }
+     onSearch(doc);
+     for (QueryTerm * term : _qtl) {
+         QueryTerm::FieldInfo & info = term->getFieldInfo(field());
+         info.setHitCount(term->getHitList().size() - info.getHitOffset());
+         info.setFieldLength(_words);
+     }
+     // The word counter is per search() call.
+     _words = 0;
+     return true;
+ }
+
+ // Installs the query terms and makes sure each term has room for this field's id.
+ // The shared searcher buffer is not used at this level.
+ void FieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & UNUSED_PARAM(buf))
+ {
+     const QueryTermList & terms = qtl;
+     FieldSearcherBase::prepare(terms);
+     prepareFieldId();
+ }
+
+ // Counts the maximal runs of word characters (as defined by iswordchar()) in f.
+ size_t FieldSearcher::countWords(const FieldRef & f)
+ {
+     size_t words = 0;
+     const char * n = f.data();
+     const char * e = n + f.size();
+     for( ; n < e; ++n) {
+         // The bound is tested before dereferencing so the byte at e is never read.
+         for (; (n<e) && isspace(*n); ++n);
+         const char * m = n;
+         for (; (n<e) && iswordchar(*n); ++n);
+         if (n > m) {
+             words++;
+         }
+     }
+     return words;
+ }
+
+ // Asks every query term to resize its per-field storage for this searcher's field id.
+ void FieldSearcher::prepareFieldId()
+ {
+     for (QueryTerm * term : _qtl) {
+         term->resizeFieldId(field());
+     }
+ }
+
+ // Adds toAdd's character statistics counters to this searcher's counters.
+ void FieldSearcher::addStat(const FieldSearcher & toAdd)
+ {
+     _pureUsAsciiCount      += toAdd._pureUsAsciiCount;
+     _pureUsAsciiFieldCount += toAdd._pureUsAsciiFieldCount;
+     _anyUtf8Count          += toAdd._anyUtf8Count;
+     _anyUtf8FieldCount     += toAdd._anyUtf8FieldCount;
+     _badUtf8Count          += toAdd._badUtf8Count;
+     _zeroCount             += toAdd._zeroCount;
+     const size_t slots = NELEMS(_utf8Count);
+     for (size_t slot = 0; slot < slots; ++slot) {
+         _utf8Count[slot] += toAdd._utf8Count[slot];
+     }
+ }
+
+ // Resets all character statistics counters to zero (_words is not touched).
+ void FieldSearcher::zeroStat()
+ {
+     _pureUsAsciiCount = _pureUsAsciiFieldCount = 0;
+     _anyUtf8Count = _anyUtf8FieldCount = 0;
+     _badUtf8Count = 0;
+     _zeroCount = 0;
+     for (unsigned & counter : _utf8Count) {
+         counter = 0;
+     }
+ }
+
+ // Fills the two static 256-entry tables:
+ //   _wordChar[c]    is 0xFF when byte c is a word character and 0 otherwise,
+ //   _foldLowCase[c] is the folded (lower-cased, accent-stripped) byte, or 0.
+ void FieldSearcher::init()
+ {
+     // Default: not a word character, folds to 0.
+     for (unsigned code = 0; code < NELEMS(_foldLowCase); ++code) {
+         _foldLowCase[code] = 0;
+         _wordChar[code] = 0;
+     }
+     // ASCII upper-case letters fold to lower case by setting bit 0x20.
+     for (int code = 'A'; code <= 'Z'; ++code) {
+         _wordChar[code] = 0xFF;
+         _foldLowCase[code] = code | 0x20;
+     }
+     // Lower-case letters and digits fold to themselves.
+     for (int code = 'a'; code <= 'z'; ++code) {
+         _wordChar[code] = 0xFF;
+         _foldLowCase[code] = code;
+     }
+     for (int code = '0'; code <= '9'; ++code) {
+         _wordChar[code] = 0xFF;
+         _foldLowCase[code] = code;
+     }
+     // Bytes 0xC0..0xFF are word characters, except 0xd7 and 0xf7.
+     for (int code = 0xC0; code <= 0xFF; ++code) {
+         _wordChar[code] = 0xFF;
+     }
+     _wordChar[0xd7] = 0;
+     _wordChar[0xf7] = 0;
+
+     // Accent removal: each listed byte and its counterpart 0x20 higher fold to the same
+     // base letter (e.g. 0xc3 "A tilde" and 0xe3 "a tilde" both fold to 'a').
+     struct AccentFold { int code; char base; };
+     const AccentFold accentFolds[] = {
+         {0xc0, 'a'}, {0xc1, 'a'}, {0xc2, 'a'}, {0xc3, 'a'},
+         {0xc7, 'c'},
+         {0xc8, 'e'}, {0xc9, 'e'}, {0xca, 'e'}, {0xcb, 'e'},
+         {0xcc, 'i'}, {0xcd, 'i'}, {0xce, 'i'}, {0xcf, 'i'},
+         {0xd3, 'o'}, {0xd4, 'o'},
+         {0xda, 'u'}, {0xdb, 'u'},
+     };
+     for (const AccentFold & entry : accentFolds) {
+         _foldLowCase[entry.code] = entry.base;
+         _foldLowCase[entry.code + 0x20] = entry.base;
+     }
+ }
+
+ // Prepares every searcher in the map with the query terms that apply to its field.
+ // A leaf term applies to a searcher when, in some document type's index-to-field map,
+ // the term's index (after stripNonFields()) maps to a field list containing the
+ // searcher's field id. Each term is passed to a searcher at most once.
+ void FieldIdTSearcherMap::prepare(const DocumentTypeIndexFieldMapT & difm, const SharedSearcherBuf & searcherBuf, Query & query)
+ {
+     QueryTermList qtl;
+     query.getLeafs(qtl);
+     // The field id list is only used by the debug message at the end, so it is built
+     // exactly when that message will be emitted.
+     const bool wantFieldList = logger.wants(ns_log::Logger::debug);
+     vespalib::string tmp;
+     for (FieldIdTSearcherMap::iterator it = begin(), mt = end(); it != mt; it++) {
+         QueryTermList onlyInIndex;
+         FieldIdT fid = (*it)->field();
+         for (QueryTermList::iterator qt = qtl.begin(), mqt = qtl.end(); qt != mqt; qt++) {
+             QueryTerm * q = *qt;
+             for (DocumentTypeIndexFieldMapT::const_iterator dt(difm.begin()), dmt(difm.end()); dt != dmt; dt++) {
+                 const IndexFieldMapT & fim = dt->second;
+                 IndexFieldMapT::const_iterator found = fim.find(FieldSearchSpecMap::stripNonFields(q->index()));
+                 if (found != fim.end()) {
+                     const FieldIdTList & index = found->second;
+                     if ((find(index.begin(), index.end(), fid) != index.end()) && (find(onlyInIndex.begin(), onlyInIndex.end(), q) == onlyInIndex.end())) {
+                         onlyInIndex.push_back(q);
+                     }
+                 } else {
+                     LOG(debug, "Could not find the requested index=%s in the index config map. Query does not fit search definition.", q->index().c_str());
+                 }
+             }
+         }
+         /// Should perhaps do a unique on onlyInIndex
+         (*it)->prepare(onlyInIndex, searcherBuf);
+         if (wantFieldList) {
+             char tmpBuf[16];
+             // Bounded write; the format string is unchanged.
+             snprintf(tmpBuf, sizeof(tmpBuf), "%d", fid);
+             tmp += tmpBuf;
+             tmp += ", ";
+         }
+     }
+     LOG(debug, "Will search in %s", tmp.c_str());
+ }
+
+ // Looks up this searcher's field in doc and, when it has a value, walks the value
+ // with an IteratorHandler that feeds it back into this searcher. Always returns true.
+ bool FieldSearcher::onSearch(const StorageDocument & doc)
+ {
+     const size_t fieldNo = field();
+     const StorageDocument::SubDocument & sub = doc.getComplexField(fieldNo);
+     if (sub.getFieldValue() == nullptr) {
+         return true;
+     }
+     IteratorHandler handler(*this);
+     sub.getFieldValue()->iterateNested(sub.getRange(), handler);
+     return true;
+ }
+
+ // Called for each primitive value: records its weight and the current array index on
+ // the searcher, then hands the value to the searcher's onValue().
+ void
+ FieldSearcher::IteratorHandler::onPrimitive(uint32_t, const Content & c)
+ {
+     LOG(spam, "onPrimitive: field value '%s'", c.getValue().toString().c_str());
+     FieldSearcher & target = _searcher;
+     target.setCurrentWeight(c.getWeight());
+     target.setCurrentElementId(getArrayIndex());
+     target.onValue(c.getValue());
+ }
+
+ // Called when a collection starts. Only logs (at spam level): the value itself and,
+ // for arrays and weighted sets, the collection size.
+ void
+ FieldSearcher::IteratorHandler::onCollectionStart(const Content & c)
+ {
+     const document::FieldValue & value = c.getValue();
+     LOG(spam, "onCollectionStart: field value '%s'", value.toString().c_str());
+     if (value.isA(document::FieldValue::Type::ARRAY)) {
+         const auto & array = static_cast<const document::ArrayFieldValue &>(value);
+         LOG(spam, "onCollectionStart: Array size = '%zu'", array.size());
+         return;
+     }
+     if (value.isA(document::FieldValue::Type::WSET)) {
+         const auto & wset = static_cast<const document::WeightedSetFieldValue &>(value);
+         LOG(spam, "onCollectionStart: WeightedSet size = '%zu'", wset.size());
+     }
+ }
+
+ // Called when a struct starts: logs it and forwards the value, cast to
+ // StructFieldValue, to the searcher's onStructValue().
+ void
+ FieldSearcher::IteratorHandler::onStructStart(const Content & c)
+ {
+     LOG(spam, "onStructStart: field value '%s'", c.getValue().toString().c_str());
+     const auto & structValue = static_cast<const document::StructFieldValue &>(c.getValue());
+     _searcher.onStructValue(structValue);
+ }
+
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
new file mode 100644
index 00000000000..5c2ef8fec28
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
@@ -0,0 +1,147 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/document/fieldvalue/iteratorhandler.h>
+#include <vespa/searchlib/query/streaming/query.h>
+#include <vespa/vsm/common/document.h>
+#include <vespa/vsm/common/storagedocument.h>
+
+namespace vsm {
+
+ // Count of terms/words and length of a term.
+ using termcount_t = size_t;
+ using termsize_t = size_t;
+
+ // NEED_CHAR_STAT(a) expands to the statement a only when COLLECT_CHAR_STAT is defined;
+ // otherwise it expands to nothing.
+ #ifdef COLLECT_CHAR_STAT
+   #define NEED_CHAR_STAT(a) { a; }
+ #else
+   #define NEED_CHAR_STAT(a)
+ #endif
+
+ using cmptype_t = ucs4_t;
+ using SearcherBuf = vespalib::Array<cmptype_t>;
+ using SharedSearcherBuf = std::shared_ptr<SearcherBuf>;
+ using CharVector = std::vector<char>;
+
+ /**
+  * Holds the query term list of a searcher together with a 16-byte aligned array
+  * (_qtlFast) containing the leading bytes of each term. Copying and assignment
+  * rebuild that array through prepare().
+  **/
+ class FieldSearcherBase
+ {
+ protected:
+     search::streaming::QueryTermList _qtl;      // The terms this searcher looks for.
+ private:
+     CharVector _qtlFastBuffer;                  // Raw storage that _qtlFast points into.
+ protected:
+     FieldSearcherBase();
+     FieldSearcherBase(const FieldSearcherBase & org);
+     virtual ~FieldSearcherBase();
+     FieldSearcherBase & operator = (const FieldSearcherBase & org);
+     void prepare(const search::streaming::QueryTermList & qtl);
+     size_t         _qtlFastSize;                // Number of slots filled in _qtlFast.
+     search::v16qi *_qtlFast;                    // Aligned start inside _qtlFastBuffer.
+ };
+
+ /**
+  * Base class for searchers that match the prepared query terms against one field
+  * of a document. search() drives the process; subclasses implement onValue() to
+  * inspect each field value and call addHit() for matching terms.
+  **/
+ class FieldSearcher : public FieldSearcherBase
+ {
+ public:
+     enum MatchType {
+         REGULAR,
+         PREFIX,
+         SUBSTRING,
+         SUFFIX,
+         EXACT
+     };
+
+     FieldSearcher(const FieldIdT & fId, bool defaultPrefix=false);
+     ~FieldSearcher() override;
+     /// Returns a copy of this searcher.
+     virtual std::unique_ptr<FieldSearcher> duplicate() const = 0;
+     /// Searches this searcher's field in doc and updates the per-field info of each term.
+     bool search(const StorageDocument & doc);
+     /// Installs the query terms to search for.
+     virtual void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf);
+     const FieldIdT & field() const { return _field; }
+     void field(const FieldIdT & v) { _field = v; prepareFieldId(); }
+     bool prefix() const { return _matchType == PREFIX; }
+     bool substring() const { return _matchType == SUBSTRING; }
+     bool suffix() const { return _matchType == SUFFIX; }
+     bool exact() const { return _matchType == EXACT; }
+     void setMatchType(MatchType mt) { _matchType = mt; }
+     /// Fills the static _foldLowCase and _wordChar tables.
+     static void init();
+     static search::byte fold(search::byte c) { return _foldLowCase[c]; }
+     static search::byte iswordchar(search::byte c) { return _wordChar[c]; }
+     /// Every byte that is not a word character is treated as a separator.
+     static search::byte isspace(search::byte c) { return ! iswordchar(c); }
+     static size_t countWords(const FieldRef & f);
+     unsigned pureUsAsciiCount() const { return _pureUsAsciiCount; }
+     unsigned pureUsAsciiFieldCount() const { return _pureUsAsciiFieldCount; }
+     unsigned anyUtf8Count() const { return _anyUtf8Count; }
+     unsigned anyUtf8FieldCount() const { return _anyUtf8FieldCount; }
+     unsigned badUtf8Count() const { return _badUtf8Count; }
+     unsigned zeroCount() const { return _zeroCount; }
+     /// NOTE: indexes _utf8Count (6 entries) at 1+sz, so sz must be at most 4.
+     unsigned utf8Count(size_t sz) const { return _utf8Count[1+sz]; }
+     const unsigned * utf8Count() const { return _utf8Count; }
+     int32_t getCurrentWeight() const { return _currentElementWeight; }
+     void addStat(const FieldSearcher & toAdd);
+     void zeroStat();
+     FieldSearcher & maxFieldLength(uint32_t maxFieldLength_) { _maxFieldLength = maxFieldLength_; return *this; }
+     size_t maxFieldLength() const { return _maxFieldLength; }
+
+ private:
+     /// Receives the values found while iterating a field value and forwards them to the searcher.
+     class IteratorHandler : public document::fieldvalue::IteratorHandler {
+     private:
+         FieldSearcher & _searcher;
+
+         void onPrimitive(uint32_t fid, const Content & c) override;
+         void onCollectionStart(const Content & c) override;
+         void onStructStart(const Content & c) override;
+
+     public:
+         IteratorHandler(FieldSearcher & searcher) : _searcher(searcher) {}
+     };
+     friend class IteratorHandler; // to allow calls to onValue();
+
+     void prepareFieldId();
+     void setCurrentWeight(int32_t weight) { _currentElementWeight = weight; }
+     void setCurrentElementId(int32_t elementId) { _currentElementId = elementId; }
+     bool onSearch(const StorageDocument & doc);
+     /// Called for every primitive value of the field.
+     virtual void onValue(const document::FieldValue & fv) = 0;
+     /// Called when a struct value starts; does nothing by default.
+     virtual void onStructValue(const document::StructFieldValue &) { }
+     FieldIdT _field;
+     MatchType _matchType;
+     unsigned _maxFieldLength;
+     uint32_t _currentElementId;     // Array index of the value currently being evaluated.
+     int32_t _currentElementWeight;  // Contains the weight of the current item being evaluated.
+     /// Number of bytes in blocks containing pure us-ascii
+     unsigned _pureUsAsciiCount;
+     /// Number of blocks containing pure us-ascii
+     unsigned _pureUsAsciiFieldCount;
+     /// Number of bytes in blocks containing any non us-ascii
+     unsigned _anyUtf8Count;
+     /// Number of blocks containing any non us-ascii
+     unsigned _anyUtf8FieldCount;
+ protected:
+     /// Number of words seen so far in the current search(); reported as the field length and then reset.
+     unsigned _words;
+     /// Number of utf8 bytes by utf8 size.
+     unsigned _utf8Count[6];
+     unsigned _badUtf8Count;
+     unsigned _zeroCount;
+ protected:
+     void addPureUsAsciiField(size_t sz) { _pureUsAsciiCount += sz; _pureUsAsciiFieldCount++; }
+     void addAnyUtf8Field(size_t sz) { _anyUtf8Count += sz; _anyUtf8FieldCount++; }
+     /**
+      * Adds a hit to the given query term.
+      * For each call to onValue() a batch of words are processed, and the position is local to this batch.
+      **/
+     void addHit(search::streaming::QueryTerm & qt, uint32_t pos) const {
+         qt.add(_words + pos, field(), _currentElementId, getCurrentWeight());
+     }
+ public:
+     static search::byte _foldLowCase[256];
+     static search::byte _wordChar[256];
+ };
+
+ using FieldSearcherContainer = std::unique_ptr<FieldSearcher>;
+ using FieldIdTSearcherMapT = std::vector<FieldSearcherContainer>;
+
+ /// A list of owned field searchers that can all be prepared for one query in a single call.
+ class FieldIdTSearcherMap : public FieldIdTSearcherMapT
+ {
+ public:
+     void prepare(const DocumentTypeIndexFieldMapT & difm,
+                  const SharedSearcherBuf & searcherBuf,
+                  search::streaming::Query & query);
+ };
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp
new file mode 100644
index 00000000000..02d8bd8c12a
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp
@@ -0,0 +1,70 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "floatfieldsearcher.h"
+
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+
+namespace vsm {
+
+ // Returns a new searcher that is copy-constructed from this one.
+ std::unique_ptr<FieldSearcher>
+ FloatFieldSearcher::duplicate() const
+ {
+     auto copy = std::make_unique<FloatFieldSearcher>(*this);
+     return copy;
+ }
+
+ // Returns a new searcher that is copy-constructed from this one.
+ std::unique_ptr<FieldSearcher>
+ DoubleFieldSearcher::duplicate() const
+ {
+     auto copy = std::make_unique<DoubleFieldSearcher>(*this);
+     return copy;
+ }
+
+ // Binds the searcher to field fId; the range list is filled in by prepare().
+ template<typename T>
+ FloatFieldSearcherT<T>::FloatFieldSearcherT(FieldIdT fId)
+     : FieldSearcher(fId),
+       _floatTerm()
+ {
+ }
+
+ template<typename T>
+ FloatFieldSearcherT<T>::~FloatFieldSearcherT() = default;
+
+ // Converts every query term into a numeric range, one FloatInfo per term and in the
+ // same order as qtl.
+ // onValue() pairs _floatTerm[j] with _qtl[j], so an entry is stored for every term:
+ // an empty term gets an invalid placeholder, which onValue() skips, instead of being
+ // left out (which would shift all later ranges onto the wrong query terms).
+ template<typename T>
+ void FloatFieldSearcherT<T>::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf)
+ {
+     _floatTerm.clear();
+     FieldSearcher::prepare(qtl, buf);
+     for (const QueryTerm * qt : qtl) {
+         double low = 0.0;
+         double high = 0.0;
+         bool valid = (qt->termLen() != 0) && qt->getAsDoubleTerm(low, high);
+         _floatTerm.push_back(FloatInfo(low, high, valid));
+     }
+ }
+
+
+ // Registers a hit at position 0 for every term whose range is valid and contains the
+ // field value (read with getAsDouble()). The value counts as one word.
+ template<typename T>
+ void FloatFieldSearcherT<T>::onValue(const document::FieldValue & fv)
+ {
+     const size_t termCount = _floatTerm.size();
+     for (size_t idx = 0; idx < termCount; ++idx) {
+         const FloatInfo & range = _floatTerm[idx];
+         if (!range.valid()) {
+             continue;
+         }
+         if (range.cmp(fv.getAsDouble())) {
+             addHit(*_qtl[idx], 0);
+         }
+     }
+     ++_words;
+ }
+
+ // True when key lies inside the closed interval [_lower, _upper].
+ template<typename T>
+ bool FloatFieldSearcherT<T>::FloatInfo::cmp(T key) const
+ {
+     if (!(_lower <= key)) {
+         return false;
+     }
+     return key <= _upper;
+ }
+
+ // The template is defined in this file, so the two supported variants are instantiated here.
+ template class FloatFieldSearcherT<float>;
+ template class FloatFieldSearcherT<double>;
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h
new file mode 100644
index 00000000000..98018fbf4a3
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.h
@@ -0,0 +1,53 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "fieldsearcher.h"
+
+namespace vsm {
+
+ /**
+  * Field searcher for floating point fields of type T.
+  * Each query term is turned into a closed numeric range, and a field value matches
+  * the term when it falls inside that range.
+  **/
+ template <typename T>
+ class FloatFieldSearcherT : public FieldSearcher
+ {
+ public:
+     FloatFieldSearcherT(FieldIdT fId=0);
+     ~FloatFieldSearcherT();
+     void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override;
+     void onValue(const document::FieldValue & fv) override;
+ protected:
+     /// Closed range [lower, upper] plus a flag telling whether the term was a usable number.
+     class FloatInfo
+     {
+     public:
+         /// The bounds are stored in ascending order: they are swapped when low > high.
+         FloatInfo(T low, T high, bool v)
+             : _lower((low > high) ? high : low),
+               _upper((low > high) ? low : high),
+               _valid(v)
+         { }
+         bool cmp(T key) const;
+         bool valid() const { return _valid; }
+         void setValid(bool v) { _valid = v; }
+         T getLow() const { return _lower; }
+         T getHigh() const { return _upper; }
+     private:
+         T _lower;
+         T _upper;
+         bool _valid;
+     };
+     using FloatInfoListT = std::vector<FloatInfo>;
+     FloatInfoListT _floatTerm;
+ };
+
+ // Shorthand names for the two instantiated variants.
+ using FloatFieldSearcherTF = FloatFieldSearcherT<float>;
+ using FloatFieldSearcherTD = FloatFieldSearcherT<double>;
+
+ /// Concrete searcher built on the float variant of FloatFieldSearcherT.
+ class FloatFieldSearcher : public FloatFieldSearcherTF
+ {
+ public:
+     std::unique_ptr<FieldSearcher> duplicate() const override;
+     FloatFieldSearcher(FieldIdT fId=0)
+         : FloatFieldSearcherTF(fId)
+     { }
+ };
+
+ /// Concrete searcher built on the double variant of FloatFieldSearcherT.
+ class DoubleFieldSearcher : public FloatFieldSearcherTD
+ {
+ public:
+     std::unique_ptr<FieldSearcher> duplicate() const override;
+     DoubleFieldSearcher(FieldIdT fId=0)
+         : FloatFieldSearcherTD(fId)
+     { }
+ };
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fold.cpp b/streamingvisitors/src/vespa/vsm/searcher/fold.cpp
new file mode 100644
index 00000000000..bd2392d3ad6
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/fold.cpp
@@ -0,0 +1,153 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+//
+#include "fold.h"
+
+namespace vsm {
+
+ // Folds sz/16 blocks of 16 bytes from toFoldOrg into foldedOrg.
+ // In the output, digits are kept, ASCII letters are lower-cased and every other byte
+ // becomes 0. Folding stops at the first block containing a byte with the high bit set.
+ // Returns a pointer just past the last source block that was folded, so the result
+ // equals toFoldOrg + (sz/16)*16 exactly when all whole blocks were folded.
+ // Both buffers are accessed through 16-byte vector pointers.
+ // NOTE(review): this presumably requires both buffers to be 16-byte aligned - confirm with callers.
+ const unsigned char * sse2_foldaa(const unsigned char * toFoldOrg, size_t sz, unsigned char * foldedOrg)
+ {
+     typedef char v16qi __attribute__ ((__vector_size__(16)));
+     typedef long long v2di __attribute__ ((__vector_size__(16)));
+     static v16qi _G_0 = { '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1 };
+     static v16qi _G_9 = { '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9' };
+     static v16qi _G_a = { 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1 };
+     static v16qi _G_z = { 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z' };
+     static v2di _G_lowCase = { 0x2020202020202020ULL, 0x2020202020202020ULL };
+     const v16qi *toFold = reinterpret_cast<const v16qi *>(toFoldOrg);
+     v2di * folded = reinterpret_cast<v2di *>(foldedOrg);
+     size_t i=0;
+     for (size_t m=sz/16; i < m; i++)
+     {
+ #ifndef __INTEL_COMPILER
+         // One mask bit per byte whose high bit is set; any such byte ends the fast path.
+         int nonAscii = __builtin_ia32_pmovmskb128(toFold[i]);
+         if (nonAscii)
+         {
+             break;
+         }
+ #ifdef __clang__
+         // Keep only bytes in ('0'-1, 'z'], then set bit 0x20 to lower-case letters.
+         v16qi _0 = toFold[i] > _G_0;
+         v16qi _z = toFold[i] > _G_z;
+         v2di _0_z = v2di(_0) ^ v2di(_z);
+         v2di toLow = _0_z & v2di(toFold[i]);
+         v16qi low = v16qi(toLow | _G_lowCase);
+         // Of the lower-cased bytes keep those in '0'..'9' or 'a'..'z'; the rest become 0.
+         _0 = low > _G_0;
+         v16qi _9 = low > _G_9;
+         v16qi _a = low > _G_a;
+         _z = low > _G_z;
+         v2di _0_9_m = v2di(_0) ^ v2di(_9);
+         v2di _a_z_m = v2di(_a) ^ v2di(_z);
+         v2di _0_9 = _0_9_m & v2di(low);
+         v2di _a_z = _a_z_m & v2di(low);
+         folded[i] = _0_9 | _a_z;
+ #else
+         // Keep only bytes in ('0'-1, 'z'], then set bit 0x20 to lower-case letters.
+         v16qi _0 = __builtin_ia32_pcmpgtb128(toFold[i], _G_0);
+         v16qi _z = __builtin_ia32_pcmpgtb128(toFold[i], _G_z);
+         v2di _0_z = __builtin_ia32_pxor128(v2di(_0), v2di(_z));
+         v2di toLow = __builtin_ia32_pand128(_0_z, v2di(toFold[i]));
+         v16qi low = v16qi(__builtin_ia32_por128(toLow, _G_lowCase));
+         // Of the lower-cased bytes keep those in '0'..'9' or 'a'..'z'; the rest become 0.
+         _0 = __builtin_ia32_pcmpgtb128(low, _G_0);
+         v16qi _9 = __builtin_ia32_pcmpgtb128(low, _G_9);
+         v16qi _a = __builtin_ia32_pcmpgtb128(low, _G_a);
+         _z = __builtin_ia32_pcmpgtb128(low, _G_z);
+         v2di _0_9_m = __builtin_ia32_pxor128(v2di(_0), v2di(_9));
+         v2di _a_z_m = __builtin_ia32_pxor128(v2di(_a), v2di(_z));
+         v2di _0_9 = __builtin_ia32_pand128(_0_9_m, v2di(low));
+         v2di _a_z = __builtin_ia32_pand128(_a_z_m, v2di(low));
+         folded[i] = __builtin_ia32_por128(_0_9, _a_z);
+ #endif
+ #else
+ # warning "Intel's icc compiler does not like __builtin_ia32_pxor128"
+         LOG_ABORT("should not be reached");
+ #endif
+     }
+     return toFoldOrg+i*16;
+ }
+
+ // Same folding as sse2_foldaa, but each source block is fetched with an unaligned
+ // load, so only the destination is accessed through a 16-byte vector pointer.
+ // In the output, digits are kept, ASCII letters are lower-cased and every other byte
+ // becomes 0. Folding stops at the first block containing a byte with the high bit set.
+ // Returns a pointer just past the last source block that was folded.
+ // NOTE(review): foldedOrg presumably has to be 16-byte aligned - confirm with callers.
+ const unsigned char * sse2_foldua(const unsigned char * toFoldOrg, size_t sz, unsigned char * foldedOrg)
+ {
+     typedef char v16qi __attribute__ ((__vector_size__(16)));
+     typedef long long v2di __attribute__ ((__vector_size__(16)));
+     static v16qi _G_0 = { '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1, '0'-1 };
+     static v16qi _G_9 = { '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9', '9' };
+     static v16qi _G_a = { 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1, 'a'-1 };
+     static v16qi _G_z = { 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z', 'z' };
+     static v2di _G_lowCase = { 0x2020202020202020ULL, 0x2020202020202020ULL };
+     v2di * folded = reinterpret_cast<v2di *>(foldedOrg);
+     size_t i=0;
+     for (size_t m=sz/16; i < m; i++)
+     {
+ #ifndef __INTEL_COMPILER
+ #ifdef __clang__
+         v16qi current = __builtin_ia32_lddqu(reinterpret_cast<const char *>(&toFoldOrg[i*16]));
+ #else
+         v16qi current = __builtin_ia32_loaddqu(reinterpret_cast<const char *>(&toFoldOrg[i*16]));
+ #endif
+         // One mask bit per byte whose high bit is set; any such byte ends the fast path.
+         int nonAscii = __builtin_ia32_pmovmskb128(current);
+         if (nonAscii)
+         {
+             break;
+         }
+ #ifdef __clang__
+         // Keep only bytes in ('0'-1, 'z'], then set bit 0x20 to lower-case letters.
+         v16qi _0 = current > _G_0;
+         v16qi _z = current > _G_z;
+         v2di _0_z = v2di(_0) ^ v2di(_z);
+         v2di toLow = _0_z & v2di(current);
+         v16qi low = v16qi(toLow | _G_lowCase);
+         // Of the lower-cased bytes keep those in '0'..'9' or 'a'..'z'; the rest become 0.
+         _0 = low > _G_0;
+         v16qi _9 = low > _G_9;
+         v16qi _a = low > _G_a;
+         _z = low > _G_z;
+         v2di _0_9_m = v2di(_0) ^ v2di(_9);
+         v2di _a_z_m = v2di(_a) ^ v2di(_z);
+         v2di _0_9 = _0_9_m & v2di(low);
+         v2di _a_z = _a_z_m & v2di(low);
+         folded[i] = _0_9 | _a_z;
+ #else
+         // Keep only bytes in ('0'-1, 'z'], then set bit 0x20 to lower-case letters.
+         v16qi _0 = __builtin_ia32_pcmpgtb128(current, _G_0);
+         v16qi _z = __builtin_ia32_pcmpgtb128(current, _G_z);
+         v2di _0_z = __builtin_ia32_pxor128(v2di(_0), v2di(_z));
+         v2di toLow = __builtin_ia32_pand128(_0_z, v2di(current));
+         v16qi low = v16qi(__builtin_ia32_por128(toLow, _G_lowCase));
+         // Of the lower-cased bytes keep those in '0'..'9' or 'a'..'z'; the rest become 0.
+         _0 = __builtin_ia32_pcmpgtb128(low, _G_0);
+         v16qi _9 = __builtin_ia32_pcmpgtb128(low, _G_9);
+         v16qi _a = __builtin_ia32_pcmpgtb128(low, _G_a);
+         _z = __builtin_ia32_pcmpgtb128(low, _G_z);
+         v2di _0_9_m = __builtin_ia32_pxor128(v2di(_0), v2di(_9));
+         v2di _a_z_m = __builtin_ia32_pxor128(v2di(_a), v2di(_z));
+         v2di _0_9 = __builtin_ia32_pand128(_0_9_m, v2di(low));
+         v2di _a_z = __builtin_ia32_pand128(_a_z_m, v2di(low));
+         folded[i] = __builtin_ia32_por128(_0_9, _a_z);
+ #endif
+ #else
+ # warning "Intel's icc compiler does not like __builtin_ia32_pxor128"
+         LOG_ABORT("should not be reached");
+ #endif
+     }
+     return toFoldOrg+i*16;
+ }
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fold.h b/streamingvisitors/src/vespa/vsm/searcher/fold.h
new file mode 100644
index 00000000000..578b883484f
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/fold.h
@@ -0,0 +1,12 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/vsm/common/document.h>
+
+namespace vsm {
+
+ /// Folds whole 16-byte blocks of toFoldOrg (sz bytes) into foldedOrg and returns a pointer
+ /// just past the last source block that was folded. Reads the source through a vector pointer.
+ const search::byte * sse2_foldaa(const search::byte * toFoldOrg, size_t sz, search::byte * foldedOrg);
+ /// Same as sse2_foldaa, but the source is read with unaligned loads.
+ const search::byte * sse2_foldua(const search::byte * toFoldOrg, size_t sz, search::byte * foldedOrg);
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp
new file mode 100644
index 00000000000..fc5d77de419
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp
@@ -0,0 +1,310 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "futf8strchrfieldsearcher.h"
+#ifdef __x86_64__
+#include "fold.h"
+#endif
+#include <vespa/vespalib/util/size_literals.h>
+
+using search::byte;
+using search::streaming::QueryTerm;
+using search::v16qi;
+using vespalib::Optimized;
+
+namespace vsm {
+
+ // Returns a new searcher that is copy-constructed from this one.
+ std::unique_ptr<FieldSearcher>
+ FUTF8StrChrFieldSearcher::duplicate() const
+ {
+     auto copy = std::make_unique<FUTF8StrChrFieldSearcher>(*this);
+     return copy;
+ }
+
+ // Both constructors start the fold buffer at 4 Ki bytes.
+ FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher()
+     : UTF8StrChrFieldSearcher(),
+       _folded(4_Ki)
+ {
+ }
+
+ FUTF8StrChrFieldSearcher::FUTF8StrChrFieldSearcher(FieldIdT fId)
+     : UTF8StrChrFieldSearcher(fId),
+       _folded(4_Ki)
+ {
+ }
+
+ FUTF8StrChrFieldSearcher::~FUTF8StrChrFieldSearcher() = default;
+
+ // Folds sz bytes from toFold into folded using the _foldLowCase table.
+ // Stops and returns false at the first byte >= 128 (bytes before it have already been
+ // written); returns true when every byte was folded.
+ bool
+ FUTF8StrChrFieldSearcher::ansiFold(const char * toFold, size_t sz, char * folded)
+ {
+     for (size_t pos = 0; pos < sz; ++pos) {
+         const byte ch = toFold[pos];
+         if (ch >= 128) {
+             return false;
+         }
+         folded[pos] = FieldSearcher::_foldLowCase[ch];
+     }
+     return true;
+ }
+
+ // Folds sz bytes from toFold into folded. The output is placed at an offset into folded,
+ // reported through unalignedStart, chosen so that the part of the input starting at a
+ // 16-byte boundary is written to a 16-byte boundary of the output.
+ // On x86_64 the leading bytes and the tail are folded with ansiFold() and the whole
+ // 16-byte blocks in between with sse2_foldaa(); elsewhere ansiFold() handles everything.
+ // Returns false as soon as input that cannot be folded this way is met.
+ bool
+ FUTF8StrChrFieldSearcher::lfoldaa(const char * toFold, size_t sz, char * folded, size_t & unalignedStart)
+ {
+     unalignedStart = (size_t(toFold) & 0xF);
+ #ifdef __x86_64__
+     bool retval(true);
+     // Number of input bytes in front of the first 16-byte boundary.
+     size_t unalignedsz = std::min(sz, (16 - unalignedStart) & 0xF);
+
+     size_t foldedUnaligned = (size_t(folded) & 0xF);
+     unalignedStart = (foldedUnaligned < unalignedStart) ? (unalignedStart-foldedUnaligned) : unalignedStart + 16 - foldedUnaligned;
+     size_t alignedStart = unalignedStart+unalignedsz;
+
+     size_t alignedsz = sz - unalignedsz;
+     // Round down to a multiple of 16. The mask is built from size_t so that it keeps the
+     // upper bits of large sizes.
+     size_t alignsz16 = alignedsz & ~size_t(0xF);
+     size_t rest = alignedsz - alignsz16;
+
+     if (unalignedStart) {
+         retval = ansiFold(toFold, unalignedsz, folded + unalignedStart);
+     }
+     if (alignsz16 && retval) {
+         const byte * end = sse2_foldaa(reinterpret_cast<const byte *>(toFold+unalignedsz), alignsz16, reinterpret_cast<byte *>(folded+alignedStart));
+         // sse2_foldaa stops early on a block it cannot fold.
+         retval = (end == reinterpret_cast<const byte *>(toFold+unalignedsz+alignsz16));
+     }
+     if(rest && retval) {
+         retval = ansiFold(toFold + unalignedsz + alignsz16, rest, folded+alignedStart+alignsz16);
+     }
+     return retval;
+ #else
+     return ansiFold(toFold, sz, folded + unalignedStart);
+ #endif
+ }
+
+ // Folds sz bytes from toFold into folded without any alignment requirement on toFold.
+ // The output starts at folded + alignedStart, the first 16-byte boundary at or after folded.
+ // On x86_64 whole 16-byte blocks are folded with sse2_foldua() and the tail with
+ // ansiFold(); elsewhere ansiFold() handles everything.
+ // Returns false as soon as input that cannot be folded this way is met.
+ bool
+ FUTF8StrChrFieldSearcher::lfoldua(const char * toFold, size_t sz, char * folded, size_t & alignedStart)
+ {
+     alignedStart = 0xF - (size_t(folded + 0xF) % 0x10);
+ #ifdef __x86_64__
+     bool retval(true);
+
+     // Round down to a multiple of 16. The mask is built from size_t so that it keeps the
+     // upper bits of large sizes.
+     size_t alignsz16 = sz & ~size_t(0xF);
+     size_t rest = sz - alignsz16;
+
+     if (alignsz16) {
+         const byte * end = sse2_foldua(reinterpret_cast<const byte *>(toFold), alignsz16, reinterpret_cast<byte *>(folded+alignedStart));
+         // sse2_foldua stops early on a block it cannot fold.
+         retval = (end == reinterpret_cast<const byte *>(toFold+alignsz16));
+     }
+     if(rest && retval) {
+         retval = ansiFold(toFold + alignsz16, rest, folded+alignedStart+alignsz16);
+     }
+     return retval;
+ #else
+     return ansiFold(toFold, sz, folded + alignedStart);
+ #endif
+ }
+
+ namespace {
+
+ #ifdef __x86_64__
+ // Moves from position n inside a word to the start of the next word in a folded buffer,
+ // where words are runs of non-zero bytes separated by one or more '\0' bytes.
+ // The buffer is scanned 16 bytes at a time with unaligned loads and there is no end
+ // pointer: the loops only stop when they find a '\0' and then a following byte > 0.
+ // NOTE(review): callers presumably guarantee this with a sentinel after the data - confirm.
+ inline const char * advance(const char * n, const v16qi zero)
+ {
+ uint32_t charMap = 0;
+ unsigned zeroCountSum = 0;
+ do { // find first '\0' character (the end of the word)
+ #ifndef __INTEL_COMPILER
+ #ifdef __clang__
+ v16qi tmpCurrent = __builtin_ia32_lddqu(n+zeroCountSum);
+ v16qi tmp0 = tmpCurrent == zero;
+ #else
+ v16qi tmpCurrent = __builtin_ia32_loaddqu(n+zeroCountSum);
+ v16qi tmp0 = __builtin_ia32_pcmpeqb128(tmpCurrent, reinterpret_cast<v16qi>(zero));
+ #endif
+ charMap = __builtin_ia32_pmovmskb128(tmp0); // 1 in charMap equals to '\0' in input buffer
+ #else
+ # warning "Intel's icc compiler does not like __builtin_ia32_xxxxx"
+ LOG_ABORT("should not be reached");
+ #endif
+ zeroCountSum += 16;
+ } while (!charMap);
+ int charCount = Optimized::lsbIdx(charMap); // number of word characters in last 16 bytes
+ // Bits for the non-zero bytes that follow the first '\0' within the same 16 bytes.
+ uint32_t zeroMap = ((~charMap) & 0xffff) >> charCount;
+
+ int zeroCounter = Optimized::lsbIdx(zeroMap); // number of non-characters ('\0') in last 16 bytes
+ int sum = zeroCountSum - 16 + charCount + zeroCounter;
+ if (!zeroMap) { // only '\0' in last 16 bytes (no new word found)
+ do { // find first word character (the next word)
+ #ifndef __INTEL_COMPILER
+ #ifdef __clang__
+ v16qi tmpCurrent = __builtin_ia32_lddqu(n+zeroCountSum);
+ tmpCurrent = tmpCurrent > zero;
+ #else
+ v16qi tmpCurrent = __builtin_ia32_loaddqu(n+zeroCountSum);
+ tmpCurrent = __builtin_ia32_pcmpgtb128(tmpCurrent, reinterpret_cast<v16qi>(zero));
+ #endif
+ zeroMap = __builtin_ia32_pmovmskb128(tmpCurrent); // 1 in zeroMap equals to word character in input buffer
+ #else
+ # warning "Intel's icc compiler does not like __builtin_ia32_xxxxx"
+ LOG_ABORT("should not be reached");
+ #endif
+ zeroCountSum += 16;
+ } while(!zeroMap);
+ zeroCounter = Optimized::lsbIdx(zeroMap);
+ sum = zeroCountSum - 16 + zeroCounter;
+ }
+ return n + sum;
+ }
+ #else
+ // Portable variant: finds the next '\0' with memchr, searching 64 Ki bytes at a time,
+ // then skips the run of '\0' bytes and returns the first non-zero byte after it.
+ // Like the x86_64 variant it has no end pointer.
+ inline const char* advance(const char* n)
+ {
+ const char* p = n;
+ const char* zero = static_cast<const char *>(memchr(p, 0, 64_Ki));
+ while (zero == nullptr) {
+ p += 64_Ki;
+ zero = static_cast<const char *>(memchr(p, 0, 64_Ki));
+ }
+ p = zero;
+ while (*p == '\0') {
+ ++p;
+ }
+ return p;
+ }
+ #endif
+
+ }
+
+ // Matches one query term against a folded buffer of sz bytes in which words are runs of
+ // non-zero bytes separated by '\0'. A hit is added, with the word index as position, for
+ // each word that starts with the term and either ends there or prefix matching applies.
+ // Returns the number of words visited.
+ // The leading-zero skip and advance() have no bound of their own.
+ // NOTE(review): presumably safe only because callers append a sentinel after the data - confirm.
+ size_t FUTF8StrChrFieldSearcher::match(const char *folded, size_t sz, QueryTerm & qt)
+ {
+ #ifdef __x86_64__
+ const v16qi _G_zero = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ #endif
+ termcount_t words(0);
+ const char * term;
+ termsize_t tsz = qt.term(term);
+ const char *et=term+tsz;
+ const char * n = folded;
+ const char *e = n + sz;
+
+ while (!*n) n++; // skip separators in front of the first word
+ while (true) {
+ if (n>=e) break;
+
+ #if 0
+ v16qi current = __builtin_ia32_loaddqu(n);
+ current = __builtin_ia32_pcmpeqb128(current, _qtlFast[0]);
+ unsigned eqMap = __builtin_ia32_pmovmskb128(current);
+ unsigned neqMap = ~eqMap;
+ unsigned numEq = Optimized::lsbIdx(neqMap);
+ /* if (eqMap)*/ {
+ if (numEq >= 16) {
+ const char *tt = term+16;
+ const char *p = n+16;
+ while ( (*tt == *p) && (tt < et)) { tt++; p++; numEq++; }
+ }
+ if ((numEq >= tsz) && (prefix() || qt.isPrefix() || !n[tsz])) {
+ addHit(qt, words);
+ }
+ }
+ #else
+ // Compare the term byte by byte with the current word; n ends up on the first mismatch.
+ const char *tt = term;
+ while ((tt < et) && (*tt == *n)) { tt++; n++; }
+ if ((tt == et) && (prefix() || qt.isPrefix() || !*n)) {
+ addHit(qt, words);
+ }
+ #endif
+ words++;
+ #ifdef __x86_64__
+ n = advance(n, _G_zero);
+ #else
+ n = advance(n);
+ #endif
+ }
+ return words;
+ }
+
+ // Matches qtlSize query terms against a folded buffer of sz bytes in which words are runs
+ // of non-zero bytes separated by '\0'. Every term is compared with every word, and a hit
+ // is added (word index as position) when the word starts with the term and either ends
+ // there or prefix matching applies. mintsz is not used. Returns the number of words visited.
+ // The leading-zero skip and advance() have no bound of their own.
+ // NOTE(review): presumably safe only because callers append a sentinel after the data - confirm.
+ size_t FUTF8StrChrFieldSearcher::match(const char *folded, size_t sz, size_t mintsz, QueryTerm ** qtl, size_t qtlSize)
+ {
+ (void) mintsz;
+ #ifdef __x86_64__
+ const v16qi _G_zero = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+ #endif
+ termcount_t words(0);
+ const char * n = folded;
+ const char *e = n + sz;
+ while (!*n) n++; // skip separators in front of the first word
+ for( ; ; ) {
+ if (n>=e) break;
+ #if 0
+ v16qi current = __builtin_ia32_loaddqu(n);
+ for(size_t i=0; i < qtlSize; i++) {
+ v16qi tmpEq = __builtin_ia32_pcmpeqb128(current, _qtlFast[i]);
+ unsigned eqMap = __builtin_ia32_pmovmskb128(tmpEq);
+ /* if (eqMap) */ {
+ QueryTerm & qt = *qtl[i];
+ unsigned neqMap = ~eqMap;
+ unsigned numEq = Optimized::lsbIdx(neqMap);
+ termsize_t tsz = qt.termLen();
+ if (numEq >= 16) {
+ const char *tt = qt.term() + 16;
+ const char *et=tt+tsz;
+ const char *p = n+16;
+ while ( (*tt == *p) && (tt < et)) { tt++; p++; numEq++; }
+ }
+ if ((numEq >= tsz) && (prefix() || qt.isPrefix() || !n[tsz])) {
+ addHit(qt, words);
+ }
+ }
+ }
+ #else
+ for(QueryTerm ** it=qtl, ** mt=qtl+qtlSize; it != mt; it++) {
+ QueryTerm & qt = **it;
+ const char * term;
+ termsize_t tsz = qt.term(term);
+
+ const char *et=term+tsz;
+ const char *fnt;
+ // fnt walks a copy of the word position so that n stays put for the next term.
+ for (fnt = n; (term < et) && (*term == *fnt); term++, fnt++);
+ if ((term == et) && (prefix() || qt.isPrefix() || !*fnt)) {
+ addHit(qt, words);
+ }
+ }
+ #endif
+ words++;
+ #ifdef __x86_64__
+ n = advance(n, _G_zero);
+ #else
+ n = advance(n);
+ #endif
+ }
+ return words;
+ }
+
+ // Matches a single query term against field f.
+ // The field is first folded into _folded; when the folding succeeds the folded copy is
+ // searched with match(), otherwise the UTF8StrChrFieldSearcher implementation is used.
+ // Returns the number of words reported by whichever implementation ran.
+ size_t FUTF8StrChrFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
+ {
+     // Room for the folded field plus 3*16 bytes of alignment slack and padding, to enable
+     // full xmm stores. The buffer is resized (not merely reserved) so that every byte
+     // written below lies within the vector's size.
+     const size_t needed = f.size()+16*3;
+     if (_folded.size() < needed) {
+         _folded.resize(needed);
+     }
+     size_t unalignedStart(0);
+     bool ascii7Bit = lfoldua(f.data(), f.size(), &_folded[0], unalignedStart);
+     if (ascii7Bit) {
+         char * folded = &_folded[unalignedStart];
+         /// Add the pattern 00 01 00 to avoid multiple eof tests of falling off the edge.
+         folded[f.size()] = 0;
+         folded[f.size()+1] = 0x01;
+         memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values
+         // Done before returning so the statistics are collected when COLLECT_CHAR_STAT is defined.
+         NEED_CHAR_STAT(addPureUsAsciiField(f.size()));
+         return match(folded, f.size(), qt);
+     } else {
+         return UTF8StrChrFieldSearcher::matchTerm(f, qt);
+     }
+ }
+
+ // Matches all prepared query terms against field f.
+ // The field is first folded into _folded; when the folding succeeds the folded copy is
+ // searched with match(), otherwise the UTF8StrChrFieldSearcher implementation is used.
+ // Returns the number of words reported by whichever implementation ran.
+ size_t FUTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+ {
+     // Room for the folded field plus 3*16 bytes of alignment slack and padding, to enable
+     // full xmm stores. The buffer is resized (not merely reserved) so that every byte
+     // written below lies within the vector's size.
+     const size_t needed = f.size()+16*3;
+     if (_folded.size() < needed) {
+         _folded.resize(needed);
+     }
+     size_t unalignedStart(0);
+     bool ascii7Bit = lfoldua(f.data(), f.size(), &_folded[0], unalignedStart);
+     if (ascii7Bit) {
+         char * folded = &_folded[unalignedStart];
+         /// Add the pattern 00 01 00 to avoid multiple eof tests of falling off the edge.
+         folded[f.size()] = 0;
+         folded[f.size()+1] = 0x01;
+         memset(folded + f.size() + 2, 0, 16); // initialize padding data to avoid valgrind complaining about uninitialized values
+         // Done before returning so the statistics are collected when COLLECT_CHAR_STAT is defined.
+         NEED_CHAR_STAT(addPureUsAsciiField(f.size()));
+         return match(folded, f.size(), mintsz, &_qtl[0], _qtl.size());
+     } else {
+         return UTF8StrChrFieldSearcher::matchTerms(f, mintsz);
+     }
+ }
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h
new file mode 100644
index 00000000000..900ab4c9120
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/futf8strchrfieldsearcher.h
@@ -0,0 +1,26 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "utf8strchrfieldsearcher.h"
+
+namespace vsm {
+
+/**
+ * UTF-8 field searcher that first folds the field into the private _folded
+ * scratch buffer (via lfoldua) and matches against the folded copy.
+ * matchTerm()/matchTerms() fall back to the UTF8StrChrFieldSearcher
+ * implementation when the fold step reports failure (the result flag is named
+ * ascii7Bit in the implementation, so presumably this means the field is not
+ * pure 7-bit ASCII - confirm against lfoldua).
+ **/
+class FUTF8StrChrFieldSearcher : public UTF8StrChrFieldSearcher
+{
+public:
+ std::unique_ptr<FieldSearcher> duplicate() const override;
+ FUTF8StrChrFieldSearcher();
+ FUTF8StrChrFieldSearcher(FieldIdT fId);
+ ~FUTF8StrChrFieldSearcher();
+ // Folding helpers. Their definitions are outside this header; each takes sz
+ // input bytes from toFold and writes to folded, returning a success flag.
+ static bool ansiFold(const char * toFold, size_t sz, char * folded);
+ static bool lfoldaa(const char * toFold, size_t sz, char * folded, size_t & unalignedStart);
+ // NOTE(review): the last parameter is named alignedStart here, but lfoldaa
+ // and the callers in the .cpp file call it unalignedStart - confirm which
+ // name reflects the actual meaning.
+ static bool lfoldua(const char * toFold, size_t sz, char * folded, size_t & alignedStart);
+ private:
+ size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+ size_t matchTerms(const FieldRef&, const size_t shortestTerm) override;
+ // Single-term matcher working on an already folded buffer of sz characters.
+ virtual size_t match(const char *folded, size_t sz, search::streaming::QueryTerm & qt);
+ // Multi-term matcher working on an already folded buffer; qtl/qtlSize is the term array.
+ size_t match(const char *folded, size_t sz, size_t mintsz, search::streaming::QueryTerm ** qtl, size_t qtlSize);
+ // Scratch buffer holding the folded copy of the field being searched.
+ std::vector<char> _folded;
+};
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp
new file mode 100644
index 00000000000..db93bda7778
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp
@@ -0,0 +1,78 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "geo_pos_field_searcher.h"
+#include <vespa/document/fieldvalue/arrayfieldvalue.h>
+#include <vespa/document/fieldvalue/structfieldvalue.h>
+#include <vespa/searchlib/common/geo_location_parser.h>
+#include <vespa/vespalib/util/issue.h>
+#include <vespa/vespalib/util/exception.h>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".vsm.searcher.geo_pos_field_searcher");
+
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+using search::common::GeoLocation;
+using search::common::GeoLocationParser;
+
+namespace vsm {
+
+// Returns a copy-constructed clone of this searcher, owned by the caller.
+std::unique_ptr<FieldSearcher> GeoPosFieldSearcher::duplicate() const {
+    std::unique_ptr<FieldSearcher> clone = std::make_unique<GeoPosFieldSearcher>(*this);
+    return clone;
+}
+
+// Creates a searcher for field fId with an empty list of parsed geo terms.
+GeoPosFieldSearcher::GeoPosFieldSearcher(FieldIdT fId)
+    : FieldSearcher(fId),
+      _geoPosTerm()
+{
+}
+
+// Nothing to release beyond what the members clean up themselves.
+GeoPosFieldSearcher::~GeoPosFieldSearcher() = default;
+
+/**
+ * Prepares the searcher for a new query: runs the base class preparation and
+ * parses the term string of every query term as a geo location.
+ * Exactly one entry is appended to _geoPosTerm per query term, also when
+ * parsing fails (the failure is reported as an issue).
+ **/
+void GeoPosFieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf) {
+    _geoPosTerm.clear();
+    FieldSearcher::prepare(qtl, buf);
+    for (const QueryTerm * term : qtl) {
+        const vespalib::string & termString = term->getTermString();
+        GeoLocationParser locationParser;
+        if ( ! locationParser.parseNoField(termString)) {
+            vespalib::Issue::report("invalid position in term: %s", termString.c_str());
+        }
+        _geoPosTerm.emplace_back(locationParser.getGeoLocation());
+    }
+}
+
+// Generic (non-struct) field values are not matched by this searcher; they are
+// only logged at spam level. Matching happens in onStructValue().
+void GeoPosFieldSearcher::onValue(const document::FieldValue & fv) {
+ LOG(spam, "ignore field value '%s'", fv.toString().c_str());
+}
+
+/**
+ * Checks the struct value against every parsed geo term and registers a hit
+ * (at position 0) on the query term with the same index for each valid term
+ * that accepts it. The value always counts as one word.
+ **/
+void GeoPosFieldSearcher::onStructValue(const document::StructFieldValue & fv) {
+    size_t termIdx = 0;
+    for (const GeoPosInfo & location : _geoPosTerm) {
+        const bool hit = location.valid() && location.cmp(fv);
+        if (hit) {
+            addHit(*_qtl[termIdx], 0);
+        }
+        ++termIdx;
+    }
+    ++_words;
+}
+
+/**
+ * Tests whether the position stored in the struct (fields "x" and "y") is
+ * accepted by inside_limit() for this location.
+ *
+ * @return true only when both fields are present and the point is inside the
+ *         limit; false when a field is missing, the point is outside, or a
+ *         vespalib::Exception is thrown (which is reported as an issue).
+ **/
+bool GeoPosFieldSearcher::GeoPosInfo::cmp(const document::StructFieldValue & sfv) const {
+    try {
+        auto xField = sfv.getValue("x");
+        auto yField = sfv.getValue("y");
+        if ( ! (xField && yField)) {
+            return false;
+        }
+        int32_t x = xField->getAsInt();
+        int32_t y = yField->getAsInt();
+        GeoLocation::Point point{x,y};
+        return inside_limit(point) ? true : false;
+    } catch (const vespalib::Exception &e) {
+        vespalib::Issue::report("bad fieldvalue for GeoPosFieldSearcher: %s", e.getMessage().c_str());
+        return false;
+    }
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h
new file mode 100644
index 00000000000..ef1c5b5a1c4
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.h
@@ -0,0 +1,28 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "fieldsearcher.h"
+#include <vespa/searchlib/common/geo_location.h>
+
+namespace vsm {
+
+/**
+ * Field searcher matching struct values with "x" and "y" fields against the
+ * geo locations parsed from the query terms.
+ **/
+class GeoPosFieldSearcher : public FieldSearcher {
+public:
+ GeoPosFieldSearcher(FieldIdT fId=0);
+ ~GeoPosFieldSearcher();
+ // Parses every query term into a geo location; one _geoPosTerm entry per term.
+ void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override;
+ // Non-struct values are ignored (only logged).
+ void onValue(const document::FieldValue & fv) override;
+ // Matches the struct value against all valid geo terms.
+ void onStructValue(const document::StructFieldValue & fv) override;
+ std::unique_ptr<FieldSearcher> duplicate() const override;
+protected:
+ using GeoLocation = search::common::GeoLocation;
+ // A parsed geo location that can be compared with a position struct value.
+ class GeoPosInfo : public GeoLocation {
+ public:
+ GeoPosInfo (GeoLocation loc) noexcept : GeoLocation(std::move(loc)) {}
+ // True when the "x"/"y" position in fv is inside the limit of this location.
+ bool cmp(const document::StructFieldValue & fv) const;
+ };
+ typedef std::vector<GeoPosInfo> GeoPosInfoListT;
+ // Parsed locations, indexed in parallel with the query term list.
+ GeoPosInfoListT _geoPosTerm;
+};
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp
new file mode 100644
index 00000000000..8cfb8e6df14
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp
@@ -0,0 +1,49 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "intfieldsearcher.h"
+
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+
+namespace vsm {
+
+// Returns a copy-constructed clone of this searcher, owned by the caller.
+std::unique_ptr<FieldSearcher>
+IntFieldSearcher::duplicate() const
+{
+    std::unique_ptr<FieldSearcher> clone = std::make_unique<IntFieldSearcher>(*this);
+    return clone;
+}
+
+// Creates a searcher for field fId with an empty list of integer ranges.
+IntFieldSearcher::IntFieldSearcher(FieldIdT fId)
+    : FieldSearcher(fId),
+      _intTerm()
+{
+}
+
+// Defined out of line; the members release their own resources.
+IntFieldSearcher::~IntFieldSearcher() = default;
+
+/**
+ * Prepares the searcher for a new query: runs the base class preparation and
+ * converts every query term into an integer range.
+ *
+ * onValue() pairs _intTerm[j] with _qtl[j], so exactly one entry is stored
+ * per query term. An empty term gets an invalid entry (it can never match)
+ * instead of being skipped, which would shift all following entries onto the
+ * wrong query term.
+ **/
+void IntFieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf)
+{
+    _intTerm.clear();
+    FieldSearcher::prepare(qtl, buf);
+    _intTerm.reserve(qtl.size());
+    for (const QueryTerm * qt : qtl) {
+        // Initialised so that an unparsable term never carries indeterminate bounds.
+        int64_t low = 0;
+        int64_t high = 0;
+        bool valid = (qt->termLen() != 0) && qt->getAsIntegerTerm(low, high);
+        _intTerm.push_back(IntInfo(low, high, valid));
+    }
+}
+
+/**
+ * Compares the value (read as a long) with every valid integer range and
+ * registers a hit (at position 0) on the query term with the same index when
+ * the value lies inside the range. The value always counts as one word.
+ **/
+void IntFieldSearcher::onValue(const document::FieldValue & fv)
+{
+    const size_t numTerms = _intTerm.size();
+    size_t termIdx = 0;
+    while (termIdx < numTerms) {
+        const IntInfo & range = _intTerm[termIdx];
+        // The value is only read for valid ranges, exactly as the short-circuit demands.
+        if (range.valid() && range.cmp(fv.getAsLong())) {
+            addHit(*_qtl[termIdx], 0);
+        }
+        ++termIdx;
+    }
+    ++_words;
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h
new file mode 100644
index 00000000000..a2b17a87f4b
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.h
@@ -0,0 +1,33 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "fieldsearcher.h"
+
+namespace vsm {
+
+/**
+ * Field searcher matching integer field values against the integer ranges
+ * derived from the query terms.
+ **/
+class IntFieldSearcher : public FieldSearcher
+{
+public:
+ std::unique_ptr<FieldSearcher> duplicate() const override;
+ IntFieldSearcher(FieldIdT fId=0);
+ ~IntFieldSearcher();
+ // Converts the query terms into integer ranges stored in _intTerm.
+ void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override;
+ // Registers a hit for every valid range containing the value.
+ void onValue(const document::FieldValue & fv) override;
+protected:
+ // Inclusive integer range [_lower, _upper] plus a flag telling whether the
+ // query term could be interpreted as an integer term.
+ class IntInfo
+ {
+ public:
+ // The bounds are swapped when given in the wrong order, so _lower <= _upper always holds.
+ IntInfo(int64_t low, int64_t high, bool v) : _lower(low), _upper(high), _valid(v) { if (low > high) { _lower = high; _upper = low; } }
+ // True when key lies inside the range, both bounds included.
+ bool cmp(int64_t key) const { return (_lower <= key) && (key <= _upper); }
+ bool valid() const { return _valid; }
+ private:
+ int64_t _lower;
+ int64_t _upper;
+ bool _valid;
+ };
+ typedef std::vector<IntInfo> IntInfoListT;
+ // Ranges used by onValue(), which pairs entry j with query term j.
+ IntInfoListT _intTerm;
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
new file mode 100644
index 00000000000..1c4ff78ff4a
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
@@ -0,0 +1,56 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "strchrfieldsearcher.h"
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+
+namespace vsm {
+
+// No string specific preparation is needed; this simply forwards to the base class.
+void StrChrFieldSearcher::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf)
+{
+ FieldSearcher::prepare(qtl, buf);
+}
+
+/**
+ * Searches a literal (string) field value. The value is treated as a
+ * LiteralFieldValueB without a runtime type check, and at most
+ * maxFieldLength() bytes of it are handed to matchDoc().
+ **/
+void StrChrFieldSearcher::onValue(const document::FieldValue & fv)
+{
+    const auto & literal = static_cast<const document::LiteralFieldValueB &>(fv);
+    vespalib::stringref text = literal.getValueRef();
+    const auto searchLen = std::min(maxFieldLength(), text.size());
+    FieldRef field(text.data(), searchLen);
+    matchDoc(field);
+}
+
+/**
+ * Matches the query terms against one field and adds the number of words
+ * found to _words.
+ *
+ * With more than one term the multi-term matcher is used, otherwise the
+ * single-term matcher is used per term. When the field is shorter than the
+ * (shortest) term nothing can match and the words are only counted.
+ *
+ * @return always true.
+ **/
+bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef)
+{
+    if (_qtl.size() > 1) {
+        const size_t shortest = shortestTerm();
+        if (fieldRef.size() < shortest) {
+            _words += countWords(fieldRef);
+        } else {
+            _words += matchTerms(fieldRef, shortest);
+        }
+        return true;
+    }
+    for (auto termPtr : _qtl) {
+        QueryTerm & term = *termPtr;
+        if (fieldRef.size() < term.termLen()) {
+            _words += countWords(fieldRef);
+        } else {
+            _words += matchTerm(fieldRef, term);
+        }
+    }
+    return true;
+}
+
+/**
+ * Returns the length of the shortest query term.
+ * The term list must not be empty; the first term is read unconditionally.
+ **/
+size_t StrChrFieldSearcher::shortestTerm() const
+{
+    auto it = _qtl.begin();
+    size_t shortest = (*it)->termLen();
+    for (++it; it != _qtl.end(); ++it) {
+        const size_t len = (*it)->termLen();
+        if (len < shortest) {
+            shortest = len;
+        }
+    }
+    return shortest;
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h
new file mode 100644
index 00000000000..0155c79cddf
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.h
@@ -0,0 +1,22 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "fieldsearcher.h"
+
+namespace vsm {
+
+/**
+ * Base class for searchers working on string field values. It hands the
+ * (length limited) field to matchDoc(), which dispatches to the matchTerm()
+ * and matchTerms() strategies implemented by the subclasses.
+ **/
+class StrChrFieldSearcher : public FieldSearcher
+{
+public:
+ StrChrFieldSearcher() : FieldSearcher(0) { }
+ StrChrFieldSearcher(FieldIdT fId) : FieldSearcher(fId) { }
+ void onValue(const document::FieldValue & fv) override;
+ void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override;
+private:
+ // Length of the shortest query term; requires a non-empty term list.
+ size_t shortestTerm() const;
+ // Matches all query terms against the field and updates the word count.
+ bool matchDoc(const FieldRef & field);
+ // Matches one query term against the field; returns the number of words seen.
+ virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) = 0;
+ // Matches all query terms against the field; returns the number of words seen.
+ virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) = 0;
+};
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp
new file mode 100644
index 00000000000..977602a691c
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.cpp
@@ -0,0 +1,33 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "utf8exactstringfieldsearcher.h"
+
+using search::byte;
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+
+namespace vsm {
+
+// Returns a copy-constructed clone of this searcher, owned by the caller.
+std::unique_ptr<FieldSearcher>
+UTF8ExactStringFieldSearcher::duplicate() const
+{
+    std::unique_ptr<FieldSearcher> clone = std::make_unique<UTF8ExactStringFieldSearcher>(*this);
+    return clone;
+}
+
+/**
+ * Runs the exact matcher for every query term against the whole field.
+ * The shortest term length is not needed by this strategy.
+ *
+ * @return always 1; the field is treated as a single word.
+ **/
+size_t
+UTF8ExactStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+{
+    (void) mintsz;
+    for (auto termPtr : _qtl) {
+        matchTermExact(f, *termPtr);
+    }
+    return 1;
+}
+
+// A single term is matched with the exact strategy of the base class.
+size_t
+UTF8ExactStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
+{
+ return matchTermExact(f, qt);
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
new file mode 100644
index 00000000000..744974a6cf6
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8exactstringfieldsearcher.h
@@ -0,0 +1,25 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
+
+namespace vsm
+{
+
+/**
+ * This class does exact utf8 searches: every query term is matched against
+ * the whole field with matchTermExact().
+ **/
+class UTF8ExactStringFieldSearcher : public UTF8StringFieldSearcherBase
+{
+protected:
+ // Both strategies delegate to matchTermExact() of the base class.
+ virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+ virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+
+public:
+ std::unique_ptr<FieldSearcher> duplicate() const override;
+ UTF8ExactStringFieldSearcher() : UTF8StringFieldSearcherBase() { }
+ UTF8ExactStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
new file mode 100644
index 00000000000..9aef99f9fa1
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp
@@ -0,0 +1,69 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "utf8flexiblestringfieldsearcher.h"
+
+#include <vespa/log/log.h>
+LOG_SETUP(".vsm.searcher.utf8flexiblestringfieldsearcher");
+
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+
+namespace vsm {
+
+// Returns a copy-constructed clone of this searcher, owned by the caller.
+std::unique_ptr<FieldSearcher>
+UTF8FlexibleStringFieldSearcher::duplicate() const
+{
+    std::unique_ptr<FieldSearcher> clone = std::make_unique<UTF8FlexibleStringFieldSearcher>(*this);
+    return clone;
+}
+
+/**
+ * Matches every query term against the field using matchTerm(), which picks
+ * the strategy per term. The shortest term length is not used.
+ *
+ * @return the word count reported for the last term (0 without terms).
+ **/
+size_t
+UTF8FlexibleStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+{
+    (void) mintsz;
+    size_t lastWordCount = 0;
+    for (auto termPtr : _qtl) {
+        lastWordCount = matchTerm(f, *termPtr);
+    }
+    return lastWordCount;
+}
+
+/**
+ * Matches one query term against the field, choosing the match strategy.
+ *
+ * The type of the query term takes precedence (prefix, substring, suffix,
+ * exact - checked in that order). A term without such a type falls back to
+ * the mode of the searcher itself (substring, suffix, exact) and finally to
+ * regular/prefix matching.
+ *
+ * @return the value returned by the selected strategy.
+ **/
+size_t
+UTF8FlexibleStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
+{
+    // 1) Strategy requested by the term itself.
+    if (qt.isPrefix()) {
+        LOG(debug, "Use prefix match for prefix term '%s:%s'", qt.index().c_str(), qt.getTerm());
+        return matchTermRegular(f, qt);
+    }
+    if (qt.isSubstring()) {
+        LOG(debug, "Use substring match for substring term '%s:%s'", qt.index().c_str(), qt.getTerm());
+        return matchTermSubstring(f, qt);
+    }
+    if (qt.isSuffix()) {
+        LOG(debug, "Use suffix match for suffix term '%s:%s'", qt.index().c_str(), qt.getTerm());
+        return matchTermSuffix(f, qt);
+    }
+    if (qt.isExactstring()) {
+        LOG(debug, "Use exact match for exact term '%s:%s'", qt.index().c_str(), qt.getTerm());
+        return matchTermExact(f, qt);
+    }
+
+    // 2) Strategy configured on the searcher.
+    if (substring()) {
+        LOG(debug, "Use substring match for term '%s:%s'", qt.index().c_str(), qt.getTerm());
+        return matchTermSubstring(f, qt);
+    }
+    if (suffix()) {
+        LOG(debug, "Use suffix match for term '%s:%s'", qt.index().c_str(), qt.getTerm());
+        return matchTermSuffix(f, qt);
+    }
+    if (exact()) {
+        LOG(debug, "Use exact match for term '%s:%s'", qt.index().c_str(), qt.getTerm());
+        return matchTermExact(f, qt);
+    }
+
+    // 3) Default.
+    LOG(debug, "Use regular/prefix match for term '%s:%s'", qt.index().c_str(), qt.getTerm());
+    return matchTermRegular(f, qt);
+}
+
+// Default construction only default-constructs the base class.
+UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher()
+    : UTF8StringFieldSearcherBase()
+{
+}
+
+// Creates a searcher bound to field fId.
+UTF8FlexibleStringFieldSearcher::UTF8FlexibleStringFieldSearcher(FieldIdT fId)
+    : UTF8StringFieldSearcherBase(fId)
+{
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
new file mode 100644
index 00000000000..63931af0036
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h
@@ -0,0 +1,35 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
+
+namespace vsm
+{
+
+/**
+ * This class does utf8 searches based on the query term type.
+ * It chooses between the regular (including prefix), substring, suffix and
+ * exact search strategies.
+ **/
+class UTF8FlexibleStringFieldSearcher : public UTF8StringFieldSearcherBase
+{
+private:
+ /**
+ * Tries to match the given query term against the content of the given field reference.
+ * Search strategy is chosen based on the query term type.
+ **/
+ virtual size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+
+ /**
+ * Tries to match each query term in the underlying query against the content of the given field reference.
+ * Search strategy is chosen based on the query term type.
+ **/
+ virtual size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+
+public:
+ std::unique_ptr<FieldSearcher> duplicate() const override;
+ UTF8FlexibleStringFieldSearcher();
+ UTF8FlexibleStringFieldSearcher(FieldIdT fId);
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
new file mode 100644
index 00000000000..0d93009655c
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.cpp
@@ -0,0 +1,56 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "utf8strchrfieldsearcher.h"
+
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+using search::byte;
+
+namespace vsm {
+
+// Returns a copy-constructed clone of this searcher, owned by the caller.
+std::unique_ptr<FieldSearcher>
+UTF8StrChrFieldSearcher::duplicate() const
+{
+    std::unique_ptr<FieldSearcher> clone = std::make_unique<UTF8StrChrFieldSearcher>(*this);
+    return clone;
+}
+
+/**
+ * Tokenizes the field word by word into the shared scratch buffer and
+ * compares every word with every query term.
+ *
+ * A term hits a word when the term is a prefix of the word and either prefix
+ * matching is enabled (on the searcher or the term) or the lengths are equal.
+ * Hits are registered with the index of the word. The shortest term length is
+ * not used by this strategy.
+ *
+ * @return the number of words tokenized from the field.
+ **/
+size_t
+UTF8StrChrFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+{
+    (void) mintsz;
+    const byte * cursor = reinterpret_cast<const byte *> (f.data());
+    const byte * const fieldEnd = cursor + f.size();
+    if (f.size() >= _buf->size()) {
+        _buf->reserve(f.size() + 1);
+    }
+    cmptype_t * const word = &(*_buf.get())[0];
+    size_t wordLen(0);
+    termcount_t words(0);
+
+    while (cursor < fieldEnd) {
+        // The tokenizer stops at a zero byte; step over it and count it.
+        if (*cursor == 0) {
+            _zeroCount++;
+            cursor++;
+        }
+        cursor = tokenize(cursor, _buf->capacity(), word, wordLen);
+        for (auto termPtr : _qtl) {
+            QueryTerm & qt = *termPtr;
+            const cmptype_t * term;
+            termsize_t tsz = qt.term(term);
+            const bool lengthOk = (tsz <= wordLen) && (prefix() || qt.isPrefix() || (tsz == wordLen));
+            if ( ! lengthOk) {
+                continue;
+            }
+            termsize_t matched = 0;
+            while ((matched < tsz) && (term[matched] == word[matched])) {
+                ++matched;
+            }
+            if (matched == tsz) {
+                addHit(qt, words);
+            }
+        }
+        words++;
+    }
+    NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
+    return words;
+}
+
+// A single term is matched with the regular (word/prefix) strategy of the base class.
+size_t
+UTF8StrChrFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
+{
+ return matchTermRegular(f, qt);
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h
new file mode 100644
index 00000000000..1687a1a18c0
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8strchrfieldsearcher.h
@@ -0,0 +1,25 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "utf8stringfieldsearcherbase.h"
+
+namespace vsm {
+
+/**
+ * This class does normal utf8 searches.
+ * This class uses a highly optimized version of the tokenize method in fastlib.
+ **/
+class UTF8StrChrFieldSearcher : public UTF8StringFieldSearcherBase
+{
+public:
+ std::unique_ptr<FieldSearcher> duplicate() const override;
+ UTF8StrChrFieldSearcher() : UTF8StringFieldSearcherBase() { }
+ UTF8StrChrFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
+
+protected:
+ // Matches one term word by word (regular/prefix matching).
+ size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+ // Matches all terms word by word in a single pass over the field.
+ size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
new file mode 100644
index 00000000000..148cdf2c0c3
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.cpp
@@ -0,0 +1,320 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "utf8stringfieldsearcherbase.h"
+#include <cassert>
+
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+using search::byte;
+
+namespace vsm {
+
+/**
+ * Extracts the next word from the utf8 input.
+ *
+ * Leading non-word characters are skipped, then word characters are folded
+ * and copied to dstbuf until a non-word character, a zero byte or the end
+ * limit is reached. The output is terminated with a 0 character.
+ *
+ * @param p        start of the utf8 input.
+ * @param maxSz    input limit; scanning stops at p + maxSz - 1 (p when maxSz is 0).
+ * @param dstbuf   receives the folded ucs4 characters of the word.
+ * @param tokenlen set to the number of characters written (terminator excluded).
+ * @return the input position where scanning stopped. A zero byte is not
+ *         consumed; the callers in this file step over it themselves.
+ **/
+const byte *
+UTF8StringFieldSearcherBase::tokenize(const byte * p, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen)
+{
+ if (maxSz > 0) {
+ maxSz--;
+ }
+ ucs4_t c(*p);
+ ucs4_t *q(dstbuf);
+ const byte * end(p+maxSz);
+
+ // Skip non-word characters between words
+ for (; p < end; ) {
+ if (c < 128) {
+ if (!c) { break; }
+ p++;
+ if (__builtin_expect(_isWord[c], false)) {
+ // First word character found: store it. Setting c to 0 makes the
+ // zero test above end this loop on the next iteration.
+ *q++ = _foldCase[c];
+ c = 0;
+ } else {
+ c = *p;
+ }
+ } else {
+ // Non-ascii: decode one utf8 character; p is advanced past it.
+ const byte * oldP(p);
+ c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
+ if (Fast_UnicodeUtil::IsWordChar(c)) {
+ _utf8Count[p-oldP-1]++;
+ // A replacement string, when one exists, is copied instead of the folded character.
+ const char *repl = ReplacementString(c);
+ if (repl != NULL) {
+ size_t repllen = strlen(repl);
+ if (repllen > 0) {
+ q = Fast_UnicodeUtil::ucs4copy(q,repl);
+ }
+ } else {
+ c = ToFold(c);
+ *q++ = c;
+ }
+ break;
+ } else {
+ if (c == _BadUTF8Char) {
+ _badUtf8Count++;
+ } else {
+ _utf8Count[p-oldP-1]++;
+ }
+ c = *p;
+ }
+ }
+ }
+
+ // Copy the rest of the word until the first non-word character.
+ c = *p; // Next char
+ for (; p < end;) {
+ if (c < 128) { // Common case, ASCII
+ if (!c) { break; }
+ p++;
+ if (__builtin_expect(!_isWord[c], false)) {
+ // Non-word character consumed: c = 0 ends the loop on the next iteration.
+ c = 0;
+ } else {
+ *q++ = _foldCase[c];
+ c = *p;
+ }
+ } else {
+ const byte * oldP(p);
+ c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
+ if (__builtin_expect(Fast_UnicodeUtil::IsWordChar(c), false)) {
+ _utf8Count[p-oldP-1]++;
+ const char *repl = ReplacementString(c);
+ if (repl != NULL) {
+ size_t repllen = strlen(repl);
+ if (repllen > 0) {
+ q = Fast_UnicodeUtil::ucs4copy(q,repl);
+ }
+ } else {
+ c = ToFold(c);
+ *q++ = c;
+ }
+
+ c = *p;
+ } else {
+ // Non-word (or bad) utf8 character ends the word; it has already been consumed.
+ if (c == _BadUTF8Char) {
+ _badUtf8Count++;
+ } else {
+ _utf8Count[p-oldP-1]++;
+ }
+ break;
+ }
+ }
+ }
+ // Terminate the token; the terminator is not included in tokenlen.
+ *q = 0;
+ tokenlen = q - dstbuf;
+ return p;
+}
+
+/**
+ * Tokenizes the field word by word into the shared scratch buffer and
+ * compares each word with the query term.
+ *
+ * The term hits a word when it is a prefix of the word and either prefix
+ * matching is enabled (on the searcher or the term) or the lengths are equal.
+ * Hits are registered with the index of the word.
+ *
+ * @param f  the field reference to match against.
+ * @param qt the query term trying to match.
+ * @return the number of words tokenized from the field.
+ **/
+size_t
+UTF8StringFieldSearcherBase::matchTermRegular(const FieldRef & f, QueryTerm & qt)
+{
+    const cmptype_t * term;
+    termsize_t tsz = qt.term(term);
+    const byte * cursor = reinterpret_cast<const byte *> (f.data());
+    const byte * const fieldEnd = cursor + f.size();
+    if (f.size() >= _buf->size()) {
+        _buf->reserve(f.size() + 1);
+    }
+    cmptype_t * const word = &(*_buf.get())[0];
+    size_t wordLen(0);
+    termcount_t words(0);
+
+    while (cursor < fieldEnd) {
+        // The tokenizer stops at a zero byte; step over it and count it.
+        if (*cursor == 0) {
+            _zeroCount++;
+            cursor++;
+        }
+        cursor = tokenize(cursor, _buf->capacity(), word, wordLen);
+        const bool lengthOk = (tsz <= wordLen) && (prefix() || qt.isPrefix() || (tsz == wordLen));
+        if (lengthOk) {
+            termsize_t matched = 0;
+            while ((matched < tsz) && (term[matched] == word[matched])) {
+                ++matched;
+            }
+            if (matched == tsz) {
+                addHit(qt, words);
+            }
+        }
+        words++;
+    }
+    NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
+    return words;
+}
+
+/**
+ * Compares the query term with the whole field, character by character,
+ * folding the field characters on the fly.
+ *
+ * A hit (at position 0) is registered when the complete term matches from
+ * the start of the field and either the field is exhausted as well or the
+ * term is a prefix term.
+ *
+ * @param f  the field reference to match against.
+ * @param qt the query term trying to match.
+ * @return always 1; the field is treated as a single word.
+ **/
+size_t
+UTF8StringFieldSearcherBase::matchTermExact(const FieldRef & f, QueryTerm & qt)
+{
+    const byte * fieldPos = reinterpret_cast<const byte *> (f.data());
+    const byte * const fieldEnd = fieldPos + f.size();
+    const cmptype_t * term;
+    termsize_t tsz = qt.term(term);
+    const cmptype_t * const termEnd = term+tsz;
+    if (tsz <= f.size()) {
+        bool equal(true);
+        while (equal && (fieldPos < fieldEnd) && (term < termEnd)) {
+            if (*term < 0x80) {
+                equal = (*term == _foldCase[*fieldPos++]);
+            } else {
+                // Decoding advances fieldPos past the utf8 character.
+                cmptype_t folded = ToFold(Fast_UnicodeUtil::GetUTF8CharNonAscii(fieldPos));
+                equal = (*term == folded);
+            }
+            term++;
+        }
+        const bool termConsumed = equal && (term == termEnd);
+        if (termConsumed && (qt.isPrefix() || (fieldPos == fieldEnd))) {
+            addHit(qt,0);
+        }
+    }
+    NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
+    return 1;
+}
+
+/**
+ * Searches for the query term as a substring of the field.
+ *
+ * The field is first converted to folded ucs4 characters (separator
+ * characters removed) in the shared scratch buffer. The term is then compared
+ * at successive positions; after a match the search continues behind the
+ * matched characters. The word counter is advanced whenever a non-word
+ * character is passed, and hits are registered with the current word count.
+ *
+ * @param f  the field reference to match against.
+ * @param qt the query term trying to match.
+ * @return 0 for an empty term, otherwise the number of words counted plus
+ *         one for the last word.
+ **/
+size_t
+UTF8StringFieldSearcherBase::matchTermSubstring(const FieldRef & f, QueryTerm & qt)
+{
+    if (qt.termLen() == 0) { return 0; }
+    const byte * src = reinterpret_cast<const byte *> (f.data());
+    const cmptype_t * term;
+    termsize_t tsz = qt.term(term);
+    if (f.size() >= _buf->size()) {
+        _buf->reserve(f.size() + 1);
+    }
+    cmptype_t * const folded = &(*_buf.get())[0];
+    BufferWrapper wrapper(folded);
+    const size_t foldedLen = skipSeparators(src, f.size(), wrapper);
+    const cmptype_t * cursor(folded);
+    const cmptype_t * const foldedEnd = cursor + foldedLen;
+    // Last position where the complete term still fits.
+    const cmptype_t * const lastStart = foldedEnd - tsz;
+    termcount_t words(0);
+    while (cursor <= lastStart) {
+        termsize_t matched = 0;
+        while ((matched < tsz) && (term[matched] == cursor[matched])) {
+            ++matched;
+        }
+        if (matched == tsz) {
+            cursor += tsz;
+            addHit(qt, words);
+        } else if ( ! Fast_UnicodeUtil::IsWordChar(*cursor++)) {
+            // Passed a word boundary; skip the remaining non-word characters.
+            words++;
+            while ((cursor < lastStart) && ! Fast_UnicodeUtil::IsWordChar(*cursor)) {
+                cursor++;
+            }
+        }
+    }
+    NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
+    return words + 1; // we must also count the last word
+}
+
+/**
+ * Tokenizes the field word by word into the shared scratch buffer and
+ * registers a hit (with the index of the word) for every word that ends with
+ * the query term.
+ *
+ * @param f  the field reference to match against.
+ * @param qt the query term trying to match.
+ * @return the number of words tokenized from the field.
+ **/
+size_t
+UTF8StringFieldSearcherBase::matchTermSuffix(const FieldRef & f, QueryTerm & qt)
+{
+    const cmptype_t * term;
+    termsize_t tsz = qt.term(term);
+    const byte * cursor = reinterpret_cast<const byte *> (f.data());
+    const byte * const fieldEnd = cursor + f.size();
+    if (f.size() >= _buf->size()) {
+        _buf->reserve(f.size() + 1);
+    }
+    cmptype_t * const word = &(*_buf.get())[0];
+    size_t wordLen = 0;
+    termcount_t words = 0;
+
+    while (cursor < fieldEnd) {
+        // The tokenizer stops at a zero byte; step over it and count it.
+        if ( ! *cursor) {
+            ++_zeroCount;
+            ++cursor;
+        }
+        cursor = tokenize(cursor, _buf->capacity(), word, wordLen);
+        const bool hit = matchTermSuffix(term, tsz, word, wordLen);
+        if (hit) {
+            addHit(qt, words);
+        }
+        words++;
+    }
+    return words;
+}
+
+// Default construction: all three base classes are default constructed.
+UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase()
+    : StrChrFieldSearcher(),
+      Fast_NormalizeWordFolder(),
+      Fast_UnicodeUtil()
+{ }
+
+// Creates a searcher bound to field fId; the folding bases are default constructed.
+UTF8StringFieldSearcherBase::UTF8StringFieldSearcherBase(FieldIdT fId)
+    : StrChrFieldSearcher(fId),
+      Fast_NormalizeWordFolder(),
+      Fast_UnicodeUtil()
+{ }
+
+// Nothing to release beyond what the members clean up themselves.
+UTF8StringFieldSearcherBase::~UTF8StringFieldSearcherBase() = default;
+
+// Runs the base class preparation and keeps a reference to the shared scratch
+// buffer that the match functions tokenize into.
+void
+UTF8StringFieldSearcherBase::prepare(QueryTermList & qtl, const SharedSearcherBuf & buf)
+{
+ StrChrFieldSearcher::prepare(qtl, buf);
+ _buf = buf;
+}
+
+/**
+ * Tells whether the word ends with the term.
+ * The comparison runs back to front and stops at the first difference.
+ *
+ * @return true when term is a suffix of word (an empty term always is),
+ *         false when the term is longer than the word or a character differs.
+ **/
+bool
+UTF8StringFieldSearcherBase::matchTermSuffix(const cmptype_t * term, size_t termlen,
+                                             const cmptype_t * word, size_t wordlen)
+{
+    if (termlen > wordlen) {
+        return false;
+    }
+    // Part of the word that lines up with the term.
+    const cmptype_t * const wordTail = word + (wordlen - termlen);
+    for (size_t remaining = termlen; remaining > 0; --remaining) {
+        if (term[remaining - 1] != wordTail[remaining - 1]) {
+            return false;
+        }
+    }
+    return true;
+}
+
+// Separators are the characters below 0x20, except newline and tab.
+bool
+UTF8StringFieldSearcherBase::isSeparatorCharacter(ucs4_t c)
+{
+    if (c >= 0x20) {
+        return false;
+    }
+    return ! ((c == '\n') || (c == '\t'));
+}
+
+/**
+ * Converts sz bytes of utf8 input into folded ucs4 characters delivered to
+ * dstbuf, dropping ascii separator characters (see isSeparatorCharacter()).
+ * Each delivered character is accompanied by the byte offset, relative to
+ * the start of the input, of the utf8 character it originates from.
+ *
+ * @param p      start of the utf8 input.
+ * @param sz     number of input bytes.
+ * @param dstbuf wrapper receiving characters (and offsets, if it tracks them).
+ * @return the number of characters in dstbuf, as reported by dstbuf.size().
+ **/
+template <typename T>
+size_t
+UTF8StringFieldSearcherBase::skipSeparators(const search::byte * p, size_t sz, T & dstbuf) {
+ const search::byte * e(p+sz);
+ const search::byte * b(p);
+
+ for(; p < e; ) {
+ ucs4_t c(*p);
+ const search::byte * oldP(p);
+ if (c < 128) {
+ // ascii: fold and keep unless it is a separator character
+ p++;
+ if (!isSeparatorCharacter(c)) {
+ dstbuf.onCharacter(_foldCase[c], (oldP - b));
+ }
+ } else {
+ // Non-ascii: decode one utf8 character; p is advanced past it.
+ c = Fast_UnicodeUtil::GetUTF8CharNonAscii(p);
+ const char *repl = ReplacementString(c);
+ if (repl != NULL) {
+ size_t repllen = strlen(repl);
+ if (repllen > 0) {
+ // The replacement is copied straight into the ucs4 buffer; the
+ // wrapper is then advanced, recording one offset per character
+ // when it tracks offsets.
+ ucs4_t * buf = dstbuf.getBuf();
+ ucs4_t * newBuf = Fast_UnicodeUtil::ucs4copy(buf, repl);
+ if (dstbuf.hasOffsets()) {
+ for (; buf < newBuf; ++buf) {
+ dstbuf.incBuf(1);
+ dstbuf.onOffset(oldP - b);
+ }
+ } else {
+ dstbuf.incBuf(newBuf - buf);
+ }
+ }
+ } else {
+ c = ToFold(c);
+ dstbuf.onCharacter(c, (oldP - b));
+ }
+ // Statistics. NOTE(review): without a replacement string c has been
+ // folded before this test - confirm that is intended.
+ if (c == _BadUTF8Char) {
+ _badUtf8Count++;
+ } else {
+ _utf8Count[p-oldP-1]++;
+ }
+ }
+ }
+ assert(dstbuf.valid());
+ return dstbuf.size();
+}
+
+// Explicit instantiations for the two buffer wrappers. They are spelled with
+// the types used in the declaration (size_t, search::byte) so that they also
+// match on platforms where size_t is not unsigned long.
+template size_t UTF8StringFieldSearcherBase::skipSeparators<UTF8StringFieldSearcherBase::BufferWrapper>(const search::byte *, size_t, UTF8StringFieldSearcherBase::BufferWrapper &);
+template size_t UTF8StringFieldSearcherBase::skipSeparators<UTF8StringFieldSearcherBase::OffsetWrapper>(const search::byte *, size_t, UTF8StringFieldSearcherBase::OffsetWrapper &);
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
new file mode 100644
index 00000000000..f540a7ac457
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8stringfieldsearcherbase.h
@@ -0,0 +1,138 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "strchrfieldsearcher.h"
+#include <vespa/fastlib/text/normwordfolder.h>
+
+namespace vsm {
+
+/**
+ * This class is the base class for all utf8 string searchers.
+ * It contains utility functions used by the other searchers.
+ * As usual, the prepare method is called after the query is built, and it is
+ * handed a SharedSearcherBuf. This is a
+ * buffer that is shared among all searchers that are run in the same context.
+ * Reuse of this buffer ensures better cache hit ratio because this is just a
+ * scratchpad for tokenizing. It will grow till the max size and stay there.
+ **/
+class UTF8StringFieldSearcherBase : public StrChrFieldSearcher, protected Fast_NormalizeWordFolder, public Fast_UnicodeUtil
+{
+public:
+ /**
+ * Helper class that wraps an ucs4 buffer.
+ * Used as the template argument of skipSeparators() during substring matching.
+ * Offsets passed to it are ignored.
+ **/
+ class BufferWrapper
+ {
+ protected:
+ ucs4_t * _bbuf; // start of the buffer
+ ucs4_t * _cbuf; // current write position
+
+ public:
+ BufferWrapper(ucs4_t * buf) : _bbuf(buf), _cbuf(buf) { }
+ BufferWrapper(ucs4_t * buf, size_t *) : _bbuf(buf), _cbuf(buf) { }
+ void onCharacter(ucs4_t ch, size_t) { *_cbuf++ = ch; }
+ void onOffset(size_t) { }
+ void incBuf(size_t inc) { _cbuf += inc; }
+ ucs4_t * getBuf() { return _cbuf; }
+ bool valid() { return true; }
+ size_t size() { return (_cbuf - _bbuf); }
+ bool hasOffsets() { return false; }
+ };
+
+ /**
+ * Helper class that wraps an offset buffer in addition to an ucs4 buffer.
+ * The offset buffer contains offsets into the original utf8 buffer.
+ **/
+ class OffsetWrapper : public BufferWrapper
+ {
+ private:
+ size_t * _boff; // start of the offset buffer
+ size_t * _coff; // current write position in the offset buffer
+
+ public:
+ OffsetWrapper(ucs4_t * buf, size_t * offsets) : BufferWrapper(buf), _boff(offsets), _coff(offsets) {}
+ void onCharacter(ucs4_t ch, size_t of) { *_cbuf++ = ch; *_coff++ = of; }
+ void onOffset(size_t of) { *_coff++ = of; }
+ // Valid when exactly one offset has been recorded per character.
+ bool valid() { return (size() == (size_t)(_coff - _boff)); }
+ bool hasOffsets() { return true; }
+ };
+
+protected:
+ // Shared scratch buffer set by prepare(); the match functions tokenize into it.
+ SharedSearcherBuf _buf;
+
+ /**
+ * Extracts the next word from the utf8 input into dstbuf (folded, zero
+ * terminated) and reports its length in tokenlen.
+ *
+ * @return the input position where scanning stopped.
+ **/
+ const search::byte * tokenize(const search::byte * buf, size_t maxSz, cmptype_t * dstbuf, size_t & tokenlen);
+
+ /**
+ * Matches the given query term against the words in the given field reference
+ * using exact or prefix match strategy.
+ *
+ * @param f the field reference to match against.
+ * @param qt the query term trying to match.
+ * @return the number of words in the field ref.
+ **/
+ size_t matchTermRegular(const FieldRef & f, search::streaming::QueryTerm & qt);
+
+ /**
+ * Matches the given query term against the characters in the given field reference
+ * using substring match strategy.
+ *
+ * @param f the field reference to match against.
+ * @param qt the query term trying to match.
+ * @return the number of words in the field ref (0 for an empty query term).
+ **/
+ size_t matchTermSubstring(const FieldRef & f, search::streaming::QueryTerm & qt);
+
+ /**
+ * Matches the given query term against the words in the given field reference
+ * using suffix match strategy.
+ *
+ * @param f the field reference to match against.
+ * @param qt the query term trying to match.
+ * @return the number of words in the field ref.
+ **/
+ size_t matchTermSuffix(const FieldRef & f, search::streaming::QueryTerm & qt);
+
+ /**
+ * Matches the given query term against the complete content of the given
+ * field reference using exact match strategy.
+ *
+ * @param f the field reference to match against.
+ * @param qt the query term trying to match.
+ * @return always 1; the field is treated as a single word.
+ **/
+ size_t matchTermExact(const FieldRef & f, search::streaming::QueryTerm & qt);
+
+public:
+ UTF8StringFieldSearcherBase();
+ UTF8StringFieldSearcherBase(FieldIdT fId);
+ ~UTF8StringFieldSearcherBase();
+ void prepare(search::streaming::QueryTermList & qtl, const SharedSearcherBuf & buf) override;
+ /**
+ * Matches the given query term against the given word using suffix match strategy.
+ *
+ * @param term the buffer with the term.
+ * @param termlen the length of the term.
+ * @param word the buffer with the word.
+ * @param wordlen the length of the word.
+ * @return true if the term matches the word.
+ **/
+ static bool matchTermSuffix(const cmptype_t * term, size_t termlen,
+ const cmptype_t * word, size_t wordlen);
+
+ /**
+ * Checks whether the given character is a separator character.
+ **/
+ static bool isSeparatorCharacter(ucs4_t);
+
+ /**
+ * Transforms the given utf8 array into an array of ucs4 characters.
+ * Folding is performed. Separator characters are skipped.
+ *
+ * @param p the utf8 input.
+ * @param sz the number of input bytes.
+ * @param dstbuf wrapper receiving the characters (BufferWrapper or OffsetWrapper).
+ * @return the number of characters in dstbuf.
+ **/
+ template <typename T>
+ size_t skipSeparators(const search::byte * p, size_t sz, T & dstbuf);
+
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
new file mode 100644
index 00000000000..fd327d3a3df
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.cpp
@@ -0,0 +1,59 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vsm/searcher/utf8substringsearcher.h>
+
+using search::byte;
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+
+namespace vsm {
+
+/**
+ * Creates a copy of this searcher using the copy constructor.
+ *
+ * @return the copy, owned by the caller.
+ **/
+std::unique_ptr<FieldSearcher>
+UTF8SubStringFieldSearcher::duplicate() const
+{
+    std::unique_ptr<FieldSearcher> copy = std::make_unique<UTF8SubStringFieldSearcher>(*this);
+    return copy;
+}
+
+/**
+ * Matches all prepared query terms against the given field reference
+ * using substring match strategy.
+ *
+ * The field is converted into the shared ucs4 buffer by skipSeparators().
+ * Every position that leaves room for at least the shortest term is tried as
+ * a match start for every query term, and a hit is added for each match.
+ *
+ * @param f the field reference to match against.
+ * @param mintsz the length of the shortest query term.
+ * @return the number of words counted in the field ref.
+ **/
+size_t
+UTF8SubStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+{
+    const byte * n = reinterpret_cast<const byte *> (f.data());
+    if ( f.size() >= _buf->size()) {
+        // resize (not reserve) so that the elements written below lie inside size().
+        _buf->resize(f.size() + 1);
+    }
+    cmptype_t * fntemp = &(*_buf.get())[0];
+    BufferWrapper wrapper(fntemp);
+    size_t fl = skipSeparators(n, f.size(), wrapper);
+    const cmptype_t * fn(fntemp);
+    const cmptype_t * fe = fn + fl;
+    termcount_t words(0);
+    // Only form the "last possible start" pointer when the field is long enough;
+    // otherwise it would point before the start of the buffer.
+    if (fl >= mintsz) {
+        const cmptype_t * fre = fe - mintsz;
+        while (fn <= fre) {
+            for (auto it = _qtl.begin(), mt = _qtl.end(); it != mt; ++it) {
+                QueryTerm & qt = **it;
+                const cmptype_t * term;
+                termsize_t tsz = qt.term(term);
+
+                const cmptype_t *tt=term, *et=term+tsz, *fnt=fn;
+                // Terms longer than the shortest one must not be compared against
+                // buffer content beyond the converted field (fe).
+                for (; (tt < et) && (fnt < fe) && (*tt == *fnt); tt++, fnt++);
+                if (tt == et) {
+                    addHit(qt, words);
+                }
+            }
+            if ( ! Fast_UnicodeUtil::IsWordChar(*fn++) ) {
+                words++;
+                for(; (fn < fre) && ! Fast_UnicodeUtil::IsWordChar(*fn); fn++ );
+            }
+        }
+    }
+
+    NEED_CHAR_STAT(addAnyUtf8Field(f.size()));
+    return words + 1; // we must also count the last word
+}
+
+/**
+ * Matches a single query term against the field reference by delegating
+ * to matchTermSubstring().
+ *
+ * @param f the field reference to match against.
+ * @param qt the query term trying to match.
+ * @return the value returned by matchTermSubstring().
+ **/
+size_t
+UTF8SubStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
+{
+    const size_t result = matchTermSubstring(f, qt);
+    return result;
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h
new file mode 100644
index 00000000000..1c463c28847
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsearcher.h
@@ -0,0 +1,23 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/vsm/searcher/utf8strchrfieldsearcher.h>
+
+namespace vsm {
+
+/**
+ * Field searcher doing substring searches in utf8 string fields.
+ **/
+class UTF8SubStringFieldSearcher : public UTF8StringFieldSearcherBase
+{
+public:
+    UTF8SubStringFieldSearcher() : UTF8StringFieldSearcherBase() { }
+    UTF8SubStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
+    /** Returns a copy of this searcher. */
+    std::unique_ptr<FieldSearcher> duplicate() const override;
+protected:
+    /** Matches all prepared query terms against the field reference. */
+    size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+    /** Matches a single query term against the field reference. */
+    size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
new file mode 100644
index 00000000000..be02a58cfda
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.cpp
@@ -0,0 +1,144 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "utf8substringsnippetmodifier.h"
+#include <cassert>
+
+using search::byte;
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+
+namespace vsm {
+
+/**
+ * Creates a copy of this snippet modifier using the copy constructor.
+ *
+ * @return the copy, owned by the caller.
+ **/
+std::unique_ptr<FieldSearcher>
+UTF8SubstringSnippetModifier::duplicate() const
+{
+    std::unique_ptr<FieldSearcher> copy = std::make_unique<UTF8SubstringSnippetModifier>(*this);
+    return copy;
+}
+
+/**
+ * Matches all prepared query terms against the field reference using substring
+ * match strategy, and builds the modified buffer while doing so.
+ *
+ * The modified buffer receives the bytes of the field reference with a unit
+ * separator inserted before and after each match that does not start before
+ * the current read position. A hit is added for every match.
+ *
+ * @param f the field reference to match against.
+ * @param mintsz the length of the shortest query term.
+ * @return the number of words counted in the field ref.
+ **/
+size_t
+UTF8SubstringSnippetModifier::matchTerms(const FieldRef & f, const size_t mintsz)
+{
+    _modified->reset();
+    _readPtr = f.data();
+    const byte * src = reinterpret_cast<const byte *> (f.data());
+    // resize ucs4 buffer
+    if (f.size() >= _buf->size()) {
+        _buf->resize(f.size() + 1);
+    }
+    // resize offset buffers
+    if (f.size() >= _offsets->size()) {
+        _offsets->resize(f.size() + 1);
+    }
+    // resize modified buffer
+    if (f.size() + 16 > _modified->getLength()) {
+        _modified->resize(f.size() + 16); // make room for some unit separators
+    }
+    cmptype_t * dbegin = &(*_buf.get())[0];
+    OffsetWrapper wrapper(dbegin, &(*_offsets)[0]);
+    size_t numchars = skipSeparators(src, f.size(), wrapper);
+    const cmptype_t * ditr = dbegin;
+    const cmptype_t * dend = ditr + numchars;
+    termcount_t words = 0;
+    // Only form the "last possible start" pointer when the field is long enough;
+    // otherwise it would point before the start of the buffer.
+    if (numchars >= mintsz) {
+        const cmptype_t * drend = dend - mintsz;
+        while (ditr <= drend) {
+            for (auto itr = _qtl.begin(); itr != _qtl.end(); ++itr) {
+                QueryTerm & qt = **itr;
+                const cmptype_t * term;
+                termsize_t tsz = qt.term(term);
+
+                const cmptype_t * titr = term;
+                const cmptype_t * tend = term + tsz;
+                const cmptype_t * dtmp = ditr;
+                // Terms longer than the shortest one must not be compared against
+                // buffer content beyond the converted field (dend).
+                for (; (titr < tend) && (dtmp < dend) && (*titr == *dtmp); ++titr, ++dtmp);
+                if (titr == tend) {
+                    const char * mbegin = f.data() + (*_offsets)[ditr - dbegin];
+                    const char * mend = f.data() + ((dtmp < dend) ? ((*_offsets)[dtmp - dbegin]) : f.size());
+                    if (_readPtr <= mbegin) {
+                        // We will only copy from the field ref once.
+                        // If we have overlapping matches only the first one will be considered.
+                        insertSeparators(mbegin, mend);
+                    }
+                    addHit(qt, words);
+                }
+            }
+            if ( ! Fast_UnicodeUtil::IsWordChar(*ditr++) ) {
+                words++;
+                for(; (ditr < drend) && ! Fast_UnicodeUtil::IsWordChar(*ditr) ; ++ditr );
+            }
+        }
+    }
+    assert(_readPtr <= (f.data() + f.size()));
+    // copy remaining
+    size_t toCopy = f.size() - (_readPtr - f.data());
+    copyToModified(toCopy);
+
+    return words + 1; // we must also count the last word
+}
+
+/**
+ * Matches using the length of the given query term as the shortest term length.
+ * The actual work is delegated to matchTerms().
+ *
+ * @param f the field reference to match against.
+ * @param qt the query term whose length is used.
+ * @return the value returned by matchTerms().
+ **/
+size_t
+UTF8SubstringSnippetModifier::matchTerm(const FieldRef & f, QueryTerm & qt)
+{
+    const cmptype_t * unusedTerm;
+    termsize_t termLength = qt.term(unusedTerm);
+    return matchTerms(f, termLength);
+}
+
+/**
+ * Copies n bytes from the read position to the modified buffer and advances
+ * the read position by n. When skipSep is set, bytes for which
+ * isSeparatorCharacter() is true are not written.
+ **/
+void
+UTF8SubstringSnippetModifier::copyToModified(size_t n, bool skipSep)
+{
+    if (n == 0) {
+        return;
+    }
+    if (!skipSep) {
+        _modified->put(_readPtr, n);
+        _readPtr += n;
+        return;
+    }
+    const char * const stop = _readPtr + n;
+    while (_readPtr < stop) {
+        if (!isSeparatorCharacter(*_readPtr)) {
+            _modified->put(*_readPtr);
+        }
+        ++_readPtr;
+    }
+}
+
+/**
+ * Copies everything up to mbegin unchanged, then writes the match
+ * [mbegin, mend) surrounded by unit separators.
+ **/
+void
+UTF8SubstringSnippetModifier::insertSeparators(const char * mbegin, const char * mend)
+{
+    // bytes between the read position and the start of the match
+    copyToModified(mbegin - _readPtr);
+    _modified->put(_unitSep);
+    // separators inside the match are skipped so that the match is not split
+    const bool skipSeparatorsInMatch = true;
+    copyToModified((mend - mbegin), skipSeparatorsInMatch);
+    _modified->put(_unitSep);
+}
+
+// Default constructor: allocates its own modified buffer and offset buffer
+// (32 entries each) and uses 0x1F as unit separator.
+UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier()
+    : UTF8StringFieldSearcherBase(),
+      _modified(new CharBuffer(32)),
+      _offsets(std::make_shared<std::vector<size_t>>(32)),
+      _readPtr(nullptr),
+      _unitSep('\x1F')
+{
+}
+
+// Constructor for a given field id: allocates its own modified buffer and
+// offset buffer (32 entries each) and uses 0x1F as unit separator.
+UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId)
+    : UTF8StringFieldSearcherBase(fId),
+      _modified(new CharBuffer(32)),
+      _offsets(std::make_shared<std::vector<size_t>>(32)),
+      _readPtr(nullptr),
+      _unitSep('\x1F')
+{
+}
+
+// Constructor for a given field id that uses the supplied (shared) modified
+// buffer and offset buffer instead of allocating its own.
+UTF8SubstringSnippetModifier::UTF8SubstringSnippetModifier(FieldIdT fId,
+                                                           const CharBuffer::SP & modBuf,
+                                                           const SharedOffsetBuffer & offBuf)
+    : UTF8StringFieldSearcherBase(fId),
+      _modified(modBuf),
+      _offsets(offBuf),
+      _readPtr(nullptr),
+      _unitSep('\x1F')
+{
+}
+
+// Nothing to release explicitly; members clean up after themselves.
+UTF8SubstringSnippetModifier::~UTF8SubstringSnippetModifier() = default;
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h
new file mode 100644
index 00000000000..0127a7f2827
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8substringsnippetmodifier.h
@@ -0,0 +1,72 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "utf8stringfieldsearcherbase.h"
+#include <vespa/vsm/common/charbuffer.h>
+
+namespace vsm {
+
+using SharedOffsetBuffer = std::shared_ptr<std::vector<size_t> >;
+
+/**
+ * Substring searcher that, while matching the query term(s) against the field
+ * reference, also builds a modified copy of the field reference. The only
+ * difference between the copy and the original is that a unit separator is
+ * inserted before and after each match, which makes it possible to highlight
+ * a substring match when snippets are generated later.
+ * Matching is done the same way as in UTF8SubStringFieldSearcher.
+ **/
+class UTF8SubstringSnippetModifier : public UTF8StringFieldSearcherBase
+{
+private:
+    CharBuffer::SP       _modified; // buffer receiving the modified field value
+    SharedOffsetBuffer   _offsets;  // for each character in _buf: offset into the utf8 buffer (field reference)
+    const char         * _readPtr;  // current read position in the field reference
+    char                 _unitSep;  // the unit separator character to insert
+
+    size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+    size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+
+    /**
+     * Copies n bytes from the field reference to the modified buffer and advances the read pointer.
+     * Separator characters in the field reference can optionally be left out,
+     * so that a match is not split by separator characters from the original field reference.
+     *
+     * @param n the number of bytes to copy.
+     * @param skipSep whether separator characters from the field reference should be skipped.
+     **/
+    void copyToModified(size_t n, bool skipSep = false);
+
+    /**
+     * Copies from the field reference to the modified buffer and surrounds the match
+     * [mbegin, mend) (both pointing into the field reference) with unit separators.
+     *
+     * @param mbegin the beginning of the match.
+     * @param mend the end of the match.
+     **/
+    void insertSeparators(const char * mbegin, const char * mend);
+
+public:
+    using SP = std::shared_ptr<UTF8SubstringSnippetModifier>;
+
+    std::unique_ptr<FieldSearcher> duplicate() const override;
+
+    UTF8SubstringSnippetModifier();
+    UTF8SubstringSnippetModifier(FieldIdT fId);
+    ~UTF8SubstringSnippetModifier();
+
+    /**
+     * Creates a new instance operating on shared buffers.
+     *
+     * @param fId the field id to operate on.
+     * @param modBuf the shared buffer used to store the modified field value.
+     * @param offBuf the shared buffer used to store the offsets into the field reference.
+     **/
+    UTF8SubstringSnippetModifier(FieldIdT fId, const CharBuffer::SP & modBuf, const SharedOffsetBuffer & offBuf);
+
+    /** Returns the buffer holding the modified field value. */
+    const CharBuffer & getModifiedBuf() const { return *_modified; }
+    /** Returns the list of query terms this searcher holds. */
+    const search::streaming::QueryTermList & getQueryTerms() const { return _qtl; }
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
new file mode 100644
index 00000000000..3495d46b85b
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.cpp
@@ -0,0 +1,54 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "utf8suffixstringfieldsearcher.h"
+
+using search::byte;
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+
+namespace vsm {
+
+/**
+ * Creates a copy of this searcher using the copy constructor.
+ *
+ * @return the copy, owned by the caller.
+ **/
+std::unique_ptr<FieldSearcher>
+UTF8SuffixStringFieldSearcher::duplicate() const
+{
+    std::unique_ptr<FieldSearcher> copy = std::make_unique<UTF8SuffixStringFieldSearcher>(*this);
+    return copy;
+}
+
+/**
+ * Matches all prepared query terms against the field reference using suffix
+ * match strategy. The field is consumed token by token via tokenize(); each
+ * token is compared against every query term and a hit is added per match.
+ *
+ * @param f the field reference to match against.
+ * @param mintsz the length of the shortest query term (not used here).
+ * @return the number of tokens processed.
+ **/
+size_t
+UTF8SuffixStringFieldSearcher::matchTerms(const FieldRef & f, const size_t mintsz)
+{
+    (void) mintsz;
+    const byte * cursor = reinterpret_cast<const byte *> (f.data());
+    const byte * last = cursor + f.size();
+    if (f.size() >= _buf->size()) {
+        _buf->reserve(f.size() + 1);
+    }
+    cmptype_t * token = &(*_buf.get())[0];
+    size_t tokenlen = 0;
+    termcount_t words = 0;
+
+    while (cursor < last) {
+        // a zero byte in front of a token is counted and stepped over
+        if (*cursor == 0) {
+            ++_zeroCount;
+            ++cursor;
+        }
+        cursor = tokenize(cursor, _buf->capacity(), token, tokenlen);
+        for (auto it = _qtl.begin(), mt = _qtl.end(); it != mt; ++it) {
+            QueryTerm & qt = **it;
+            const cmptype_t * term;
+            termsize_t tsz = qt.term(term);
+            if (matchTermSuffix(term, tsz, token, tokenlen)) {
+                addHit(qt, words);
+            }
+        }
+        words++;
+    }
+    return words;
+}
+
+/**
+ * Matches a single query term against the field reference by delegating
+ * to the field-level matchTermSuffix().
+ *
+ * @param f the field reference to match against.
+ * @param qt the query term trying to match.
+ * @return the value returned by matchTermSuffix().
+ **/
+size_t
+UTF8SuffixStringFieldSearcher::matchTerm(const FieldRef & f, QueryTerm & qt)
+{
+    const size_t result = matchTermSuffix(f, qt);
+    return result;
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h
new file mode 100644
index 00000000000..0640ac22da5
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/searcher/utf8suffixstringfieldsearcher.h
@@ -0,0 +1,25 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
+
+namespace vsm
+{
+
+/**
+ * Field searcher doing suffix searches in utf8 string fields.
+ **/
+class UTF8SuffixStringFieldSearcher : public UTF8StringFieldSearcherBase
+{
+protected:
+    /** Matches a single query term against the field reference. */
+    size_t matchTerm(const FieldRef & f, search::streaming::QueryTerm & qt) override;
+    /** Matches all prepared query terms against the field reference. */
+    size_t matchTerms(const FieldRef & f, const size_t shortestTerm) override;
+
+public:
+    UTF8SuffixStringFieldSearcher() : UTF8StringFieldSearcherBase() { }
+    UTF8SuffixStringFieldSearcher(FieldIdT fId) : UTF8StringFieldSearcherBase(fId) { }
+    /** Returns a copy of this searcher. */
+    std::unique_ptr<FieldSearcher> duplicate() const override;
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/vsm/.gitignore b/streamingvisitors/src/vespa/vsm/vsm/.gitignore
new file mode 100644
index 00000000000..95bc02923a9
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/.gitignore
@@ -0,0 +1,5 @@
+*.exe
+*.ilk
+*.pdb
+.depend*
+Makefile
diff --git a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
new file mode 100644
index 00000000000..adc00b341a3
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
@@ -0,0 +1,14 @@
+# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+# Defines the vsm_vsmbase OBJECT library from the sources listed below.
+# It depends on vsm_vconfig.
+vespa_add_library(vsm_vsmbase OBJECT
+ SOURCES
+ docsumfieldspec.cpp
+ docsumfilter.cpp
+ fieldsearchspec.cpp
+ flattendocsumwriter.cpp
+ slimefieldwriter.cpp
+ snippetmodifier.cpp
+ vsm-adapter.cpp
+ docsumconfig.cpp
+ DEPENDS
+ vsm_vconfig
+)
diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.cpp
new file mode 100644
index 00000000000..656e9eed132
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.cpp
@@ -0,0 +1,75 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/vsm/vsm/docsumconfig.h>
+#include <vespa/searchsummary/docsummary/docsumfieldwriter.h>
+#include <vespa/searchsummary/docsummary/matched_elements_filter_dfw.h>
+#include <vespa/searchlib/common/matching_elements_fields.h>
+#include <vespa/vsm/config/config-vsmfields.h>
+#include <vespa/vsm/config/config-vsmsummary.h>
+
+using search::MatchingElementsFields;
+using search::docsummary::IDocsumFieldWriter;
+using search::docsummary::EmptyDFW;
+using search::docsummary::MatchedElementsFilterDFW;
+using search::docsummary::ResultConfig;
+using vespa::config::search::vsm::VsmfieldsConfig;
+using vespa::config::search::vsm::VsmsummaryConfig;
+
+namespace vsm {
+
+namespace {
+
+/**
+ * Registers the given field in 'fields' based on the field specs in the config.
+ * A mapping from the field to the spec name is added for every spec whose name
+ * starts with "<field_name>.", and the field itself is added for every spec
+ * whose name equals the field name.
+ **/
+void populate_fields(MatchingElementsFields& fields, VsmfieldsConfig& fields_config, const vespalib::string& field_name)
+{
+    const vespalib::string sub_field_prefix = field_name + ".";
+    for (const auto& field_spec : fields_config.fieldspec) {
+        const bool is_sub_field = (field_spec.name.substr(0, sub_field_prefix.size()) == sub_field_prefix);
+        if (is_sub_field) {
+            fields.add_mapping(field_name, field_spec.name);
+        }
+        const bool is_same_field = (field_spec.name == field_name);
+        if (is_same_field) {
+            fields.add_field(field_name);
+        }
+    }
+}
+
+}
+
+// Forwards env and writer to the parent class and takes over the shared vsm fields config.
+DynamicDocsumConfig::DynamicDocsumConfig(search::docsummary::IDocsumEnvironment* env,
+                                         search::docsummary::DynamicDocsumWriter* writer,
+                                         std::shared_ptr<VsmfieldsConfig> vsm_fields_config)
+    : Parent(env, writer),
+      _vsm_fields_config(std::move(vsm_fields_config))
+{
+}
+
+/**
+ * Creates the docsum field writer for the given override.
+ *
+ * - A fixed set of override names gets an EmptyDFW (rc = true).
+ * - "attribute", "attributecombiner" and "geopos" get no writer at all (rc = true).
+ * - The matched elements filter overrides get a MatchedElementsFilterDFW for the
+ *   source field (the argument if set, otherwise the field name); rc tells
+ *   whether a writer was created.
+ * - Everything else is delegated to the parent class.
+ *
+ * @param rc set to whether the override was handled successfully.
+ * @return the created field writer (may be empty).
+ **/
+IDocsumFieldWriter::UP
+DynamicDocsumConfig::createFieldWriter(const string & fieldName, const string & overrideName, const string & argument, bool & rc, std::shared_ptr<search::MatchingElementsFields> matching_elems_fields)
+{
+    const bool wantsEmptyWriter =
+        (overrideName == "staticrank") ||
+        (overrideName == "ranklog") ||
+        (overrideName == "label") ||
+        (overrideName == "project") ||
+        (overrideName == "positions") ||
+        (overrideName == "absdist") ||
+        (overrideName == "subproject");
+    if (wantsEmptyWriter) {
+        IDocsumFieldWriter::UP emptyWriter = std::make_unique<EmptyDFW>();
+        rc = true;
+        return emptyWriter;
+    }
+
+    const bool needsNoWriter =
+        (overrideName == "attribute") ||
+        (overrideName == "attributecombiner") ||
+        (overrideName == "geopos");
+    if (needsNoWriter) {
+        rc = true;
+        return IDocsumFieldWriter::UP();
+    }
+
+    const bool isMatchedElementsFilter =
+        (overrideName == "matchedattributeelementsfilter") ||
+        (overrideName == "matchedelementsfilter");
+    if (isMatchedElementsFilter) {
+        string source_field = argument.empty() ? fieldName : argument;
+        const ResultConfig& resultConfig = getResultConfig();
+        int source_field_enum = resultConfig.GetFieldNameEnum().Lookup(source_field.c_str());
+        populate_fields(*matching_elems_fields, *_vsm_fields_config, source_field);
+        IDocsumFieldWriter::UP filterWriter = MatchedElementsFilterDFW::create(source_field, source_field_enum, matching_elems_fields);
+        rc = static_cast<bool>(filterWriter);
+        return filterWriter;
+    }
+
+    return search::docsummary::DynamicDocsumConfig::createFieldWriter(fieldName, overrideName, argument, rc, matching_elems_fields);
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.h b/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.h
new file mode 100644
index 00000000000..11010c04e90
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/docsumconfig.h
@@ -0,0 +1,29 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchsummary/docsummary/docsumconfig.h>
+
+// Forward declaration of the vsmfields config type.
+namespace vespa::config::search::vsm {
+namespace internal { class InternalVsmfieldsType; }
+using VsmfieldsConfig = const internal::InternalVsmfieldsType;
+}
+namespace vsm {
+
+/**
+ * Docsum config that overrides how docsum field writers are created,
+ * using the vsm fields config it is constructed with.
+ **/
+class DynamicDocsumConfig : public search::docsummary::DynamicDocsumConfig
+{
+public:
+    using Parent = search::docsummary::DynamicDocsumConfig;
+    using VsmfieldsConfig = vespa::config::search::vsm::VsmfieldsConfig;
+private:
+    std::shared_ptr<VsmfieldsConfig> _vsm_fields_config;
+public:
+    DynamicDocsumConfig(search::docsummary::IDocsumEnvironment* env,
+                        search::docsummary::DynamicDocsumWriter* writer,
+                        std::shared_ptr<VsmfieldsConfig> vsm_fields_config);
+private:
+    std::unique_ptr<search::docsummary::IDocsumFieldWriter>
+    createFieldWriter(const string & fieldName, const string & overrideName,
+                      const string & argument, bool & rc,
+                      std::shared_ptr<search::MatchingElementsFields> matching_elems_fields) override;
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.cpp
new file mode 100644
index 00000000000..936aaaa2091
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.cpp
@@ -0,0 +1,35 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include "docsumfieldspec.h"
+
+namespace vsm {
+
+// Default identifier: id is StringFieldIdTMap::npos and the path is empty.
+DocsumFieldSpec::FieldIdentifier::FieldIdentifier()
+    : _id(StringFieldIdTMap::npos),
+      _path()
+{ }
+
+// Identifier for the given field id; takes over the given path.
+DocsumFieldSpec::FieldIdentifier::FieldIdentifier(FieldIdT id, FieldPath path)
+    : _id(id),
+      _path(std::move(path))
+{ }
+
+// Move operations and destructor are compiler generated.
+DocsumFieldSpec::FieldIdentifier::FieldIdentifier(FieldIdentifier &&) noexcept = default;
+DocsumFieldSpec::FieldIdentifier & DocsumFieldSpec::FieldIdentifier::operator=(FieldIdentifier &&) noexcept = default;
+DocsumFieldSpec::FieldIdentifier::~FieldIdentifier() = default;
+
+// Default spec: result type RES_INT, command NONE, no output or input fields.
+DocsumFieldSpec::DocsumFieldSpec()
+    : _resultType(search::docsummary::RES_INT),
+      _command(VsmsummaryConfig::Fieldmap::Command::NONE),
+      _outputField(),
+      _inputFields()
+{ }
+
+// Spec with the given result type and command; output and input fields start out empty.
+DocsumFieldSpec::DocsumFieldSpec(search::docsummary::ResType resultType,
+                                 VsmsummaryConfig::Fieldmap::Command command)
+    : _resultType(resultType),
+      _command(command),
+      _outputField(),
+      _inputFields()
+{ }
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.h b/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.h
new file mode 100644
index 00000000000..db6ee9fa223
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/docsumfieldspec.h
@@ -0,0 +1,72 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/searchsummary/docsummary/resultclass.h>
+#include <vespa/vsm/common/document.h>
+#include <vespa/vsm/common/storagedocument.h>
+#include <vespa/vsm/config/vsm-cfif.h>
+
+namespace vsm {
+
+/**
+ * Specification of how a single summary field is generated: the result type,
+ * the command to apply, the output field and the input fields.
+ **/
+class DocsumFieldSpec {
+public:
+    /**
+     * A field id together with a field path (used to navigate a field value).
+     * Instances can be moved but not copied.
+     **/
+    class FieldIdentifier {
+    private:
+        FieldIdT  _id;
+        FieldPath _path;
+
+    public:
+        FieldIdentifier();
+        FieldIdentifier(FieldIdT id, FieldPath path);
+        FieldIdentifier(FieldIdentifier &&) noexcept;
+        FieldIdentifier & operator=(FieldIdentifier &&) noexcept;
+        FieldIdentifier(const FieldIdentifier &) = delete;
+        FieldIdentifier & operator=(const FieldIdentifier &) = delete;
+        ~FieldIdentifier();
+        FieldIdT getId() const { return _id; }
+        const FieldPath & getPath() const { return _path; }
+    };
+
+    using FieldIdentifierVector = std::vector<FieldIdentifier>;
+
+private:
+    search::docsummary::ResType         _resultType;
+    VsmsummaryConfig::Fieldmap::Command _command;
+    FieldIdentifier                     _outputField;
+    FieldIdentifierVector               _inputFields;
+
+public:
+    DocsumFieldSpec();
+    DocsumFieldSpec(search::docsummary::ResType resultType, VsmsummaryConfig::Fieldmap::Command command);
+
+    /**
+     * Returns the result type of the summary field.
+     **/
+    search::docsummary::ResType getResultType() const { return _resultType; }
+
+    /**
+     * Returns the command that specifies how the input fields are transformed into the output summary field.
+     **/
+    VsmsummaryConfig::Fieldmap::Command getCommand() const { return _command; }
+
+    /**
+     * Returns true when there is exactly one input field and it has the same id as the output field.
+     **/
+    bool hasIdentityMapping() const {
+        return _inputFields.size() == 1 && _outputField.getId() == _inputFields[0].getId();
+    }
+
+    const FieldIdentifier & getOutputField() const { return _outputField; }
+    void setOutputField(FieldIdentifier outputField) { _outputField = std::move(outputField); }
+    const FieldIdentifierVector & getInputFields() const { return _inputFields; }
+    FieldIdentifierVector & getInputFields() { return _inputFields; }
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp
new file mode 100644
index 00000000000..70759feb41c
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.cpp
@@ -0,0 +1,477 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "docsumfilter.h"
+#include "slimefieldwriter.h"
+#include <vespa/searchsummary/docsummary/summaryfieldconverter.h>
+#include <vespa/document/base/exceptions.h>
+#include <vespa/document/fieldvalue/iteratorhandler.h>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".vsm.docsumfilter");
+
+using namespace search::docsummary;
+
+
+namespace {
+
+// Common base for the result handlers below; adds nothing to IteratorHandler.
+class Handler : public document::fieldvalue::IteratorHandler {
+};
+
+// Stores the most recently visited primitive as a 32-bit integer (0 until one is visited).
+struct IntResultHandler : public Handler {
+    int32_t value = 0;
+    void onPrimitive(uint32_t, const Content & c) override {
+        value = c.getValue().getAsInt();
+    }
+};
+
+// Stores the most recently visited primitive as a 64-bit integer (0 until one is visited).
+struct LongResultHandler : public Handler {
+    int64_t value = 0;
+    void onPrimitive(uint32_t, const Content & c) override {
+        value = c.getValue().getAsLong();
+    }
+};
+
+// Stores the most recently visited primitive as a float (0 until one is visited).
+struct FloatResultHandler : public Handler {
+    float value = 0;
+    void onPrimitive(uint32_t, const Content & c) override {
+        value = c.getValue().getAsFloat();
+    }
+};
+
+// Stores the most recently visited primitive as a double (0 until one is visited).
+struct DoubleResultHandler : public Handler {
+    double value = 0;
+    void onPrimitive(uint32_t, const Content & c) override {
+        value = c.getValue().getAsDouble();
+    }
+};
+
+// Adds every visited primitive to the result packer as a string.
+// Literal field values are added directly; other values are added in their toString() form.
+class StringResultHandler : public Handler {
+private:
+    ResType        _type;
+    ResultPacker & _packer;
+
+    // Adds the buffer as string or long string depending on the result type;
+    // other result types are ignored.
+    void addToPacker(const char * buf, size_t len) {
+        if (_type == RES_STRING) {
+            _packer.AddString(buf, len);
+        } else if (_type == RES_LONG_STRING) {
+            _packer.AddLongString(buf, len);
+        }
+    }
+
+public:
+    StringResultHandler(ResType t, ResultPacker & p) : _type(t), _packer(p) {}
+    void onPrimitive(uint32_t, const Content & c) override {
+        const document::FieldValue & fv = c.getValue();
+        if (!fv.isLiteral()) {
+            vespalib::string s = fv.toString();
+            addToPacker(s.c_str(), s.size());
+            return;
+        }
+        const document::LiteralFieldValueB & lfv = static_cast<const document::LiteralFieldValueB &>(fv);
+        vespalib::stringref s = lfv.getValueRef();
+        addToPacker(s.data(), s.size());
+    }
+};
+
+// Adds every visited primitive to the result packer as raw data.
+// If the value cannot be converted to raw, a warning is logged and an empty
+// entry is added instead.
+class RawResultHandler : public Handler {
+private:
+    ResType        _type;
+    ResultPacker & _packer;
+
+public:
+    RawResultHandler(ResType t, ResultPacker & p) : _type(t), _packer(p) {}
+    void onPrimitive(uint32_t, const Content & c) override {
+        const document::FieldValue & fv = c.getValue();
+        try {
+            std::pair<const char *, size_t> raw = fv.getAsRaw();
+            if (raw.first == nullptr) {
+                return;
+            }
+            if (_type == RES_DATA) {
+                _packer.AddData(raw.first, raw.second);
+            } else if (_type == RES_LONG_DATA) {
+                _packer.AddLongData(raw.first, raw.second);
+            }
+        } catch (const document::InvalidDataTypeConversionException &) {
+            LOG(warning, "RawResultHandler: Could not get field value '%s' as raw. Skipping writing this field", fv.toString().c_str());
+            _packer.AddEmpty();
+        }
+    }
+};
+
+
+}
+
+
+namespace vsm {
+
+/**
+ * Returns a copy of the given field path without its first entry.
+ * An empty path yields an empty path.
+ **/
+FieldPath
+copyPathButFirst(const FieldPath & rhs) {
+    FieldPath tail;
+    if (rhs.empty()) {
+        return tail;
+    }
+    auto entry = rhs.begin();
+    ++entry; // skip the element that corresponds to the start field value
+    for (; entry != rhs.end(); ++entry) {
+        tail.push_back(std::make_unique<document::FieldPathEntry>(**entry));
+    }
+    return tail;
+}
+
+/**
+ * Fills in the output field and the input fields of the given spec.
+ *
+ * Field names from toolsSpec are looked up in fieldMap. A field that is found
+ * gets the path from fieldPathMap (without its first entry), or an empty path
+ * when the id is outside fieldPathMap. Fields that are not found are only
+ * logged. _highestFieldNo is raised to cover every input field that is found.
+ **/
+void
+DocsumFilter::prepareFieldSpec(DocsumFieldSpec & spec, const DocsumTools::FieldSpec & toolsSpec,
+                               const FieldMap & fieldMap, const FieldPathMapT & fieldPathMap)
+{
+    { // setup output field
+        const vespalib::string & name = toolsSpec.getOutputName();
+        LOG(debug, "prepareFieldSpec: output field name '%s'", name.c_str());
+        FieldIdT field = fieldMap.fieldNo(name);
+        if (field == FieldMap::npos) {
+            LOG(warning, "Could not find output summary field '%s'", name.c_str());
+        } else if (field < fieldPathMap.size()) {
+            spec.setOutputField(DocsumFieldSpec::FieldIdentifier(field, copyPathButFirst(fieldPathMap[field])));
+        } else {
+            LOG(warning, "Could not find a field path for field '%s' with id '%d'", name.c_str(), field);
+            spec.setOutputField(DocsumFieldSpec::FieldIdentifier(field, FieldPath()));
+        }
+    }
+    // setup input fields
+    for (size_t i = 0; i < toolsSpec.getInputNames().size(); ++i) {
+        const vespalib::string & name = toolsSpec.getInputNames()[i];
+        LOG(debug, "prepareFieldSpec: input field name '%s'", name.c_str());
+        FieldIdT field = fieldMap.fieldNo(name);
+        if (field == FieldMap::npos) {
+            LOG(warning, "Could not find input summary field '%s'", name.c_str());
+            continue;
+        }
+        if (field < fieldPathMap.size()) {
+            LOG(debug, "field %u < map size %zu", field, fieldPathMap.size());
+            spec.getInputFields().push_back(DocsumFieldSpec::FieldIdentifier(field, copyPathButFirst(fieldPathMap[field])));
+        } else {
+            LOG(warning, "Could not find a field path for field '%s' with id '%d'", name.c_str(), field);
+            spec.getInputFields().push_back(DocsumFieldSpec::FieldIdentifier(field, FieldPath()));
+        }
+        if (_highestFieldNo <= field) {
+            _highestFieldNo = field + 1;
+        }
+    }
+}
+
+/**
+ * Returns the field value to use for the given field.
+ *
+ * For command FLATTENJUNIPER, when a snippet modifier exists for the field,
+ * the modified value is cached in _cachedValue and returned, and 'modified'
+ * is set to true. Otherwise the original value from the document is returned.
+ *
+ * @return the field value, or nullptr if the document has no value for the field.
+ **/
+const document::FieldValue *
+DocsumFilter::getFieldValue(const DocsumFieldSpec::FieldIdentifier & fieldId,
+                            VsmsummaryConfig::Fieldmap::Command command,
+                            const Document & docsum, bool & modified)
+{
+    FieldIdT fId = fieldId.getId();
+    const document::FieldValue * original = docsum.getField(fId);
+    if (original == nullptr) {
+        return nullptr;
+    }
+    const bool mayBeModified = (command == VsmsummaryConfig::Fieldmap::Command::FLATTENJUNIPER)
+                               && (_snippetModifiers != nullptr);
+    if (mayBeModified) {
+        FieldModifier * mod = _snippetModifiers->getModifier(fId);
+        if (mod != nullptr) {
+            _cachedValue = mod->modify(*original, fieldId.getPath());
+            modified = true;
+            return _cachedValue.get();
+        }
+    }
+    return original;
+}
+
+
+// Keeps a pointer to the docsum cache and the tools. The packer is set up with
+// the result config from the tools, or with nullptr when no tools are given.
+DocsumFilter::DocsumFilter(const DocsumToolsPtr &tools, const IDocSumCache & docsumCache)
+    : _docsumCache(&docsumCache),
+      _tools(tools),
+      _fields(),
+      _highestFieldNo(0),
+      _packer(tools ? tools->getResultConfig() : nullptr),
+      _flattenWriter(),
+      _snippetModifiers(nullptr),
+      _cachedValue(),
+      _emptyFieldPath()
+{ }
+
+DocsumFilter::~DocsumFilter() = default;
+
+/**
+ * Builds one DocsumFieldSpec per entry in the result class of the tools.
+ * Does nothing when there are no tools or no result class.
+ **/
+void DocsumFilter::init(const FieldMap & fieldMap, const FieldPathMapT & fieldPathMap)
+{
+    if (!_tools.get()) {
+        return;
+    }
+    const ResultClass *resClass = _tools->getResultClass();
+    const std::vector<DocsumTools::FieldSpec> & inputSpecs = _tools->getFieldSpecs();
+    if (resClass == nullptr) {
+        return;
+    }
+    uint32_t entryCnt = resClass->GetNumEntries();
+    // there must be one tools field spec per result class entry
+    assert(entryCnt == inputSpecs.size());
+    for (uint32_t idx = 0; idx < entryCnt; ++idx) {
+        const ResConfigEntry &entry = *resClass->GetEntry(idx);
+        const DocsumTools::FieldSpec & toolsSpec = inputSpecs[idx];
+        _fields.push_back(DocsumFieldSpec(entry._type, toolsSpec.getCommand()));
+        LOG(debug, "About to prepare field spec for summary field '%s'", entry._bindname.c_str());
+        prepareFieldSpec(_fields.back(), toolsSpec, fieldMap, fieldPathMap);
+    }
+    assert(entryCnt == _fields.size());
+}
+
+// Always reports the largest value a uint32_t can hold.
+uint32_t
+DocsumFilter::getNumDocs() const
+{
+    const uint32_t maxDocs = std::numeric_limits<uint32_t>::max();
+    return maxDocs;
+}
+
+/**
+ * Writes the part of the field value selected by the path to the packer,
+ * converted according to the given result type. Unknown result types are
+ * logged and written as an empty entry.
+ **/
+void
+DocsumFilter::writeField(const document::FieldValue & fv, const FieldPath & path, ResType type, ResultPacker & packer)
+{
+    // All integer result types up to 32 bits are read through the same handler.
+    auto readInt = [&fv, &path]() {
+        IntResultHandler handler;
+        fv.iterateNested(path, handler);
+        return handler.value;
+    };
+
+    switch (type) {
+    case RES_INT: {
+        uint32_t asInt = readInt();
+        packer.AddInteger(asInt);
+        break; }
+    case RES_SHORT: {
+        uint16_t asShort = readInt();
+        packer.AddShort(asShort);
+        break; }
+    case RES_BYTE:
+    case RES_BOOL: {
+        uint8_t asByte = readInt();
+        packer.AddByte(asByte);
+        break; }
+    case RES_FLOAT: {
+        FloatResultHandler handler;
+        fv.iterateNested(path, handler);
+        float asFloat = handler.value;
+        packer.AddFloat(asFloat);
+        break; }
+    case RES_DOUBLE: {
+        DoubleResultHandler handler;
+        fv.iterateNested(path, handler);
+        double asDouble = handler.value;
+        packer.AddDouble(asDouble);
+        break; }
+    case RES_INT64: {
+        LongResultHandler handler;
+        fv.iterateNested(path, handler);
+        uint64_t asLong = handler.value;
+        packer.AddInt64(asLong);
+        break; }
+    case RES_STRING:
+    case RES_LONG_STRING: {
+        // the string result handler adds the result to the packer
+        StringResultHandler handler(type, packer);
+        fv.iterateNested(path, handler);
+        break; }
+    case RES_DATA:
+    case RES_LONG_DATA: {
+        // the raw result handler adds the result to the packer
+        RawResultHandler handler(type, packer);
+        fv.iterateNested(path, handler);
+        break; }
+    default:
+        LOG(warning, "Unknown docsum field type: %s", ResultConfig::GetResTypeName(type));
+        packer.AddEmpty(); // unhandled output type
+        break;
+    }
+}
+
+
+/**
+ * Writes the output field of the given spec to the packer as a long string
+ * produced by SlimeFieldWriter. An empty entry is written when the command is
+ * not NONE or when the document has no value for the field.
+ **/
+void
+DocsumFilter::writeSlimeField(const DocsumFieldSpec & fieldSpec,
+                              const Document & docsum,
+                              ResultPacker & packer)
+{
+    if (fieldSpec.getCommand() != VsmsummaryConfig::Fieldmap::Command::NONE) {
+        LOG(debug, "writeSlimeField: Cannot handle this command");
+        packer.AddEmpty();
+        return;
+    }
+    const DocsumFieldSpec::FieldIdentifier & fieldId = fieldSpec.getOutputField();
+    const document::FieldValue * fv = docsum.getField(fieldId.getId());
+    if (fv == nullptr) {
+        LOG(debug, "writeSlimeField: Field value not set for field '%d'", fieldId.getId());
+        packer.AddEmpty();
+        return;
+    }
+    LOG(debug, "writeSlimeField: About to write field '%d' as Slime: field value = '%s'",
+        fieldId.getId(), fv->toString().c_str());
+    SlimeFieldWriter writer;
+    // the input fields are only handed to the writer when the mapping is not an identity mapping
+    if (! fieldSpec.hasIdentityMapping()) {
+        writer.setInputFields(fieldSpec.getInputFields());
+    }
+    writer.convert(*fv);
+    const vespalib::stringref out = writer.out();
+    packer.AddLongString(out.data(), out.size());
+}
+
+// Flattens all input fields of 'fieldSpec' into one string and adds it to
+// 'packer'. Requires a command other than NONE and a STRING/LONG_STRING
+// result type; otherwise an empty entry is added.
+void
+DocsumFilter::writeFlattenField(const DocsumFieldSpec & fieldSpec,
+                                const Document & docsum,
+                                ResultPacker & packer)
+{
+    const auto command = fieldSpec.getCommand();
+    if (command == VsmsummaryConfig::Fieldmap::Command::NONE) {
+        LOG(debug, "writeFlattenField: Cannot handle command NONE");
+        packer.AddEmpty();
+        return;
+    }
+
+    const ResType resultType = fieldSpec.getResultType();
+    if (resultType != RES_LONG_STRING && resultType != RES_STRING) {
+        LOG(debug, "writeFlattenField: Can only handle result types STRING and LONG_STRING");
+        packer.AddEmpty();
+        return;
+    }
+
+    if (command == VsmsummaryConfig::Fieldmap::Command::FLATTENJUNIPER) {
+        _flattenWriter.setSeparator("\x1E"); // record separator (same as juniper uses)
+    }
+
+    for (const DocsumFieldSpec::FieldIdentifier & fieldId : fieldSpec.getInputFields()) {
+        bool modified = false;
+        const document::FieldValue * value = getFieldValue(fieldId, command, docsum, modified);
+        if (value == nullptr) {
+            LOG(debug, "writeFlattenField: Field value not set for field '%d'", fieldId.getId());
+            continue;
+        }
+        LOG(debug, "writeFlattenField: About to flatten field '%d' with field value (%s) '%s'",
+            fieldId.getId(), modified ? "modified" : "original", value->toString().c_str());
+        // a modified value is iterated with the empty path, an original one with the field's own path
+        if (modified) {
+            value->iterateNested(_emptyFieldPath, _flattenWriter);
+        } else {
+            value->iterateNested(fieldId.getPath(), _flattenWriter);
+        }
+    }
+
+    const CharBuffer & flattened = _flattenWriter.getResult();
+    if (resultType == RES_STRING) {
+        packer.AddString(flattened.getBuffer(), flattened.getPos());
+    } else if (resultType == RES_LONG_STRING) {
+        packer.AddLongString(flattened.getBuffer(), flattened.getPos());
+    }
+    // reset the shared writer so the next field starts from a clean state
+    _flattenWriter.clear();
+}
+
+
+// Adds a placeholder for a missing field value of the given result type.
+// Numeric types get a 'notdefined' marker value; all other types are
+// delegated to ResultPacker::AddEmpty().
+void
+DocsumFilter::writeEmpty(ResType type, ResultPacker & packer)
+{
+    switch (type) {
+    case RES_INT:
+        packer.AddInteger(std::numeric_limits<int32_t>::min());
+        return;
+    case RES_SHORT:
+        packer.AddShort(std::numeric_limits<int16_t>::min());
+        return;
+    case RES_BYTE:
+        // byte fields are unsigned so we have no 'notdefined' value.
+        packer.AddByte(0);
+        return;
+    case RES_FLOAT:
+        packer.AddFloat(std::numeric_limits<float>::quiet_NaN());
+        return;
+    case RES_DOUBLE:
+        packer.AddDouble(std::numeric_limits<double>::quiet_NaN());
+        return;
+    case RES_INT64:
+        packer.AddInt64(std::numeric_limits<int64_t>::min());
+        return;
+    default:
+        break;
+    }
+    packer.AddEmpty();
+}
+
+// Returns the class id of the configured result class, or
+// ResultConfig::NoClassID() when no result class is available.
+uint32_t
+DocsumFilter::getSummaryClassId() const
+{
+    const auto * resultClass = _tools->getResultClass();
+    if (resultClass == nullptr) {
+        return ResultConfig::NoClassID();
+    }
+    return resultClass->GetClassID();
+}
+
+// Builds the docsum blob for the document with local id 'id' by packing
+// every configured summary field in order. Returns an empty value when no
+// result class is available or when the packer fails to produce a blob.
+DocsumStoreValue
+DocsumFilter::getMappedDocsum(uint32_t id)
+{
+    const ResultClass * resultClass = _tools->getResultClass();
+    if (resultClass == nullptr) {
+        return DocsumStoreValue(nullptr, 0);
+    }
+
+    const Document & doc = _docsumCache->getDocSum(id);
+    _packer.Init(resultClass->GetClassID());
+
+    for (const DocsumFieldSpec & spec : _fields) {
+        const ResType type = spec.getResultType();
+        if (type == RES_JSONSTRING) {
+            // this really means 'structured data'
+            writeSlimeField(spec, doc, _packer);
+            continue;
+        }
+        const bool noCommand = (spec.getCommand() == VsmsummaryConfig::Fieldmap::Command::NONE);
+        const size_t numInputs = spec.getInputFields().size();
+        if (noCommand && numInputs == 1) {
+            // plain copy of a single input field
+            const DocsumFieldSpec::FieldIdentifier & fieldId = spec.getInputFields()[0];
+            const document::FieldValue * field = doc.getField(fieldId.getId());
+            if (field == nullptr) {
+                writeEmpty(type, _packer); // void input
+            } else {
+                writeField(*field, fieldId.getPath(), type, _packer);
+            }
+        } else if (noCommand && numInputs == 0) {
+            LOG(spam, "0 inputfields for output field %u", spec.getOutputField().getId());
+            writeEmpty(type, _packer); // no input
+        } else {
+            writeFlattenField(spec, doc, _packer);
+        }
+    }
+
+    const char *buf;
+    uint32_t buflen;
+    if (_packer.GetDocsumBlob(&buf, &buflen)) {
+        return DocsumStoreValue(buf, buflen);
+    }
+    return DocsumStoreValue(nullptr, 0);
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.h b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.h
new file mode 100644
index 00000000000..e6f7ae3e6fe
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/docsumfilter.h
@@ -0,0 +1,90 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vsm/common/docsum.h>
+#include <vespa/vsm/common/fieldmodifier.h>
+#include <vespa/vsm/vsm/docsumfieldspec.h>
+#include <vespa/vsm/vsm/fieldsearchspec.h>
+#include <vespa/vsm/vsm/flattendocsumwriter.h>
+#include <vespa/vsm/vsm/vsm-adapter.h>
+#include <vespa/searchsummary/docsummary/resultpacker.h>
+#include <vespa/searchsummary/docsummary/docsumstore.h>
+
+using search::docsummary::IDocsumStore;
+using search::docsummary::DocsumStoreValue;
+using search::docsummary::ResType;
+using search::docsummary::ResultPacker;
+
+namespace vsm {
+
+/**
+ * IDocsumStore implementation where docsum blobs are looked up by local
+ * document id and generated on the fly when requested.
+ **/
+class DocsumFilter : public IDocsumStore
+{
+private:
+    using FieldSpecList = std::vector<DocsumFieldSpec>; // list of summary field specs
+    using StringList = std::vector<vespalib::string>;
+    using FieldMap = StringFieldIdTMap;
+
+    const IDocSumCache      * _docsumCache;
+    DocsumToolsPtr            _tools;
+    FieldSpecList             _fields; // list of summary fields to generate
+    size_t                    _highestFieldNo;
+    ResultPacker              _packer;
+    FlattenDocsumWriter       _flattenWriter;
+    const FieldModifierMap  * _snippetModifiers;
+    document::FieldValue::UP  _cachedValue;
+    document::FieldPath       _emptyFieldPath;
+
+    // Copy operations are declared private; no definition is visible in this header.
+    DocsumFilter(const DocsumFilter &);
+    DocsumFilter &operator=(const DocsumFilter &);
+
+    void prepareFieldSpec(DocsumFieldSpec & spec, const DocsumTools::FieldSpec & toolsSpec,
+                          const FieldMap & fieldMap, const FieldPathMapT & fieldPathMap);
+    const document::FieldValue * getFieldValue(const DocsumFieldSpec::FieldIdentifier & fieldId,
+                                               VsmsummaryConfig::Fieldmap::Command command,
+                                               const Document & docsum, bool & modified);
+    void writeField(const document::FieldValue & fv, const FieldPath & path, ResType type, ResultPacker & packer);
+    void writeSlimeField(const DocsumFieldSpec & fieldSpec, const Document & docsum, ResultPacker & packer);
+    void writeFlattenField(const DocsumFieldSpec & fieldSpec, const Document & docsum, ResultPacker & packer);
+    void writeEmpty(ResType type, ResultPacker & packer);
+
+public:
+    DocsumFilter(const DocsumToolsPtr & tools, const IDocSumCache & docsumCache);
+    ~DocsumFilter() override;
+    const DocsumToolsPtr & getTools() const { return _tools; }
+
+    /**
+     * Initializes this docsum filter from the two given maps.
+     *
+     * @param fieldMap maps from field name -> field id
+     * @param fieldPathMap maps from field id -> field path (used to look up the path of each input field)
+     **/
+    void init(const FieldMap & fieldMap, const FieldPathMapT & fieldPathMap);
+
+    /**
+     * Sets the snippet modifiers to use when writing string fields used as input to snippet generation.
+     * Only the address is stored, so 'modifiers' must outlive its use by this filter.
+     **/
+    void setSnippetModifiers(const FieldModifierMap & modifiers) { _snippetModifiers = &modifiers; }
+
+    /**
+     * Returns the highest field id + 1 among all fields in the field spec list.
+     **/
+    size_t getHighestFieldNo() const { return _highestFieldNo; }
+
+    /**
+     * Replaces the docsum cache that documents are fetched from.
+     * Only the address is stored, so 'docsumCache' must outlive its use by this filter.
+     **/
+    void setDocSumStore(const IDocSumCache & docsumCache) { _docsumCache = &docsumCache; }
+
+    // Inherit doc from IDocsumStore
+    DocsumStoreValue getMappedDocsum(uint32_t id) override;
+    uint32_t getNumDocs() const override;
+    uint32_t getSummaryClassId() const override;
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
new file mode 100644
index 00000000000..7043e63ec87
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.cpp
@@ -0,0 +1,334 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "fieldsearchspec.h"
+#include <vespa/vsm/searcher/utf8flexiblestringfieldsearcher.h>
+#include <vespa/vsm/searcher/utf8strchrfieldsearcher.h>
+#include <vespa/vsm/searcher/utf8substringsearcher.h>
+#include <vespa/vsm/searcher/utf8suffixstringfieldsearcher.h>
+#include <vespa/vsm/searcher/utf8exactstringfieldsearcher.h>
+#include <vespa/vsm/searcher/futf8strchrfieldsearcher.h>
+#include <vespa/vsm/searcher/intfieldsearcher.h>
+#include <vespa/vsm/searcher/boolfieldsearcher.h>
+#include <vespa/vsm/searcher/floatfieldsearcher.h>
+#include <vespa/vsm/searcher/geo_pos_field_searcher.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+#include <regex>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".vsm.fieldsearchspec");
+
+#define DEBUGMASK 0x01
+
+using search::streaming::ConstQueryTermList;
+using search::streaming::Query;
+using search::streaming::QueryTerm;
+
+namespace vsm {
+
+namespace {
+
+// Maps the textual match specification in 'arg1' to a match type on the
+// searcher. Unrecognized values leave the searcher untouched.
+void setMatchType(FieldSearcherContainer & searcher, vespalib::stringref arg1) {
+    if (arg1 == "prefix") {
+        searcher->setMatchType(FieldSearcher::PREFIX);
+        return;
+    }
+    if (arg1 == "substring") {
+        searcher->setMatchType(FieldSearcher::SUBSTRING);
+        return;
+    }
+    if (arg1 == "suffix") {
+        searcher->setMatchType(FieldSearcher::SUFFIX);
+        return;
+    }
+    // "word" is treated the same way as "exact"
+    if (arg1 == "exact" || arg1 == "word") {
+        searcher->setMatchType(FieldSearcher::EXACT);
+    }
+}
+
+}
+
+// Default state: id 0, empty name, no searcher, search method NONE and a
+// maximum field length of 0x100000.
+FieldSearchSpec::FieldSearchSpec()
+    : _id(0),
+      _name(),
+      _maxLength(0x100000),
+      _searcher(),
+      _searchMethod(VsmfieldsConfig::Fieldspec::Searchmethod::NONE),
+      _arg1(),
+      _reconfigured(false)
+{ }
+
+FieldSearchSpec::~FieldSearchSpec() = default;
+
+// Move-only: move construction and move assignment use the compiler-generated versions.
+FieldSearchSpec::FieldSearchSpec(FieldSearchSpec&& rhs) noexcept = default;
+FieldSearchSpec& FieldSearchSpec::operator=(FieldSearchSpec&& rhs) noexcept = default;
+
+// Creates the spec for one field and instantiates the searcher matching the
+// configured search method. For the string search methods 'arg1' selects the
+// searcher variant. An unknown search method is logged and handled like AUTOUTF8.
+FieldSearchSpec::FieldSearchSpec(const FieldIdT & fid, const vespalib::string & fname,
+                                 VsmfieldsConfig::Fieldspec::Searchmethod searchDef,
+                                 const vespalib::string & arg1, size_t maxLength_)
+    : _id(fid),
+      _name(fname),
+      _maxLength(maxLength_),
+      _searcher(),
+      _searchMethod(searchDef),
+      _arg1(arg1),
+      _reconfigured(false)
+{
+    using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
+    switch (searchDef) {
+    case Searchmethod::BOOL:
+        _searcher = std::make_unique<BoolFieldSearcher>(fid);
+        break;
+    case Searchmethod::INT8:
+    case Searchmethod::INT16:
+    case Searchmethod::INT32:
+    case Searchmethod::INT64:
+        _searcher = std::make_unique<IntFieldSearcher>(fid);
+        break;
+    case Searchmethod::FLOAT:
+        _searcher = std::make_unique<FloatFieldSearcher>(fid);
+        break;
+    case Searchmethod::DOUBLE:
+        _searcher = std::make_unique<DoubleFieldSearcher>(fid);
+        break;
+    case Searchmethod::GEOPOS:
+        _searcher = std::make_unique<GeoPosFieldSearcher>(fid);
+        break;
+    default:
+        LOG(warning, "Unknown searchdef = %d. Defaulting to AUTOUTF8", static_cast<int>(searchDef));
+        [[fallthrough]];
+    case Searchmethod::AUTOUTF8:
+    case Searchmethod::NONE:
+    case Searchmethod::SSE2UTF8:
+    case Searchmethod::UTF8:
+        if (arg1 == "substring") {
+            _searcher = std::make_unique<UTF8SubStringFieldSearcher>(fid);
+        } else if (arg1 == "suffix") {
+            _searcher = std::make_unique<UTF8SuffixStringFieldSearcher>(fid);
+        } else if (arg1 == "exact" || arg1 == "word") {
+            _searcher = std::make_unique<UTF8ExactStringFieldSearcher>(fid);
+        } else if (searchDef == Searchmethod::UTF8) {
+            _searcher = std::make_unique<UTF8StrChrFieldSearcher>(fid);
+        } else {
+            _searcher = std::make_unique<FUTF8StrChrFieldSearcher>(fid);
+        }
+        break;
+    }
+    if (_searcher) {
+        setMatchType(_searcher, arg1);
+        _searcher->maxFieldLength(maxLength());
+    }
+}
+
+// Swaps in a UTF8FlexibleStringFieldSearcher the first time a query term asks
+// for a match mode that differs from the configured one. Applies only to the
+// string search methods and happens at most once per spec.
+void
+FieldSearchSpec::reconfig(const QueryTerm & term)
+{
+    if (_reconfigured) {
+        return;
+    }
+    using Searchmethod = VsmfieldsConfig::Fieldspec::Searchmethod;
+    const bool isStringSearch = (_searchMethod == Searchmethod::NONE) ||
+                                (_searchMethod == Searchmethod::AUTOUTF8) ||
+                                (_searchMethod == Searchmethod::UTF8) ||
+                                (_searchMethod == Searchmethod::SSE2UTF8);
+    if (!isStringSearch) {
+        return;
+    }
+    const bool termNeedsOtherMode = (term.isSubstring() && _arg1 != "substring") ||
+                                    (term.isSuffix() && _arg1 != "suffix") ||
+                                    (term.isExactstring() && _arg1 != "exact") ||
+                                    (term.isPrefix() && _arg1 == "suffix");
+    if (!termNeedsOtherMode) {
+        return;
+    }
+    _searcher = std::make_unique<UTF8FlexibleStringFieldSearcher>(id());
+    // preserve the basic match property of the searcher
+    setMatchType(_searcher, _arg1);
+    LOG(debug, "Reconfigured to use UTF8FlexibleStringFieldSearcher (%s) for field '%s' with id '%d'",
+        _searcher->prefix() ? "prefix" : "regular", name().c_str(), id());
+    _reconfigured = true;
+}
+
+// Writes the id and name of the spec, followed by a note when it has no searcher.
+vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpec & f)
+{
+    os << f._id;
+    os << ' ';
+    os << f._name;
+    os << ' ';
+    const bool hasSearcher = static_cast<bool>(f._searcher);
+    if (!hasSearcher) {
+        os << " No searcher defined.\n";
+    }
+    return os;
+}
+
+FieldSearchSpecMap::FieldSearchSpecMap() = default;
+
+FieldSearchSpecMap::~FieldSearchSpecMap() = default;
+
+namespace {
+    // Replacement texts and patterns used by stripNonFields().
+    // The names deliberately do not start with an underscore followed by an
+    // upper-case letter, since such identifiers are reserved for the C++ implementation.
+    const std::string G_empty("");
+    const std::string G_value(".value");
+    // map subscript with an unquoted alphanumeric key, e.g. {key}
+    const std::regex G_map1("\\{[a-zA-Z0-9]+\\}");
+    // map subscript with a quoted key, e.g. {"some key"}
+    // NOTE(review): '.*' is greedy, so two quoted subscripts in the same index
+    // are matched as one span - confirm whether that is intended.
+    const std::regex G_map2("\\{\".*\"\\}");
+    // array subscript, e.g. [7]
+    const std::regex G_array("\\[[0-9]+\\]");
+}
+
+/**
+ * Strips map and array subscripts from an index name.
+ * Each map subscript is replaced with ".value" and each array subscript is removed.
+ * Names containing neither '[' nor '{' are returned unchanged without running any regex.
+ *
+ * @param rawIndex index name as given in the query term
+ * @return the index name with subscripts stripped
+ **/
+vespalib::string FieldSearchSpecMap::stripNonFields(const vespalib::string & rawIndex)
+{
+    const bool hasSubscript = (rawIndex.find('[') != vespalib::string::npos) ||
+                              (rawIndex.find('{') != vespalib::string::npos);
+    if (!hasSubscript) {
+        return rawIndex;
+    }
+    std::string index = std::regex_replace(std::string(rawIndex), G_map1, G_value);
+    index = std::regex_replace(index, G_map2, G_value);
+    index = std::regex_replace(index, G_array, G_empty);
+    return index;
+}
+
+/**
+ * Adds a [field name, field id] entry to 'fieldsInQuery' for every field that is
+ * reachable from an index name used by a leaf term in 'query'.
+ *
+ * @param query         the query whose leaf terms are inspected
+ * @param fieldsInQuery output mapping that entries are added to
+ * @return false if some term's index name was not found for some document type, true otherwise
+ **/
+bool FieldSearchSpecMap::buildFieldsInQuery(const Query & query, StringFieldIdTMap & fieldsInQuery) const
+{
+    bool retval(true);
+    ConstQueryTermList qtl;
+    query.getLeafs(qtl);
+
+    for (const auto & term : qtl) {
+        // Both names depend only on the term, so compute them once per term
+        // instead of once per document type (stripNonFields may run regexes).
+        const vespalib::string rawIndex(term->index());
+        const vespalib::string index(stripNonFields(rawIndex));
+        for (const auto & dtm : documentTypeMap()) {
+            const IndexFieldMapT & fim = dtm.second;
+            IndexFieldMapT::const_iterator fIt = fim.find(index);
+            if (fIt != fim.end()) {
+                for(FieldIdT fid : fIt->second) {
+                    const FieldSearchSpec & spec = specMap().find(fid)->second;
+                    LOG(debug, "buildFieldsInQuery = rawIndex='%s', index='%s'", rawIndex.c_str(), index.c_str());
+                    if ((rawIndex != index) && (spec.name().find(index) == 0)) {
+                        // keep the subscripts from the raw index and append the
+                        // remainder of the field name after the stripped prefix
+                        vespalib::string modIndex(rawIndex);
+                        modIndex.append(spec.name().substr(index.size()));
+                        fieldsInQuery.add(modIndex, spec.id());
+                    } else {
+                        fieldsInQuery.add(spec.name(),spec.id());
+                    }
+                }
+            } else {
+                LOG(warning, "No valid indexes registered for index %s", term->index().c_str());
+                retval = false;
+            }
+        }
+    }
+    return retval;
+}
+
+// Registers each of the given extra field names in the name -> id map.
+void FieldSearchSpecMap::buildFromConfig(const std::vector<vespalib::string> & otherFieldsNeeded)
+{
+    for(size_t i(0), m(otherFieldsNeeded.size()); i < m; i++) {
+        // 'i' is a size_t, so it is printed with %zu
+        LOG(debug, "otherFieldsNeeded[%zu] = '%s'", i, otherFieldsNeeded[i].c_str());
+        _nameIdMap.add(otherFieldsNeeded[i]);
+    }
+}
+
+namespace {
+
+// Resolves the fields of index 'ci' to a list of field ids.
+// A field that names another index in 'indexes' (other than 'ci' itself) is
+// expanded recursively; any other field is looked up by name in 'specMap'.
+// Fields that cannot be resolved are logged and ignored.
+// NOTE(review): only a direct self reference is guarded against; whether the
+// config can contain longer index cycles is not visible here.
+FieldIdTList
+buildFieldSet(const VsmfieldsConfig::Documenttype::Index & ci, const FieldSearchSpecMapT & specMap,
+              const VsmfieldsConfig::Documenttype::IndexVector & indexes)
+{
+    // size() is unsigned, so it is printed with %zu
+    LOG(spam, "Index %s with %zu fields", ci.name.c_str(), ci.field.size());
+    FieldIdTList ifm;
+    for (const VsmfieldsConfig::Documenttype::Index::Field & cf : ci.field) {
+        LOG(spam, "Parsing field %s", cf.name.c_str());
+        auto foundIndex = std::find_if(indexes.begin(), indexes.end(),
+                                       [&cf](const auto & v) { return v.name == cf.name;});
+        if ((foundIndex != indexes.end()) && (cf.name != ci.name)) {
+            FieldIdTList sub = buildFieldSet(*foundIndex, specMap, indexes);
+            ifm.insert(ifm.end(), sub.begin(), sub.end());
+        } else {
+            auto foundField = std::find_if(specMap.begin(), specMap.end(),
+                                           [&cf](const auto & v) { return v.second.name() == cf.name;} );
+            if (foundField != specMap.end()) {
+                ifm.push_back(foundField->second.id());
+            } else {
+                LOG(warning, "Field %s not defined. Ignoring....", cf.name.c_str());
+            }
+        }
+    }
+    return ifm;
+}
+
+}
+
+// Populates the spec map and the name -> id map from the field specs in the
+// config (ids are assigned in config order), then builds the
+// index name -> field id list mapping for every document type.
+// Always returns true.
+bool FieldSearchSpecMap::buildFromConfig(const VsmfieldsHandle & conf)
+{
+    bool retval(true);
+    // size() is unsigned, so it is printed with %zu
+    LOG(spam, "Parsing %zu fields", conf->fieldspec.size());
+    for(const VsmfieldsConfig::Fieldspec & cfs : conf->fieldspec) {
+        LOG(spam, "Parsing %s", cfs.name.c_str());
+        FieldIdT fieldId = specMap().size();
+        FieldSearchSpec fss(fieldId, cfs.name, cfs.searchmethod, cfs.arg1.c_str(), cfs.maxlength);
+        _specMap[fieldId] = std::move(fss);
+        _nameIdMap.add(cfs.name, fieldId);
+        LOG(spam, "M in %d = %s", fieldId, cfs.name.c_str());
+    }
+
+    LOG(spam, "Parsing %zu document types", conf->documenttype.size());
+    for(const VsmfieldsConfig::Documenttype & di : conf->documenttype) {
+        IndexFieldMapT indexMapp;
+        LOG(spam, "Parsing document type %s with %zu indexes", di.name.c_str(), di.index.size());
+        for(const VsmfieldsConfig::Documenttype::Index & ci : di.index) {
+            indexMapp[ci.name] = buildFieldSet(ci, specMap(), di.index);
+        }
+        // the local map is not used again, so hand it over instead of copying it
+        _documentTypeMap[di.name] = std::move(indexMapp);
+    }
+    return retval;
+}
+
+// For every leaf term in the query, looks up the term's index name in each
+// document type and lets every field spec behind that index reconfigure
+// itself for the term.
+void
+FieldSearchSpecMap::reconfigFromQuery(const Query & query)
+{
+    ConstQueryTermList leafTerms;
+    query.getLeafs(leafTerms);
+
+    for (const auto & leaf : leafTerms) {
+        for (const auto & docType : documentTypeMap()) {
+            const auto & indexMap = docType.second;
+            IndexFieldMapT::const_iterator found = indexMap.find(leaf->index());
+            if (found == indexMap.end()) {
+                continue;
+            }
+            for (FieldIdT fieldId : found->second) {
+                _specMap.find(fieldId)->second.reconfig(*leaf);
+            }
+        }
+    }
+}
+
+// Strict ordering of searchers by the value returned from field().
+bool lesserField(const FieldSearcherContainer & a, const FieldSearcherContainer & b)
+{
+    const auto & lhsField = a->field();
+    const auto & rhsField = b->field();
+    return lhsField < rhsField;
+}
+
+// Rebuilds 'fieldSearcherMap' with one duplicated searcher per entry in
+// 'fieldsInQuery', sorted by the searchers' field().
+void FieldSearchSpecMap::buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap)
+{
+    fieldSearcherMap.clear();
+    for (const auto & nameAndId : fieldsInQuery) {
+        const FieldSearchSpec & spec = specMap().find(nameAndId.second)->second;
+        fieldSearcherMap.emplace_back(spec.searcher().duplicate());
+    }
+    std::sort(fieldSearcherMap.begin(), fieldSearcherMap.end(),
+              [](const FieldSearcherContainer & lhs, const FieldSearcherContainer & rhs) {
+                  return lhs->field() < rhs->field();
+              });
+}
+
+
+// Dumps the three mappings of the spec map in textual form:
+// document type -> index -> field ids, field id -> spec, and name -> id.
+vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & df)
+{
+    os << "DocumentTypeMap = \n";
+    for (const auto & docTypeEntry : df.documentTypeMap()) {
+        const auto & indexMap = docTypeEntry.second;
+        os << "DocType = " << docTypeEntry.first << "\n";
+        os << "IndexMap = \n";
+        for (const auto & indexEntry : indexMap) {
+            os << indexEntry.first << ": ";
+            for (FieldIdT fieldId : indexEntry.second) {
+                os << fieldId << ' ';
+            }
+            os << '\n';
+        }
+    }
+
+    os << "SpecMap = \n";
+    for (const auto & specEntry : df.specMap()) {
+        os << specEntry.first << " = " << specEntry.second << '\n';
+    }
+
+    os << "NameIdMap = \n" << df.nameIdMap();
+    return os;
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
new file mode 100644
index 00000000000..7b78a8634e0
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/fieldsearchspec.h
@@ -0,0 +1,98 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/vsm/searcher/fieldsearcher.h>
+#include <vespa/vsm/config/vsm-cfif.h>
+
+namespace vsm {
+
+/**
+ * Holds the search configuration for a single field (id, name, maximum length,
+ * search method and its argument) together with the field searcher created for it.
+ * Only move operations are declared; no copy operations are declared.
+ **/
+class FieldSearchSpec
+{
+public:
+ FieldSearchSpec();
+ FieldSearchSpec(const FieldIdT & id, const vespalib::string & name,
+ VsmfieldsConfig::Fieldspec::Searchmethod searchMethod,
+ const vespalib::string & arg1, size_t maxLength);
+ ~FieldSearchSpec();
+ FieldSearchSpec(FieldSearchSpec&& rhs) noexcept;
+ FieldSearchSpec& operator=(FieldSearchSpec&& rhs) noexcept;
+ // Dereferences the searcher without checking it; see valid().
+ const FieldSearcher & searcher() const { return *_searcher; }
+ const vespalib::string & name() const { return _name; }
+ FieldIdT id() const { return _id; }
+ // True when a searcher has been created for this spec.
+ bool valid() const { return static_cast<bool>(_searcher); }
+ size_t maxLength() const { return _maxLength; }
+
+ /**
+ * Reconfigures the field searcher based on information in the given query term.
+ **/
+ void reconfig(const search::streaming::QueryTerm & term);
+
+ friend vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpec & f);
+
+private:
+ FieldIdT _id;
+ vespalib::string _name;
+ size_t _maxLength;
+ FieldSearcherContainer _searcher;
+ VsmfieldsConfig::Fieldspec::Searchmethod _searchMethod;
+ // argument from the field spec config; compared against match type names such as "substring" and "suffix"
+ vespalib::string _arg1;
+ // set once reconfig() has replaced the searcher, so that it is replaced at most once
+ bool _reconfigured;
+};
+
+typedef std::map<FieldIdT, FieldSearchSpec> FieldSearchSpecMapT;
+
+/**
+ * Owns the mappings built from the vsmfields config: field id -> FieldSearchSpec,
+ * field name -> field id, and per document type index name -> list of field ids.
+ **/
+class FieldSearchSpecMap
+{
+public:
+ FieldSearchSpecMap();
+ ~FieldSearchSpecMap();
+
+ /**
+ * Iterates over all fields in the vsmfields config and creates a mapping from field id to FieldSearchSpec objects
+ * and a mapping from field name to field id. It then iterates over all document types and index names
+ * and creates a mapping from index name to list of field ids for each document type.
+ **/
+ bool buildFromConfig(const VsmfieldsHandle & conf);
+
+ /**
+ * Iterates over the given field name vector adding extra elements to the mapping from field name to field id.
+ **/
+ void buildFromConfig(const std::vector<vespalib::string> & otherFieldsNeeded);
+
+ /**
+ * Reconfigures some of the field searchers based on information in the given query.
+ **/
+ void reconfigFromQuery(const search::streaming::Query & query);
+
+ /**
+ * Adds a [field name, field id] entry to the given mapping for each field name used in the given query.
+ * This is achieved by mapping from query term index name -> list of field ids -> [field name, field id] pairs.
+ **/
+ bool buildFieldsInQuery(const search::streaming::Query & query, StringFieldIdTMap & fieldsInQuery) const;
+
+ /**
+ * Adds a [field name, field id] entry to the given mapping for each field name in the given vector.
+ **/
+ void buildFieldsInQuery(const std::vector<vespalib::string> & otherFieldsNeeded, StringFieldIdTMap & fieldsInQuery) const;
+
+ /**
+ * Adds a FieldSearcher object to the given field searcher map for each field name in the other map.
+ **/
+ void buildSearcherMap(const StringFieldIdTMapT & fieldsInQuery, FieldIdTSearcherMap & fieldSearcherMap);
+
+ const FieldSearchSpecMapT & specMap() const { return _specMap; }
+ //const IndexFieldMapT & indexMap() const { return _documentTypeMap.begin()->second; }
+ const DocumentTypeIndexFieldMapT & documentTypeMap() const { return _documentTypeMap; }
+ const StringFieldIdTMap & nameIdMap() const { return _nameIdMap; }
+ friend vespalib::asciistream & operator <<(vespalib::asciistream & os, const FieldSearchSpecMap & f);
+
+ /**
+ * Returns the given index name with map subscripts replaced by ".value" and array subscripts removed.
+ **/
+ static vespalib::string stripNonFields(const vespalib::string & rawIndex);
+
+private:
+ FieldSearchSpecMapT _specMap; // mapping from field id to field search spec
+ DocumentTypeIndexFieldMapT _documentTypeMap; // mapping from index name to field id list for each document type
+ StringFieldIdTMap _nameIdMap; // mapping from field name to field id
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.cpp b/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.cpp
new file mode 100644
index 00000000000..06b652d85e6
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.cpp
@@ -0,0 +1,45 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "flattendocsumwriter.h"
+#include <vespa/document/fieldvalue/fieldvalues.h>
+
+namespace vsm {
+
+// Appends the separator to the output, but only once a primitive value has
+// already been written.
+void
+FlattenDocsumWriter::considerSeparator()
+{
+    if (!_useSeparator) {
+        return;
+    }
+    _output.put(_separator.c_str(), _separator.size());
+}
+
+// Appends one primitive value to the output, preceded by the separator when
+// this is not the first value. Literals are written from their value
+// reference, numeric and bool values via getAsString(), everything else via
+// toString().
+void
+FlattenDocsumWriter::onPrimitive(uint32_t, const Content & c)
+{
+    considerSeparator();
+    const document::FieldValue & primitive = c.getValue();
+    if (primitive.isLiteral()) {
+        const auto & literal = static_cast<const document::LiteralFieldValueB &>(primitive);
+        const vespalib::stringref text = literal.getValueRef();
+        _output.put(text.data(), text.size());
+    } else {
+        const bool isNumberOrBool = primitive.isNumeric() ||
+                                    primitive.isA(document::FieldValue::Type::BOOL);
+        if (isNumberOrBool) {
+            const vespalib::string text = primitive.getAsString();
+            _output.put(text.data(), text.size());
+        } else {
+            const vespalib::string text = primitive.toString();
+            _output.put(text.data(), text.size());
+        }
+    }
+    // every later value must be preceded by the separator
+    _useSeparator = true;
+}
+
+// Starts with an output buffer constructed with argument 32, the given
+// separator, and no separator pending before the first value.
+FlattenDocsumWriter::FlattenDocsumWriter(const vespalib::string & separator)
+    : _output(32),
+      _separator(separator),
+      _useSeparator(false)
+{
+}
+
+FlattenDocsumWriter::~FlattenDocsumWriter() = default;
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.h b/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.h
new file mode 100644
index 00000000000..47c6f1e75d0
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/flattendocsumwriter.h
@@ -0,0 +1,36 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include <vespa/document/fieldvalue/fieldvalue.h>
+#include <vespa/document/fieldvalue/iteratorhandler.h>
+#include <vespa/vsm/common/charbuffer.h>
+
+namespace vsm {
+
+/**
+ * This class is used to flatten out and write a complex field value.
+ * A separator string is inserted between primitive field values.
+ **/
+class FlattenDocsumWriter : public document::fieldvalue::IteratorHandler {
+private:
+ // accumulated flattened output
+ CharBuffer _output;
+ // string written between two consecutive primitive values
+ vespalib::string _separator;
+ // false until the first primitive value has been written
+ bool _useSeparator;
+
+ void considerSeparator();
+ void onPrimitive(uint32_t, const Content & c) override;
+
+public:
+ // NOTE(review): not marked explicit, so a vespalib::string converts implicitly to a writer.
+ FlattenDocsumWriter(const vespalib::string & separator = " ");
+ ~FlattenDocsumWriter();
+ void setSeparator(const vespalib::string & separator) { _separator = separator; }
+ const CharBuffer & getResult() const { return _output; }
+ // Resets the output and separator state for reuse. Note that the separator
+ // is reset to " ", not to the separator given to the constructor.
+ void clear() {
+ _output.reset();
+ _separator = " ";
+ _useSeparator = false;
+ }
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/vsm/i_matching_elements_filler.h b/streamingvisitors/src/vespa/vsm/vsm/i_matching_elements_filler.h
new file mode 100644
index 00000000000..a35cea40cec
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/i_matching_elements_filler.h
@@ -0,0 +1,24 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <memory>
+
+namespace search {
+class MatchingElements;
+class MatchingElementsFields;
+}
+
+namespace vsm {
+
+/*
+ * Interface class for filling matching elements structure for
+ * streaming search.
+ */
+class IMatchingElementsFiller {
+public:
+ // Returns a newly created MatchingElements structure for the given fields; the caller takes ownership.
+ virtual std::unique_ptr<search::MatchingElements> fill_matching_elements(const search::MatchingElementsFields& fields) = 0;
+ // Virtual so that implementations can be destroyed through this interface.
+ virtual ~IMatchingElementsFiller() = default;
+};
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.cpp b/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.cpp
new file mode 100644
index 00000000000..5bc5798fb9d
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.cpp
@@ -0,0 +1,220 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "slimefieldwriter.h"
+#include <vespa/searchlib/util/slime_output_raw_buf_adapter.h>
+#include <vespa/vespalib/stllike/asciistream.h>
+#include <vespa/vespalib/util/size_literals.h>
+#include <vespa/searchsummary/docsummary/resultconfig.h>
+#include <vespa/document/datatype/positiondatatype.h>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".vsm.slimefieldwriter");
+
+namespace {
+
+// Renders a field path as its entry names joined by '.'.
+vespalib::string
+toString(const vsm::FieldPath & fieldPath)
+{
+    vespalib::asciistream joined;
+    const size_t numEntries = fieldPath.size();
+    for (size_t pos = 0; pos != numEntries; ++pos) {
+        const bool isFirst = (pos == 0);
+        if (!isFirst) {
+            joined << ".";
+        }
+        joined << fieldPath[pos].getName();
+    }
+    return joined.str();
+}
+
+// Renders a path given as a list of names as the names joined by '.'.
+vespalib::string
+toString(const std::vector<vespalib::string> & fieldPath)
+{
+    vespalib::asciistream joined;
+    bool isFirst = true;
+    for (const vespalib::string & name : fieldPath) {
+        if (!isFirst) {
+            joined << ".";
+        }
+        joined << name;
+        isFirst = false;
+    }
+    return joined.str();
+}
+
+} // namespace <unnamed>
+
+using namespace vespalib::slime::convenience;
+
+
+namespace vsm {
+
+// Recursively converts the field value 'fv' to Slime, inserting the result
+// through 'inserter':
+//  - array        -> Slime array with one entry per element
+//  - weighted set -> Slime array of objects with "item" and "weight"
+//  - map          -> Slime array of objects with "key" and "value"
+//  - struct       -> Slime object with one member per explored field
+//  - primitive    -> string, long, double or bool
+// _currPath tracks the path of struct field names (and "value" for map
+// values) from the root to the value being traversed; it is used by
+// explorePath() to decide which struct fields to include.
+void
+SlimeFieldWriter::traverseRecursive(const document::FieldValue & fv, Inserter &inserter)
+{
+ LOG(debug, "traverseRecursive: class(%s), fieldValue(%s), currentPath(%s)",
+ fv.className(), fv.toString().c_str(), toString(_currPath).c_str());
+
+ if (fv.isCollection()) {
+ const document::CollectionFieldValue & cfv = static_cast<const document::CollectionFieldValue &>(fv);
+ if (cfv.isA(document::FieldValue::Type::ARRAY)) {
+ const document::ArrayFieldValue & afv = static_cast<const document::ArrayFieldValue &>(cfv);
+ Cursor &a = inserter.insertArray();
+ for (size_t i = 0; i < afv.size(); ++i) {
+ const document::FieldValue & nfv = afv[i];
+ ArrayInserter ai(a);
+ traverseRecursive(nfv, ai);
+ }
+ } else {
+ // a collection that is not an array is expected to be a weighted set
+ assert(cfv.isA(document::FieldValue::Type::WSET));
+ const document::WeightedSetFieldValue & wsfv = static_cast<const document::WeightedSetFieldValue &>(cfv);
+ Cursor &a = inserter.insertArray();
+ // resolve the member names once, outside the loop
+ Symbol isym = a.resolve("item");
+ Symbol wsym = a.resolve("weight");
+ for (const auto &entry : wsfv) {
+ Cursor &o = a.addObject();
+ const document::FieldValue & nfv = *entry.first;
+ ObjectSymbolInserter oi(o, isym);
+ traverseRecursive(nfv, oi);
+ // the weight is cast to IntFieldValue without a type check
+ int weight = static_cast<const document::IntFieldValue &>(*entry.second).getValue();
+ o.setLong(wsym, weight);
+ }
+ }
+ } else if (fv.isA(document::FieldValue::Type::MAP)) {
+ const document::MapFieldValue & mfv = static_cast<const document::MapFieldValue &>(fv);
+ Cursor &a = inserter.insertArray();
+ Symbol keysym = a.resolve("key");
+ Symbol valsym = a.resolve("value");
+ for (const auto &entry : mfv) {
+ Cursor &o = a.addObject();
+ ObjectSymbolInserter ki(o, keysym);
+ traverseRecursive(*entry.first, ki);
+ // the map value is traversed with "value" appended to the current path
+ _currPath.push_back("value");
+ ObjectSymbolInserter vi(o, valsym);
+ traverseRecursive(*entry.second, vi);
+ _currPath.pop_back();
+ }
+ } else if (fv.isStructured()) {
+ const document::StructuredFieldValue & sfv = static_cast<const document::StructuredFieldValue &>(fv);
+ Cursor &o = inserter.insertObject();
+ // Position structs are written as {lat, lng} when V8 geo positions are
+ // wanted; otherwise, or if that fails, they are written as a regular struct.
+ if (sfv.getDataType() == &document::PositionDataType::getInstance()
+ && search::docsummary::ResultConfig::wantedV8geoPositions())
+ {
+ bool ok = true;
+ try {
+ // int min is used as the 'not seen' marker for both coordinates
+ int x = std::numeric_limits<int>::min();
+ int y = std::numeric_limits<int>::min();
+ for (const document::Field & entry : sfv) {
+ document::FieldValue::UP fval(sfv.getValue(entry));
+ if (entry.getName() == "x") {
+ x = fval->getAsInt();
+ } else if (entry.getName() == "y") {
+ y = fval->getAsInt();
+ } else {
+ // any field other than x and y disables the lat/lng form
+ ok = false;
+ }
+ }
+ if (x == std::numeric_limits<int>::min()) ok = false;
+ if (y == std::numeric_limits<int>::min()) ok = false;
+ if (ok) {
+ // y and x divided by 1.0e6 are written as "lat" and "lng"
+ o.setDouble("lat", double(y) / 1.0e6);
+ o.setDouble("lng", double(x) / 1.0e6);
+ return;
+ }
+ } catch (std::exception &e) {
+ (void)e;
+ // fallback to code below
+ }
+ }
+ for (const document::Field & entry : sfv) {
+ // only fields accepted by explorePath() are written
+ if (explorePath(entry.getName())) {
+ _currPath.push_back(entry.getName());
+ Memory keymem(entry.getName());
+ ObjectInserter oi(o, keymem);
+ document::FieldValue::UP fval(sfv.getValue(entry));
+ traverseRecursive(*fval, oi);
+ _currPath.pop_back();
+ }
+ }
+ } else {
+ if (fv.isLiteral()) {
+ const document::LiteralFieldValueB & lfv = static_cast<const document::LiteralFieldValueB &>(fv);
+ inserter.insertString(lfv.getValueRef());
+ } else if (fv.isNumeric()) {
+ // integer types are written as long, float and double as double,
+ // any other numeric type as its string representation
+ switch (fv.getDataType()->getId()) {
+ case document::DataType::T_BYTE:
+ case document::DataType::T_SHORT:
+ case document::DataType::T_INT:
+ case document::DataType::T_LONG:
+ inserter.insertLong(fv.getAsLong());
+ break;
+ case document::DataType::T_DOUBLE:
+ inserter.insertDouble(fv.getAsDouble());
+ break;
+ case document::DataType::T_FLOAT:
+ inserter.insertDouble(fv.getAsFloat());
+ break;
+ default:
+ inserter.insertString(fv.getAsString());
+ }
+ } else if (fv.isA(document::FieldValue::Type::BOOL)) {
+ const auto & bfv = static_cast<const document::BoolFieldValue &>(fv);
+ inserter.insertBool(bfv.getValue());
+ } else {
+ inserter.insertString(fv.toString());
+ }
+ }
+}
+
+// Decides whether the struct field 'candidate', found at the current path,
+// should be written. Without input fields everything is explored. Otherwise
+// the current path must be a prefix of some input field path, and either the
+// two are equally long or the next entry of that input path is 'candidate'.
+bool
+SlimeFieldWriter::explorePath(vespalib::stringref candidate)
+{
+    if (_inputFields == nullptr) {
+        return true;
+    }
+    const size_t depth = _currPath.size();
+    for (size_t i = 0; i < _inputFields->size(); ++i) {
+        const FieldPath & inputPath = (*_inputFields)[i].getPath();
+        if (depth > inputPath.size()) {
+            continue; // the current path is already longer than this input path
+        }
+        bool isPrefix = true;
+        for (size_t j = 0; j < depth; ++j) {
+            if (!(inputPath[j].getName() == _currPath[j])) {
+                isPrefix = false;
+                break;
+            }
+        }
+        if (!isPrefix) {
+            continue;
+        }
+        if (depth == inputPath.size()) {
+            return true;
+        }
+        if (inputPath[depth].getName() == candidate) {
+            // the current path matches one of the input field paths
+            return true;
+        }
+    }
+    return false;
+}
+
+// Starts with a raw buffer constructed with 4_Ki, an empty Slime object,
+// no input field restriction and an empty current path.
+SlimeFieldWriter::SlimeFieldWriter()
+    : _rbuf(4_Ki),
+      _slime(),
+      _inputFields(nullptr),
+      _currPath()
+{ }
+
+SlimeFieldWriter::~SlimeFieldWriter() = default;
+
+// Converts 'fv' to Slime and encodes it in binary format into the internal
+// raw buffer; the encoded bytes are available through out().
+void
+SlimeFieldWriter::convert(const document::FieldValue & fv)
+{
+    // only walk the input fields when debug logging is enabled
+    if (LOG_WOULD_LOG(debug)) {
+        if (_inputFields != nullptr) {
+            for (size_t i = 0; i < _inputFields->size(); ++i) {
+                // 'i' is a size_t, so it is printed with %zu
+                LOG(debug, "write: input field path [%zu] '%s'", i, toString((*_inputFields)[i].getPath()).c_str());
+            }
+        } else {
+            LOG(debug, "write: no input fields");
+        }
+    }
+    SlimeInserter inserter(_slime);
+    traverseRecursive(fv, inserter);
+    search::SlimeOutputRawBufAdapter adapter(_rbuf);
+    vespalib::slime::BinaryFormat::encode(_slime, adapter);
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.h b/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.h
new file mode 100644
index 00000000000..b5adac8985f
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/slimefieldwriter.h
@@ -0,0 +1,57 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "docsumfieldspec.h"
+#include <vespa/vsm/common/storagedocument.h>
+#include <vespa/document/fieldvalue/fieldvalues.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/searchlib/util/rawbuf.h>
+
+namespace vsm {
+
+/**
+ * This class is used to write a field value as slime binary data.
+ * If only a subset of the field value should be written this subset
+ * is specified using the setInputFields() function.
+ **/
+class SlimeFieldWriter
+{
+private:
+ search::RawBuf _rbuf;
+ vespalib::Slime _slime;
+ const DocsumFieldSpec::FieldIdentifierVector * _inputFields;
+ std::vector<vespalib::string> _currPath;
+
+ void traverseRecursive(const document::FieldValue & fv, vespalib::slime::Inserter & inserter);
+ bool explorePath(vespalib::stringref candidate);
+
+public:
+ SlimeFieldWriter();
+ ~SlimeFieldWriter();
+
+
+ /**
+ * Specifies the subset of the field value that should be written.
+ **/
+ void setInputFields(const DocsumFieldSpec::FieldIdentifierVector & inputFields) { _inputFields = &inputFields; }
+
+ /**
+ * Convert the given field value
+ **/
+ void convert(const document::FieldValue & fv);
+
+ /**
+ * Return a reference to the output binary data
+ **/
+ vespalib::stringref out() const {
+ return vespalib::stringref(_rbuf.GetDrainPos(), _rbuf.GetUsedLen());
+ }
+
+ void clear() {
+ _rbuf.Reuse();
+ _inputFields = nullptr;
+ _currPath.clear();
+ }
+};
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.cpp b/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.cpp
new file mode 100644
index 00000000000..127302311f9
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.cpp
@@ -0,0 +1,136 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "snippetmodifier.h"
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/vespalib/stllike/hash_map.hpp>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".vsm.snippetmodifier");
+
+using namespace document;
+using search::streaming::QueryTerm;
+using search::streaming::QueryTermList;
+ // Maps a field id to the query terms that search that field.
+ using FieldQueryTermMap = vespalib::hash_map<vsm::FieldIdT, QueryTermList>;
+
+namespace {
+
+ // Registers the query term for the given field id, unless that exact term is
+ // already registered for the field.
+ void
+ addIfNotPresent(FieldQueryTermMap & map, vsm::FieldIdT fId, QueryTerm * qt)
+ {
+     // operator[] creates an empty term list the first time a field id is seen
+     QueryTermList & terms = map[fId];
+     const bool alreadyPresent = (std::find(terms.begin(), terms.end(), qt) != terms.end());
+     if (!alreadyPresent) {
+         terms.push_back(qt);
+     }
+ }
+
+}
+
+namespace vsm {
+
+ // Appends the group separator to the value buffer when a primitive value has
+ // already been written since the last reset().
+ void
+ SnippetModifier::considerSeparator()
+ {
+     if (!_useSep) {
+         return;
+     }
+     _valueBuf->put(_groupSep);
+ }
+
+ // Runs the searcher on one primitive value and appends the searcher's
+ // modified buffer to the value buffer, preceded by a group separator if
+ // this is not the first primitive value.
+ void
+ SnippetModifier::onPrimitive(uint32_t, const Content & c)
+ {
+     considerSeparator();
+     _searcher->onValue(c.getValue());
+     const auto & modified = _searcher->getModifiedBuf();
+     _valueBuf->put(modified.getBuffer(), modified.getPos());
+     _useSep = true;
+ }
+
+ // Prepares for a new field value: no separator is pending and the value
+ // buffer is reset.
+ void
+ SnippetModifier::reset()
+ {
+     _useSep = false;
+     _valueBuf->reset();
+ }
+
+
+ // Creates a modifier with its own value buffer, constructed as CharBuffer(32).
+ // All other members are set up by the two-argument constructor.
+ SnippetModifier::SnippetModifier(const UTF8SubstringSnippetModifier::SP & searcher)
+     : SnippetModifier(searcher, CharBuffer::SP(new CharBuffer(32)))
+ {
+ }
+
+ // Creates a modifier that writes into the given (shared) value buffer.
+ // '\x1E' is used as group separator between primitive values.
+ SnippetModifier::SnippetModifier(const UTF8SubstringSnippetModifier::SP & searcher, const CharBuffer::SP & valueBuf)
+     : _searcher(searcher),
+       _valueBuf(valueBuf),
+       _groupSep('\x1E'),
+       _useSep(false),
+       _empty()
+ {
+ }
+
+ SnippetModifier::~SnippetModifier() = default;
+
+ // Iterates the field value along the given path, collecting the modified
+ // primitive values in the value buffer, and returns the buffer content as
+ // a new string field value.
+ FieldValue::UP
+ SnippetModifier::modify(const FieldValue & fv, const document::FieldPath & path)
+ {
+     reset();
+     fv.iterateNested(path, *this);
+     vespalib::string modified(_valueBuf->getBuffer(), _valueBuf->getPos());
+     return std::make_unique<StringFieldValue>(modified);
+ }
+
+
+ // Creates the manager together with the buffers that are shared between all
+ // snippet modifiers created in setup().
+ SnippetModifierManager::SnippetModifierManager()
+     : _modifiers(),
+       _searchBuf(new SearcherBuf(64)),
+       _searchModifyBuf(new CharBuffer(64)),
+       // note: this creates a vector holding 64 zero-initialized offsets
+       _searchOffsetBuf(new std::vector<size_t>(64)),
+       _modifierBuf(new CharBuffer(128))
+ {
+ }
+
+ SnippetModifierManager::~SnippetModifierManager() = default;
+
+ // Creates a snippet modifier for every field that is searched with substring
+ // matching (either because the field's searcher is a substring searcher or
+ // because the query term itself is a substring term), and prepares all
+ // modifiers with the query terms that apply to their field.
+ //
+ // queryTerms: the query terms to take into consideration.
+ // specMap:    mapping from field id to search spec.
+ // indexMap:   mapping from index name (as used in the query) to field ids.
+ void
+ SnippetModifierManager::setup(const QueryTermList & queryTerms,
+                               const FieldSearchSpecMapT & specMap,
+                               const IndexFieldMapT & indexMap)
+ {
+     FieldQueryTermMap fqtm;
+
+     // setup modifiers
+     for (QueryTerm * qt : queryTerms) {
+         auto indexItr = indexMap.find(qt->index());
+         if (indexItr == indexMap.end()) {
+             continue; // the term searches an index without any fields
+         }
+         for (FieldIdT fId : indexItr->second) {
+             auto specItr = specMap.find(fId);
+             if (specItr == specMap.end()) {
+                 // never dereference end(): skip fields that have no search spec
+                 LOG(warning, "No search spec found for field id '%u', skipping snippet modifier setup", fId);
+                 continue;
+             }
+             const FieldSearchSpec & spec = specItr->second;
+             if (spec.searcher().substring() || qt->isSubstring()) { // we need a modifier for this field id
+                 addIfNotPresent(fqtm, fId, qt);
+                 if (_modifiers.getModifier(fId) == nullptr) {
+                     LOG(debug, "Create snippet modifier for field id '%u'", fId);
+                     UTF8SubstringSnippetModifier::SP searcher
+                         (new UTF8SubstringSnippetModifier(fId, _searchModifyBuf, _searchOffsetBuf));
+                     _modifiers.map()[fId] = std::make_unique<SnippetModifier>(searcher, _modifierBuf);
+                 }
+             }
+         }
+     }
+
+     // prepare modifiers
+     for (auto & entry : _modifiers.map()) {
+         FieldIdT fId = entry.first;
+         // every modifier in the map is created above as a SnippetModifier
+         SnippetModifier & smod = static_cast<SnippetModifier &>(*entry.second);
+         // fqtm[fId] yields an empty term list for modifiers without terms in this query
+         smod.getSearcher()->prepare(fqtm[fId], _searchBuf);
+     }
+ }
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.h b/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.h
new file mode 100644
index 00000000000..4718ab8783a
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/snippetmodifier.h
@@ -0,0 +1,110 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#pragma once
+
+#include "fieldsearchspec.h"
+#include <vespa/vsm/common/charbuffer.h>
+#include <vespa/vsm/common/document.h>
+#include <vespa/vsm/common/fieldmodifier.h>
+#include <vespa/vsm/searcher/utf8substringsnippetmodifier.h>
+#include <vespa/document/fieldvalue/fieldvalue.h>
+#include <vespa/document/fieldvalue/iteratorhandler.h>
+
+namespace vsm {
+
+/**
+ * This class is responsible for modifying field values where we have substring search and that are used
+ * as input to snippet generation.
+ *
+ * The class implements the FieldModifier interface to modify field values, and the IteratorHandler interface
+ * to traverse complex field values. Primitive field values are passed to the underlying searcher that is
+ * responsible for modifying the field value by inserting unit separators before and after matches.
+ * A group separator is inserted between primitive field values the same way as done by FlattenDocsumWriter.
+ **/
+class SnippetModifier : public FieldModifier, public document::fieldvalue::IteratorHandler
+{
+private:
+ UTF8SubstringSnippetModifier::SP _searcher;
+ CharBuffer::SP _valueBuf; // buffer to store the final modified field value
+ char _groupSep;
+ bool _useSep;
+ document::FieldPath _empty;
+
+ void considerSeparator();
+ // Inherrit doc from document::FieldValue::IteratorHandler
+ void onPrimitive(uint32_t, const Content & c) override;
+ void reset();
+
+public:
+ /**
+ * Creates a new instance.
+ *
+ * @param searcher the searcher used to modify primitive field values.
+ **/
+ SnippetModifier(const UTF8SubstringSnippetModifier::SP & searcher);
+
+ /**
+ * Creates a new instance.
+ *
+ * @param searcher the searcher used to modify primitive field values.
+ * @param valueBuf the shared buffer used to store the final modified field value.
+ **/
+ SnippetModifier(const UTF8SubstringSnippetModifier::SP & searcher, const CharBuffer::SP & valueBuf);
+
+ ~SnippetModifier();
+
+ /**
+ * Modifies the complete given field value.
+ **/
+ document::FieldValue::UP modify(const document::FieldValue & fv) override {
+ return modify(fv, _empty);
+ }
+
+ /**
+ * Modifies the given field value by passing all primitive field values to the searcher and
+ * inserting group separators between them. A string field value is returned.
+ * The iterating of the field value is limited by the given field path.
+ *
+ * @param fv the field value to modify.
+ * @param path the field path used to iterate the field value.
+ * @return the new modified field value.
+ **/
+ document::FieldValue::UP modify(const document::FieldValue & fv,
+ const document::FieldPath & path) override;
+
+ const CharBuffer & getValueBuf() const { return *_valueBuf; }
+ const UTF8SubstringSnippetModifier::SP & getSearcher() const { return _searcher; }
+};
+
+/**
+ * This class manages a set of snippet modifiers.
+ * The modifiers are instantiated and prepared in the setup function.
+ * This class also holds shared buffers that are used by the modifiers.
+ **/
+class SnippetModifierManager
+{
+private:
+ FieldModifierMap _modifiers;
+ SharedSearcherBuf _searchBuf;
+ CharBuffer::SP _searchModifyBuf;
+ SharedOffsetBuffer _searchOffsetBuf;
+ CharBuffer::SP _modifierBuf;
+
+public:
+ SnippetModifierManager();
+ ~SnippetModifierManager();
+
+ /**
+ * Setups snippet modifiers for all fields where we have substring search.
+ *
+ * @param queryTerms the query terms to take into consideration.
+ * @param specMap mapping from field id to search spec objects.
+ * @param fieldMap mapping from index (used in the query) to a list of field ids.
+ **/
+ void setup(const search::streaming::QueryTermList & queryTerms,
+ const FieldSearchSpecMapT & specMap, const IndexFieldMapT & fieldMap);
+
+ const FieldModifierMap & getModifiers() const { return _modifiers; }
+};
+
+}
+
diff --git a/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.cpp b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.cpp
new file mode 100644
index 00000000000..5507532d4f3
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.cpp
@@ -0,0 +1,194 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "vsm-adapter.hpp"
+#include "docsumconfig.h"
+#include "i_matching_elements_filler.h"
+#include <vespa/searchlib/common/matching_elements.h>
+
+#include <vespa/log/log.h>
+LOG_SETUP(".vsm.vsm-adapter");
+
+using search::docsummary::ResConfigEntry;
+using search::docsummary::KeywordExtractor;
+using search::MatchingElements;
+using config::ConfigSnapshot;
+
+namespace vsm {
+
+ // Creates a callback without summary features, rank features or a matching
+ // elements filler.
+ GetDocsumsStateCallback::GetDocsumsStateCallback()
+     : _summaryFeatures(),
+       _rankFeatures(),
+       _matching_elements_filler()
+ {
+ }
+
+ // Hands the summary features (if any have been set) over to the docsum state
+ // and marks them as cached.
+ void GetDocsumsStateCallback::FillSummaryFeatures(GetDocsumsState * state, IDocsumEnvironment * env)
+ {
+     (void) env;
+     if (!_summaryFeatures) {
+         return; // no summary features to write to the docsum
+     }
+     state->_summaryFeatures = _summaryFeatures;
+     state->_summaryFeaturesCached = true;
+ }
+
+ // Hands the rank features (if any have been set) over to the docsum state.
+ void GetDocsumsStateCallback::FillRankFeatures(GetDocsumsState * state, IDocsumEnvironment * env)
+ {
+     (void) env;
+     if (!_rankFeatures) {
+         return; // no rank features to write to the docsum
+     }
+     state->_rankFeatures = _rankFeatures;
+ }
+
+ // Intentionally a no-op: neither argument is used.
+ void GetDocsumsStateCallback::FillDocumentLocations(GetDocsumsState *state, IDocsumEnvironment * env)
+ {
+     static_cast<void>(state);
+     static_cast<void>(env);
+ }
+
+ // Delegates to the matching elements filler when one has been set;
+ // otherwise a default constructed MatchingElements is returned.
+ std::unique_ptr<MatchingElements>
+ GetDocsumsStateCallback::fill_matching_elements(const search::MatchingElementsFields& fields)
+ {
+     if (!_matching_elements_filler) {
+         return std::make_unique<MatchingElements>();
+     }
+     return _matching_elements_filler->fill_matching_elements(fields);
+ }
+
+ // Takes ownership of the given filler, replacing any previously set filler.
+ void
+ GetDocsumsStateCallback::set_matching_elements_filler(std::unique_ptr<IMatchingElementsFiller> matching_elements_filler)
+ {
+     auto & current = _matching_elements_filler;
+     current = std::move(matching_elements_filler);
+ }
+
+ // Defined in this file, where i_matching_elements_filler.h is included; the header
+ // only forward declares IMatchingElementsFiller, which is held in a std::unique_ptr.
+ GetDocsumsStateCallback::~GetDocsumsStateCallback() = default;
+
+ // Creates a spec without output name and input names, using command NONE.
+ DocsumTools::FieldSpec::FieldSpec()
+     : _outputName(),
+       _inputNames(),
+       _command(VsmsummaryConfig::Fieldmap::Command::NONE)
+ {
+ }
+
+ DocsumTools::FieldSpec::~FieldSpec() = default;
+
+ // Takes ownership of the docsum writer. Juniper, the result class and the
+ // field specs are set later (see setJuniper() and obtainFieldNames()).
+ DocsumTools::DocsumTools(std::unique_ptr<DynamicDocsumWriter> writer)
+     : _writer(std::move(writer)),
+       _juniper(),
+       _resultClass(),
+       _fieldSpecs()
+ {
+ }
+
+ DocsumTools::~DocsumTools() = default;
+
+ // Looks up the result class named by the output class of the given vsmsummary
+ // config and creates one field spec per entry in that class. The input names
+ // and command of a field spec are taken from the matching fieldmap entry in
+ // the config; entries without a fieldmap use themselves as input.
+ //
+ // Returns false when no config is given. Returns true otherwise, also when
+ // the result class could not be located (a warning is logged in that case).
+ bool
+ DocsumTools::obtainFieldNames(const FastS_VsmsummaryHandle &cfg)
+ {
+     if (!cfg) {
+         // the output class name is required to locate the result class,
+         // so the config must be checked before it is dereferenced
+         LOG(warning, "could not obtain field names: missing vsmsummary config");
+         return false;
+     }
+     uint32_t defaultSummaryId = getResultConfig()->LookupResultClassId(cfg->outputclass);
+     _resultClass = getResultConfig()->LookupResultClass(defaultSummaryId);
+     if (_resultClass == nullptr) {
+         LOG(warning, "could not locate result class: '%s'", cfg->outputclass.c_str());
+         return true;
+     }
+     for (uint32_t i = 0; i < _resultClass->GetNumEntries(); ++i) {
+         const ResConfigEntry * entry = _resultClass->GetEntry(i);
+         _fieldSpecs.emplace_back();
+         FieldSpec & fieldSpec = _fieldSpecs.back();
+         fieldSpec.setOutputName(entry->_bindname);
+         bool found = false;
+         // check if we have this summary field in the vsmsummary config
+         for (uint32_t j = 0; j < cfg->fieldmap.size() && !found; ++j) {
+             const auto & mapping = cfg->fieldmap[j];
+             if (entry->_bindname == mapping.summary.c_str()) {
+                 for (uint32_t k = 0; k < mapping.document.size(); ++k) {
+                     fieldSpec.getInputNames().push_back(mapping.document[k].field);
+                 }
+                 fieldSpec.setCommand(mapping.command);
+                 found = true;
+             }
+         }
+         if (!found) {
+             // use yourself as input
+             fieldSpec.getInputNames().push_back(entry->_bindname);
+         }
+     }
+     return true;
+ }
+
+ // (Re-)configures the adapter from the given config snapshot: the fields
+ // config is latched, and a new set of docsum tools (result config, keyword
+ // extractor, docsum writer and juniper) is built and latched.
+ // The whole operation is serialized by the adapter lock.
+ //
+ // Throws std::runtime_error when the summary config cannot be read or the
+ // field names cannot be obtained.
+ void
+ VSMAdapter::configure(const VSMConfigSnapshot & snapshot)
+ {
+     std::lock_guard guard(_lock);
+     LOG(debug, "(re-)configure VSM (docsum tools)");
+
+     std::shared_ptr<SummaryConfig>    summary(snapshot.getConfig<SummaryConfig>());
+     std::shared_ptr<SummarymapConfig> summaryMap(snapshot.getConfig<SummarymapConfig>());
+     std::shared_ptr<VsmsummaryConfig> vsmSummary(snapshot.getConfig<VsmsummaryConfig>());
+     std::shared_ptr<JuniperrcConfig>  juniperrc(snapshot.getConfig<JuniperrcConfig>());
+
+     _fieldsCfg.set(snapshot.getConfig<VsmfieldsConfig>().release());
+     _fieldsCfg.latch();
+
+     // all sizes below are size_t, which requires the unsigned %zu conversion (not %zd)
+     LOG(debug, "configureFields(): Size of cfg fieldspec: %zu", _fieldsCfg.get()->fieldspec.size());
+     LOG(debug, "configureFields(): Size of cfg documenttype: %zu", _fieldsCfg.get()->documenttype.size());
+     LOG(debug, "configureSummary(): Size of cfg classes: %zu", summary->classes.size());
+     LOG(debug, "configureSummaryMap(): Size of cfg override: %zu", summaryMap->override.size());
+     LOG(debug, "configureVsmSummary(): Size of cfg fieldmap: %zu", vsmSummary->fieldmap.size());
+     LOG(debug, "configureVsmSummary(): outputclass='%s'", vsmSummary->outputclass.c_str());
+
+     // init result config
+     auto resCfg = std::make_unique<ResultConfig>();
+     if ( ! resCfg->ReadConfig(*summary, _configId.c_str())) {
+         throw std::runtime_error("(re-)configuration of VSM (docsum tools) failed due to bad summary config");
+     }
+
+     // init keyword extractor
+     auto kwExtractor = std::make_unique<KeywordExtractor>(nullptr);
+     kwExtractor->AddLegalIndexSpec(_highlightindexes.c_str());
+     vespalib::string spec = kwExtractor->GetLegalIndexSpec();
+     LOG(debug, "index highlight spec: '%s'", spec.c_str());
+
+     // create dynamic docsum writer (ownership of both raw pointers is handed over)
+     auto writer = std::make_unique<DynamicDocsumWriter>(resCfg.release(), kwExtractor.release());
+
+     // configure juniper (used when configuring DynamicDocsumConfig)
+     // NOTE(review): this replaces the properties object that the previously created
+     // Juniper instance was constructed with - confirm nothing still refers to the old one.
+     _juniperProps = std::make_unique<JuniperProperties>(*juniperrc);
+     auto juniper = std::make_unique<juniper::Juniper>(_juniperProps.get(), &_wordFolder);
+
+     // create new docsum tools
+     auto docsumTools = std::make_unique<DocsumTools>(std::move(writer));
+     docsumTools->setJuniper(std::move(juniper));
+
+     // configure dynamic docsum writer
+     DynamicDocsumConfig dynDocsumConfig(docsumTools.get(), docsumTools->getDocsumWriter(), _fieldsCfg.get());
+     dynDocsumConfig.configure(*summaryMap);
+
+     // configure new docsum tools
+     if ( ! docsumTools->obtainFieldNames(vsmSummary)) {
+         throw std::runtime_error("(re-)configuration of VSM (docsum tools) failed");
+     }
+     // latch new docsum tools into production
+     _docsumTools.set(docsumTools.release());
+     _docsumTools.latch();
+ }
+
+ // Stores the config id together with a private copy of the given snapshot.
+ VSMConfigSnapshot::VSMConfigSnapshot(const vespalib::string & configId, const config::ConfigSnapshot & snapshot)
+     : _configId(configId),
+       _snapshot(std::make_unique<config::ConfigSnapshot>(snapshot))
+ {
+ }
+
+ VSMConfigSnapshot::~VSMConfigSnapshot() = default;
+
+ // Creates an unconfigured adapter; configure() must be called before the
+ // fields config and docsum tools hold anything.
+ // The word folder is stored by reference.
+ VSMAdapter::VSMAdapter(const vespalib::string & highlightindexes, const vespalib::string & configId, Fast_WordFolder & wordFolder)
+     : _highlightindexes(highlightindexes),
+       _configId(configId),
+       _wordFolder(wordFolder),
+       _fieldsCfg(),
+       _docsumTools(),
+       _juniperProps(),
+       _lock()
+ {
+ }
+
+ VSMAdapter::~VSMAdapter() = default;
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.h b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.h
new file mode 100644
index 00000000000..6484269353b
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.h
@@ -0,0 +1,132 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchlib/query/base.h>
+#include <vespa/vsm/config/vsm-cfif.h>
+#include <vespa/config-summary.h>
+#include <vespa/config-summarymap.h>
+#include <vespa/searchlib/common/featureset.h>
+#include <vespa/searchsummary/docsummary/docsumwriter.h>
+#include <vespa/searchsummary/docsummary/docsumstate.h>
+#include <vespa/searchsummary/docsummary/idocsumenvironment.h>
+#include <vespa/juniper/rpinterface.h>
+
+using search::docsummary::ResultConfig;
+using search::docsummary::ResultClass;
+using search::docsummary::IDocsumWriter;
+using search::docsummary::DynamicDocsumWriter;
+using search::docsummary::GetDocsumsState;
+using search::docsummary::IDocsumEnvironment;
+using search::docsummary::JuniperProperties;
+
+using vespa::config::search::SummaryConfig;
+using vespa::config::search::SummarymapConfig;
+using vespa::config::search::summary::JuniperrcConfig;
+
+namespace config { class ConfigSnapshot; }
+namespace vsm {
+
+class IMatchingElementsFiller;
+
+ /**
+ * Callback used to fill summary features, rank features and matching elements
+ * into the docsum state. The features and the matching elements filler must be
+ * set on the callback up front; features that are not set are left untouched.
+ **/
+ class GetDocsumsStateCallback : public search::docsummary::GetDocsumsStateCallback
+ {
+ private:
+ search::FeatureSet::SP _summaryFeatures;
+ search::FeatureSet::SP _rankFeatures;
+ std::unique_ptr<IMatchingElementsFiller> _matching_elements_filler;
+
+ public:
+ GetDocsumsStateCallback();
+ // Copies the summary features (if set) into the state and marks them as cached.
+ void FillSummaryFeatures(GetDocsumsState * state, IDocsumEnvironment * env) override;
+ // Copies the rank features (if set) into the state.
+ void FillRankFeatures(GetDocsumsState * state, IDocsumEnvironment * env) override;
+ // Does nothing in this implementation.
+ virtual void FillDocumentLocations(GetDocsumsState * state, IDocsumEnvironment * env);
+ // Delegates to the filler if one is set; otherwise returns a default constructed object.
+ virtual std::unique_ptr<search::MatchingElements> fill_matching_elements(const search::MatchingElementsFields& fields) override;
+ void setSummaryFeatures(const search::FeatureSet::SP & sf) { _summaryFeatures = sf; }
+ void setRankFeatures(const search::FeatureSet::SP & rf) { _rankFeatures = rf; }
+ // Takes ownership of the given filler.
+ void set_matching_elements_filler(std::unique_ptr<IMatchingElementsFiller> matching_elements_filler);
+ ~GetDocsumsStateCallback();
+ };
+
+ /**
+  * Bundles what is needed to write document summaries: the dynamic docsum
+  * writer, the juniper instance, the result class in use and one field spec
+  * per entry in that result class. Instances are not copyable.
+  **/
+ class DocsumTools : public IDocsumEnvironment
+ {
+ public:
+     /**
+      * Describes one summary field: its output name, the names of the
+      * input fields it is produced from, and the command to apply.
+      **/
+     class FieldSpec {
+     private:
+         vespalib::string                    _outputName;
+         std::vector<vespalib::string>       _inputNames;
+         VsmsummaryConfig::Fieldmap::Command _command;
+
+     public:
+         FieldSpec();
+         ~FieldSpec();
+         const vespalib::string & getOutputName() const { return _outputName; }
+         void setOutputName(const vespalib::string & name) { _outputName = name; }
+         const std::vector<vespalib::string> & getInputNames() const { return _inputNames; }
+         std::vector<vespalib::string> & getInputNames() { return _inputNames; }
+         VsmsummaryConfig::Fieldmap::Command getCommand() const { return _command; }
+         void setCommand(VsmsummaryConfig::Fieldmap::Command command) { _command = command; }
+     };
+
+ private:
+     std::unique_ptr<DynamicDocsumWriter> _writer;
+     std::unique_ptr<juniper::Juniper>    _juniper;
+     const ResultClass                  * _resultClass; // set by obtainFieldNames()
+     std::vector<FieldSpec>               _fieldSpecs;  // filled by obtainFieldNames()
+
+ public:
+     DocsumTools(std::unique_ptr<DynamicDocsumWriter> writer);
+     // not copyable: explicitly deleted instead of declared private and left undefined
+     DocsumTools(const DocsumTools &) = delete;
+     DocsumTools &operator=(const DocsumTools &) = delete;
+     ~DocsumTools();
+     void setJuniper(std::unique_ptr<juniper::Juniper> juniper) { _juniper = std::move(juniper); }
+     ResultConfig *getResultConfig() const { return _writer->GetResultConfig(); }
+     DynamicDocsumWriter *getDocsumWriter() const { return _writer.get(); }
+     const ResultClass *getResultClass() const { return _resultClass; }
+     const std::vector<FieldSpec> & getFieldSpecs() const { return _fieldSpecs; }
+
+     /**
+      * Looks up the result class given by the output class of the config and
+      * creates the field specs for its entries.
+      **/
+     bool obtainFieldNames(const FastS_VsmsummaryHandle &cfg);
+
+     // inherit doc from IDocsumEnvironment
+     search::IAttributeManager * getAttributeManager() override { return nullptr; } // no attribute manager here
+     vespalib::string lookupIndex(const vespalib::string&) const override { return ""; }
+     juniper::Juniper * getJuniper() override { return _juniper.get(); }
+ };
+
+ // Shared handle to a set of docsum tools.
+ using DocsumToolsPtr = std::shared_ptr<DocsumTools>;
+
+ /**
+ * Pairs a config snapshot with the config id used to look up configs in it.
+ * The snapshot given to the constructor is copied.
+ **/
+ class VSMConfigSnapshot {
+ private:
+ const vespalib::string _configId;
+ std::unique_ptr<const config::ConfigSnapshot> _snapshot;
+ public:
+ VSMConfigSnapshot(const vespalib::string & configId, const config::ConfigSnapshot & snapshot);
+ ~VSMConfigSnapshot();
+ // Returns the config of the given type, looked up in the snapshot using the stored config id.
+ // Defined in vsm-adapter.hpp.
+ template <typename ConfigType>
+ std::unique_ptr<ConfigType> getConfig() const;
+ };
+
+ /**
+  * Holds the currently active fields config and docsum tools, and replaces
+  * them when configure() is called with a new config snapshot.
+  * Instances are not copyable.
+  **/
+ class VSMAdapter
+ {
+ public:
+     VSMAdapter(const vespalib::string & highlightindexes, const vespalib::string & configId, Fast_WordFolder & wordFolder);
+     // not copyable: explicitly deleted instead of declared private and left undefined
+     VSMAdapter(const VSMAdapter &) = delete;
+     VSMAdapter &operator=(const VSMAdapter &) = delete;
+     virtual ~VSMAdapter();
+
+     VsmfieldsHandle getFieldsConfig() const { return _fieldsCfg.get(); }
+     DocsumToolsPtr getDocsumTools() const { return _docsumTools.get(); }
+
+     /**
+      * (Re-)configures the adapter from the given snapshot.
+      * Throws std::runtime_error if the configuration fails.
+      **/
+     void configure(const VSMConfigSnapshot & snapshot);
+ private:
+     vespalib::string                      _highlightindexes;
+     const vespalib::string                _configId;
+     Fast_WordFolder                     & _wordFolder;   // stored by reference, not owned
+     vespalib::PtrHolder<VsmfieldsConfig>  _fieldsCfg;
+     vespalib::PtrHolder<DocsumTools>      _docsumTools;
+     std::unique_ptr<JuniperProperties>    _juniperProps; // properties handed to the juniper instance
+
+     std::mutex _lock; // held for the duration of configure()
+ };
+
+} // namespace vsm
+
diff --git a/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.hpp b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.hpp
new file mode 100644
index 00000000000..f071dbb2015
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/vsm-adapter.hpp
@@ -0,0 +1,18 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "vsm-adapter.h"
+#include <vespa/config/retriever/configsnapshot.hpp>
+
+namespace vsm {
+
+ // Returns the config of the requested type, looked up in the wrapped snapshot
+ // using the config id this object was created with.
+ template <typename ConfigType>
+ std::unique_ptr<ConfigType>
+ VSMConfigSnapshot::getConfig() const
+ {
+     const config::ConfigSnapshot & snapshot = *_snapshot;
+     return snapshot.getConfig<ConfigType>(_configId);
+ }
+
+} // namespace vsm
+