about | summary | refs | log | tree | commit | diff | stats
path: root/streamingvisitors/src/vespa/vsm
diff options
context:
space:
mode:
Diffstat (limited to 'streamingvisitors/src/vespa/vsm')
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt | 2
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp | 28
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h | 2
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp | 59
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h | 27
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp | 37
-rw-r--r-- streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h | 29
7 files changed, 183 insertions, 1 deletions
diff --git a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
index a50a541f3b7..d13e2e1fd54 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
+++ b/streamingvisitors/src/vespa/vsm/vsm/CMakeLists.txt
@@ -8,6 +8,8 @@ vespa_add_library(vsm_vsmbase OBJECT
flattendocsumwriter.cpp
query_term_filter_factory.cpp
snippetmodifier.cpp
+ tokens_converter.cpp
+ tokens_dfw.cpp
vsm-adapter.cpp
DEPENDS
vsm_vconfig
diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp
index 180200e2eaf..95e0f961d13 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp
+++ b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.cpp
@@ -1,6 +1,8 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "docsum_field_writer_factory.h"
+#include "fieldsearchspec.h"
+#include "tokens_dfw.h"
#include <vespa/searchlib/common/matching_elements_fields.h>
#include <vespa/searchsummary/docsummary/copy_dfw.h>
#include <vespa/searchsummary/docsummary/docsum_field_writer.h>
@@ -8,8 +10,10 @@
#include <vespa/searchsummary/docsummary/empty_dfw.h>
#include <vespa/searchsummary/docsummary/matched_elements_filter_dfw.h>
#include <vespa/vsm/config/config-vsmfields.h>
+#include <algorithm>
using search::MatchingElementsFields;
+using search::Normalizing;
using search::docsummary::CopyDFW;
using search::docsummary::DocsumFieldWriter;
using search::docsummary::EmptyDFW;
@@ -35,6 +39,23 @@ void populate_fields(MatchingElementsFields& fields, VsmfieldsConfig& fields_con
}
}
+/*
+ * Returns true when arg1 is "exact" or "word", false otherwise.
+ */
+bool is_exact_match(vespalib::stringref arg1) {
+    if (arg1 == "exact") {
+        return true;
+    }
+    return arg1 == "word";
+}
+
+/*
+ * Creates a TokensDFW reading from the given source field.
+ *
+ * The first fieldspec entry in fields_config whose name equals source
+ * supplies the settings: arg1 decides exact match, and normalize is
+ * converted to a normalize mode. When no entry matches, the writer is
+ * created without exact match and with Normalizing::LOWERCASE.
+ */
+std::unique_ptr<DocsumFieldWriter>
+make_tokens_dfw(const vespalib::string& source, VsmfieldsConfig& fields_config)
+{
+    auto& specs = fields_config.fieldspec;
+    auto match = std::find_if(specs.begin(), specs.end(),
+                              [&source](auto& spec) { return source == spec.name; });
+    if (match == specs.end()) {
+        // No field spec for the source field: use the default settings.
+        return std::make_unique<TokensDFW>(source, false, Normalizing::LOWERCASE);
+    }
+    const bool exact_match = is_exact_match(match->arg1);
+    const Normalizing normalize_mode = FieldSearchSpecMap::convert_normalize_mode(match->normalize);
+    return std::make_unique<TokensDFW>(source, exact_match, normalize_mode);
+}
+
}
DocsumFieldWriterFactory::DocsumFieldWriterFactory(bool use_v8_geo_positions, const IDocsumEnvironment& env, const IQueryTermFilterFactory& query_term_filter_factory, const vespa::config::search::vsm::VsmfieldsConfig& vsm_fields_config)
@@ -68,6 +89,13 @@ DocsumFieldWriterFactory::create_docsum_field_writer(const vespalib::string& fie
vespalib::string source_field = source.empty() ? field_name : source;
populate_fields(*matching_elems_fields, _vsm_fields_config, source_field);
fieldWriter = MatchedElementsFilterDFW::create(source_field, matching_elems_fields);
+ } else if ((command == command::tokens) ||
+ (command == command::attribute_tokens)) {
+ if (!source.empty()) {
+ fieldWriter = make_tokens_dfw(source, _vsm_fields_config);
+ } else {
+ throw_missing_source(command);
+ }
} else {
return search::docsummary::DocsumFieldWriterFactory::create_docsum_field_writer(field_name, command, source, matching_elems_fields);
}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h
index e32761f38b3..68fcfd6e8eb 100644
--- a/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h
+++ b/streamingvisitors/src/vespa/vsm/vsm/docsum_field_writer_factory.h
@@ -8,7 +8,7 @@
namespace vsm {
/*
- * Factory interface class for creating docsum field writers, adjusted for
+ * Factory class for creating docsum field writers, adjusted for
* streaming search.
*/
class DocsumFieldWriterFactory : public search::docsummary::DocsumFieldWriterFactory
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp
new file mode 100644
index 00000000000..3534b751356
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp
@@ -0,0 +1,59 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "tokens_converter.h"
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/text/utf8.h>
+#include <vespa/vespalib/util/array.h>
+#include <vespa/vsm/searcher/tokenizereader.h>
+
+using document::StringFieldValue;
+using search::byte;
+using vespalib::Utf8Writer;
+using vespalib::slime::ArrayInserter;
+using vespalib::slime::Cursor;
+using vespalib::slime::Inserter;
+using vsm::TokenizeReader;
+
+namespace vsm {
+
+TokensConverter::TokensConverter(bool exact_match, search::Normalizing normalize_mode)
+ : IStringFieldConverter(),
+ _text(), // default-constructed here; convert() assigns the input value to it
+ _exact_match(exact_match), // convert() uses tokenize_exact_match() when true, tokenize() otherwise
+ _normalize_mode(normalize_mode) // handed to the tokenizer on every call in convert()
+{
+}
+
+TokensConverter::~TokensConverter() = default; // declared in the header, defaulted out of line here
+
+/*
+ * Tokenizes the value of the string field and inserts an array holding
+ * the tokens, in the order the tokenizer returns them. Each token is
+ * written as a string through Utf8Writer. An input without tokens gives
+ * an empty array.
+ */
+void
+TokensConverter::convert(const StringFieldValue &input, Inserter& inserter)
+{
+    _text = input.getValueRef();
+    Cursor& token_array = inserter.insertArray();
+    ArrayInserter token_inserter(token_array);
+    // Buffer handed to the reader; each token is read back from its first len entries.
+    vespalib::Array<ucs4_t> ucs4_buf(_text.size() + 1, 0);
+    vespalib::string utf8_token;
+    const byte* text_begin = reinterpret_cast<const byte *>(_text.data());
+    TokenizeReader reader(text_begin, _text.size(), ucs4_buf.data());
+    // Fetches the length of the next token; 0 means there are no more tokens.
+    auto next_token = [&]() {
+        return _exact_match ? reader.tokenize_exact_match(_normalize_mode) : reader.tokenize(_normalize_mode);
+    };
+    for (auto len = next_token(); len != 0; len = next_token()) {
+        utf8_token.clear();
+        Utf8Writer writer(utf8_token);
+        for (size_t i = 0; i < len; ++i) {
+            writer.putChar(ucs4_buf[i]);
+        }
+        token_inserter.insertString(utf8_token);
+    }
+}
+
+bool
+TokensConverter::render_weighted_set_as_array() const
+{
+ return true; // unconditionally true for this converter
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h
new file mode 100644
index 00000000000..689b0b95b02
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.h
@@ -0,0 +1,27 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchsummary/docsummary/i_string_field_converter.h>
+#include <vespa/searchlib/query/query_normalization.h>
+
+namespace vsm {
+
+/*
+ * Class converting a string field value into an array
+ * containing the tokens. Each token is inserted as a string
+ * in the order the tokenizer returns it. The constructor
+ * arguments select exact match tokenization and the
+ * normalize mode used while tokenizing.
+ */
+class TokensConverter : public search::docsummary::IStringFieldConverter
+{
+ vespalib::stringref _text; // assigned from input.getValueRef() on each convert() call
+ bool _exact_match; // true: tokenize_exact_match(), false: tokenize()
+ search::Normalizing _normalize_mode; // passed to the tokenizer for every token
+
+public:
+ TokensConverter(bool exact_match, search::Normalizing normalize_mode);
+ ~TokensConverter() override;
+ void convert(const document::StringFieldValue &input, vespalib::slime::Inserter& inserter) override; // inserts an array of token strings
+ bool render_weighted_set_as_array() const override; // always returns true
+};
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp
new file mode 100644
index 00000000000..7861c13a179
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.cpp
@@ -0,0 +1,37 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "tokens_dfw.h"
+#include <vespa/searchsummary/docsummary/i_docsum_store_document.h>
+#include "tokens_converter.h"
+
+using search::docsummary::IDocsumStoreDocument;
+using search::docsummary::GetDocsumsState;
+
+namespace vsm {
+
+TokensDFW::TokensDFW(const vespalib::string& input_field_name, bool exact_match, search::Normalizing normalize_mode)
+ : DocsumFieldWriter(),
+ _input_field_name(input_field_name), // field name given to insert_summary_field() in insertField()
+ _exact_match(exact_match), // forwarded to the TokensConverter created in insertField()
+ _normalize_mode(normalize_mode) // forwarded to the TokensConverter created in insertField()
+{
+}
+
+TokensDFW::~TokensDFW() = default; // declared in the header, defaulted out of line here
+
+bool
+TokensDFW::isGenerated() const
+{
+ return false; // unconditionally false; insertField() reads the field from the supplied document
+}
+
+/*
+ * Inserts the input field of the given document into target, using a
+ * TokensConverter set up with the exact match and normalize mode
+ * settings of this writer. The docid and state arguments are unused.
+ */
+void
+TokensDFW::insertField(uint32_t, const IDocsumStoreDocument* doc, GetDocsumsState&, vespalib::slime::Inserter& target) const
+{
+    if (doc == nullptr) {
+        return; // nothing is inserted when there is no document
+    }
+    TokensConverter converter(_exact_match, _normalize_mode);
+    doc->insert_summary_field(_input_field_name, target, &converter);
+}
+
+}
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h
new file mode 100644
index 00000000000..4199630a94d
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_dfw.h
@@ -0,0 +1,29 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/searchsummary/docsummary/docsum_field_writer.h>
+#include <vespa/searchlib/query/query_normalization.h>
+
+namespace vsm {
+
+/*
+ * Class for writing string field values from document as
+ * arrays containing the tokens. Tokenization is performed
+ * on the fly using the exact_match and normalize_mode settings.
+ * Nothing is written when insertField() is called without a
+ * document.
+ */
+class TokensDFW : public search::docsummary::DocsumFieldWriter
+{
+private:
+ vespalib::string _input_field_name; // document field passed to insert_summary_field()
+ bool _exact_match; // forwarded to TokensConverter
+ search::Normalizing _normalize_mode; // forwarded to TokensConverter
+
+public:
+ explicit TokensDFW(const vespalib::string& input_field_name, bool exact_match, search::Normalizing normalize_mode);
+ ~TokensDFW() override;
+ bool isGenerated() const override; // always returns false
+ void insertField(uint32_t docid, const search::docsummary::IDocsumStoreDocument* doc, search::docsummary::GetDocsumsState& state, vespalib::slime::Inserter& target) const override;
+};
+
+}