aboutsummaryrefslogtreecommitdiffstats
path: root/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp
diff options
context:
space:
mode:
authorGeir Storli <geirst@vespa.ai>2024-03-27 16:16:56 +0100
committerGitHub <noreply@github.com>2024-03-27 16:16:56 +0100
commit45e8c0c2b2ab7b3ac9d8e6a8f174b08ac15b342e (patch)
treeb886a2b83b89265239fc22f7a78fda01082832d6 /streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp
parentaa19da6beecdf82393af0f99a12940fb9db92bd9 (diff)
parent086944ee64cb439c336d8d94d6af1c253a22697b (diff)
Merge pull request #30747 from vespa-engine/toregge/add-streaming-mode-version-of-tokens-dfw
Add streaming mode version of tokens document field writer.
Diffstat (limited to 'streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp')
-rw-r--r--streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp59
1 files changed, 59 insertions, 0 deletions
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp
new file mode 100644
index 00000000000..3534b751356
--- /dev/null
+++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp
@@ -0,0 +1,59 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "tokens_converter.h"
+#include <vespa/document/fieldvalue/stringfieldvalue.h>
+#include <vespa/vespalib/data/slime/slime.h>
+#include <vespa/vespalib/text/utf8.h>
+#include <vespa/vespalib/util/array.h>
+#include <vespa/vsm/searcher/tokenizereader.h>
+
+using document::StringFieldValue;
+using search::byte;
+using vespalib::Utf8Writer;
+using vespalib::slime::ArrayInserter;
+using vespalib::slime::Cursor;
+using vespalib::slime::Inserter;
+using vsm::TokenizeReader;
+
+namespace vsm {
+
+TokensConverter::TokensConverter(bool exact_match, search::Normalizing normalize_mode)
+ : IStringFieldConverter(),
+ _text(),
+ _exact_match(exact_match),
+ _normalize_mode(normalize_mode)
+{
+}
+
+TokensConverter::~TokensConverter() = default; // defaulted destructor, defined out-of-line in this file
+
+// Tokenizes the text of `input` and appends every token, re-encoded through
+// Utf8Writer, as a string entry of a new array obtained from `inserter`.
+// Tokenization uses tokenize_exact_match() when _exact_match is set and
+// tokenize() otherwise; both receive _normalize_mode.
+void
+TokensConverter::convert(const StringFieldValue &input, Inserter& inserter)
+{
+    _text = input.getValueRef();
+    Cursor& token_array = inserter.insertArray();
+    ArrayInserter token_inserter(token_array);
+    // Token buffer handed to the reader: _text.size() + 1 elements, filled with 0.
+    vespalib::Array<ucs4_t> token_buf(_text.size() + 1, 0);
+    vespalib::string utf8_token;
+    TokenizeReader reader(reinterpret_cast<const byte *>(_text.data()), _text.size(), token_buf.data());
+    // Fetches the length of the next token; the reader fills token_buf.
+    auto next_token_len = [&]() {
+        return _exact_match ? reader.tokenize_exact_match(_normalize_mode) : reader.tokenize(_normalize_mode);
+    };
+    // A returned length of 0 ends the loop.
+    for (auto token_len = next_token_len(); !(token_len == 0); token_len = next_token_len()) {
+        utf8_token.clear();
+        Utf8Writer writer(utf8_token);
+        for (size_t pos = 0; pos < token_len; ++pos) {
+            writer.putChar(token_buf[pos]);
+        }
+        token_inserter.insertString(utf8_token);
+    }
+}
+
+// Unconditionally returns true for this converter.
+bool
+TokensConverter::render_weighted_set_as_array() const
+{ return true; }
+
+}