diff options
author | Geir Storli <geirst@vespa.ai> | 2024-03-27 16:16:56 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-03-27 16:16:56 +0100 |
commit | 45e8c0c2b2ab7b3ac9d8e6a8f174b08ac15b342e (patch) | |
tree | b886a2b83b89265239fc22f7a78fda01082832d6 /streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp | |
parent | aa19da6beecdf82393af0f99a12940fb9db92bd9 (diff) | |
parent | 086944ee64cb439c336d8d94d6af1c253a22697b (diff) |
Merge pull request #30747 from vespa-engine/toregge/add-streaming-mode-version-of-tokens-dfw
Add streaming mode version of tokens document field writer.
Diffstat (limited to 'streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp')
-rw-r--r-- | streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp | 59 |
1 files changed, 59 insertions, 0 deletions
diff --git a/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp new file mode 100644 index 00000000000..3534b751356 --- /dev/null +++ b/streamingvisitors/src/vespa/vsm/vsm/tokens_converter.cpp @@ -0,0 +1,59 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "tokens_converter.h" +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/vespalib/data/slime/slime.h> +#include <vespa/vespalib/text/utf8.h> +#include <vespa/vespalib/util/array.h> +#include <vespa/vsm/searcher/tokenizereader.h> + +using document::StringFieldValue; +using search::byte; +using vespalib::Utf8Writer; +using vespalib::slime::ArrayInserter; +using vespalib::slime::Cursor; +using vespalib::slime::Inserter; +using vsm::TokenizeReader; + +namespace vsm { + +TokensConverter::TokensConverter(bool exact_match, search::Normalizing normalize_mode) + : IStringFieldConverter(), + _text(), + _exact_match(exact_match), + _normalize_mode(normalize_mode) +{ +} + +TokensConverter::~TokensConverter() = default; + +void +TokensConverter::convert(const StringFieldValue &input, Inserter& inserter) +{ + _text = input.getValueRef(); + Cursor& a = inserter.insertArray(); + ArrayInserter ai(a); + vespalib::Array<ucs4_t> buf(_text.size() + 1, 0); + vespalib::string scratch; + TokenizeReader reader(reinterpret_cast<const byte *> (_text.data()), _text.size(), buf.data()); + for (;;) { + auto len = _exact_match ? reader.tokenize_exact_match(_normalize_mode) : reader.tokenize(_normalize_mode); + if (len == 0) { + break; + } + scratch.clear(); + Utf8Writer w(scratch); + for (size_t i = 0; i < len; ++i) { + w.putChar(buf[i]); + } + ai.insertString(scratch); + } +} + +bool +TokensConverter::render_weighted_set_as_array() const +{ + return true; +} + +} |