diff options
author | bjormel <bjormel@yahooinc.com> | 2023-10-26 13:59:28 +0000 |
---|---|---|
committer | bjormel <bjormel@yahooinc.com> | 2023-10-26 13:59:28 +0000 |
commit | 567be9a1f6353cec41c23bfd1fcd46b4b2a4d2d7 (patch) | |
tree | 4664a743e166a5e11aee7b9acd70ad8ee2617612 /searchsummary/src/vespa/searchsummary/docsummary/tokens_converter.cpp | |
parent | e9058b555d4dfea2f6c872d9a677e8678b569569 (diff) | |
parent | bce3b8e926bf9da880172acbe1ba4b12d5e026d6 (diff) |
Merge branch 'master' into bjormel/aws-main-controller
Diffstat (limited to 'searchsummary/src/vespa/searchsummary/docsummary/tokens_converter.cpp')
-rw-r--r-- | searchsummary/src/vespa/searchsummary/docsummary/tokens_converter.cpp | 78 |
1 file changed, 78 insertions, 0 deletions
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/tokens_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/tokens_converter.cpp new file mode 100644 index 00000000000..e2849fe793e --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/tokens_converter.cpp @@ -0,0 +1,78 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "tokens_converter.h" +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/searchlib/util/token_extractor.h> +#include <vespa/vespalib/data/slime/slime.h> + +using document::StringFieldValue; +using search::linguistics::TokenExtractor; +using vespalib::Memory; +using vespalib::slime::ArrayInserter; +using vespalib::slime::Cursor; +using vespalib::slime::Inserter; + +namespace search::docsummary { + +TokensConverter::TokensConverter(const TokenExtractor& token_extractor) + : IStringFieldConverter(), + _token_extractor(token_extractor), + _text() +{ +} + +TokensConverter::~TokensConverter() = default; + +template <typename ForwardIt> +void +TokensConverter::handle_alternative_index_terms(ForwardIt it, ForwardIt last, Inserter& inserter) +{ + Cursor& a = inserter.insertArray(); + ArrayInserter ai(a); + for (;it != last; ++it) { + handle_index_term(it->word, ai); + } +} + +void +TokensConverter::handle_index_term(vespalib::stringref word, Inserter& inserter) +{ + inserter.insertString(Memory(word)); +} + +void +TokensConverter::handle_indexing_terms(const StringFieldValue& value, vespalib::slime::Inserter& inserter) +{ + Cursor& a = inserter.insertArray(); + ArrayInserter ai(a); + using SpanTerm = TokenExtractor::SpanTerm; + std::vector<SpanTerm> terms; + auto span_trees = value.getSpanTrees(); + _token_extractor.extract(terms, span_trees, _text, nullptr); + auto it = terms.begin(); + auto ite = terms.end(); + auto itn = it; + for (; it != ite; it = itn) { + for (; itn != ite && itn->span == it->span; ++itn); + if ((itn - 
it) > 1) { + handle_alternative_index_terms(it, itn, ai); + } else { + handle_index_term(it->word, ai); + } + } +} + +void +TokensConverter::convert(const StringFieldValue &input, vespalib::slime::Inserter& inserter) +{ + _text = input.getValueRef(); + handle_indexing_terms(input, inserter); +} + +bool +TokensConverter::render_weighted_set_as_array() const +{ + return true; +} + +} |