diff options
Diffstat (limited to 'searchsummary/src/vespa/searchsummary/docsummary/tokens_converter.cpp')
-rw-r--r-- | searchsummary/src/vespa/searchsummary/docsummary/tokens_converter.cpp | 78 |
1 files changed, 78 insertions, 0 deletions
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/tokens_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/tokens_converter.cpp new file mode 100644 index 00000000000..e2849fe793e --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/tokens_converter.cpp @@ -0,0 +1,78 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "tokens_converter.h" +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/searchlib/util/token_extractor.h> +#include <vespa/vespalib/data/slime/slime.h> + +using document::StringFieldValue; +using search::linguistics::TokenExtractor; +using vespalib::Memory; +using vespalib::slime::ArrayInserter; +using vespalib::slime::Cursor; +using vespalib::slime::Inserter; + +namespace search::docsummary { + +TokensConverter::TokensConverter(const TokenExtractor& token_extractor) + : IStringFieldConverter(), + _token_extractor(token_extractor), + _text() +{ +} + +TokensConverter::~TokensConverter() = default; + +template <typename ForwardIt> +void +TokensConverter::handle_alternative_index_terms(ForwardIt it, ForwardIt last, Inserter& inserter) +{ + Cursor& a = inserter.insertArray(); + ArrayInserter ai(a); + for (;it != last; ++it) { + handle_index_term(it->word, ai); + } +} + +void +TokensConverter::handle_index_term(vespalib::stringref word, Inserter& inserter) +{ + inserter.insertString(Memory(word)); +} + +void +TokensConverter::handle_indexing_terms(const StringFieldValue& value, vespalib::slime::Inserter& inserter) +{ + Cursor& a = inserter.insertArray(); + ArrayInserter ai(a); + using SpanTerm = TokenExtractor::SpanTerm; + std::vector<SpanTerm> terms; + auto span_trees = value.getSpanTrees(); + _token_extractor.extract(terms, span_trees, _text, nullptr); + auto it = terms.begin(); + auto ite = terms.end(); + auto itn = it; + for (; it != ite; it = itn) { + for (; itn != ite && itn->span == it->span; ++itn); + if ((itn - it) > 1) { + handle_alternative_index_terms(it, itn, ai); + } else { + handle_index_term(it->word, ai); + } + } +} + +void +TokensConverter::convert(const StringFieldValue &input, vespalib::slime::Inserter& inserter) +{ + _text = input.getValueRef(); + handle_indexing_terms(input, inserter); +} + +bool +TokensConverter::render_weighted_set_as_array() const +{ + return true; +} + +} |