diff options
author | bjormel <bjormel@yahooinc.com> | 2023-10-26 13:59:28 +0000 |
---|---|---|
committer | bjormel <bjormel@yahooinc.com> | 2023-10-26 13:59:28 +0000 |
commit | 567be9a1f6353cec41c23bfd1fcd46b4b2a4d2d7 (patch) | |
tree | 4664a743e166a5e11aee7b9acd70ad8ee2617612 /searchlib/src/vespa/searchlib/util/token_extractor.cpp | |
parent | e9058b555d4dfea2f6c872d9a677e8678b569569 (diff) | |
parent | bce3b8e926bf9da880172acbe1ba4b12d5e026d6 (diff) |
Merge branch 'master' into bjormel/aws-main-controller [bjormel/aws-main-controller]
Diffstat (limited to 'searchlib/src/vespa/searchlib/util/token_extractor.cpp')
-rw-r--r-- | searchlib/src/vespa/searchlib/util/token_extractor.cpp | 162 |
1 files changed, 162 insertions, 0 deletions
diff --git a/searchlib/src/vespa/searchlib/util/token_extractor.cpp b/searchlib/src/vespa/searchlib/util/token_extractor.cpp new file mode 100644 index 00000000000..a78f30afe21 --- /dev/null +++ b/searchlib/src/vespa/searchlib/util/token_extractor.cpp @@ -0,0 +1,162 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "token_extractor.h" +#include "linguisticsannotation.h" +#include <vespa/document/annotation/alternatespanlist.h> +#include <vespa/document/annotation/span.h> +#include <vespa/document/annotation/spanlist.h> +#include <vespa/document/annotation/spantreevisitor.h> +#include <vespa/document/fieldvalue/document.h> +#include <vespa/vespalib/text/utf8.h> +#include <vespa/vespalib/util/exceptions.h> + +#include <vespa/log/log.h> +LOG_SETUP(".searchlib.util.token_extractor"); + +using document::AlternateSpanList; +using document::Annotation; +using document::AnnotationType; +using document::Document; +using document::FieldValue; +using document::SimpleSpanList; +using document::Span; +using document::SpanList; +using document::SpanNode; +using document::SpanTreeVisitor; +using document::StringFieldValue; +using vespalib::Utf8Reader; + +namespace search::linguistics { + +namespace { + +class SpanFinder : public SpanTreeVisitor { +public: + int32_t begin_pos; + int32_t end_pos; + + SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {} + Span span() { return Span(begin_pos, end_pos - begin_pos); } + + void visit(const Span &node) override { + begin_pos = std::min(begin_pos, node.from()); + end_pos = std::max(end_pos, node.from() + node.length()); + } + void visit(const SpanList &node) override { + for (const auto & span_ : node) { + span_->accept(*this); + } + } + void visit(const SimpleSpanList &node) override { + for (const auto & span_ : node) { + span_.accept(*this); + } + } + void visit(const AlternateSpanList &node) override { + for (size_t i = 0; i < node.getNumSubtrees(); ++i) { + 
visit(node.getSubtree(i)); + } + } +}; + +Span +getSpan(const SpanNode &span_node) +{ + SpanFinder finder; + span_node.accept(finder); + return finder.span(); +} + +vespalib::stringref +get_span_string_or_alternative(vespalib::stringref s, const Span &span, const FieldValue* fv) +{ + if (fv != nullptr) { + auto raw = fv->getAsRaw(); + return {raw.first, raw.second}; + } else { + return {s.data() + span.from(), static_cast<size_t>(span.length())}; + } +} + +size_t +truncated_word_len(vespalib::stringref word, size_t max_byte_len) +{ + Utf8Reader reader(word); + while (reader.hasMore()) { + auto last_pos = reader.getPos(); + (void) reader.getChar(); + if (reader.getPos() > max_byte_len) { + return last_pos; + } + } + return reader.getPos(); // No truncation +} + +constexpr size_t max_fmt_len = 100; // Max length of word in logs + +} + +TokenExtractor::TokenExtractor(const vespalib::string& field_name, size_t max_word_len) + : _field_name(field_name), + _max_word_len(max_word_len) +{ +} + +TokenExtractor::~TokenExtractor() = default; + +vespalib::stringref +TokenExtractor::sanitize_word(vespalib::stringref word, const document::Document* doc) const +{ + size_t len = strnlen(word.data(), word.size()); + if (len < word.size()) { + size_t old_len = word.size(); + len = truncated_word_len(word, len); + word = word.substr(0, len); + if (doc != nullptr) { + LOG(error, "Detected NUL byte in word, length reduced from %zu to %zu, document %s field %s, truncated word prefix is %.*s", old_len, word.size(), doc->getId().toString().c_str(), _field_name.c_str(), (int) truncated_word_len(word, max_fmt_len), word.data()); + } + } + if (word.size() > _max_word_len) { + if (doc != nullptr) { + LOG(warning, "Dropped too long word (len %zu > max len %zu) from document %s field %s, word prefix is %.*s", word.size(), _max_word_len, doc->getId().toString().c_str(), _field_name.c_str(), (int) truncated_word_len(word, max_fmt_len), word.data()); + } + return {}; + } + return word; +} + +void 
+TokenExtractor::consider_word(std::vector<SpanTerm>& terms, vespalib::stringref text, const Span& span, const FieldValue* fv, const Document* doc) const +{ + if (span.length() > 0 && span.from() >= 0 && + static_cast<size_t>(span.from()) + static_cast<size_t>(span.length()) <= text.size()) { + auto word = get_span_string_or_alternative(text, span, fv); + word = sanitize_word(word, doc); + if (!word.empty()) { + terms.emplace_back(span, word, fv != nullptr); + } + } +} + +void +TokenExtractor::extract(std::vector<SpanTerm>& terms, const document::StringFieldValue::SpanTrees& trees, vespalib::stringref text, const Document* doc) const +{ + auto tree = StringFieldValue::findTree(trees, SPANTREE_NAME); + if (tree == nullptr) { + /* field might not be annotated if match type is exact */ + consider_word(terms, text, Span(0, text.size()), nullptr, doc); + return; + } + for (const Annotation & annotation : *tree) { + const SpanNode *span = annotation.getSpanNode(); + if ((span != nullptr) && annotation.valid() && + (annotation.getType() == *AnnotationType::TERM)) + { + Span sp = getSpan(*span); + consider_word(terms, text, sp, annotation.getFieldValue(), doc); + } + } + std::sort(terms.begin(), terms.end()); +} + +} |