diff options
author | Tor Egge <Tor.Egge@online.no> | 2022-09-11 13:11:56 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2022-09-11 13:11:56 +0200 |
commit | 2dede1f466509c2c3de1b18b9a087fd4162a8c6c (patch) | |
tree | 4454010871a235218f802cd4f47a3a3f3c5c6ec9 /searchsummary | |
parent | 2b533a5bda75247b4cff2754c0d48bfd2909b87c (diff) |
Move SlimeFiller to separate file.
Move AnnotationConverter (fka SummaryHandler) to separate file.
Diffstat (limited to 'searchsummary')
6 files changed, 584 insertions, 412 deletions
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt index 8070bed8b03..c4fb4c4dd8f 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt +++ b/searchsummary/src/vespa/searchsummary/docsummary/CMakeLists.txt @@ -1,6 +1,7 @@ # Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. vespa_add_library(searchsummary_docsummary OBJECT SOURCES + annotation_converter.cpp array_attribute_combiner_dfw.cpp attribute_combiner_dfw.cpp attribute_field_writer.cpp @@ -30,6 +31,7 @@ vespa_add_library(searchsummary_docsummary OBJECT resultconfig.cpp searchdatatype.cpp simple_dfw.cpp + slime_filler.cpp struct_fields_resolver.cpp struct_map_attribute_combiner_dfw.cpp summaryfeaturesdfw.cpp diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp new file mode 100644 index 00000000000..82f3d086b79 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.cpp @@ -0,0 +1,158 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "annotation_converter.h" +#include "linguisticsannotation.h" +#include <vespa/document/annotation/alternatespanlist.h> +#include <vespa/document/annotation/annotation.h> +#include <vespa/document/annotation/spantree.h> +#include <vespa/document/annotation/spantreevisitor.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/juniper/juniper_separators.h> +#include <vespa/vespalib/stllike/asciistream.h> +#include <vespa/vespalib/util/exceptions.h> +#include <utility> + +using document::Annotation; +using document::AlternateSpanList; +using document::FieldValue; +using document::SimpleSpanList; +using document::Span; +using document::SpanList; +using document::SpanNode; +using document::SpanTree; +using document::SpanTreeVisitor; +using document::StringFieldValue; + +namespace search::docsummary { + +namespace { + +vespalib::string +getSpanString(const vespalib::string &s, const Span &span) +{ + return vespalib::string(&s[span.from()], &s[span.from() + span.length()]); +} + +struct SpanFinder : SpanTreeVisitor { + int32_t begin_pos; + int32_t end_pos; + + SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {} + Span span() { return Span(begin_pos, end_pos - begin_pos); } + + void visit(const Span &node) override { + begin_pos = std::min(begin_pos, node.from()); + end_pos = std::max(end_pos, node.from() + node.length()); + } + void visit(const SpanList &node) override { + for (const auto & span_ : node) { + span_->accept(*this); + } + } + void visit(const SimpleSpanList &node) override { + for (const auto & span_ : node) { + span_.accept(*this); + } + } + void visit(const AlternateSpanList &node) override { + for (size_t i = 0; i < node.getNumSubtrees(); ++i) { + visit(node.getSubtree(i)); + } + } +}; + +Span getSpan(const SpanNode &span_node) { + SpanFinder finder; + span_node.accept(finder); + return finder.span(); +} + +const StringFieldValue &ensureStringFieldValue(const FieldValue &value) __attribute__((noinline)); + +const StringFieldValue &ensureStringFieldValue(const FieldValue &value) { + if (!value.isA(FieldValue::Type::STRING)) { + throw vespalib::IllegalArgumentException("Illegal field type. " + value.toString(), VESPA_STRLOC); + } + return static_cast<const StringFieldValue &>(value); +} + +} + +template <typename ForwardIt> +void +AnnotationConverter::handleAnnotations(const document::Span& span, ForwardIt it, ForwardIt last) { + int annCnt = (last - it); + if (annCnt > 1 || (annCnt == 1 && it->second)) { + annotateSpans(span, it, last); + } else { + out << getSpanString(text, span) << juniper::separators::unit_separator_string; + } +} + +template <typename ForwardIt> +void +AnnotationConverter::annotateSpans(const document::Span& span, ForwardIt it, ForwardIt last) { + out << juniper::separators::interlinear_annotation_anchor_string // ANCHOR + << (getSpanString(text, span)) + << juniper::separators::interlinear_annotation_separator_string; // SEPARATOR + while (it != last) { + if (it->second) { + out << ensureStringFieldValue(*it->second).getValue(); + } else { + out << getSpanString(text, span); + } + if (++it != last) { + out << " "; + } + } + out << juniper::separators::interlinear_annotation_terminator_string // TERMINATOR + << juniper::separators::unit_separator_string; +} + +void +AnnotationConverter::handleIndexingTerms(const StringFieldValue& value) +{ + StringFieldValue::SpanTrees trees = value.getSpanTrees(); + const SpanTree *tree = StringFieldValue::findTree(trees, linguistics::SPANTREE_NAME); + typedef std::pair<Span, const FieldValue *> SpanTerm; + typedef std::vector<SpanTerm> SpanTermVector; + if (!tree) { + // Treat a string without annotations as a single span. + SpanTerm str(Span(0, text.size()), + static_cast<const FieldValue*>(nullptr)); + handleAnnotations(str.first, &str, &str + 1); + return; + } + SpanTermVector terms; + for (const Annotation& annotation : *tree) { + // For now, skip any composite spans. + const Span *span = dynamic_cast<const Span*>(annotation.getSpanNode()); + if ((span != nullptr) && annotation.valid() && + (annotation.getType() == *linguistics::TERM)) { + terms.push_back(std::make_pair(getSpan(*span), + annotation.getFieldValue())); + } + } + sort(terms.begin(), terms.end()); + auto it = terms.begin(); + auto ite = terms.end(); + int32_t endPos = 0; + for (; it != ite; ) { + auto it_begin = it; + if (it_begin->first.from() > endPos) { + Span tmpSpan(endPos, it_begin->first.from() - endPos); + handleAnnotations(tmpSpan, it, it); + endPos = it_begin->first.from(); + } + for (; it != ite && it->first == it_begin->first; ++it); + handleAnnotations(it_begin->first, it_begin, it); + endPos = it_begin->first.from() + it_begin->first.length(); + } + int32_t wantEndPos = text.size(); + if (endPos < wantEndPos) { + Span tmpSpan(endPos, wantEndPos - endPos); + handleAnnotations(tmpSpan, ite, ite); + } +} + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h new file mode 100644 index 00000000000..37e3c18606e --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/annotation_converter.h @@ -0,0 +1,35 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/string.h> + +namespace document +{ +class Span; +class StringFieldValue; +} + +namespace vespalib { class asciistream; } + +namespace search::docsummary { + +/* + * Class converting a string field value with annotations into a string + * with interlinear annotations used by juniper. + */ +struct AnnotationConverter { + const vespalib::string text; + vespalib::asciistream& out; + + template <typename ForwardIt> + void handleAnnotations(const document::Span& span, ForwardIt it, ForwardIt last); + template <typename ForwardIt> + void annotateSpans(const document::Span& span, ForwardIt it, ForwardIt last); +public: + AnnotationConverter(const vespalib::string& s, vespalib::asciistream& stream) + : text(s), out(stream) {} + void handleIndexingTerms(const document::StringFieldValue& value); +}; + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp b/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp new file mode 100644 index 00000000000..7d31af552b1 --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.cpp @@ -0,0 +1,329 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "slime_filler.h" +#include "annotation_converter.h" +#include "resultconfig.h" +#include "searchdatatype.h" +#include <vespa/document/datatype/positiondatatype.h> +#include <vespa/document/fieldvalue/arrayfieldvalue.h> +#include <vespa/document/fieldvalue/boolfieldvalue.h> +#include <vespa/document/fieldvalue/bytefieldvalue.h> +#include <vespa/document/fieldvalue/doublefieldvalue.h> +#include <vespa/document/fieldvalue/floatfieldvalue.h> +#include <vespa/document/fieldvalue/intfieldvalue.h> +#include <vespa/document/fieldvalue/longfieldvalue.h> +#include <vespa/document/fieldvalue/mapfieldvalue.h> +#include <vespa/document/fieldvalue/predicatefieldvalue.h> +#include <vespa/document/fieldvalue/rawfieldvalue.h> +#include <vespa/document/fieldvalue/shortfieldvalue.h> +#include <vespa/document/fieldvalue/stringfieldvalue.h> +#include <vespa/document/fieldvalue/structfieldvalue.h> +#include <vespa/document/fieldvalue/weightedsetfieldvalue.h> +#include <vespa/document/fieldvalue/tensorfieldvalue.h> +#include <vespa/document/fieldvalue/referencefieldvalue.h> +#include <vespa/eval/eval/value_codec.h> +#include <vespa/vespalib/data/slime/slime.h> +#include <vespa/vespalib/objects/nbostream.h> +#include <vespa/vespalib/stllike/asciistream.h> +#include <cassert> + +using document::AnnotationReferenceFieldValue; +using document::ArrayFieldValue; +using document::BoolFieldValue; +using document::ByteFieldValue; +using document::Document; +using document::DoubleFieldValue; +using document::FieldValue; +using document::FloatFieldValue; +using document::IntFieldValue; +using document::LongFieldValue; +using document::MapFieldValue; +using document::PredicateFieldValue; +using document::RawFieldValue; +using document::ShortFieldValue; +using document::StringFieldValue; +using document::StructFieldValue; +using document::WeightedSetFieldValue; +using document::TensorFieldValue; +using document::ReferenceFieldValue; +using vespalib::slime::ArrayInserter; +using vespalib::slime::Cursor; +using vespalib::slime::Inserter; +using vespalib::slime::ObjectInserter; +using vespalib::slime::ObjectSymbolInserter; +using vespalib::slime::Symbol; +using vespalib::Memory; +using vespalib::asciistream; + +namespace search::docsummary { + +namespace { + +class MapFieldValueInserter { +private: + Cursor& _array; + Symbol _key_sym; + Symbol _val_sym; + bool _tokenize; + +public: + MapFieldValueInserter(Inserter& parent_inserter, bool tokenize) + : _array(parent_inserter.insertArray()), + _key_sym(_array.resolve("key")), + _val_sym(_array.resolve("value")), + _tokenize(tokenize) + { + } + void insert_entry(const FieldValue& key, const FieldValue& value) { + Cursor& c = _array.addObject(); + ObjectSymbolInserter ki(c, _key_sym); + ObjectSymbolInserter vi(c, _val_sym); + SlimeFiller key_conv(ki, _tokenize); + SlimeFiller val_conv(vi, _tokenize); + + key.accept(key_conv); + value.accept(val_conv); + } +}; + +} + +SlimeFiller::SlimeFiller(Inserter& inserter, bool tokenize) + : _inserter(inserter), + _tokenize(tokenize), + _matching_elems(nullptr) +{ +} + +SlimeFiller::SlimeFiller(Inserter& inserter, bool tokenize, const std::vector<uint32_t>* matching_elems) + : _inserter(inserter), + _tokenize(tokenize), + _matching_elems(matching_elems) +{ +} + +SlimeFiller::~SlimeFiller() = default; + +void +SlimeFiller::visit(const AnnotationReferenceFieldValue& v) +{ + (void)v; + Cursor& c = _inserter.insertObject(); + Memory key("error"); + Memory val("cannot convert from annotation reference field"); + c.setString(key, val); +} + +void +SlimeFiller::visit(const Document& v) +{ + (void)v; + Cursor& c = _inserter.insertObject(); + Memory key("error"); + Memory val("cannot convert from field of type document"); + c.setString(key, val); +} + +void +SlimeFiller::visit(const MapFieldValue& v) +{ + if (empty_or_empty_after_filtering(v)) { + return; + } + MapFieldValueInserter map_inserter(_inserter, _tokenize); + if (filter_matching_elements()) { + assert(v.has_no_erased_keys()); + for (uint32_t id_to_keep : (*_matching_elems)) { + auto entry = v[id_to_keep]; + map_inserter.insert_entry(*entry.first, *entry.second); + } + } else { + for (const auto& entry : v) { + map_inserter.insert_entry(*entry.first, *entry.second); + } + } +} + +void +SlimeFiller::visit(const ArrayFieldValue& value) +{ + if (empty_or_empty_after_filtering(value)) { + return; + } + Cursor& a = _inserter.insertArray(); + ArrayInserter ai(a); + SlimeFiller conv(ai, _tokenize); + if (filter_matching_elements()) { + for (uint32_t id_to_keep : (*_matching_elems)) { + value[id_to_keep].accept(conv); + } + } else { + for (const FieldValue& fv : value) { + fv.accept(conv); + } + } +} + +void +SlimeFiller::visit(const StringFieldValue& value) +{ + if (_tokenize) { + asciistream tmp; + AnnotationConverter converter(value.getValue(), tmp); + converter.handleIndexingTerms(value); + _inserter.insertString(Memory(tmp.str())); + } else { + _inserter.insertString(Memory(value.getValue())); + } +} + +void +SlimeFiller::visit(const IntFieldValue& value) +{ + int32_t v = value.getValue(); + _inserter.insertLong(v); +} + +void +SlimeFiller::visit(const LongFieldValue& value) +{ + int64_t v = value.getValue(); + _inserter.insertLong(v); +} + +void +SlimeFiller::visit(const ShortFieldValue& value) +{ + int16_t v = value.getValue(); + _inserter.insertLong(v); +} + +void +SlimeFiller::visit(const ByteFieldValue& value) +{ + int8_t v = value.getAsByte(); + _inserter.insertLong(v); +} + +void +SlimeFiller::visit(const BoolFieldValue& value) +{ + bool v = value.getValue(); + _inserter.insertBool(v); +} + +void +SlimeFiller::visit(const DoubleFieldValue& value) +{ + double v = value.getValue(); + _inserter.insertDouble(v); +} + +void +SlimeFiller::visit(const FloatFieldValue& value) +{ + float v = value.getValue(); + _inserter.insertDouble(v); +} + +void +SlimeFiller::visit(const PredicateFieldValue& value) +{ + _inserter.insertString(value.toString()); +} + +void +SlimeFiller::visit(const RawFieldValue& value) +{ + std::pair<const char *, size_t> buf = value.getAsRaw(); + _inserter.insertData(Memory(buf.first, buf.second)); +} + +void +SlimeFiller::visit(const StructFieldValue& value) +{ + if (value.getDataType() == &document::PositionDataType::getInstance() + && ResultConfig::wantedV8geoPositions()) + { + auto xv = value.getValue("x"); + auto yv = value.getValue("y"); + if (xv && yv) { + Cursor& c = _inserter.insertObject(); + c.setDouble("lat", double(yv->getAsInt()) / 1.0e6); + c.setDouble("lng", double(xv->getAsInt()) / 1.0e6); + return; + } + } + if (*value.getDataType() == *SearchDataType::URI) { + FieldValue::UP uriAllValue = value.getValue("all"); + if (uriAllValue && uriAllValue->isA(FieldValue::Type::STRING)) { + uriAllValue->accept(*this); + return; + } + } + Cursor& c = _inserter.insertObject(); + for (StructFieldValue::const_iterator itr = value.begin(); itr != value.end(); ++itr) { + Memory keymem(itr.field().getName()); + ObjectInserter vi(c, keymem); + SlimeFiller conv(vi, _tokenize); + FieldValue::UP nextValue(value.getValue(itr.field())); + (*nextValue).accept(conv); + } +} + +void +SlimeFiller::visit(const WeightedSetFieldValue& value) +{ + if (empty_or_empty_after_filtering(value)) { + return; + } + Cursor& a = _inserter.insertArray(); + Symbol isym = a.resolve("item"); + Symbol wsym = a.resolve("weight"); + using matching_elements_iterator_type = std::vector<uint32_t>::const_iterator; + matching_elements_iterator_type matching_elements_itr; + matching_elements_iterator_type matching_elements_itr_end; + if (filter_matching_elements()) { + matching_elements_itr = _matching_elems->begin(); + matching_elements_itr_end = _matching_elems->end(); + } + uint32_t idx = 0; + for (const auto& entry : value) { + if (filter_matching_elements()) { + if (matching_elements_itr == matching_elements_itr_end || + idx < *matching_elements_itr) { + ++idx; + continue; + } + ++matching_elements_itr; + } + Cursor& o = a.addObject(); + ObjectSymbolInserter ki(o, isym); + SlimeFiller conv(ki, _tokenize); + entry.first->accept(conv); + int weight = static_cast<const IntFieldValue&>(*entry.second).getValue(); + o.setLong(wsym, weight); + ++idx; + } +} + +void +SlimeFiller::visit(const TensorFieldValue& value) +{ + const auto& tensor = value.getAsTensorPtr(); + vespalib::nbostream s; + if (tensor) { + encode_value(*tensor, s); + } + _inserter.insertData(vespalib::Memory(s.peek(), s.size())); +} + +void +SlimeFiller::visit(const ReferenceFieldValue& value) +{ + _inserter.insertString(Memory(value.hasValidDocumentId() + ? value.getDocumentId().toString() + : vespalib::string())); +} + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.h b/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.h new file mode 100644 index 00000000000..e7d05ced3cf --- /dev/null +++ b/searchsummary/src/vespa/searchsummary/docsummary/slime_filler.h @@ -0,0 +1,55 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/document/fieldvalue/fieldvaluevisitor.h> +#include <cstdint> +#include <vector> + +namespace vespalib::slime { struct Inserter; } + +namespace search::docsummary { + +/* + * Class inserting a field value into a slime object. + */ +class SlimeFiller : public document::ConstFieldValueVisitor { + + vespalib::slime::Inserter& _inserter; + bool _tokenize; + const std::vector<uint32_t>* _matching_elems; + + bool filter_matching_elements() const { + return _matching_elems != nullptr; + } + + template <typename Value> + bool empty_or_empty_after_filtering(const Value& value) const { + return (value.isEmpty() || (filter_matching_elements() && (_matching_elems->empty() || _matching_elems->back() >= value.size()))); + } + + void visit(const document::AnnotationReferenceFieldValue& v) override; + void visit(const document::Document& v) override; + void visit(const document::MapFieldValue& v) override; + void visit(const document::ArrayFieldValue& value) override; + void visit(const document::StringFieldValue& value) override; + void visit(const document::IntFieldValue& value) override; + void visit(const document::LongFieldValue& value) override; + void visit(const document::ShortFieldValue& value) override; + void visit(const document::ByteFieldValue& value) override; + void visit(const document::BoolFieldValue& value) override; + void visit(const document::DoubleFieldValue& value) override; + void visit(const document::FloatFieldValue& value) override; + void visit(const document::PredicateFieldValue& value) override; + void visit(const document::RawFieldValue& value) override; + void visit(const document::StructFieldValue& value) override; + void visit(const document::WeightedSetFieldValue& value) override; + void visit(const document::TensorFieldValue& value) override; + void visit(const document::ReferenceFieldValue& value) override; +public: + SlimeFiller(vespalib::slime::Inserter& inserter, bool tokenize); + SlimeFiller(vespalib::slime::Inserter& inserter, bool tokenize, const std::vector<uint32_t>* matching_elems); + ~SlimeFiller() override; +}; + +} diff --git a/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp index 0542423ab75..78d9f7d10a1 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp @@ -1,16 +1,10 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "summaryfieldconverter.h" +#include "annotation_converter.h" #include "check_undefined_value_visitor.h" -#include "linguisticsannotation.h" -#include "resultconfig.h" #include "searchdatatype.h" -#include <vespa/document/annotation/alternatespanlist.h> -#include <vespa/document/annotation/annotation.h> -#include <vespa/document/annotation/spantree.h> -#include <vespa/document/annotation/spantreevisitor.h> -#include <vespa/document/datatype/documenttype.h> -#include <vespa/document/datatype/positiondatatype.h> +#include "slime_filler.h" #include <vespa/document/fieldvalue/arrayfieldvalue.h> #include <vespa/document/fieldvalue/boolfieldvalue.h> #include <vespa/document/fieldvalue/bytefieldvalue.h> @@ -27,33 +21,18 @@ #include <vespa/document/fieldvalue/annotationreferencefieldvalue.h> #include <vespa/document/fieldvalue/tensorfieldvalue.h> #include <vespa/document/fieldvalue/referencefieldvalue.h> -#include <vespa/eval/eval/value_codec.h> -#include <vespa/juniper/juniper_separators.h> -#include <vespa/searchcommon/common/schema.h> -#include <vespa/searchlib/util/url.h> -#include <vespa/vespalib/geo/zcurve.h> #include <vespa/vespalib/stllike/asciistream.h> #include <vespa/vespalib/util/size_literals.h> -#include <vespa/vespalib/util/stringfmt.h> #include <vespa/vespalib/data/slime/slime.h> #include <vespa/vespalib/data/smart_buffer.h> -#include <vespa/vespalib/objects/nbostream.h> -#include <vespa/vespalib/util/exceptions.h> - -using document::AlternateSpanList; -using document::Annotation; using document::AnnotationReferenceFieldValue; -using document::ArrayDataType; using document::ArrayFieldValue; using document::BoolFieldValue; using document::ByteFieldValue; -using document::DataType; using document::Document; -using document::DocumentType; using document::DoubleFieldValue; using document::FieldValue; -using document::FixedTypeRepo; using document::ConstFieldValueVisitor; using document::FloatFieldValue; using document::IntFieldValue; @@ -62,126 +41,15 @@ using document::MapFieldValue; using document::PredicateFieldValue; using document::RawFieldValue; using document::ShortFieldValue; -using document::Span; -using document::SpanList; -using document::SimpleSpanList; -using document::SpanNode; -using document::SpanTree; -using document::SpanTreeVisitor; using document::StringFieldValue; using document::StructFieldValue; -using document::WeightedSetDataType; using document::WeightedSetFieldValue; using document::TensorFieldValue; using document::ReferenceFieldValue; -using search::index::Schema; -using search::util::URL; -using std::make_pair; -using std::pair; -using std::vector; -using vespalib::asciistream; -using vespalib::geo::ZCurve; -using vespalib::make_string; -using vespalib::string; -using vespalib::stringref; namespace search::docsummary { namespace { -string getSpanString(const string &s, const Span &span) { - return string(&s[span.from()], &s[span.from() + span.length()]); -} - -struct SpanFinder : SpanTreeVisitor { - int32_t begin_pos; - int32_t end_pos; - - SpanFinder() : begin_pos(0x7fffffff), end_pos(-1) {} - Span span() { return Span(begin_pos, end_pos - begin_pos); } - - void visit(const Span &node) override { - begin_pos = std::min(begin_pos, node.from()); - end_pos = std::max(end_pos, node.from() + node.length()); - } - void visit(const SpanList &node) override { - for (const auto & span_ : node) { - span_->accept(*this); - } - } - void visit(const SimpleSpanList &node) override { - for (const auto & span_ : node) { - span_.accept(*this); - } - } - void visit(const AlternateSpanList &node) override { - for (size_t i = 0; i < node.getNumSubtrees(); ++i) { - visit(node.getSubtree(i)); - } - } -}; - -Span getSpan(const SpanNode &span_node) { - SpanFinder finder; - span_node.accept(finder); - return finder.span(); -} - -// Extract the FieldValues from all TERM annotations. For each span -// with such annotations, the Handler is invoked with a set of -// iterators over the FieldValues for that span. -template <typename Handler> -void handleIndexingTerms(Handler &handler, const StringFieldValue &value) { - StringFieldValue::SpanTrees trees = value.getSpanTrees(); - const SpanTree *tree = StringFieldValue::findTree(trees, linguistics::SPANTREE_NAME); - typedef pair<Span, const FieldValue *> SpanTerm; - typedef vector<SpanTerm> SpanTermVector; - if (!tree) { - // Treat a string without annotations as a single span. - SpanTerm str(Span(0, handler.text.size()), - static_cast<const FieldValue*>(nullptr)); - handler.handleAnnotations(str.first, &str, &str + 1); - return; - } - SpanTermVector terms; - for (const Annotation & annotation : *tree) { - // For now, skip any composite spans. - const Span *span = dynamic_cast<const Span*>(annotation.getSpanNode()); - if ((span != nullptr) && annotation.valid() && - (annotation.getType() == *linguistics::TERM)) { - terms.push_back(make_pair(getSpan(*span), - annotation.getFieldValue())); - } - } - sort(terms.begin(), terms.end()); - auto it = terms.begin(); - auto ite = terms.end(); - int32_t endPos = 0; - for (; it != ite; ) { - auto it_begin = it; - if (it_begin->first.from() > endPos) { - Span tmpSpan(endPos, it_begin->first.from() - endPos); - handler.handleAnnotations(tmpSpan, it, it); - endPos = it_begin->first.from(); - } - for (; it != ite && it->first == it_begin->first; ++it); - handler.handleAnnotations(it_begin->first, it_begin, it); - endPos = it_begin->first.from() + it_begin->first.length(); - } - int32_t wantEndPos = handler.text.size(); - if (endPos < wantEndPos) { - Span tmpSpan(endPos, wantEndPos - endPos); - handler.handleAnnotations(tmpSpan, ite, ite); - } -} - -const StringFieldValue &ensureStringFieldValue(const FieldValue &value) __attribute__((noinline)); - -const StringFieldValue &ensureStringFieldValue(const FieldValue &value) { - if (!value.isA(FieldValue::Type::STRING)) { - throw vespalib::IllegalArgumentException("Illegal field type. " + value.toString(), VESPA_STRLOC); - } - return static_cast<const StringFieldValue &>(value); -} struct FieldValueConverter { virtual FieldValue::UP convert(const FieldValue &input) = 0; @@ -189,47 +57,9 @@ struct FieldValueConverter { }; -struct SummaryHandler { - const string text; - asciistream &out; - - SummaryHandler(const string &s, asciistream &stream) - : text(s), out(stream) {} - - template <typename ForwardIt> - void handleAnnotations(const Span &span, ForwardIt it, ForwardIt last) { - int annCnt = (last - it); - if (annCnt > 1 || (annCnt == 1 && it->second)) { - annotateSpans(span, it, last); - } else { - out << getSpanString(text, span) << juniper::separators::unit_separator_string; - } - } - - template <typename ForwardIt> - void annotateSpans(const Span &span, ForwardIt it, ForwardIt last) { - out << juniper::separators::interlinear_annotation_anchor_string // ANCHOR - << (getSpanString(text, span)) - << juniper::separators::interlinear_annotation_separator_string; // SEPARATOR - while (it != last) { - if (it->second) { - out << ensureStringFieldValue(*it->second).getValue(); - } else { - out << getSpanString(text, span); - } - if (++it != last) { - out << " "; - } - } - out << juniper::separators::interlinear_annotation_terminator_string // TERMINATOR - << juniper::separators::unit_separator_string; - } -}; - - class SummaryFieldValueConverter : protected ConstFieldValueVisitor { - asciistream _str; + vespalib::asciistream _str; bool _tokenize; FieldValue::UP _field_value; FieldValueConverter &_structuredFieldConverter; @@ -251,8 +81,8 @@ class SummaryFieldValueConverter : protected ConstFieldValueVisitor void visit(const StringFieldValue &value) override { if (_tokenize) { - SummaryHandler handler(value.getValue(), _str); - handleIndexingTerms(handler, value); + AnnotationConverter converter(value.getValue(), _str); + converter.handleIndexingTerms(value); } else { _str << value.getValue(); } @@ -333,243 +163,6 @@ SummaryFieldValueConverter::~SummaryFieldValueConverter() = default; using namespace vespalib::slime::convenience; - - -class SlimeFiller : public ConstFieldValueVisitor { -private: - class MapFieldValueInserter { - private: - Cursor& _array; - Symbol _key_sym; - Symbol _val_sym; - bool _tokenize; - - public: - MapFieldValueInserter(Inserter& parent_inserter, bool tokenize) - : _array(parent_inserter.insertArray()), - _key_sym(_array.resolve("key")), - _val_sym(_array.resolve("value")), - _tokenize(tokenize) - { - } - void insert_entry(const FieldValue& key, const FieldValue& value) { - Cursor& c = _array.addObject(); - ObjectSymbolInserter ki(c, _key_sym); - ObjectSymbolInserter vi(c, _val_sym); - SlimeFiller key_conv(ki, _tokenize); - SlimeFiller val_conv(vi, _tokenize); - - key.accept(key_conv); - value.accept(val_conv); - } - }; - - Inserter &_inserter; - bool _tokenize; - const std::vector<uint32_t>* _matching_elems; - - bool filter_matching_elements() const { - return _matching_elems != nullptr; - } - - template <typename Value> - bool empty_or_empty_after_filtering(const Value& value) const { - return (value.isEmpty() || (filter_matching_elements() && (_matching_elems->empty() || _matching_elems->back() >= value.size()))); - } - - void visit(const AnnotationReferenceFieldValue & v ) override { - (void)v; - Cursor &c = _inserter.insertObject(); - Memory key("error"); - Memory val("cannot convert from annotation reference field"); - c.setString(key, val); - } - void visit(const Document & v) override { - (void)v; - Cursor &c = _inserter.insertObject(); - Memory key("error"); - Memory val("cannot convert from field of type document"); - c.setString(key, val); - } - - void visit(const MapFieldValue & v) override { - if (empty_or_empty_after_filtering(v)) { - return; - } - MapFieldValueInserter map_inserter(_inserter, _tokenize); - if (filter_matching_elements()) { - assert(v.has_no_erased_keys()); - for (uint32_t id_to_keep : (*_matching_elems)) { - auto entry = v[id_to_keep]; - map_inserter.insert_entry(*entry.first, *entry.second); - } - } else { - for (const auto &entry : v) { - map_inserter.insert_entry(*entry.first, *entry.second); - } - } - } - - void visit(const ArrayFieldValue &value) override { - if (empty_or_empty_after_filtering(value)) { - return; - } - Cursor &a = _inserter.insertArray(); - ArrayInserter ai(a); - SlimeFiller conv(ai, _tokenize); - if (filter_matching_elements()) { - for (uint32_t id_to_keep : (*_matching_elems)) { - value[id_to_keep].accept(conv); - } - } else { - for (const FieldValue &fv : value) { - fv.accept(conv); - } - } - } - - void visit(const StringFieldValue &value) override { - if (_tokenize) { - asciistream tmp; - SummaryHandler handler(value.getValue(), tmp); - handleIndexingTerms(handler, value); - _inserter.insertString(Memory(tmp.str())); - } else { - _inserter.insertString(Memory(value.getValue())); - } - } - - void visit(const IntFieldValue &value) override { - int32_t v = value.getValue(); - _inserter.insertLong(v); - } - void visit(const LongFieldValue &value) override { - int64_t v = value.getValue(); - _inserter.insertLong(v); - } - void visit(const ShortFieldValue &value) override { - int16_t v = value.getValue(); - _inserter.insertLong(v); - } - void visit(const ByteFieldValue &value) override { - int8_t v = value.getAsByte(); - _inserter.insertLong(v); - } - void visit(const BoolFieldValue &value) override { - bool v = value.getValue(); - _inserter.insertBool(v); - } - void visit(const DoubleFieldValue &value) override { - double v = value.getValue(); - _inserter.insertDouble(v); - } - void visit(const FloatFieldValue &value) override { - float v = value.getValue(); - _inserter.insertDouble(v); - } - - void visit(const PredicateFieldValue &value) override { - _inserter.insertString(value.toString()); - } - - void visit(const RawFieldValue &value) override { - std::pair<const char *, size_t> buf = value.getAsRaw(); - _inserter.insertData(Memory(buf.first, buf.second)); - } - - void visit(const StructFieldValue &value) override { - if (value.getDataType() == &document::PositionDataType::getInstance() - && ResultConfig::wantedV8geoPositions()) - { - auto xv = value.getValue("x"); - auto yv = value.getValue("y"); - if (xv && yv) { - Cursor &c = _inserter.insertObject(); - c.setDouble("lat", double(yv->getAsInt()) / 1.0e6); - c.setDouble("lng", double(xv->getAsInt()) / 1.0e6); - return; - } - } - if (*value.getDataType() == *SearchDataType::URI) { - FieldValue::UP uriAllValue = value.getValue("all"); - if (uriAllValue && uriAllValue->isA(FieldValue::Type::STRING)) { - uriAllValue->accept(*this); - return; - } - } - Cursor &c = _inserter.insertObject(); - for (StructFieldValue::const_iterator itr = value.begin(); itr != value.end(); ++itr) { - Memory keymem(itr.field().getName()); - ObjectInserter vi(c, keymem); - SlimeFiller conv(vi, _tokenize); - FieldValue::UP nextValue(value.getValue(itr.field())); - (*nextValue).accept(conv); - } - } - - void visit(const WeightedSetFieldValue &value) override { - if (empty_or_empty_after_filtering(value)) { - return; - } - Cursor &a = _inserter.insertArray(); - Symbol isym = a.resolve("item"); - Symbol wsym = a.resolve("weight"); - using matching_elements_iterator_type = std::vector<uint32_t>::const_iterator; - matching_elements_iterator_type matching_elements_itr; - matching_elements_iterator_type matching_elements_itr_end; - if (filter_matching_elements()) { - matching_elements_itr = _matching_elems->begin(); - matching_elements_itr_end = _matching_elems->end(); - } - uint32_t idx = 0; - for (const auto & entry : value) { - if (filter_matching_elements()) { - if (matching_elements_itr == matching_elements_itr_end || - idx < *matching_elements_itr) { - ++idx; - continue; - } - ++matching_elements_itr; - } - Cursor &o = a.addObject(); - ObjectSymbolInserter ki(o, isym); - SlimeFiller conv(ki, _tokenize); - entry.first->accept(conv); - int weight = static_cast<const IntFieldValue &>(*entry.second).getValue(); - o.setLong(wsym, weight); - ++idx; - } - } - - void visit(const TensorFieldValue &value) override { - const auto &tensor = value.getAsTensorPtr(); - vespalib::nbostream s; - if (tensor) { - encode_value(*tensor, s); - } - _inserter.insertData(vespalib::Memory(s.peek(), s.size())); - } - - void visit(const ReferenceFieldValue& value) override { - _inserter.insertString(Memory(value.hasValidDocumentId() - ? value.getDocumentId().toString() - : string())); - } - -public: - SlimeFiller(Inserter &inserter, bool tokenize) - : _inserter(inserter), - _tokenize(tokenize), - _matching_elems(nullptr) - {} - - SlimeFiller(Inserter& inserter, bool tokenize, const std::vector<uint32_t>* matching_elems) - : _inserter(inserter), - _tokenize(tokenize), - _matching_elems(matching_elems) - {} -}; - class SlimeConverter : public FieldValueConverter { private: bool _tokenize; |