// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

// NOTE(review): this copy of the file looks like it went through an extraction
// step that dropped everything written between angle brackets. Visible symptoms:
// the sixteen `#include` directives below carry no header name, every `template`
// keyword has lost its parameter list, `std::make_unique(` / `std::unique_ptr` /
// `vespalib::Array` have lost their type arguments, and a stray `>` is left after
// `std::make_unique` in make_term_blueprint(). The text below is kept
// token-for-token (only line structure and comments were touched); the missing
// pieces must be restored from the upstream file before this can compile.

#include "field_index.h"
#include "ordered_field_index_inserter.h"
#include "posting_iterator.h"
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

LOG_SETUP(".searchlib.memoryindex.field_index");

using search::fef::TermFieldMatchDataArray;
using search::index::DocIdAndFeatures;
using search::index::Schema;
using search::index::WordDocElementFeatures;
using search::queryeval::BooleanMatchIteratorWrapper;
using search::queryeval::FieldSpecBase;
using search::queryeval::SearchIterator;
using search::queryeval::SimpleLeafBlueprint;
using vespalib::GenerationHandler;

namespace search::memoryindex {

using vespalib::datastore::EntryRef;

// Convenience constructor: delegates to the three-argument constructor with a
// default-constructed index::FieldLengthInfo.
template
FieldIndex::FieldIndex(const index::Schema& schema, uint32_t fieldId)
    : FieldIndex(schema, fieldId, index::FieldLengthInfo())
{
}

// Forwards schema, field id and field length info to FieldIndexBase,
// default-constructs the posting list store and installs an inserter
// (InserterType) that is constructed with a reference to this index.
template
FieldIndex::FieldIndex(const index::Schema& schema, uint32_t fieldId,
                       const index::FieldLengthInfo& info)
    : FieldIndexBase(schema, fieldId, info),
      _postingListStore()
{
    using InserterType = OrderedFieldIndexInserter;
    _inserter = std::make_unique(*this);
}

// Tears the index down in two phases:
//  1. disable free lists and entry hold lists on both the posting list store
//     and the dictionary, clear every valid posting list referenced from the
//     dictionary and reset the dictionary's ref to a default EntryRef;
//  2. clear the dictionary itself.
// Each phase is followed by freeze() + assign_generation(); the destructor
// ends with incGeneration() and reclaim_memory().
template
FieldIndex::~FieldIndex()
{
    _postingListStore.disableFreeLists();
    _postingListStore.disable_entry_hold_list();
    _dict.disableFreeLists();
    _dict.disable_entry_hold_list();
    // XXX: Kludge
    for (DictionaryTree::Iterator it = _dict.begin(); it.valid(); ++it) {
        EntryRef pidx(it.getData().load_relaxed());
        if (pidx.valid()) {
            _postingListStore.clear(pidx);
            // Overwrite the dictionary's reference with an invalid (default) ref.
            it.getWData().store_release(EntryRef());
        }
    }
    _postingListStore.clearBuilder();
    freeze(); // Flush all pending posting list tree freezes
    assign_generation();
    _dict.clear(); // Clear dictionary
    freeze(); // Flush pending freeze for dictionary tree.
    assign_generation();
    incGeneration();
    reclaim_memory();
}

// Looks `word` up in the live dictionary (comparator built from _wordStore and
// the word itself, probe key is a WordKey holding a default EntryRef).
// Returns an iterator over the word's posting list, obtained from the ref read
// with load_relaxed(), or a default-constructed iterator when the word is not
// in the dictionary.
template
typename FieldIndex::PostingList::Iterator
FieldIndex::find(const vespalib::stringref word) const
{
    DictionaryTree::Iterator itr = _dict.find(WordKey(EntryRef()), KeyComp(_wordStore, word));
    if (itr.valid()) {
        return _postingListStore.begin(itr.getData().load_relaxed());
    }
    return typename PostingList::Iterator();
}

// Same lookup as find(), but against the dictionary's frozen view; the posting
// list ref is read with load_acquire() and the iterator comes from
// beginFrozen(). Returns a default-constructed ConstIterator when the word is
// not found.
template
typename FieldIndex::PostingList::ConstIterator
FieldIndex::findFrozen(const vespalib::stringref word) const
{
    auto itr = _dict.getFrozenView().find(WordKey(EntryRef()), KeyComp(_wordStore, word));
    if (itr.valid()) {
        return _postingListStore.beginFrozen(itr.getData().load_acquire());
    }
    return typename PostingList::ConstIterator();
}

// Compacts the feature store: walks every dictionary entry with a valid
// posting list ref, and for each posting entry asks _featureStore to move its
// features (moveFeatures) and stores the returned ref back into the entry.
// A posting list is either a tree (getClusterSize() == 0) or an array of
// `clusterSize` key/data entries; both shapes are handled the same way.
// Finishes the compaction and assigns the generation handler's current
// generation to the feature store.
template
void
FieldIndex::compactFeatures()
{
    auto compacting_buffers = _featureStore.start_compact();
    auto itr = _dict.begin();
    // The field id is what gets passed as the first argument of moveFeatures().
    uint32_t packedIndex = _fieldId;
    for (; itr.valid(); ++itr) {
        typename PostingListStore::RefType pidx(itr.getData().load_relaxed());
        if (!pidx.valid()) {
            continue;
        }
        uint32_t clusterSize = _postingListStore.getClusterSize(pidx);
        if (clusterSize == 0) {
            // Tree-backed posting list.
            const PostingList *tree = _postingListStore.getTreeEntry(pidx);
            auto pitr = tree->begin(_postingListStore.getAllocator());
            for (; pitr.valid(); ++pitr) {
                const PostingListEntryType& posting_entry(pitr.getData());
                // Filter on which buffers to move features from when
                // performing incremental compaction.
                // NOTE(review): no filtering code is visible here; every entry
                // is handed to moveFeatures(). Confirm whether that is intended.
                EntryRef newFeatures = _featureStore.moveFeatures(packedIndex, posting_entry.get_features_relaxed());
                // Reference the moved data
                posting_entry.update_features(newFeatures);
            }
        } else {
            // Array-backed posting list with `clusterSize` entries.
            const PostingListKeyDataType *shortArray = _postingListStore.getKeyDataEntry(pidx, clusterSize);
            const PostingListKeyDataType *ite = shortArray + clusterSize;
            for (const PostingListKeyDataType *it = shortArray; it < ite; ++it) {
                const PostingListEntryType& posting_entry(it->getData());
                // Filter on which buffers to move features from when
                // performing incremental compaction.
                // NOTE(review): as above, no filter is applied in the visible code.
                EntryRef newFeatures = _featureStore.moveFeatures(packedIndex, posting_entry.get_features_relaxed());
                // Reference the moved data
                posting_entry.update_features(newFeatures);
            }
        }
    }
    using generation_t = GenerationHandler::generation_t;
    compacting_buffers->finish();
    generation_t generation = _generationHandler.getCurrentGeneration();
    _featureStore.assign_generation(generation);
}

// Writes the index content to `indexBuilder`. For every dictionary entry with
// a valid posting list, in dictionary iteration order, it emits:
//   startWord(word), one add_document(features) per posting, endWord().
// For each posting the doc id, number of occurrences and field length are
// copied from the posting entry, and the remaining features are decoded from
// the feature store through `decoder`.
template
void
FieldIndex::dump(search::index::IndexBuilder & indexBuilder)
{
    vespalib::stringref word;
    FeatureStore::DecodeContextCooked decoder(nullptr);
    DocIdAndFeatures features;
    // NOTE(review): wordMap is constructed here but not referenced again in
    // this function.
    vespalib::Array wordMap(_numUniqueWords + 1, 0);
    _featureStore.setupForField(_fieldId, decoder);
    for (auto itr = _dict.begin(); itr.valid(); ++itr) {
        const WordKey & wk = itr.getKey();
        typename PostingListStore::RefType plist(itr.getData().load_relaxed());
        word = _wordStore.getWord(wk._wordRef);
        // Words without a posting list are skipped entirely (no startWord()).
        if (!plist.valid()) {
            continue;
        }
        indexBuilder.startWord(word);
        uint32_t clusterSize = _postingListStore.getClusterSize(plist);
        if (clusterSize == 0) {
            // Tree-backed posting list; asserted to hold at least one entry.
            const PostingList *tree = _postingListStore.getTreeEntry(plist);
            auto pitr = tree->begin(_postingListStore.getAllocator());
            assert(pitr.valid());
            for (; pitr.valid(); ++pitr) {
                features.set_doc_id(pitr.getKey());
                const PostingListEntryType &entry(pitr.getData());
                features.set_num_occs(entry.get_num_occs());
                features.set_field_length(entry.get_field_length());
                _featureStore.setupForReadFeatures(entry.get_features_relaxed(), decoder);
                decoder.readFeatures(features);
                indexBuilder.add_document(features);
            }
        } else {
            // Array-backed posting list with `clusterSize` entries.
            const PostingListKeyDataType *kd = _postingListStore.getKeyDataEntry(plist, clusterSize);
            const PostingListKeyDataType *kde = kd + clusterSize;
            for (; kd != kde; ++kd) {
                features.set_doc_id(kd->_key);
                const PostingListEntryType &entry(kd->getData());
                features.set_num_occs(entry.get_num_occs());
                features.set_field_length(entry.get_field_length());
                _featureStore.setupForReadFeatures(entry.get_features_relaxed(), decoder);
                decoder.readFeatures(features);
                indexBuilder.add_document(features);
            }
        }
        indexBuilder.endWord();
    }
}

// Returns the merged memory usage of the word store, the dictionary, the
// posting list store, the feature store and the remover's store.
template
vespalib::MemoryUsage
FieldIndex::getMemoryUsage() const
{
    vespalib::MemoryUsage usage;
    usage.merge(_wordStore.getMemoryUsage());
    usage.merge(_dict.getMemoryUsage());
    usage.merge(_postingListStore.getMemoryUsage());
    usage.merge(_featureStore.getMemoryUsage());
    usage.merge(_remover.getStore().getMemoryUsage());
    return usage;
}

// Builds a search iterator for `term` by passing the result of find(term), the
// feature store, `field_id` and the (moved) match data to the free function
// search::memoryindex::make_search_iterator.
template
queryeval::SearchIterator::UP
FieldIndex::make_search_iterator(const vespalib::string& term, uint32_t field_id,
                                 fef::TermFieldMatchDataArray match_data) const
{
    return search::memoryindex::make_search_iterator
        (find(term), getFeatureStore(), field_id, std::move(match_data));
}

namespace {

// Leaf blueprint for a single term in a memory field index. It stores the
// generation guard, the posting list iterator, a reference to the feature
// store, the field spec/id, the query term and the bit-vector flag it was
// constructed with.
template
class MemoryTermBlueprint : public SimpleLeafBlueprint {
private:
    using FieldIndexType = FieldIndex;
    using PostingListIteratorType = typename FieldIndexType::PostingList::ConstIterator;
    GenerationHandler::Guard _guard;
    const queryeval::FieldSpec _field;
    PostingListIteratorType _posting_itr;
    const FeatureStore& _feature_store;
    const uint32_t _field_id;
    const vespalib::string _query_term;
    const bool _use_bit_vector;

public:
    // Copies its arguments into the members above; `_guard` is first
    // default-initialised and then move-assigned from `guard` in the body.
    // The hit estimate is built from the posting iterator's size() and a flag
    // that is true when the iterator is not valid (presumably the "empty"
    // flag of HitEstimate; confirm against its declaration).
    MemoryTermBlueprint(GenerationHandler::Guard&& guard,
                        PostingListIteratorType posting_itr,
                        const FeatureStore& feature_store,
                        const queryeval::FieldSpec& field,
                        uint32_t field_id,
                        const vespalib::string& query_term,
                        bool use_bit_vector)
        : SimpleLeafBlueprint(field),
          _guard(),
          _field(field),
          _posting_itr(posting_itr),
          _feature_store(feature_store),
          _field_id(field_id),
          _query_term(query_term),
          _use_bit_vector(use_bit_vector)
    {
        _guard = std::move(guard);
        HitEstimate estimate(_posting_itr.size(), !_posting_itr.valid());
        setEstimate(estimate);
    }

    // Creates the posting iterator for this term. When `_use_bit_vector` is
    // set, the iterator is wrapped (together with `tfmda`) in another object
    // before being returned; the debug log names it BooleanMatchIteratorWrapper.
    // The second (bool) parameter is unused.
    SearchIterator::UP createLeafSearch(const TermFieldMatchDataArray& tfmda, bool) const override {
        auto result = make_search_iterator(_posting_itr, _feature_store, _field_id, tfmda);
        if (_use_bit_vector) {
            LOG(debug, "Return BooleanMatchIteratorWrapper: field_id(%u), doc_count(%zu)",
                _field_id, _posting_itr.size());
            return std::make_unique(std::move(result), tfmda);
        }
        LOG(debug, "Return PostingIterator: field_id(%u), doc_count(%zu)",
            _field_id, _posting_itr.size());
        return result;
    }

    // Creates a wrapper sized by getState().numFields(), builds the posting
    // iterator against the wrapper's own match data array and returns the
    // wrapper. Both parameters are unused.
    SearchIterator::UP createFilterSearch(bool, FilterConstraint) const override {
        auto wrapper = std::make_unique(getState().numFields());
        auto & tfmda = wrapper->tfmda();
        wrapper->wrap(make_search_iterator(_posting_itr, _feature_store, _field_id, tfmda));
        return wrapper;
    }

    // Visits the base class members, then "field_name" and "query_term".
    void visitMembers(vespalib::ObjectVisitor& visitor) const override {
        SimpleLeafBlueprint::visitMembers(visitor);
        visit(visitor, "field_name", _field.getName());
        visit(visitor, "query_term", _query_term);
    }
};

}

// Creates a MemoryTermBlueprint for `term`. A generation guard is taken before
// the frozen lookup and handed to the blueprint, which keeps it as a member
// (presumably so the frozen posting list stays readable for the blueprint's
// lifetime; confirm). The bit-vector flag is taken from field.isFilter().
template
std::unique_ptr
FieldIndex::make_term_blueprint(const vespalib::string& term,
                                const queryeval::FieldSpec& field,
                                uint32_t field_id)
{
    auto guard = takeGenerationGuard();
    auto posting_itr = findFrozen(term);
    bool use_bit_vector = field.isFilter();
    return std::make_unique>
        (std::move(guard), posting_itr, getFeatureStore(), field, field_id, term, use_bit_vector);
}

// Explicit instantiations of FieldIndex (two of them; their template arguments
// are missing in this copy).
template
class FieldIndex;

template
class FieldIndex;

}

using search::memoryindex::FieldIndexBase;

// Explicit instantiations of the B-tree templates used by this file (template
// arguments are missing in this copy).
namespace vespalib::btree {

template
class BTreeNodeDataWrap;

template
class BTreeNodeT;

template
class BTreeNodeTT;

template
class BTreeNodeTT;

template
class BTreeInternalNode;

template
class BTreeLeafNode;

template
class BTreeNodeStore;

template
class BTreeIterator;

template
class BTree;

template
class BTreeRoot;

template
class BTreeRootBase;

template
class BTreeNodeAllocator;

}