diff options
author | Tor Egge <Tor.Egge@broadpark.no> | 2019-06-24 15:18:28 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@broadpark.no> | 2019-06-24 15:18:28 +0200 |
commit | 14e53ea2cfa51ecb2a7863c2d641632a2d08fbc2 (patch) | |
tree | bd24d356e08971d9330e5d95e3107e2e1e255693 /searchlib | |
parent | d3fc7d9efbf656ec9a5623084fc91be0c79789e8 (diff) |
Cap interleaved features in memory index (field_length, num_occs) to prevent
them wrapping around to low values.
Cap reconstucted interleaved features the same way.
Use interleaved features from memory index when writing disk index.
Diffstat (limited to 'searchlib')
4 files changed, 30 insertions, 28 deletions
diff --git a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp index 7f2014a2207..ac1735e0549 100644 --- a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp +++ b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp @@ -679,6 +679,17 @@ TEST_F(FieldIndexInterleavedFeaturesTest, no_features_are_unpacked) expect_features_unpacked("{1000000:}", 0, 0); } +TEST_F(FieldIndexInterleavedFeaturesTest, interleaved_features_are_capped) +{ + FeatureStore::DecodeContextCooked decoder(nullptr); + WrapInserter(idx).word("b").add(11, getFeatures(66001, 66000)).flush(); + auto itr = this->idx.find("b"); + EXPECT_EQ(11, itr.getKey()); + auto &entry = itr.getData(); + EXPECT_EQ(std::numeric_limits<uint16_t>::max(), entry.get_num_occs()); + EXPECT_EQ(std::numeric_limits<uint16_t>::max(), entry.get_field_length()); +} + Schema make_multi_field_schema() { diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp index aeb4e2976b5..ab77a6bfd08 100644 --- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp +++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp @@ -17,6 +17,8 @@ namespace { vespalib::string PosOccIdCooked = "PosOcc.3.Cooked"; vespalib::string interleaved_features("interleaved_features"); +uint16_t cap_u16(uint32_t val) { return std::min(val, static_cast<uint32_t>(std::numeric_limits<uint16_t>::max())); } + } using vespalib::getLastErrorString; @@ -346,8 +348,9 @@ FieldReaderStripInfo::read() if (_hasElements && _field_length_scanner) { field_length = _field_length_scanner->get_field_length(features.doc_id()); } - features.set_field_length(field_length); - features.set_num_occs(num_occs); + // cap interleaved features to 16 bits each, to match memory index + features.set_field_length(cap_u16(field_length)); + features.set_num_occs(cap_u16(num_occs)); } } diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp index 10cd200bdb0..352fd860642 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp @@ -32,20 +32,6 @@ using vespalib::GenerationHandler; namespace search::memoryindex { -namespace { - -void set_interleaved_features(DocIdAndFeatures &features) -{ - // Set cheap features based on normal features. - // TODO: Update when proper cheap features are present in memory index. - assert(!features.elements().empty()); - const auto &element = features.elements().front(); - features.set_field_length(element.getElementLen()); - features.set_num_occs(element.getNumOccs()); -} - -} - using datastore::EntryRef; template <bool interleaved_features> @@ -194,12 +180,12 @@ FieldIndex<interleaved_features>::dump(search::index::IndexBuilder & indexBuilde auto pitr = tree->begin(_postingListStore.getAllocator()); assert(pitr.valid()); for (; pitr.valid(); ++pitr) { - uint32_t docId = pitr.getKey(); - EntryRef featureRef(pitr.getData().get_features()); - _featureStore.setupForReadFeatures(featureRef, decoder); + features.set_doc_id(pitr.getKey()); + const PostingListEntryType &entry(pitr.getData()); + features.set_num_occs(entry.get_num_occs()); + features.set_field_length(entry.get_field_length()); + _featureStore.setupForReadFeatures(entry.get_features(), decoder); decoder.readFeatures(features); - features.set_doc_id(docId); - set_interleaved_features(features); indexBuilder.add_document(features); } } else { @@ -207,12 +193,12 @@ FieldIndex<interleaved_features>::dump(search::index::IndexBuilder & indexBuilde _postingListStore.getKeyDataEntry(plist, clusterSize); const PostingListKeyDataType *kde = kd + clusterSize; for (; kd != kde; ++kd) { - uint32_t docId = kd->_key; - EntryRef featureRef(kd->getData().get_features()); - _featureStore.setupForReadFeatures(featureRef, decoder); + features.set_doc_id(kd->_key); + const PostingListEntryType &entry(kd->getData()); + features.set_num_occs(entry.get_num_occs()); + features.set_field_length(entry.get_field_length()); + _featureStore.setupForReadFeatures(entry.get_features(), decoder); decoder.readFeatures(features); - features.set_doc_id(docId); - set_interleaved_features(features); indexBuilder.add_document(features); } } diff --git a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp index c75087e8577..4eda7cf48bd 100644 --- a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp +++ b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp @@ -25,6 +25,8 @@ namespace { const vespalib::string emptyWord = ""; +uint16_t cap_u16(uint32_t val) { return std::min(val, static_cast<uint32_t>(std::numeric_limits<uint16_t>::max())); } + } template <bool interleaved_features> @@ -119,8 +121,8 @@ OrderedFieldIndexInserter<interleaved_features>::add(uint32_t docId, (_prevDocId == docId && !_prevAdd)); datastore::EntryRef featureRef = _fieldIndex.addFeatures(features); _adds.push_back(PostingListKeyDataType(docId, PostingListEntryType(featureRef, - features.num_occs(), - features.field_length()))); + cap_u16(features.num_occs()), + cap_u16(features.field_length())))); _listener.insert(_dItr.getKey()._wordRef, docId); _prevDocId = docId; _prevAdd = true; |