summaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
authorTor Egge <Tor.Egge@broadpark.no>2019-06-24 15:18:28 +0200
committerTor Egge <Tor.Egge@broadpark.no>2019-06-24 15:18:28 +0200
commit14e53ea2cfa51ecb2a7863c2d641632a2d08fbc2 (patch)
treebd24d356e08971d9330e5d95e3107e2e1e255693 /searchlib
parentd3fc7d9efbf656ec9a5623084fc91be0c79789e8 (diff)
Cap interleaved features in memory index (field_length, num_occs) to prevent
them wrapping around to low values. Cap reconstucted interleaved features the same way. Use interleaved features from memory index when writing disk index.
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/tests/memoryindex/field_index/field_index_test.cpp11
-rw-r--r--searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp7
-rw-r--r--searchlib/src/vespa/searchlib/memoryindex/field_index.cpp34
-rw-r--r--searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp6
4 files changed, 30 insertions, 28 deletions
diff --git a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp
index 7f2014a2207..ac1735e0549 100644
--- a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp
+++ b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp
@@ -679,6 +679,17 @@ TEST_F(FieldIndexInterleavedFeaturesTest, no_features_are_unpacked)
expect_features_unpacked("{1000000:}", 0, 0);
}
+TEST_F(FieldIndexInterleavedFeaturesTest, interleaved_features_are_capped)
+{
+ FeatureStore::DecodeContextCooked decoder(nullptr);
+ WrapInserter(idx).word("b").add(11, getFeatures(66001, 66000)).flush();
+ auto itr = this->idx.find("b");
+ EXPECT_EQ(11, itr.getKey());
+ auto &entry = itr.getData();
+ EXPECT_EQ(std::numeric_limits<uint16_t>::max(), entry.get_num_occs());
+ EXPECT_EQ(std::numeric_limits<uint16_t>::max(), entry.get_field_length());
+}
+
Schema
make_multi_field_schema()
{
diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
index aeb4e2976b5..ab77a6bfd08 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
@@ -17,6 +17,8 @@ namespace {
vespalib::string PosOccIdCooked = "PosOcc.3.Cooked";
vespalib::string interleaved_features("interleaved_features");
+uint16_t cap_u16(uint32_t val) { return std::min(val, static_cast<uint32_t>(std::numeric_limits<uint16_t>::max())); }
+
}
using vespalib::getLastErrorString;
@@ -346,8 +348,9 @@ FieldReaderStripInfo::read()
if (_hasElements && _field_length_scanner) {
field_length = _field_length_scanner->get_field_length(features.doc_id());
}
- features.set_field_length(field_length);
- features.set_num_occs(num_occs);
+ // cap interleaved features to 16 bits each, to match memory index
+ features.set_field_length(cap_u16(field_length));
+ features.set_num_occs(cap_u16(num_occs));
}
}
diff --git a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp
index 10cd200bdb0..352fd860642 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/field_index.cpp
@@ -32,20 +32,6 @@ using vespalib::GenerationHandler;
namespace search::memoryindex {
-namespace {
-
-void set_interleaved_features(DocIdAndFeatures &features)
-{
- // Set cheap features based on normal features.
- // TODO: Update when proper cheap features are present in memory index.
- assert(!features.elements().empty());
- const auto &element = features.elements().front();
- features.set_field_length(element.getElementLen());
- features.set_num_occs(element.getNumOccs());
-}
-
-}
-
using datastore::EntryRef;
template <bool interleaved_features>
@@ -194,12 +180,12 @@ FieldIndex<interleaved_features>::dump(search::index::IndexBuilder & indexBuilde
auto pitr = tree->begin(_postingListStore.getAllocator());
assert(pitr.valid());
for (; pitr.valid(); ++pitr) {
- uint32_t docId = pitr.getKey();
- EntryRef featureRef(pitr.getData().get_features());
- _featureStore.setupForReadFeatures(featureRef, decoder);
+ features.set_doc_id(pitr.getKey());
+ const PostingListEntryType &entry(pitr.getData());
+ features.set_num_occs(entry.get_num_occs());
+ features.set_field_length(entry.get_field_length());
+ _featureStore.setupForReadFeatures(entry.get_features(), decoder);
decoder.readFeatures(features);
- features.set_doc_id(docId);
- set_interleaved_features(features);
indexBuilder.add_document(features);
}
} else {
@@ -207,12 +193,12 @@ FieldIndex<interleaved_features>::dump(search::index::IndexBuilder & indexBuilde
_postingListStore.getKeyDataEntry(plist, clusterSize);
const PostingListKeyDataType *kde = kd + clusterSize;
for (; kd != kde; ++kd) {
- uint32_t docId = kd->_key;
- EntryRef featureRef(kd->getData().get_features());
- _featureStore.setupForReadFeatures(featureRef, decoder);
+ features.set_doc_id(kd->_key);
+ const PostingListEntryType &entry(kd->getData());
+ features.set_num_occs(entry.get_num_occs());
+ features.set_field_length(entry.get_field_length());
+ _featureStore.setupForReadFeatures(entry.get_features(), decoder);
decoder.readFeatures(features);
- features.set_doc_id(docId);
- set_interleaved_features(features);
indexBuilder.add_document(features);
}
}
diff --git a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp
index c75087e8577..4eda7cf48bd 100644
--- a/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp
+++ b/searchlib/src/vespa/searchlib/memoryindex/ordered_field_index_inserter.cpp
@@ -25,6 +25,8 @@ namespace {
const vespalib::string emptyWord = "";
+uint16_t cap_u16(uint32_t val) { return std::min(val, static_cast<uint32_t>(std::numeric_limits<uint16_t>::max())); }
+
}
template <bool interleaved_features>
@@ -119,8 +121,8 @@ OrderedFieldIndexInserter<interleaved_features>::add(uint32_t docId,
(_prevDocId == docId && !_prevAdd));
datastore::EntryRef featureRef = _fieldIndex.addFeatures(features);
_adds.push_back(PostingListKeyDataType(docId, PostingListEntryType(featureRef,
- features.num_occs(),
- features.field_length())));
+ cap_u16(features.num_occs()),
+ cap_u16(features.field_length()))));
_listener.insert(_dItr.getKey()._wordRef, docId);
_prevDocId = docId;
_prevAdd = true;