summary | refs | log | tree | commit | diff | stats
path: root/searchlib
diff options
context:
space:
mode:
author: Tor Egge <Tor.Egge@broadpark.no> 2019-06-06 14:56:29 +0200
committer: Tor Egge <Tor.Egge@broadpark.no> 2019-06-06 18:54:56 +0200
commit: 1d079165f38e46422e4b4f51189c97b3fbe9d125 (patch)
tree: 02dbd194f079452f6296370a128b93b254033d12 /searchlib
parent: d1005b01fc79f56049ca6244c31dd3d15b64492c (diff)
Save average field length in posting list file header.
Diffstat (limited to 'searchlib')
-rw-r--r-- searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp | 3
-rw-r--r-- searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp | 19
-rw-r--r-- searchlib/src/tests/diskindex/fieldwriter/fieldwriter_test.cpp | 10
-rw-r--r-- searchlib/src/tests/diskindex/fusion/fusion_test.cpp | 3
-rw-r--r-- searchlib/src/tests/memoryindex/field_index/field_index_test.cpp | 6
-rw-r--r-- searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.cpp | 49
-rw-r--r-- searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.h | 4
-rw-r--r-- searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.cpp | 7
-rw-r--r-- searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.h | 1
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/diskindex.cpp | 11
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/diskindex.h | 3
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/extposocc.cpp | 7
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/extposocc.h | 4
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp | 6
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/fieldreader.h | 1
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp | 5
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/fieldwriter.h | 1
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/fusion.cpp | 14
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/fusion.h | 3
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp | 18
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/indexbuilder.h | 2
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp | 15
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zcposocc.h | 10
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp | 6
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h | 1
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/zcposting.h | 3
-rw-r--r-- searchlib/src/vespa/searchlib/index/postinglistfile.h | 5
-rw-r--r-- searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp | 22
28 files changed, 214 insertions, 25 deletions
diff --git a/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp b/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp
index c562fea69c1..83b5c05effb 100644
--- a/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp
+++ b/searchlib/src/tests/diskindex/bitvector/bitvector_test.cpp
@@ -2,6 +2,7 @@
#include <vespa/log/log.h>
LOG_SETUP("bitvector_test");
#include <vespa/vespalib/testkit/testapp.h>
+#include <vespa/searchlib/index/field_length_info.h>
#include <vespa/searchlib/diskindex/bitvectordictionary.h>
#include <vespa/searchlib/diskindex/fieldwriter.h>
#include <vespa/searchlib/index/dummyfileheadercontext.h>
@@ -47,7 +48,7 @@ FieldWriterWrapper::open(const std::string &path,
const common::FileHeaderContext &fileHeaderContext)
{
vespalib::mkdir(path, false);
- return _writer.open(path, 64, 10000, false, false, schema, indexId, tuneFileWrite, fileHeaderContext);
+ return _writer.open(path, 64, 10000, false, false, schema, indexId, FieldLengthInfo(), tuneFileWrite, fileHeaderContext);
}
FieldWriterWrapper &
diff --git a/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp b/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp
index 7e24511435b..82a6f973cf9 100644
--- a/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp
+++ b/searchlib/src/tests/diskindex/diskindex/diskindex_test.cpp
@@ -57,6 +57,7 @@ private:
void requireThatLookupIsWorking(bool fieldEmpty, bool docEmpty, bool wordEmpty);
void requireThatWeCanReadPostingList();
+ void require_that_we_can_get_field_length_info();
void requireThatWeCanReadBitVector();
void requireThatBlueprintIsCreated();
void requireThatBlueprintCanCreateSearchIterators();
@@ -193,6 +194,20 @@ Test::requireThatWeCanReadPostingList()
}
void
+Test::require_that_we_can_get_field_length_info()
+{
+ auto info = _index->get_field_length_info("f1");
+ EXPECT_EQUAL(3.5, info.get_average_field_length());
+ EXPECT_EQUAL(21u, info.get_num_samples());
+ info = _index->get_field_length_info("f2");
+ EXPECT_EQUAL(4.0, info.get_average_field_length());
+ EXPECT_EQUAL(23u, info.get_num_samples());
+ info = _index->get_field_length_info("f3");
+ EXPECT_EQUAL(0.0, info.get_average_field_length());
+ EXPECT_EQUAL(0u, info.get_num_samples());
+}
+
+void
Test::requireThatWeCanReadBitVector()
{
{ // word 'w1'
@@ -323,6 +338,7 @@ Test::Main()
TEST_DO(openIndex("index/1", false, false, false, false, false));
TEST_DO(requireThatLookupIsWorking(false, false, false));
TEST_DO(requireThatWeCanReadPostingList());
+ TEST_DO(require_that_we_can_get_field_length_info());
TEST_DO(requireThatWeCanReadBitVector());
TEST_DO(requireThatBlueprintIsCreated());
TEST_DO(requireThatBlueprintCanCreateSearchIterators());
@@ -330,6 +346,7 @@ Test::Main()
TEST_DO(openIndex("index/2", true, false, false, false, false));
TEST_DO(requireThatLookupIsWorking(false, false, false));
TEST_DO(requireThatWeCanReadPostingList());
+ TEST_DO(require_that_we_can_get_field_length_info());
TEST_DO(requireThatWeCanReadBitVector());
TEST_DO(requireThatBlueprintIsCreated());
TEST_DO(requireThatBlueprintCanCreateSearchIterators());
@@ -337,6 +354,7 @@ Test::Main()
TEST_DO(openIndex("index/3", false, true, false, false, false));
TEST_DO(requireThatLookupIsWorking(false, false, false));
TEST_DO(requireThatWeCanReadPostingList());
+ TEST_DO(require_that_we_can_get_field_length_info());
TEST_DO(requireThatWeCanReadBitVector());
TEST_DO(requireThatBlueprintIsCreated());
TEST_DO(requireThatBlueprintCanCreateSearchIterators());
@@ -344,6 +362,7 @@ Test::Main()
TEST_DO(openIndex("index/4", true, true, false, false, false));
TEST_DO(requireThatLookupIsWorking(false, false, false));
TEST_DO(requireThatWeCanReadPostingList());
+ TEST_DO(require_that_we_can_get_field_length_info());
TEST_DO(requireThatWeCanReadBitVector());
TEST_DO(requireThatBlueprintIsCreated());
TEST_DO(requireThatBlueprintCanCreateSearchIterators());
diff --git a/searchlib/src/tests/diskindex/fieldwriter/fieldwriter_test.cpp b/searchlib/src/tests/diskindex/fieldwriter/fieldwriter_test.cpp
index cb38cd23409..2b9f8a5b201 100644
--- a/searchlib/src/tests/diskindex/fieldwriter/fieldwriter_test.cpp
+++ b/searchlib/src/tests/diskindex/fieldwriter/fieldwriter_test.cpp
@@ -6,6 +6,7 @@
#include <vespa/searchlib/test/fakedata/fakeword.h>
#include <vespa/searchlib/test/fakedata/fakewordset.h>
#include <vespa/searchlib/index/docidandfeatures.h>
+#include <vespa/searchlib/index/field_length_info.h>
#include <vespa/searchlib/index/postinglisthandle.h>
#include <vespa/searchlib/diskindex/zcposocc.h>
#include <vespa/searchlib/diskindex/zcposoccrandread.h>
@@ -38,6 +39,7 @@ using search::fakedata::FakeWord;
using search::fakedata::FakeWordSet;
using search::fef::TermFieldMatchData;
using search::fef::TermFieldMatchDataArray;
+using search::index::FieldLengthInfo;
using search::index::DummyFileHeaderContext;
using search::index::PostingListCounts;
using search::index::PostingListOffsetAndCounts;
@@ -200,6 +202,7 @@ WrappedFieldWriter::open()
minSkipDocs, minChunkDocs,
_dynamicK, _encode_cheap_features,
_schema, _indexId,
+ FieldLengthInfo(4.5, 42),
tuneFileWrite, fileHeaderContext);
}
@@ -427,6 +430,10 @@ readField(FakeWordSet &wordSet,
if (istate._fieldReader->isValid())
istate._fieldReader->read();
+ auto field_length_info = istate._fieldReader->get_field_length_info();
+ assert(4.5 == field_length_info.get_average_field_length());
+ assert(42u == field_length_info.get_num_samples());
+
TermFieldMatchData mdfield1;
unsigned int wordNum = 1;
@@ -503,6 +510,9 @@ randReadField(FakeWordSet &wordSet,
bool openPostingRes = postingFile->open(pname, tuneFileRandRead);
assert(openPostingRes);
(void) openPostingRes;
+ auto field_length_info = postingFile->get_field_length_info();
+ assert(4.5 == field_length_info.get_average_field_length());
+ assert(42u == field_length_info.get_num_samples());
for (int loop = 0; loop < 1; ++loop) {
unsigned int wordNum = 1;
diff --git a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp
index 694af2f1ad1..16d2a04df2e 100644
--- a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp
+++ b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp
@@ -326,6 +326,7 @@ Test::requireThatFusionIsWorking(const vespalib::string &prefix, bool directio,
uint32_t numDocs = 12 + 1;
uint32_t numWords = fic.getNumUniqueWords();
bool dynamicKPosOcc = false;
+ MockFieldLengthInspector mock_field_length_inspector;
TuneFileIndexing tuneFileIndexing;
TuneFileSearch tuneFileSearch;
DummyFileHeaderContext fileHeaderContext;
@@ -336,7 +337,7 @@ Test::requireThatFusionIsWorking(const vespalib::string &prefix, bool directio,
}
if (readmmap)
tuneFileSearch._read.setWantMemoryMap();
- ib.open(numDocs, numWords, tuneFileIndexing, fileHeaderContext);
+ ib.open(numDocs, numWords, mock_field_length_inspector, tuneFileIndexing, fileHeaderContext);
fic.dump(ib);
ib.close();
diff --git a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp
index 234cf9b5e84..05c905cdc32 100644
--- a/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp
+++ b/searchlib/src/tests/memoryindex/field_index/field_index_test.cpp
@@ -740,7 +740,7 @@ TEST_F(FieldIndexCollectionTest, require_that_dumping_words_with_no_docs_to_inde
b.setPrefix("dump");
TuneFileIndexing tuneFileIndexing;
DummyFileHeaderContext fileHeaderContext;
- b.open(5, 2, tuneFileIndexing, fileHeaderContext);
+ b.open(5, 2, MockFieldLengthInspector(), tuneFileIndexing, fileHeaderContext);
fic.dump(b);
b.close();
}
@@ -1210,7 +1210,9 @@ TEST_F(UriInverterTest, require_that_uri_indexing_is_working)
dib.setPrefix("urldump");
TuneFileIndexing tuneFileIndexing;
DummyFileHeaderContext fileHeaderContext;
- dib.open(11, _fic.getNumUniqueWords(), tuneFileIndexing,
+ dib.open(11, _fic.getNumUniqueWords(),
+ MockFieldLengthInspector(),
+ tuneFileIndexing,
fileHeaderContext);
_fic.dump(dib);
dib.close();
diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.cpp b/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.cpp
index 006c0b29ffb..b789bf16947 100644
--- a/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.cpp
+++ b/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.cpp
@@ -21,7 +21,8 @@ PosOccFieldParams::PosOccFieldParams()
_hasElementWeights(false),
_avgElemLen(512),
_collectionType(SINGLE),
- _name()
+ _name(),
+ _field_length_info()
{ }
@@ -128,14 +129,37 @@ PosOccFieldParams::setSchemaParams(const Schema &schema, uint32_t fieldId)
_name = field.getName();
}
+namespace {
+
+vespalib::string field_length_infix = "field_length.";
+
+struct FieldLengthKeys {
+ vespalib::string _average;
+ vespalib::string _samples;
+ FieldLengthKeys(const vespalib::string &prefix);
+ ~FieldLengthKeys();
+};
+
+FieldLengthKeys::FieldLengthKeys(const vespalib::string &prefix)
+ : _average(prefix + field_length_infix + "average"),
+ _samples(prefix + field_length_infix + "samples")
+{
+}
+
+FieldLengthKeys::~FieldLengthKeys() = default;
+
+}
void
-PosOccFieldParams::readHeader(const vespalib::GenericHeader &header,
+PosOccFieldParams::readHeader(const GenericHeader &header,
const vespalib::string &prefix)
{
+ using Tag = GenericHeader::Tag;
vespalib::string nameKey(prefix + "fieldName");
vespalib::string collKey(prefix + "collectionType");
vespalib::string avgElemLenKey(prefix + "avgElemLen");
+ FieldLengthKeys field_length_keys(prefix);
+
_name = header.getTag(nameKey).asString();
Schema::CollectionType ct = schema::collectionTypeFromName(header.getTag(collKey).asString());
switch (ct) {
@@ -158,17 +182,28 @@ PosOccFieldParams::readHeader(const vespalib::GenericHeader &header,
LOG_ABORT("Bad collection type when reading field param in header");
}
_avgElemLen = header.getTag(avgElemLenKey).asInteger();
+ if (header.hasTag(field_length_keys._average) &&
+ header.hasTag(field_length_keys._samples)) {
+ const auto &average_field_length_tag = header.getTag(field_length_keys._average);
+ const auto &field_length_samples_tag = header.getTag(field_length_keys._samples);
+ if (average_field_length_tag.getType() == Tag::Type::TYPE_FLOAT &&
+ field_length_samples_tag.getType() == Tag::Type::TYPE_INTEGER) {
+ _field_length_info = index::FieldLengthInfo(average_field_length_tag.asFloat(), field_length_samples_tag.asInteger());
+ }
+ }
}
void
-PosOccFieldParams::writeHeader(vespalib::GenericHeader &header,
+PosOccFieldParams::writeHeader(GenericHeader &header,
const vespalib::string &prefix) const
{
+ using Tag = GenericHeader::Tag;
vespalib::string nameKey(prefix + "fieldName");
vespalib::string collKey(prefix + "collectionType");
vespalib::string avgElemLenKey(prefix + "avgElemLen");
- header.putTag(GenericHeader::Tag(nameKey, _name));
+ FieldLengthKeys field_length_keys(prefix);
+ header.putTag(Tag(nameKey, _name));
Schema::CollectionType ct(schema::CollectionType::SINGLE);
switch (_collectionType) {
case SINGLE:
@@ -183,8 +218,10 @@ PosOccFieldParams::writeHeader(vespalib::GenericHeader &header,
default:
LOG_ABORT("Bad collection type when writing field param in header");
}
- header.putTag(GenericHeader::Tag(collKey, schema::getTypeName(ct)));
- header.putTag(GenericHeader::Tag(avgElemLenKey, _avgElemLen));
+ header.putTag(Tag(collKey, schema::getTypeName(ct)));
+ header.putTag(Tag(avgElemLenKey, _avgElemLen));
+ header.putTag(Tag(field_length_keys._average, _field_length_info.get_average_field_length()));
+ header.putTag(Tag(field_length_keys._samples, static_cast<int64_t>(_field_length_info.get_num_samples())));
}
}
diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.h b/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.h
index c781cec4db5..f053f558433 100644
--- a/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.h
+++ b/searchlib/src/vespa/searchlib/bitcompression/posocc_field_params.h
@@ -4,6 +4,7 @@
#include <cstdint>
#include <vespa/vespalib/stllike/string.h>
+#include <vespa/searchlib/index/field_length_info.h>
namespace search::index {
class PostingListParams;
@@ -33,6 +34,7 @@ public:
uint32_t _avgElemLen;
CollectionType _collectionType;
vespalib::string _name;
+ index::FieldLengthInfo _field_length_info;
PosOccFieldParams();
@@ -43,6 +45,8 @@ public:
void setSchemaParams(const Schema &schema, uint32_t fieldId);
void readHeader(const vespalib::GenericHeader &header, const vespalib::string &prefix);
void writeHeader(vespalib::GenericHeader &header, const vespalib::string &prefix) const;
+ const index::FieldLengthInfo &get_field_length_info() const { return _field_length_info; }
+ void set_field_length_info(const index::FieldLengthInfo &field_length_info) { _field_length_info = field_length_info; }
};
}
diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.cpp b/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.cpp
index 2e2674f98c6..f6a0ac0f783 100644
--- a/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.cpp
+++ b/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.cpp
@@ -129,4 +129,11 @@ PosOccFieldsParams::writeHeader(vespalib::GenericHeader &header,
}
}
+void
+PosOccFieldsParams::set_field_length_info(const index::FieldLengthInfo &field_length_info)
+{
+ assert(!_params.empty());
+ _params.front().set_field_length_info(field_length_info);
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.h b/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.h
index f6ae886b3f0..2bc32bdd75b 100644
--- a/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.h
+++ b/searchlib/src/vespa/searchlib/bitcompression/posocc_fields_params.h
@@ -44,6 +44,7 @@ public:
void setSchemaParams(const Schema &schema, const uint32_t indexId);
void readHeader(const vespalib::GenericHeader &header, const vespalib::string &prefix);
void writeHeader(vespalib::GenericHeader &header, const vespalib::string &prefix) const;
+ void set_field_length_info(const index::FieldLengthInfo &field_length_info);
};
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp
index b6d843e4e3c..a964ae1ce6a 100644
--- a/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.cpp
@@ -464,4 +464,15 @@ DiskIndex::createBlueprint(const IRequestContext & requestContext, const FieldSp
}
}
+FieldLengthInfo
+DiskIndex::get_field_length_info(const vespalib::string& field_name) const
+{
+ uint32_t fieldId = _schema.getIndexFieldId(field_name);
+ if (fieldId != Schema::UNKNOWN_FIELD_ID) {
+ return _postingFiles[fieldId]->get_field_length_info();
+ } else {
+ return FieldLengthInfo();
+ }
+}
+
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/diskindex.h b/searchlib/src/vespa/searchlib/diskindex/diskindex.h
index d83b2f56d7c..91fd33a2c4a 100644
--- a/searchlib/src/vespa/searchlib/diskindex/diskindex.h
+++ b/searchlib/src/vespa/searchlib/diskindex/diskindex.h
@@ -5,6 +5,7 @@
#include "bitvectordictionary.h"
#include "zcposoccrandread.h"
#include <vespa/searchlib/index/dictionaryfile.h>
+#include <vespa/searchlib/index/field_length_info.h>
#include <vespa/searchlib/queryeval/searchable.h>
#include <vespa/vespalib/stllike/string.h>
#include <vespa/vespalib/stllike/cache.h>
@@ -147,6 +148,8 @@ public:
* Needed for the Cache::BackingStore interface.
*/
bool read(const Key & key, LookupResultVector & result);
+
+ index::FieldLengthInfo get_field_length_info(const vespalib::string& field_name) const;
};
void swap(DiskIndex::LookupResult & a, DiskIndex::LookupResult & b);
diff --git a/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp
index d03f7ca4149..1ee7b9ae9ae 100644
--- a/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/extposocc.cpp
@@ -57,14 +57,15 @@ makePosOccWrite(PostingListCountFileSeqWrite *const posOccCountWrite,
const PostingListParams &params,
const PostingListParams &featureParams,
const Schema &schema,
- uint32_t indexId)
+ uint32_t indexId,
+ const index::FieldLengthInfo &field_length_info)
{
std::unique_ptr<PostingListFileSeqWrite> posOccWrite;
if (dynamicK) {
- posOccWrite = std::make_unique<ZcPosOccSeqWrite>(schema, indexId, posOccCountWrite);
+ posOccWrite = std::make_unique<ZcPosOccSeqWrite>(schema, indexId, field_length_info, posOccCountWrite);
} else {
- posOccWrite = std::make_unique<Zc4PosOccSeqWrite>(schema, indexId, posOccCountWrite);
+ posOccWrite = std::make_unique<Zc4PosOccSeqWrite>(schema, indexId, field_length_info, posOccCountWrite);
}
posOccWrite->setFeatureParams(featureParams);
diff --git a/searchlib/src/vespa/searchlib/diskindex/extposocc.h b/searchlib/src/vespa/searchlib/diskindex/extposocc.h
index 285715849db..49852b18ad3 100644
--- a/searchlib/src/vespa/searchlib/diskindex/extposocc.h
+++ b/searchlib/src/vespa/searchlib/diskindex/extposocc.h
@@ -10,6 +10,7 @@ namespace search {
}
namespace search::index {
+ class FieldLengthInfo;
class PostingListParams;
class PostingListCountFileSeqWrite;
class PostingListCountFileSeqRead;
@@ -33,7 +34,8 @@ makePosOccWrite(index::PostingListCountFileSeqWrite *const posOccCountWrite,
const index::PostingListParams &params,
const index::PostingListParams &featureParams,
const index::Schema &schema,
- uint32_t indexId);
+ uint32_t indexId,
+ const index::FieldLengthInfo &field_length_info);
std::unique_ptr<index::PostingListFileSeqRead>
makePosOccRead(const vespalib::string &name,
diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
index c4cd6d3a22e..d3696e2f31c 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
@@ -18,6 +18,7 @@ vespalib::string PosOccIdCooked = "PosOcc.3.Cooked";
}
using vespalib::getLastErrorString;
+using search::index::FieldLengthInfo;
using search::index::Schema;
using search::index::SchemaUtil;
using search::bitcompression::PosOccFieldParams;
@@ -179,6 +180,11 @@ FieldReader::getFeatureParams(PostingListParams &params)
_oldposoccfile->getFeatureParams(params);
}
+const FieldLengthInfo &
+FieldReader::get_field_length_info() const
+{
+ return _oldposoccfile->get_field_length_info();
+}
std::unique_ptr<FieldReader>
FieldReader::allocFieldReader(const SchemaUtil::IndexIterator &index,
diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h
index 243da21731b..ee237f5cc69 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h
+++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h
@@ -93,6 +93,7 @@ public:
virtual void setFeatureParams(const PostingListParams &params);
virtual void getFeatureParams(PostingListParams &params);
uint32_t getDocIdLimit() const { return _docIdLimit; }
+ const index::FieldLengthInfo &get_field_length_info() const;
static std::unique_ptr<FieldReader> allocFieldReader(const IndexIterator &index, const Schema &oldSchema);
};
diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp
index 7fb575da7f3..ae308db1a4f 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.cpp
@@ -9,6 +9,8 @@
LOG_SETUP(".diskindex.fieldwriter");
+using search::index::FieldLengthInfo;
+
namespace search::diskindex {
using vespalib::getLastErrorString;
@@ -39,6 +41,7 @@ FieldWriter::open(const vespalib::string &prefix,
bool encode_cheap_features,
const Schema &schema,
const uint32_t indexId,
+ const FieldLengthInfo &field_length_info,
const TuneFileSeqWrite &tuneFileWrite,
const FileHeaderContext &fileHeaderContext)
{
@@ -66,7 +69,7 @@ FieldWriter::open(const vespalib::string &prefix,
_dictFile = std::make_unique<PageDict4FileSeqWrite>();
_dictFile->setParams(countParams);
- _posoccfile = makePosOccWrite(_dictFile.get(), dynamicKPosOccFormat, params, featureParams, schema, indexId);
+ _posoccfile = makePosOccWrite(_dictFile.get(), dynamicKPosOccFormat, params, featureParams, schema, indexId, field_length_info);
vespalib::string cname = _prefix + "dictionary";
// Open output dictionary file
diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h
index c71bc4f4132..69c763bbd77 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h
+++ b/searchlib/src/vespa/searchlib/diskindex/fieldwriter.h
@@ -71,6 +71,7 @@ public:
bool dynamicKPosOccFormat,
bool encode_cheap_features,
const Schema &schema, uint32_t indexId,
+ const index::FieldLengthInfo &field_length_info,
const TuneFileSeqWrite &tuneFileWrite,
const search::common::FileHeaderContext &fileHeaderContext);
diff --git a/searchlib/src/vespa/searchlib/diskindex/fusion.cpp b/searchlib/src/vespa/searchlib/diskindex/fusion.cpp
index 1000dcb605e..1ace5969b6b 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fusion.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/fusion.cpp
@@ -4,6 +4,7 @@
#include "fieldreader.h"
#include "dictionarywordreader.h"
#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/searchlib/index/field_length_info.h>
#include <vespa/searchlib/util/filekit.h>
#include <vespa/searchlib/util/dirtraverse.h>
#include <vespa/vespalib/io/fileutil.h>
@@ -26,6 +27,7 @@ using search::common::FileHeaderContext;
using search::diskindex::DocIdMapping;
using search::diskindex::WordNumMapping;
using search::docsummary::DocumentSummary;
+using search::index::FieldLengthInfo;
using search::index::PostingListParams;
using search::index::Schema;
using search::index::SchemaUtil;
@@ -324,13 +326,15 @@ Fusion::openInputFieldReaders(const SchemaUtil::IndexIterator &index, const Word
bool
-Fusion::openFieldWriter(const SchemaUtil::IndexIterator &index, FieldWriter &writer)
+Fusion::openFieldWriter(const SchemaUtil::IndexIterator &index, FieldWriter &writer, const FieldLengthInfo &field_length_info)
{
vespalib::string dir = _outDir + "/" + index.getName();
if (!writer.open(dir + "/", 64, 262144, _dynamicKPosIndexFormat,
index.use_experimental_posting_list_format(), index.getSchema(),
- index.getIndex(), _tuneFileIndexing._write, _fileHeaderContext)) {
+ index.getIndex(),
+ field_length_info,
+ _tuneFileIndexing._write, _fileHeaderContext)) {
throw IllegalArgumentException(make_string("Could not open output posocc + dictionary in %s", dir.c_str()));
}
return true;
@@ -368,7 +372,11 @@ Fusion::mergeFieldPostings(const SchemaUtil::IndexIterator &index, const WordNum
if (!openInputFieldReaders(index, list, readers)) {
return false;
}
- if (!openFieldWriter(index, fieldWriter)) {
+ FieldLengthInfo field_length_info;
+ if (!readers.empty()) {
+ field_length_info = readers.back()->get_field_length_info();
+ }
+ if (!openFieldWriter(index, fieldWriter, field_length_info)) {
return false;
}
if (!setupMergeHeap(readers, fieldWriter, heap)) {
diff --git a/searchlib/src/vespa/searchlib/diskindex/fusion.h b/searchlib/src/vespa/searchlib/diskindex/fusion.h
index 53f9db75758..28060a9c4be 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fusion.h
+++ b/searchlib/src/vespa/searchlib/diskindex/fusion.h
@@ -11,6 +11,7 @@
namespace search { template <class IN> class PostingPriorityQueue; }
namespace search { class TuneFileIndexing; }
namespace search::common { class FileHeaderContext; }
+namespace search::index { class FieldLengthInfo; }
namespace search::diskindex {
@@ -50,7 +51,7 @@ private:
bool mergeField(uint32_t id);
bool openInputFieldReaders(const SchemaUtil::IndexIterator &index, const WordNumMappingList & list,
std::vector<std::unique_ptr<FieldReader> > & readers);
- bool openFieldWriter(const SchemaUtil::IndexIterator &index, FieldWriter & writer);
+ bool openFieldWriter(const SchemaUtil::IndexIterator &index, FieldWriter & writer, const index::FieldLengthInfo &field_length_info);
bool setupMergeHeap(const std::vector<std::unique_ptr<FieldReader> > & readers,
FieldWriter &writer, PostingPriorityQueue<FieldReader> &heap);
bool mergeFieldPostings(const SchemaUtil::IndexIterator &index, const WordNumMappingList & list, uint64_t numWordIds);
diff --git a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp
index f9620c35908..c2e311f18a6 100644
--- a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.cpp
@@ -2,6 +2,8 @@
#include "indexbuilder.h"
#include <vespa/searchlib/index/docidandfeatures.h>
+#include <vespa/searchlib/index/field_length_info.h>
+#include <vespa/searchlib/index/i_field_length_inspector.h>
#include <vespa/searchlib/index/schemautil.h>
#include <vespa/searchlib/common/documentsummary.h>
#include <vespa/vespalib/io/fileutil.h>
@@ -19,6 +21,8 @@ namespace {
using common::FileHeaderContext;
using index::DocIdAndFeatures;
+using index::FieldLengthInfo;
+using index::IFieldLengthInspector;
using index::PostingListCounts;
using index::Schema;
using index::SchemaUtil;
@@ -37,6 +41,7 @@ public:
void open(vespalib::stringref dir,
const SchemaUtil::IndexIterator &index,
uint32_t docIdLimit, uint64_t numWordIds,
+ const FieldLengthInfo &field_length_info,
const TuneFileSeqWrite &tuneFileWrite,
const FileHeaderContext &fileHeaderContext);
@@ -69,6 +74,7 @@ public:
const vespalib::string &getName();
vespalib::string getDir();
void open(uint32_t docIdLimit, uint64_t numWordIds,
+ const FieldLengthInfo &field_length_info,
const TuneFileSeqWrite &tuneFileWrite,
const FileHeaderContext &fileHeaderContext);
void close();
@@ -90,6 +96,7 @@ void
FileHandle::open(vespalib::stringref dir,
const SchemaUtil::IndexIterator &index,
uint32_t docIdLimit, uint64_t numWordIds,
+ const FieldLengthInfo &field_length_info,
const TuneFileSeqWrite &tuneFileWrite,
const FileHeaderContext &fileHeaderContext)
{
@@ -100,6 +107,7 @@ FileHandle::open(vespalib::stringref dir,
if (!_fieldWriter->open(dir + "/", 64, 262144u, false,
index.use_experimental_posting_list_format(),
index.getSchema(), index.getIndex(),
+ field_length_info,
tuneFileWrite, fileHeaderContext)) {
LOG(error, "Could not open term writer %s for write (%s)",
vespalib::string(dir).c_str(), getLastErrorString().c_str());
@@ -170,12 +178,15 @@ IndexBuilder::FieldHandle::getDir()
void
IndexBuilder::FieldHandle::open(uint32_t docIdLimit, uint64_t numWordIds,
+ const FieldLengthInfo &field_length_info,
const TuneFileSeqWrite &tuneFileWrite,
const FileHeaderContext &fileHeaderContext)
{
_file.open(getDir(),
SchemaUtil::IndexIterator(*_schema, getIndexId()),
- docIdLimit, numWordIds, tuneFileWrite, fileHeaderContext);
+ docIdLimit, numWordIds,
+ field_length_info,
+ tuneFileWrite, fileHeaderContext);
}
void
@@ -278,6 +289,7 @@ IndexBuilder::appendToPrefix(vespalib::stringref name)
void
IndexBuilder::open(uint32_t docIdLimit, uint64_t numWordIds,
+ const IFieldLengthInspector &field_length_inspector,
const TuneFileIndexing &tuneFileIndexing,
const FileHeaderContext &fileHeaderContext)
{
@@ -294,7 +306,9 @@ IndexBuilder::open(uint32_t docIdLimit, uint64_t numWordIds,
continue;
}
vespalib::mkdir(fh.getDir(), false);
- fh.open(docIdLimit, numWordIds, tuneFileIndexing._write,
+ fh.open(docIdLimit, numWordIds,
+ field_length_inspector.get_field_length_info(fh.getName()),
+ tuneFileIndexing._write,
fileHeaderContext);
indexes.push_back(fh.getIndexId());
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h
index a1a77d608cd..4423c7e91a0 100644
--- a/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h
+++ b/searchlib/src/vespa/searchlib/diskindex/indexbuilder.h
@@ -8,6 +8,7 @@
#include <vector>
namespace search::common { class FileHeaderContext; }
+namespace search::index { class IFieldLengthInspector; }
namespace search::diskindex {
@@ -64,6 +65,7 @@ public:
vespalib::string appendToPrefix(vespalib::stringref name);
void open(uint32_t docIdLimit, uint64_t numWordIds,
+ const index::IFieldLengthInspector &field_length_inspector,
const TuneFileIndexing &tuneFileIndexing,
const common::FileHeaderContext &fileHandleContext);
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp
index 3ae2a631cb1..e3bb400f4d3 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.cpp
@@ -12,6 +12,7 @@ namespace search::diskindex {
using search::bitcompression::PosOccFieldsParams;
using search::bitcompression::EG2PosOccDecodeContext;
using search::bitcompression::EGPosOccDecodeContext;
+using search::index::FieldLengthInfo;
using search::index::PostingListCountFileSeqRead;
using search::index::PostingListCountFileSeqWrite;
@@ -52,9 +53,15 @@ Zc4PosOccSeqRead::getSubIdentifier()
return d.getIdentifier();
}
+const FieldLengthInfo &
+Zc4PosOccSeqRead::get_field_length_info() const
+{
+ return _fieldsParams.getFieldParams()->get_field_length_info();
+}
Zc4PosOccSeqWrite::Zc4PosOccSeqWrite(const Schema &schema,
uint32_t indexId,
+ const FieldLengthInfo &field_length_info,
PostingListCountFileSeqWrite *countFile)
: Zc4PostingSeqWrite(countFile),
_fieldsParams(),
@@ -62,6 +69,7 @@ Zc4PosOccSeqWrite::Zc4PosOccSeqWrite(const Schema &schema,
{
_writer.set_encode_features(&_realEncodeFeatures);
_fieldsParams.setSchemaParams(schema, indexId);
+ _fieldsParams.set_field_length_info(field_length_info);
}
@@ -102,9 +110,15 @@ ZcPosOccSeqRead::getSubIdentifier()
return d.getIdentifier();
}
+const FieldLengthInfo &
+ZcPosOccSeqRead::get_field_length_info() const
+{
+ return _fieldsParams.getFieldParams()->get_field_length_info();
+}
ZcPosOccSeqWrite::ZcPosOccSeqWrite(const Schema &schema,
uint32_t indexId,
+ const FieldLengthInfo &field_length_info,
PostingListCountFileSeqWrite *countFile)
: ZcPostingSeqWrite(countFile),
_fieldsParams(),
@@ -112,6 +126,7 @@ ZcPosOccSeqWrite::ZcPosOccSeqWrite(const Schema &schema,
{
_writer.set_encode_features(&_realEncodeFeatures);
_fieldsParams.setSchemaParams(schema, indexId);
+ _fieldsParams.set_field_length_info(field_length_info);
}
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposocc.h b/searchlib/src/vespa/searchlib/diskindex/zcposocc.h
index 27700399bfb..0615cf636de 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposocc.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposocc.h
@@ -19,6 +19,7 @@ public:
Zc4PosOccSeqRead(index::PostingListCountFileSeqRead *countFile);
void setFeatureParams(const PostingListParams &params) override;
static const vespalib::string &getSubIdentifier();
+ const index::FieldLengthInfo &get_field_length_info() const override;
};
@@ -31,7 +32,9 @@ private:
public:
typedef index::Schema Schema;
- Zc4PosOccSeqWrite(const Schema &schema, uint32_t indexId, index::PostingListCountFileSeqWrite *countFile);
+ Zc4PosOccSeqWrite(const Schema &schema, uint32_t indexId,
+ const index::FieldLengthInfo &field_length_info,
+ index::PostingListCountFileSeqWrite *countFile);
};
@@ -45,6 +48,7 @@ public:
ZcPosOccSeqRead(index::PostingListCountFileSeqRead *countFile);
void setFeatureParams(const PostingListParams &params) override;
static const vespalib::string &getSubIdentifier();
+ const index::FieldLengthInfo &get_field_length_info() const override;
};
@@ -55,7 +59,9 @@ private:
bitcompression::EGPosOccEncodeContext<true> _realEncodeFeatures;
public:
typedef index::Schema Schema;
- ZcPosOccSeqWrite(const Schema &schema, uint32_t indexId, index::PostingListCountFileSeqWrite *countFile);
+ ZcPosOccSeqWrite(const Schema &schema, uint32_t indexId,
+ const index::FieldLengthInfo &field_length_info,
+ index::PostingListCountFileSeqWrite *countFile);
};
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp
index edbd78b9b01..aa4f15bc225 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.cpp
@@ -17,6 +17,7 @@ using search::bitcompression::EGPosOccDecodeContext;
using search::bitcompression::EGPosOccDecodeContextCooked;
using search::bitcompression::PosOccFieldsParams;
using search::bitcompression::FeatureDecodeContext;
+using search::index::FieldLengthInfo;
using search::index::PostingListCounts;
using search::index::PostingListHandle;
using search::ComprFileReadContext;
@@ -246,6 +247,11 @@ ZcPosOccRandRead::getSubIdentifier()
return d.getIdentifier();
}
+const FieldLengthInfo &
+ZcPosOccRandRead::get_field_length_info() const
+{
+ return _fieldsParams.getFieldParams()->get_field_length_info();
+}
Zc4PosOccRandRead::
Zc4PosOccRandRead()
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h
index 26b23161cb1..7ae59611e35 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposoccrandread.h
@@ -50,6 +50,7 @@ public:
virtual void readHeader();
static const vespalib::string &getIdentifier();
static const vespalib::string &getSubIdentifier();
+ const index::FieldLengthInfo &get_field_length_info() const override;
};
class Zc4PosOccRandRead : public ZcPosOccRandRead
diff --git a/searchlib/src/vespa/searchlib/diskindex/zcposting.h b/searchlib/src/vespa/searchlib/diskindex/zcposting.h
index 01049e720a9..6ca87f01aea 100644
--- a/searchlib/src/vespa/searchlib/diskindex/zcposting.h
+++ b/searchlib/src/vespa/searchlib/diskindex/zcposting.h
@@ -72,7 +72,8 @@ public:
void writeDocIdAndFeatures(const DocIdAndFeatures &features) override;
void flushWord() override;
- bool open(const vespalib::string &name, const TuneFileSeqWrite &tuneFileWrite,
+ bool open(const vespalib::string &name,
+ const TuneFileSeqWrite &tuneFileWrite,
const search::common::FileHeaderContext &fileHeaderContext) override;
bool close() override;
diff --git a/searchlib/src/vespa/searchlib/index/postinglistfile.h b/searchlib/src/vespa/searchlib/index/postinglistfile.h
index 1e7dde7f139..1569bdd13b6 100644
--- a/searchlib/src/vespa/searchlib/index/postinglistfile.h
+++ b/searchlib/src/vespa/searchlib/index/postinglistfile.h
@@ -13,6 +13,7 @@ namespace search::common { class FileHeaderContext; }
namespace search::index {
class DocIdAndFeatures;
+class FieldLengthInfo;
/**
* Interface for posting list files containing document ids and features
@@ -60,6 +61,8 @@ public:
* Get current (word, docid) feature parameters.
*/
virtual void getFeatureParams(PostingListParams &params);
+
+ virtual const FieldLengthInfo &get_field_length_info() const = 0;
};
/**
@@ -169,6 +172,8 @@ public:
*/
virtual bool close() = 0;
+ virtual const FieldLengthInfo &get_field_length_info() const = 0;
+
bool getMemoryMapped() const { return _memoryMapped; }
protected:
diff --git a/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp b/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp
index 1e25878a33e..5de51ca3933 100644
--- a/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp
+++ b/searchlib/src/vespa/searchlib/test/diskindex/testdiskindex.cpp
@@ -3,19 +3,39 @@
#include "testdiskindex.h"
#include <vespa/searchlib/diskindex/indexbuilder.h>
#include <vespa/searchlib/index/dummyfileheadercontext.h>
+#include <vespa/searchlib/index/i_field_length_inspector.h>
#include <vespa/vespalib/io/fileutil.h>
namespace search::diskindex {
using index::DocIdAndFeatures;
using index::DummyFileHeaderContext;
+using index::FieldLengthInfo;
+using index::IFieldLengthInspector;
using index::Schema;
using index::WordDocElementWordPosFeatures;
using index::schema::DataType;
+namespace {
+
+class MockFieldLengthInspector : public IFieldLengthInspector {
+ FieldLengthInfo get_field_length_info(const vespalib::string& field_name) const override {
+ if (field_name == "f1") {
+ return FieldLengthInfo(3.5, 21);
+ } else if (field_name == "f2") {
+ return FieldLengthInfo(4.0, 23);
+ } else {
+ return FieldLengthInfo();
+ }
+ }
+};
+
+}
+
struct Builder
{
search::diskindex::IndexBuilder _ib;
+ MockFieldLengthInspector _mock_field_length_inspector;
TuneFileIndexing _tuneFileIndexing;
DummyFileHeaderContext _fileHeaderContext;
DocIdAndFeatures _features;
@@ -35,7 +55,7 @@ struct Builder
_tuneFileIndexing._write.setWantDirectIO();
}
_ib.setPrefix(dir);
- _ib.open(docIdLimit, numWordIds, _tuneFileIndexing,
+ _ib.open(docIdLimit, numWordIds, _mock_field_length_inspector, _tuneFileIndexing,
_fileHeaderContext);
}