aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@broadpark.no> 2019-06-20 11:03:26 +0200
committer Tor Egge <Tor.Egge@broadpark.no> 2019-06-20 12:03:59 +0200
commit aade9eb66ed483a05405b102a10d52a5a4c58f07 (patch)
tree 383278ee759eeba68a8e9889c303827878e3b4ef /searchlib
parent 38de0304985d85dc9da58e15ad494054bff5d5dc (diff)
Add field length scanner, to get accurate field lengths for
multivalue fields when regenerating interleaved features.
Diffstat (limited to 'searchlib')
-rw-r--r-- searchlib/CMakeLists.txt 1
-rw-r--r-- searchlib/src/tests/diskindex/field_length_scanner/CMakeLists.txt 11
-rw-r--r-- searchlib/src/tests/diskindex/field_length_scanner/field_length_scanner_test.cpp 73
-rw-r--r-- searchlib/src/tests/diskindex/fusion/fusion_test.cpp 2
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt 1
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/field_length_scanner.cpp 51
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/field_length_scanner.h 63
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp 36
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/fieldreader.h 8
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/fusion.cpp 28
-rw-r--r-- searchlib/src/vespa/searchlib/diskindex/fusion.h 2
11 files changed, 267 insertions, 9 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt
index f032bbe9c30..07ac6bb699c 100644
--- a/searchlib/CMakeLists.txt
+++ b/searchlib/CMakeLists.txt
@@ -106,6 +106,7 @@ vespa_define_module(
src/tests/diskindex/bitvector
src/tests/diskindex/diskindex
src/tests/diskindex/fieldwriter
+ src/tests/diskindex/field_length_scanner
src/tests/diskindex/fusion
src/tests/diskindex/pagedict4
src/tests/docstore/chunk
diff --git a/searchlib/src/tests/diskindex/field_length_scanner/CMakeLists.txt b/searchlib/src/tests/diskindex/field_length_scanner/CMakeLists.txt
new file mode 100644
index 00000000000..985aaa38401
--- /dev/null
+++ b/searchlib/src/tests/diskindex/field_length_scanner/CMakeLists.txt
@@ -0,0 +1,11 @@
+# Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+find_package(GTest REQUIRED) # the test executable below links against GTest::GTest
+vespa_add_executable(searchlib_field_length_scanner_test_app TEST
+ SOURCES
+ field_length_scanner_test.cpp
+ DEPENDS
+ searchlib
+ searchlib_test
+ GTest::GTest
+)
+vespa_add_test(NAME searchlib_field_length_scanner_test_app COMMAND searchlib_field_length_scanner_test_app) # registers the executable as a test
diff --git a/searchlib/src/tests/diskindex/field_length_scanner/field_length_scanner_test.cpp b/searchlib/src/tests/diskindex/field_length_scanner/field_length_scanner_test.cpp
new file mode 100644
index 00000000000..1b8a4c9655d
--- /dev/null
+++ b/searchlib/src/tests/diskindex/field_length_scanner/field_length_scanner_test.cpp
@@ -0,0 +1,73 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include <vespa/searchlib/diskindex/field_length_scanner.h>
+#include <vespa/searchlib/index/docidandfeatures.h>
+#include <vespa/vespalib/gtest/gtest.h>
+
+using search::index::DocIdAndFeatures;
+
+
+namespace search::diskindex {
+
+
+class FieldLengthScannerTest : public ::testing::Test // fixture that owns the scanner under test
+{
+protected:
+ FieldLengthScanner _scanner;
+ FieldLengthScannerTest()
+ : _scanner(3) // doc id limit 3, enough for the doc ids 1 and 2 used by the tests
+ {
+ }
+};
+
+TEST_F(FieldLengthScannerTest, require_that_no_scan_gives_empty_length)
+{
+ EXPECT_EQ(0, _scanner.get_field_length(1)); // nothing has been scanned, so the accumulated length is still 0
+}
+
+TEST_F(FieldLengthScannerTest, require_that_single_length_is_registered)
+{
+ DocIdAndFeatures features;
+ features.set_doc_id(1);
+ features.elements().emplace_back(0, 1, 5); // element id 0 with element length 5 (middle argument is presumably the element weight - confirm)
+ _scanner.scan_features(features);
+ EXPECT_EQ(5u, _scanner.get_field_length(1));
+}
+
+TEST_F(FieldLengthScannerTest, require_that_duplicate_element_is_ignored)
+{
+ DocIdAndFeatures features;
+ features.set_doc_id(1);
+ features.elements().emplace_back(10, 1, 5); // element id below 16: tracked in the per-entry bit mask
+ features.elements().emplace_back(100, 1, 23); // element id 16 or above: tracked in the per-document bit vector
+ _scanner.scan_features(features);
+ EXPECT_EQ(28u, _scanner.get_field_length(1));
+ _scanner.scan_features(features); // elements 10 and 100 already scanned
+ EXPECT_EQ(28u, _scanner.get_field_length(1));
+ features.elements()[0].setElementId(11);
+ _scanner.scan_features(features); // element 11 is new (+5), element 100 already scanned
+ EXPECT_EQ(33u, _scanner.get_field_length(1));
+ features.elements()[1].setElementId(101);
+ _scanner.scan_features(features); // element 11 already scanned, element 101 is new (+23)
+ EXPECT_EQ(56u, _scanner.get_field_length(1));
+}
+
+TEST_F(FieldLengthScannerTest, require_that_documents_are_not_mixed)
+{
+ DocIdAndFeatures features1;
+ DocIdAndFeatures features2;
+ features1.set_doc_id(1);
+ features1.elements().emplace_back(10, 1, 5);
+ features1.elements().emplace_back(100, 1, 23);
+ features2.set_doc_id(2); // same element ids as document 1, but different element lengths
+ features2.elements().emplace_back(10, 1, 7);
+ features2.elements().emplace_back(100, 1, 9);
+ _scanner.scan_features(features1);
+ _scanner.scan_features(features2);
+ EXPECT_EQ(28u, _scanner.get_field_length(1)); // 5 + 23
+ EXPECT_EQ(16u, _scanner.get_field_length(2)); // 7 + 9
+}
+
+}
+
+GTEST_MAIN_RUN_ALL_TESTS()
diff --git a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp
index da29918a5fb..93a3fde22dd 100644
--- a/searchlib/src/tests/diskindex/fusion/fusion_test.cpp
+++ b/searchlib/src/tests/diskindex/fusion/fusion_test.cpp
@@ -545,7 +545,7 @@ TEST_F(FusionTest, require_that_interleaved_features_can_be_reconstructed)
assert_interleaved_features(disk_index, "f0", "a", 10, 1, 7);
assert_interleaved_features(disk_index, "f1", "w", 10, 1, 4);
assert_interleaved_features(disk_index, "f2", "ax", 10, 2, 4);
- assert_interleaved_features(disk_index, "f2", "ay", 10, 1, 3);
+ assert_interleaved_features(disk_index, "f2", "ay", 10, 1, 4);
assert_interleaved_features(disk_index, "f3", "wx", 10, 1, 2);
clean_field_length_testdirs();
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
index ba608467c8a..15678382741 100644
--- a/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
+++ b/searchlib/src/vespa/searchlib/diskindex/CMakeLists.txt
@@ -12,6 +12,7 @@ vespa_add_library(searchlib_diskindex OBJECT
extposocc.cpp
fieldreader.cpp
fieldwriter.cpp
+ field_length_scanner.cpp
fileheader.cpp
fusion.cpp
indexbuilder.cpp
diff --git a/searchlib/src/vespa/searchlib/diskindex/field_length_scanner.cpp b/searchlib/src/vespa/searchlib/diskindex/field_length_scanner.cpp
new file mode 100644
index 00000000000..b7aad2d1996
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/field_length_scanner.cpp
@@ -0,0 +1,51 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "field_length_scanner.h"
+#include <vespa/searchlib/index/docidandfeatures.h>
+
+namespace search::diskindex {
+
+FieldLengthScanner::FieldLengthScanner(uint32_t doc_id_limit)
+ : _field_length_vector(doc_id_limit), // one default-constructed (zero length) entry per doc id below doc_id_limit
+ _scanned_elements_map() // starts empty; scan_features() adds entries on demand
+{
+}
+
+FieldLengthScanner::~FieldLengthScanner() = default;
+
+void
+FieldLengthScanner::scan_features(const index::DocIdAndFeatures &features) // adds the lengths of not yet seen elements to the document's field length
+{
+ if (features.elements().empty()) {
+ return;
+ }
+ auto &entry = _field_length_vector[features.doc_id()];
+ if (features.elements().back().getElementId() < element_id_bias) { // NOTE(review): relies on elements being ordered by ascending element id, so back() holds the largest id - confirm
+ for (const auto &element : features.elements()) {
+ entry.add_element_length(element.getElementLen(), element.getElementId());
+ }
+ } else {
+ auto element = features.elements().cbegin();
+ while (element->getElementId() < element_id_bias) { // stops at the latest on back(), whose id is >= element_id_bias
+ entry.add_element_length(element->getElementLen(), element->getElementId());
+ ++element;
+ }
+ auto &scanned_elements = _scanned_elements_map[features.doc_id()]; // operator[] creates the bit vector on first use for this document
+ auto size_needed = features.elements().back().getElementId() + 1 - element_id_bias; // number of bits needed to index back()'s element id
+ if (size_needed > scanned_elements.size()) {
+ if (size_needed > scanned_elements.capacity()) {
+ scanned_elements.reserve(std::max(size_needed + (size_needed / 4), 32u)); // 25% headroom, at least 32 bits
+ }
+ scanned_elements.resize(size_needed);
+ }
+ while (element != features.elements().cend()) {
+ if (!scanned_elements[element->getElementId() - element_id_bias]) { // count each element id only once per document
+ scanned_elements[element->getElementId() - element_id_bias] = true;
+ entry.add_element_length(element->getElementLen());
+ }
+ ++element;
+ }
+ }
+}
+
+}
diff --git a/searchlib/src/vespa/searchlib/diskindex/field_length_scanner.h b/searchlib/src/vespa/searchlib/diskindex/field_length_scanner.h
new file mode 100644
index 00000000000..e282a85b64f
--- /dev/null
+++ b/searchlib/src/vespa/searchlib/diskindex/field_length_scanner.h
@@ -0,0 +1,63 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vector>
+#include <unordered_map>
+#include <limits>
+
+namespace search::index { class DocIdAndFeatures; }
+
+namespace search::diskindex {
+
+/*
+ * Class used to reconstruct per-document field lengths by summing the
+ * element lengths found in a posting list file, counting each element once.
+ */
+class FieldLengthScanner {
+ class FieldLengthEntry {
+ uint16_t _field_length; // sum of counted element lengths, capped at the uint16_t maximum
+ uint16_t _elements; // bit mask of already counted element ids (first 16 elements)
+
+ static uint16_t make_element_mask(uint32_t element_id) { return (1u << element_id); } // scan_features() only passes element_id < element_id_bias
+
+ public:
+ FieldLengthEntry()
+ : _field_length(0),
+ _elements(0)
+ {
+ }
+
+ void add_element_length(uint32_t element_length) {
+ // Cap field length at the uint16_t maximum instead of wrapping around
+ if (element_length < std::numeric_limits<uint16_t>::max()) {
+ uint32_t field_length32 = _field_length + element_length;
+ _field_length = std::min(field_length32, static_cast<uint32_t>(std::numeric_limits<uint16_t>::max()));
+ } else {
+ _field_length = std::numeric_limits<uint16_t>::max();
+ }
+ }
+
+ void add_element_length(uint32_t element_length, uint32_t element_id) { // adds the length only the first time element_id is seen
+ uint16_t element_mask = make_element_mask(element_id);
+ if (!(_elements & element_mask)) {
+ _elements |= element_mask;
+ add_element_length(element_length);
+ }
+ }
+
+ uint16_t get_field_length() const { return _field_length; }
+ };
+ std::vector<FieldLengthEntry> _field_length_vector; // indexed by doc id
+ static constexpr uint32_t element_id_bias = 16; // element ids below this are tracked in FieldLengthEntry::_elements
+ // per-document bit vectors for element ids >= element_id_bias, keyed by doc id
+ std::unordered_map<uint32_t, std::vector<bool>> _scanned_elements_map;
+
+public:
+ FieldLengthScanner(uint32_t doc_id_limit);
+ ~FieldLengthScanner();
+ void scan_features(const index::DocIdAndFeatures &features);
+ uint16_t get_field_length(uint32_t doc_id) const { return _field_length_vector[doc_id].get_field_length(); } // unchecked index: doc_id must be below doc_id_limit
+};
+
+}
diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
index 68d37c43cb2..a6208cee970 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.cpp
@@ -4,6 +4,7 @@
#include "zcposocc.h"
#include "extposocc.h"
#include "pagedict4file.h"
+#include "field_length_scanner.h"
#include <vespa/vespalib/util/error.h>
#include <vespa/log/log.h>
@@ -189,7 +190,8 @@ FieldReader::get_field_length_info() const
std::unique_ptr<FieldReader>
FieldReader::allocFieldReader(const SchemaUtil::IndexIterator &index,
- const Schema &oldSchema)
+ const Schema &oldSchema,
+ std::shared_ptr<FieldLengthScanner> field_length_scanner)
{
assert(index.isValid());
if (index.hasMatchingOldFields(oldSchema)) {
@@ -203,7 +205,7 @@ FieldReader::allocFieldReader(const SchemaUtil::IndexIterator &index,
}
// field exists in old schema with different collection type setting
// or old field is missing wanted interleaved features.
- return std::make_unique<FieldReaderStripInfo>(index); // degraded
+ return std::make_unique<FieldReaderStripInfo>(index, field_length_scanner); // degraded
}
@@ -233,11 +235,12 @@ FieldReaderEmpty::getFeatureParams(PostingListParams &params)
}
-FieldReaderStripInfo::FieldReaderStripInfo(const IndexIterator &index)
+FieldReaderStripInfo::FieldReaderStripInfo(const IndexIterator &index, std::shared_ptr<FieldLengthScanner> field_length_scanner)
: _hasElements(false),
_hasElementWeights(false),
_want_interleaved_features(index.use_experimental_posting_list_format()),
- _regenerate_interleaved_features(false)
+ _regenerate_interleaved_features(false),
+ _field_length_scanner(std::move(field_length_scanner))
{
PosOccFieldsParams fieldsParams;
fieldsParams.setSchemaParams(index.getSchema(), index.getIndex());
@@ -272,10 +275,31 @@ FieldReaderStripInfo::open(const vespalib::string &prefix, const TuneFileSeqRead
_regenerate_interleaved_features = true;
}
}
+ if (_regenerate_interleaved_features && _hasElements && _field_length_scanner) {
+ scan_element_lengths();
+ close();
+ if (!FieldReader::open(prefix, tuneFileRead)) {
+ return false;
+ }
+ }
return true;
}
void
+FieldReaderStripInfo::scan_element_lengths() // feeds the features of every entry read from the opened field to the field length scanner
+{
+ for (;;) {
+ FieldReader::read();
+ if (_wordNum == noWordNumHigh()) { // presumably the end-of-input sentinel left by read() - confirm
+ break;
+ }
+ DocIdAndFeatures &features = _docIdAndFeatures;
+ assert(!features.has_raw_data()); // scan_features() reads features.elements(), so decoded features are expected here
+ _field_length_scanner->scan_features(features); // open() only calls this when _field_length_scanner is set
+ }
+}
+
+void
FieldReaderStripInfo::read()
{
typedef search::index::WordDocElementFeatures Element;
@@ -319,7 +343,9 @@ FieldReaderStripInfo::read()
field_length += element.getElementLen();
num_occs += element.getNumOccs();
}
- // Note: Length of elements without occurrences is not included.
+ if (_hasElements && _field_length_scanner) {
+ field_length = _field_length_scanner->get_field_length(features.doc_id());
+ }
features.set_field_length(field_length);
features.set_num_occs(num_occs);
}
diff --git a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h
index 106e10d2e80..899b3708bf9 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fieldreader.h
+++ b/searchlib/src/vespa/searchlib/diskindex/fieldreader.h
@@ -13,6 +13,8 @@
namespace search::diskindex {
+class FieldLengthScanner;
+
/*
* FieldReader is used to read a dictionary and posting list file
* together, and get a sequential view of the stored data.
@@ -95,7 +97,7 @@ public:
uint32_t getDocIdLimit() const { return _docIdLimit; }
const index::FieldLengthInfo &get_field_length_info() const;
- static std::unique_ptr<FieldReader> allocFieldReader(const IndexIterator &index, const Schema &oldSchema);
+ static std::unique_ptr<FieldReader> allocFieldReader(const IndexIterator &index, const Schema &oldSchema, std::shared_ptr<FieldLengthScanner> field_length_scanner);
};
@@ -126,11 +128,13 @@ private:
bool _hasElementWeights;
bool _want_interleaved_features;
bool _regenerate_interleaved_features;
+ std::shared_ptr<FieldLengthScanner> _field_length_scanner;
public:
- FieldReaderStripInfo(const IndexIterator &index);
+ FieldReaderStripInfo(const IndexIterator &index, std::shared_ptr<FieldLengthScanner>);
bool allowRawFeatures() override;
bool open(const vespalib::string &prefix, const TuneFileSeqRead &tuneFileRead) override;
void read() override;
+ void scan_element_lengths();
void getFeatureParams(PostingListParams &params) override;
};
diff --git a/searchlib/src/vespa/searchlib/diskindex/fusion.cpp b/searchlib/src/vespa/searchlib/diskindex/fusion.cpp
index 1ace5969b6b..2f302650d37 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fusion.cpp
+++ b/searchlib/src/vespa/searchlib/diskindex/fusion.cpp
@@ -3,7 +3,9 @@
#include "fusion.h"
#include "fieldreader.h"
#include "dictionarywordreader.h"
+#include "field_length_scanner.h"
#include <vespa/vespalib/util/stringfmt.h>
+#include <vespa/searchlib/bitcompression/posocc_fields_params.h>
#include <vespa/searchlib/index/field_length_info.h>
#include <vespa/searchlib/util/filekit.h>
#include <vespa/searchlib/util/dirtraverse.h>
@@ -28,6 +30,8 @@ using search::diskindex::DocIdMapping;
using search::diskindex::WordNumMapping;
using search::docsummary::DocumentSummary;
using search::index::FieldLengthInfo;
+using search::bitcompression::PosOccFieldParams;
+using search::bitcompression::PosOccFieldsParams;
using search::index::PostingListParams;
using search::index::Schema;
using search::index::SchemaUtil;
@@ -304,17 +308,39 @@ Fusion::selectCookedOrRawFeatures(Reader &reader, Writer &writer)
}
+std::shared_ptr<FieldLengthScanner>
+Fusion::allocate_field_length_scanner(const SchemaUtil::IndexIterator &index) // returns a scanner only when all checks below hold, otherwise an empty pointer
+{
+ if (index.use_experimental_posting_list_format()) {
+ PosOccFieldsParams fieldsParams;
+ fieldsParams.setSchemaParams(index.getSchema(), index.getIndex());
+ assert(fieldsParams.getNumFields() > 0);
+ const PosOccFieldParams &fieldParams = fieldsParams.getFieldParams()[0]; // only the first field's params are inspected
+ if (fieldParams._hasElements) {
+ for (const auto &old_index : _oldIndexes) {
+ const Schema &old_schema = old_index.getSchema();
+ if (index.hasOldFields(old_schema) &&
+ !index.has_matching_experimental_posting_list_format(old_schema)) { // an old index has the field but not in the matching posting list format
+ return std::make_shared<FieldLengthScanner>(_docIdLimit);
+ }
+ }
+ }
+ }
+ return std::shared_ptr<FieldLengthScanner>(); // no scanner needed
+}
+
bool
Fusion::openInputFieldReaders(const SchemaUtil::IndexIterator &index, const WordNumMappingList & list,
std::vector<std::unique_ptr<FieldReader> > & readers)
{
+ auto field_length_scanner = allocate_field_length_scanner(index);
vespalib::string indexName = index.getName();
for (const auto &oi : _oldIndexes) {
const Schema &oldSchema = oi.getSchema();
if (!index.hasOldFields(oldSchema)) {
continue; // drop data
}
- auto reader = FieldReader::allocFieldReader(index, oldSchema);
+ auto reader = FieldReader::allocFieldReader(index, oldSchema, field_length_scanner);
reader->setup(list[oi.getIndex()], oi.getDocIdMapping());
if (!reader->open(oi.getPath() + "/" + indexName + "/", _tuneFileIndexing._read)) {
return false;
diff --git a/searchlib/src/vespa/searchlib/diskindex/fusion.h b/searchlib/src/vespa/searchlib/diskindex/fusion.h
index 28060a9c4be..d532384f6e9 100644
--- a/searchlib/src/vespa/searchlib/diskindex/fusion.h
+++ b/searchlib/src/vespa/searchlib/diskindex/fusion.h
@@ -15,6 +15,7 @@ namespace search::index { class FieldLengthInfo; }
namespace search::diskindex {
+class FieldLengthScanner;
class FieldReader;
class FieldWriter;
class DictionaryWordReader;
@@ -49,6 +50,7 @@ private:
bool mergeFields(vespalib::ThreadExecutor & executor);
bool mergeField(uint32_t id);
+ std::shared_ptr<FieldLengthScanner> allocate_field_length_scanner(const SchemaUtil::IndexIterator &index);
bool openInputFieldReaders(const SchemaUtil::IndexIterator &index, const WordNumMappingList & list,
std::vector<std::unique_ptr<FieldReader> > & readers);
bool openFieldWriter(const SchemaUtil::IndexIterator &index, FieldWriter & writer, const index::FieldLengthInfo &field_length_info);