1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
|
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <vespa/searchlib/common/tunefileinfo.h>
#include <vespa/searchlib/diskindex/pagedict4file.h>
#include <vespa/searchlib/diskindex/pagedict4randread.h>
#include <vespa/searchlib/index/dummyfileheadercontext.h>
#include <vespa/vespalib/gtest/gtest.h>
#include <vespa/vespalib/stllike/asciistream.h>
#include <filesystem>
using search::diskindex::PageDict4FileSeqRead;
using search::diskindex::PageDict4FileSeqWrite;
using search::diskindex::PageDict4RandRead;
using search::index::DummyFileHeaderContext;
using search::index::PostingListCounts;
using search::index::PostingListOffsetAndCounts;
using search::index::PostingListParams;
namespace {
vespalib::string test_dir("long_words_dir");
vespalib::string dict(test_dir + "/dict");
PostingListCounts make_counts()
{
PostingListCounts counts;
counts._bitLength = 100;
counts._numDocs = 1;
counts._segments.clear();
return counts;
}
vespalib::string
make_word(int i)
{
vespalib::asciistream os;
vespalib::string word(5_Ki, 'a');
os << vespalib::setfill('0') << vespalib::setw(8) << i;
word.append(os.str());
return word;
}
}
/*
* A long word that don't fit into a 4 KiB 'page' causes a fallback to
* overflow handling where the word is put in the .ssdat file.
*/
TEST(PageDict4LongWordsTest, test_many_long_words)
{
int num_words = 4_Mi;
auto counts = make_counts();
std::filesystem::remove_all(std::filesystem::path(test_dir));
std::filesystem::create_directories(std::filesystem::path(test_dir));
auto dw = std::make_unique<PageDict4FileSeqWrite>();
DummyFileHeaderContext file_header_context;
PostingListParams params;
search::TuneFileSeqWrite tune_file_write;
params.set("numWordIds", num_words);
params.set("minChunkDocs", 256_Ki);
dw->setParams(params);
EXPECT_TRUE(dw->open(dict, tune_file_write, file_header_context));
for (int i = 0; i < num_words; ++i) {
auto word = make_word(i);
dw->writeWord(word, counts);
}
EXPECT_TRUE(dw->close());
dw.reset();
auto drr = std::make_unique<PageDict4RandRead>();
search::TuneFileRandRead tune_file_rand_read;
EXPECT_TRUE(drr->open(dict, tune_file_rand_read));
PostingListOffsetAndCounts offset_and_counts;
uint64_t exp_offset = 0;
uint64_t exp_acc_num_docs = 0;
for (int i = 0; i < num_words; ++i) {
auto word = make_word(i);
uint64_t check_word_num = 0;
EXPECT_TRUE(drr->lookup(word, check_word_num, offset_and_counts));
EXPECT_EQ(i + 1, (int) check_word_num);
EXPECT_EQ(exp_offset, offset_and_counts._offset);
EXPECT_EQ(exp_acc_num_docs, offset_and_counts._accNumDocs);
EXPECT_EQ(counts, offset_and_counts._counts);
exp_offset += offset_and_counts._counts._bitLength;
exp_acc_num_docs += offset_and_counts._counts._numDocs;
}
EXPECT_TRUE(drr->close());
drr.reset();
auto dr = std::make_unique<PageDict4FileSeqRead>();
search::TuneFileSeqRead tune_file_read;
EXPECT_TRUE(dr->open(dict, tune_file_read));
vespalib::string check_word;
PostingListCounts check_counts;
for (int i = 0; i < num_words; ++i) {
uint64_t check_word_num = 0;
check_word.clear();
dr->readWord(check_word, check_word_num, check_counts);
EXPECT_EQ(i + 1, (int) check_word_num);
EXPECT_EQ(make_word(i), check_word);
EXPECT_EQ(counts, check_counts);
}
EXPECT_TRUE(dr->close());
dr.reset();
std::filesystem::remove_all(std::filesystem::path(test_dir));
}
GTEST_MAIN_RUN_ALL_TESTS()
|