summaryrefslogtreecommitdiffstats
path: root/searchlib/src/vespa/searchlib/diskindex/dictionarywordreader.h
blob: 7204ba14d255db97ccb59df37f76968516810a56 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#pragma once

#include "pagedict4file.h"
#include <vespa/fastlib/io/bufferedfile.h>

namespace search
{

namespace diskindex
{


/*
 * Helper class, will be used by fusion later to handle generation of
 * word numbering without writing a word list file.
 */
class WordAggregator
{
private:
    // Most recent word that was assigned a new number.
    vespalib::string _word;
    // Number handed to _word; 0 means no word has been seen yet.
    uint64_t _wordNum;

public:
    WordAggregator() : _word(), _wordNum(0) { }

    /*
     * Register the next word.  The word number is advanced when the
     * word differs from the previously stored one, or when no word
     * has been registered so far; a repeat of the stored word keeps
     * the current number.
     */
    void tryWriteWord(const vespalib::stringref &word)
    {
        // The explicit "nothing seen yet" check makes sure the very
        // first word gets number 1 even if it compares equal to the
        // default-constructed (empty) _word.
        const bool nothingSeenYet = (_wordNum == 0);
        const bool startsNewWord = nothingSeenYet || (word != _word);
        if (!startsNewWord) {
            return;
        }
        ++_wordNum;
        _word = word;
    }

    /*
     * Number of the most recently registered word (0 before the
     * first call to tryWriteWord()).
     */
    uint64_t getWordNum() const { return _wordNum; }
};


/*
 * Class used to merge words in multiple dictionaries for
 * new style fusion (using WordAggregator).
 */
class DictionaryWordReader
{
public:
    // Current dictionary entry; these are the arguments read() hands
    // to the dictionary file's readWord().
    vespalib::string         _word;
    uint64_t                 _wordNum;
    index::PostingListCounts _counts;

private:
    using DictionaryFileSeqRead = index::DictionaryFileSeqRead;

    // The reader owns both file objects below.
    std::unique_ptr<Fast_BufferedFile>     _old2newwordfile;
    std::unique_ptr<DictionaryFileSeqRead> _dictFile;

    void allocFiles();

    // Word number that isValid() treats as "no valid word".
    static uint64_t noWordNumHigh() { return std::numeric_limits<uint64_t>::max(); }

    // Zero word number.
    static uint64_t noWordNum() { return 0u; }

public:
    DictionaryWordReader();

    ~DictionaryWordReader();

    /*
     * True as long as the current word number differs from
     * noWordNumHigh().
     */
    bool isValid() const { return noWordNumHigh() != _wordNum; }

    /*
     * Ordering used when merging readers: an invalid reader is never
     * less than anything, a valid reader is less than any invalid
     * one, and two valid readers are ordered by their current word.
     */
    bool operator<(const DictionaryWordReader &rhs) const
    {
        return isValid() && (!rhs.isValid() || _word < rhs._word);
    }

    /*
     * Fetch the next entry from the dictionary file into _word,
     * _wordNum and _counts.
     * NOTE(review): presumably readWord() reports end of dictionary
     * by setting the word number to noWordNumHigh() -- confirm
     * against DictionaryFileSeqRead.
     */
    void read() { _dictFile->readWord(_word, _wordNum, _counts); }

    bool open(const vespalib::stringref & dictionaryName,
              const vespalib::stringref & wordMapName,
              const TuneFileSeqRead &tuneFileRead);

    void close();

    /*
     * Append newWordNum to the old-to-new word number file, written
     * as the sizeof(newWordNum) bytes of its in-memory representation.
     */
    void writeNewWordNum(uint64_t newWordNum)
    {
        _old2newwordfile->WriteBuf(&newWordNum, sizeof(newWordNum));
    }

    /*
     * Feed the current word to the aggregator and record the word
     * number it reports afterwards in the old-to-new file.
     */
    void write(WordAggregator &writer)
    {
        writer.tryWriteWord(_word);
        const uint64_t assignedWordNum = writer.getWordNum();
        writeNewWordNum(assignedWordNum);
    }
};



} // namespace diskindex

} // namespace search