aboutsummaryrefslogtreecommitdiffstats
path: root/searchsummary/src/vespa/juniper/latintokenizer.h
blob: 0f5dd684565b09cd53a52e1a24d49f20aeb54f36 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/**
*****************************************************************************
* @author Bård Kvalheim
* @date    Creation date: 2001-12-07
*
* A configurable tokenizer template that accepts two predicates: one to
* determine separator symbols and one to determine punctuation symbols. A
* type alias, Fast_SimpleLatinTokenizer, is defined that uses isspace() and
* ispunct().
*
* This tokenizer does not alter the text, and does not copy it.
*
* This tokenizer is not meant to be used as a real tokenizer for all
* languages. It is only a fast and simple latin tokenizer, intended for
* very basic applications.
*
* The tokens are returned as (char *, char *, bool) triples.  The two
* first elements delimit the token string, while the third element is
* true if the token is a punctuation symbol.
*
* If the last character in the input text is a punctuation symbol, the last
* token is the following:
*
*    text = " something bl bla ."
*
*    token.first        -> .
*    token.second       -> \0
*    token._punctuation = true;
*
*  In other words, token.second can point to the terminating '\0' in the input
*  text.
*
*****************************************************************************/

#pragma once

#include <cctype>
#include <cstring>

/**
*****************************************************************************
* A simple tokenizer. See description above.
*
* The tokenizer only stores pointers into the text buffer it is given; tokens
* are handed out as pointer pairs into that same buffer.
*
* @tparam  IsSeparator    Default-constructible predicate callable with a
*                         char; true for characters that separate tokens.
* @tparam  IsPunctuation  Default-constructible predicate callable with a
*                         char; true for characters that are returned as
*                         single-character punctuation tokens.
*
* @class   Fast_LatinTokenizer
* @author Bård Kvalheim
* @date    Creation date: 2001-12-07
*****************************************************************************/

template <typename IsSeparator, typename IsPunctuation>
class Fast_LatinTokenizer {
public:

    /** The tokenizer is not copyable. */
    Fast_LatinTokenizer(const Fast_LatinTokenizer &) = delete;
    Fast_LatinTokenizer& operator=(const Fast_LatinTokenizer &) = delete;

    /**
     * Helper class. A token is the half-open character range [first, second)
     * inside the tokenizer's text buffer, plus a punctuation flag.
     * Copying uses the implicitly generated member-wise operations.
     */
    class Fast_Token {
    public:

        /** Member variables. */
        char *first;        // Points to start of token. Named 'first' for std::pair compatibility.
        char *second;       // Points one past the end of the token. Named 'second' for std::pair compatibility.
        bool  _punctuation; // Is the token a punctuation symbol?

        /** Constructors. */
        Fast_Token(char *begin, char *end, bool punctuation) : first(begin), second(end), _punctuation(punctuation) {}
        Fast_Token() : first(nullptr), second(nullptr), _punctuation(false) {}

    };

    /** Constructors/destructor. */
    Fast_LatinTokenizer();
    explicit Fast_LatinTokenizer(char *text);
    Fast_LatinTokenizer(char *text, size_t length);
    virtual ~Fast_LatinTokenizer();

    /** Constructors, sort of: attach a new text and restart tokenization. */
    void           SetNewText(char *text);
    void           SetNewText(char *text, size_t length);

    /** Are there any more tokens left? */
    bool           MoreTokens();

    /** Return next token. */
    Fast_Token     GetNextToken();

    /** Return text buffer. */
    char          *GetOriginalText();

    /** Accessors in case we need to perform some action specific
     *  to the IsSeparator or IsPunctuation implementations
     *  (such as extra initialization or statistics gathering or...)
     */
    IsPunctuation& GetIsPunctuation() { return _isPunctuation; }
    IsSeparator&   GetIsSeparator()   { return _isSeparator;   }

private:

    /** Member variables. */
    char          *_org;           // Holds the original text buffer.
    char          *_next;          // Points to the current buffer position.
    char          *_end;           // Points to the end of the buffer.
    bool           _moreTokens;    // More text to process?
    IsSeparator    _isSeparator;   // Separator symbol predicate.
    IsPunctuation  _isPunctuation; // Punctuation symbol predicate.

    /** Helper methods. */
    void           SkipBlanks();

};

/**
*****************************************************************************
* Default constructor.
*
* @author Bård Kvalheim
*****************************************************************************/

template <typename IsSeparator, typename IsPunctuation>
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::Fast_LatinTokenizer() :
    _org(NULL),
    _next(NULL),
    _end(NULL),
    _moreTokens(false),
    _isSeparator(),
    _isPunctuation()
{
}

/**
*****************************************************************************
* Constructor. Accepts a '\0' terminated text buffer.
*
* Starts from the empty state set up by the default constructor and then
* attaches the text through SetNewText(text).
*
* @param  text  '\0' terminated buffer to tokenize.
* @author Bård Kvalheim
*****************************************************************************/

template <typename IsSeparator, typename IsPunctuation>
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::Fast_LatinTokenizer(char *text)
    : Fast_LatinTokenizer()
{
    SetNewText(text);
}

/**
*****************************************************************************
* Constructor. Accepts a text buffer and the buffer length.
*
* Starts from the empty state set up by the default constructor and then
* attaches the text through SetNewText(text, length).
*
* @param  text    Start of the buffer to tokenize.
* @param  length  Number of characters in the buffer.
* @author Bård Kvalheim
*****************************************************************************/

template <typename IsSeparator, typename IsPunctuation>
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::Fast_LatinTokenizer(char *text, size_t length)
    : Fast_LatinTokenizer()
{
    SetNewText(text, length);
}

/**
*****************************************************************************
* Destructor. Performs no work of its own; in particular the text buffer is
* not released here.
*
* @author Bård Kvalheim
*****************************************************************************/

template <typename IsSeparator, typename IsPunctuation>
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::~Fast_LatinTokenizer() = default;

/**
*****************************************************************************
* Sets a new '\0' terminated string and restarts tokenization from its
* beginning.
*
* @param  text  '\0' terminated buffer to tokenize. If it is a null pointer
*               the "more tokens" flag is cleared.
* @author Bård Kvalheim
*****************************************************************************/

template <typename IsSeparator, typename IsPunctuation>
void
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::SetNewText(char *text) {

    // A null end pointer marks the buffer as '\0' terminated.
    _end        = nullptr;

    // Remember the buffer and rewind the read position to its start.
    _org        = text;
    _next       = _org;

    // Without a buffer there is nothing to tokenize.
    _moreTokens = (_org != nullptr);
}

/**
*****************************************************************************
* Sets a new string, given the text buffer and its length, and restarts
* tokenization from its beginning.
*
* @param  text    Start of the buffer to tokenize. If it is a null pointer
*                 the "more tokens" flag is cleared and the length is ignored.
* @param  length  Number of characters in the buffer; the end of the buffer
*                 is taken to be text + length.
* @author Bård Kvalheim
*****************************************************************************/

template <typename IsSeparator, typename IsPunctuation>
void
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::SetNewText(char *text, size_t length) {

    // Remember the buffer and rewind the read position to its start.
    _org  = text;
    _next = text;

    if (text == nullptr) {
        // No buffer: nothing to tokenize and no end pointer to compute.
        _moreTokens = false;
        _end        = nullptr;
    } else {
        _moreTokens = true;
        _end        = text + length;
    }
}

/**
*****************************************************************************
* Advances the read position past any separator characters and clears the
* "more tokens" flag if the end of the text is reached while doing so.
*
* Does nothing when the flag is already cleared.
*
* @author Bård Kvalheim
*****************************************************************************/

template <typename IsSeparator, typename IsPunctuation>
void
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::SkipBlanks() {

    // Already exhausted: leave the read position where it is.
    if (!_moreTokens) {
        return;
    }

    if (_end != nullptr) {
        // Initialized with specified buffer length: stop at the end pointer.
        for (; _next != _end; ++_next) {
            if (!_isSeparator(*_next)) {
                break;
            }
        }
        // The flag is known to be set here, so this only ever clears it.
        _moreTokens = (_next != _end);
    } else {
        // Initialized with '\0' terminated buffer: stop at the terminator.
        for (; *_next != '\0'; ++_next) {
            if (!_isSeparator(*_next)) {
                break;
            }
        }
        // The flag is known to be set here, so this only ever clears it.
        _moreTokens = (*_next != '\0');
    }

}

/**
*****************************************************************************
* Returns true if there are more tokens left in the text buffer.
*
* As a side effect the read position is moved past any separators in front
* of the next token.
*
* @return true if at least one more token can be fetched.
* @author Bård Kvalheim
*****************************************************************************/

template <typename IsSeparator, typename IsPunctuation>
bool
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::MoreTokens() {
    // Skipping separators is what detects that only blanks (or nothing) remain.
    SkipBlanks();
    const bool tokensRemain = _moreTokens;
    return tokensRemain;
}

/**
*****************************************************************************
* Returns the next token as a Fast_Token.
*
* Separators in front of the token are skipped and are not part of the
* returned range. A punctuation character is returned as a token of its own
* with the punctuation flag set; any other token runs up to the next
* separator, punctuation character or end of text.
*
* If no tokens are left (or no text is attached) an empty, non-punctuation
* token is returned whose 'first' and 'second' both equal the current read
* position; nothing is dereferenced in that case.
*
* @return the next token, delimited by pointers into the text buffer.
* @author Bård Kvalheim
*****************************************************************************/

template <typename IsSeparator, typename IsPunctuation>
typename Fast_LatinTokenizer<IsSeparator, IsPunctuation>::Fast_Token
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::GetNextToken() {

    // Skip all blanks and flag if there are no more tokens. This must happen
    // before the token start is recorded, otherwise the token would begin on
    // the skipped separators.
    SkipBlanks();

    // Nothing left to read: the read position may be null or sit at the end
    // of the buffer, so it must not be dereferenced or advanced.
    if (!_moreTokens) {
        return Fast_Token(_next, _next, false);
    }

    char *const start = _next;

    // Initialized with '\0' terminated buffer? Find the next blank or punctuation.
    if (_end == nullptr) {
        while (*_next != '\0' && !_isSeparator(*_next) && !_isPunctuation(*_next)) {
            ++_next;
        }

        // Initialized with specified buffer length.
    } else {
        while (_next != _end && !_isSeparator(*_next) && !_isPunctuation(*_next)) {
            ++_next;
        }
    }

    // Nothing consumed means the token starts on a punctuation symbol; such a
    // symbol forms a single-character token.
    const bool isPunctuation = ((_next == start) && _isPunctuation(*start));

    if (isPunctuation) {
        ++_next;
    }

    return Fast_Token(start, _next, isPunctuation);

}

/**
*****************************************************************************
* Returns the original text buffer, i.e. the pointer that was handed to the
* tokenizer when the text was set.
*
* @return start of the text buffer, or a null pointer if none is attached.
* @author Bård Kvalheim
*****************************************************************************/

template <typename IsSeparator, typename IsPunctuation>
char *
Fast_LatinTokenizer<IsSeparator, IsPunctuation>::GetOriginalText() {
    // _org stays at the start of the buffer; only _next moves while tokenizing.
    char *const original = _org;
    return original;
}

/**
*****************************************************************************
* Helper class. Separator predicate based on isspace().
*
* When using isspace(), ensure that the argument is cast to unsigned char to
* avoid problems with sign extension. See system documentation for details.
*
* The call operator is const so the predicate can also be used through a
* const object or reference.
*
* @class   Fast_IsSpace
* @author Bård Kvalheim
* @date    Creation date: 2001-12-07
*****************************************************************************/

struct Fast_IsSpace {
    /** @return true if c is a whitespace character according to isspace(). */
    bool operator()(char c) const {
        return std::isspace(static_cast<unsigned char>(c)) != 0;
    }
};

/**
*****************************************************************************
* Helper class. Punctuation predicate based on ispunct().
*
* When using ispunct(), ensure that the argument is cast to unsigned char to
* avoid problems with sign extension. See system documentation for details.
*
* The call operator is const so the predicate can also be used through a
* const object or reference.
*
* @class   Fast_IsPunctuation
* @author Bård Kvalheim
* @date    Creation date: 2001-12-07
*****************************************************************************/

struct Fast_IsPunctuation {
    /** @return true if c is a punctuation character according to ispunct(). */
    bool operator()(char c) const {
        return std::ispunct(static_cast<unsigned char>(c)) != 0;
    }
};

/**
*****************************************************************************
* A simple tokenizer. See description above.
*
* Type alias for Fast_LatinTokenizer instantiated with Fast_IsSpace as the
* separator predicate and Fast_IsPunctuation as the punctuation predicate.
*
* @class   Fast_SimpleLatinTokenizer
* @author Bård Kvalheim
* @date    Creation date: 2001-12-07
*****************************************************************************/

using Fast_SimpleLatinTokenizer = Fast_LatinTokenizer<Fast_IsSpace, Fast_IsPunctuation>;