aboutsummaryrefslogtreecommitdiffstats
path: root/fsa/src/vespa/fsa/ngram.h
blob: 6c995637318cde2c2091ce60e1aa0ed0cbda4629 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/**
 * @author  Peter Boros
 * @date    2004/08/20
 * @version $Id$
 * @file    ngram.h
 * @brief   n-gram class for tokenized text.
 */

#pragma once

#include <iostream>
#include <vector>
#include <string>
#include <algorithm>

#include "unicode.h"
#include "selector.h"
#include "permuter.h"
#include "tokenizer.h"

namespace fsa {

// {{{ class NGram

/**
 * @class NGram
 * @brief Class for representing n-grams.
 *
 * Supports tokenization and various manipulation methods, such as
 * join, sort, uniq, etc.
 */
class NGram {

private:
  std::vector<std::string>  _tokens;        /**< Vector holding the tokens.             */

public:
  /**
   * @brief Default constructor, creates empty NGram.
   */
  NGram() : _tokens() {}

  /**
   * @brief Constructor.
   *
   * Creates an NGram object from a utf-8 encoded character
   * string. The string must be zero terminated. The string is
   * tokenized using unicode wordchar property. For certain punctuation
   * strategies, a special punctuation token is inserted if a punctuation
   * character is found.
   *
   * @param text Input text.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  NGram(const char *text,
        unsigned int from=0, int length=-1);

  /**
   * @brief Constructor.
   *
   * Creates an NGram object from a utf-8 encoded character
   * string. The string must be zero terminated. The string is
   * tokenized using the supplied tokenizer.
   *
   * @param text Input text.
   * @param tokenizer Tokenizer.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  NGram(const char *text,
        Tokenizer &tokenizer,
        unsigned int from=0, int length=-1);

  /**
   * @brief (Sort of) Copy constructor.
   *
   * Because both trailing parameters have default values, this
   * overload is also the copy constructor of the class as far as the
   * language is concerned.
   *
   * @param g NGram object to copy.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  NGram(const NGram &g, unsigned int from=0, int length=-1);

  /**
   * @brief (Sort of) Copy constructor.
   *
   * Copy selected tokens from an NGram object.
   *
   * @param g NGram object to copy.
   * @param select Selector indicating which tokens to copy.
   */
  NGram(const NGram &g, const Selector &select);

  /**
   * @brief (Sort of) Copy constructor.
   *
   * Create a new NGram and permute the tokens.
   *
   * @param g NGram object to copy.
   * @param p Permuter object.
   * @param id Permutation ID.
   */
  NGram(const NGram &g, const Permuter &p, unsigned int id);

  /**
   * @brief Constructor.
   *
   * Creates an NGram object from a utf-8 encoded std::string. The
   * string is tokenized using unicode wordchar property. For certain
   * punctuation strategies, a special punctuation token is inserted if
   * a punctuation character is found.
   *
   * @param s Input text.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  NGram(const std::string &s,
        unsigned int from=0, int length=-1);

  /**
   * @brief Constructor.
   *
   * Creates an NGram object from a utf-8 encoded std::string. The
   * string is tokenized using the supplied tokenizer.
   *
   * @param s Input text.
   * @param tokenizer Tokenizer.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  NGram(const std::string &s,
        Tokenizer &tokenizer,
        unsigned int from=0, int length=-1);

  /**
   * @brief Set the object.
   *
   * Reinitializes the NGram object from a utf-8 encoded character
   * string. The string must be zero terminated. The string is
   * tokenized using unicode wordchar property. For certain punctuation
   * strategies, a special punctuation token is inserted if a punctuation
   * character is found.
   *
   * @param text Input text.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  void set(const char *text,
           unsigned int from=0, int length=-1);

  /**
   * @brief Set the object.
   *
   * Reinitializes the NGram object from a utf-8 encoded character
   * string. The string must be zero terminated. The string is
   * tokenized using the supplied tokenizer.
   *
   * @param text Input text.
   * @param tokenizer Tokenizer.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  void set(const char *text,
           Tokenizer &tokenizer,
           unsigned int from=0, int length=-1);

  /**
   * @brief Set the object.
   *
   * @param g NGram object to copy.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  void set(const NGram &g, unsigned int from=0, int length=-1);

  /**
   * @brief Set the object.
   *
   * Copy selected tokens from an NGram object.
   *
   * @param g NGram object to copy.
   * @param select Selector indicating which tokens to copy.
   */
  void set(const NGram &g, const Selector &select);

  /**
   * @brief Set the object.
   *
   * Set the object from another NGram, permuting the tokens.
   *
   * @param g NGram object to copy.
   * @param p Permuter object.
   * @param id Permutation ID.
   */
  void set(const NGram &g, const Permuter &p, unsigned int id);

  /**
   * @brief Set the object.
   *
   * Reinitializes the NGram object from a utf-8 encoded
   * std::string. The string is tokenized using unicode wordchar
   * property. For certain punctuation strategies, a special punctuation
   * token is inserted if a punctuation character is found.
   *
   * @param s Input text.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  void set(const std::string &s,
           unsigned int from=0, int length=-1);

  /**
   * @brief Set the object.
   *
   * Reinitializes the NGram object from a utf-8 encoded
   * std::string. The string is tokenized using the supplied tokenizer.
   *
   * @param s Input text.
   * @param tokenizer Tokenizer.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  void set(const std::string &s,
           Tokenizer &tokenizer,
           unsigned int from=0, int length=-1);

  /**
   * @brief Set the object.
   *
   * Reinitializes the object from an std::string, as a single token.
   *
   * @param s Input string.
   */
  void setOne(const std::string &s);

  /**
   * @brief Append tokens to the object.
   *
   * Appends tokens to the NGram object from a utf-8 encoded character
   * string. The string must be zero terminated. The string is
   * tokenized using unicode wordchar property. For certain punctuation
   * strategies, a special punctuation token is inserted if a
   * punctuation character is found.
   *
   * @param text Input text.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  void append(const char *text,
              unsigned int from=0, int length=-1);

  /**
   * @brief Append tokens to the object.
   *
   * Appends tokens to the NGram object from a utf-8 encoded character
   * string. The string must be zero terminated. The string is
   * tokenized using the supplied tokenizer.
   *
   * @param text Input text.
   * @param tokenizer Tokenizer.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  void append(const char *text,
              Tokenizer &tokenizer,
              unsigned int from=0, int length=-1);

  /**
   * @brief Append tokens to the object.
   *
   * @param g NGram object to append.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  void append(const NGram &g, unsigned int from=0, int length=-1);

  /**
   * @brief Append tokens to the object.
   *
   * Append selected tokens from an NGram object.
   *
   * @param g NGram object to append.
   * @param select Selector indicating which tokens to copy.
   */
  void append(const NGram &g, const Selector &select);

  /**
   * @brief Append tokens to the object.
   *
   * Append a permuted NGram.
   *
   * @param g NGram object to append.
   * @param p Permuter object.
   * @param id Permutation ID.
   */
  void append(const NGram &g, const Permuter &p, unsigned int id);

  /**
   * @brief Append tokens to the object.
   *
   * Appends tokens to the NGram object from a utf-8 encoded
   * std::string. The string is tokenized using unicode wordchar
   * property. For certain punctuation strategies, a special punctuation
   * token is inserted if a punctuation character is found.
   *
   * @param s Input text.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  void append(const std::string &s,
              unsigned int from=0, int length=-1);

  /**
   * @brief Append tokens to the object.
   *
   * Appends tokens to the NGram object from a utf-8 encoded
   * std::string. The string is tokenized using the supplied tokenizer.
   *
   * @param s Input text.
   * @param tokenizer Tokenizer.
   * @param from Starting token to keep (preceding tokens are ignored).
   * @param length Number of tokens to keep.
   */
  void append(const std::string &s,
              Tokenizer &tokenizer,
              unsigned int from=0, int length=-1);

  /**
   * @brief Append a single token to the object.
   *
   * Appends a single token from an std::string.
   *
   * @param s Input string.
   */
  void appendOne(const std::string &s);


  /**
   * @brief Reset the object.
   *
   * Removes all tokens, leaving an empty n-gram.
   */
  void clear() { _tokens.clear(); }

  /**
   * @brief Get the size of the n-gram (number of tokens).
   *
   * @return Number of tokens in n-gram.
   */
  unsigned int size() const {
    // std::vector::size() yields size_t; the narrowing to the
    // unsigned int used throughout this interface is made explicit.
    return static_cast<unsigned int>(_tokens.size());
  }

  /**
   * @brief Get the length (size) of the n-gram (number of tokens).
   *
   * Synonym for size().
   *
   * @return Number of tokens in n-gram.
   */
  unsigned int length() const { return size(); }

  /**
   * @brief Sort the tokens lexicographically.
   */
  void sort() { std::sort(_tokens.begin(),_tokens.end()); }

  /**
   * @brief Remove duplicate tokens from a sorted n-gram.
   *
   * @return NOTE(review): presumably the number of tokens left after
   *         the duplicates are removed; the implementation is not in
   *         this header, confirm against the definition.
   */
  unsigned int uniq();

  /**
   * @brief Reverse the order of the tokens.
   */
  void reverse() { std::reverse(_tokens.begin(),_tokens.end()); }

  /**
   * @brief Join the whole or parts of the n-gram to a single string.
   *
   * @param separator Separator string (default is a single space).
   * @param from Starting token (default 0).
   * @param length Number of tokens (default -1 which means all).
   * @return Joined tokens.
   */
  std::string join(const std::string &separator = " ",
                   unsigned int from=0, int length=-1) const;

  /**
   * @brief Index operator.
   *
   * Provides direct access to a token. The index must be in the range
   * of 0..length()-1, this is not checked.
   *
   * @param i Index.
   * @return Reference to token.
   */
  std::string& operator[](unsigned int i) { return _tokens[i]; }

  /**
   * @brief Index operator.
   *
   * Provides direct const access to a token. The index must be in the
   * range of 0..length()-1, this is not checked.
   *
   * @param i Index.
   * @return Const reference to token.
   */
  const std::string& operator[](unsigned int i) const { return _tokens[i]; }

  /**
   * @brief Get permutation ID to another n-gram.
   *
   * Get permutation ID to another n-gram. The other n-gram should
   * consist of the same tokens in different order.
   *
   * @param g The other n-gram.
   * @param p Permuter object.
   * @return Permutation ID.
   */
  int getPermIdTo(const NGram &g, const Permuter &p) const;

  /**
   * @brief Output operator.
   *
   * @param out Reference to output stream.
   * @param g n-gram.
   * @return Reference to output stream.
   */
  friend std::ostream& operator<<(std::ostream &out, const NGram &g);
};

// }}}

} // namespace fsa