aboutsummaryrefslogtreecommitdiffstats
path: root/fsa/src/vespa/fsa/conceptnet.h
blob: 04f931a3230b5079ccf5b57e8d79e3f6974de852 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
/**
 * @author  Peter Boros
 * @date    2004/10/01
 * @version $Id$
 * @file    conceptnet.h
 * @brief   Concept network class definition.
 *
 */

#pragma once

#include <cassert>
#include <stdlib.h>
#include "file.h" // for FileAccessMethod
#include "fsa.h"


namespace fsa {

// {{{ class ConceptNet

/**
 * @class ConceptNet
 * @brief Compact representation of a concept network.
 *
 * The units are held in an %FSA (with hash), while the per-unit data
 * is kept in index, info, category index and string storage
 * structures.
 */
class ConceptNet {

public:

  /** Forward declaration only; the class is defined in conceptnethandle.h. */
  class Handle;

private:

  /** Magic number identifying concept net files. */
  static const uint32_t MAGIC = 238579428;

  /** Default file access method (read/mmap). */
  static const FileAccessMethod _default_file_access_method = FILE_ACCESS_MMAP;

  /**
   * @struct Header
   * @brief Header of the concept net data file.
   */
  struct Header {
    /** Magic number. */
    uint32_t _magic;
    /** Version number. (currently not used) */
    uint32_t _version;
    /** Checksum. (currently not used) */
    uint32_t _checksum;
    /** Size of index structure. */
    uint32_t _index_size;
    /** Size of info structure. */
    uint32_t _info_size;
    /** Size of category index. */
    uint32_t _catindex_size;
    /** Size of string storage. */
    uint32_t _strings_size;
    /** Reserved for normalization purposes. */
    uint32_t _max_freq;
    /** Reserved for normalization purposes. */
    uint32_t _max_cfreq;
    /** Reserved for normalization purposes. */
    uint32_t _max_qfreq;
    /** Reserved for normalization purposes. */
    uint32_t _max_sfreq;
    /** Reserved for normalization purposes. */
    uint32_t _max_efreq;
    /** Reserved for normalization purposes. */
    uint32_t _max_afreq;
    /** Reserved. */
    uint32_t _dummy[51];
  };

  /**
   * @struct UnitData
   * @brief Per-unit record of the index structure.
   */
  struct UnitData {
    /** Offset of unit string in string storage. */
    uint32_t _term;
    /** Unit frequency. */
    uint32_t _frq;
    /** Frequency of the unit as complete query. */
    uint32_t _cfrq;
    /** Frequency of the unit as part of a query. */
    uint32_t _qfrq;
    /** Number of queries containing all unit terms. */
    uint32_t _sfrq;
    /** If non-zero: offset of extension info in info structure. */
    uint32_t _exts;
    /** If non-zero: offset of association info in info structure. */
    uint32_t _assocs;
    /** If non-zero: offset of category info in info structure. */
    uint32_t _cats;
  };

  /** mmap address, NULL if the file has not been mmapped. */
  void *_mmap_addr;
  /** mmap length. */
  size_t _mmap_length;

  /** %FSA containing the units (with hash). */
  FSA _unit_fsa;
  /** Size of the index structure. */
  uint32_t _index_size;
  /** Pointer to the index structure in memory. */
  UnitData *_index;
  /** Size of the info structure. */
  uint32_t _info_size;
  /** Pointer to the info structure in memory. */
  uint32_t *_info;
  /** Size of the category index. */
  uint32_t _catindex_size;
  /** Pointer to the category index in memory. */
  uint32_t *_catindex;
  /** Size of the string storage. */
  uint32_t _strings_size;
  /** Pointer to the string storage in memory. */
  char *_strings;

  /** Flag indicating successful initialization. */
  bool _ok;

  /**
   * @brief Reset the object.
   *
   * Turns the object back into an empty %ConceptNet and releases
   * allocated memory.
   */
  void reset();

  /**
   * @brief Read the concept net data file from disk.
   *
   * @param datafile Name of the concept net data file.
   * @param fam File access mode (read or mmap). If not set, the
   *            global default access mode will be used.
   * @return True on success.
   */
  bool read(const char *datafile, FileAccessMethod fam = FILE_ACCESS_UNDEF);

  /**
   * @brief Private default constructor, left unimplemented.
   */
  ConceptNet();

  /**
   * @brief Private copy constructor, left unimplemented.
   */
  ConceptNet(const ConceptNet&);

  /**
   * @brief Private assignment operator, left unimplemented.
   */
  const ConceptNet& operator=(const ConceptNet&);

public:

  /**
   * @brief Constructor.
   *
   * @param fsafile %FSA file containing the units, with a perfect hash
   *                (used for indexing the data file).
   * @param datafile Concept net data file.
   * @param fam File access mode (read or mmap). If not set, the
   *            global default access mode will be used.
   */
  ConceptNet(const char *fsafile, const char *datafile = NULL, FileAccessMethod fam = FILE_ACCESS_UNDEF);

  /**
   * @brief Constructor taking the file names as std::string.
   *
   * @param fsafile %FSA file containing the units, with a perfect hash
   *                (used for indexing the data file).
   * @param datafile Concept net data file.
   * @param fam File access mode (read or mmap). If not set, the
   *            global default access mode will be used.
   */
  ConceptNet(const std::string &fsafile, const std::string &datafile, FileAccessMethod fam = FILE_ACCESS_UNDEF);

  /**
   * @brief Destructor.
   */
  virtual ~ConceptNet();

  /**
   * @brief Check if initialization was successful.
   *
   * @return True if the initialization of the object succeeded.
   */
  bool isOk() const { return _ok; }

  /**
   * @brief Get the concept net %FSA.
   *
   * The returned object remains owned by the concept net. Asserts
   * that the concept net has been initialized successfully.
   *
   * @return The concept net %FSA.
   */
  const FSA& getFSA() const { assert(_ok); return _unit_fsa; }

  /**
   * @brief Look up a unit.
   *
   * Finds a unit in the concept net and yields its index.
   *
   * @param unit Unit string.
   * @return Index of the unit, or -1 if not found.
   */
  int lookup(const char *unit) const;

  /**
   * @brief Look up a unit index.
   *
   * Finds a unit index in the concept net and yields the unit string.
   *
   * @param idx Unit index.
   * @return Pointer to the unit string, or NULL if index is out of range.
   */
  const char * lookup(int idx) const;

  /**
   * @brief Get the unit frequency of the unit.
   *
   * @param idx Unit index.
   * @return Unit frequency, or -1 if the index is out of range.
   */
  int frq(int idx) const;

  /**
   * @brief Get the unit frequency of the unit.
   *
   * @param unit Unit string.
   * @return Unit frequency, or -1 if the unit is not found.
   */
  int frq(const char *unit) const;

  /**
   * @brief Get the frequency of the unit as a complete query.
   *
   * @param idx Unit index.
   * @return Unit-C frequency, or -1 if the index is out of range.
   */
  int cFrq(int idx) const;

  /**
   * @brief Get the frequency of the unit as a complete query.
   *
   * @param unit Unit string.
   * @return Unit-C frequency, or -1 if the unit is not found.
   */
  int cFrq(const char *unit) const;

  /**
   * @brief Get the frequency of the unit as part of a query.
   *
   * @param idx Unit index.
   * @return Unit-Q frequency, or -1 if the index is out of range.
   */
  int qFrq(int idx) const;

  /**
   * @brief Get the frequency of the unit as part of a query.
   *
   * @param unit Unit string.
   * @return Unit-Q frequency, or -1 if the unit is not found.
   */
  int qFrq(const char *unit) const;

  /**
   * @brief Get the frequency of queries containing all terms of the unit.
   *
   * @param idx Unit index.
   * @return Unit-S frequency, or -1 if the index is out of range.
   */
  int sFrq(int idx) const;

  /**
   * @brief Get the frequency of queries containing all terms of the unit.
   *
   * @param unit Unit string.
   * @return Unit-S frequency, or -1 if the unit is not found.
   */
  int sFrq(const char *unit) const;

  /**
   * @brief Get the unit score (100.0*cFrq/qFrq).
   *
   * @param idx Unit index.
   * @return Unit score, or -1.0 if the index is out of range.
   */
  double score(int idx) const;

  /**
   * @brief Get the unit score (100.0*cFrq/qFrq).
   *
   * @param unit Unit string.
   * @return Unit score, or -1.0 if the unit is not found.
   */
  double score(const char *unit) const;

  /**
   * @brief Get the unit strength (100.0*qFrq/sFrq).
   *
   * @param idx Unit index.
   * @return Unit strength, or -1.0 if the index is out of range.
   */
  double strength(int idx) const;

  /**
   * @brief Get the unit strength (100.0*qFrq/sFrq).
   *
   * @param unit Unit string.
   * @return Unit strength, or -1.0 if the unit is not found.
   */
  double strength(const char *unit) const;

  /**
   * @brief Get the number of extensions for the unit.
   *
   * @param idx Unit index.
   * @return Number of extensions for the unit, -1 if the index is out
   *         of range.
   */
  int numExt(int idx) const;

  /**
   * @brief Get the number of associations for the unit.
   *
   * @param idx Unit index.
   * @return Number of associations for the unit, -1 if the index is out
   *         of range.
   */
  int numAssoc(int idx) const;

  /**
   * @brief Get the number of categories for the unit.
   *
   * @param idx Unit index.
   * @return Number of categories for the unit, -1 if the index is out
   *         of range.
   */
  int numCat(int idx) const;

  /**
   * @brief Get the index of an extension.
   *
   * @param idx Unit index.
   * @param j Number of the extension (extensions of each unit are
   *          sorted by decreasing weight).
   * @return Extension (unit) index, -1 if idx or j is out
   *         of range.
   */
  int ext(int idx, int j) const;

  /**
   * @brief Get the frequency of an extension.
   *
   * @param idx Unit index.
   * @param j Number of the extension (extensions of each unit are
   *          sorted by decreasing weight).
   * @return Extension frequency, -1 if idx or j is out
   *         of range.
   */
  int extFrq(int idx, int j) const;

  /**
   * @brief Get the index of an association.
   *
   * @param idx Unit index.
   * @param j Number of the association (associations of each unit are
   *          sorted by decreasing weight).
   * @return Association (unit) index, -1 if idx or j is out
   *         of range.
   */
  int assoc(int idx, int j) const;

  /**
   * @brief Get the frequency of an association.
   *
   * @param idx Unit index.
   * @param j Number of the association (associations of each unit are
   *          sorted by decreasing weight).
   * @return Association frequency, -1 if idx or j is out
   *         of range.
   */
  int assocFrq(int idx, int j) const;

  /**
   * @brief Get the index of a category.
   *
   * @param idx Unit index.
   * @param j Number of the category.
   * @return Category index, -1 if idx or j is out of range.
   */
  int cat(int idx, int j) const;

  /**
   * @brief Get the name of a category.
   *
   * @param catIdx Category index.
   * @return Category name, or NULL if catIdx is out of range.
   */
  const char *catName(int catIdx) const;

};

// }}}

} // namespace fsa