fsa/src/vespa/fsa/wordchartokenizer.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#include "wordchartokenizer.h"
#include "unicode.h"

#include <stdlib.h>
#include <string.h>

#include <memory>


namespace fsa {

// Lookup table indexed by 7-bit ASCII code (128 entries, 16 per row).
// A non-zero entry marks a character that init() treats as punctuation,
// i.e. one that may cause _punctuation_token to be inserted between tokens.
// init() only consults the table for character codes below 128.
//
// Flagged:     ! # $ % ( ) * , . : ; < = > ? @ [ backslash ] ^ { | } ~
// Not flagged: control characters, space, " & ' + - / _ ` DEL,
//              digits and letters.
const bool WordCharTokenizer::_punctuation_table[] = {
  // 0x00-0x0F: control characters, none flagged
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x10-0x1F: control characters, none flagged
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x20-0x2F: flagged are ! # $ % ( ) * , .
  0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,
  // 0x30-0x3F: digits not flagged; flagged are : ; < = > ?
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
  // 0x40-0x4F: @ flagged; letters A-O not flagged
  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x50-0x5F: letters P-Z not flagged; flagged are [ backslash ] ^ (underscore is not)
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
  // 0x60-0x6F: backtick and letters a-o, none flagged
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  // 0x70-0x7F: letters p-z not flagged; flagged are { | } ~ (DEL is not)
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
};


// (Re)initialize the tokenizer from a UTF-8 encoded text.
//
// Any previously stored tokens are discarded and the read position is reset.
// The text is duplicated (lowercased during duplication when _lowercase is
// set) and split into tokens in place:
//   - in PUNCTUATION_WHITESPACEONLY mode tokens are maximal runs of
//     non-space characters (Unicode::isSpaceChar);
//   - in all other modes tokens are maximal runs of word characters
//     (Unicode::isWordChar).
// Unless the mode is PUNCTUATION_DISCARD or PUNCTUATION_WHITESPACEONLY, a
// single _punctuation_token is inserted between two tokens when at least one
// ASCII character flagged in _punctuation_table separates them. A trailing
// punctuation token is never kept.
//
// @param text  input text, expected to be UTF-8
// @return true on success; false if the working copy of the text could not
//         be obtained (in which case the tokenizer is left empty)
bool WordCharTokenizer::init(const std::string &text)
{
  _tokens.clear();
  _current = 0;

  char *dup;
  if(_lowercase)
    dup = Unicode::strlowdupUTF8(text.c_str());
  else
    dup = Unicode::strdupUTF8(text.c_str());

  // NOTE(review): presumably the duplication helpers return a null pointer on
  // allocation failure - confirm against unicode.h. Guard so that we never
  // dereference a null buffer below.
  if(dup == nullptr) {
    return false;
  }

  // Own the buffer so it is released on every exit path, including when
  // _tokens.push_back() throws (e.g. std::bad_alloc).
  const std::unique_ptr<char, void (*)(void *)> buffer_owner(dup, ::free);

  // The punctuation mode does not change while tokenizing; evaluate it once.
  const bool whitespace_only  = (_punctuation == PUNCTUATION_WHITESPACEONLY);
  const bool emit_punctuation = !whitespace_only && (_punctuation != PUNCTUATION_DISCARD);

  char   *cursor = dup;         // next undecoded position in the buffer
  ucs4_t  ch = 0;               // most recently decoded character
  bool    need_punct  = false;  // a token has been emitted, so punctuation may follow
  bool    added_punct = false;  // the last element of _tokens is a punctuation token

  while(*cursor) {
    // Phase 1: skip separator characters. On exit `tok` points at the first
    // byte of the next token (its first character is already consumed), or at
    // the terminating NUL if the text is exhausted.
    char *tok = cursor;
    while(*cursor) {
      ch = Unicode::getUTF8Char(cursor);
      const bool separator = whitespace_only ? Unicode::isSpaceChar(ch)
                                             : !Unicode::isWordChar(ch);
      if(!separator) {
        break;
      }
      if(emit_punctuation && ch<128 && _punctuation_table[ch] && need_punct && !added_punct){
        _tokens.push_back(_punctuation_token);
        added_punct=true;
      }
      tok = cursor;
    }

    // Phase 2: consume the rest of the token. On exit `end` points at the
    // first byte of the separator that terminated the token (that separator
    // has been consumed and is left in `ch`), or at the terminating NUL.
    char *end = cursor;
    while(*cursor) {
      ch = Unicode::getUTF8Char(cursor);
      const bool separator = whitespace_only ? Unicode::isSpaceChar(ch)
                                             : !Unicode::isWordChar(ch);
      if(separator) {
        break;
      }
      end = cursor;
    }

    // Terminate the token in place so it can be read as a C string.
    if(*end) {
      *end=0;
    }
    if(*tok){
      _tokens.push_back(std::string(tok));
      added_punct = false;
      need_punct = true;
      // The separator that ended the token was consumed in phase 2, so it has
      // to be checked for punctuation here rather than in the next phase 1.
      if(emit_punctuation && ch<128 && _punctuation_table[ch]){
        // Outside PUNCTUATION_FULL mode a '.' directly after a one-byte token
        // does not produce a punctuation token (presumably so that initials
        // and abbreviations are not split up).
        if(_punctuation==PUNCTUATION_FULL || ch!='.' || strlen(tok)>1){
          _tokens.push_back(_punctuation_token);
          added_punct=true;
        }
      }
    }
  }

  if(added_punct) {  // The last token is a punctuation token, drop it
    _tokens.pop_back();
  }

  return true;
}


// Tell whether getNext() still has a token to hand out, i.e. whether the
// read position is before the end of the stored token list.
bool WordCharTokenizer::hasMore()
{
  const bool tokens_left = _current < _tokens.size();
  return tokens_left;
}

std::string WordCharTokenizer::getNext()
{
  if(_tokens.size()>_current){
    return _tokens[_current++];
  }
  else{
    return std::string();
  }
}

} // namespace fsa