1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
|
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "wordchartokenizer.h"
#include "unicode.h"
#include <string.h>
namespace fsa {
// Lookup table with one entry per ASCII code point (0-127). A non-zero entry
// marks the character as punctuation for the tokenizer. Each row below covers
// 16 consecutive code points; the trailing comment lists the flagged ones.
const bool WordCharTokenizer::_punctuation_table[] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00-0x0F: none
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10-0x1F: none
0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, // 0x20-0x2F: ! # $ % ( ) * , .
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, // 0x30-0x3F: : ; < = > ?
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40-0x4F: @
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 0x50-0x5F: [ (backslash) ] ^
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60-0x6F: none
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, // 0x70-0x7F: { | } ~
};
// Tokenize `text` and store the result in _tokens, resetting the read
// position to the first token.
//
// The text is split into runs of "word" characters. In
// PUNCTUATION_WHITESPACEONLY mode every character for which
// Unicode::isSpaceChar() is false counts as a word character; in all other
// modes Unicode::isWordChar() decides. Unless the mode is PUNCTUATION_DISCARD
// or PUNCTUATION_WHITESPACEONLY, a single _punctuation_token is inserted
// between two words that are separated by at least one ASCII character flagged
// in _punctuation_table. Punctuation before the first word and after the last
// word never yields a token.
//
// @param text  input string; only the part up to the first NUL byte is read.
// @return always true.
bool WordCharTokenizer::init(const std::string &text)
{
  _tokens.clear();
  _current = 0;

  // Work on a private writable copy so that words can be NUL-terminated in
  // place. The copy is released with free() at the end of the function.
  char *buffer;
  if (_lowercase) {
    buffer = Unicode::strlowdupUTF8(text.c_str());
  } else {
    buffer = Unicode::strdupUTF8(text.c_str());
  }

  const bool whitespaceOnly = (_punctuation == PUNCTUATION_WHITESPACEONLY);
  const bool markPunctuation = (_punctuation != PUNCTUATION_DISCARD) && !whitespaceOnly;

  char *cursor = buffer;         // next byte to decode
  char *wordBegin;               // first byte of the current word
  char *wordEnd;                 // first byte after the current word
  ucs4_t ch = 0;                 // most recently decoded character
  bool wordSeen = false;         // at least one word has been stored
  bool trailingPunct = false;    // the last stored token is _punctuation_token

  while (*cursor != '\0') {
    // Phase 1: skip separators. On exit wordBegin points at the first word
    // character (already consumed from cursor) or at the terminating NUL.
    for (;;) {
      wordBegin = cursor;
      if (*cursor == '\0') {
        break;
      }
      ch = Unicode::getUTF8Char(cursor);
      const bool isSeparator = whitespaceOnly ? Unicode::isSpaceChar(ch)
                                              : !Unicode::isWordChar(ch);
      if (!isSeparator) {
        break;
      }
      // Emit at most one punctuation token per gap, and only after a word.
      if (markPunctuation && ch < 128 && _punctuation_table[ch] &&
          wordSeen && !trailingPunct) {
        _tokens.push_back(_punctuation_token);
        trailingPunct = true;
      }
    }

    // Phase 2: consume the rest of the word. On exit wordEnd points at the
    // character that ended the word (already consumed from cursor) or at the
    // terminating NUL; ch holds the last character that was decoded.
    for (;;) {
      wordEnd = cursor;
      if (*cursor == '\0') {
        break;
      }
      ch = Unicode::getUTF8Char(cursor);
      const bool inWord = whitespaceOnly ? !Unicode::isSpaceChar(ch)
                                         : Unicode::isWordChar(ch);
      if (!inWord) {
        break;
      }
    }

    // Cut the word out of the buffer by overwriting its terminator.
    if (*wordEnd != '\0') {
      *wordEnd = '\0';
    }

    // Nothing but separators were left in the buffer.
    if (*wordBegin == '\0') {
      continue;
    }

    _tokens.push_back(std::string(wordBegin));
    trailingPunct = false;
    wordSeen = true;

    // The character that ended the word has already been consumed, so it has
    // to be checked for punctuation here. Outside PUNCTUATION_FULL mode, a '.'
    // directly after a one-byte word is not reported as punctuation.
    if (markPunctuation && ch < 128 && _punctuation_table[ch] &&
        (_punctuation == PUNCTUATION_FULL || ch != '.' || strlen(wordBegin) > 1)) {
      _tokens.push_back(_punctuation_token);
      trailingPunct = true;
    }
  }

  // The last token is a punctuation token, drop it.
  if (trailingPunct) {
    _tokens.pop_back();
  }

  free(buffer);
  return true;
}
// Tell whether there is still a token that has not been handed out.
//
// @return true if the read position is before the end of the token list.
bool WordCharTokenizer::hasMore()
{
  return _current < _tokens.size();
}
// Hand out the token at the read position and advance to the next one.
//
// @return the next token, or an empty string when all tokens have been
//         consumed (the read position is left unchanged in that case).
std::string WordCharTokenizer::getNext()
{
  if (_current >= _tokens.size()) {
    return std::string();
  }
  return _tokens[_current++];
}
} // namespace fsa
|