1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
|
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <iostream>
#include <set>
#include <sstream>
#include <vespa/vespalib/text/stringtokenizer.h>
#include <gtest/gtest.h>
using vespalib::StringTokenizer;
using std::string;
// Exercises StringTokenizer on a handful of representative inputs. Each
// scenario checks the produced tokens in order, the number of distinct tokens,
// and (where relevant) the token count once removeEmptyTokens() has been called.
TEST(StringTokenizerTest, testSimpleUsage)
{
    // Scenario 1: tokenizer constructed with only the source string. The
    // expected list shows comma-separated fields with surrounding whitespace
    // dropped, and empty fields kept as empty tokens.
    {
        std::string input = "This,is ,a,,list ,\tof,,sepa rated\n, \rtokens,";
        StringTokenizer tokenizer(input);

        StringTokenizer::TokenList expected;
        expected.push_back("This");
        expected.push_back("is");
        expected.push_back("a");
        expected.push_back("");
        expected.push_back("list");
        expected.push_back("of");
        expected.push_back("");
        expected.push_back("sepa rated");
        expected.push_back("tokens");
        expected.push_back("");

        // Fatal on mismatch so the indexed comparison below stays in range.
        ASSERT_EQ(expected.size(), static_cast<size_t>(tokenizer.size()));
        for (std::size_t idx = 0; idx < expected.size(); ++idx) {
            EXPECT_EQ(expected[idx], tokenizer[idx]);
        }

        // The three empty tokens collapse into one entry in the set.
        std::set<std::string> distinct(tokenizer.begin(), tokenizer.end());
        EXPECT_EQ(static_cast<size_t>(8u), distinct.size());

        tokenizer.removeEmptyTokens();
        EXPECT_EQ(7u, tokenizer.size());
    }

    // Scenario 2: explicit second and third constructor arguments. Judging by
    // the expected list, " \t\n" acts as the separator set and ",." is
    // trimmed from the tokens.
    {
        std::string input = "\tAnother list with some \ntokens, and stuff.";
        StringTokenizer tokenizer(input, " \t\n", ",.");

        StringTokenizer::TokenList expected;
        expected.push_back("");
        expected.push_back("Another");
        expected.push_back("list");
        expected.push_back("with");
        expected.push_back("some");
        expected.push_back("");
        expected.push_back("tokens");
        expected.push_back("and");
        expected.push_back("stuff");

        ASSERT_EQ(expected.size(), static_cast<size_t>(tokenizer.size()));
        for (std::size_t idx = 0; idx < expected.size(); ++idx) {
            EXPECT_EQ(expected[idx], tokenizer[idx]);
        }

        // The two empty tokens collapse into one entry in the set.
        std::set<std::string> distinct(tokenizer.begin(), tokenizer.end());
        EXPECT_EQ(static_cast<size_t>(8u), distinct.size());

        tokenizer.removeEmptyTokens();
        EXPECT_EQ(7u, tokenizer.size());
    }

    // Scenario 3: a single space is expected to produce no tokens at all.
    {
        std::string input = " ";
        StringTokenizer tokenizer(input);
        EXPECT_EQ(0u, tokenizer.size());
    }

    // Scenario 4: the empty string is expected to produce no tokens either.
    {
        std::string input = "";
        StringTokenizer tokenizer(input);
        EXPECT_EQ(0u, tokenizer.size());
    }

    // Scenario 5: bytes >= 0x80 (negative where plain char is signed) are
    // used in the source and in both character-set arguments, to confirm that
    // such characters do not cause problems.
    {
        std::string input = "Here\x01\xff be\xff\xfe dragons\xff";
        StringTokenizer tokenizer(input, "\xff", "\x01 \xfe");

        StringTokenizer::TokenList expected;
        expected.push_back("Here");
        expected.push_back("be");
        expected.push_back("dragons");
        expected.push_back("");

        ASSERT_EQ(expected.size(), static_cast<size_t>(tokenizer.size()));
        for (std::size_t idx = 0; idx < expected.size(); ++idx) {
            EXPECT_EQ(expected[idx], tokenizer[idx]);
        }

        std::set<std::string> distinct(tokenizer.begin(), tokenizer.end());
        EXPECT_EQ(static_cast<size_t>(4u), distinct.size());

        tokenizer.removeEmptyTokens();
        EXPECT_EQ(3u, tokenizer.size());
    }
}
|