aboutsummaryrefslogtreecommitdiffstats
path: root/document/src/tests/stringtokenizertest.cpp
blob: db98a3f5f5532d9819a2b260ddfe09462bf23a49 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#include <iostream>
#include <set>
#include <sstream>
#include <vespa/vespalib/text/stringtokenizer.h>
#include <gtest/gtest.h>

using vespalib::StringTokenizer;
using std::string;

// Exercises StringTokenizer on five inputs: default arguments, explicit
// separator/strip sets, a whitespace-only string, an empty string, and a
// string containing bytes with the high bit set. Each non-trivial case checks
// the token sequence, the number of distinct tokens, and the token count
// after removeEmptyTokens().
TEST(StringTokenizerTest, testSimpleUsage)
{
    {
        // Default arguments. The expectations show ',' splitting the input,
        // whitespace around each token being stripped, and whitespace inside
        // a token ("sepa rated") being kept.
        const string input("This,is ,a,,list ,\tof,,sepa rated\n, \rtokens,");
        StringTokenizer splitter(input);
        const StringTokenizer::TokenList expected{
            "This", "is", "a", "", "list", "of", "", "sepa rated", "tokens", ""
        };

        ASSERT_EQ(expected.size(), static_cast<size_t>(splitter.size()));
        for (size_t idx = 0; idx < expected.size(); ++idx) {
            EXPECT_EQ(expected[idx], splitter[idx]);
        }
        // The three empty tokens collapse to one entry in the set.
        std::set<string> distinct(splitter.begin(), splitter.end());
        EXPECT_EQ(size_t{8}, distinct.size());

        splitter.removeEmptyTokens();
        EXPECT_EQ(7u, splitter.size());
    }
    {
        // Whitespace as separators, punctuation as the strip set.
        const string input("\tAnother list with some \ntokens, and stuff.");
        StringTokenizer splitter(input, " \t\n", ",.");
        const StringTokenizer::TokenList expected{
            "", "Another", "list", "with", "some", "", "tokens", "and", "stuff"
        };

        ASSERT_EQ(expected.size(), static_cast<size_t>(splitter.size()));
        for (size_t idx = 0; idx < expected.size(); ++idx) {
            EXPECT_EQ(expected[idx], splitter[idx]);
        }
        // The two empty tokens collapse to one entry in the set.
        std::set<string> distinct(splitter.begin(), splitter.end());
        EXPECT_EQ(size_t{8}, distinct.size());

        splitter.removeEmptyTokens();
        EXPECT_EQ(7u, splitter.size());
    }
    {
        // A lone space is expected to produce no tokens at all.
        const string blank(" ");
        StringTokenizer splitter(blank);
        EXPECT_EQ(0u, splitter.size());
    }

    {
        // The empty string is expected to produce no tokens either.
        const string nothing("");
        StringTokenizer splitter(nothing);
        EXPECT_EQ(0u, splitter.size());
    }
    {
        // Bytes such as 0xff and 0xfe are negative where plain char is
        // signed; check that they still work as separator and strip
        // characters.
        const string input("Here\x01\xff be\xff\xfe dragons\xff");
        StringTokenizer splitter(input, "\xff", "\x01 \xfe");
        const StringTokenizer::TokenList expected{
            "Here", "be", "dragons", ""
        };

        ASSERT_EQ(expected.size(), static_cast<size_t>(splitter.size()));
        for (size_t idx = 0; idx < expected.size(); ++idx) {
            EXPECT_EQ(expected[idx], splitter[idx]);
        }
        std::set<string> distinct(splitter.begin(), splitter.end());
        EXPECT_EQ(size_t{4}, distinct.size());

        splitter.removeEmptyTokens();
        EXPECT_EQ(3u, splitter.size());
    }
}