vespalib/src/tests/fastlib/text/wordfolderstest.cpp


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46

// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#include <vespa/fastlib/text/normwordfolder.h>
#include <vespa/vespalib/testkit/test_kit.h>

// Calls the static Fast_NormalizeWordFolder::Setup() with all six
// normalization flags listed below combined into a single mask.
TEST("NormalizeWordFolderConstruction") {
    // Unscoped flag constants are OR-ed together exactly as in a direct call;
    // the mask is only given a name to keep the Setup() call short.
    const auto all_flags =
            Fast_NormalizeWordFolder::DO_ACCENT_REMOVAL |
            Fast_NormalizeWordFolder::DO_KATAKANA_TO_HIRAGANA |
            Fast_NormalizeWordFolder::DO_SMALL_TO_NORMAL_KANA |
            Fast_NormalizeWordFolder::DO_SHARP_S_SUBSTITUTION |
            Fast_NormalizeWordFolder::DO_LIGATURE_SUBSTITUTION |
            Fast_NormalizeWordFolder::DO_MULTICHAR_EXPANSION;
    Fast_NormalizeWordFolder::Setup(all_flags);
}

// Runs Fast_NormalizeWordFolder::UCS4Tokenize() over a UTF-8 text that
// contains two interlinear annotation blocks (U+FFF9 anchor, U+FFFA
// separator, U+FFFB terminator, written here as raw UTF-8 byte escapes) and
// compares every produced token with the expected sequence in correct[].
//
// The expected list contains shortened forms of the long plain word and of
// the long annotation block; NOTE(review): presumably this is the effect of
// the 32-element destination buffer - confirm against UCS4Tokenize().
TEST("TokenizeAnnotatedUCS4Buffer") {
    auto nwf = std::make_unique<Fast_NormalizeWordFolder>();
    const char *testinput = "This is a "
                            "\xEF\xBF\xB9" "café" "\xEF\xBF\xBA" "cafe" "\xEF\xBF\xBB"
                            " superduperextrafeaturecoolandlongplainword fun "
                            "\xEF\xBF\xB9" "www" "\xEF\xBF\xBA"
                            "world wide web extra long annotation block" "\xEF\xBF\xBB"
                            " test\nIt is cool.\n";
    const char *correct[] = {
            "this", "is", "a",
            "\xEF\xBF\xB9" "café" "\xEF\xBF\xBA" "cafe" "\xEF\xBF\xBB",
            "superduperextrafeaturecooland", "fun",
            "\xEF\xBF\xB9" "www" "\xEF\xBF\xBA" "world wide web extra lon",
            "test", "it", "is", "cool" };
    const size_t expected_count = sizeof(correct) / sizeof(correct[0]);

    const char *teststart = testinput;
    const char *testend = testinput + strlen(testinput);
    ucs4_t destbuf[32];
    ucs4_t *destbufend = destbuf + 32;

    // Output arguments of UCS4Tokenize(); their values are not inspected here.
    const char *origstart = testinput;
    size_t tokenlen = 0;

    size_t tokencounter = 0;
    // A token is only checked when the tokenizer stops before the end of the
    // input; the input ends in ".\n" so the last word is still covered.
    while ((teststart = nwf->UCS4Tokenize(teststart, testend, destbuf, destbufend, origstart, tokenlen)) < testend) {
        // Abort instead of reading past correct[] if the tokenizer yields
        // more tokens than expected.
        ASSERT_TRUE(tokencounter < expected_count);
        EXPECT_EQUAL(0, Fast_UnicodeUtil::utf8cmp(correct[tokencounter++], destbuf));
    }
    // Without this check the test would pass even if no token was produced.
    EXPECT_EQUAL(expected_count, tokencounter);
}

// Test program entry point: runs every TEST registered in this file.
TEST_MAIN() {
    TEST_RUN_ALL();
}