// fastlib/src/vespa/fastlib/text/tests/wordfolderstest.h

// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <vespa/fastlib/testsuite/test.h>
#include <vespa/fastlib/text/normwordfolder.h>

/**
 * Tests for Fast_NormalizeWordFolder: construction after Setup(), tokenizing
 * annotated UTF-8 text into char and ucs4_t buffers, and accent removal.
 *
 * Every private test method returns true on success; Run() feeds each result
 * to _test().
 */
class WordFoldersTest : public Test
{
    /**
     * UTF-8 text shared by both tokenizer tests.
     *
     * The byte triples EF BF B9 / EF BF BA / EF BF BB are the UTF-8 encodings
     * of U+FFF9, U+FFFA and U+FFFB (interlinear annotation anchor, separator
     * and terminator). They are kept in their own literals so that a following
     * hex-digit letter (e.g. the 'c' of "cafe") cannot be absorbed into the
     * escape sequence.
     */
    static const char *AnnotatedTestInput() {
        return "This is a "
            "\xEF\xBF\xB9" "café" "\xEF\xBF\xBA" "cafe" "\xEF\xBF\xBB"
            " superduperextrafeaturecoolandlongplainword fun "
            "\xEF\xBF\xB9" "www" "\xEF\xBF\xBA"
            "world wide web extra long annotation block" "\xEF\xBF\xBB"
            " test\nIt is cool.\n";
    }

    /**
     * Configures the word folder with all normalization flags used by these
     * tests, then checks that an instance can be constructed and destroyed.
     *
     * @return always true; a failure would show up as a crash.
     */
    bool NormalizeWordFolderConstruction() {
        Fast_NormalizeWordFolder::Setup(
                Fast_NormalizeWordFolder::DO_ACCENT_REMOVAL
                | Fast_NormalizeWordFolder::DO_KATAKANA_TO_HIRAGANA
                | Fast_NormalizeWordFolder::DO_SMALL_TO_NORMAL_KANA
                | Fast_NormalizeWordFolder::DO_SHARP_S_SUBSTITUTION
                | Fast_NormalizeWordFolder::DO_LIGATURE_SUBSTITUTION
                | Fast_NormalizeWordFolder::DO_MULTICHAR_EXPANSION);

        {
            // Scoped instance: constructed here, destroyed at the closing brace.
            Fast_NormalizeWordFolder nwf;
        }

        return true;
    }

    /**
     * Tokenizes the annotated input into a 32-byte char buffer and compares
     * every token produced inside the loop with the expected list.
     *
     * @return true if all compared tokens match and no more tokens than
     *         expected were produced.
     */
    bool TokenizeAnnotatedBuffer() {
        Fast_NormalizeWordFolder nwf;
        const char *testinput = AnnotatedTestInput();
        // Expected tokens; the long plain word and the long annotation block
        // are expected in truncated form.
        const char *correct[] = {
            "this", "is", "a",
            "\xEF\xBF\xB9" "café" "\xEF\xBF\xBA" "cafe" "\xEF\xBF\xBB",
            "superduperextrafeaturecool", "fun",
            "\xEF\xBF\xB9" "www" "\xEF\xBF\xBA" "world wide web ex",
            "test", "it", "is", "cool" };
        const size_t expected_count = sizeof(correct) / sizeof(correct[0]);

        const char *teststart = testinput;
        const char *testend = testinput + strlen(testinput);
        char destbuf[32];
        char *destbufend = destbuf + sizeof(destbuf);
        const char *origstart = testinput;
        size_t tokenlen = 0;

        size_t tokencounter = 0;
        bool success = true;
        while ((teststart = nwf.Tokenize(teststart, testend,
                                         destbuf, destbufend,
                                         origstart, tokenlen)) < testend) {
            if (tokencounter >= expected_count) {
                // More tokens than expected: fail rather than read past the
                // end of correct[].
                return false;
            }
            // printf("found: %s, correct: %s\n", destbuf, correct[tokencounter]);
            if (strcmp(destbuf, correct[tokencounter++]) != 0) {
                success = false;
            }
        }

        return success;
    }

    /**
     * Same as TokenizeAnnotatedBuffer(), but tokenizes into a buffer of 32
     * ucs4_t elements and compares with Fast_UnicodeUtil::utf8cmp().
     *
     * @return true if all compared tokens match and no more tokens than
     *         expected were produced.
     */
    bool TokenizeAnnotatedUCS4Buffer() {
        Fast_NormalizeWordFolder nwf;
        const char *testinput = AnnotatedTestInput();
        // Expected tokens; the truncated entries are longer here than in the
        // char-buffer variant.
        const char *correct[] = {
            "this", "is", "a",
            "\xEF\xBF\xB9" "café" "\xEF\xBF\xBA" "cafe" "\xEF\xBF\xBB",
            "superduperextrafeaturecooland", "fun",
            "\xEF\xBF\xB9" "www" "\xEF\xBF\xBA" "world wide web extra lon",
            "test", "it", "is", "cool" };
        const size_t expected_count = sizeof(correct) / sizeof(correct[0]);

        const char *teststart = testinput;
        const char *testend = testinput + strlen(testinput);
        ucs4_t destbuf[32];
        ucs4_t *destbufend = destbuf + sizeof(destbuf) / sizeof(destbuf[0]);
        const char *origstart = testinput;
        size_t tokenlen = 0;

        size_t tokencounter = 0;
        bool success = true;
        while ((teststart = nwf.UCS4Tokenize(teststart, testend,
                                             destbuf, destbufend,
                                             origstart, tokenlen)) < testend) {
            if (tokencounter >= expected_count) {
                // More tokens than expected: fail rather than read past the
                // end of correct[].
                return false;
            }
            if (Fast_UnicodeUtil::utf8cmp(correct[tokencounter++], destbuf) != 0) {
                success = false;
            }
        }

        return success;
    }

    /**
     * Folds a string of Latin-1 characters and compares the result,
     * ignoring ASCII case, with a reference string. Both strings are printed.
     *
     * The Latin-1 bytes are written as hex escapes so the literals do not
     * depend on the encoding of this source file. Each run of escapes is
     * closed before plain text that starts with a hex-digit letter, so the
     * letter cannot be absorbed into the escape.
     *
     * NOTE(review): the byte values were reconstructed from the expected
     * folding (0xA1..0xBF unchanged, 0xC0..0xFE folded, then 'p', 0xFE, '!');
     * confirm against the original test data.
     *
     * @return true if the folded output equals the reference string.
     */
    bool AccentRemovalTest() {
        auto freefunction = [] (char * ptr) { free(ptr); };
        auto input = std::unique_ptr<char, decltype(freefunction)>(
                Fast_UnicodeUtil::strdupLAT1(
                        "\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
                        "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"
                        "\xC0\xC1\xC2\xC3\xC4\xC5\xC6\xC7\xC8\xC9\xCA\xCB\xCC\xCD\xCE\xCF"
                        "\xD0\xD1\xD2\xD3\xD4\xD5\xD6\xD7\xD8\xD9\xDA\xDB\xDC\xDD\xDE\xDF"
                        "\xE0\xE1\xE2\xE3\xE4\xE5\xE6\xE7\xE8\xE9\xEA\xEB\xEC\xED\xEE\xEF"
                        "\xF0\xF1\xF2\xF3\xF4\xF5\xF6\xF7\xF8\xF9\xFA\xFB\xFC\xFD\xFE"
                        "p" "\xFE" "!"),
                freefunction);
        auto yelloutput = std::unique_ptr<char, decltype(freefunction)>(
                Fast_UnicodeUtil::strdupLAT1(
                        "\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC\xAD\xAE\xAF"
                        "\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC\xBD\xBE\xBF"
                        "AAAAAEAAAECEEEEIIIIDNOOOOOE" "\xD7" "OEUUUUEYTHss"
                        "aaaaaeaaaeceeeeiiiidnoooooe" "\xF7" "oeuuuueythpth!"),
                freefunction);
        Fast_NormalizeWordFolder wordfolder;
        int len = wordfolder.FoldedSizeAsUTF8(input.get());
        // One extra byte for the terminator written below.
        auto fastliboutput = std::unique_ptr<char[]>(new char[len + 1]);
        wordfolder.FoldUTF8WordToUTF8Quick(fastliboutput.get(), input.get());
        fastliboutput[len] = '\0';
        printf("\n%s\n", yelloutput.get());
        printf("%s\n", fastliboutput.get());
        return strcasecmp(yelloutput.get(), fastliboutput.get()) == 0;
    }


public:

    /** Runs every test in this suite and reports each result via _test(). */
    void Run() override {
        // do the tests
        _test(NormalizeWordFolderConstruction());
        _test(TokenizeAnnotatedBuffer());
        _test(TokenizeAnnotatedUCS4Buffer());
        _test(AccentRemovalTest());
    }
};

// Application class for this test. Only the Main() override is declared
// here; its definition is not part of this header.
// NOTE(review): presumably Main() runs WordFoldersTest — confirm in the
// accompanying source file.
class WordFoldersTestApp : public FastOS_Application
{
public:
    // Overrides Main() of FastOS_Application.
    int Main() override;
};