// fastlib/src/vespa/fastlib/text/tests/wordfolderstest.h

// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include <memory>
#include <vespa/fastlib/testsuite/test.h>

#include "../wordfolder.h"
#include "../normwordfolder.h"

class WordFoldersTest : public Test
{
    bool NormalizeWordFolderConstruction() {
      Fast_NormalizeWordFolder::Setup(
              Fast_NormalizeWordFolder::DO_ACCENT_REMOVAL
              | Fast_NormalizeWordFolder::DO_KATAKANA_TO_HIRAGANA
              | Fast_NormalizeWordFolder::DO_SMALL_TO_NORMAL_KANA
              | Fast_NormalizeWordFolder::DO_SHARP_S_SUBSTITUTION
              | Fast_NormalizeWordFolder::DO_LIGATURE_SUBSTITUTION
              | Fast_NormalizeWordFolder::DO_MULTICHAR_EXPANSION);

      Fast_NormalizeWordFolder *nwf = new Fast_NormalizeWordFolder();
      delete nwf;

    return true;
  }

  bool TokenizeAnnotatedBuffer() {
    Fast_NormalizeWordFolder *nwf = new Fast_NormalizeWordFolder();
    const char *testinput = "This is a "
      "\xEF\xBF\xB9" "café" "\xEF\xBF\xBA" "cafe" "\xEF\xBF\xBB"
      " superduperextrafeaturecoolandlongplainword fun "
      "\xEF\xBF\xB9" "www" "\xEF\xBF\xBA"
      "world wide web extra long annotation block" "\xEF\xBF\xBB"
      " test\nIt is cool.\n";
    const char *correct[] = {
      "this", "is", "a",
      "\xEF\xBF\xB9" "café" "\xEF\xBF\xBA" "cafe" "\xEF\xBF\xBB",
      "superduperextrafeaturecool", "fun",
      "\xEF\xBF\xB9" "www" "\xEF\xBF\xBA" "world wide web ex",
      "test", "it", "is", "cool" };
    const char *teststart = testinput;
    const char *testend = testinput + strlen(testinput);
    char destbuf[32];
    char *destbufend = destbuf + 32;
    const char *origstart = testinput;
    size_t tokenlen = 0;

    int tokencounter = 0;
    bool success = true;
    while (
	   (teststart
	    = nwf->Tokenize(teststart, testend,
			    destbuf, destbufend,
			    origstart, tokenlen)) < testend) {
      // printf("found: %s, correct: %s\n", destbuf, correct[tokencounter]);
      success &= strcmp(destbuf, correct[tokencounter++]) == 0;
    }

    delete nwf;

    return success;
  }

  bool TokenizeAnnotatedUCS4Buffer() {
    Fast_NormalizeWordFolder *nwf = new Fast_NormalizeWordFolder();
    const char *testinput = "This is a "
      "\xEF\xBF\xB9" "café" "\xEF\xBF\xBA" "cafe" "\xEF\xBF\xBB"
      " superduperextrafeaturecoolandlongplainword fun "
      "\xEF\xBF\xB9" "www" "\xEF\xBF\xBA"
      "world wide web extra long annotation block" "\xEF\xBF\xBB"
      " test\nIt is cool.\n";
    const char *correct[] = {
      "this", "is", "a",
      "\xEF\xBF\xB9" "café" "\xEF\xBF\xBA" "cafe" "\xEF\xBF\xBB",
      "superduperextrafeaturecooland", "fun",
      "\xEF\xBF\xB9" "www" "\xEF\xBF\xBA" "world wide web extra lon",
      "test", "it", "is", "cool" };

    const char *teststart = testinput;
    const char *testend = testinput + strlen(testinput);
    ucs4_t destbuf[32];
    ucs4_t *destbufend = destbuf + 32;

    const char *origstart = testinput;
    size_t tokenlen = 0;

    int tokencounter = 0;
    bool success = true;
    while (
	   (teststart
	    = nwf->UCS4Tokenize(teststart, testend,
			    destbuf, destbufend,
			    origstart, tokenlen)) < testend) {
      success &= Fast_UnicodeUtil::utf8cmp(correct[tokencounter++], destbuf) == 0;
    }

    delete nwf;

    return success;
  }

   bool AccentRemovalTest() {
       auto freefunction = [] (char * ptr) { free(ptr); };
       auto input = std::unique_ptr<char, decltype(freefunction)>(Fast_UnicodeUtil::strdupLAT1("����������������������������������������������������������������������������������������������p�!"),
                                                                  freefunction);
       auto yelloutput = std::unique_ptr<char, decltype(freefunction)>(Fast_UnicodeUtil::strdupLAT1("�������������������������������AAAAAEAAAECEEEEIIIIDNOOOOOE�OEUUUUEYTHssaaaaaeaaaeceeeeiiiidnoooooe�oeuuuueythpth!"),
                                               freefunction);
       Fast_NormalizeWordFolder wordfolder;
       int len = wordfolder.FoldedSizeAsUTF8(input.get());
       auto fastliboutput = std::unique_ptr<char[]>(new char[len + 1]);
       wordfolder.FoldUTF8WordToUTF8Quick(fastliboutput.get(), input.get());
       fastliboutput[len] = '\0';
       printf("\n%s\n", yelloutput.get());
       printf("%s\n", fastliboutput.get());
       return strcasecmp(yelloutput.get(), fastliboutput.get()) == 0;
   }


public:
  virtual void Run() override {
    // do the tests
    _test(NormalizeWordFolderConstruction());
    _test(TokenizeAnnotatedBuffer());
    _test(TokenizeAnnotatedUCS4Buffer());
    _test(AccentRemovalTest());
  }
};

class WordFoldersTestApp : public FastOS_Application
{
public:
  virtual int Main() override;
};