Publish

author: Jon Bratseth <bratseth@yahoo-inc.com> 2016-06-15 23:09:44 +0200
committer: Jon Bratseth <bratseth@yahoo-inc.com> 2016-06-15 23:09:44 +0200
commit: 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree: 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /juniper/src/test/auxTest.cpp
1 files changed, 947 insertions, 0 deletions
diff --git a/juniper/src/test/auxTest.cpp b/juniper/src/test/auxTest.cpp
new file mode 100644
index 00000000000..7c53b2a7999
--- /dev/null
+++ b/juniper/src/test/auxTest.cpp
@@ -0,0 +1,947 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/fastos/fastos.h>
+#include <vespa/log/log.h>
+LOG_SETUP("");
+
+#include "auxTest.h"
+
+// Using separator definitions only from here:
+
+// ANSI SGR escape sequences (bold red / reset) used as hit markers when
+// color_highlight is set. Spelled with the standard octal escape \033 (ESC)
+// instead of \e, which is a compiler extension and not part of standard C++.
+#define COLOR_HIGH_ON "\033[1;31m"
+#define COLOR_HIGH_OFF "\033[0m"
+
+#ifndef FASTOS_DEBUG
+// Only defined here when FASTOS_DEBUG is not set; the tests print extra
+// diagnostics when this is greater than zero.
+static int debug_level = 0;
+#endif
+
+// Selects the terminal color markers above instead of <hit>/</hit> tags.
+bool color_highlight = false;
+// Makes test_summary print each generated summary (when debug_level > 0).
+bool verbose = false;
+// Connector characters handed to CreateSummaryConfig by the summary tests.
+const unsigned char* connectors = reinterpret_cast<const unsigned char*>("-'");
+
+using juniper::SpecialTokenRegistry;
+
+// Creates the "Auxiliary" suite with an empty method table and no summary
+// configuration, then registers the test methods through init().
+AuxTest::AuxTest()
+    : Test("Auxiliary"),
+      test_methods_(),
+      _sumconf(nullptr)
+{
+    init();
+}
+
+// Hands the suite's summary configuration to DeleteSummaryConfig on teardown.
+AuxTest::~AuxTest()
+{
+    DeleteSummaryConfig(this->_sumconf);
+}
+
+
+// Fills the method table: each test is registered under the name that
+// Run(const char*) looks up (e.g. via the -m command line option).
+void AuxTest::init()
+{
+    auto& methods = test_methods_;
+
+    methods["TestExample"]              = &AuxTest::TestExample;
+    methods["TestPropertyMap"]          = &AuxTest::TestPropertyMap;
+    methods["TestRerase"]               = &AuxTest::TestRerase;
+    methods["TestUTF811"]               = &AuxTest::TestUTF811;
+    methods["TestUTF812"]               = &AuxTest::TestUTF812;
+    methods["TestDoubleWidth"]          = &AuxTest::TestDoubleWidth;
+    methods["TestPartialUTF8"]          = &AuxTest::TestPartialUTF8;
+    methods["TestLargeBlockChinese"]    = &AuxTest::TestLargeBlockChinese;
+    methods["TestUTF8context"]          = &AuxTest::TestUTF8context;
+    methods["TestJapanese"]             = &AuxTest::TestJapanese;
+    methods["TestStartHits"]            = &AuxTest::TestStartHits;
+    methods["TestEndHit"]               = &AuxTest::TestEndHit;
+    methods["TestJuniperStack"]         = &AuxTest::TestJuniperStack;
+    methods["TestSpecialTokenRegistry"] = &AuxTest::TestSpecialTokenRegistry;
+    methods["TestWhiteSpacePreserved"]  = &AuxTest::TestWhiteSpacePreserved;
+}
+
+
+// needed closures
+
+// Parameterless entry for the method table: TestUTF8 on the first 11 bytes.
+void AuxTest::TestUTF811()
+{
+    const unsigned int size = 11;
+    TestUTF8(size);
+}
+
+// Parameterless entry for the method table: TestUTF8 on the first 12 bytes.
+void AuxTest::TestUTF812()
+{
+    const unsigned int size = 12;
+    TestUTF8(size);
+}
+
+
+// Counts malformed UTF-8 sequences in the first len bytes of data.
+//
+// A sequence is counted as broken when
+//  - a continuation byte (10xxxxxx) shows up where none is expected,
+//  - a single-byte character or a new lead byte arrives while continuation
+//    bytes are still outstanding,
+//  - a lead byte announces more than 5 continuation bytes, or
+//  - the buffer ends in the middle of a multi-byte sequence.
+//
+// Only the byte structure is inspected; code point values are not validated.
+// Returns the number of broken sequences found (0 for an empty buffer).
+int
+countBrokenUTF8(const char *data, uint32_t len)
+{
+    int broken = 0;
+    int remain = 0; // continuation bytes still expected for the current sequence
+
+    for (uint32_t i = 0; i < len; ++i) {
+        unsigned char val = data[i];
+        switch (val & 0xc0) {
+        case 0xc0: // first char
+            if (remain > 0) {
+                // the previous sequence was cut short by this new lead byte
+                ++broken;
+            }
+            // one continuation byte per leading 1-bit after the first two
+            remain = 1;
+            val <<= 2;
+            while ((val & 0x80) != 0) {
+                ++remain;
+                val <<= 1;
+            }
+            if (remain > 5) {
+                ++broken;
+                remain = 0;
+            }
+            break;
+        case 0x80: // continuation char
+            if (remain == 0) {
+                ++broken;
+            } else {
+                --remain;
+            }
+            break;
+        default: // single char
+            if (remain > 0) {
+                ++broken;
+                remain = 0;
+            }
+            break;
+        }
+    }
+    if (remain > 0) {
+        // the buffer ended in the middle of a multi-byte sequence
+        ++broken;
+    }
+    return broken;
+}
+
+// Analyses a text that holds the full-width (double width) form of "sony"
+// between unit separators (0x1f), querying for those same four characters.
+// Only the existence of an analysis result is verified; the teaser is
+// fetched, but the check on its length is still disabled.
+void
+AuxTest::TestDoubleWidth()
+{
+    // '[' 0x1f, four 3-byte characters, 0x1f ']' and the terminating NUL: 17 bytes.
+    // NOTE(review): the length given to Analyse below includes that NUL - confirm this is intended.
+    char input[17] = "[\x1f\xef\xbd\x93\xef\xbd\x8f\xef\xbd\x8e\xef\xbd\x99\x1f]";
+
+    // no fallback, should get match
+    juniper::PropertyMap props;
+    props.set("juniper.dynsum.escape_markup", "off");
+    props.set("juniper.dynsum.highlight_off", "</hi>");
+    props.set("juniper.dynsum.continuation", "<sep />");
+    props.set("juniper.dynsum.highlight_on", "<hi>");
+
+    Fast_NormalizeWordFolder folder;
+    juniper::Juniper jun(&props, &folder);
+    juniper::Config config("best", jun);
+
+    juniper::QueryParser query("\xef\xbd\x93\xef\xbd\x8f\xef\xbd\x8e\xef\xbd\x99");
+    juniper::QueryHandle handle(query, NULL, jun.getModifier());
+
+    juniper::Result* res = juniper::Analyse(&config, &handle, input, 17, 0, 0, 0);
+    _test(res != NULL);
+
+    juniper::Summary* sum = juniper::GetTeaser(res, NULL);
+    (void) sum;
+    // this should work
+    // _test(sum->Length() != 0);
+    juniper::ReleaseResult(res);
+}
+
+
+
+// Builds a teaser for the query "ipod" over the contents of
+// partialutf8.input.utf8 (prefix fallback, no markup) and checks that the
+// teaser is non-empty and contains no broken UTF-8 sequences.
+void
+AuxTest::TestPartialUTF8()
+{
+    const int inputSize = 5769; // NB: update this if input is changed
+    char input[inputSize];
+
+    // Load the whole input file and make sure it is well formed to begin with
+    FastOS_File file("partialutf8.input.utf8");
+    _test(file.OpenReadOnly());
+    _test(file.GetSize() == inputSize);
+    _test(file.Read(input, inputSize));
+    _test(countBrokenUTF8(input, inputSize) == 0);
+    file.Close();
+
+    // config taken from vespa test case
+    juniper::PropertyMap props;
+    props.set("juniper.dynsum.escape_markup", "off");
+    props.set("juniper.dynsum.highlight_off", "");
+    props.set("juniper.dynsum.continuation", "");
+    props.set("juniper.dynsum.fallback", "prefix");
+    props.set("juniper.dynsum.highlight_on", "");
+
+    Fast_NormalizeWordFolder folder;
+    juniper::Juniper jun(&props, &folder);
+    juniper::Config config("best", jun);
+
+    juniper::QueryParser query("ipod");
+    juniper::QueryHandle handle(query, NULL, jun.getModifier());
+
+    juniper::Result* res = juniper::Analyse(&config, &handle, input, inputSize, 0, 0, 0);
+    _test(res != NULL);
+
+    juniper::Summary* sum = juniper::GetTeaser(res, NULL);
+    _test(sum->Length() != 0);
+
+    // check for partial/broken utf-8
+    _test(countBrokenUTF8(sum->Text(), sum->Length()) == 0);
+
+    juniper::ReleaseResult(res);
+}
+
+// Builds a short teaser (length 50, min_length 20, prefix fallback) for a
+// Chinese query over largeblockchinese.input.utf8 and checks that the teaser
+// is non-empty, stays below 100 bytes and contains no broken UTF-8.
+void AuxTest::TestLargeBlockChinese()
+{
+    const int inputSize = 10410; // NB: update this if input is changed
+    char input[inputSize];
+
+    // Load the whole input file and make sure it is well formed to begin with
+    FastOS_File file("largeblockchinese.input.utf8");
+    _test(file.OpenReadOnly());
+    _test(file.GetSize() == inputSize);
+    _test(file.Read(input, inputSize));
+    _test(countBrokenUTF8(input, inputSize) == 0);
+    file.Close();
+
+    // config taken from reported bug
+    juniper::PropertyMap props;
+    props.set("juniper.dynsum.length", "50");
+    props.set("juniper.dynsum.min_length", "20");
+    props.set("juniper.dynsum.escape_markup", "off");
+    props.set("juniper.dynsum.highlight_off", "");
+    props.set("juniper.dynsum.continuation", "");
+    props.set("juniper.dynsum.fallback", "prefix");
+    props.set("juniper.dynsum.highlight_on", "");
+
+    Fast_NormalizeWordFolder folder;
+    juniper::Juniper jun(&props, &folder);
+    juniper::Config config("best", jun);
+
+    juniper::QueryParser query("希望");
+    juniper::QueryHandle handle(query, NULL, jun.getModifier());
+
+    juniper::Result* res = juniper::Analyse(&config, &handle, input, inputSize, 0, 0, 0);
+    _test(res != NULL);
+
+    juniper::Summary* sum = juniper::GetTeaser(res, NULL);
+    _test(sum->Length() != 0);
+
+    // check that the entire block of chinese data is not returned in the summary
+    _test(sum->Length() < 100);
+
+    // check for partial/broken utf-8
+    _test(countBrokenUTF8(sum->Text(), sum->Length()) == 0);
+
+    juniper::ReleaseResult(res);
+}
+
+// Minimal end-to-end example: analyses a short English text against
+// AND(consume,sleep,tree) and checks the match counters of the first term.
+void AuxTest::TestExample()
+{
+    juniper::QueryParser query("AND(consume,sleep,tree)");
+    juniper::QueryHandle handle(query, NULL, juniper::_Juniper->getModifier());
+
+    // some content
+    const char* text = "the monkey consumes bananas and sleeps afterwards."
+                       "&%#%&! cries the sleepy monkey and jumps down from the tree."
+                       "the last token here is split across lines consumed";
+    const int text_len = strlen(text);
+
+    juniper::Result* res = juniper::Analyse(juniper::TestConfig, &handle, text, text_len, 0, 0, 0);
+    _test(res != NULL);
+
+    res->Scan();
+    Matcher& m = *res->_matcher;
+    // "consume" is expected to match twice, never as an exact match
+    _test(m.TotalMatchCnt(0) == 2 && m.ExactMatchCnt(0) == 0);
+
+    juniper::ReleaseResult(res);
+}
+
+
+// Checks PropertyMap through the IJuniperProperties interface: unknown keys
+// give NULL or the supplied default, known keys give the stored value.
+void
+AuxTest::TestPropertyMap()
+{
+    juniper::PropertyMap map;
+    map.set("foo", "bar");
+    map.set("one", "two");
+
+    IJuniperProperties *props = &map;
+
+    // unknown key, without and with a default
+    _test(props->GetProperty("bogus") == NULL);
+    _test(strcmp(props->GetProperty("bogus", "default"), "default") == 0);
+
+    // known keys, without and with a default
+    _test(strcmp(props->GetProperty("foo"), "bar") == 0);
+    _test(strcmp(props->GetProperty("one", "default"), "two") == 0);
+}
+
+
+// Demonstrates erasing from a std::list while walking it backwards with a
+// reverse iterator: 5 and 6 are removed from 0..9, the rest must remain.
+void AuxTest::TestRerase()
+{
+    typedef std::list<int> IntList;
+    IntList ls;
+
+    int next = 0;
+    while (next < 10) {
+        ls.push_back(next);
+        ++next;
+    }
+
+    IntList::reverse_iterator rit = ls.rbegin();
+    while (rit != ls.rend()) {
+        const bool doomed = (*rit == 5 || *rit == 6);
+        if (!doomed) {
+            ++rit;
+            continue;
+        }
+        // (++rit).base() is the forward iterator of the element rit pointed at;
+        // erase() returns its successor, which is wrapped back into a reverse iterator.
+        rit = IntList::reverse_iterator(ls.erase((++rit).base()));
+    }
+
+    std::string s;
+    for (IntList::iterator it = ls.begin(); it != ls.end(); ++it) {
+        s += ('0' + *it);
+    }
+    _test(s == std::string("01234789"));
+}
+
+// Debug dump with positions for reference.
+//
+// Prints the first len bytes of s to stdout in rows that end at every byte
+// position that is a multiple of 100. Bytes with the high bit set (part of a
+// multi-byte character) are printed as a placeholder. Each row is followed
+// by a ruler line showing the byte positions in steps of 10.
+void test_dump(const char* s, unsigned int len)
+{
+    printf("test_dump: length %u\n", len);
+    for (unsigned int i = 0; i < len;)
+    {
+        unsigned int start = i;
+        for (; i < len;)
+        {
+            // Test the high bit explicitly: plain char may be unsigned, in
+            // which case a comparison like s[i] < 0 would never be true.
+            if ((static_cast<unsigned char>(s[i]) & 0x80) != 0) {
+                printf("�");
+            } else {
+                printf("%c", s[i]);
+            }
+            i++;
+            if (!(i % 100)) break;
+        }
+        printf("\n");
+        // Ruler: one right-aligned position label per 10 bytes of this row
+        i = start + 10;
+        for (; i < len && i % 100; i+= 10)
+            printf("%7s%3u", "", i);
+        printf("\n");
+    }
+}
+
+
+// Exercises Fast_UnicodeUtil::UTF8move on the first `size` bytes of a
+// Norwegian UTF-8 text, from every byte offset 0..size+1, in three modes:
+// one character forward (+1), one character backward (-1) and move to the
+// start of the current character (0). The expected values below encode
+// where the two-byte characters sit in the text.
+// Afterwards checks that IsWordChar agrees with isalnum for all ASCII values.
+void AuxTest::TestUTF8(unsigned int size)
+{
+    const char* text = u8"\u00e5pent s\u00f8k\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5\u00e6\u00f8\u00e5";
+    const int limit = (int)size;
+    const unsigned char* p = (const unsigned char*)text;
+
+    int moved = 0;
+    for (int i = 0; i < limit + 2; i++)
+    {
+        // Forward tests:
+        p = (const unsigned char*)(text + i);
+        moved = Fast_UnicodeUtil::UTF8move((const unsigned char*)text, size, p, +1);
+        LOG(spam, "forw. moved %d, pos %d", moved, i);
+        if (i == 0 || i == 8) {
+            _test(moved == 2);
+        } else if (i >= limit) {
+            _test(moved == -1);
+        } else {
+            _test(moved == 1);
+        }
+
+        // backward tests
+        p = (const unsigned char*)(text + i);
+        moved = Fast_UnicodeUtil::UTF8move((const unsigned char*)text, size, p, -1);
+        LOG(spam, "backw.moved %d, pos %d", moved, i);
+        if (i == 10 || i == 9 || i == 2) {
+            _test(moved == 2);
+        } else if (i == 0 || i > limit) {
+            _test(moved == -1);
+        } else {
+            _test(moved == 1);
+        }
+
+        // move-to-start tests:
+        p = (const unsigned char*)(text + i);
+        moved = Fast_UnicodeUtil::UTF8move((const unsigned char*)text, size, p, 0);
+        LOG(spam, "to-start.moved %d, pos %d", moved, i);
+        if (i == 9 || i == 1) {
+            _test(moved == 1);
+        } else if (i >= limit) {
+            _test(moved == -1);
+        } else {
+            _test(moved == 0);
+        }
+    }
+
+    // Assumption about equality of UCS4 IsWordChar and isalnum for
+    // ascii (c < 128) :
+    for (unsigned char c = 0; c < 128; c++)
+    {
+        const unsigned char* pc = &c;
+        ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(pc);
+        bool utf8res = Fast_UnicodeUtil::IsWordChar(u);
+        bool asciires = isalnum(c);
+        _test(utf8res == asciires);
+        if (utf8res != asciires) {
+            fprintf(stderr, ":%c:%d != :%c:%d\n", u, utf8res, c, asciires);
+        }
+    }
+}
+
+
+// Matches a four-term AND query containing non-ASCII characters against a
+// Norwegian UTF-8 text with embedded unit/group separators, checks the
+// match counters per term, and then builds summaries of many sizes.
+// For summaries big enough to hold the whole text, the reported character
+// count must equal the byte size minus the separators and the extra bytes
+// of the two-byte characters.
+void AuxTest::TestUTF8context()
+{
+    const char* iso_cont = u8"AND(m\u00b5ss,fast,s\u00f8kemotor,\u00e5relang)";
+    juniper::QueryParser q(iso_cont);
+    juniper::QueryHandle qh(q, NULL, juniper::_Juniper->getModifier());
+
+    // some content
+    std::string s(u8"Fast leverer s\u00d8kemotorer og andre nyttige ting for \u00e5 finne frem p\u00e5 ");
+    s.append(u8"internett. Teknologien er basert p\u00e5 \u00c5relang");
+    s += UNIT_SEPARATOR;
+    s.append(u8"norsk innsats og forskning i");
+    s += GROUP_SEPARATOR;
+    s.append(u8"trondheimsmilj\u00f8et. M\u00b5ss med denne nye funksjonaliteten for \u00e5 vise frem");
+    s += UNIT_SEPARATOR;
+    s.append(u8" beste forekomst av s\u00f8ket med s\u00f8kemotor til brukeren blir det enda bedre. ");
+    s.append(u8"Hvis bare UTF8-kodingen virker som den skal for tegn som tar mer enn \u00e9n byte.");
+
+    juniper::Result* res = juniper::Analyse(juniper::TestConfig, &qh, s.c_str(), s.size(), 0, 0, 0);
+    _test(res != NULL);
+
+    size_t charsize;
+    Matcher& m = *res->_matcher;
+
+    res->Scan();
+    // NOTE(review): the checks for terms 1 and 3 read ExactMatchCnt(2), not
+    // ExactMatchCnt(1)/(3) - looks like a copy/paste slip; confirm before changing.
+    _test(m.TotalMatchCnt(0) == 1 && m.ExactMatchCnt(0) == 1);
+    _test(m.TotalMatchCnt(1) == 1 && m.ExactMatchCnt(2) == 1);
+    _test(m.TotalMatchCnt(2) == 2 && m.ExactMatchCnt(2) == 1);
+    _test(m.TotalMatchCnt(3) == 1 && m.ExactMatchCnt(2) == 1);
+
+    char separators[3];
+    separators[0] = UNIT_SEPARATOR;
+    separators[1] = GROUP_SEPARATOR;
+    separators[2] = '\0';
+
+    if (color_highlight)
+        _sumconf = CreateSummaryConfig(COLOR_HIGH_ON, COLOR_HIGH_OFF, "...", separators, connectors);
+    else
+        _sumconf = CreateSummaryConfig("<hit>", "</hit>", "...", separators, connectors);
+    for (int i = 1; i <= 10; i++)
+    {
+        // Short summaries with many matches
+        test_summary(m, s.c_str(), s.size(), i*30, i / 3, i*10, charsize);
+        // fewer matches, longer summaries
+        test_summary(m, s.c_str(), s.size(), i*60, i / 6, i*20, charsize);
+    }
+    // Summary that is large enough to hold the entire text
+    test_summary(m, s.c_str(), s.size(), 800, 100, 300, charsize);
+    // fprintf(stderr, "charsize %d s.size %d\n", charsize, s.size());
+    _test(charsize == s.size() - 3 - 11); // Subtract eliminated separators and dual bytes
+
+    // "Crazy" settings for the summary:
+    test_summary(m, s.c_str(), s.size(), 10000, 0, 1000, charsize);
+    // fprintf(stderr, "charsize %d s.size %d\n", charsize, s.size());
+    _test(charsize == s.size() - 3 - 11); // Subtract eliminated separators and dual bytes
+
+    if (GetNumFailed() > 0 && debug_level > 0)
+    {
+        // size_t is cast to match the conversion specifier, as test_summary does
+        fprintf(stderr, "Characters in original text: %lu\n",
+                static_cast<unsigned long>(s.size()));
+        test_dump(s.c_str(), s.size());
+        m.dump_statistics();
+    }
+    juniper::ReleaseResult(res);
+}
+
+
+// Ideographic full stop (U+3002); the same character separates the sentences in the testjap texts below.
+const char* japanese_sep_ex = "。";
+
+// One test case for TestJapanese: a query term and the text to search in.
+// A pair of NULLs marks the end of a table of these.
+struct TermTextPair
+{
+    const char* term;   // query string
+    const char* text;   // document content the query is run against
+};
+
+static TermTextPair testjap[] =
+{
+    // japanese string as term
+    { "私はガラスを食べられます",
+      "this is some japanese: 私はガラスを食べられます。それは私を傷つけません。 ending here" },
+
+    // HUGE japanese prefix and postfix and simple match in middle:
+    { "bond",
+      "私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。 bond 私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。私はガラスを食べられます。それは私を傷つけません。" },
+    { "japanese", "Simple。match。check。for。japanese。sep" },
+    { "hit", " -. hit at start" },
+    { "hit", "hit at end .,: " },
+    { "hit", "---------------------------------------------------------------------------------------------------------------------this is a text that is long enough to generate a hit that does have dots on both sides ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; " },
+    { NULL, NULL }
+};
+
+
+// Runs every term/text pair of the testjap table through analysis and
+// summary generation (summary size 256, up to 4 matches, 80 bytes of
+// surrounding context), and checks match counters and summary sizes with
+// expectations that are specific to each table entry.
+//
+// Each iteration creates its own summary config and releases the result,
+// the summary descriptor and the config again - also when the iteration
+// has to stop early because no summary descriptor could be created.
+void AuxTest::TestJapanese()
+{
+    for (int i = 0; testjap[i].term != NULL; i++)
+    {
+        const char* qstr = testjap[i].term;
+        juniper::QueryParser q(qstr);
+        juniper::QueryHandle qh(q, NULL, juniper::_Juniper->getModifier());
+
+        const char* content = testjap[i].text;
+        int content_len = strlen(content);
+        juniper::Result* res = juniper::Analyse(juniper::TestConfig, &qh,
+                content, content_len,
+                0, 0, 0);
+        _test(res != NULL);
+
+        size_t charsize;
+        Matcher& m = *res->_matcher;
+
+        res->Scan();
+        if (color_highlight)
+            _sumconf = CreateSummaryConfig(COLOR_HIGH_ON, COLOR_HIGH_OFF, "...", "", connectors);
+        else
+            _sumconf = CreateSummaryConfig("<hit>", "</hit>", "...", "", connectors);
+
+        SummaryDesc* sumdesc = m.CreateSummaryDesc(256, 256, 4, 80);
+        _test(sumdesc != NULL);
+        if (!sumdesc)
+        {
+            // Give back what this iteration acquired before bailing out,
+            // exactly as the end of a normal iteration does.
+            juniper::ReleaseResult(res);
+            DeleteSummaryConfig(_sumconf);
+            return;
+        }
+        std::string sum = BuildSummary(content, content_len, sumdesc, _sumconf, charsize);
+
+        switch (i)
+        {
+        case 0:
+            // Matching a multibyte sequence
+            _test(m.TotalMatchCnt(0) == 1 && m.ExactMatchCnt(0) == 1);
+            // printf("total %d exact %d\n", m.TotalMatchCnt(0),m.ExactMatchCnt(0));
+            break;
+        case 1:
+            // Matching short word in loong multibyte sequence
+            _test(m.TotalMatchCnt(0) == 1 && m.ExactMatchCnt(0) == 1);
+            _test(sum.size() <= 400);
+            break;
+        case 2:
+            // Matching word in between multibyte separators
+            _test(m.TotalMatchCnt(0) == 1 && m.ExactMatchCnt(0) == 1);
+            break;
+        case 3:
+            // Check that result is the complete string (markup excluded;
+            // "<hit>" plus "</hit>" is 11 bytes)
+            _test(sum.size() - 11 == charsize);
+            // printf("sz %d charsz %d :%s:\n", sum.size(), charsize, sum.c_str());
+            break;
+        case 4:
+            // Check that result is the complete string (markup excluded)
+            _test(sum.size() - 11 == charsize);
+            // printf("sz %d charsz %d :%s:\n", sum.size(), charsize, sum.c_str());
+            break;
+        case 5:
+            // Check that we get no noise at the start or end of this
+            _test(sum.size() == 103 && charsize == 86);
+            // printf("sz %d charsz %d :%s:\n", sum.size(), charsize, sum.c_str());
+            break;
+        default:
+            break;
+        }
+        juniper::ReleaseResult(res);
+        DeleteSummaryDesc(sumdesc);
+        DeleteSummaryConfig(_sumconf);
+    }
+}
+
+
+// Helper: creates a summary descriptor from matcher m with the requested
+// size, match count and surround length, builds the summary text for
+// content with the suite's current summary config, and reports the
+// resulting character count through charsize. The summary is printed when
+// debug output is enabled and either verbose is set or a test has failed.
+void AuxTest::test_summary(Matcher& m, const char* content, size_t content_len,
+                           int size, int matches, int surround, size_t& charsize)
+{
+    SummaryDesc* sum = m.CreateSummaryDesc(size, size, matches, surround);
+    _test(sum != NULL);
+    if (sum == NULL) {
+        // No summary generated!
+        return;
+    }
+
+    const std::string res = BuildSummary(content, content_len, sum, _sumconf, charsize);
+
+    const bool want_output = verbose || GetNumFailed() > 0;
+    if (want_output && debug_level > 0) {
+        const unsigned long summary_size = static_cast<unsigned long>(res.size());
+        printf("\nRequested size: %d, matches: %d, surround: %d, Summary size %lu :%s:\n",
+               size, matches, surround, summary_size, res.c_str());
+    }
+
+    DeleteSummaryDesc(sum);
+}
+
+
+// Property source without any properties of its own: every lookup simply
+// hands back the default supplied by the caller.
+class DefProps : public IJuniperProperties
+{
+public:
+    virtual const char* GetProperty(const char* /* name */, const char* def)
+    {
+        const char* value = def;
+        return value;
+    }
+};
+
+
+// Analyses a long text with hits for "elvis" close to its start, using the
+// "dynlength.120" option string. Only the existence of the analysis result
+// is verified; the check on the teaser length is currently disabled.
+void AuxTest::TestStartHits()
+{
+    juniper::QueryParser query("elvis");
+    juniper::QueryHandle handle(query, "dynlength.120", juniper::_Juniper->getModifier());
+
+    const char* text =
+        "Elvis, this is a long match before matching Elvis again and then som more text at"
+        " the end. But this text at the end must be much longer than this to trigger the case."
+        " In fact it must be much longer. And then som more text at the end. But this text at "
+        "the end must be much longer than this to trigger the case";
+    const int text_len = strlen(text);
+
+    juniper::Result* res = juniper::Analyse(juniper::TestConfig, &handle, text, text_len, 0, 0, 0);
+    _test(res != NULL);
+
+    juniper::Summary* sum = juniper::GetTeaser(res, NULL);
+    (void) sum;
+    // TODO: ReEnable    _test(sum->Length() != 0);
+    juniper::ReleaseResult(res);
+}
+
+
+// Analyses a text whose only hit for "match" sits near the end, with the
+// "dynlength.120" option string, and checks that a non-empty teaser comes
+// out. The length passed to Analyse is 55 bytes short of the literal, so
+// the analysed text stops before the end of the string.
+void AuxTest::TestEndHit()
+{
+    juniper::QueryParser query("match");
+    juniper::QueryHandle handle(query, "dynlength.120", juniper::_Juniper->getModifier());
+
+    const char* text =
+        "In this case we need a fairly long text that does not fit entirely into the resulting"
+        " summary, but that has a hit towards the end of the document where the expected length"
+        " extends the end of the doc. This means that the prefix must be more than 256 bytes"
+        " long. Here is the stuff we are looking for to match in a case where we have "
+        "surround_len bytes closer than good towardstheend�����������������������������������";
+    const size_t text_len = strlen(text) - 55;
+
+    juniper::Result* res = juniper::Analyse(juniper::TestConfig, &handle, text, text_len, 0, 0, 0);
+    _test(res != NULL);
+
+    juniper::Summary* sum = juniper::GetTeaser(res, NULL);
+    _test(sum->Length() != 0);
+    juniper::ReleaseResult(res);
+}
+
+
+
+// Token processor that copies every token it is handed into a
+// caller-supplied array, in arrival order. The array is not bounds
+// checked, so the caller must make it large enough. The end-of-text
+// token is ignored. Not copyable.
+class TokenChecker : public ITokenProcessor
+{
+public:
+    TokenChecker(Token* output) : _out(output), _next(0)
+    {  }
+
+    virtual void handle_token(Token& token)
+    {
+        _out[_next++] = token;
+    }
+
+    virtual void handle_end(Token&) {}
+
+private:
+    // declared but never defined: copying is not supported
+    TokenChecker(const TokenChecker&);
+    TokenChecker& operator= (const TokenChecker&);
+
+    Token* _out;   // destination array, owned by the caller
+    int    _next;  // index of the next free slot in _out
+};
+
+
+// Query stack simplification tests:
+//  1. two nested single-child nodes around one term must collapse so that
+//     the dump of the query is just the term ("Hepp:100");
+//  2. a node with arity forced to 0 must be simplified away completely,
+//     leaving a NULL query pointer.
+void AuxTest::TestJuniperStack()
+{
+    // Stack simplification tests
+    QueryExpr* q = new QueryNode(1, 0, 0);
+    QueryExpr* q1 = new QueryNode(1, 0, 0);
+    QueryExpr* q2 = new QueryTerm("Hepp", 4, 0);
+    q->AddChild(q1);
+    q1->AddChild(q2);
+
+    SimplifyStack(q);
+
+    std::string s;
+    q->Dump(s);
+    _test(strcmp(s.c_str(),"Hepp:100") == 0);
+    delete q;
+
+    if (GetNumFailed() > 0)
+        fprintf(stderr, "TestJuniperStack: %s\n", s.c_str());
+
+    q = new QueryNode(2, 0, 0);
+    q->_arity = 0;
+    SimplifyStack(q);
+    _test(q == NULL);
+
+    if (GetNumFailed() > 0)
+        fprintf(stderr, "TestJuniperStack: %s\n", s.c_str());
+
+    // No-op when the expectation above holds; otherwise frees the node that
+    // was unexpectedly left behind so a failing run does not leak it.
+    delete q;
+}
+
+// Token processor that records the text of every token it sees. The text
+// of a token is cut out of the scanned string using the token's byte
+// position and byte length. The end-of-text token is recorded the same way
+// as ordinary tokens. Only a reference to the scanned string is kept, so
+// that string must outlive this object.
+class TokenProcessor : public ITokenProcessor {
+private:
+    const std::string      & _text;
+    std::vector<std::string> _tokens;
+
+    // Appends the slice of _text described by t to the recorded tokens.
+    void record(const Token & t) {
+        const char * begin = _text.c_str() + t.bytepos;
+        _tokens.push_back(std::string(begin, t.bytelen));
+    }
+public:
+    TokenProcessor(const std::string & text) : _text(text), _tokens() {}
+    virtual void handle_token(Token & t) { record(t); }
+    virtual void handle_end(Token & t) { record(t); }
+    void clearTokens() { _tokens.clear(); }
+    const std::vector<std::string> & getTokens() const { return _tokens; }
+};
+
+
+// Registers a test that the UCS4 character act, narrowed to char, equals
+// exp, and passes the outcome of that test on to the caller.
+bool
+AuxTest::assertChar(ucs4_t act, char exp)
+{
+    const bool matched = _test((char) act == exp);
+    return matched;
+}
+
+// Tests SpecialTokenRegistry in two parts:
+//  1. its CharStream helper in isolation (reading, reset, resetAndInc and
+//     lower casing), and
+//  2. JuniperTokenizer driven with a registry built from a query, checking
+//     exactly which tokens come out for texts with special tokens in them.
+// The statements in part 1 are strictly order dependent: each assertion
+// relies on the stream position left behind by the previous calls.
+void
+AuxTest::TestSpecialTokenRegistry()
+{
+    {
+        typedef SpecialTokenRegistry::CharStream CharStream;
+        // scratch buffer handed to every CharStream below
+        ucs4_t buf[16];
+        {
+            // Read progressively further into " c+-"; after each reset()
+            // the stream is expected to start over from the first character.
+            std::string text = " c+-";
+            CharStream cs(text.c_str(), text.c_str() + text.size(), buf, buf + 16);
+            _test(!cs.isStartWordChar());
+            _test(cs.hasMoreChars());
+            _test(assertChar(cs.getNextChar(), ' '));
+            _test(cs.hasMoreChars());
+            cs.reset();
+            _test(cs.hasMoreChars());
+            _test(assertChar(cs.getNextChar(), ' '));
+            _test(assertChar(cs.getNextChar(), 'c'));
+            _test(cs.hasMoreChars());
+            cs.reset();
+            _test(cs.hasMoreChars());
+            _test(assertChar(cs.getNextChar(), ' '));
+            _test(assertChar(cs.getNextChar(), 'c'));
+            _test(assertChar(cs.getNextChar(), '+'));
+            _test(cs.hasMoreChars());
+            cs.reset();
+            _test(cs.hasMoreChars());
+            _test(assertChar(cs.getNextChar(), ' '));
+            _test(assertChar(cs.getNextChar(), 'c'));
+            _test(assertChar(cs.getNextChar(), '+'));
+            _test(assertChar(cs.getNextChar(), '-'));
+            _test(!cs.hasMoreChars());
+            cs.reset();
+            _test(cs.hasMoreChars());
+            _test(assertChar(cs.getNextChar(), ' '));
+            _test(cs.hasMoreChars());
+            _test(assertChar(cs.getNextChar(), 'c'));
+            _test(cs.hasMoreChars());
+            _test(assertChar(cs.getNextChar(), '+'));
+            _test(cs.hasMoreChars());
+            _test(assertChar(cs.getNextChar(), '-'));
+            _test(!cs.hasMoreChars());
+        }
+        { // test reset with increase to next char
+            // Each resetAndInc() is expected to move the start of the stream
+            // one character forward, and to return false once the text is used up.
+            std::string text = " c+-";
+            CharStream cs(text.c_str(), text.c_str() + text.size(), buf, buf + 16);
+            _test(cs.resetAndInc());
+            _test(cs.isStartWordChar());
+            _test(cs.hasMoreChars());
+            _test(assertChar(cs.getNextChar(), 'c'));
+            _test(assertChar(cs.getNextChar(), '+'));
+            _test(assertChar(cs.getNextChar(), '-'));
+            _test(!cs.hasMoreChars());
+            // a plain reset() goes back to the new start ('c'), not to the first character
+            cs.reset();
+            _test(cs.hasMoreChars());
+            _test(assertChar(cs.getNextChar(), 'c'));
+            _test(assertChar(cs.getNextChar(), '+'));
+            _test(assertChar(cs.getNextChar(), '-'));
+            _test(!cs.hasMoreChars());
+            _test(cs.resetAndInc());
+            _test(!cs.isStartWordChar());
+            _test(cs.hasMoreChars());
+            _test(assertChar(cs.getNextChar(), '+'));
+            _test(assertChar(cs.getNextChar(), '-'));
+            _test(!cs.hasMoreChars());
+            _test(cs.resetAndInc());
+            _test(!cs.isStartWordChar());
+            _test(cs.hasMoreChars());
+            _test(assertChar(cs.getNextChar(), '-'));
+            _test(!cs.hasMoreChars());
+            _test(!cs.resetAndInc());
+            _test(!cs.hasMoreChars());
+        }
+        { // test lower case
+            // an upper case input character is expected to come out lower cased
+            std::string text = "C";
+            CharStream cs(text.c_str(), text.c_str() + text.size(), buf, buf + 16);
+            _test(assertChar(cs.getNextChar(), 'c'));
+        }
+    }
+    { // test tokenizer with special token registry
+        typedef std::unique_ptr<QueryNode> QueryNodeUP;
+        // Small query builder: owns a QueryNode created for numTerms children
+        // and appends one QueryTerm per add() call.
+        struct QB {
+            QueryNodeUP q;
+            QB(size_t numTerms) : q(new QueryNode(numTerms, 0, 0)) {}
+            // takes over the node owned by rhs
+            QB(QB & rhs) : q(std::move(rhs.q)) { }
+            // st selects whether the term is flagged as a special token
+            QB & add(const char * t, bool st = true) {
+                QueryTerm * qt = new QueryTerm(t, strlen(t), 0);
+                if (st) qt->_options |= X_SPECIALTOKEN;
+                q->AddChild(qt);
+                return *this;
+            }
+        };
+        // Everything needed for one tokenizer run. Members are initialized
+        // in declaration order, and later ones refer to earlier ones (the
+        // registry to the query, the processor and tokenizer to the member
+        // copy of the text). The constructor scans the text right away, so
+        // the collected tokens are available in tp afterwards.
+        struct Ctx {
+            std::string text;
+            QB qb;
+            SpecialTokenRegistry str;
+            Fast_NormalizeWordFolder wf;
+            TokenProcessor tp;
+            JuniperTokenizer jt;
+            Ctx(const std::string & text_, QB & qb_) : text(text_), qb(qb_), str(qb.q.get()), wf(), tp(text), jt(&wf, text.c_str(), text.size(), &tp, &str) { jt.scan(); }
+        };
+
+        // In the token lists checked below, the last entry is always empty -
+        // presumably the token that TokenProcessor records in handle_end.
+
+        { // only special token registered
+            Ctx c("foo", QB(2).add("c++").add("foo", false));
+            _test(c.str.getSpecialTokens().size() == 1);
+        }
+        { // various matches
+            // octal escapes spell U+FFF9, U+FFFA and U+FFFB in UTF-8
+            std::string annotation = "\357\277\271dvdplusminus\357\277\272dvd+-\357\277\273";
+            std::string text = "c++ !my C++ text ?.net dvd+- stuff " + annotation;
+            // "dvd+-" is added as an ordinary term, so only two special tokens are registered
+            Ctx c(text, QB(3).add("c++").add(".net").add("dvd+-", false));
+            _test(c.str.getSpecialTokens().size() == 2);
+            _test(c.tp.getTokens().size() == 9);
+            _test(c.tp.getTokens()[0] == "c++");
+            _test(c.tp.getTokens()[1] == "my");
+            _test(c.tp.getTokens()[2] == "C++");
+            _test(c.tp.getTokens()[3] == "text");
+            _test(c.tp.getTokens()[4] == ".net");
+            _test(c.tp.getTokens()[5] == "dvd");
+            _test(c.tp.getTokens()[6] == "stuff");
+            _test(c.tp.getTokens()[7] == annotation);
+            _test(c.tp.getTokens()[8] == "");
+        }
+        { // cannot start inside a word
+            Ctx c("foo ac++", QB(1).add("c++"));
+            _test(c.tp.getTokens().size() == 3);
+            _test(c.tp.getTokens()[0] == "foo");
+            _test(c.tp.getTokens()[1] == "ac");
+            _test(c.tp.getTokens()[2] == "");
+        }
+        { // can end inside a word (TODO: can be fixed if it is a problem)
+            Ctx c("++ca foo", QB(1).add("++c"));
+            _test(c.tp.getTokens().size() == 4);
+            _test(c.tp.getTokens()[0] == "++c");
+            _test(c.tp.getTokens()[1] == "a");
+            _test(c.tp.getTokens()[2] == "foo");
+            _test(c.tp.getTokens()[3] == "");
+        }
+        { // many scans but only match at the end
+            Ctx c("a+b- a+b+c- a+b+c+", QB(1).add("a+b+c+"));
+            _test(c.tp.getTokens().size() == 7);
+            _test(c.tp.getTokens()[0] == "a");
+            _test(c.tp.getTokens()[1] == "b");
+            _test(c.tp.getTokens()[2] == "a");
+            _test(c.tp.getTokens()[3] == "b");
+            _test(c.tp.getTokens()[4] == "c");
+            _test(c.tp.getTokens()[5] == "a+b+c+");
+            _test(c.tp.getTokens()[6] == "");
+        }
+        { // two special tokens (one being a substring of the other)
+            Ctx c("c+c+c-", QB(2).add("c+c+c+").add("+c+"));
+            _test(c.tp.getTokens().size() == 4);
+            _test(c.tp.getTokens()[0] == "c");
+            _test(c.tp.getTokens()[1] == "+c+");
+            _test(c.tp.getTokens()[2] == "c");
+            _test(c.tp.getTokens()[3] == "");
+        }
+        { // cjk
+            // \xE9\xB1\xBC is a single three-byte UTF-8 character
+            Ctx c("fish: \xE9\xB1\xBC!", QB(1).add("\xE9\xB1\xBC!"));
+            _test(c.tp.getTokens().size() == 3);
+            _test(c.tp.getTokens()[0] == "fish");
+            _test(c.tp.getTokens()[1] == "\xE9\xB1\xBC!");
+            _test(c.tp.getTokens()[2] == "");
+        }
+        { // special token with non-word first
+            Ctx c("+++c ..net", QB(2).add("++c").add(".net"));
+            _test(c.tp.getTokens().size() == 3);
+            _test(c.tp.getTokens()[0] == "++c");
+            _test(c.tp.getTokens()[1] == ".net");
+            _test(c.tp.getTokens()[2] == "");
+        }
+    }
+}
+
+// With juniper.dynsum.preserve_white_space switched on, the teaser for
+// "best" must keep the double spaces and the newline of the input, drop
+// the unit separators (0x1f) and wrap the hit in <hi>..</hi>.
+void
+AuxTest::TestWhiteSpacePreserved()
+{
+    // The pieces are kept as separate literals so that the \x1f escapes
+    // cannot swallow the characters following them.
+    vespalib::string input = "\x1f" "best" "\x1f" "  "
+                             "\x1f" "of" "\x1f" "  " "\n"
+                             "\x1f" "metallica" "\x1f";
+
+    juniper::PropertyMap props;
+    props.set("juniper.dynsum.escape_markup", "off");
+    props.set("juniper.dynsum.highlight_off", "</hi>");
+    props.set("juniper.dynsum.continuation", "<sep />");
+    props.set("juniper.dynsum.highlight_on", "<hi>");
+    props.set("juniper.dynsum.preserve_white_space", "on");
+
+    Fast_NormalizeWordFolder folder;
+    juniper::Juniper jun(&props, &folder);
+    juniper::Config config("myconfig", jun);
+
+    juniper::QueryParser query("best");
+    juniper::QueryHandle handle(query, NULL, jun.getModifier());
+
+    juniper::Result* res = juniper::Analyse(&config, &handle, input.c_str(), input.size(), 0, 0, 0);
+    _test(res != NULL);
+
+    juniper::Summary* sum = juniper::GetTeaser(res, NULL);
+    vespalib::string expected = "<hi>best</hi>  of  \nmetallica";
+    vespalib::string actual(sum->Text(), sum->Length());
+    _test(actual == expected);
+
+    juniper::ReleaseResult(res);
+}
+
+// Invokes the test method that the given method table entry refers to.
+// Any exception escaping the method is turned into a test failure that
+// names the method.
+void AuxTest::Run(MethodContainer::iterator &itr) {
+    try {
+        auto method = itr->second;
+        (this->*method)();
+    } catch (...) {
+        _fail("Got unknown exception in test method " + itr->first);
+    }
+}
+
+// Runs the single test method registered under the given name. An unknown
+// name is reported on stderr and counted as a test failure.
+void AuxTest::Run(const char* method) {
+    MethodContainer::iterator pos(test_methods_.find(method));
+    if (pos == test_methods_.end()) {
+        std::cerr << "ERROR: No test method named \""
+                  << method << "\"" << std::endl;
+        _fail("No such method");
+        return;
+    }
+    Run(pos);
+}
+
+// Runs every registered test method, in the iteration order of the method table.
+void AuxTest::Run() {
+    MethodContainer::iterator itr(test_methods_.begin());
+    while (itr != test_methods_.end()) {
+        Run(itr);
+        ++itr;
+    }
+}
+
+
+// Command line entry point: "-m <name>" runs just the named test method
+// (only the first such option is honored); without it all tests are run.
+// A trailing "-m" without a name is ignored.
+void AuxTest::Run(int argc, char* argv[])
+{
+    for (int i = 1; i < argc; ++i)
+    {
+        const bool is_method_option = (strcmp(argv[i], "-m") == 0);
+        const bool has_value = (argc > i + 1);
+        if (is_method_option && has_value)
+        {
+            Run(argv[i + 1]);
+            return;
+        }
+    }
+    Run();
+}
author	Jon Bratseth <bratseth@yahoo-inc.com>	2016-06-15 23:09:44 +0200
committer	Jon Bratseth <bratseth@yahoo-inc.com>	2016-06-15 23:09:44 +0200
commit	72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree	2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /juniper/src/test/auxTest.cpp