about | summary | refs | log | tree | commit | diff | stats
path: root/streamingvisitors/src/tests/textutil
diff options
context:
space:
mode:
author Henning Baldersheim <balder@yahoo-inc.com> 2022-05-15 00:40:43 +0200
committer GitHub <noreply@github.com> 2022-05-15 00:40:43 +0200
commit dacf557add1c6a3ffab036cdf2f7dfdf9750b22e (patch)
tree 3a9dfff58b98898e2e28c0337925f4f04e5eaeb0 /streamingvisitors/src/tests/textutil
parent 2722ce9d1d1ec12d57ebd3833ce37b0958afb752 (diff)
Revert "Collapse vsm into streamingvisitors"
Diffstat (limited to 'streamingvisitors/src/tests/textutil')
-rw-r--r-- streamingvisitors/src/tests/textutil/.gitignore 4
-rw-r--r-- streamingvisitors/src/tests/textutil/CMakeLists.txt 8
-rw-r--r-- streamingvisitors/src/tests/textutil/textutil.cpp 285
3 files changed, 0 insertions, 297 deletions
diff --git a/streamingvisitors/src/tests/textutil/.gitignore b/streamingvisitors/src/tests/textutil/.gitignore
deleted file mode 100644
index 1103f79800a..00000000000
--- a/streamingvisitors/src/tests/textutil/.gitignore
+++ /dev/null
@@ -1,4 +0,0 @@
-.depend
-Makefile
-textutil_test
-vsm_textutil_test_app
diff --git a/streamingvisitors/src/tests/textutil/CMakeLists.txt b/streamingvisitors/src/tests/textutil/CMakeLists.txt
deleted file mode 100644
index 59817d01137..00000000000
--- a/streamingvisitors/src/tests/textutil/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-# Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-vespa_add_executable(vsm_textutil_test_app TEST
- SOURCES
- textutil.cpp
- DEPENDS
- streamingvisitors
-)
-vespa_add_test(NAME vsm_textutil_test_app COMMAND vsm_textutil_test_app)
diff --git a/streamingvisitors/src/tests/textutil/textutil.cpp b/streamingvisitors/src/tests/textutil/textutil.cpp
deleted file mode 100644
index 2a1390eaa01..00000000000
--- a/streamingvisitors/src/tests/textutil/textutil.cpp
+++ /dev/null
@@ -1,285 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-#include <vespa/vespalib/testkit/testapp.h>
-
-#include <vespa/fastlib/text/unicodeutil.h>
-#include <vespa/searchlib/query/base.h>
-#include <vespa/vsm/searcher/fold.h>
-#include <vespa/vsm/searcher/futf8strchrfieldsearcher.h>
-#include <vespa/vsm/searcher/utf8stringfieldsearcherbase.h>
-
-using search::byte; // unsigned char
-
-namespace vsm {
-
-template <typename T>
-class Vector : public std::vector<T>
-{
-public:
- Vector() : std::vector<T>() {}
- Vector<T> & a(T v) { this->push_back(v); return *this; }
-};
-
-typedef Vector<ucs4_t> UCS4V;
-typedef Vector<size_t> SizeV;
-typedef UTF8StringFieldSearcherBase SFSB;
-typedef FUTF8StrChrFieldSearcher FSFS;
-
-class TextUtilTest : public vespalib::TestApp
-{
-private:
- ucs4_t getUTF8Char(const char * src);
- template <typename BW, bool OFF>
- void assertSkipSeparators(const char * input, size_t len, const UCS4V & expdstbuf, const SizeV & expoffsets);
- void assertAnsiFold(const std::string & toFold, const std::string & exp);
- void assertAnsiFold(char c, char exp);
-#ifdef __x86_64__
- void assert_sse2_foldua(const std::string & toFold, size_t charFolded, const std::string & exp);
- void assert_sse2_foldua(unsigned char c, unsigned char exp, size_t charFolded = 16);
-#endif
-
- template <typename BW, bool OFF>
- void testSkipSeparators();
- void testSkipSeparators();
- void testSeparatorCharacter();
- void testAnsiFold();
- void test_lfoldua();
-#ifdef __x86_64__
- void test_sse2_foldua();
-#endif
-
-public:
- int Main() override;
-};
-
-ucs4_t
-TextUtilTest::getUTF8Char(const char * src)
-{
- ucs4_t retval = Fast_UnicodeUtil::GetUTF8Char(src);
- ASSERT_TRUE(retval != Fast_UnicodeUtil::_BadUTF8Char);
- return retval;
-}
-
-template <typename BW, bool OFF>
-void
-TextUtilTest::assertSkipSeparators(const char * input, size_t len, const UCS4V & expdstbuf, const SizeV & expoffsets)
-{
- const byte * srcbuf = reinterpret_cast<const byte *>(input);
- auto dstbuf = std::make_unique<ucs4_t[]>(len + 1);
- auto offsets = std::make_unique<size_t[]>(len + 1);
- UTF8StrChrFieldSearcher fs;
- BW bw(dstbuf.get(), offsets.get());
- size_t dstlen = fs.skipSeparators(srcbuf, len, bw);
- EXPECT_EQUAL(dstlen, expdstbuf.size());
- ASSERT_TRUE(dstlen == expdstbuf.size());
- for (size_t i = 0; i < dstlen; ++i) {
- EXPECT_EQUAL(dstbuf[i], expdstbuf[i]);
- if (OFF) {
- EXPECT_EQUAL(offsets[i], expoffsets[i]);
- }
- }
-}
-
-void
-TextUtilTest::assertAnsiFold(const std::string & toFold, const std::string & exp)
-{
- char folded[256];
- EXPECT_TRUE(FSFS::ansiFold(toFold.c_str(), toFold.size(), folded));
- EXPECT_EQUAL(std::string(folded, toFold.size()), exp);
-}
-
-void
-TextUtilTest::assertAnsiFold(char c, char exp)
-{
- char folded;
- EXPECT_TRUE(FSFS::ansiFold(&c, 1, &folded));
- EXPECT_EQUAL((int32_t)folded, (int32_t)exp);
-}
-
-#ifdef __x86_64__
-void
-TextUtilTest::assert_sse2_foldua(const std::string & toFold, size_t charFolded, const std::string & exp)
-{
- char folded[256];
- size_t alignedStart = 0xF - (size_t(folded + 0xF) % 0x10);
- const unsigned char * toFoldOrg = reinterpret_cast<const unsigned char *>(toFold.c_str());
- const unsigned char * retval =
- sse2_foldua(toFoldOrg, toFold.size(), reinterpret_cast<unsigned char *>(folded + alignedStart));
- EXPECT_EQUAL((size_t)(retval - toFoldOrg), charFolded);
- EXPECT_EQUAL(std::string(folded + alignedStart, charFolded), exp);
-}
-
-void
-TextUtilTest::assert_sse2_foldua(unsigned char c, unsigned char exp, size_t charFolded)
-{
- unsigned char toFold[16];
- memset(toFold, c, 16);
- unsigned char folded[32];
- size_t alignedStart = 0xF - (size_t(folded + 0xF) % 0x10);
- const unsigned char * retval = sse2_foldua(toFold, 16, folded + alignedStart);
- EXPECT_EQUAL((size_t)(retval - toFold), charFolded);
- for (size_t i = 0; i < charFolded; ++i) {
- EXPECT_EQUAL((int32_t)folded[i + alignedStart], (int32_t)exp);
- }
-}
-#endif
-
-template <typename BW, bool OFF>
-void
-TextUtilTest::testSkipSeparators()
-{
- // ascii characters
- assertSkipSeparators<BW, OFF>("foo", 3, UCS4V().a('f').a('o').a('o'), SizeV().a(0).a(1).a(2));
- assertSkipSeparators<BW, OFF>("f\x1Fo", 3, UCS4V().a('f').a('o'), SizeV().a(0).a(2));
- assertSkipSeparators<BW, OFF>("f\no", 3, UCS4V().a('f').a('\n').a('o'), SizeV().a(0).a(1).a(2));
- assertSkipSeparators<BW, OFF>("f\to", 3, UCS4V().a('f').a('\t').a('o'), SizeV().a(0).a(1).a(2));
-
- // utf8 char
- assertSkipSeparators<BW, OFF>("\xC2\x80\x66", 3, UCS4V().a(getUTF8Char("\xC2\x80")).a('f'),
- SizeV().a(0).a(2));
- assertSkipSeparators<BW, OFF>("\xE0\xA0\x80\x66", 4, UCS4V().a(getUTF8Char("\xE0\xA0\x80")).a('f'),
- SizeV().a(0).a(3));
- assertSkipSeparators<BW, OFF>("\xF0\x90\x80\x80\x66", 5, UCS4V().a(getUTF8Char("\xF0\x90\x80\x80")).a('f'),
- SizeV().a(0).a(4));
-
- // replacement string (sharp s -> ss)
- assertSkipSeparators<BW, OFF>("\xC3\x9F\x66\xC3\x9F", 5, UCS4V().a('s').a('s').a('f').a('s').a('s'),
- SizeV().a(0).a(0).a(2).a(3).a(3));
-}
-
-void
-TextUtilTest::testSkipSeparators()
-{
- Fast_NormalizeWordFolder::Setup(Fast_NormalizeWordFolder::DO_SHARP_S_SUBSTITUTION);
-
- testSkipSeparators<SFSB::BufferWrapper, false>();
- testSkipSeparators<SFSB::OffsetWrapper, true>();
-}
-
-void
-TextUtilTest::testSeparatorCharacter()
-{
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x00'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x01'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x02'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x03'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x04'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x05'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x06'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x07'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x08'));
- EXPECT_TRUE(! SFSB::isSeparatorCharacter('\x09')); // '\t'
- EXPECT_TRUE(! SFSB::isSeparatorCharacter('\x0a')); // '\n'
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0b'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0c'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0d'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0e'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x0f'));
-
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x10'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x11'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x12'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x13'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x14'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x15'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x16'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x17'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x18'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x19'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1a'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1b'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1c'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1d'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1e'));
- EXPECT_TRUE(SFSB::isSeparatorCharacter('\x1f'));
-
- EXPECT_TRUE(! SFSB::isSeparatorCharacter('\x20')); // space
-}
-
-void
-TextUtilTest::testAnsiFold()
-{
- FieldSearcher::init();
- assertAnsiFold("", "");
- assertAnsiFold("ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz");
- assertAnsiFold("abcdefghijklmnopqrstuvwxyz", "abcdefghijklmnopqrstuvwxyz");
- assertAnsiFold("0123456789", "0123456789");
- for (int i = 0; i < 128; ++i) {
- if ((i >= 'a' && i <= 'z') || (i >= '0' && i <= '9')) {
- assertAnsiFold(i, i);
- } else if (i >= 'A' && i <= 'Z') {
- assertAnsiFold(i, i + 32);
- } else {
- assertAnsiFold(i, 0);
- }
- }
-
- // non-ascii is ignored
- for (int i = 128; i < 256; ++i) {
- char toFold = i;
- char folded;
- EXPECT_TRUE(!FSFS::ansiFold(&toFold, 1, &folded));
- }
-}
-
-void
-TextUtilTest::test_lfoldua()
-{
- FieldSearcher::init();
- char folded[256];
- size_t alignedStart = 0;
- const char * toFold = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
- size_t len = strlen(toFold);
- EXPECT_TRUE(FSFS::lfoldua(toFold, len, folded, alignedStart));
- EXPECT_EQUAL(std::string(folded + alignedStart, len), "abcdefghijklmnopqrstuvwxyz");
-}
-
-#ifdef __x86_64__
-void
-TextUtilTest::test_sse2_foldua()
-{
- assert_sse2_foldua("", 0, "");
- assert_sse2_foldua("ABCD", 0, "");
- assert_sse2_foldua("ABCDEFGHIJKLMNO", 0, "");
- assert_sse2_foldua("ABCDEFGHIJKLMNOP", 16, "abcdefghijklmnop");
- assert_sse2_foldua("ABCDEFGHIJKLMNOPQ", 16, "abcdefghijklmnop");
- assert_sse2_foldua("KLMNOPQRSTUVWXYZ", 16, "klmnopqrstuvwxyz");
- assert_sse2_foldua("abcdefghijklmnop", 16, "abcdefghijklmnop");
- assert_sse2_foldua("klmnopqrstuvwxyz", 16, "klmnopqrstuvwxyz");
- assert_sse2_foldua("0123456789abcdef", 16, "0123456789abcdef");
-
- for (int i = 0; i < 128; ++i) {
- if ((i >= 'a' && i <= 'z') || (i >= '0' && i <= '9')) {
- assert_sse2_foldua(i, i);
- } else if (i >= 'A' && i <= 'Z') {
- assert_sse2_foldua(i, i + 32);
- } else {
- assert_sse2_foldua(i, 0);
- }
- }
-
- // non-ascii is ignored
- for (int i = 128; i < 256; ++i) {
- assert_sse2_foldua(i, '?', 0);
- }
-}
-#endif
-
-int
-TextUtilTest::Main()
-{
- TEST_INIT("textutil_test");
-
- testSkipSeparators();
- testSeparatorCharacter();
- testAnsiFold();
- test_lfoldua();
-#ifdef __x86_64__
- test_sse2_foldua();
-#endif
-
- TEST_DONE();
-}
-
-}
-
-TEST_APPHOOK(vsm::TextUtilTest);