diff options
-rw-r--r-- | vsm/src/vespa/vsm/searcher/fold.cpp | 46 | ||||
-rw-r--r-- | vsm/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp | 10 |
2 files changed, 56 insertions, 0 deletions
diff --git a/vsm/src/vespa/vsm/searcher/fold.cpp b/vsm/src/vespa/vsm/searcher/fold.cpp index 903b1e43f79..a8ec5ee5ef2 100644 --- a/vsm/src/vespa/vsm/searcher/fold.cpp +++ b/vsm/src/vespa/vsm/searcher/fold.cpp @@ -24,7 +24,11 @@ const unsigned char * sse2_foldaa(const unsigned char * toFoldOrg, size_t sz, un int nonAscii = __builtin_ia32_pmovmskb128(toFold[i]); if (nonAscii) { +#ifdef __clang__ + v16qi non8Mask = _G_8bit > toFold[i]; +#else v16qi non8Mask = __builtin_ia32_pcmpgtb128(_G_8bit, toFold[i]); +#endif int non8bit = __builtin_ia32_pmovmskb128(non8Mask); if (non8bit) { @@ -32,6 +36,22 @@ const unsigned char * sse2_foldaa(const unsigned char * toFoldOrg, size_t sz, un } break; } +#ifdef __clang__ + v16qi _0 = toFold[i] > _G_0; + v16qi _z = toFold[i] > _G_z; + v2di _0_z = v2di(_0) ^ v2di(_z); + v2di toLow = _0_z & v2di(toFold[i]); + v16qi low = v16qi(toLow | _G_lowCase); + _0 = low > _G_0; + v16qi _9 = low > _G_9; + v16qi _a = low > _G_a; + _z = low > _G_z; + v2di _0_9_m = v2di(_0) ^ v2di(_9); + v2di _a_z_m = v2di(_a) ^ v2di(_z); + v2di _0_9 = _0_9_m & v2di(low); + v2di _a_z = _a_z_m & v2di(low); + folded[i] = _0_9 | _a_z; +#else v16qi _0 = __builtin_ia32_pcmpgtb128(toFold[i], _G_0); v16qi _z = __builtin_ia32_pcmpgtb128(toFold[i], _G_z); v2di _0_z = __builtin_ia32_pxor128(v2di(_0), v2di(_z)); @@ -46,6 +66,7 @@ const unsigned char * sse2_foldaa(const unsigned char * toFoldOrg, size_t sz, un v2di _0_9 = __builtin_ia32_pand128(_0_9_m, v2di(low)); v2di _a_z = __builtin_ia32_pand128(_a_z_m, v2di(low)); folded[i] = __builtin_ia32_por128(_0_9, _a_z); +#endif #else # warning "Intel's icc compiler does not like __builtin_ia32_pxor128" LOG_ABORT("should not be reached"); @@ -70,11 +91,19 @@ const unsigned char * sse2_foldua(const unsigned char * toFoldOrg, size_t sz, un for (size_t m=sz/16; i < m; i++) { #ifndef __INTEL_COMPILER +#ifdef __clang__ + v16qi current = __builtin_ia32_lddqu(reinterpret_cast<const char *>(&toFoldOrg[i*16])); +#else v16qi current = __builtin_ia32_loaddqu(reinterpret_cast<const char *>(&toFoldOrg[i*16])); +#endif int nonAscii = __builtin_ia32_pmovmskb128(current); if (nonAscii) { +#ifdef __clang__ + v16qi non8Mask = _G_8bit > current; +#else v16qi non8Mask = __builtin_ia32_pcmpgtb128(_G_8bit, current); +#endif int non8bit = __builtin_ia32_pmovmskb128(non8Mask); if (non8bit) { @@ -82,6 +111,22 @@ const unsigned char * sse2_foldua(const unsigned char * toFoldOrg, size_t sz, un } break; } +#ifdef __clang__ + v16qi _0 = current > _G_0; + v16qi _z = current > _G_z; + v2di _0_z = v2di(_0) ^ v2di(_z); + v2di toLow = _0_z & v2di(current); + v16qi low = v16qi(toLow | _G_lowCase); + _0 = low > _G_0; + v16qi _9 = low > _G_9; + v16qi _a = low > _G_a; + _z = low > _G_z; + v2di _0_9_m = v2di(_0) ^ v2di(_9); + v2di _a_z_m = v2di(_a) ^ v2di(_z); + v2di _0_9 = _0_9_m & v2di(low); + v2di _a_z = _a_z_m & v2di(low); + folded[i] = _0_9 | _a_z; +#else v16qi _0 = __builtin_ia32_pcmpgtb128(current, _G_0); v16qi _z = __builtin_ia32_pcmpgtb128(current, _G_z); v2di _0_z = __builtin_ia32_pxor128(v2di(_0), v2di(_z)); @@ -96,6 +141,7 @@ const unsigned char * sse2_foldua(const unsigned char * toFoldOrg, size_t sz, un v2di _0_9 = __builtin_ia32_pand128(_0_9_m, v2di(low)); v2di _a_z = __builtin_ia32_pand128(_a_z_m, v2di(low)); folded[i] = __builtin_ia32_por128(_0_9, _a_z); +#endif #else # warning "Intel's icc compiler does not like __builtin_ia32_pxor128" LOG_ABORT("should not be reached"); diff --git a/vsm/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp b/vsm/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp index b26b4bd5133..335f6e81d23 100644 --- a/vsm/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp +++ b/vsm/src/vespa/vsm/searcher/futf8strchrfieldsearcher.cpp @@ -90,8 +90,13 @@ inline const char * advance(const char * n, const v16qi zero) unsigned zeroCountSum = 0; do { // find first '\0' character (the end of the word) #ifndef __INTEL_COMPILER +#ifdef __clang__ + v16qi tmpCurrent = __builtin_ia32_lddqu(n+zeroCountSum); + v16qi tmp0 = tmpCurrent == zero; +#else v16qi tmpCurrent = __builtin_ia32_loaddqu(n+zeroCountSum); v16qi tmp0 = __builtin_ia32_pcmpeqb128(tmpCurrent, reinterpret_cast<v16qi>(zero)); +#endif charMap = __builtin_ia32_pmovmskb128(tmp0); // 1 in charMap equals to '\0' in input buffer #else # warning "Intel's icc compiler does not like __builtin_ia32_xxxxx" @@ -107,8 +112,13 @@ inline const char * advance(const char * n, const v16qi zero) if (!zeroMap) { // only '\0' in last 16 bytes (no new word found) do { // find first word character (the next word) #ifndef __INTEL_COMPILER +#ifdef __clang__ + v16qi tmpCurrent = __builtin_ia32_lddqu(n+zeroCountSum); + tmpCurrent = tmpCurrent > zero; +#else v16qi tmpCurrent = __builtin_ia32_loaddqu(n+zeroCountSum); tmpCurrent = __builtin_ia32_pcmpgtb128(tmpCurrent, reinterpret_cast<v16qi>(zero)); +#endif zeroMap = __builtin_ia32_pmovmskb128(tmpCurrent); // 1 in zeroMap equals to word character in input buffer #else # warning "Intel's icc compiler does not like __builtin_ia32_xxxxx" |