diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2016-09-11 20:50:26 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2016-09-11 20:51:08 +0000 |
commit | 9f13fec939fd0a28547753d122eccd1f2e19785e (patch) | |
tree | 274f4944903bdc31efe134e94b1ef7648e390b9c /searchlib | |
parent | 0228ffd55f264315531ada146c113aed6efc2f22 (diff) |
Do not require icu unless you really need it.
Diffstat (limited to 'searchlib')
19 files changed, 222 insertions, 142 deletions
diff --git a/searchlib/CMakeLists.txt b/searchlib/CMakeLists.txt index c03969bc928..df3e40ebd6d 100644 --- a/searchlib/CMakeLists.txt +++ b/searchlib/CMakeLists.txt @@ -16,7 +16,6 @@ vespa_define_module( searchcommon EXTERNAL_DEPENDS - icui18n rt LIBS @@ -52,6 +51,7 @@ vespa_define_module( src/vespa/searchlib/test/fakedata src/vespa/searchlib/test/memoryindex src/vespa/searchlib/transactionlog + src/vespa/searchlib/uca src/vespa/searchlib/util APPS diff --git a/searchlib/src/tests/forcelink/CMakeLists.txt b/searchlib/src/tests/forcelink/CMakeLists.txt index 48c9b4fd1a3..59031c2cd4d 100644 --- a/searchlib/src/tests/forcelink/CMakeLists.txt +++ b/searchlib/src/tests/forcelink/CMakeLists.txt @@ -4,5 +4,6 @@ vespa_add_executable(searchlib_forcelink_test_app TEST forcelink.cpp DEPENDS searchlib + searchlib_searchlib_uca ) vespa_add_test(NAME searchlib_forcelink_test_app COMMAND searchlib_forcelink_test_app) diff --git a/searchlib/src/tests/sort/CMakeLists.txt b/searchlib/src/tests/sort/CMakeLists.txt index 1830952bffd..54f2fcdd3d7 100644 --- a/searchlib/src/tests/sort/CMakeLists.txt +++ b/searchlib/src/tests/sort/CMakeLists.txt @@ -11,6 +11,7 @@ vespa_add_executable(searchlib_sort_test_app sort_test.cpp DEPENDS searchlib + searchlib_searchlib_uca ) #vespa_add_test(NAME searchlib_sort_test_app COMMAND searchlib_sort_test_app) vespa_add_executable(searchlib_uca_stress_app @@ -18,5 +19,6 @@ vespa_add_executable(searchlib_uca_stress_app uca.cpp DEPENDS searchlib + searchlib_searchlib_uca ) vespa_add_test(NAME searchlib_uca_stress_app COMMAND searchlib_uca_stress_app BENCHMARK) diff --git a/searchlib/src/tests/sort/sort_test.cpp b/searchlib/src/tests/sort/sort_test.cpp index cf5e1a1cb1f..1af12c7aecb 100644 --- a/searchlib/src/tests/sort/sort_test.cpp +++ b/searchlib/src/tests/sort/sort_test.cpp @@ -5,6 +5,7 @@ #include <vespa/searchlib/common/sort.h> #include <vespa/searchlib/common/sortspec.h> #include <vespa/searchlib/common/converters.h> +#include <vespa/searchlib/uca/ucaconverter.h> #include <vespa/vespalib/util/array.h> #include <vector> #include <fstream> @@ -17,6 +18,7 @@ LOG_SETUP("sort_test"); using vespalib::Array; using namespace search::common; +using namespace search::uca; using vespalib::ConstBufferRef; class Test : public vespalib::TestApp @@ -154,8 +156,9 @@ void Test::testStringCaseInsensitiveSort() void Test::testSortSpec() { + UcaConverterFactory ucaFactory; { - SortSpec sortspec("-name"); + SortSpec sortspec("-name", ucaFactory); EXPECT_EQUAL(sortspec.size(), 1u); EXPECT_EQUAL(sortspec[0]._field, "name"); EXPECT_TRUE( ! sortspec[0]._ascending); @@ -163,7 +166,7 @@ void Test::testSortSpec() } { - SortSpec sortspec("-lowercase(name)"); + SortSpec sortspec("-lowercase(name)", ucaFactory); EXPECT_EQUAL(sortspec.size(), 1u); EXPECT_EQUAL(sortspec[0]._field, "name"); EXPECT_TRUE( ! sortspec[0]._ascending); @@ -172,7 +175,7 @@ void Test::testSortSpec() } { - SortSpec sortspec("-uca(name,nn_no)"); + SortSpec sortspec("-uca(name,nn_no)", ucaFactory); EXPECT_EQUAL(sortspec.size(), 1u); EXPECT_EQUAL(sortspec[0]._field, "name"); EXPECT_TRUE( ! sortspec[0]._ascending); @@ -180,7 +183,7 @@ void Test::testSortSpec() EXPECT_TRUE(dynamic_cast<UcaConverter *>(sortspec[0]._converter.get()) != NULL); } { - SortSpec sortspec("-uca(name,nn_no,PRIMARY)"); + SortSpec sortspec("-uca(name,nn_no,PRIMARY)", ucaFactory); EXPECT_EQUAL(sortspec.size(), 1u); EXPECT_EQUAL(sortspec[0]._field, "name"); EXPECT_TRUE( ! sortspec[0]._ascending); @@ -188,7 +191,7 @@ void Test::testSortSpec() EXPECT_TRUE(dynamic_cast<UcaConverter *>(sortspec[0]._converter.get()) != NULL); } { - SortSpec sortspec("-uca(name,nn_no,SECONDARY)"); + SortSpec sortspec("-uca(name,nn_no,SECONDARY)", ucaFactory); EXPECT_EQUAL(sortspec.size(), 1u); EXPECT_EQUAL(sortspec[0]._field, "name"); EXPECT_TRUE( ! sortspec[0]._ascending); @@ -196,7 +199,7 @@ void Test::testSortSpec() EXPECT_TRUE(dynamic_cast<UcaConverter *>(sortspec[0]._converter.get()) != NULL); } { - SortSpec sortspec("-uca(name,nn_no,TERTIARY)"); + SortSpec sortspec("-uca(name,nn_no,TERTIARY)", ucaFactory); EXPECT_EQUAL(sortspec.size(), 1u); EXPECT_EQUAL(sortspec[0]._field, "name"); EXPECT_TRUE( ! sortspec[0]._ascending); @@ -204,7 +207,7 @@ void Test::testSortSpec() EXPECT_TRUE(dynamic_cast<UcaConverter *>(sortspec[0]._converter.get()) != NULL); } { - SortSpec sortspec("-uca(name,nn_no,QUATERNARY)"); + SortSpec sortspec("-uca(name,nn_no,QUATERNARY)", ucaFactory); EXPECT_EQUAL(sortspec.size(), 1u); EXPECT_EQUAL(sortspec[0]._field, "name"); EXPECT_TRUE( ! sortspec[0]._ascending); @@ -212,7 +215,7 @@ void Test::testSortSpec() EXPECT_TRUE(dynamic_cast<UcaConverter *>(sortspec[0]._converter.get()) != NULL); } { - SortSpec sortspec("-uca(name,nn_no,IDENTICAL)"); + SortSpec sortspec("-uca(name,nn_no,IDENTICAL)", ucaFactory); EXPECT_EQUAL(sortspec.size(), 1u); EXPECT_EQUAL(sortspec[0]._field, "name"); EXPECT_TRUE( ! sortspec[0]._ascending); @@ -220,7 +223,7 @@ void Test::testSortSpec() EXPECT_TRUE(dynamic_cast<UcaConverter *>(sortspec[0]._converter.get()) != NULL); } { - SortSpec sortspec("-uca(name,zh)"); + SortSpec sortspec("-uca(name,zh)", ucaFactory); EXPECT_EQUAL(sortspec.size(), 1u); EXPECT_EQUAL(sortspec[0]._field, "name"); EXPECT_TRUE( ! sortspec[0]._ascending); @@ -228,7 +231,7 @@ void Test::testSortSpec() EXPECT_TRUE(dynamic_cast<UcaConverter *>(sortspec[0]._converter.get()) != NULL); } { - SortSpec sortspec("-uca(name,finnes_ikke)"); + SortSpec sortspec("-uca(name,finnes_ikke)", ucaFactory); EXPECT_EQUAL(sortspec.size(), 1u); EXPECT_EQUAL(sortspec[0]._field, "name"); EXPECT_TRUE( ! sortspec[0]._ascending); @@ -237,7 +240,7 @@ void Test::testSortSpec() } { try { - SortSpec sortspec("-uca(name,nn_no,NTERTIARY)"); + SortSpec sortspec("-uca(name,nn_no,NTERTIARY)", ucaFactory); EXPECT_TRUE(false); } catch (const std::runtime_error & e) { EXPECT_TRUE(true); @@ -258,7 +261,7 @@ void Test::testSameAsJavaOrder() } } EXPECT_EQUAL(158u, javaOrder.size()); - search::common::UcaConverter uca("zh", "PRIMARY"); + UcaConverter uca("zh", "PRIMARY"); vespalib::ConstBufferRef fkey = uca.convert(vespalib::ConstBufferRef(javaOrder[0].c_str(), javaOrder[0].size())); vespalib::string prev(fkey.c_str(), fkey.size()); for (size_t i(1); i < javaOrder.size(); i++) { diff --git a/searchlib/src/tests/sort/uca.cpp b/searchlib/src/tests/sort/uca.cpp index b9225c94a66..121e53a9e7c 100644 --- a/searchlib/src/tests/sort/uca.cpp +++ b/searchlib/src/tests/sort/uca.cpp @@ -11,6 +11,7 @@ #include <vector> #include <stdexcept> #include <unicode/ustring.h> +#include <unicode/coll.h> LOG_SETUP("uca_stress"); diff --git a/searchlib/src/tests/sortspec/CMakeLists.txt b/searchlib/src/tests/sortspec/CMakeLists.txt index a59a5c102f7..0522f1725a3 100644 --- a/searchlib/src/tests/sortspec/CMakeLists.txt +++ b/searchlib/src/tests/sortspec/CMakeLists.txt @@ -4,5 +4,6 @@ vespa_add_executable(searchlib_multilevelsort_test_app TEST multilevelsort.cpp DEPENDS searchlib + searchlib_searchlib_uca ) vespa_add_test(NAME searchlib_multilevelsort_test_app COMMAND searchlib_multilevelsort_test_app) diff --git a/searchlib/src/tests/sortspec/multilevelsort.cpp b/searchlib/src/tests/sortspec/multilevelsort.cpp index f151bfaf132..c65683a1754 100644 --- a/searchlib/src/tests/sortspec/multilevelsort.cpp +++ b/searchlib/src/tests/sortspec/multilevelsort.cpp @@ -10,6 +10,7 @@ LOG_SETUP("multilevelsort_test"); #include <vespa/searchlib/attribute/stringbase.h> #include <vespa/searchlib/attribute/attributefactory.h> #include <vespa/searchlib/attribute/attributevector.hpp> +#include <vespa/searchlib/uca/ucaconverter.h> #include <vespa/vespalib/testkit/testapp.h> #include <map> #include <sstream> @@ -251,7 +252,8 @@ MultilevelSortTest::sortAndCheck(const std::vector<Spec> &spec, uint32_t num, vespalib::Clock clock; vespalib::Doom doom(clock, std::numeric_limits<long>::max()); - FastS_SortSpec sorter(doom, _sortMethod); + search::uca::UcaConverterFactory ucaFactory; + FastS_SortSpec sorter(doom, ucaFactory, _sortMethod); // init sorter with sort data for(uint32_t i = 0; i < spec.size(); ++i) { AttributeGuard ag; diff --git a/searchlib/src/vespa/searchlib/CMakeLists.txt b/searchlib/src/vespa/searchlib/CMakeLists.txt index 0f0420e5b61..bf3cabb460b 100644 --- a/searchlib/src/vespa/searchlib/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/CMakeLists.txt @@ -32,6 +32,5 @@ vespa_add_library(searchlib INSTALL lib64 DEPENDS staging_vespalib - icuuc atomic ) diff --git a/searchlib/src/vespa/searchlib/common/converters.h b/searchlib/src/vespa/searchlib/common/converters.h index ccd15c6105c..9eadeb5a8c5 100644 --- a/searchlib/src/vespa/searchlib/common/converters.h +++ b/searchlib/src/vespa/searchlib/common/converters.h @@ -3,7 +3,6 @@ #pragma once #include <vespa/searchcommon/common/iblobconverter.h> -#include <unicode/coll.h> #include <vector> #include <vespa/vespalib/stllike/string.h> @@ -13,7 +12,7 @@ namespace common { class PassThroughConverter : public BlobConverter { private: - virtual vespalib::ConstBufferRef onConvert(const vespalib::ConstBufferRef & src) const; + virtual ConstBufferRef onConvert(const ConstBufferRef & src) const; }; class LowercaseConverter : public BlobConverter @@ -21,47 +20,16 @@ class LowercaseConverter : public BlobConverter public: LowercaseConverter(); private: - virtual vespalib::ConstBufferRef onConvert(const vespalib::ConstBufferRef & src) const; + virtual ConstBufferRef onConvert(const ConstBufferRef & src) const; mutable vespalib::string _buffer; }; -class UcaConverter : public BlobConverter -{ +class ConverterFactory { +protected: + using stringref = vespalib::stringref; public: - typedef icu::Collator Collator; - UcaConverter(const vespalib::string & locale, const vespalib::string & strength); - const Collator & getCollator() const { return *_collator; } -private: - struct Buffer { - vespalib::string _data; - uint8_t *ptr() { return (uint8_t *)_data.begin(); } - int32_t siz() { return _data.size(); } - Buffer() : _data() { - reserve(_data.capacity()-8); // do not cause extra malloc() by default - } - void reserve(size_t size) { - _data.reserve(size+8); - _data.resize(size); - _data[size+1] = '\0'; - _data[size+2] = '\0'; - _data[size+3] = 'd'; - _data[size+4] = 'e'; - _data[size+5] = 'a'; - _data[size+6] = 'd'; - _data[size+7] = '\0'; - } - void check() { - assert(_data[siz()+3] == 'd'); - assert(_data[siz()+4] == 'e'); - assert(_data[siz()+5] == 'a'); - assert(_data[siz()+6] == 'd'); - } - }; - int utf8ToUtf16(const vespalib::ConstBufferRef & src) const; - virtual vespalib::ConstBufferRef onConvert(const vespalib::ConstBufferRef & src) const; - mutable Buffer _buffer; - mutable std::vector<UChar> _u16Buffer; - std::unique_ptr<Collator> _collator; + virtual ~ConverterFactory() { } + virtual BlobConverter::UP create(stringref local, stringref strength) const = 0; }; } diff --git a/searchlib/src/vespa/searchlib/common/sortresults.cpp b/searchlib/src/vespa/searchlib/common/sortresults.cpp index c58f15a8372..485c0d550b9 100644 --- a/searchlib/src/vespa/searchlib/common/sortresults.cpp +++ b/searchlib/src/vespa/searchlib/common/sortresults.cpp @@ -302,8 +302,9 @@ FastS_SortSpec::initSortData(const RankedHit *hits, uint32_t n) } -FastS_SortSpec::FastS_SortSpec(const vespalib::Doom & doom, int method) : +FastS_SortSpec::FastS_SortSpec(const vespalib::Doom & doom, const ConverterFactory & ucaFactory, int method) : _doom(doom), + _ucaFactory(ucaFactory), _method(method), _sortSpec(), _vectors() @@ -323,7 +324,7 @@ FastS_SortSpec::Init(const vespalib::string & sortStr, IAttributeContext & vecMa LOG(spam, "sortStr = %s", sortStr.c_str()); bool retval(true); try { - _sortSpec = SortSpec(sortStr); + _sortSpec = SortSpec(sortStr, _ucaFactory); for (SortSpec::const_iterator it(_sortSpec.begin()), mt(_sortSpec.end()); retval && (it < mt); it++) { retval = Add(vecMan, *it); } diff --git a/searchlib/src/vespa/searchlib/common/sortresults.h b/searchlib/src/vespa/searchlib/common/sortresults.h index 8da643411a0..712150bc2c5 100644 --- a/searchlib/src/vespa/searchlib/common/sortresults.h +++ b/searchlib/src/vespa/searchlib/common/sortresults.h @@ -122,19 +122,21 @@ private: typedef vespalib::AutoAlloc<0x800000> Alloc; typedef vespalib::Array<uint8_t, Alloc> BinarySortData; typedef vespalib::Array<SortData, Alloc> SortDataArray; - vespalib::Doom _doom; - int _method; + using ConverterFactory = search::common::ConverterFactory; + vespalib::Doom _doom; + const ConverterFactory & _ucaFactory; + int _method; search::common::SortSpec _sortSpec; - VectorRefList _vectors; - BinarySortData _binarySortData; - SortDataArray _sortDataArray; + VectorRefList _vectors; + BinarySortData _binarySortData; + SortDataArray _sortDataArray; bool Add(search::attribute::IAttributeContext & vecMan, const search::common::SortInfo & sInfo); void initSortData(const search::RankedHit *a, uint32_t n); uint8_t * realloc(uint32_t n, size_t & variableWidth, uint32_t & available, uint32_t & dataSize, uint8_t *mySortData); public: - FastS_SortSpec(const vespalib::Doom & doom, int method=2); + FastS_SortSpec(const vespalib::Doom & doom, const ConverterFactory & ucaFactory, int method=2); virtual ~FastS_SortSpec(); std::pair<const char *, size_t> getSortRef(size_t i) const { diff --git a/searchlib/src/vespa/searchlib/common/sortspec.cpp b/searchlib/src/vespa/searchlib/common/sortspec.cpp index b522d76ebaa..b81c812b830 100644 --- a/searchlib/src/vespa/searchlib/common/sortspec.cpp +++ b/searchlib/src/vespa/searchlib/common/sortspec.cpp @@ -41,78 +41,7 @@ ConstBufferRef LowercaseConverter::onConvert(const ConstBufferRef & src) const return ConstBufferRef(_buffer.begin(), _buffer.size()); } -namespace { - vespalib::Lock _GlobalDirtyICUThreadSafeLock; -} - -UcaConverter::UcaConverter(const vespalib::string & locale, const vespalib::string & strength) : - _buffer(), - _u16Buffer(128), - _collator() -{ - UErrorCode status = U_ZERO_ERROR; - Collator *coll(NULL); - { - vespalib::LockGuard guard(_GlobalDirtyICUThreadSafeLock); - coll = Collator::createInstance(icu::Locale(locale.c_str()), status); - } - if(U_SUCCESS(status)) { - _collator.reset(coll); - if (strength.empty()) { - _collator->setStrength(Collator::PRIMARY); - } else if (strength == "PRIMARY") { - _collator->setStrength(Collator::PRIMARY); - } else if (strength == "SECONDARY") { - _collator->setStrength(Collator::SECONDARY); - } else if (strength == "TERTIARY") { - _collator->setStrength(Collator::TERTIARY); - } else if (strength == "QUATERNARY") { - _collator->setStrength(Collator::QUATERNARY); - } else if (strength == "IDENTICAL") { - _collator->setStrength(Collator::IDENTICAL); - } else { - throw std::runtime_error("Illegal uca collation strength : " + strength); - } - } else { - delete coll; - throw std::runtime_error("Failed Collator::createInstance(Locale(locale.c_str()), status) with locale : " + locale); - } -} - -int UcaConverter::utf8ToUtf16(const ConstBufferRef & src) const -{ - UErrorCode status = U_ZERO_ERROR; - int32_t u16Wanted(0); - u_strFromUTF8(&_u16Buffer[0], _u16Buffer.size(), &u16Wanted, static_cast<const char *>(src.data()), -1, &status); - if (U_SUCCESS(status)) { - } else if (status == U_INVALID_CHAR_FOUND) { - LOG(warning, "ICU was not able to convert the %ld alleged utf8 characters'%s' to utf16", src.size(), src.c_str()); - } else if (status == U_BUFFER_OVERFLOW_ERROR) { - //Ignore as this is handled on the outside. - } else { - LOG(warning, "ICU made a undefined complaint(%d) about the %ld alleged utf8 characters'%s' to utf16", status, src.size(), src.c_str()); - } - return u16Wanted; -} - -ConstBufferRef UcaConverter::onConvert(const ConstBufferRef & src) const -{ - int32_t u16Wanted(utf8ToUtf16(src)); - if (u16Wanted > (int)_u16Buffer.size()) { - _u16Buffer.resize(u16Wanted); - u16Wanted = utf8ToUtf16(src); - } - int wanted = _collator->getSortKey(&_u16Buffer[0], u16Wanted, _buffer.ptr(), _buffer.siz()); - _buffer.check(); - if (wanted > _buffer.siz()) { - _buffer.reserve(wanted); - wanted = _collator->getSortKey(&_u16Buffer[0], u16Wanted, _buffer.ptr(), _buffer.siz()); - _buffer.check(); - } - return ConstBufferRef(_buffer.ptr(), wanted); -} - -SortSpec::SortSpec(const vespalib::string & spec) : +SortSpec::SortSpec(const vespalib::string & spec, const ConverterFactory & ucaFactory) : _spec(spec) { for (const char *pt(spec.c_str()), *mt(spec.c_str() + spec.size()); pt < mt;) { @@ -143,13 +72,13 @@ SortSpec::SortSpec(const vespalib::string & spec) : for(; (p < e) && (*p != ')'); p++); if (*p == ')') { vespalib::string strength(strengthName, p - strengthName); - push_back(SortInfo(attr, ascending, BlobConverter::SP(new UcaConverter(locale, strength)))); + push_back(SortInfo(attr, ascending, BlobConverter::SP(ucaFactory.create(locale, strength)))); } else { throw std::runtime_error(make_string("Missing ')' at %s attr=%s locale=%s strength=%s", p, attr.c_str(), localeName, strengthName)); } } else if (*p == ')') { vespalib::string locale(localeName, p-localeName); - push_back(SortInfo(attr, ascending, BlobConverter::SP(new UcaConverter(locale, "")))); + push_back(SortInfo(attr, ascending, BlobConverter::SP(ucaFactory.create(locale, "")))); } else { throw std::runtime_error(make_string("Missing ')' or ',' at %s attr=%s locale=%s", p, attr.c_str(), localeName)); } diff --git a/searchlib/src/vespa/searchlib/common/sortspec.h b/searchlib/src/vespa/searchlib/common/sortspec.h index bfa6a064105..80b5cfaf795 100644 --- a/searchlib/src/vespa/searchlib/common/sortspec.h +++ b/searchlib/src/vespa/searchlib/common/sortspec.h @@ -9,6 +9,7 @@ #include <vector> #include <vespa/vespalib/stllike/string.h> #include <vespa/searchcommon/common/iblobconverter.h> +#include <vespa/searchlib/common/converters.h> namespace search { namespace common { @@ -24,7 +25,7 @@ class SortSpec : public std::vector<SortInfo> { public: SortSpec() : _spec() { } - SortSpec(const vespalib::string & spec); + SortSpec(const vespalib::string & spec, const ConverterFactory & ucaFactory); const vespalib::string & getSpec() const { return _spec; } private: vespalib::string _spec; diff --git a/searchlib/src/vespa/searchlib/expression/CMakeLists.txt b/searchlib/src/vespa/searchlib/expression/CMakeLists.txt index 1940f9736be..a82f5625c3c 100644 --- a/searchlib/src/vespa/searchlib/expression/CMakeLists.txt +++ b/searchlib/src/vespa/searchlib/expression/CMakeLists.txt @@ -17,7 +17,6 @@ vespa_add_library(searchlib_expression OBJECT documentfieldnode.cpp attributenode.cpp zcurve.cpp - ucafunctionnode.cpp debugwaitfunctionnode.cpp mathfunctionnode.cpp numericfunctionnode.cpp diff --git a/searchlib/src/vespa/searchlib/uca/CMakeLists.txt b/searchlib/src/vespa/searchlib/uca/CMakeLists.txt new file mode 100644 index 00000000000..7ff31f7c7a0 --- /dev/null +++ b/searchlib/src/vespa/searchlib/uca/CMakeLists.txt @@ -0,0 +1,10 @@ +# Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_library(searchlib_searchlib_uca + SOURCES + ucaconverter.cpp + ucafunctionnode.cpp + INSTALL lib64 + DEPENDS + icui18n + icuuc +) diff --git a/searchlib/src/vespa/searchlib/uca/ucaconverter.cpp b/searchlib/src/vespa/searchlib/uca/ucaconverter.cpp new file mode 100644 index 00000000000..b084e1a59fe --- /dev/null +++ b/searchlib/src/vespa/searchlib/uca/ucaconverter.cpp @@ -0,0 +1,98 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/fastos/fastos.h> +#include <vespa/searchlib/common/sortspec.h> +#include <vespa/searchlib/uca/ucaconverter.h> +#include <vespa/vespalib/util/stringfmt.h> +#include <vespa/vespalib/util/sync.h> +#include <unicode/coll.h> +#include <unicode/ustring.h> +#include <stdexcept> +#include <vespa/fastlib/text/normwordfolder.h> +#include <vespa/vespalib/text/utf8.h> +#include <vespa/log/log.h> +LOG_SETUP(".search.common.sortspec"); + +namespace search { +namespace uca { + +using vespalib::ConstBufferRef; +using vespalib::make_string; + +namespace { + vespalib::Lock _GlobalDirtyICUThreadSafeLock; +} + +BlobConverter::UP +UcaConverterFactory::create(stringref local, stringref strength) const { + return std::make_unique<UcaConverter>(local, strength); +} + +UcaConverter::UcaConverter(vespalib::stringref locale, vespalib::stringref strength) : + _buffer(), + _u16Buffer(128), + _collator() +{ + UErrorCode status = U_ZERO_ERROR; + Collator *coll(NULL); + { + vespalib::LockGuard guard(_GlobalDirtyICUThreadSafeLock); + coll = Collator::createInstance(icu::Locale(locale.c_str()), status); + } + if(U_SUCCESS(status)) { + _collator.reset(coll); + if (strength.empty()) { + _collator->setStrength(Collator::PRIMARY); + } else if (strength == "PRIMARY") { + _collator->setStrength(Collator::PRIMARY); + } else if (strength == "SECONDARY") { + _collator->setStrength(Collator::SECONDARY); + } else if (strength == "TERTIARY") { + _collator->setStrength(Collator::TERTIARY); + } else if (strength == "QUATERNARY") { + _collator->setStrength(Collator::QUATERNARY); + } else if (strength == "IDENTICAL") { + _collator->setStrength(Collator::IDENTICAL); + } else { + throw std::runtime_error("Illegal uca collation strength : " + strength); + } + } else { + delete coll; + throw std::runtime_error("Failed Collator::createInstance(Locale(locale.c_str()), status) with locale : " + locale); + } +} + +int UcaConverter::utf8ToUtf16(const ConstBufferRef & src) const +{ + UErrorCode status = U_ZERO_ERROR; + int32_t u16Wanted(0); + u_strFromUTF8(&_u16Buffer[0], _u16Buffer.size(), &u16Wanted, static_cast<const char *>(src.data()), -1, &status); + if (U_SUCCESS(status)) { + } else if (status == U_INVALID_CHAR_FOUND) { + LOG(warning, "ICU was not able to convert the %ld alleged utf8 characters'%s' to utf16", src.size(), src.c_str()); + } else if (status == U_BUFFER_OVERFLOW_ERROR) { + //Ignore as this is handled on the outside. + } else { + LOG(warning, "ICU made a undefined complaint(%d) about the %ld alleged utf8 characters'%s' to utf16", status, src.size(), src.c_str()); + } + return u16Wanted; +} + +ConstBufferRef UcaConverter::onConvert(const ConstBufferRef & src) const +{ + int32_t u16Wanted(utf8ToUtf16(src)); + if (u16Wanted > (int)_u16Buffer.size()) { + _u16Buffer.resize(u16Wanted); + u16Wanted = utf8ToUtf16(src); + } + int wanted = _collator->getSortKey(&_u16Buffer[0], u16Wanted, _buffer.ptr(), _buffer.siz()); + _buffer.check(); + if (wanted > _buffer.siz()) { + _buffer.reserve(wanted); + wanted = _collator->getSortKey(&_u16Buffer[0], u16Wanted, _buffer.ptr(), _buffer.siz()); + _buffer.check(); + } + return ConstBufferRef(_buffer.ptr(), wanted); +} + +} +} diff --git a/searchlib/src/vespa/searchlib/uca/ucaconverter.h b/searchlib/src/vespa/searchlib/uca/ucaconverter.h new file mode 100644 index 00000000000..c71cabda87c --- /dev/null +++ b/searchlib/src/vespa/searchlib/uca/ucaconverter.h @@ -0,0 +1,63 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/searchcommon/common/iblobconverter.h> +#include <unicode/coll.h> +#include <vector> +#include <vespa/vespalib/stllike/string.h> + +namespace search { + +using common::BlobConverter; +using common::ConverterFactory; + +namespace uca { + +class UcaConverterFactory : public ConverterFactory { +public: + BlobConverter::UP create(stringref local, stringref strength) const override; +}; + +class UcaConverter : public BlobConverter +{ +public: + using Collator = icu::Collator; + UcaConverter(vespalib::stringref locale, vespalib::stringref strength); + const Collator & getCollator() const { return *_collator; } +private: + struct Buffer { + vespalib::string _data; + uint8_t *ptr() { return (uint8_t *)_data.begin(); } + int32_t siz() { return _data.size(); } + Buffer() : _data() { + reserve(_data.capacity()-8); // do not cause extra malloc() by default + } + void reserve(size_t size) { + _data.reserve(size+8); + _data.resize(size); + _data[size+1] = '\0'; + _data[size+2] = '\0'; + _data[size+3] = 'd'; + _data[size+4] = 'e'; + _data[size+5] = 'a'; + _data[size+6] = 'd'; + _data[size+7] = '\0'; + } + void check() { + assert(_data[siz()+3] == 'd'); + assert(_data[siz()+4] == 'e'); + assert(_data[siz()+5] == 'a'); + assert(_data[siz()+6] == 'd'); + } + }; + int utf8ToUtf16(const ConstBufferRef & src) const; + virtual ConstBufferRef onConvert(const ConstBufferRef & src) const; + mutable Buffer _buffer; + mutable std::vector<UChar> _u16Buffer; + std::unique_ptr<Collator> _collator; +}; + +} +} + diff --git a/searchlib/src/vespa/searchlib/expression/ucafunctionnode.cpp b/searchlib/src/vespa/searchlib/uca/ucafunctionnode.cpp index 2cd4df49c5b..fa29f6f1547 100644 --- a/searchlib/src/vespa/searchlib/expression/ucafunctionnode.cpp +++ b/searchlib/src/vespa/searchlib/uca/ucafunctionnode.cpp @@ -1,7 +1,7 @@ // Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include <vespa/fastos/fastos.h> -#include <vespa/searchlib/expression/ucafunctionnode.h> -#include <vespa/searchlib/common/converters.h> +#include <vespa/searchlib/uca/ucafunctionnode.h> +#include <vespa/searchlib/uca/ucaconverter.h> namespace search { namespace expression { @@ -24,7 +24,7 @@ UcaFunctionNode::UcaFunctionNode(const ExpressionNode::CP & arg, const vespalib: UnaryFunctionNode(arg), _locale(locale), _strength(strength), - _collator(new common::UcaConverter(locale, strength)) + _collator(new uca::UcaConverter(locale, strength)) { } @@ -104,7 +104,7 @@ Deserializer & UcaFunctionNode::onDeserialize(Deserializer & is) { UnaryFunctionNode::onDeserialize(is); is >> _locale >> _strength; - _collator.reset(new common::UcaConverter(_locale, _strength)); + _collator.reset(new uca::UcaConverter(_locale, _strength)); return is; } diff --git a/searchlib/src/vespa/searchlib/expression/ucafunctionnode.h b/searchlib/src/vespa/searchlib/uca/ucafunctionnode.h index 78242d9cbd1..78242d9cbd1 100644 --- a/searchlib/src/vespa/searchlib/expression/ucafunctionnode.h +++ b/searchlib/src/vespa/searchlib/uca/ucafunctionnode.h |