aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib/src/vespa/searchlib/uca/ucaconverter.cpp
blob: 42e63044e2db79e7627fa2bcbe72da6b3a6600b9 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.

#include "ucaconverter.h"
#include <unicode/ustring.h>
#include <vespa/vespalib/util/stringfmt.h>
#include <vespa/vespalib/text/utf8.h>
#include <mutex>
#include <string>
#include <vespa/log/log.h>
LOG_SETUP(".search.common.sortspec");

namespace search {
namespace uca {

using vespalib::ConstBufferRef;
using vespalib::make_string;

namespace {
// File-local mutex held by the UcaConverter constructor while it calls
// Collator::createInstance, so collator creation is serialized across threads.
// NOTE(review): the name suggests this works around an ICU thread-safety
// problem during collator creation - confirm whether it is still required.
std::mutex _GlobalDirtyICUThreadSafeLock;
}

// Builds a UcaConverter for the given locale and collation strength and hands
// ownership to the caller. Any exception thrown by the UcaConverter
// constructor propagates unchanged.
BlobConverter::UP
UcaConverterFactory::create(stringref local, stringref strength) const {
    BlobConverter::UP converter = std::make_unique<UcaConverter>(local, strength);
    return converter;
}

// Creates a converter backed by an ICU collator.
//
// locale   : locale identifier handed to icu::Locale.
// strength : one of "PRIMARY", "SECONDARY", "TERTIARY", "QUATERNARY" or
//            "IDENTICAL"; an empty value selects PRIMARY.
//
// Throws std::runtime_error when the collator cannot be created or when the
// strength is not one of the values listed above.
UcaConverter::UcaConverter(vespalib::stringref locale, vespalib::stringref strength) :
    _buffer(),
    _u16Buffer(128),
    _collator()
{
    // A stringref is a (pointer, length) view and is not guaranteed to be
    // NUL-terminated, whereas icu::Locale reads a C string. Hand ICU a
    // terminated copy so it never reads past the end of the view. A null data
    // pointer is forwarded as-is to keep the previous behaviour for that case.
    std::string terminatedLocale;
    const char *localeName = nullptr;
    if (locale.data() != nullptr) {
        terminatedLocale.assign(locale.data(), locale.size());
        localeName = terminatedLocale.c_str();
    }

    UErrorCode status = U_ZERO_ERROR;
    Collator *coll(nullptr);
    {
        // Collator creation is serialized through the file-local mutex.
        std::lock_guard<std::mutex> guard(_GlobalDirtyICUThreadSafeLock);
        coll = Collator::createInstance(icu::Locale(localeName), status);
    }
    if (U_FAILURE(status)) {
        delete coll;
        throw std::runtime_error("Failed Collator::createInstance(Locale(locale.c_str()), status) with locale : " + locale);
    }

    // From here on _collator owns the instance, so it is released even if the
    // strength validation below throws.
    _collator.reset(coll);
    if (strength.empty()) {
        _collator->setStrength(Collator::PRIMARY);
    } else if (strength == "PRIMARY") {
        _collator->setStrength(Collator::PRIMARY);
    } else if (strength == "SECONDARY") {
        _collator->setStrength(Collator::SECONDARY);
    } else if (strength == "TERTIARY") {
        _collator->setStrength(Collator::TERTIARY);
    } else if (strength == "QUATERNARY") {
        _collator->setStrength(Collator::QUATERNARY);
    } else if (strength == "IDENTICAL") {
        _collator->setStrength(Collator::IDENTICAL);
    } else {
        throw std::runtime_error("Illegal uca collation strength : " + strength);
    }
}

// Nothing to release by hand; the members clean up after themselves.
UcaConverter::~UcaConverter() = default;

int UcaConverter::utf8ToUtf16(const ConstBufferRef & src) const
{
    UErrorCode status = U_ZERO_ERROR;
    int32_t u16Wanted(0);
    u_strFromUTF8(&_u16Buffer[0], _u16Buffer.size(), &u16Wanted, static_cast<const char *>(src.data()), -1, &status);
    if (U_SUCCESS(status)) {
    } else if (status == U_INVALID_CHAR_FOUND) {
        LOG(warning, "ICU was not able to convert the %ld alleged utf8 characters'%s' to utf16", src.size(), src.c_str());
    } else if (status == U_BUFFER_OVERFLOW_ERROR) {
        //Ignore as this is handled on the outside.
    } else {
        LOG(warning, "ICU made a undefined complaint(%d) about the %ld alleged utf8 characters'%s' to utf16", status, src.size(), src.c_str());
    }
    return u16Wanted;
}

// Produces the collation sort key for the UTF-8 text in src.
//
// Works in two stages, each of which is retried once with a larger buffer if
// the first attempt reports that more room is needed:
//   1. UTF-8 -> UTF-16 into _u16Buffer.
//   2. UTF-16 -> sort key into _buffer via the collator.
//
// The returned reference points into _buffer, so its contents are overwritten
// by the next conversion done through this object.
ConstBufferRef UcaConverter::onConvert(const ConstBufferRef & src) const
{
    int32_t utf16Length = utf8ToUtf16(src);
    if (utf16Length > static_cast<int>(_u16Buffer.size())) {
        // First pass only told us the required size; grow and convert again.
        _u16Buffer.resize(utf16Length);
        utf16Length = utf8ToUtf16(src);
    }

    // Writes the sort key into _buffer and returns the length the collator
    // asked for; _buffer.check() is run after every write.
    const auto writeSortKey = [this, utf16Length]() {
        int needed = _collator->getSortKey(&_u16Buffer[0], utf16Length, _buffer.ptr(), _buffer.siz());
        _buffer.check();
        return needed;
    };

    int keyLength = writeSortKey();
    if (keyLength > _buffer.siz()) {
        // The key did not fit; make room for it and generate it again.
        _buffer.reserve(keyLength);
        keyLength = writeSortKey();
    }
    return ConstBufferRef(_buffer.ptr(), keyLength);
}

}
}