diff options
author | Tor Egge <Tor.Egge@online.no> | 2022-09-07 15:26:53 +0200 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2022-09-07 15:26:53 +0200 |
commit | 3daae87bbeab7dad6022064c3f89c08e0cd009cf (patch) | |
tree | 95764df6cb9d10a58b4bca9ccc8e947b5b7dc273 /searchsummary | |
parent | fa7e942f65ca9c9be35c434bafb4a765ca5b7c50 (diff) |
Consolidate juniper separators.
Diffstat (limited to 'searchsummary')
-rw-r--r-- | searchsummary/src/tests/juniper/auxTest.cpp | 11 | ||||
-rw-r--r-- | searchsummary/src/vespa/juniper/CMakeLists.txt | 1 | ||||
-rw-r--r-- | searchsummary/src/vespa/juniper/config.cpp | 5 | ||||
-rw-r--r-- | searchsummary/src/vespa/juniper/dpinterface.h | 7 | ||||
-rw-r--r-- | searchsummary/src/vespa/juniper/juniper_separators.cpp | 14 | ||||
-rw-r--r-- | searchsummary/src/vespa/juniper/juniper_separators.h | 33 | ||||
-rw-r--r-- | searchsummary/src/vespa/juniper/matchobject.cpp | 14 | ||||
-rw-r--r-- | searchsummary/src/vespa/juniper/sumdesc.cpp | 25 | ||||
-rw-r--r-- | searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp | 11 |
9 files changed, 82 insertions, 39 deletions
diff --git a/searchsummary/src/tests/juniper/auxTest.cpp b/searchsummary/src/tests/juniper/auxTest.cpp index d4b65858e3e..ef22b0542af 100644 --- a/searchsummary/src/tests/juniper/auxTest.cpp +++ b/searchsummary/src/tests/juniper/auxTest.cpp @@ -1,6 +1,7 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. #include "auxTest.h" +#include <vespa/juniper/juniper_separators.h> #include <vespa/fastos/file.h> #include <vespa/log/log.h> LOG_SETUP(".auxtest"); @@ -394,11 +395,11 @@ void AuxTest::TestUTF8context() // some content std::string s(char_from_u8(u8"Fast leverer s\u00d8kemotorer og andre nyttige ting for \u00e5 finne frem p\u00e5 ")); s.append(char_from_u8(u8"internett. Teknologien er basert p\u00e5 \u00c5relang")); - s += UNIT_SEPARATOR; + s += juniper::separators::unit_separator_string; s.append(char_from_u8(u8"norsk innsats og forskning i")); - s += GROUP_SEPARATOR; + s += juniper::separators::group_separator_string; s.append(char_from_u8(u8"trondheimsmilj\u00f8et. M\u00b5ss med denne nye funksjonaliteten for \u00e5 vise frem")); - s += UNIT_SEPARATOR; + s += juniper::separators::unit_separator_string; s.append(char_from_u8(u8" beste forekomst av s\u00f8ket med s\u00f8kemotor til brukeren blir det enda bedre. 
")); s.append(char_from_u8(u8"Hvis bare UTF8-kodingen virker som den skal for tegn som tar mer enn \u00e9n byte.")); @@ -415,8 +416,8 @@ void AuxTest::TestUTF8context() _test(m.TotalMatchCnt(3) == 1 && m.ExactMatchCnt(2) == 1); char separators[3]; - separators[0] = UNIT_SEPARATOR; - separators[1] = GROUP_SEPARATOR; + separators[0] = juniper::separators::unit_separator; + separators[1] = juniper::separators::group_separator; separators[2] = '\0'; if (color_highlight) diff --git a/searchsummary/src/vespa/juniper/CMakeLists.txt b/searchsummary/src/vespa/juniper/CMakeLists.txt index 3d6b72ef511..1d47885b1a9 100644 --- a/searchsummary/src/vespa/juniper/CMakeLists.txt +++ b/searchsummary/src/vespa/juniper/CMakeLists.txt @@ -7,6 +7,7 @@ vespa_add_library(searchsummary_juniper OBJECT mcand.cpp keyocc.cpp juniperparams.cpp + juniper_separators.cpp SummaryConfig.cpp tokenizer.cpp propreader.cpp diff --git a/searchsummary/src/vespa/juniper/config.cpp b/searchsummary/src/vespa/juniper/config.cpp index 3daebfd1ea8..5859ea8336e 100644 --- a/searchsummary/src/vespa/juniper/config.cpp +++ b/searchsummary/src/vespa/juniper/config.cpp @@ -3,6 +3,7 @@ #include "config.h" #include "rpinterface.h" #include "juniperdebug.h" +#include "juniper_separators.h" #define _NEED_SUMMARY_CONFIG_IMPL #include "SummaryConfig.h" #include <vespa/vespalib/locale/c.h> @@ -18,8 +19,8 @@ Config::Config(const char* config_name, const Juniper & juniper) : _juniper(juniper) { std::string separators = ""; - separators += UNIT_SEPARATOR; - separators += GROUP_SEPARATOR; + separators += separators::unit_separator_string; + separators += separators::group_separator_string; const char* high_on = GetProp("dynsum.highlight_on", "<b>"); const char* high_off = GetProp("dynsum.highlight_off", "</b>"); diff --git a/searchsummary/src/vespa/juniper/dpinterface.h b/searchsummary/src/vespa/juniper/dpinterface.h index 8f538cee05d..b5f302152c6 100644 --- a/searchsummary/src/vespa/juniper/dpinterface.h +++ 
b/searchsummary/src/vespa/juniper/dpinterface.h @@ -11,13 +11,6 @@ * rpinterface.h */ -/** The GS character used to separate paragraphs */ -#define GROUP_SEPARATOR 0x1D - -/** The US character used to separate words in CJK texts */ -#define UNIT_SEPARATOR 0x1F - - namespace juniper { /** class Tokentype Hint as to which type of token this is. diff --git a/searchsummary/src/vespa/juniper/juniper_separators.cpp b/searchsummary/src/vespa/juniper/juniper_separators.cpp new file mode 100644 index 00000000000..9342d3d34dc --- /dev/null +++ b/searchsummary/src/vespa/juniper/juniper_separators.cpp @@ -0,0 +1,14 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "juniper_separators.h" + +namespace juniper::separators { + +vespalib::string interlinear_annotation_anchor_string("\xef\xbf\xb9"); // U+FFF9 +vespalib::string interlinear_annotation_separator_string("\xef\xbf\xba"); // U+FFFA +vespalib::string interlinear_annotation_terminator_string("\xef\xbf\xbb"); // U+FFFB +vespalib::string group_separator_string("\x1d"); +vespalib::string record_separator_string("\x1e"); +vespalib::string unit_separator_string("\x1f"); + +} diff --git a/searchsummary/src/vespa/juniper/juniper_separators.h b/searchsummary/src/vespa/juniper/juniper_separators.h new file mode 100644 index 00000000000..03b0945138b --- /dev/null +++ b/searchsummary/src/vespa/juniper/juniper_separators.h @@ -0,0 +1,33 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include <vespa/vespalib/stllike/string.h> + +namespace juniper::separators { + +// Separators used in strings passed to juniper. 
+ +// UTF-8 encoded separators +extern vespalib::string interlinear_annotation_anchor_string; +extern vespalib::string interlinear_annotation_separator_string; +extern vespalib::string interlinear_annotation_terminator_string; +extern vespalib::string group_separator_string; +extern vespalib::string record_separator_string; +extern vespalib::string unit_separator_string; + +// UTF-32 separators +constexpr char32_t interlinear_annotation_anchor = U'\xfff9'; +constexpr char32_t interlinear_annotation_separator = U'\xfffa'; +constexpr char32_t interlinear_annotation_terminator = U'\xfffb'; + +// The GS character used to separate paragraphs +constexpr char8_t group_separator = u8'\x1d'; + +// The RS character +constexpr char8_t record_separator = u8'\x1e'; + +// The US character used to separate words in CJK texts +constexpr char8_t unit_separator = u8'\x1f'; + +} diff --git a/searchsummary/src/vespa/juniper/matchobject.cpp b/searchsummary/src/vespa/juniper/matchobject.cpp index 376f970d73b..60b14cd5bca 100644 --- a/searchsummary/src/vespa/juniper/matchobject.cpp +++ b/searchsummary/src/vespa/juniper/matchobject.cpp @@ -3,6 +3,7 @@ #include "query.h" #include "matchobject.h" #include "juniperdebug.h" +#include "juniper_separators.h" #include "result.h" #include "charutil.h" #include "wildcard_match.h" @@ -10,6 +11,8 @@ #include <vespa/log/log.h> LOG_SETUP(".juniper.matchobject"); +using namespace juniper::separators; + class traverser : public IQueryExprVisitor { public: @@ -299,14 +302,11 @@ QueryTerm* match_iterator::first_match(Token& token) size_t len = token.curlen; // Check for interlinear annotation, and "lie" to the matchobject - if (*term == 0xFFF9) { - // 0xFFF9 = Interlinear Annotation ANCHOR - // 0xFFFA = Interlinear Annotation SEPARATOR - // 0xFFFB = Interlinear Annotation TERMINATOR + if (static_cast<char32_t>(*term) == interlinear_annotation_anchor) { const ucs4_t *terminator = term + len; token.token = ++term; // starting annotation, skip to after 
SEPARATOR - while (term < terminator && *term != 0xFFFA) { + while (term < terminator && static_cast<char32_t>(*term) != interlinear_annotation_separator) { term++; } const ucs4_t *separator = term; @@ -315,9 +315,9 @@ QueryTerm* match_iterator::first_match(Token& token) token.token = ++term; // skip the SEPARATOR QueryTerm *qt; // process until TERMINATOR is found - while (term < terminator && *term != 0xFFFB) { + while (term < terminator && static_cast<char32_t>(*term) != interlinear_annotation_terminator) { // Handle multiple terms in the same annotation, for compound nouns or multiple stems - if (*term == ' ' || *term == 0xFFFA) { + if (*term == ' ' || static_cast<char32_t>(*term) == interlinear_annotation_separator) { token.curlen = term - token.token; LOG(debug, "recurse A to match token %u..%u len %d", token.token[0], token.token[token.curlen-1], token.curlen); qt = this->first_match(token); diff --git a/searchsummary/src/vespa/juniper/sumdesc.cpp b/searchsummary/src/vespa/juniper/sumdesc.cpp index d6ac5e6e416..18e1b7bbd11 100644 --- a/searchsummary/src/vespa/juniper/sumdesc.cpp +++ b/searchsummary/src/vespa/juniper/sumdesc.cpp @@ -2,6 +2,7 @@ #include "sumdesc.h" #include "juniperdebug.h" +#include "juniper_separators.h" #include "Matcher.h" #include "appender.h" #include <vespa/fastlib/text/unicodeutil.h> @@ -9,6 +10,8 @@ #include <vespa/log/log.h> LOG_SETUP(".juniper.sumdesc"); +using namespace juniper::separators; + /** SummaryDesc: A class of objects describing a query highlight * dynamic summary based on the current state of the provided * matcher. 
@@ -29,10 +32,6 @@ char printable_char(char c) return c; } -constexpr ucs4_t il_ann_anchor = 0xfff9; -constexpr ucs4_t il_ann_separator = 0xfffa; -constexpr ucs4_t il_ann_terminator = 0xfffb; - bool wordchar(const unsigned char* s) { unsigned char c = *s; @@ -44,13 +43,13 @@ bool wordchar(const unsigned char* s) } } -bool wordchar_or_il_ann_char(const unsigned char* s, ucs4_t annotation_char) +bool wordchar_or_il_ann_char(const unsigned char* s, char32_t annotation_char) { unsigned char c = *s; if (c & 0x80) { ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s); return Fast_UnicodeUtil::IsWordChar(u) || - u == annotation_char; + static_cast<char32_t>(u) == annotation_char; } else { return isalnum(c); } @@ -58,12 +57,12 @@ bool wordchar_or_il_ann_char(const unsigned char* s, ucs4_t annotation_char) bool wordchar_or_il_ann_anchor(const unsigned char* s) { - return wordchar_or_il_ann_char(s, il_ann_anchor); + return wordchar_or_il_ann_char(s, interlinear_annotation_anchor); } bool wordchar_or_il_ann_terminator(const unsigned char* s) { - return wordchar_or_il_ann_char(s, il_ann_terminator); + return wordchar_or_il_ann_char(s, interlinear_annotation_terminator); } bool nonwordchar(const unsigned char* s) @@ -78,12 +77,12 @@ bool nonwordchar(const unsigned char* s) } bool -il_ann_char(const unsigned char* s, ucs4_t annotation_char) +il_ann_char(const unsigned char* s, char32_t annotation_char) { unsigned char c = *s; if (c & 0x80) { ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s); - return u == annotation_char; + return static_cast<char32_t>(u) == annotation_char; } else { return false; } @@ -92,19 +91,19 @@ il_ann_char(const unsigned char* s, ucs4_t annotation_char) bool il_ann_anchor_char(const unsigned char* s) { - return il_ann_char(s, il_ann_anchor); + return il_ann_char(s, interlinear_annotation_anchor); } bool il_ann_separator_char(const unsigned char* s) { - return il_ann_char(s, il_ann_separator); + return il_ann_char(s, interlinear_annotation_separator); } bool 
il_ann_terminator_char(const unsigned char* s) { - return il_ann_char(s, il_ann_terminator); + return il_ann_char(s, interlinear_annotation_terminator); } /* Move backwards/forwards from ptr (no longer than to start) in an diff --git a/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp index 8bf78b90c77..5bfe41ed1b0 100644 --- a/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp +++ b/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp @@ -28,6 +28,7 @@ #include <vespa/document/fieldvalue/tensorfieldvalue.h> #include <vespa/document/fieldvalue/referencefieldvalue.h> #include <vespa/eval/eval/value_codec.h> +#include <vespa/juniper/juniper_separators.h> #include <vespa/searchcommon/common/schema.h> #include <vespa/searchlib/util/url.h> #include <vespa/vespalib/geo/zcurve.h> @@ -201,15 +202,15 @@ struct SummaryHandler { if (annCnt > 1 || (annCnt == 1 && it->second)) { annotateSpans(span, it, last); } else { - out << getSpanString(text, span) << '\037'; + out << getSpanString(text, span) << juniper::separators::unit_separator_string; } } template <typename ForwardIt> void annotateSpans(const Span &span, ForwardIt it, ForwardIt last) { - out << "\357\277\271" // ANCHOR + out << juniper::separators::interlinear_annotation_anchor_string // ANCHOR << (getSpanString(text, span)) - << "\357\277\272"; // SEPARATOR + << juniper::separators::interlinear_annotation_separator_string; // SEPARATOR while (it != last) { if (it->second) { out << ensureStringFieldValue(*it->second).getValue(); @@ -220,8 +221,8 @@ struct SummaryHandler { out << " "; } } - out << "\357\277\273" // TERMINATOR - << "\037"; + out << juniper::separators::interlinear_annotation_terminator_string // TERMINATOR + << juniper::separators::unit_separator_string; } }; |