summary refs log tree commit diff stats
path: root/searchsummary
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@online.no> 2022-09-07 15:26:53 +0200
committer Tor Egge <Tor.Egge@online.no> 2022-09-07 15:26:53 +0200
commit 3daae87bbeab7dad6022064c3f89c08e0cd009cf (patch)
tree 95764df6cb9d10a58b4bca9ccc8e947b5b7dc273 /searchsummary
parent fa7e942f65ca9c9be35c434bafb4a765ca5b7c50 (diff)
Consolidate juniper separators.
Diffstat (limited to 'searchsummary')
-rw-r--r-- searchsummary/src/tests/juniper/auxTest.cpp 11
-rw-r--r-- searchsummary/src/vespa/juniper/CMakeLists.txt 1
-rw-r--r-- searchsummary/src/vespa/juniper/config.cpp 5
-rw-r--r-- searchsummary/src/vespa/juniper/dpinterface.h 7
-rw-r--r-- searchsummary/src/vespa/juniper/juniper_separators.cpp 14
-rw-r--r-- searchsummary/src/vespa/juniper/juniper_separators.h 33
-rw-r--r-- searchsummary/src/vespa/juniper/matchobject.cpp 14
-rw-r--r-- searchsummary/src/vespa/juniper/sumdesc.cpp 25
-rw-r--r-- searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp 11
9 files changed, 82 insertions, 39 deletions
diff --git a/searchsummary/src/tests/juniper/auxTest.cpp b/searchsummary/src/tests/juniper/auxTest.cpp
index d4b65858e3e..ef22b0542af 100644
--- a/searchsummary/src/tests/juniper/auxTest.cpp
+++ b/searchsummary/src/tests/juniper/auxTest.cpp
@@ -1,6 +1,7 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
#include "auxTest.h"
+#include <vespa/juniper/juniper_separators.h>
#include <vespa/fastos/file.h>
#include <vespa/log/log.h>
LOG_SETUP(".auxtest");
@@ -394,11 +395,11 @@ void AuxTest::TestUTF8context()
// some content
std::string s(char_from_u8(u8"Fast leverer s\u00d8kemotorer og andre nyttige ting for \u00e5 finne frem p\u00e5 "));
s.append(char_from_u8(u8"internett. Teknologien er basert p\u00e5 \u00c5relang"));
- s += UNIT_SEPARATOR;
+ s += juniper::separators::unit_separator_string;
s.append(char_from_u8(u8"norsk innsats og forskning i"));
- s += GROUP_SEPARATOR;
+ s += juniper::separators::group_separator_string;
s.append(char_from_u8(u8"trondheimsmilj\u00f8et. M\u00b5ss med denne nye funksjonaliteten for \u00e5 vise frem"));
- s += UNIT_SEPARATOR;
+ s += juniper::separators::unit_separator_string;
s.append(char_from_u8(u8" beste forekomst av s\u00f8ket med s\u00f8kemotor til brukeren blir det enda bedre. "));
s.append(char_from_u8(u8"Hvis bare UTF8-kodingen virker som den skal for tegn som tar mer enn \u00e9n byte."));
@@ -415,8 +416,8 @@ void AuxTest::TestUTF8context()
_test(m.TotalMatchCnt(3) == 1 && m.ExactMatchCnt(2) == 1);
char separators[3];
- separators[0] = UNIT_SEPARATOR;
- separators[1] = GROUP_SEPARATOR;
+ separators[0] = juniper::separators::unit_separator;
+ separators[1] = juniper::separators::group_separator;
separators[2] = '\0';
if (color_highlight)
diff --git a/searchsummary/src/vespa/juniper/CMakeLists.txt b/searchsummary/src/vespa/juniper/CMakeLists.txt
index 3d6b72ef511..1d47885b1a9 100644
--- a/searchsummary/src/vespa/juniper/CMakeLists.txt
+++ b/searchsummary/src/vespa/juniper/CMakeLists.txt
@@ -7,6 +7,7 @@ vespa_add_library(searchsummary_juniper OBJECT
mcand.cpp
keyocc.cpp
juniperparams.cpp
+ juniper_separators.cpp
SummaryConfig.cpp
tokenizer.cpp
propreader.cpp
diff --git a/searchsummary/src/vespa/juniper/config.cpp b/searchsummary/src/vespa/juniper/config.cpp
index 3daebfd1ea8..5859ea8336e 100644
--- a/searchsummary/src/vespa/juniper/config.cpp
+++ b/searchsummary/src/vespa/juniper/config.cpp
@@ -3,6 +3,7 @@
#include "config.h"
#include "rpinterface.h"
#include "juniperdebug.h"
+#include "juniper_separators.h"
#define _NEED_SUMMARY_CONFIG_IMPL
#include "SummaryConfig.h"
#include <vespa/vespalib/locale/c.h>
@@ -18,8 +19,8 @@ Config::Config(const char* config_name, const Juniper & juniper) :
_juniper(juniper)
{
std::string separators = "";
- separators += UNIT_SEPARATOR;
- separators += GROUP_SEPARATOR;
+ separators += separators::unit_separator_string;
+ separators += separators::group_separator_string;
const char* high_on = GetProp("dynsum.highlight_on", "<b>");
const char* high_off = GetProp("dynsum.highlight_off", "</b>");
diff --git a/searchsummary/src/vespa/juniper/dpinterface.h b/searchsummary/src/vespa/juniper/dpinterface.h
index 8f538cee05d..b5f302152c6 100644
--- a/searchsummary/src/vespa/juniper/dpinterface.h
+++ b/searchsummary/src/vespa/juniper/dpinterface.h
@@ -11,13 +11,6 @@
* rpinterface.h
*/
-/** The GS character used to separate paragraphs */
-#define GROUP_SEPARATOR 0x1D
-
-/** The US character used to separate words in CJK texts */
-#define UNIT_SEPARATOR 0x1F
-
-
namespace juniper {
/** class Tokentype Hint as to which type of token this is.
diff --git a/searchsummary/src/vespa/juniper/juniper_separators.cpp b/searchsummary/src/vespa/juniper/juniper_separators.cpp
new file mode 100644
index 00000000000..9342d3d34dc
--- /dev/null
+++ b/searchsummary/src/vespa/juniper/juniper_separators.cpp
@@ -0,0 +1,14 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "juniper_separators.h"
+
+namespace juniper::separators {
+
+vespalib::string interlinear_annotation_anchor_string("\xef\xbf\xb9"); // U+FFF9
+vespalib::string interlinear_annotation_separator_string("\xef\xbf\xba"); // U+FFFA
+vespalib::string interlinear_annotation_terminator_string("\xef\xbf\xbb"); // U+FFFB
+vespalib::string group_separator_string("\x1d");
+vespalib::string record_separator_string("\x1e");
+vespalib::string unit_separator_string("\x1f");
+
+}
diff --git a/searchsummary/src/vespa/juniper/juniper_separators.h b/searchsummary/src/vespa/juniper/juniper_separators.h
new file mode 100644
index 00000000000..03b0945138b
--- /dev/null
+++ b/searchsummary/src/vespa/juniper/juniper_separators.h
@@ -0,0 +1,33 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/stllike/string.h>
+
+namespace juniper::separators {
+
+// Separators used in strings passed to juniper.
+
+// UTF-8 encoded separators
+extern vespalib::string interlinear_annotation_anchor_string;
+extern vespalib::string interlinear_annotation_separator_string;
+extern vespalib::string interlinear_annotation_terminator_string;
+extern vespalib::string group_separator_string;
+extern vespalib::string record_separator_string;
+extern vespalib::string unit_separator_string;
+
+// UTF-32 separators
+constexpr char32_t interlinear_annotation_anchor = U'\xfff9';
+constexpr char32_t interlinear_annotation_separator = U'\xfffa';
+constexpr char32_t interlinear_annotation_terminator = U'\xfffb';
+
+// The GS character used to separate paragraphs
+constexpr char8_t group_separator = u8'\x1d';
+
+// The RS character
+constexpr char8_t record_separator = u8'\x1e';
+
+// The US character used to separate words in CJK texts
+constexpr char8_t unit_separator = u8'\x1f';
+
+}
diff --git a/searchsummary/src/vespa/juniper/matchobject.cpp b/searchsummary/src/vespa/juniper/matchobject.cpp
index 376f970d73b..60b14cd5bca 100644
--- a/searchsummary/src/vespa/juniper/matchobject.cpp
+++ b/searchsummary/src/vespa/juniper/matchobject.cpp
@@ -3,6 +3,7 @@
#include "query.h"
#include "matchobject.h"
#include "juniperdebug.h"
+#include "juniper_separators.h"
#include "result.h"
#include "charutil.h"
#include "wildcard_match.h"
@@ -10,6 +11,8 @@
#include <vespa/log/log.h>
LOG_SETUP(".juniper.matchobject");
+using namespace juniper::separators;
+
class traverser : public IQueryExprVisitor
{
public:
@@ -299,14 +302,11 @@ QueryTerm* match_iterator::first_match(Token& token)
size_t len = token.curlen;
// Check for interlinear annotation, and "lie" to the matchobject
- if (*term == 0xFFF9) {
- // 0xFFF9 = Interlinear Annotation ANCHOR
- // 0xFFFA = Interlinear Annotation SEPARATOR
- // 0xFFFB = Interlinear Annotation TERMINATOR
+ if (static_cast<char32_t>(*term) == interlinear_annotation_anchor) {
const ucs4_t *terminator = term + len;
token.token = ++term;
// starting annotation, skip to after SEPARATOR
- while (term < terminator && *term != 0xFFFA) {
+ while (term < terminator && static_cast<char32_t>(*term) != interlinear_annotation_separator) {
term++;
}
const ucs4_t *separator = term;
@@ -315,9 +315,9 @@ QueryTerm* match_iterator::first_match(Token& token)
token.token = ++term; // skip the SEPARATOR
QueryTerm *qt;
// process until TERMINATOR is found
- while (term < terminator && *term != 0xFFFB) {
+ while (term < terminator && static_cast<char32_t>(*term) != interlinear_annotation_terminator) {
// Handle multiple terms in the same annotation, for compound nouns or multiple stems
- if (*term == ' ' || *term == 0xFFFA) {
+ if (*term == ' ' || static_cast<char32_t>(*term) == interlinear_annotation_separator) {
token.curlen = term - token.token;
LOG(debug, "recurse A to match token %u..%u len %d", token.token[0], token.token[token.curlen-1], token.curlen);
qt = this->first_match(token);
diff --git a/searchsummary/src/vespa/juniper/sumdesc.cpp b/searchsummary/src/vespa/juniper/sumdesc.cpp
index d6ac5e6e416..18e1b7bbd11 100644
--- a/searchsummary/src/vespa/juniper/sumdesc.cpp
+++ b/searchsummary/src/vespa/juniper/sumdesc.cpp
@@ -2,6 +2,7 @@
#include "sumdesc.h"
#include "juniperdebug.h"
+#include "juniper_separators.h"
#include "Matcher.h"
#include "appender.h"
#include <vespa/fastlib/text/unicodeutil.h>
@@ -9,6 +10,8 @@
#include <vespa/log/log.h>
LOG_SETUP(".juniper.sumdesc");
+using namespace juniper::separators;
+
/** SummaryDesc: A class of objects describing a query highlight
* dynamic summary based on the current state of the provided
* matcher.
@@ -29,10 +32,6 @@ char printable_char(char c)
return c;
}
-constexpr ucs4_t il_ann_anchor = 0xfff9;
-constexpr ucs4_t il_ann_separator = 0xfffa;
-constexpr ucs4_t il_ann_terminator = 0xfffb;
-
bool wordchar(const unsigned char* s)
{
unsigned char c = *s;
@@ -44,13 +43,13 @@ bool wordchar(const unsigned char* s)
}
}
-bool wordchar_or_il_ann_char(const unsigned char* s, ucs4_t annotation_char)
+bool wordchar_or_il_ann_char(const unsigned char* s, char32_t annotation_char)
{
unsigned char c = *s;
if (c & 0x80) {
ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
return Fast_UnicodeUtil::IsWordChar(u) ||
- u == annotation_char;
+ static_cast<char32_t>(u) == annotation_char;
} else {
return isalnum(c);
}
@@ -58,12 +57,12 @@ bool wordchar_or_il_ann_char(const unsigned char* s, ucs4_t annotation_char)
bool wordchar_or_il_ann_anchor(const unsigned char* s)
{
- return wordchar_or_il_ann_char(s, il_ann_anchor);
+ return wordchar_or_il_ann_char(s, interlinear_annotation_anchor);
}
bool wordchar_or_il_ann_terminator(const unsigned char* s)
{
- return wordchar_or_il_ann_char(s, il_ann_terminator);
+ return wordchar_or_il_ann_char(s, interlinear_annotation_terminator);
}
bool nonwordchar(const unsigned char* s)
@@ -78,12 +77,12 @@ bool nonwordchar(const unsigned char* s)
}
bool
-il_ann_char(const unsigned char* s, ucs4_t annotation_char)
+il_ann_char(const unsigned char* s, char32_t annotation_char)
{
unsigned char c = *s;
if (c & 0x80) {
ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
- return u == annotation_char;
+ return static_cast<char32_t>(u) == annotation_char;
} else {
return false;
}
@@ -92,19 +91,19 @@ il_ann_char(const unsigned char* s, ucs4_t annotation_char)
bool
il_ann_anchor_char(const unsigned char* s)
{
- return il_ann_char(s, il_ann_anchor);
+ return il_ann_char(s, interlinear_annotation_anchor);
}
bool
il_ann_separator_char(const unsigned char* s)
{
- return il_ann_char(s, il_ann_separator);
+ return il_ann_char(s, interlinear_annotation_separator);
}
bool
il_ann_terminator_char(const unsigned char* s)
{
- return il_ann_char(s, il_ann_terminator);
+ return il_ann_char(s, interlinear_annotation_terminator);
}
/* Move backwards/forwards from ptr (no longer than to start) in an
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp
index 8bf78b90c77..5bfe41ed1b0 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp
@@ -28,6 +28,7 @@
#include <vespa/document/fieldvalue/tensorfieldvalue.h>
#include <vespa/document/fieldvalue/referencefieldvalue.h>
#include <vespa/eval/eval/value_codec.h>
+#include <vespa/juniper/juniper_separators.h>
#include <vespa/searchcommon/common/schema.h>
#include <vespa/searchlib/util/url.h>
#include <vespa/vespalib/geo/zcurve.h>
@@ -201,15 +202,15 @@ struct SummaryHandler {
if (annCnt > 1 || (annCnt == 1 && it->second)) {
annotateSpans(span, it, last);
} else {
- out << getSpanString(text, span) << '\037';
+ out << getSpanString(text, span) << juniper::separators::unit_separator_string;
}
}
template <typename ForwardIt>
void annotateSpans(const Span &span, ForwardIt it, ForwardIt last) {
- out << "\357\277\271" // ANCHOR
+ out << juniper::separators::interlinear_annotation_anchor_string // ANCHOR
<< (getSpanString(text, span))
- << "\357\277\272"; // SEPARATOR
+ << juniper::separators::interlinear_annotation_separator_string; // SEPARATOR
while (it != last) {
if (it->second) {
out << ensureStringFieldValue(*it->second).getValue();
@@ -220,8 +221,8 @@ struct SummaryHandler {
out << " ";
}
}
- out << "\357\277\273" // TERMINATOR
- << "\037";
+ out << juniper::separators::interlinear_annotation_terminator_string // TERMINATOR
+ << juniper::separators::unit_separator_string;
}
};