Consolidate juniper separators.

author: Tor Egge <Tor.Egge@online.no> 2022-09-07 15:26:53 +0200
committer: Tor Egge <Tor.Egge@online.no> 2022-09-07 15:26:53 +0200
commit: 3daae87bbeab7dad6022064c3f89c08e0cd009cf (patch)
tree: 95764df6cb9d10a58b4bca9ccc8e947b5b7dc273 /searchsummary
parent: fa7e942f65ca9c9be35c434bafb4a765ca5b7c50 (diff)
9 files changed, 82 insertions, 39 deletions
diff --git a/searchsummary/src/tests/juniper/auxTest.cpp b/searchsummary/src/tests/juniper/auxTest.cpp
index d4b65858e3e..ef22b0542af 100644
--- a/searchsummary/src/tests/juniper/auxTest.cpp
+++ b/searchsummary/src/tests/juniper/auxTest.cpp
@@ -1,6 +1,7 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 
 #include "auxTest.h"
+#include <vespa/juniper/juniper_separators.h>
 #include <vespa/fastos/file.h>
 #include <vespa/log/log.h>
 LOG_SETUP(".auxtest");
@@ -394,11 +395,11 @@ void AuxTest::TestUTF8context()
     // some content
     std::string s(char_from_u8(u8"Fast leverer s\u00d8kemotorer og andre nyttige ting for \u00e5 finne frem p\u00e5 "));
     s.append(char_from_u8(u8"internett. Teknologien er basert p\u00e5 \u00c5relang"));
-    s += UNIT_SEPARATOR;
+    s += juniper::separators::unit_separator_string;
     s.append(char_from_u8(u8"norsk innsats og forskning i"));
-    s += GROUP_SEPARATOR;
+    s += juniper::separators::group_separator_string;
     s.append(char_from_u8(u8"trondheimsmilj\u00f8et. M\u00b5ss med denne nye funksjonaliteten for \u00e5 vise frem"));
-    s += UNIT_SEPARATOR;
+    s += juniper::separators::unit_separator_string;
     s.append(char_from_u8(u8" beste forekomst av s\u00f8ket med s\u00f8kemotor til brukeren blir det enda bedre. "));
     s.append(char_from_u8(u8"Hvis bare UTF8-kodingen virker som den skal for tegn som tar mer enn \u00e9n byte."));
 
@@ -415,8 +416,8 @@ void AuxTest::TestUTF8context()
     _test(m.TotalMatchCnt(3) == 1 && m.ExactMatchCnt(2) == 1);
 
     char separators[3];
-    separators[0] = UNIT_SEPARATOR;
-    separators[1] = GROUP_SEPARATOR;
+    separators[0] = juniper::separators::unit_separator;
+    separators[1] = juniper::separators::group_separator;
     separators[2] = '\0';
 
     if (color_highlight)
diff --git a/searchsummary/src/vespa/juniper/CMakeLists.txt b/searchsummary/src/vespa/juniper/CMakeLists.txt
index 3d6b72ef511..1d47885b1a9 100644
--- a/searchsummary/src/vespa/juniper/CMakeLists.txt
+++ b/searchsummary/src/vespa/juniper/CMakeLists.txt
@@ -7,6 +7,7 @@ vespa_add_library(searchsummary_juniper OBJECT
     mcand.cpp
     keyocc.cpp
     juniperparams.cpp
+    juniper_separators.cpp
     SummaryConfig.cpp
     tokenizer.cpp
     propreader.cpp
diff --git a/searchsummary/src/vespa/juniper/config.cpp b/searchsummary/src/vespa/juniper/config.cpp
index 3daebfd1ea8..5859ea8336e 100644
--- a/searchsummary/src/vespa/juniper/config.cpp
+++ b/searchsummary/src/vespa/juniper/config.cpp
@@ -3,6 +3,7 @@
 #include "config.h"
 #include "rpinterface.h"
 #include "juniperdebug.h"
+#include "juniper_separators.h"
 #define _NEED_SUMMARY_CONFIG_IMPL
 #include "SummaryConfig.h"
 #include <vespa/vespalib/locale/c.h>
@@ -18,8 +19,8 @@ Config::Config(const char* config_name, const Juniper & juniper) :
     _juniper(juniper)
 {
     std::string separators = "";
-    separators += UNIT_SEPARATOR;
-    separators += GROUP_SEPARATOR;
+    separators += separators::unit_separator_string;
+    separators += separators::group_separator_string;
 
     const char* high_on  = GetProp("dynsum.highlight_on", "<b>");
     const char* high_off = GetProp("dynsum.highlight_off", "</b>");
diff --git a/searchsummary/src/vespa/juniper/dpinterface.h b/searchsummary/src/vespa/juniper/dpinterface.h
index 8f538cee05d..b5f302152c6 100644
--- a/searchsummary/src/vespa/juniper/dpinterface.h
+++ b/searchsummary/src/vespa/juniper/dpinterface.h
@@ -11,13 +11,6 @@
  *    rpinterface.h
  */
 
-/** The GS character used to separate paragraphs */
-#define GROUP_SEPARATOR   0x1D
-
-/** The US character used to separate words in CJK texts */
-#define UNIT_SEPARATOR    0x1F
-
-
 namespace juniper {
 
 /** class Tokentype Hint as to which type of token this is.
diff --git a/searchsummary/src/vespa/juniper/juniper_separators.cpp b/searchsummary/src/vespa/juniper/juniper_separators.cpp
new file mode 100644
index 00000000000..9342d3d34dc
--- /dev/null
+++ b/searchsummary/src/vespa/juniper/juniper_separators.cpp
@@ -0,0 +1,14 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "juniper_separators.h"
+
+namespace juniper::separators {
+
+vespalib::string interlinear_annotation_anchor_string("\xef\xbf\xb9"); // U+FFF9
+vespalib::string interlinear_annotation_separator_string("\xef\xbf\xba"); // U+FFFA
+vespalib::string interlinear_annotation_terminator_string("\xef\xbf\xbb"); // U+FFFB
+vespalib::string group_separator_string("\x1d");
+vespalib::string record_separator_string("\x1e");
+vespalib::string unit_separator_string("\x1f");
+
+}
diff --git a/searchsummary/src/vespa/juniper/juniper_separators.h b/searchsummary/src/vespa/juniper/juniper_separators.h
new file mode 100644
index 00000000000..03b0945138b
--- /dev/null
+++ b/searchsummary/src/vespa/juniper/juniper_separators.h
@@ -0,0 +1,33 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include <vespa/vespalib/stllike/string.h>
+
+namespace juniper::separators {
+
+// Separators used in strings passed to juniper.
+
+// UTF-8 encoded separators
+extern vespalib::string interlinear_annotation_anchor_string;
+extern vespalib::string interlinear_annotation_separator_string;
+extern vespalib::string interlinear_annotation_terminator_string;
+extern vespalib::string group_separator_string;
+extern vespalib::string record_separator_string;
+extern vespalib::string unit_separator_string;
+
+// UTF-32 separators
+constexpr char32_t interlinear_annotation_anchor = U'\xfff9';
+constexpr char32_t interlinear_annotation_separator = U'\xfffa';
+constexpr char32_t interlinear_annotation_terminator = U'\xfffb';
+
+// The GS character used to separate paragraphs
+constexpr char8_t group_separator = u8'\x1d';
+
+// The RS (record separator) character
+constexpr char8_t record_separator = u8'\x1e';
+
+// The US character used to separate words in CJK texts
+constexpr char8_t unit_separator  = u8'\x1f';
+
+}
diff --git a/searchsummary/src/vespa/juniper/matchobject.cpp b/searchsummary/src/vespa/juniper/matchobject.cpp
index 376f970d73b..60b14cd5bca 100644
--- a/searchsummary/src/vespa/juniper/matchobject.cpp
+++ b/searchsummary/src/vespa/juniper/matchobject.cpp
@@ -3,6 +3,7 @@
 #include "query.h"
 #include "matchobject.h"
 #include "juniperdebug.h"
+#include "juniper_separators.h"
 #include "result.h"
 #include "charutil.h"
 #include "wildcard_match.h"
@@ -10,6 +11,8 @@
 #include <vespa/log/log.h>
 LOG_SETUP(".juniper.matchobject");
 
+using namespace juniper::separators;
+
 class traverser : public IQueryExprVisitor
 {
 public:
@@ -299,14 +302,11 @@ QueryTerm* match_iterator::first_match(Token& token)
     size_t len = token.curlen;
 
     // Check for interlinear annotation, and "lie" to the matchobject
-    if (*term == 0xFFF9) {
-        // 0xFFF9 = Interlinear Annotation ANCHOR
-        // 0xFFFA = Interlinear Annotation SEPARATOR
-        // 0xFFFB = Interlinear Annotation TERMINATOR
+    if (static_cast<char32_t>(*term) == interlinear_annotation_anchor) {
         const ucs4_t *terminator = term + len;
         token.token = ++term;
         // starting annotation, skip to after SEPARATOR
-        while (term < terminator && *term != 0xFFFA) {
+        while (term < terminator && static_cast<char32_t>(*term) != interlinear_annotation_separator) {
             term++;
         }
         const ucs4_t *separator = term;
@@ -315,9 +315,9 @@ QueryTerm* match_iterator::first_match(Token& token)
             token.token = ++term; // skip the SEPARATOR
             QueryTerm *qt;
             // process until TERMINATOR is found
-            while (term < terminator && *term != 0xFFFB) {
+            while (term < terminator && static_cast<char32_t>(*term) != interlinear_annotation_terminator) {
                 // Handle multiple terms in the same annotation, for compound nouns or multiple stems
-                if (*term == ' ' || *term == 0xFFFA) {
+                if (*term == ' ' || static_cast<char32_t>(*term) == interlinear_annotation_separator) {
                     token.curlen = term - token.token;
                     LOG(debug, "recurse A to match token %u..%u len %d", token.token[0], token.token[token.curlen-1], token.curlen);
                     qt = this->first_match(token);
diff --git a/searchsummary/src/vespa/juniper/sumdesc.cpp b/searchsummary/src/vespa/juniper/sumdesc.cpp
index d6ac5e6e416..18e1b7bbd11 100644
--- a/searchsummary/src/vespa/juniper/sumdesc.cpp
+++ b/searchsummary/src/vespa/juniper/sumdesc.cpp
@@ -2,6 +2,7 @@
 
 #include "sumdesc.h"
 #include "juniperdebug.h"
+#include "juniper_separators.h"
 #include "Matcher.h"
 #include "appender.h"
 #include <vespa/fastlib/text/unicodeutil.h>
@@ -9,6 +10,8 @@
 #include <vespa/log/log.h>
 LOG_SETUP(".juniper.sumdesc");
 
+using namespace juniper::separators;
+
 /** SummaryDesc: A class of objects describing a query highlight
  *  dynamic summary based on the current state of the provided
  *  matcher.
@@ -29,10 +32,6 @@ char printable_char(char c)
     return c;
 }
 
-constexpr ucs4_t il_ann_anchor = 0xfff9;
-constexpr ucs4_t il_ann_separator = 0xfffa;
-constexpr ucs4_t il_ann_terminator = 0xfffb;
-
 bool wordchar(const unsigned char* s)
 {
     unsigned char c = *s;
@@ -44,13 +43,13 @@ bool wordchar(const unsigned char* s)
     }
 }
 
-bool wordchar_or_il_ann_char(const unsigned char* s, ucs4_t annotation_char)
+bool wordchar_or_il_ann_char(const unsigned char* s, char32_t annotation_char)
 {
     unsigned char c = *s;
     if (c & 0x80) {
         ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
         return Fast_UnicodeUtil::IsWordChar(u) ||
-            u == annotation_char;
+            static_cast<char32_t>(u) == annotation_char;
     } else {
         return isalnum(c);
     }
@@ -58,12 +57,12 @@ bool wordchar_or_il_ann_char(const unsigned char* s, ucs4_t annotation_char)
 
 bool wordchar_or_il_ann_anchor(const unsigned char* s)
 {
-    return wordchar_or_il_ann_char(s, il_ann_anchor);
+    return wordchar_or_il_ann_char(s, interlinear_annotation_anchor);
 }
 
 bool wordchar_or_il_ann_terminator(const unsigned char* s)
 {
-    return wordchar_or_il_ann_char(s, il_ann_terminator);
+    return wordchar_or_il_ann_char(s, interlinear_annotation_terminator);
 }
 
 bool nonwordchar(const unsigned char* s)
@@ -78,12 +77,12 @@ bool nonwordchar(const unsigned char* s)
 }
 
 bool
-il_ann_char(const unsigned char* s, ucs4_t annotation_char)
+il_ann_char(const unsigned char* s, char32_t annotation_char)
 {
     unsigned char c = *s;
     if (c & 0x80) {
         ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
-        return u == annotation_char;
+        return static_cast<char32_t>(u) == annotation_char;
     } else {
         return false;
     }
@@ -92,19 +91,19 @@ il_ann_char(const unsigned char* s, ucs4_t annotation_char)
 bool
 il_ann_anchor_char(const unsigned char* s)
 {
-    return il_ann_char(s, il_ann_anchor);
+    return il_ann_char(s, interlinear_annotation_anchor);
 }
 
 bool
 il_ann_separator_char(const unsigned char* s)
 {
-    return il_ann_char(s, il_ann_separator);
+    return il_ann_char(s, interlinear_annotation_separator);
 }
 
 bool
 il_ann_terminator_char(const unsigned char* s)
 {
-    return il_ann_char(s, il_ann_terminator);
+    return il_ann_char(s, interlinear_annotation_terminator);
 }
 
 /* Move backwards/forwards from ptr (no longer than to start) in an
diff --git a/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp b/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp
index 8bf78b90c77..5bfe41ed1b0 100644
--- a/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp
+++ b/searchsummary/src/vespa/searchsummary/docsummary/summaryfieldconverter.cpp
@@ -28,6 +28,7 @@
 #include <vespa/document/fieldvalue/tensorfieldvalue.h>
 #include <vespa/document/fieldvalue/referencefieldvalue.h>
 #include <vespa/eval/eval/value_codec.h>
+#include <vespa/juniper/juniper_separators.h>
 #include <vespa/searchcommon/common/schema.h>
 #include <vespa/searchlib/util/url.h>
 #include <vespa/vespalib/geo/zcurve.h>
@@ -201,15 +202,15 @@ struct SummaryHandler {
         if (annCnt > 1 || (annCnt == 1 && it->second)) {
             annotateSpans(span, it, last);
         } else {
-            out << getSpanString(text, span) << '\037';
+            out << getSpanString(text, span) << juniper::separators::unit_separator_string;
         }
     }
 
     template <typename ForwardIt>
     void annotateSpans(const Span &span, ForwardIt it, ForwardIt last) {
-        out << "\357\277\271"  // ANCHOR
+        out << juniper::separators::interlinear_annotation_anchor_string  // ANCHOR
             << (getSpanString(text, span))
-            << "\357\277\272"; // SEPARATOR
+            << juniper::separators::interlinear_annotation_separator_string; // SEPARATOR
         while (it != last) {
             if (it->second) {
                 out << ensureStringFieldValue(*it->second).getValue();
@@ -220,8 +221,8 @@ struct SummaryHandler {
                 out << " ";
             }
         }
-        out << "\357\277\273"  // TERMINATOR
-            << "\037";
+        out << juniper::separators::interlinear_annotation_terminator_string  // TERMINATOR
+            << juniper::separators::unit_separator_string;
     }
 };
author	Tor Egge <Tor.Egge@online.no>	2022-09-07 15:26:53 +0200
committer	Tor Egge <Tor.Egge@online.no>	2022-09-07 15:26:53 +0200
commit	3daae87bbeab7dad6022064c3f89c08e0cd009cf (patch)
tree	95764df6cb9d10a58b4bca9ccc8e947b5b7dc273 /searchsummary
parent	fa7e942f65ca9c9be35c434bafb4a765ca5b7c50 (diff)