summary refs log tree commit diff stats
path: root/juniper
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@online.no> 2021-12-15 11:43:41 +0100
committer Tor Egge <Tor.Egge@online.no> 2021-12-16 14:59:03 +0100
commit 4331c9d2e31e69d99e3b8e0d2f8a1e4de6c62e39 (patch)
tree 61fc2033c24dc4fe95452f080921969f77865470 /juniper
parent 54327a0e714ef00b71007f7bef9f0f79a4d187fa (diff)
Don't split interlinear annotations.
Diffstat (limited to 'juniper')
-rw-r--r-- juniper/src/vespa/juniper/sumdesc.cpp 126
1 file changed, 118 insertions, 8 deletions
diff --git a/juniper/src/vespa/juniper/sumdesc.cpp b/juniper/src/vespa/juniper/sumdesc.cpp
index 1a27ffaefb0..82b64d40971 100644
--- a/juniper/src/vespa/juniper/sumdesc.cpp
+++ b/juniper/src/vespa/juniper/sumdesc.cpp
@@ -18,6 +18,10 @@ LOG_SETUP(".juniper.sumdesc");
namespace {
+constexpr ucs4_t il_ann_anchor = 0xfff9; // U+FFF9 INTERLINEAR ANNOTATION ANCHOR
+constexpr ucs4_t il_ann_separator = 0xfffa; // U+FFFA INTERLINEAR ANNOTATION SEPARATOR
+constexpr ucs4_t il_ann_terminator = 0xfffb; // U+FFFB INTERLINEAR ANNOTATION TERMINATOR
+
bool wordchar(const unsigned char* s)
{
unsigned char c = *s;
@@ -29,6 +33,30 @@ bool wordchar(const unsigned char* s)
}
}
+bool wordchar_or_il_ann_anchor(const unsigned char* s) // true if the character at s is a word character or il_ann_anchor
+{
+ unsigned char c = *s;
+ if (c & 0x80) { // high bit set: not a plain ASCII byte, so classify the value returned by GetUTF8Char
+ ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+ return Fast_UnicodeUtil::IsWordChar(u) ||
+ u == il_ann_anchor; // the anchor is accepted in addition to ordinary word characters
+ } else {
+ return isalnum(c); // ASCII byte: classification is delegated to isalnum
+ }
+}
+
+bool wordchar_or_il_ann_terminator(const unsigned char* s) // true if the character at s is a word character or il_ann_terminator
+{
+ unsigned char c = *s;
+ if (c & 0x80) { // high bit set: not a plain ASCII byte, so classify the value returned by GetUTF8Char
+ ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+ return Fast_UnicodeUtil::IsWordChar(u) ||
+ u == il_ann_terminator; // the terminator is accepted in addition to ordinary word characters
+ } else {
+ return isalnum(c); // ASCII byte: classification is delegated to isalnum
+ }
+}
+
bool nonwordchar(const unsigned char* s)
{
unsigned char c = *s;
@@ -40,6 +68,41 @@ bool nonwordchar(const unsigned char* s)
}
}
+bool
+il_ann_anchor_char(const unsigned char* s) // true only if the character at s equals il_ann_anchor
+{
+ unsigned char c = *s;
+ if (c & 0x80) { // only bytes with the high bit set are passed to GetUTF8Char
+ ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+ return u == il_ann_anchor;
+ } else {
+ return false; // a byte below 0x80 is never reported as the anchor
+ }
+}
+
+bool
+il_ann_separator_char(const unsigned char* s) // true only if the character at s equals il_ann_separator
+{
+ unsigned char c = *s;
+ if (c & 0x80) { // only bytes with the high bit set are passed to GetUTF8Char
+ ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+ return u == il_ann_separator;
+ } else {
+ return false; // a byte below 0x80 is never reported as the separator
+ }
+}
+
+bool
+il_ann_terminator_char(const unsigned char* s) // true only if the character at s equals il_ann_terminator
+{
+ unsigned char c = *s;
+ if (c & 0x80) { // only bytes with the high bit set are passed to GetUTF8Char
+ ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+ return u == il_ann_terminator;
+ } else {
+ return false; // a byte below 0x80 is never reported as the terminator
+ }
+}
/* Move backwards/forwards from ptr (no longer than to start) in an
* UTF8 text until the beginning of the word or (if space, until
@@ -66,10 +129,20 @@ int complete_word(unsigned char* start, ssize_t length,
// Figure out if a word needs completion or if we are just going
// to eliminate whitespace
if (!wordchar(ptr)) {
- whitespace_elim = true;
- // Change direction of scan
- increment = -increment;
- chartest = wordchar;
+ if (increment > 0 && il_ann_anchor_char(ptr)) {
+ chartest = il_ann_terminator_char;
+ } else if (increment < 0 && il_ann_terminator_char(ptr)) {
+ chartest = il_ann_anchor_char;
+ } else {
+ whitespace_elim = true;
+ // Change direction of scan
+ increment = -increment;
+ if (increment > 0) {
+ chartest = wordchar_or_il_ann_anchor;
+ } else {
+ chartest = wordchar_or_il_ann_terminator;
+ }
+ }
} else {
// Found a wordchar at pointer
// If moving forwards, we need to check the previous character
@@ -78,12 +151,16 @@ int complete_word(unsigned char* start, ssize_t length,
const unsigned char* pre_ptr = ptr;
int cur_move = Fast_UnicodeUtil::UTF8move(start, length,
pre_ptr, -1);
- if (!wordchar(pre_ptr)) // Points at start of new word
+ if (!wordchar(pre_ptr) && !il_ann_terminator_char(pre_ptr)) // Points at start of new word
{
whitespace_elim = true;
// Change direction of scan
increment = -increment;
- chartest = wordchar;
+ if (increment > 0) {
+ chartest = wordchar_or_il_ann_anchor;
+ } else {
+ chartest = wordchar_or_il_ann_terminator;
+ }
ptr = pre_ptr;
moved += cur_move;
} else {
@@ -118,6 +195,34 @@ int complete_word(unsigned char* start, ssize_t length,
break;
}
if (chartest(ptr)) {
+ if (chartest == nonwordchar) {
+ if (il_ann_separator_char(ptr)) {
+ if (increment > 0) {
+ chartest = il_ann_terminator_char;
+ } else {
+ chartest = il_ann_anchor_char;
+ }
+ moved += cur_move;
+ continue;
+ } else if (il_ann_terminator_char(ptr)) {
+ if (increment < 0) {
+ chartest = il_ann_anchor_char;
+ }
+ moved += cur_move;
+ continue;
+ } else if (il_ann_anchor_char(ptr)) {
+ if (increment > 0) {
+ chartest = il_ann_terminator_char;
+ }
+ moved += cur_move;
+ continue;
+ }
+ } else if ((chartest == il_ann_anchor_char) ||
+ (chartest == il_ann_terminator_char)) {
+ chartest = nonwordchar;
+ moved += cur_move;
+ continue;
+ }
LOG(spam, "complete_word: Breaking at char %c/0x%x (%d)", *ptr,
*ptr, cur_move);
// count this character (it is the first blank/wordchar)
@@ -130,7 +235,9 @@ int complete_word(unsigned char* start, ssize_t length,
break; // Found first blank/word char..
}
moved += cur_move;
- if (moved >= MAX_SCAN_WORD) {
+ if (moved >= MAX_SCAN_WORD &&
+ (chartest != il_ann_anchor_char) &&
+ (chartest != il_ann_terminator_char)) {
LOG(spam, "Word length extended max word length %d, "
"breaking at char 0x%x", MAX_SCAN_WORD, *ptr);
break;
@@ -475,8 +582,11 @@ int SummaryDesc::complete_extended_token(unsigned char* start, ssize_t length,
// Only a single connector character that connects word
// characters should lead us to include more words in the
// normal sense:
- if (!wordchar(preptr))
+ if (!wordchar(preptr) &&
+ !(increment > 0 && il_ann_anchor_char(preptr)) &&
+ !(increment < 0 && il_ann_terminator_char(preptr))) {
return moved;
+ }
// If a block of chinese data does not contain any spaces we have to return
// here in order to avoid searching all the way to the start/end.