Don't split interlinear annotations.

author: Tor Egge <Tor.Egge@online.no> 2021-12-15 11:43:41 +0100
committer: Tor Egge <Tor.Egge@online.no> 2021-12-16 14:59:03 +0100
commit: 4331c9d2e31e69d99e3b8e0d2f8a1e4de6c62e39 (patch)
tree: 61fc2033c24dc4fe95452f080921969f77865470 /juniper
parent: 54327a0e714ef00b71007f7bef9f0f79a4d187fa (diff)
1 file changed, 118 insertions, 8 deletions
diff --git a/juniper/src/vespa/juniper/sumdesc.cpp b/juniper/src/vespa/juniper/sumdesc.cpp
index 1a27ffaefb0..82b64d40971 100644
--- a/juniper/src/vespa/juniper/sumdesc.cpp
+++ b/juniper/src/vespa/juniper/sumdesc.cpp
@@ -18,6 +18,10 @@ LOG_SETUP(".juniper.sumdesc");
 
 namespace {
 
+constexpr ucs4_t il_ann_anchor = 0xfff9; // U+FFF9 INTERLINEAR ANNOTATION ANCHOR: opens an annotated span
+constexpr ucs4_t il_ann_separator = 0xfffa; // U+FFFA INTERLINEAR ANNOTATION SEPARATOR: sits between annotated text and its annotation
+constexpr ucs4_t il_ann_terminator = 0xfffb; // U+FFFB INTERLINEAR ANNOTATION TERMINATOR: closes an annotated span
+
 bool wordchar(const unsigned char* s)
 {
     unsigned char c = *s;
@@ -29,6 +33,30 @@ bool wordchar(const unsigned char* s)
     }
 }
 
+bool wordchar_or_il_ann_anchor(const unsigned char* s) // True if s points at a word character or at an interlinear annotation anchor (U+FFF9).
+{
+    unsigned char c = *s;
+    if (c & 0x80) { // High bit set: not a 7-bit ASCII byte, so fetch the full character via GetUTF8Char
+        ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+        return Fast_UnicodeUtil::IsWordChar(u) ||
+            u == il_ann_anchor; // The anchor is accepted in addition to ordinary word characters
+    } else { // 7-bit ASCII byte: only alphanumerics count, and it can never be the anchor
+        return isalnum(c);
+    }
+}
+
+bool wordchar_or_il_ann_terminator(const unsigned char* s) // True if s points at a word character or at an interlinear annotation terminator (U+FFFB).
+{
+    unsigned char c = *s;
+    if (c & 0x80) { // High bit set: not a 7-bit ASCII byte, so fetch the full character via GetUTF8Char
+        ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+        return Fast_UnicodeUtil::IsWordChar(u) ||
+            u == il_ann_terminator; // The terminator is accepted in addition to ordinary word characters
+    } else { // 7-bit ASCII byte: only alphanumerics count, and it can never be the terminator
+        return isalnum(c);
+    }
+}
+
 bool nonwordchar(const unsigned char* s)
 {
     unsigned char c = *s;
@@ -40,6 +68,41 @@ bool nonwordchar(const unsigned char* s)
     }
 }
 
+bool // True only if s points at an interlinear annotation anchor (U+FFF9).
+il_ann_anchor_char(const unsigned char* s)
+{
+    unsigned char c = *s;
+    if (c & 0x80) { // Only a byte with the high bit set can start a non-ASCII character
+        ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+        return u == il_ann_anchor;
+    } else { // 7-bit ASCII byte: cannot be U+FFF9
+        return false;
+    }
+}
+
+bool // True only if s points at an interlinear annotation separator (U+FFFA).
+il_ann_separator_char(const unsigned char* s)
+{
+    unsigned char c = *s;
+    if (c & 0x80) { // Only a byte with the high bit set can start a non-ASCII character
+        ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+        return u == il_ann_separator;
+    } else { // 7-bit ASCII byte: cannot be U+FFFA
+        return false;
+    }
+}
+
+bool // True only if s points at an interlinear annotation terminator (U+FFFB).
+il_ann_terminator_char(const unsigned char* s)
+{
+    unsigned char c = *s;
+    if (c & 0x80) { // Only a byte with the high bit set can start a non-ASCII character
+        ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+        return u == il_ann_terminator;
+    } else { // 7-bit ASCII byte: cannot be U+FFFB
+        return false;
+    }
+}
 
 /* Move backwards/forwards from ptr (no longer than to start) in an
  * UTF8 text until the beginning of the word or (if space, until
@@ -66,10 +129,20 @@ int complete_word(unsigned char* start, ssize_t length,
     // Figure out if a word needs completion or if we are just going
     // to eliminate whitespace
     if (!wordchar(ptr)) {
-        whitespace_elim = true;
-        // Change direction of scan
-        increment = -increment;
-        chartest = wordchar;
+        if (increment > 0 && il_ann_anchor_char(ptr)) {
+            chartest = il_ann_terminator_char;
+        } else if (increment < 0 && il_ann_terminator_char(ptr)) {
+            chartest = il_ann_anchor_char;
+        } else {
+            whitespace_elim = true;
+            // Change direction of scan
+            increment = -increment;
+            if (increment > 0) {
+                chartest = wordchar_or_il_ann_anchor;
+            } else {
+                chartest = wordchar_or_il_ann_terminator;
+            }
+        }
     } else {
         // Found a wordchar at pointer
         // If moving forwards, we need to check the previous character
@@ -78,12 +151,16 @@ int complete_word(unsigned char* start, ssize_t length,
             const unsigned char* pre_ptr = ptr;
             int cur_move = Fast_UnicodeUtil::UTF8move(start, length,
                     pre_ptr, -1);
-            if (!wordchar(pre_ptr)) // Points at start of new word
+            if (!wordchar(pre_ptr) && !il_ann_terminator_char(pre_ptr)) // Points at start of new word
             {
                 whitespace_elim = true;
                 // Change direction of scan
                 increment = -increment;
-                chartest = wordchar;
+                if (increment > 0) {
+                    chartest = wordchar_or_il_ann_anchor;
+                } else {
+                    chartest = wordchar_or_il_ann_terminator;
+                }
                 ptr = pre_ptr;
                 moved += cur_move;
             } else {
@@ -118,6 +195,34 @@ int complete_word(unsigned char* start, ssize_t length,
             break;
         }
         if (chartest(ptr)) {
+            if (chartest == nonwordchar) {
+                if (il_ann_separator_char(ptr)) {
+                    if (increment > 0) {
+                        chartest = il_ann_terminator_char;
+                    } else {
+                        chartest = il_ann_anchor_char;
+                    }
+                    moved += cur_move;
+                    continue;
+                } else if (il_ann_terminator_char(ptr)) {
+                    if (increment < 0) {
+                        chartest = il_ann_anchor_char;
+                    }
+                    moved += cur_move;
+                    continue;
+                } else if (il_ann_anchor_char(ptr)) {
+                    if (increment > 0) {
+                        chartest = il_ann_terminator_char;
+                    }
+                    moved += cur_move;
+                    continue;
+                }
+            } else if ((chartest == il_ann_anchor_char) ||
+                       (chartest == il_ann_terminator_char)) {
+                chartest = nonwordchar;
+                moved += cur_move;
+                continue;
+            }
                LOG(spam, "complete_word: Breaking at char %c/0x%x (%d)", *ptr,
                    *ptr, cur_move);
             // count this character (it is the first blank/wordchar)
@@ -130,7 +235,9 @@ int complete_word(unsigned char* start, ssize_t length,
             break; // Found first blank/word char..
         }
         moved += cur_move;
-        if (moved >= MAX_SCAN_WORD) {
+        if (moved >= MAX_SCAN_WORD &&
+            (chartest != il_ann_anchor_char) &&
+            (chartest != il_ann_terminator_char)) {
             LOG(spam, "Word length extended max word length %d, "
                 "breaking at char 0x%x", MAX_SCAN_WORD, *ptr);
             break;
@@ -475,8 +582,11 @@ int SummaryDesc::complete_extended_token(unsigned char* start, ssize_t length,
         // Only a single connector character that connects word
         // characters should lead us to include more words in the
         // normal sense:
-        if (!wordchar(preptr))
+        if (!wordchar(preptr) &&
+            !(increment > 0 && il_ann_anchor_char(preptr)) &&
+            !(increment < 0 && il_ann_terminator_char(preptr))) {
             return moved;
+        }
 
 	// If a block of chinese data does not contain any spaces we have to return
 	// here in order to avoid searching all the way to the start/end.
author	Tor Egge <Tor.Egge@online.no>	2021-12-15 11:43:41 +0100
committer	Tor Egge <Tor.Egge@online.no>	2021-12-16 14:59:03 +0100
commit	4331c9d2e31e69d99e3b8e0d2f8a1e4de6c62e39 (patch)
tree	61fc2033c24dc4fe95452f080921969f77865470 /juniper
parent	54327a0e714ef00b71007f7bef9f0f79a4d187fa (diff)