diff options
author | Tor Egge <Tor.Egge@online.no> | 2021-12-15 11:43:41 +0100 |
---|---|---|
committer | Tor Egge <Tor.Egge@online.no> | 2021-12-16 14:59:03 +0100 |
commit | 4331c9d2e31e69d99e3b8e0d2f8a1e4de6c62e39 (patch) | |
tree | 61fc2033c24dc4fe95452f080921969f77865470 /juniper | |
parent | 54327a0e714ef00b71007f7bef9f0f79a4d187fa (diff) |
Don't split interlinear annotations.
Diffstat (limited to 'juniper')
-rw-r--r-- | juniper/src/vespa/juniper/sumdesc.cpp | 126 |
1 file changed, 118 insertions, 8 deletions
```diff
diff --git a/juniper/src/vespa/juniper/sumdesc.cpp b/juniper/src/vespa/juniper/sumdesc.cpp
index 1a27ffaefb0..82b64d40971 100644
--- a/juniper/src/vespa/juniper/sumdesc.cpp
+++ b/juniper/src/vespa/juniper/sumdesc.cpp
@@ -18,6 +18,10 @@ LOG_SETUP(".juniper.sumdesc");
 
 namespace {
 
+constexpr ucs4_t il_ann_anchor = 0xfff9;
+constexpr ucs4_t il_ann_separator = 0xfffa;
+constexpr ucs4_t il_ann_terminator = 0xfffb;
+
 bool wordchar(const unsigned char* s)
 {
     unsigned char c = *s;
@@ -29,6 +33,30 @@ bool wordchar(const unsigned char* s)
     }
 }
 
+bool wordchar_or_il_ann_anchor(const unsigned char* s)
+{
+    unsigned char c = *s;
+    if (c & 0x80) {
+        ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+        return Fast_UnicodeUtil::IsWordChar(u) ||
+            u == il_ann_anchor;
+    } else {
+        return isalnum(c);
+    }
+}
+
+bool wordchar_or_il_ann_terminator(const unsigned char* s)
+{
+    unsigned char c = *s;
+    if (c & 0x80) {
+        ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+        return Fast_UnicodeUtil::IsWordChar(u) ||
+            u == il_ann_terminator;
+    } else {
+        return isalnum(c);
+    }
+}
+
 bool nonwordchar(const unsigned char* s)
 {
     unsigned char c = *s;
@@ -40,6 +68,41 @@ bool nonwordchar(const unsigned char* s)
     }
 }
 
+bool
+il_ann_anchor_char(const unsigned char* s)
+{
+    unsigned char c = *s;
+    if (c & 0x80) {
+        ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+        return u == il_ann_anchor;
+    } else {
+        return false;
+    }
+}
+
+bool
+il_ann_separator_char(const unsigned char* s)
+{
+    unsigned char c = *s;
+    if (c & 0x80) {
+        ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+        return u == il_ann_separator;
+    } else {
+        return false;
+    }
+}
+
+bool
+il_ann_terminator_char(const unsigned char* s)
+{
+    unsigned char c = *s;
+    if (c & 0x80) {
+        ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s);
+        return u == il_ann_terminator;
+    } else {
+        return false;
+    }
+}
 
 /* Move backwards/forwards from ptr (no longer than to start) in an
  * UTF8 text until the beginning of the word or (if space, until
@@ -66,10 +129,20 @@ int complete_word(unsigned char* start, ssize_t length,
     // Figure out if a word needs completion or if we are just going
     // to eliminate whitespace
     if (!wordchar(ptr)) {
-        whitespace_elim = true;
-        // Change direction of scan
-        increment = -increment;
-        chartest = wordchar;
+        if (increment > 0 && il_ann_anchor_char(ptr)) {
+            chartest = il_ann_terminator_char;
+        } else if (increment < 0 && il_ann_terminator_char(ptr)) {
+            chartest = il_ann_anchor_char;
+        } else {
+            whitespace_elim = true;
+            // Change direction of scan
+            increment = -increment;
+            if (increment > 0) {
+                chartest = wordchar_or_il_ann_anchor;
+            } else {
+                chartest = wordchar_or_il_ann_terminator;
+            }
+        }
     } else {
         // Found a wordchar at pointer
         // If moving forwards, we need to check the previous character
@@ -78,12 +151,16 @@ int complete_word(unsigned char* start, ssize_t length,
             const unsigned char* pre_ptr = ptr;
             int cur_move = Fast_UnicodeUtil::UTF8move(start, length,
                                                       pre_ptr, -1);
-            if (!wordchar(pre_ptr)) // Points at start of new word
+            if (!wordchar(pre_ptr) && !il_ann_terminator_char(pre_ptr)) // Points at start of new word
             {
                 whitespace_elim = true;
                 // Change direction of scan
                 increment = -increment;
-                chartest = wordchar;
+                if (increment > 0) {
+                    chartest = wordchar_or_il_ann_anchor;
+                } else {
+                    chartest = wordchar_or_il_ann_terminator;
+                }
                 ptr = pre_ptr;
                 moved += cur_move;
             } else {
@@ -118,6 +195,34 @@ int complete_word(unsigned char* start, ssize_t length,
             break;
         }
         if (chartest(ptr)) {
+            if (chartest == nonwordchar) {
+                if (il_ann_separator_char(ptr)) {
+                    if (increment > 0) {
+                        chartest = il_ann_terminator_char;
+                    } else {
+                        chartest = il_ann_anchor_char;
+                    }
+                    moved += cur_move;
+                    continue;
+                } else if (il_ann_terminator_char(ptr)) {
+                    if (increment < 0) {
+                        chartest = il_ann_anchor_char;
+                    }
+                    moved += cur_move;
+                    continue;
+                } else if (il_ann_anchor_char(ptr)) {
+                    if (increment > 0) {
+                        chartest = il_ann_terminator_char;
+                    }
+                    moved += cur_move;
+                    continue;
+                }
+            } else if ((chartest == il_ann_anchor_char) ||
+                       (chartest == il_ann_terminator_char)) {
+                chartest = nonwordchar;
+                moved += cur_move;
+                continue;
+            }
             LOG(spam, "complete_word: Breaking at char %c/0x%x (%d)",
                 *ptr, *ptr, cur_move);
             // count this character (it is the first blank/wordchar)
@@ -130,7 +235,9 @@ int complete_word(unsigned char* start, ssize_t length,
             break; // Found first blank/word char..
         }
         moved += cur_move;
-        if (moved >= MAX_SCAN_WORD) {
+        if (moved >= MAX_SCAN_WORD &&
+            (chartest != il_ann_anchor_char) &&
+            (chartest != il_ann_terminator_char)) {
             LOG(spam, "Word length extended max word length %d, "
                 "breaking at char 0x%x", MAX_SCAN_WORD, *ptr);
             break;
@@ -475,8 +582,11 @@ int SummaryDesc::complete_extended_token(unsigned char* start, ssize_t length,
         // Only a single connector character that connects word
         // characters should lead us to include more words in the
         // normal sense:
-        if (!wordchar(preptr))
+        if (!wordchar(preptr) &&
+            !(increment > 0 && il_ann_anchor_char(preptr)) &&
+            !(increment < 0 && il_ann_terminator_char(preptr))) {
             return moved;
+        }
 
         // If a block of chinese data does not contain any spaces we have to return
         // here in order to avoid searching all the way to the start/end.
```