diff options
author | Arnstein Ressem <aressem@gmail.com> | 2021-12-17 08:57:21 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-12-17 08:57:21 +0100 |
commit | 323751e6ff3805d41b8616ac04ff4d65593088ca (patch) | |
tree | 7ff6aa3f7c1edf6a789afcf78d848d8825507b71 /juniper | |
parent | 2eaa27f75d37737eaac6e14a2ac68d94646eb209 (diff) |
Revert "Don't split interlinear annotations."
Diffstat (limited to 'juniper')
-rw-r--r-- | juniper/src/vespa/juniper/sumdesc.cpp | 122 |
1 files changed, 9 insertions, 113 deletions
diff --git a/juniper/src/vespa/juniper/sumdesc.cpp b/juniper/src/vespa/juniper/sumdesc.cpp index 331a46fdbfa..1a27ffaefb0 100644 --- a/juniper/src/vespa/juniper/sumdesc.cpp +++ b/juniper/src/vespa/juniper/sumdesc.cpp @@ -18,10 +18,6 @@ LOG_SETUP(".juniper.sumdesc"); namespace { -constexpr ucs4_t il_ann_anchor = 0xfff9; -constexpr ucs4_t il_ann_separator = 0xfffa; -constexpr ucs4_t il_ann_terminator = 0xfffb; - bool wordchar(const unsigned char* s) { unsigned char c = *s; @@ -33,28 +29,6 @@ bool wordchar(const unsigned char* s) } } -bool wordchar_or_il_ann_char(const unsigned char* s, ucs4_t annotation_char) -{ - unsigned char c = *s; - if (c & 0x80) { - ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s); - return Fast_UnicodeUtil::IsWordChar(u) || - u == annotation_char; - } else { - return isalnum(c); - } -} - -bool wordchar_or_il_ann_anchor(const unsigned char* s) -{ - return wordchar_or_il_ann_char(s, il_ann_anchor); -} - -bool wordchar_or_il_ann_terminator(const unsigned char* s) -{ - return wordchar_or_il_ann_char(s, il_ann_terminator); -} - bool nonwordchar(const unsigned char* s) { unsigned char c = *s; @@ -66,35 +40,6 @@ bool nonwordchar(const unsigned char* s) } } -bool -il_ann_char(const unsigned char* s, ucs4_t annotation_char) -{ - unsigned char c = *s; - if (c & 0x80) { - ucs4_t u = Fast_UnicodeUtil::GetUTF8Char(s); - return u == annotation_char; - } else { - return false; - } -} - -bool -il_ann_anchor_char(const unsigned char* s) -{ - return il_ann_char(s, il_ann_anchor); -} - -bool -il_ann_separator_char(const unsigned char* s) -{ - return il_ann_char(s, il_ann_separator); -} - -bool -il_ann_terminator_char(const unsigned char* s) -{ - return il_ann_char(s, il_ann_terminator); -} /* Move backwards/forwards from ptr (no longer than to start) in an * UTF8 text until the beginning of the word or (if space, until @@ -119,24 +64,12 @@ int complete_word(unsigned char* start, ssize_t length, } // Figure out if a word needs completion or if we are just going - // to eliminate whitespace. Consider sequence from interlinear - // annotation anchor to interlinear annotation terminator to be a - // word. + // to eliminate whitespace if (!wordchar(ptr)) { - if (increment > 0 && il_ann_anchor_char(ptr)) { - chartest = il_ann_terminator_char; - } else if (increment < 0 && il_ann_terminator_char(ptr)) { - chartest = il_ann_anchor_char; - } else { - whitespace_elim = true; - // Change direction of scan - increment = -increment; - if (increment > 0) { - chartest = wordchar_or_il_ann_anchor; - } else { - chartest = wordchar_or_il_ann_terminator; - } - } + whitespace_elim = true; + // Change direction of scan + increment = -increment; + chartest = wordchar; } else { // Found a wordchar at pointer // If moving forwards, we need to check the previous character @@ -145,16 +78,12 @@ int complete_word(unsigned char* start, ssize_t length, const unsigned char* pre_ptr = ptr; int cur_move = Fast_UnicodeUtil::UTF8move(start, length, pre_ptr, -1); - if (!wordchar(pre_ptr) && !il_ann_terminator_char(pre_ptr)) // Points at start of new word + if (!wordchar(pre_ptr)) // Points at start of new word { whitespace_elim = true; // Change direction of scan increment = -increment; - if (increment > 0) { - chartest = wordchar_or_il_ann_anchor; - } else { - chartest = wordchar_or_il_ann_terminator; - } + chartest = wordchar; ptr = pre_ptr; moved += cur_move; } else { @@ -189,34 +118,6 @@ int complete_word(unsigned char* start, ssize_t length, break; } if (chartest(ptr)) { - if (chartest == nonwordchar) { - if (il_ann_separator_char(ptr)) { - if (increment > 0) { - chartest = il_ann_terminator_char; - } else { - chartest = il_ann_anchor_char; - } - moved += cur_move; - continue; - } else if (il_ann_terminator_char(ptr)) { - if (increment < 0) { - chartest = il_ann_anchor_char; - } - moved += cur_move; - continue; - } else if (il_ann_anchor_char(ptr)) { - if (increment > 0) { - chartest = il_ann_terminator_char; - } - moved += cur_move; - continue; - } - } else if ((chartest == il_ann_anchor_char) || - (chartest == il_ann_terminator_char)) { - chartest = nonwordchar; - moved += cur_move; - continue; - } LOG(spam, "complete_word: Breaking at char %c/0x%x (%d)", *ptr, *ptr, cur_move); // count this character (it is the first blank/wordchar) @@ -229,9 +130,7 @@ int complete_word(unsigned char* start, ssize_t length, break; // Found first blank/word char.. } moved += cur_move; - if (moved >= MAX_SCAN_WORD && - (chartest != il_ann_anchor_char) && - (chartest != il_ann_terminator_char)) { + if (moved >= MAX_SCAN_WORD) { LOG(spam, "Word length extended max word length %d, " "breaking at char 0x%x", MAX_SCAN_WORD, *ptr); break; @@ -576,11 +475,8 @@ int SummaryDesc::complete_extended_token(unsigned char* start, ssize_t length, // Only a single connector character that connects word // characters should lead us to include more words in the // normal sense: - if (!wordchar(preptr) && - !(increment > 0 && il_ann_anchor_char(preptr)) && - !(increment < 0 && il_ann_terminator_char(preptr))) { + if (!wordchar(preptr)) return moved; - } // If a block of chinese data does not contain any spaces we have to return // here in order to avoid searching all the way to the start/end. |