aboutsummaryrefslogtreecommitdiffstats
path: root/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp')
-rw-r--r--streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp33
1 files changed, 33 insertions, 0 deletions
diff --git a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
index d8a6091fe11..5988bdd912f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/tokenizereader.cpp
@@ -4,6 +4,19 @@
namespace vsm {
+namespace {
+
+// Per-character predicate used by tokenize_helper. The template flag picks
+// between regular word-character detection and exact-match mode.
+template <bool exact_match>
+inline bool is_word_char(ucs4_t c);
+
+// Exact match: every character is treated as a word character, so the
+// argument is never inspected.
+template <>
+inline constexpr bool
+is_word_char<true>(ucs4_t)
+{
+    return true;
+}
+
+// Regular match: defer the decision to Fast_UnicodeUtil::IsWordChar.
+template <>
+inline bool
+is_word_char<false>(ucs4_t c)
+{
+    return Fast_UnicodeUtil::IsWordChar(c);
+}
+
+} // namespace
+
+
void
TokenizeReader::fold(ucs4_t c) {
const char *repl = Fast_NormalizeWordFolder::ReplacementString(c);
@@ -18,4 +31,24 @@ TokenizeReader::fold(ucs4_t c) {
}
}
+/**
+ * Reads the next token from the input.
+ *
+ * Characters rejected by is_word_char<exact_match> are consumed and dropped
+ * until the first accepted character is seen. From there every accepted
+ * character is passed to normalize() with the given norm_mode. Reading stops
+ * at the first rejected character after the token has started (that character
+ * is consumed as well) or when hasNext() reports that the input is exhausted.
+ *
+ * @param norm_mode normalization mode forwarded unchanged to normalize().
+ * @return the value produced by complete().
+ */
+template <bool exact_match>
+size_t
+TokenizeReader::tokenize_helper(Normalizing norm_mode)
+{
+    bool inside_token = false;
+    while (hasNext()) {
+        const ucs4_t ch = next();
+        if (is_word_char<exact_match>(ch)) {
+            normalize(ch, norm_mode);
+            inside_token = true;
+        } else if (inside_token) {
+            // First rejected character after the token started: token is done.
+            break;
+        }
+        // Otherwise we are still skipping rejected characters ahead of the token.
+    }
+    return complete();
+}
+
+// Explicit instantiations for both matching modes.
+template size_t TokenizeReader::tokenize_helper<false>(Normalizing);
+template size_t TokenizeReader::tokenize_helper<true>(Normalizing);
+
}