From 00c89ec2a67dbd32e88204cc6a0cafa099cf0f0c Mon Sep 17 00:00:00 2001 From: Tor Brede Vekterli Date: Mon, 22 Jan 2024 14:54:27 +0000 Subject: Treat regex and fuzzy whole-field matching as 1 logical word We have concluded that this is the most semantically correct way of reporting the count, and as a bonus it avoids having to do a separate pass over the string buffer. --- streamingvisitors/src/tests/searcher/searcher_test.cpp | 16 ++++++++++++++++ .../vsm/searcher/utf8flexiblestringfieldsearcher.cpp | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp index 705e14c11a5..eb233db9632 100644 --- a/streamingvisitors/src/tests/searcher/searcher_test.cpp +++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp @@ -729,6 +729,14 @@ TEST("utf8 flexible searcher handles regexes with explicit anchoring") { TEST_DO(assertString(fs, "#^foo$", "oo", Hits())); } +TEST("utf8 flexible searcher regex matching treats field as 1 word") { + UTF8FlexibleStringFieldSearcher fs(0); + // Match case + TEST_DO(assertFieldInfo(fs, "#.*", "foo bar baz", QTFieldInfo(0, 1, 1))); + // Mismatch case + TEST_DO(assertFieldInfo(fs, "#^zoid$", "foo bar baz", QTFieldInfo(0, 0, 1))); +} + TEST("utf8 flexible searcher handles fuzzy search in uncased mode") { UTF8FlexibleStringFieldSearcher fs(0); // Term syntax (only applies to these tests): @@ -819,6 +827,14 @@ TEST("utf8 flexible searcher caps oversized fuzzy prefix length to term length") TEST_DO(assertString(fs, "%{5,9001}zoid", "boid", Hits())); } +TEST("utf8 flexible searcher fuzzy matching treats field as 1 word") { + UTF8FlexibleStringFieldSearcher fs(0); + // Match case + TEST_DO(assertFieldInfo(fs, "%{1}foo bar baz", "foo jar baz", QTFieldInfo(0, 1, 1))); + // Mismatch case + TEST_DO(assertFieldInfo(fs, "%{1}foo", "foo bar baz", QTFieldInfo(0, 0, 1))); +} + TEST("bool search") { BoolFieldSearcher fs(0); TEST_DO(assertBool(fs, "true", true, true)); diff --git a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp index d648d2e252e..5f626ccb962 100644 --- a/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp +++ b/streamingvisitors/src/vespa/vsm/searcher/utf8flexiblestringfieldsearcher.cpp @@ -37,7 +37,7 @@ UTF8FlexibleStringFieldSearcher::match_regexp(const FieldRef & f, search::stream if (regexp_term->regexp().partial_match({f.data(), f.size()})) { addHit(qt, 0); } - return countWords(f); + return 1; } size_t @@ -50,7 +50,7 @@ UTF8FlexibleStringFieldSearcher::match_fuzzy(const FieldRef & f, search::streami if (fuzzy_term->is_match({f.data(), f.size()})) { addHit(qt, 0); } - return countWords(f); + return 1; } size_t -- cgit v1.2.3