summary refs log tree commit diff stats
path: root/streamingvisitors
diff options
context:
space:
mode:
author Tor Egge <Tor.Egge@online.no> 2024-01-25 11:39:36 +0100
committer Tor Egge <Tor.Egge@online.no> 2024-01-25 11:39:36 +0100
commit d198b7b73e376bcb349b159a77e57dbb2a54f19e (patch)
tree 7ba1e5430f562e1515856a6e6c82daeba905443d /streamingvisitors
parent 45900206e7b773c804e803497dd5a7058f33f9c4 (diff)
Track element length in streaming mode.
Diffstat (limited to 'streamingvisitors')
-rw-r--r-- streamingvisitors/src/tests/searcher/searcher_test.cpp 15
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp 2
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp 15
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h 7
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp 2
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp 2
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp 2
-rw-r--r-- streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp 14
8 files changed, 49 insertions, 10 deletions
diff --git a/streamingvisitors/src/tests/searcher/searcher_test.cpp b/streamingvisitors/src/tests/searcher/searcher_test.cpp
index eb233db9632..d1778c2ce8d 100644
--- a/streamingvisitors/src/tests/searcher/searcher_test.cpp
+++ b/streamingvisitors/src/tests/searcher/searcher_test.cpp
@@ -1110,6 +1110,21 @@ TEST("counting of words") {
assertString(fs, StringList().add("bb").add("not"), field, HitsList().add(Hits().add(2)).add(Hits()));
}
+TEST("element lengths") // Checks that every hit reports the length of the element it was found in.
+{
+ UTF8StrChrFieldSearcher fs(0);
+ auto field = StringList().add("a").add("b a c").add("d a"); // three elements of 1, 3 and 2 words; each contains "a" once
+ auto query = StringList().add("a");
+ auto qtv = performSearch(fs, query, getFieldValue(field));
+ EXPECT_EQUAL(1u, qtv.size()); // the single query string yields one query term
+ auto& qt = *qtv[0];
+ auto& hl = qt.getHitList();
+ EXPECT_EQUAL(3u, hl.size()); // expects one hit per element
+ EXPECT_EQUAL(1u, hl[0].element_length()); // hit in "a"
+ EXPECT_EQUAL(3u, hl[1].element_length()); // hit in "b a c"
+ EXPECT_EQUAL(2u, hl[2].element_length()); // hit in "d a"
+}
+
vespalib::string NormalizationInput = "test That Somehing happens with during NårmØlization";
void
diff --git a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp
index d0cfa4d9956..aa25b0e75d3 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/boolfieldsearcher.cpp
@@ -53,7 +53,7 @@ void BoolFieldSearcher::onValue(const document::FieldValue & fv)
addHit(*_qtl[j], 0);
}
}
- ++_words;
+ set_element_length(1);
}
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
index 5e06ae41a03..c75ab7fccd3 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.cpp
@@ -5,6 +5,7 @@
#include <vespa/document/fieldvalue/weightedsetfieldvalue.h>
#include <vespa/searchlib/query/streaming/multi_term.h>
#include <vespa/vespalib/stllike/hash_set.h>
+#include <cassert>
#include <vespa/log/log.h>
LOG_SETUP(".vsm.searcher.fieldsearcher");
@@ -55,6 +56,7 @@ FieldSearcher::FieldSearcher(FieldIdT fId, bool defaultPrefix) noexcept
_maxFieldLength(0x100000),
_currentElementId(0),
_currentElementWeight(1),
+ _element_length_fixups(),
_words(0),
_badUtf8Count(0)
{
@@ -70,6 +72,7 @@ FieldSearcher::search(const StorageDocument & doc)
fInfo.setHitOffset(qt->getHitList().size());
}
onSearch(doc);
+ assert(_element_length_fixups.empty());
for (auto qt : _qtl) {
QueryTerm::FieldInfo & fInfo = qt->getFieldInfo(field());
fInfo.setHitCount(qt->getHitList().size() - fInfo.getHitOffset());
@@ -276,4 +279,16 @@ FieldSearcher::IteratorHandler::onStructStart(const Content & c)
_searcher.onStructValue(static_cast<const document::StructFieldValue &>(c.getValue()));
}
+void
+FieldSearcher::set_element_length(uint32_t element_length) // Adds element_length to _words and stamps it on every hit queued by addHit() since the previous call.
+{
+ _words += element_length; // replaces the direct "++_words" / "_words += ..." updates the searchers did before this change
+ if (!_element_length_fixups.empty()) {
+ for (auto& fixup : _element_length_fixups) { // fixup.first is the query term, fixup.second the value qt.add() returned in addHit()
+ fixup.first->set_element_length(fixup.second, element_length);
+ }
+ _element_length_fixups.clear(); // search() asserts this list is empty once onSearch() has returned
+ }
+}
+
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
index e339e4bdf5a..4a9844d8af6 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
+++ b/streamingvisitors/src/vespa/vsm/searcher/fieldsearcher.h
@@ -6,6 +6,7 @@
#include <vespa/vsm/common/document.h>
#include <vespa/vsm/common/storagedocument.h>
#include <vespa/vespalib/util/array.h>
+#include <utility>
namespace search::fef { class IQueryEnvironment; }
@@ -96,6 +97,7 @@ private:
unsigned _maxFieldLength;
uint32_t _currentElementId;
int32_t _currentElementWeight; // Contains the weight of the current item being evaluated.
+ std::vector<std::pair<search::streaming::QueryTerm*, uint32_t>> _element_length_fixups;
protected:
/// Number of terms searched.
unsigned _words;
@@ -105,9 +107,10 @@ protected:
* Adds a hit to the given query term.
* For each call to onValue() a batch of words are processed, and the position is local to this batch.
**/
- void addHit(search::streaming::QueryTerm & qt, uint32_t pos) const {
- qt.add(field(), _currentElementId, _currentElementWeight, _words + pos);
+ void addHit(search::streaming::QueryTerm & qt, uint32_t pos) { // no longer const: it now appends to _element_length_fixups
+ _element_length_fixups.emplace_back(&qt, qt.add(field(), _currentElementId, _currentElementWeight, _words + pos)); // queue (term, value returned by add()) so set_element_length() can fill in the element length later
}
+ void set_element_length(uint32_t element_length);
public:
static search::byte _foldLowCase[256];
static search::byte _wordChar[256];
diff --git a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp
index 8558522003f..70e5bb4b82c 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/floatfieldsearcher.cpp
@@ -55,7 +55,7 @@ void FloatFieldSearcherT<T>::onValue(const document::FieldValue & fv)
addHit(*_qtl[j], 0);
}
}
- ++_words;
+ set_element_length(1);
}
template<typename T>
diff --git a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp
index 5ecc9a5a06e..bbeb3be986f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/geo_pos_field_searcher.cpp
@@ -58,7 +58,7 @@ void GeoPosFieldSearcher::onStructValue(const document::StructFieldValue & fv) {
addHit(*_qtl[j], 0);
}
}
- ++_words;
+ set_element_length(1);
}
bool GeoPosFieldSearcher::GeoPosInfo::cmp(const document::StructFieldValue & sfv) const {
diff --git a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp
index e73c7f5c1a7..3984254274f 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/intfieldsearcher.cpp
@@ -43,7 +43,7 @@ void IntFieldSearcher::onValue(const document::FieldValue & fv)
addHit(*_qtl[j], 0);
}
}
- ++_words;
+ set_element_length(1);
}
}
diff --git a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
index ba52444101d..673cf11b2cf 100644
--- a/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
+++ b/streamingvisitors/src/vespa/vsm/searcher/strchrfieldsearcher.cpp
@@ -25,22 +25,28 @@ void StrChrFieldSearcher::onValue(const document::FieldValue & fv)
bool StrChrFieldSearcher::matchDoc(const FieldRef & fieldRef)
{
+ size_t element_length = 0; // length reported for this field value; passed to set_element_length() once, at the end
+ bool need_count_words = false; // set when some term is too long for the field, so countWords() must supply the length
if (_qtl.size() > 1) {
size_t mintsz = shortestTerm();
if (fieldRef.size() >= mintsz) {
- _words += matchTerms(fieldRef, mintsz);
+ element_length = matchTerms(fieldRef, mintsz); // previously this return value was added straight to _words
} else {
- _words += countWords(fieldRef);
+ need_count_words = true;
}
} else {
for (auto qt : _qtl) { // this branch is only reached with zero or one query term
if (fieldRef.size() >= qt->termLen() || qt->isRegex() || qt->isFuzzy()) {
- _words += matchTerm(fieldRef, *qt);
+ element_length = std::max(element_length, matchTerm(fieldRef, *qt)); // keep the largest value instead of summing per term
} else {
- _words += countWords(fieldRef);
+ need_count_words = true;
}
}
}
+ if (need_count_words) { // countWords() now runs at most once per call, after the matching above
+ element_length = std::max(element_length, countWords(fieldRef));
+ }
+ set_element_length(element_length); // called exactly once per matchDoc(); this is now where _words is updated
return true;
}