aboutsummaryrefslogtreecommitdiffstats
path: root/searchlib
diff options
context:
space:
mode:
Diffstat (limited to 'searchlib')
-rw-r--r--searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/rule/LambdaFunctionNode.java47
-rw-r--r--searchlib/src/test/java/com/yahoo/searchlib/rankingexpression/evaluation/EvaluationTestCase.java5
-rw-r--r--searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h48
-rw-r--r--searchlib/src/vespa/searchlib/engine/proto_converter.cpp24
4 files changed, 85 insertions, 39 deletions
diff --git a/searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/rule/LambdaFunctionNode.java b/searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/rule/LambdaFunctionNode.java
index b2641fdf229..0f1331515cc 100644
--- a/searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/rule/LambdaFunctionNode.java
+++ b/searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/rule/LambdaFunctionNode.java
@@ -7,6 +7,7 @@ import com.yahoo.searchlib.rankingexpression.evaluation.MapContext;
import com.yahoo.searchlib.rankingexpression.evaluation.Value;
import com.yahoo.tensor.TensorType;
import com.yahoo.tensor.evaluation.TypeContext;
+import com.yahoo.tensor.functions.Generate;
import java.util.Collections;
import java.util.Deque;
@@ -151,19 +152,41 @@ public class LambdaFunctionNode extends CompositeNode {
});
}
- private static Set<String> featuresAccessedIn(ExpressionNode node) {
- if (node instanceof ReferenceNode) {
- return Set.of(((ReferenceNode) node).reference().toString());
- }
- else if (node instanceof NameNode) { // (This clause probably not necessary)
- return Set.of(((NameNode) node).getValue());
- }
- else if (node instanceof CompositeNode) {
- Set<String> features = new HashSet<>();
- ((CompositeNode)node).children().forEach(child -> features.addAll(featuresAccessedIn(child)));
- return features;
+ private static class FeatureFinder {
+ private final Set<String> target;
+ private final Set<String> localVariables = new HashSet<>();
+ FeatureFinder(Set<String> target) { this.target = target; }
+ void process(ExpressionNode node) {
+ if (node instanceof ReferenceNode refNode) {
+ String featureName = refNode.reference().toString();
+ if (! localVariables.contains(featureName)) {
+ target.add(featureName);
+ }
+ return;
+ }
+ Optional<FeatureFinder> subProcessor = Optional.empty();
+ if (node instanceof TensorFunctionNode t) {
+ var fun = t.function();
+ if (fun instanceof Generate<?> g) {
+ var ff = new FeatureFinder(target);
+ var genType = g.type(null); // Generate knows its own type without any context
+ for (var dim : genType.dimensions()) {
+ ff.localVariables.add(dim.name());
+ }
+ subProcessor = Optional.of(ff);
+ }
+ }
+ if (node instanceof CompositeNode composite) {
+ final FeatureFinder processor = subProcessor.orElse(this);
+ composite.children().forEach(child -> processor.process(child));
+ }
}
- return Set.of();
+ }
+
+ private static Set<String> featuresAccessedIn(ExpressionNode node) {
+ Set<String> features = new HashSet<>();
+ new FeatureFinder(features).process(node);
+ return features;
}
@Override
diff --git a/searchlib/src/test/java/com/yahoo/searchlib/rankingexpression/evaluation/EvaluationTestCase.java b/searchlib/src/test/java/com/yahoo/searchlib/rankingexpression/evaluation/EvaluationTestCase.java
index f9ba7552560..637e9be5fc3 100644
--- a/searchlib/src/test/java/com/yahoo/searchlib/rankingexpression/evaluation/EvaluationTestCase.java
+++ b/searchlib/src/test/java/com/yahoo/searchlib/rankingexpression/evaluation/EvaluationTestCase.java
@@ -769,6 +769,10 @@ public class EvaluationTestCase {
@Test
public void testLambdaValidation() {
EvaluationTester tester = new EvaluationTester();
+ // check that we are allowed to access dimension name "y" inside Generate
+ tester.assertEvaluates("{ {d1:0}:15, {d1:1}:150, {d1:2 }:1500 }",
+ "map(tensor0, f(x) (sum(tensor(y[6])(x*y))))",
+ "{ {d1:0}:1, {d1:1}:10, {d1:2}:100 }");
try {
tester.assertEvaluates("{ {d1:0}:1, {d1:1}:2, {d1:2 }:3 }",
"map(tensor0, f(x) (log10(x+sum(tensor0)))", "{ {d1:0}:10, {d1:1}:100, {d1:2}:1000 }");
@@ -779,7 +783,6 @@ public class EvaluationTestCase {
assertEquals("Lambda log10(x + reduce(tensor0, sum)) accesses features outside its scope: tensor0",
e.getMessage());
}
-
}
@Test
diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
index 562c15e94d5..f45f8f2245e 100644
--- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
+++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h
@@ -405,44 +405,44 @@ template <typename BaseSC, typename AttrT, typename DataT>
bool
NumericPostingSearchContext<BaseSC, AttrT, DataT>::use_posting_lists_when_non_strict(const queryeval::ExecuteInfo& info) const
{
- // The following initial constants are derived after running parts of
+ // The following constants are derived after running parts of
// the range search performance test with 10M documents on an Apple M1 Pro with 32 GB memory.
// This code was compiled with two different behaviors:
- // 1) 'filter matching' (never use posting lists).
+ // 1) 'lookup matching' (never use posting lists).
// 2) 'posting list matching' (always use posting lists).
// https://github.com/vespa-engine/system-test/tree/master/tests/performance/range_search
//
- // The following test case was used to establish the baseline cost of producing different number of hits as cheap as possible:
- // range_hits_ratio=[1, 10, 50, 100, 200, 500], values_in_range=1, fast_search=true, filter_hits_ratio=0.
- // The 6 range queries end up using a single posting list that produces the following number of hits: [10k, 100k, 500k, 1M, 2M, 5M]
- // Avg query latency (ms) results: [5.43, 8.56, 11.68, 14.68, 22.77, 42.88]
+ // The following test cases were used:
+ // range_hits_ratio=[100, 200], values_in_range=100, fast_search=true, filter_hits_ratio=[1, 2, 4, 6, 8, 10, 20, 40, 60, 80, 100, 120, 140, 160, 200].
//
- // Then the following test case was executed for both 1) 'filter matching' and 2) 'posting list matching':
- // range_hits_ratio=[1, 10, 50, 100, 200, 500], values_in_range=100, fast_search=true, filter_hits_ratio=0.
- // Avg query latency (ms) results:
- // 1) 'filter matching': [47.52, 51.06, 59.68, 79.3, 118.7, 145.26]
- // 2) 'posting list matching': [4.79, 11.6, 13.54, 20.24, 32.65, 67.28]
+ // By comparing the avg query latency between 1) 'lookup matching' and 2) 'posting list matching'
+ // we find the crossover point between the two strategies.
//
- // For 1) 'filter matching' we use the result from range_hits_ratio=1 (10k hits) compared to the baseline
- // to calculate the cost per document (in ns) to do filter matching: 1M*(47.52-5.43)/10M = 4.2
+ // Excerpt of results for range_hits_ratio=[100] and filter_hits_ratio=[10, 20, 40, 60]:
+ // 1) 'lookup matching': [7.1, 8.4, 14.1, 17.6]
+ // 2) 'posting list matching': [7.3, 8.8, 7.4, 8.1]
+ // With filter_hits_ratio=20, lookup matching is best.
+ // With filter_hits_ratio=40, posting list matching is best.
//
- // For 2) 'posting list matching' we use the results from range_hits_ratio=[50, 100, 200, 500] compared to the baseline
- // to calculate the average cost per hit (in ns) when merging the 100 posting lists:
- // 1M*(13.54-11.68)/500k = 3.7
- // 1M*(20.24-14.68)/1M = 5.6
- // 1M*(32.65-22.77)/2M = 4.9
- // 1M*(67.28-42.88)/5M = 4.9
+ // The extra cost and difference between the two strategies is modelled as follows:
+ // 1) lookup matching: exp_doc_hits * lookup_match_constant (LMC)
+ // 2) posting list matching: estimated_hits_in_range() * posting_list_merge_constant (PLMC)
//
- // The average is 4.8.
+ // At the crossover point (filter_hits_ratio=20) the following costs are calculated:
+ // 1) 10M*20/100 * LMC = 200k * LMC
+ // 2) 10M*100/1000 * PLMC = 1M * PLMC
+ //
+ // Based on this we see that LMC = 5 * PLMC.
+ // The same relationship is found with the test case range_hits_ratio=[200].
- constexpr float filtering_match_constant = 4.2;
- constexpr float posting_list_merge_constant = 4.8;
+ constexpr float lookup_match_constant = 5.0;
+ constexpr float posting_list_merge_constant = 1.0;
uint32_t exp_doc_hits = this->_docIdLimit * info.hitRate();
float avg_values_per_document = static_cast<float>(this->_numValues) / static_cast<float>(this->_docIdLimit);
- float filtering_cost = exp_doc_hits * avg_values_per_document * filtering_match_constant;
+ float lookup_match_cost = exp_doc_hits * avg_values_per_document * lookup_match_constant;
float posting_list_cost = this->estimated_hits_in_range() * posting_list_merge_constant;
- return posting_list_cost < filtering_cost;
+ return posting_list_cost < lookup_match_cost;
}
template <typename BaseSC, typename AttrT, typename DataT>
diff --git a/searchlib/src/vespa/searchlib/engine/proto_converter.cpp b/searchlib/src/vespa/searchlib/engine/proto_converter.cpp
index 59e4b6e99f8..b21de31eab0 100644
--- a/searchlib/src/vespa/searchlib/engine/proto_converter.cpp
+++ b/searchlib/src/vespa/searchlib/engine/proto_converter.cpp
@@ -15,6 +15,26 @@ namespace search::engine {
namespace {
+std::string escape_message(const vespalib::string &item) {
+ static const char hexdigits[] = "0123456789ABCDEF";
+ std::string r;
+ r.reserve(item.size());
+ for (char c : item) {
+ if (c == '\\') {
+ r.push_back(c);
+ r.push_back(c);
+ } else if (c < 32 || c > 126) {
+ r.push_back('\\');
+ r.push_back('x');
+ r.push_back(hexdigits[0xF & (c >> 4)]);
+ r.push_back(hexdigits[0xF & c]);
+ } else {
+ r.push_back(c);
+ }
+ }
+ return r;
+}
+
template <typename T>
vespalib::string make_sort_spec(const T &sorting) {
vespalib::string spec;
@@ -166,7 +186,7 @@ ProtoConverter::search_reply_to_proto(const SearchReply &reply, ProtoSearchReply
reply.my_issues->for_each_message([&](const vespalib::string &err_msg)
{
auto *err_obj = proto.add_errors();
- err_obj->set_message(err_msg);
+ err_obj->set_message(escape_message(err_msg));
});
}
}
@@ -223,7 +243,7 @@ ProtoConverter::docsum_reply_to_proto(const DocsumReply &reply, ProtoDocsumReply
reply.issues().for_each_message([&](const vespalib::string &err_msg)
{
auto *err_obj = proto.add_errors();
- err_obj->set_message(err_msg);
+ err_obj->set_message(escape_message(err_msg));
});
}
}