diff options
Diffstat (limited to 'searchlib')
4 files changed, 85 insertions, 39 deletions
diff --git a/searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/rule/LambdaFunctionNode.java b/searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/rule/LambdaFunctionNode.java index b2641fdf229..0f1331515cc 100644 --- a/searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/rule/LambdaFunctionNode.java +++ b/searchlib/src/main/java/com/yahoo/searchlib/rankingexpression/rule/LambdaFunctionNode.java @@ -7,6 +7,7 @@ import com.yahoo.searchlib.rankingexpression.evaluation.MapContext; import com.yahoo.searchlib.rankingexpression.evaluation.Value; import com.yahoo.tensor.TensorType; import com.yahoo.tensor.evaluation.TypeContext; +import com.yahoo.tensor.functions.Generate; import java.util.Collections; import java.util.Deque; @@ -151,19 +152,41 @@ public class LambdaFunctionNode extends CompositeNode { }); } - private static Set<String> featuresAccessedIn(ExpressionNode node) { - if (node instanceof ReferenceNode) { - return Set.of(((ReferenceNode) node).reference().toString()); - } - else if (node instanceof NameNode) { // (This clause probably not necessary) - return Set.of(((NameNode) node).getValue()); - } - else if (node instanceof CompositeNode) { - Set<String> features = new HashSet<>(); - ((CompositeNode)node).children().forEach(child -> features.addAll(featuresAccessedIn(child))); - return features; + private static class FeatureFinder { + private final Set<String> target; + private final Set<String> localVariables = new HashSet<>(); + FeatureFinder(Set<String> target) { this.target = target; } + void process(ExpressionNode node) { + if (node instanceof ReferenceNode refNode) { + String featureName = refNode.reference().toString(); + if (! localVariables.contains(featureName)) { + target.add(featureName); + } + return; + } + Optional<FeatureFinder> subProcessor = Optional.empty(); + if (node instanceof TensorFunctionNode t) { + var fun = t.function(); + if (fun instanceof Generate<?> g) { + var ff = new FeatureFinder(target); + var genType = g.type(null); // Generate knows its own type without any context + for (var dim : genType.dimensions()) { + ff.localVariables.add(dim.name()); + } + subProcessor = Optional.of(ff); + } + } + if (node instanceof CompositeNode composite) { + final FeatureFinder processor = subProcessor.orElse(this); + composite.children().forEach(child -> processor.process(child)); + } } - return Set.of(); + } + + private static Set<String> featuresAccessedIn(ExpressionNode node) { + Set<String> features = new HashSet<>(); + new FeatureFinder(features).process(node); + return features; } @Override diff --git a/searchlib/src/test/java/com/yahoo/searchlib/rankingexpression/evaluation/EvaluationTestCase.java b/searchlib/src/test/java/com/yahoo/searchlib/rankingexpression/evaluation/EvaluationTestCase.java index f9ba7552560..637e9be5fc3 100644 --- a/searchlib/src/test/java/com/yahoo/searchlib/rankingexpression/evaluation/EvaluationTestCase.java +++ b/searchlib/src/test/java/com/yahoo/searchlib/rankingexpression/evaluation/EvaluationTestCase.java @@ -769,6 +769,10 @@ public class EvaluationTestCase { @Test public void testLambdaValidation() { EvaluationTester tester = new EvaluationTester(); + // check that we are allowed to access dimension name "y" inside Generate + tester.assertEvaluates("{ {d1:0}:15, {d1:1}:150, {d1:2 }:1500 }", + "map(tensor0, f(x) (sum(tensor(y[6])(x*y))))", + "{ {d1:0}:1, {d1:1}:10, {d1:2}:100 }"); try { tester.assertEvaluates("{ {d1:0}:1, {d1:1}:2, {d1:2 }:3 }", "map(tensor0, f(x) (log10(x+sum(tensor0)))", "{ {d1:0}:10, {d1:1}:100, {d1:2}:1000 }"); @@ -779,7 +783,6 @@ public class EvaluationTestCase { assertEquals("Lambda log10(x + reduce(tensor0, sum)) accesses features outside its scope: tensor0", e.getMessage()); } - } @Test diff --git a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h index 562c15e94d5..f45f8f2245e 100644 --- a/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h +++ b/searchlib/src/vespa/searchlib/attribute/postinglistsearchcontext.h @@ -405,44 +405,44 @@ template <typename BaseSC, typename AttrT, typename DataT> bool NumericPostingSearchContext<BaseSC, AttrT, DataT>::use_posting_lists_when_non_strict(const queryeval::ExecuteInfo& info) const { - // The following initial constants are derived after running parts of + // The following constants are derived after running parts of // the range search performance test with 10M documents on an Apple M1 Pro with 32 GB memory. // This code was compiled with two different behaviors: - // 1) 'filter matching' (never use posting lists). + // 1) 'lookup matching' (never use posting lists). // 2) 'posting list matching' (always use posting lists). // https://github.com/vespa-engine/system-test/tree/master/tests/performance/range_search // - // The following test case was used to establish the baseline cost of producing different number of hits as cheap as possible: - // range_hits_ratio=[1, 10, 50, 100, 200, 500], values_in_range=1, fast_search=true, filter_hits_ratio=0. - // The 6 range queries end up using a single posting list that produces the following number of hits: [10k, 100k, 500k, 1M, 2M, 5M] - // Avg query latency (ms) results: [5.43, 8.56, 11.68, 14.68, 22.77, 42.88] + // The following test cases were used: + // range_hits_ratio=[100, 200], values_in_range=100, fast_search=true, filter_hits_ratio=[1, 2, 4, 6, 8, 10, 20, 40, 60, 80, 100, 120, 140, 160, 200]. // - // Then the following test case was executed for both 1) 'filter matching' and 2) 'posting list matching': - // range_hits_ratio=[1, 10, 50, 100, 200, 500], values_in_range=100, fast_search=true, filter_hits_ratio=0. - // Avg query latency (ms) results: - // 1) 'filter matching': [47.52, 51.06, 59.68, 79.3, 118.7, 145.26] - // 2) 'posting list matching': [4.79, 11.6, 13.54, 20.24, 32.65, 67.28] + // By comparing the avg query latency between 1) 'lookup matching' and 2) 'posting list matching' + // we find the crossover point between the two strategies. // - // For 1) 'filter matching' we use the result from range_hits_ratio=1 (10k hits) compared to the baseline - // to calculate the cost per document (in ns) to do filter matching: 1M*(47.52-5.43)/10M = 4.2 + // Excerpt of results for range_hits_ratio=[100] and filter_hits_ratio=[10, 20, 40, 60]: + // 1) 'lookup matching': [7.1, 8.4, 14.1, 17.6] + // 2) 'posting list matching': [7.3, 8.8, 7.4, 8.1] + // With filter_hits_ratio=20, lookup matching is best. + // With filter_hits_ratio=40, posting list matching is best. // - // For 2) 'posting list matching' we use the results from range_hits_ratio=[50, 100, 200, 500] compared to the baseline - // to calculate the average cost per hit (in ns) when merging the 100 posting lists: - // 1M*(13.54-11.68)/500k = 3.7 - // 1M*(20.24-14.68)/1M = 5.6 - // 1M*(32.65-22.77)/2M = 4.9 - // 1M*(67.28-42.88)/5M = 4.9 + // The extra cost and difference between the two strategies is modelled as follows: + // 1) lookup matching: exp_doc_hits * lookup_match_constant (LMC) + // 2) posting list matching: estimated_hits_in_range() * posting_list_merge_constant (PLMC) // - // The average is 4.8. + // At the crossover point (filter_hits_ratio=20) the following costs are calculated: + // 1) 10M*20/100 * LMC = 200k * LMC + // 2) 10M*100/1000 * PLMC = 1M * PLMC + // + // Based on this we see that LMC = 5 * PLMC. + // The same relationship is found with the test case range_hits_ratio=[200]. - constexpr float filtering_match_constant = 4.2; - constexpr float posting_list_merge_constant = 4.8; + constexpr float lookup_match_constant = 5.0; + constexpr float posting_list_merge_constant = 1.0; uint32_t exp_doc_hits = this->_docIdLimit * info.hitRate(); float avg_values_per_document = static_cast<float>(this->_numValues) / static_cast<float>(this->_docIdLimit); - float filtering_cost = exp_doc_hits * avg_values_per_document * filtering_match_constant; + float lookup_match_cost = exp_doc_hits * avg_values_per_document * lookup_match_constant; float posting_list_cost = this->estimated_hits_in_range() * posting_list_merge_constant; - return posting_list_cost < filtering_cost; + return posting_list_cost < lookup_match_cost; } template <typename BaseSC, typename AttrT, typename DataT> diff --git a/searchlib/src/vespa/searchlib/engine/proto_converter.cpp b/searchlib/src/vespa/searchlib/engine/proto_converter.cpp index 59e4b6e99f8..b21de31eab0 100644 --- a/searchlib/src/vespa/searchlib/engine/proto_converter.cpp +++ b/searchlib/src/vespa/searchlib/engine/proto_converter.cpp @@ -15,6 +15,26 @@ namespace search::engine { namespace { +std::string escape_message(const vespalib::string &item) { + static const char hexdigits[] = "0123456789ABCDEF"; + std::string r; + r.reserve(item.size()); + for (char c : item) { + if (c == '\\') { + r.push_back(c); + r.push_back(c); + } else if (c < 32 || c > 126) { + r.push_back('\\'); + r.push_back('x'); + r.push_back(hexdigits[0xF & (c >> 4)]); + r.push_back(hexdigits[0xF & c]); + } else { + r.push_back(c); + } + } + return r; +} + template <typename T> vespalib::string make_sort_spec(const T &sorting) { vespalib::string spec; @@ -166,7 +186,7 @@ ProtoConverter::search_reply_to_proto(const SearchReply &reply, ProtoSearchReply reply.my_issues->for_each_message([&](const vespalib::string &err_msg) { auto *err_obj = proto.add_errors(); - err_obj->set_message(err_msg); + err_obj->set_message(escape_message(err_msg)); }); } } @@ -223,7 +243,7 @@ ProtoConverter::docsum_reply_to_proto(const DocsumReply &reply, ProtoDocsumReply reply.issues().for_each_message([&](const vespalib::string &err_msg) { auto *err_obj = proto.add_errors(); - err_obj->set_message(err_msg); + err_obj->set_message(escape_message(err_msg)); }); } } |