summary refs log tree commit diff stats
path: root/eval
diff options
context:
space:
mode:
author Håvard Pettersen <havardpe@oath.com> 2021-08-16 13:14:38 +0000
committer Håvard Pettersen <havardpe@oath.com> 2021-08-16 13:36:09 +0000
commit c6331f25f9b6912d649c4fbdd65476775a4b6192 (patch)
tree 3151f3bd7f20d2d78629c5f3d5984679786bff2a /eval
parent e2fd2769172926471e602190afabc87815d4063a (diff)
move FeatureNameExtractor
to make it available for use in vespa-eval-expr
Diffstat (limited to 'eval')
-rw-r--r-- eval/CMakeLists.txt 1
-rw-r--r-- eval/src/tests/eval/feature_name_extractor/.gitignore 1
-rw-r--r-- eval/src/tests/eval/feature_name_extractor/CMakeLists.txt 8
-rw-r--r-- eval/src/tests/eval/feature_name_extractor/feature_name_extractor_test.cpp 79
-rw-r--r-- eval/src/vespa/eval/eval/CMakeLists.txt 1
-rw-r--r-- eval/src/vespa/eval/eval/feature_name_extractor.cpp 82
-rw-r--r-- eval/src/vespa/eval/eval/feature_name_extractor.h 18
7 files changed, 190 insertions, 0 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt
index d5a0b2a14ae..4af337dcb67 100644
--- a/eval/CMakeLists.txt
+++ b/eval/CMakeLists.txt
@@ -18,6 +18,7 @@ vespa_define_module(
src/tests/eval/compile_cache
src/tests/eval/compiled_function
src/tests/eval/fast_value
+ src/tests/eval/feature_name_extractor
src/tests/eval/function
src/tests/eval/function_speed
src/tests/eval/gbdt
diff --git a/eval/src/tests/eval/feature_name_extractor/.gitignore b/eval/src/tests/eval/feature_name_extractor/.gitignore
new file mode 100644
index 00000000000..88c86c1720e
--- /dev/null
+++ b/eval/src/tests/eval/feature_name_extractor/.gitignore
@@ -0,0 +1 @@
+eval_name_extractor_test_app
diff --git a/eval/src/tests/eval/feature_name_extractor/CMakeLists.txt b/eval/src/tests/eval/feature_name_extractor/CMakeLists.txt
new file mode 100644
index 00000000000..7126060e974
--- /dev/null
+++ b/eval/src/tests/eval/feature_name_extractor/CMakeLists.txt
@@ -0,0 +1,8 @@
+# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+vespa_add_executable(eval_name_extractor_test_app TEST # target name; reused as NAME and COMMAND in vespa_add_test below
+ SOURCES
+ feature_name_extractor_test.cpp
+ DEPENDS
+ vespaeval # presumably the library that provides FeatureNameExtractor - confirm
+)
+vespa_add_test(NAME eval_name_extractor_test_app COMMAND eval_name_extractor_test_app)
diff --git a/eval/src/tests/eval/feature_name_extractor/feature_name_extractor_test.cpp b/eval/src/tests/eval/feature_name_extractor/feature_name_extractor_test.cpp
new file mode 100644
index 00000000000..3acf1ee2142
--- /dev/null
+++ b/eval/src/tests/eval/feature_name_extractor/feature_name_extractor_test.cpp
@@ -0,0 +1,79 @@
+// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+#include <vespa/vespalib/testkit/test_kit.h>
+#include <vespa/eval/eval/feature_name_extractor.h>
+
+using vespalib::eval::FeatureNameExtractor;
+
+void verify_extract(const vespalib::string &input, // text handed to the extractor
+ const vespalib::string &expect_symbol, // symbol the extractor is expected to produce
+ const vespalib::string &expect_after) // part of input expected to remain unconsumed
+{
+ FeatureNameExtractor extractor;
+ const char *pos_in = input.data();
+ const char *end_in = input.data() + input.size();
+ vespalib::string symbol_out;
+ const char *pos_out = nullptr; // stays null unless extract_symbol assigns it
+ extractor.extract_symbol(pos_in, end_in, pos_out, symbol_out);
+ ASSERT_TRUE(pos_out != nullptr);
+ vespalib::string after(pos_out, end_in); // everything from the reported stop position to the end of input
+ EXPECT_EQUAL(expect_symbol, symbol_out);
+ EXPECT_EQUAL(expect_after, after);
+}
+
+TEST("require that basic names are extracted correctly") { // each input ends in '+', which must be left unconsumed
+ TEST_DO(verify_extract("foo+", "foo", "+")); // bare name
+ TEST_DO(verify_extract("foo.out+", "foo.out", "+")); // name with dot suffix
+ TEST_DO(verify_extract("foo(p1,p2)+", "foo(p1,p2)", "+")); // name with parameter list
+ TEST_DO(verify_extract("foo(p1,p2).out+", "foo(p1,p2).out", "+")); // parameter list followed by dot suffix
+}
+
+TEST("require that special characters are allowed in prefix and suffix") { // '_', '@' and '$' are accepted on both sides of the dot
+ TEST_DO(verify_extract("_@$+", "_@$", "+"));
+ TEST_DO(verify_extract("_@$.$@_+", "_@$.$@_", "+"));
+ TEST_DO(verify_extract("_@$(p1,p2)+", "_@$(p1,p2)", "+"));
+ TEST_DO(verify_extract("_@$(p1,p2).$@_+", "_@$(p1,p2).$@_", "+"));
+}
+
+TEST("require that dot is only allowed in suffix") {
+ TEST_DO(verify_extract("foo.bar+", "foo.bar", "+"));
+ TEST_DO(verify_extract("foo.bar.out+", "foo.bar.out", "+")); // further dots are accepted once the suffix has started
+ TEST_DO(verify_extract("foo.bar(p1,p2)+", "foo.bar", "(p1,p2)+")); // a '(' after the dot is not consumed: parameters must come before the dot
+ TEST_DO(verify_extract("foo.bar(p1,p2).out+", "foo.bar", "(p1,p2).out+"));
+ TEST_DO(verify_extract("foo(p1,p2).out.bar+", "foo(p1,p2).out.bar", "+"));
+}
+
+TEST("require that parameters can be nested") { // the symbol runs to the ')' that balances the first '('
+ TEST_DO(verify_extract("foo(p1(a,b),p2(c,d(e,f))).out+", "foo(p1(a,b),p2(c,d(e,f))).out", "+"));
+}
+
+TEST("require that space is allowed among parameters") { // whitespace inside the parentheses is copied into the symbol unchanged
+ TEST_DO(verify_extract("foo( p1 ( a , b ) ).out+", "foo( p1 ( a , b ) ).out", "+"));
+}
+
+TEST("require that space is not allowed outside parameters") { // a space outside the parentheses ends the symbol
+ TEST_DO(verify_extract("foo +", "foo", " +"));
+ TEST_DO(verify_extract("foo . out+", "foo", " . out+"));
+ TEST_DO(verify_extract("foo. out+", "foo.", " out+")); // the dot itself is consumed even when no suffix characters follow
+ TEST_DO(verify_extract("foo (p1,p2)+", "foo", " (p1,p2)+"));
+ TEST_DO(verify_extract("foo(p1,p2) +", "foo(p1,p2)", " +"));
+ TEST_DO(verify_extract("foo(p1,p2) .out+", "foo(p1,p2)", " .out+"));
+ TEST_DO(verify_extract("foo(p1,p2).out +", "foo(p1,p2).out", " +"));
+}
+
+TEST("require that parameters can be scientific numbers") { // '+', '-' and '.' inside the parentheses do not end the symbol
+ TEST_DO(verify_extract("foo(1.3E+3,-1.9e-10).out+", "foo(1.3E+3,-1.9e-10).out", "+"));
+}
+
+TEST("require that quoted parenthesis are not counted") { // the ')' inside the double-quoted parameter does not close the list
+ TEST_DO(verify_extract("foo(a,b,\")\").out+", "foo(a,b,\")\").out", "+"));
+}
+
+TEST("require that escaped quotes does not unquote") { // a backslash-escaped quote keeps the string open, so the ')' after it is still quoted
+ TEST_DO(verify_extract("foo(a,b,\"\\\")\").out+", "foo(a,b,\"\\\")\").out", "+"));
+}
+
+TEST("require that escaped escape does not hinder unquote") { // an escaped backslash does not escape the quote after it
+ TEST_DO(verify_extract("foo(a,b,\"\\\\\")\").out+", "foo(a,b,\"\\\\\")", "\").out+")); // string closes early, the next ')' ends the symbol, the rest is left over
+}
+
+TEST_MAIN() { TEST_RUN_ALL(); } // presumably the testkit entry point that runs every TEST above - confirm against test_kit.h
diff --git a/eval/src/vespa/eval/eval/CMakeLists.txt b/eval/src/vespa/eval/eval/CMakeLists.txt
index 639ac3b5864..8271c7a4bed 100644
--- a/eval/src/vespa/eval/eval/CMakeLists.txt
+++ b/eval/src/vespa/eval/eval/CMakeLists.txt
@@ -13,6 +13,7 @@ vespa_add_library(eval_eval OBJECT
fast_addr_map.cpp
fast_forest.cpp
fast_value.cpp
+ feature_name_extractor.cpp
function.cpp
gbdt.cpp
int8float.cpp
diff --git a/eval/src/vespa/eval/eval/feature_name_extractor.cpp b/eval/src/vespa/eval/eval/feature_name_extractor.cpp
new file mode 100644
index 00000000000..f613d026d03
--- /dev/null
+++ b/eval/src/vespa/eval/eval/feature_name_extractor.cpp
@@ -0,0 +1,82 @@
+// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#include "feature_name_extractor.h"
+
+namespace vespalib::eval {
+
+namespace {
+
+struct LegalChar { // lookup table answering whether a byte value may appear in a name part
+ bool legal[256]; // one flag per possible uint8_t value
+ LegalChar(std::initializer_list<uint8_t> extra_chars) { // extra_chars: bytes accepted in addition to alphanumerics
+ for (int c = 0; c < 256; ++c) {
+ legal[c] = isalnum(c); // NOTE(review): isalnum classification depends on the current C locale
+ }
+ for (uint8_t c: extra_chars) {
+ legal[c] = true;
+ }
+ }
+ bool is_legal(uint8_t c) { return legal[c]; } // a char argument converts to uint8_t, so the index stays within 0..255
+};
+
+static LegalChar prefix({'_', '$', '@'}); // characters extract_symbol accepts at the start of a symbol, before any '(' or '.'
+static LegalChar suffix({'_', '.', '$', '@'}); // characters extract_symbol accepts after the first dot; unlike prefix this includes '.'
+
+struct CountParen { // character-by-character tracker that detects when the parenthesis depth returns to zero
+ size_t depth = 0; // number of currently open parentheses seen outside quotes
+ bool quoted = false; // true while inside a double-quoted section
+ bool escaped = false; // true when the previous quoted character was an unescaped backslash
+ bool done(char c) { // feed one character; returns true only when an unquoted ')' brings depth to 0
+ if (quoted) {
+ if (escaped) {
+ escaped = false; // the escaped character is skipped, whatever it is
+ } else {
+ if (c == '\\') {
+ escaped = true;
+ } else if (c == '"') {
+ quoted = false;
+ }
+ }
+ } else {
+ if (c == '"') {
+ quoted = true;
+ } else if (c == '(') {
+ ++depth;
+ } else if (c == ')') {
+ if (--depth == 0) { // NOTE(review): wraps around if ')' arrives while depth is 0; extract_symbol always feeds '(' first
+ return true;
+ }
+ }
+ }
+ return false;
+ }
+};
+
+} // namespace <unnamed>
+
+void
+FeatureNameExtractor::extract_symbol(const char *pos_in, const char *end_in, // scans the range [pos_in, end_in)
+ const char *&pos_out, vespalib::string &symbol_out) const // pos_out: first unconsumed position; symbol_out: appended to, never cleared here
+{
+ while ((pos_in < end_in) && prefix.is_legal(*pos_in)) { // step 1: leading name characters
+ symbol_out.push_back(*pos_in++);
+ }
+ if ((pos_in < end_in) && (*pos_in == '(')) { // step 2: optional parameter list
+ CountParen paren;
+ while (pos_in < end_in) { // an unbalanced list is copied all the way to end_in
+ symbol_out.push_back(*pos_in);
+ if (paren.done(*pos_in++)) {
+ break; // the balancing ')' has already been copied
+ }
+ }
+ }
+ if ((pos_in < end_in) && (*pos_in == '.')) { // step 3: optional dot followed by suffix characters
+ symbol_out.push_back(*pos_in++); // the dot is kept even if no suffix characters follow
+ while ((pos_in < end_in) && suffix.is_legal(*pos_in)) {
+ symbol_out.push_back(*pos_in++);
+ }
+ }
+ pos_out = pos_in; // always assigned, even when nothing was consumed
+}
+
+}
diff --git a/eval/src/vespa/eval/eval/feature_name_extractor.h b/eval/src/vespa/eval/eval/feature_name_extractor.h
new file mode 100644
index 00000000000..b3a9e0567a3
--- /dev/null
+++ b/eval/src/vespa/eval/eval/feature_name_extractor.h
@@ -0,0 +1,18 @@
+// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+
+#pragma once
+
+#include "function.h"
+
+namespace vespalib::eval {
+
+/**
+ * Custom symbol extractor used to extract ranking feature names when
+ * parsing ranking expressions.
+ **/
+struct FeatureNameExtractor : public vespalib::eval::SymbolExtractor { // SymbolExtractor is presumably declared in function.h - confirm
+ void extract_symbol(const char *pos_in, const char *end_in, // input range [pos_in, end_in)
+ const char *&pos_out, vespalib::string &symbol_out) const override; // outputs: position after the symbol, and the symbol text
+};
+
+}