diff options
author | Håvard Pettersen <havardpe@oath.com> | 2021-08-16 13:14:38 +0000 |
---|---|---|
committer | Håvard Pettersen <havardpe@oath.com> | 2021-08-16 13:36:09 +0000 |
commit | c6331f25f9b6912d649c4fbdd65476775a4b6192 (patch) | |
tree | 3151f3bd7f20d2d78629c5f3d5984679786bff2a /eval | |
parent | e2fd2769172926471e602190afabc87815d4063a (diff) |
move FeatureNameExtractor
to make it available for use in vespa-eval-expr
Diffstat (limited to 'eval')
-rw-r--r-- | eval/CMakeLists.txt | 1 | ||||
-rw-r--r-- | eval/src/tests/eval/feature_name_extractor/.gitignore | 1 | ||||
-rw-r--r-- | eval/src/tests/eval/feature_name_extractor/CMakeLists.txt | 8 | ||||
-rw-r--r-- | eval/src/tests/eval/feature_name_extractor/feature_name_extractor_test.cpp | 79 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/CMakeLists.txt | 1 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/feature_name_extractor.cpp | 82 | ||||
-rw-r--r-- | eval/src/vespa/eval/eval/feature_name_extractor.h | 18 |
7 files changed, 190 insertions, 0 deletions
diff --git a/eval/CMakeLists.txt b/eval/CMakeLists.txt index d5a0b2a14ae..4af337dcb67 100644 --- a/eval/CMakeLists.txt +++ b/eval/CMakeLists.txt @@ -18,6 +18,7 @@ vespa_define_module( src/tests/eval/compile_cache src/tests/eval/compiled_function src/tests/eval/fast_value + src/tests/eval/feature_name_extractor src/tests/eval/function src/tests/eval/function_speed src/tests/eval/gbdt diff --git a/eval/src/tests/eval/feature_name_extractor/.gitignore b/eval/src/tests/eval/feature_name_extractor/.gitignore new file mode 100644 index 00000000000..88c86c1720e --- /dev/null +++ b/eval/src/tests/eval/feature_name_extractor/.gitignore @@ -0,0 +1 @@ +searchlib_feature_name_extractor_test_app diff --git a/eval/src/tests/eval/feature_name_extractor/CMakeLists.txt b/eval/src/tests/eval/feature_name_extractor/CMakeLists.txt new file mode 100644 index 00000000000..7126060e974 --- /dev/null +++ b/eval/src/tests/eval/feature_name_extractor/CMakeLists.txt @@ -0,0 +1,8 @@ +# Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +vespa_add_executable(eval_name_extractor_test_app TEST + SOURCES + feature_name_extractor_test.cpp + DEPENDS + vespaeval +) +vespa_add_test(NAME eval_name_extractor_test_app COMMAND eval_name_extractor_test_app) diff --git a/eval/src/tests/eval/feature_name_extractor/feature_name_extractor_test.cpp b/eval/src/tests/eval/feature_name_extractor/feature_name_extractor_test.cpp new file mode 100644 index 00000000000..3acf1ee2142 --- /dev/null +++ b/eval/src/tests/eval/feature_name_extractor/feature_name_extractor_test.cpp @@ -0,0 +1,79 @@ +// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +#include <vespa/vespalib/testkit/test_kit.h> +#include <vespa/eval/eval/feature_name_extractor.h> + +using vespalib::eval::FeatureNameExtractor; + +void verify_extract(const vespalib::string &input, + const vespalib::string &expect_symbol, + const vespalib::string &expect_after) +{ + FeatureNameExtractor extractor; + const char *pos_in = input.data(); + const char *end_in = input.data() + input.size(); + vespalib::string symbol_out; + const char *pos_out = nullptr; + extractor.extract_symbol(pos_in, end_in, pos_out, symbol_out); + ASSERT_TRUE(pos_out != nullptr); + vespalib::string after(pos_out, end_in); + EXPECT_EQUAL(expect_symbol, symbol_out); + EXPECT_EQUAL(expect_after, after); +} + +TEST("require that basic names are extracted correctly") { + TEST_DO(verify_extract("foo+", "foo", "+")); + TEST_DO(verify_extract("foo.out+", "foo.out", "+")); + TEST_DO(verify_extract("foo(p1,p2)+", "foo(p1,p2)", "+")); + TEST_DO(verify_extract("foo(p1,p2).out+", "foo(p1,p2).out", "+")); +} + +TEST("require that special characters are allowed in prefix and suffix") { + TEST_DO(verify_extract("_@$+", "_@$", "+")); + TEST_DO(verify_extract("_@$.$@_+", "_@$.$@_", "+")); + TEST_DO(verify_extract("_@$(p1,p2)+", "_@$(p1,p2)", "+")); + TEST_DO(verify_extract("_@$(p1,p2).$@_+", "_@$(p1,p2).$@_", "+")); +} + +TEST("require that dot is only allowed in suffix") { + TEST_DO(verify_extract("foo.bar+", "foo.bar", "+")); + TEST_DO(verify_extract("foo.bar.out+", "foo.bar.out", "+")); + TEST_DO(verify_extract("foo.bar(p1,p2)+", "foo.bar", "(p1,p2)+")); + TEST_DO(verify_extract("foo.bar(p1,p2).out+", "foo.bar", "(p1,p2).out+")); + TEST_DO(verify_extract("foo(p1,p2).out.bar+", "foo(p1,p2).out.bar", "+")); +} + +TEST("require that parameters can be nested") { + TEST_DO(verify_extract("foo(p1(a,b),p2(c,d(e,f))).out+", "foo(p1(a,b),p2(c,d(e,f))).out", "+")); +} + +TEST("require that space is allowed among parameters") { + TEST_DO(verify_extract("foo( p1 ( a , b ) ).out+", "foo( p1 ( a , b ) ).out", "+")); +} + +TEST("require that space is now allowed outside parameters") { + TEST_DO(verify_extract("foo +", "foo", " +")); + TEST_DO(verify_extract("foo . out+", "foo", " . out+")); + TEST_DO(verify_extract("foo. out+", "foo.", " out+")); + TEST_DO(verify_extract("foo (p1,p2)+", "foo", " (p1,p2)+")); + TEST_DO(verify_extract("foo(p1,p2) +", "foo(p1,p2)", " +")); + TEST_DO(verify_extract("foo(p1,p2) .out+", "foo(p1,p2)", " .out+")); + TEST_DO(verify_extract("foo(p1,p2).out +", "foo(p1,p2).out", " +")); +} + +TEST("require that parameters can be scientific numbers") { + TEST_DO(verify_extract("foo(1.3E+3,-1.9e-10).out+", "foo(1.3E+3,-1.9e-10).out", "+")); +} + +TEST("require that quoted parenthesis are not counted") { + TEST_DO(verify_extract("foo(a,b,\")\").out+", "foo(a,b,\")\").out", "+")); +} + +TEST("require that escaped quotes does not unquote") { + TEST_DO(verify_extract("foo(a,b,\"\\\")\").out+", "foo(a,b,\"\\\")\").out", "+")); +} + +TEST("require that escaped escape does not hinder unquote") { + TEST_DO(verify_extract("foo(a,b,\"\\\\\")\").out+", "foo(a,b,\"\\\\\")", "\").out+")); +} + +TEST_MAIN() { TEST_RUN_ALL(); } diff --git a/eval/src/vespa/eval/eval/CMakeLists.txt b/eval/src/vespa/eval/eval/CMakeLists.txt index 639ac3b5864..8271c7a4bed 100644 --- a/eval/src/vespa/eval/eval/CMakeLists.txt +++ b/eval/src/vespa/eval/eval/CMakeLists.txt @@ -13,6 +13,7 @@ vespa_add_library(eval_eval OBJECT fast_addr_map.cpp fast_forest.cpp fast_value.cpp + feature_name_extractor.cpp function.cpp gbdt.cpp int8float.cpp diff --git a/eval/src/vespa/eval/eval/feature_name_extractor.cpp b/eval/src/vespa/eval/eval/feature_name_extractor.cpp new file mode 100644 index 00000000000..f613d026d03 --- /dev/null +++ b/eval/src/vespa/eval/eval/feature_name_extractor.cpp @@ -0,0 +1,82 @@ +// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#include "feature_name_extractor.h" + +namespace vespalib::eval { + +namespace { + +struct LegalChar { + bool legal[256]; + LegalChar(std::initializer_list<uint8_t> extra_chars) { + for (int c = 0; c < 256; ++c) { + legal[c] = isalnum(c); + } + for (uint8_t c: extra_chars) { + legal[c] = true; + } + } + bool is_legal(uint8_t c) { return legal[c]; } +}; + +static LegalChar prefix({'_', '$', '@'}); +static LegalChar suffix({'_', '.', '$', '@'}); + +struct CountParen { + size_t depth = 0; + bool quoted = false; + bool escaped = false; + bool done(char c) { + if (quoted) { + if (escaped) { + escaped = false; + } else { + if (c == '\\') { + escaped = true; + } else if (c == '"') { + quoted = false; + } + } + } else { + if (c == '"') { + quoted = true; + } else if (c == '(') { + ++depth; + } else if (c == ')') { + if (--depth == 0) { + return true; + } + } + } + return false; + } +}; + +} // namespace <unnamed> + +void +FeatureNameExtractor::extract_symbol(const char *pos_in, const char *end_in, + const char *&pos_out, vespalib::string &symbol_out) const +{ + while ((pos_in < end_in) && prefix.is_legal(*pos_in)) { + symbol_out.push_back(*pos_in++); + } + if ((pos_in < end_in) && (*pos_in == '(')) { + CountParen paren; + while (pos_in < end_in) { + symbol_out.push_back(*pos_in); + if (paren.done(*pos_in++)) { + break; + } + } + } + if ((pos_in < end_in) && (*pos_in == '.')) { + symbol_out.push_back(*pos_in++); + while ((pos_in < end_in) && suffix.is_legal(*pos_in)) { + symbol_out.push_back(*pos_in++); + } + } + pos_out = pos_in; +} + +} diff --git a/eval/src/vespa/eval/eval/feature_name_extractor.h b/eval/src/vespa/eval/eval/feature_name_extractor.h new file mode 100644 index 00000000000..b3a9e0567a3 --- /dev/null +++ b/eval/src/vespa/eval/eval/feature_name_extractor.h @@ -0,0 +1,18 @@ +// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. + +#pragma once + +#include "function.h" + +namespace vespalib::eval { + +/** + * Custom symbol extractor used to extract ranking feature names when + * parsing ranking expressions. + **/ +struct FeatureNameExtractor : public vespalib::eval::SymbolExtractor { + void extract_symbol(const char *pos_in, const char *end_in, + const char *&pos_out, vespalib::string &symbol_out) const override; +}; + +} |