Support 'id' as field name in C++ document selection lexing/parsing

'id' is normally a reserved keyword and wasn't explicitly allowed for the purpose of field name identifiers. Change `ID` to be lexed as a string token to allow for preserving the original casing when used as an identifier.
author: Tor Brede Vekterli <vekterli@verizonmedia.com> 2020-09-09 12:14:34 +0000
committer: Tor Brede Vekterli <vekterli@verizonmedia.com> 2020-09-09 12:33:16 +0000
commit: a77bd571f62a602f45c2da58ce5f52d89265646d (patch)
tree: cce6ef523f58e148acea8de9430004d2b0d448cc /document
parent: 95197a41a32a6d28c016c0e7c12b53705283a669 (diff)
3 files changed, 29 insertions, 11 deletions
diff --git a/document/src/tests/documentselectparsertest.cpp b/document/src/tests/documentselectparsertest.cpp
index 6fd9ab80faa..30b2bfbb1b4 100644
--- a/document/src/tests/documentselectparsertest.cpp
+++ b/document/src/tests/documentselectparsertest.cpp
@@ -94,9 +94,11 @@ void DocumentSelectParserTest::SetUp()
                      Struct("usergroup.header"),
                      Struct("usergroup.body"));
     builder.document(875463456, "user",
-                     Struct("user.header"), Struct("user.body"));
+                     Struct("user.header").addField("id", DataType::T_INT),
+                     Struct("user.body"));
     builder.document(567463442, "group",
-                     Struct("group.header"), Struct("group.body"));
+                     Struct("group.header").addField("iD", DataType::T_INT),
+                     Struct("group.body"));
     _repo = std::make_unique<DocumentTypeRepo>(builder.config());
 
     _parser = std::make_unique<select::Parser>(*_repo, _bucketIdFactory);
@@ -1455,6 +1457,9 @@ TEST_F(DocumentSelectParserTest, special_tokens_are_allowed_as_freestanding_iden
     EXPECT_EQ("(== (ID id.user) (FIELD user user))", parse_to_tree("id.user == user.user"));
     EXPECT_EQ("(NOT (DOCTYPE group))", parse_to_tree("not group"));
     EXPECT_EQ("(== (ID id.group) (FIELD group group))", parse_to_tree("id.group == group.group"));
+    EXPECT_EQ("(== (FIELD user id) (ID id.user))", parse_to_tree("user.id == id.user"));
+    // Case is preserved for special ID field
+    EXPECT_EQ("(== (FIELD group iD) (ID id.user))", parse_to_tree("group.iD == id.user"));
 }
 
 TEST_F(DocumentSelectParserTest, test_can_build_field_value_from_field_expr_node)
diff --git a/document/src/vespa/document/select/grammar/lexer.ll b/document/src/vespa/document/select/grammar/lexer.ll
index 1222aac02a2..d52cf0db7a9 100644
--- a/document/src/vespa/document/select/grammar/lexer.ll
+++ b/document/src/vespa/document/select/grammar/lexer.ll
@@ -119,7 +119,6 @@ SQ_STRING \'(\\([\\tnfr']|x{HEXDIGIT}{2})|[^'\\])*\'
 \[{WS}*(${IDCHARS}|{DECIMAL}){WS}*\]                      STRING_TOKEN(FP_ARRAY_LOOKUP)
 
  /* Primary tokens are case insensitive */
-(?i:"id")    NAMED_TOKEN(ID)
 (?i:"null")  NAMED_TOKEN(NULL)
 (?i:"true")  NAMED_TOKEN(TRUE)
 (?i:"false") NAMED_TOKEN(FALSE)
@@ -128,6 +127,7 @@ SQ_STRING \'(\\([\\tnfr']|x{HEXDIGIT}{2})|[^'\\])*\'
 (?i:"not")   NAMED_TOKEN(NOT)
 
  /* We expose the verbatim input as the token value, as these may also be used for identifiers... */
+(?i:"id")        STRING_TOKEN(ID)
 (?i:"user")      STRING_TOKEN(USER)
 (?i:"group")     STRING_TOKEN(GROUP)
 (?i:"scheme")    STRING_TOKEN(SCHEME)
diff --git a/document/src/vespa/document/select/grammar/parser.yy b/document/src/vespa/document/select/grammar/parser.yy
index 9d5b5825330..9e4a1d1a222 100644
--- a/document/src/vespa/document/select/grammar/parser.yy
+++ b/document/src/vespa/document/select/grammar/parser.yy
@@ -50,7 +50,6 @@
 %token LE    "<="
 %token GT    ">"
 %token LT    "<"
-%token ID
 %token NOW_FUNC
 
  /*
@@ -73,7 +72,7 @@
 %token <string_val>  FP_MAP_LOOKUP FP_ARRAY_LOOKUP
 %token <double_val>  FLOAT
 %token <i64_val>     INTEGER
-%token <string_val>  USER GROUP SCHEME NAMESPACE SPECIFIC BUCKET GID TYPE
+%token <string_val>  ID USER GROUP SCHEME NAMESPACE SPECIFIC BUCKET GID TYPE
 
 %type <string_val> ident mangled_ident
 %type <abstract_node> bool_
@@ -84,7 +83,7 @@
 %type <field_expr_node> field_spec
 
 %destructor { delete $$; } IDENTIFIER STRING FP_MAP_LOOKUP FP_ARRAY_LOOKUP
-%destructor { delete $$; } USER GROUP SCHEME NAMESPACE SPECIFIC BUCKET GID TYPE
+%destructor { delete $$; } ID USER GROUP SCHEME NAMESPACE SPECIFIC BUCKET GID TYPE
 %destructor { delete $$; } null_ bool_ number string doc_type ident id_arg id_spec
 %destructor { delete $$; } variable mangled_ident field_spec value arith_expr
 %destructor { delete $$; } comparison leaf logical_expr expression
@@ -244,9 +243,18 @@ id_arg
     ;
 
 id_spec
-    : ID %prec NON_DOT { $$ = new IdValueNode(bucket_id_factory, "id", ""); } /* Prefer shifting instead of reducing */
-    | ID "." id_arg    { $$ = new IdValueNode(bucket_id_factory, "id", *steal<string>($3)); }
-    | ID "." IDENTIFIER "(" ")" { $$ = new FunctionValueNode(*steal<string>($3), std::make_unique<IdValueNode>(bucket_id_factory, "id", "")); }
+    : ID %prec NON_DOT {
+          (void)steal<string>($1); // Explicitly discard.
+          $$ = new IdValueNode(bucket_id_factory, "id", ""); // Prefer shifting instead of reducing.
+      }
+    | ID "." id_arg {
+          (void)steal<string>($1); // Explicitly discard.
+          $$ = new IdValueNode(bucket_id_factory, "id", *steal<string>($3));
+      }
+    | ID "." IDENTIFIER "(" ")" {
+          (void)steal<string>($1);  // Explicitly discard.
+          $$ = new FunctionValueNode(*steal<string>($3), std::make_unique<IdValueNode>(bucket_id_factory, "id", ""));
+      }
     ;
 
 variable
@@ -254,12 +262,17 @@ variable
     ;
 
  /* FIXME this is a horrible leftover of post-parsed fieldpath processing */
- /* At least we verify structural integrity at initial parse-time now... */
- /* Post-parsing should be replaced with an actual parse-time built AST! */
+ /* At least we verify structural integrity at initial parse-time now...   */
+ /* Post-parsing should be replaced with an actual parse-time built AST!   */
+ /* This rule is only used after matching an initial valid identifier, so  */
+ /* we add some special casing of lexer keywords that today are allowed as */
+ /* regular field names (but not as document type names). Not pretty, but  */
+ /* it avoids parser ambiguities.                                          */
 mangled_ident
     : ident                         { $$ = $1; }
     | mangled_ident FP_MAP_LOOKUP   { $1->append(*steal<string>($2)); $$ = $1; }
     | mangled_ident FP_ARRAY_LOOKUP { $1->append(*steal<string>($2)); $$ = $1; }
+    | ID                            { $$ = $1; }
     ;
 
 field_spec
author	Tor Brede Vekterli <vekterli@verizonmedia.com>	2020-09-09 12:14:34 +0000
committer	Tor Brede Vekterli <vekterli@verizonmedia.com>	2020-09-09 12:33:16 +0000
commit	a77bd571f62a602f45c2da58ce5f52d89265646d (patch)
tree	cce6ef523f58e148acea8de9430004d2b0d448cc /document
parent	95197a41a32a6d28c016c0e7c12b53705283a669 (diff)