summaryrefslogtreecommitdiffstats
path: root/integration/intellij/src/main/jflex
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2021-11-18 23:04:30 +0100
committerJon Bratseth <bratseth@gmail.com>2021-11-18 23:04:30 +0100
commitc978c9e29652b24b7f31ed545c1c0e48a17464ec (patch)
treec347e96d74bcb9d180346d90385a2dceb6fbcad5 /integration/intellij/src/main/jflex
parent28b80bf7669ff14f1af913ef7bcee8659ac555a2 (diff)
Move and rename
Diffstat (limited to 'integration/intellij/src/main/jflex')
-rw-r--r--integration/intellij/src/main/jflex/ai/vespa/intellij/schema/lexer/sd.flex239
1 files changed, 239 insertions, 0 deletions
diff --git a/integration/intellij/src/main/jflex/ai/vespa/intellij/schema/lexer/sd.flex b/integration/intellij/src/main/jflex/ai/vespa/intellij/schema/lexer/sd.flex
new file mode 100644
index 00000000000..b4491acc717
--- /dev/null
+++ b/integration/intellij/src/main/jflex/ai/vespa/intellij/schema/lexer/sd.flex
@@ -0,0 +1,239 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package ai.vespa.intellij.schema.lexer;
+
+import com.intellij.lexer.FlexLexer;
+import com.intellij.psi.tree.IElementType;
+import com.intellij.ui.components.MultiColumnList;
+import ai.vespa.intellij.schema.psi.SdTokenType;
+
+import static ai.vespa.intellij.schema.psi.SdTypes.*; // That is the class which is specified as `elementTypeHolderClass` in bnf
+ // grammar file. This will contain all other tokens which we will use.
+import static com.intellij.psi.TokenType.BAD_CHARACTER; // Pre-defined bad character token.
+import static com.intellij.psi.TokenType.WHITE_SPACE; // Pre-defined whitespace character token.
+
+/*
+ * Vespa schema parser lexer
+ *
+ * @author Shahar Ariel
+ */
+
+%%
+// Generated scanner: public class SdLexer implementing FlexLexer; scanning method advance() returns IElementType; Unicode input.
+%public
+%class SdLexer
+%implements FlexLexer
+%function advance
+%type IElementType
+%unicode
+
+//**--------- REGEXES ---------**//
+// Macros used as {NAME} by the rules below. ID: letter or '_' first, then letters, digits, '_'; ID_WITH_DASH also allows '-' after the first char.
+ID=[a-zA-Z_][a-zA-Z0-9_]*
+ID_WITH_DASH = [a-zA-Z_][a-zA-Z0-9_-]*
+// A run of spaces, tabs, newlines, vertical tabs (\x0B), form feeds or carriage returns; returned as a single WHITE_SPACE token.
+WHITE_SPACE=[ \t\n\x0B\f\r]+
+// COMMENT: '#' to end of line. FLOAT: digits, '.', digits, then an optional bare 'e' (no exponent digits). STRING: double-quoted, with backslash escapes.
+COMMENT=#.*
+SYMBOL= [!$|:{}(),.\[\]]
+INTEGER = [0-9]+
+FLOAT = {INTEGER}[.][0-9]+[e]?
+COMPARISON_OPERATOR = [<>]|(==)|(<=)|(>=)|(\~=)
+ARITHMETIC_OPERATOR = [\-+*/]
+STRING = \"([^\"\\]*(\\.[^\"\\]*)*)\"
+WORD = \w+
+
+
+%%
+
+<YYINITIAL> {
+ /**
+ In here, we match keywords. So if a keyword is found, this returns a token which corresponds to that keyword.
+ These tokens are generated using the 'sd.bnf' file and located in the SdTypes class.
+ The parser uses these return values to match a token sequence to a parser rule.
+ */
+
+ /**
+ This list of keywords has to be synchronized with sd.bnf file. If you add a keyword here, you should add it to the
+ sd.bnf file as well (to the rule KeywordOrIdentifier / KeywordNotIdentifier).
+ */
+
+ "search" { return SEARCH; }
+ "schema" { return SCHEMA; }
+ "document" { return DOCUMENT; }
+ "inherits" { return INHERITS; }
+ "struct" { return STRUCT; }
+ "field" { return FIELD; }
+ "type" { return TYPE; }
+ "struct-field" { return STRUCT_FIELD; }
+ "match" { return MATCH; }
+
+ "indexing" { return INDEXING; }
+ "summary" { return SUMMARY; }
+ "attribute" { return ATTRIBUTE; }
+ "set_language" { return SET_LANGUAGE; }
+
+ "array" { return ARRAY; }
+ "raw" { return RAW; }
+ "uri" { return URI; }
+ "reference" { return REFERENCE; }
+ "annotationreference" { return ANNOTATIONREFERENCE; }
+ "weightedset" { return WEIGHTEDSET; }
+ "map" { return MAP; }
+
+ "text" { return TEXT; }
+ "exact" { return EXACT; }
+ "exact-terminator" { return EXACT_TERMINATOR; }
+ "word" { return WORD; }
+ "prefix" { return PREFIX; }
+ "cased" { return CASED; }
+ "uncased" { return UNCASED; }
+ "substring" { return SUBSTRING; }
+ "suffix" { return SUFFIX; }
+ "max-length" { return MAX_LENGTH; }
+ "gram" { return GRAM; }
+ "gram-size" { return GRAM_SIZE; }
+
+ "fast-search" { return FAST_SEARCH; }
+ "fast-access" { return FAST_ACCESS; }
+ "alias" { return ALIAS; }
+ "sorting" { return SORTING; }
+ "uca" { return UCA; }
+ "lowercase" { return LOWERCASE; }
+ "paged" { return PAGED; }
+ "strength" { return STRENGTH; }
+ "primary" { return PRIMARY; }
+ "secondary" { return SECONDARY; }
+ "tertiary" { return TERTIARY; }
+ "quaternary" { return QUATERNARY; }
+ "identical" { return IDENTICAL; }
+ "distance-metric" { return DISTANCE_METRIC; }
+
+ "rank" { return RANK; }
+ "filter" { return FILTER; }
+ "normal" { return NORMAL; }
+ "literal" { return LITERAL; }
+ "indexing-rewrite" { return INDEXING_REWRITE; }
+ "none" { return NONE; }
+ "query-command" { return QUERY_COMMAND; }
+ "full" { return FULL; }
+ "static" { return STATIC; }
+ "dynamic" { return DYNAMIC; }
+ "source" { return SOURCE; }
+ "to" { return TO; }
+ "matched-elements-only" { return MATCHED_ELEMENTS_ONLY; }
+
+ "input" { return INPUT; }
+ "mutable" { return MUTABLE; }
+ "enable-bit-vectors" { return ENABLE_BIT_VECTORS; }
+ "enable-only-bit-vector" { return ENABLE_ONLY_BIT_VECTOR; }
+ "document-summary" { return DOCUMENT_SUMMARY; }
+ "from-disk" { return FROM_DISK; }
+ "omit-summary-features" { return OMIT_SUMMARY_FEATURES; }
+ "import" { return IMPORT; }
+ "as" { return AS; }
+
+ "rank-profile" { return RANK_PROFILE; }
+ "model" { return MODEL; }
+ "match-phase" { return MATCH_PHASE; }
+ "order" { return ORDER; }
+ "ascending" { return ASCENDING; }
+ "descending" { return DESCENDING; }
+ "locale" { return LOCALE; }
+ "max-hits" { return MAX_HITS; }
+ "diversity" { return DIVERSITY; }
+ "min-groups" { return MIN_GROUPS; }
+ "cutoff-factor" { return CUTOFF_FACTOR; }
+ "cutoff-strategy" { return CUTOFF_STRATEGY; }
+ "loose" { return LOOSE; }
+ "strict" { return STRICT; }
+ "rank-properties" { return RANK_PROPERTIES; }
+
+ "first-phase" { return FIRST_PHASE; }
+ "keep-rank-count" { return KEEP_RANK_COUNT; }
+ "rank-score-drop-limit" { return RANK_SCORE_DROP_LIMIT; }
+ "expression" { return EXPRESSION; }
+ "file" { return FILE; }
+ "num-threads-per-search" { return NUM_THREADS_PER_SEARCH; }
+ "termwise-limit" { return TERMWISE_LIMIT; }
+ "ignore-default-rank-features" { return IGNORE_DEFAULT_RANK_FEATURES; }
+ "min-hits-per-thread" { return MIN_HITS_PER_THREAD; }
+ "num-search-partition" { return NUM_SEARCH_PARTITION; }
+ "constants" { return CONSTANTS; }
+ "second-phase" { return SECOND_PHASE; }
+ "rerank-count" { return RERANK_COUNT; }
+ "rank-features" { return RANK_FEATURES; }
+
+ "weight" { return WEIGHT; }
+ "index" { return INDEX; }
+ "bolding" { return BOLDING; }
+ "on" { return ON; }
+ "off" { return OFF; }
+ "true" { return TRUE; }
+ "false" { return FALSE; }
+ "id" { return ID; }
+ "normalizing" { return NORMALIZING; }
+ "stemming" { return STEMMING; }
+ "arity" { return ARITY; }
+ "lower-bound" { return LOWER_BOUND; }
+ "upper-bound" { return UPPER_BOUND; }
+ "dense-posting-list-threshold" { return DENSE_POSTING_LIST_THRESHOLD; }
+ "enable-bm25" { return ENABLE_BM25; }
+ "hnsw" { return HNSW; }
+ "max-links-per-node" { return MAX_LINKS_PER_NODE; }
+ "neighbors-to-explore-at-insert" { return NEIGHBORS_TO_EXPLORE_AT_INSERT; }
+ "multi-threaded-indexing" { return MULTI_THREADED_INDEXING; }
+ "create-if-nonexistent" { return CREATE_IF_NONEXISTENT; }
+ "remove-if-zero" { return REMOVE_IF_ZERO; }
+ "dictionary" { return DICTIONARY; }
+ "hash" { return HASH; }
+ "btree" { return BTREE; }
+
+ "fieldset" { return FIELDSET; }
+ "fields" { return FIELDS; }
+ "constant" { return CONSTANT; }
+ "output" { return OUTPUT; }
+
+ "annotation" { return ANNOTATION; }
+ "rank-type" { return RANK_TYPE; }
+ "onnx-model" { return ONNX_MODEL; }
+ "raw-as-base64-in-summary" { return RAW_AS_BASE64_IN_SUMMARY; }
+ "on-match" { return ON_MATCH; }
+ "on-rank" { return ON_RANK; }
+ "on-summary" { return ON_SUMMARY; }
+
+ "function" { return FUNCTION; }
+ "macro" { return MACRO; }
+ "inline" { return INLINE; }
+
+ "summary-features" { return SUMMARY_FEATURES; }
+ "match-features" { return MATCH_FEATURES; }
+
+ "body" { return BODY; }
+ "header" { return HEADER; }
+ "summary-to" { return SUMMARY_TO; }
+
+ "evaluation-point" { return EVALUATION_POINT; }
+ "pre-post-filter-tipping-point" { return PRE_POST_FILTER_TIPPING_POINT; }
+
+ // In here, we check for character sequences which match the regular expressions defined above.
+ // JFlex takes the longest match and, on a tie, the rule listed first: so the keywords above win over {ID},
+ // {ID} wins over {ID_WITH_DASH} and {WORD}, and {INTEGER} wins over {WORD}. Keep this order when adding rules.
+ {ID} { return ID_REG; }
+ {ID_WITH_DASH} { return ID_WITH_DASH_REG; }
+
+ {WHITE_SPACE} { return WHITE_SPACE; }
+
+ {COMMENT} { return COMMENT; }
+ {SYMBOL} { return SYMBOL; }
+ {INTEGER} { return INTEGER_REG; }
+ {FLOAT} { return FLOAT_REG; }
+ {ARITHMETIC_OPERATOR} { return ARITHMETIC_OPERATOR; }
+ {COMPARISON_OPERATOR} { return COMPARISON_OPERATOR; }
+ {WORD} { return WORD_REG; }
+ {STRING} { return STRING_REG; }
+
+}
+
+// Catch-all: any single character (newlines included) that none of the rules above matched is returned as
+// BAD_CHARACTER, which indicates an error in the character sequence. This is used to highlight errors.
+[^] { return BAD_CHARACTER; }
\ No newline at end of file