diff options
Diffstat (limited to 'indexinglanguage/src/main/javacc/IndexingParser.jj')
-rw-r--r-- | indexinglanguage/src/main/javacc/IndexingParser.jj | 822 |
1 files changed, 822 insertions, 0 deletions
// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
// --------------------------------------------------------------------------------
//
// JavaCC options.
//
// --------------------------------------------------------------------------------
options {
    CACHE_TOKENS     = false;
    DEBUG_PARSER     = false;
    ERROR_REPORTING  = true;
    STATIC           = false;
    USER_CHAR_STREAM = true;
}

// --------------------------------------------------------------------------------
//
// Parser body.
//
// --------------------------------------------------------------------------------
PARSER_BEGIN(IndexingParser)

package com.yahoo.vespa.indexinglanguage.parser;

import java.math.BigDecimal;
import java.math.BigInteger;
import java.util.List;
import java.util.LinkedList;
import java.util.Map;
import java.util.LinkedHashMap;

import com.yahoo.collections.Pair;
import com.yahoo.document.datatypes.*;
import com.yahoo.text.StringUtilities;
import com.yahoo.vespa.indexinglanguage.expressions.*;
import com.yahoo.vespa.indexinglanguage.linguistics.AnnotatorConfig;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.Linguistics;

/**
 * Parser for indexing language expressions, generated by JavaCC from this grammar.
 * The setters return {@code this} so that configuration calls can be chained.
 *
 * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
 * @version $Id$
 */
public class IndexingParser {

    /** Field name used by productions whose field name argument is optional and omitted. */
    private String defaultFieldName;

    /** Linguistics instance handed to the expressions that are constructed with one. */
    private Linguistics linguistics;

    /** Annotator configuration used by 'tokenize', and as the base for its inline overrides. */
    private AnnotatorConfig annotatorCfg;

    /**
     * Creates a parser that reads from the given string.
     *
     * @param str the text to parse
     */
    public IndexingParser(String str) {
        this(new IndexingInput(str));
    }

    /**
     * Sets the field name to use when an expression omits its optional field name.
     *
     * @param fieldName the default field name
     * @return this, to allow chaining
     */
    public IndexingParser setDefaultFieldName(String fieldName) {
        defaultFieldName = fieldName;
        return this;
    }

    /**
     * Sets the linguistics instance passed on to the parsed expressions that take one.
     *
     * @param linguistics the linguistics instance
     * @return this, to allow chaining
     */
    public IndexingParser setLinguistics(Linguistics linguistics) {
        this.linguistics = linguistics;
        return this;
    }

    /**
     * Sets the annotator configuration used by 'tokenize' expressions.
     *
     * @param cfg the annotator configuration
     * @return this, to allow chaining
     */
    public IndexingParser setAnnotatorConfig(AnnotatorConfig cfg) {
        annotatorCfg = cfg;
        return this;
    }

    /**
     * Converts the image of a DOUBLE token, optionally prefixed by a sign, to a field value.
     *
     * @param str the literal to convert
     * @return the corresponding double field value
     */
    private static FieldValue parseDouble(String str) {
        return new DoubleFieldValue(new BigDecimal(str).doubleValue());
    }

    /**
     * Converts the image of a FLOAT token, optionally prefixed by a sign, to a field value.
     * A trailing 'f' or 'F' type suffix is dropped before conversion.
     *
     * @param str the literal to convert
     * @return the corresponding float field value
     */
    private static FieldValue parseFloat(String str) {
        if (str.endsWith("f") || str.endsWith("F")) {
            str = str.substring(0, str.length() - 1);
        }
        return new FloatFieldValue(new BigDecimal(str).floatValue());
    }

    /**
     * Converts the image of an INTEGER token, optionally prefixed by a sign, to a field value.
     * Both decimal and hexadecimal ("0x" / "0X") literals are accepted.
     *
     * @param str the literal to convert
     * @return the corresponding integer field value
     */
    private static FieldValue parseInteger(String str) {
        return new IntegerFieldValue(parseBigInteger(str).intValue());
    }

    /**
     * Converts the image of a LONG token, optionally prefixed by a sign, to a field value.
     * A trailing 'l' or 'L' type suffix is dropped before conversion; both decimal and
     * hexadecimal ("0x" / "0X") literals are accepted.
     *
     * @param str the literal to convert
     * @return the corresponding long field value
     */
    private static FieldValue parseLong(String str) {
        if (str.endsWith("l") || str.endsWith("L")) {
            str = str.substring(0, str.length() - 1);
        }
        return new LongFieldValue(parseBigInteger(str).longValue());
    }

    /**
     * Parses an optionally signed decimal or hexadecimal integer literal.
     *
     * The sign is split off before the radix prefix is inspected, because the grammar
     * prepends "-" to the token image, which would otherwise hide the prefix. The prefix
     * is matched in both cases because the INTEGER token accepts "0x" as well as "0X".
     *
     * @param str the literal, without any type suffix
     * @return the parsed value
     * @throws NumberFormatException if the literal is not a valid integer
     */
    private static BigInteger parseBigInteger(String str) {
        boolean negative = str.startsWith("-");
        String digits = (negative || str.startsWith("+")) ? str.substring(1) : str;
        BigInteger val;
        if (digits.startsWith("0x") || digits.startsWith("0X")) {
            val = new BigInteger(digits.substring(2), 16);
        } else {
            val = new BigInteger(digits);
        }
        return negative ? val.negate() : val;
    }
}

PARSER_END(IndexingParser)

// Whitespace other than newline is skipped; newline is a token of its own (NL).
SKIP :
{
    " " | "\t" | "\r" | "\f"
}

// Comments run from '#' to the end of the line.
SPECIAL_TOKEN :
{
    <COMMENT: "#" (~["\n","\r"])* >
}

// Numeric literals.
TOKEN :
{
    <INTEGER: (["0"-"9"])+ | ("0" ["x","X"] (["0"-"9","a"-"f","A"-"F"])+)> |
    <LONG: <INTEGER> ["l","L"]> |
    <DOUBLE: (["0"-"9"])+ ("." (["0"-"9"])*)? (["e","E"] (["+","-"])? (["0"-"9"])+)?> |
    <FLOAT: <DOUBLE> ["f", "F"]>
}

// Operators, punctuation, string literals, keywords and identifiers.
TOKEN :
{
    <NL: "\n"> |
    <ADD: "+"> |
    <SUB: "-"> |
    <MUL: "*"> |
    <DIV: "/"> |
    <MOD: "%"> |
    <EQ: "=="> |
    <NE: "!="> |
    <LT: "<"> |
    <LE: "<="> |
    <GT: ">"> |
    <GE: ">="> |
    <PIPE: "|"> |
    <LCURLY: "{"> |
    <RCURLY: "}"> |
    <LPAREN: "("> |
    <RPAREN: ")"> |
    <DOT: "."> |
    <COMMA: ","> |
    <COLON: ":"> |
    <SCOLON: ";"> |
    <STRING: ("\"" (~["\""] | "\\\"")* "\"") |
             ("'" (~["'"] | "\\'")* "'")> |
    <ATTRIBUTE: "attribute"> |
    <BASE64_DECODE: "base64decode"> |
    <BASE64_ENCODE: "base64encode"> |
    <CASE: "case"> |
    <CASE_DEFAULT: "default"> |
    <CLEAR_STATE: "clear_state"> |
    <CREATE_IF_NON_EXISTENT: "create_if_non_existent"> |
    <ECHO: "echo"> |
    <ELSE: "else"> |
    <EXACT: "exact"> |
    <FLATTEN: "flatten"> |
    <FOR_EACH: "for_each"> |
    <GET_FIELD: "get_field"> |
    <GET_VAR: "get_var"> |
    <GUARD: "guard"> |
    <HEX_DECODE: "hexdecode"> |
    <HEX_ENCODE: "hexencode"> |
    <HOST_NAME: "hostname"> |
    <IF: "if"> |
    <INDEX: "index"> |
    <INPUT: "input"> |
    <JOIN: "join"> |
    <LOWER_CASE: "lowercase"> |
    <NGRAM: "ngram"> |
    <NORMALIZE: "normalize"> |
    <NOW: "now"> |
    <OPTIMIZE_PREDICATE: "optimize_predicate"> |
    <PASSTHROUGH: "passthrough"> |
    <RANDOM: "random"> |
    <REMOVE_IF_ZERO: "remove_if_zero"> |
    <SELECT_INPUT: "select_input"> |
    <SET_LANGUAGE: "set_language"> |
    <SET_VAR: "set_var"> |
    <SPLIT: "split"> |
    <STEM: "stem"> |
    <SUBSTRING: "substring"> |
    <SUMMARY: "summary"> |
    <SWITCH: "switch"> |
    <THIS: "this"> |
    <TOKENIZE: "tokenize"> |
    <TO_ARRAY: "to_array"> |
    <TO_BYTE: "to_byte"> |
    <TO_DOUBLE: "to_double"> |
    <TO_FLOAT: "to_float"> |
    <TO_INT: "to_int"> |
    <TO_LONG: "to_long"> |
    <TO_POS: "to_pos"> |
    <TO_STRING: "to_string"> |
    <TO_WSET: "to_wset"> |
    <TRIM: "trim"> |
    <ZCURVE: "zcurve"> |
    <IDENTIFIER: ["a"-"z","A"-"Z", "_"] (["a"-"z","A"-"Z","0"-"9","_","-"])*>
}

// --------------------------------------------------------------------------------
//
// Production rules.
//
// --------------------------------------------------------------------------------

// Entry point: a single statement with an optional trailing semicolon. Expression
// lists that hold exactly one element are unwrapped repeatedly, so the innermost
// such element is what gets returned.
Expression root() :
{
    Expression parsed;
}
{
    ( parsed = statement() [ <SCOLON> ] )
    {
        while (parsed instanceof ExpressionList) {
            if (((ExpressionList)parsed).size() != 1) {
                break;
            }
            parsed = ((ExpressionList)parsed).get(0);
        }
        return parsed;
    }
}

// A brace-enclosed sequence of statements separated by semicolons. Newlines may
// surround each statement, and a semicolon need not be followed by a statement.
ScriptExpression script() :
{
    StatementExpression stmt;
    List<StatementExpression> statements = new LinkedList<StatementExpression>();
}
{
    ( <LCURLY> nl() stmt = statement() { statements.add(stmt); } nl()
      ( <SCOLON> nl() [ stmt = statement() { statements.add(stmt); } nl() ] )* <RCURLY> )
    { return new ScriptExpression(statements); }
}

// One or more expressions separated by '|'; newlines are allowed after each '|'.
StatementExpression statement() :
{
    Expression stage;
    List<Expression> stages = new LinkedList<Expression>();
}
{
    ( stage = expression() { stages.add(stage); }
      ( <PIPE> nl() stage = expression() { stages.add(stage); } )* )
    { return new StatementExpression(stages); }
}

// One or more math expressions separated by '.'. A lone operand is returned as is,
// several operands are combined into a CatExpression.
Expression expression() :
{
    Expression part;
    List<Expression> parts = new LinkedList<Expression>();
}
{
    ( part = math() { parts.add(part); } ( <DOT> part = math() { parts.add(part); } )* )
    {
        if (parts.size() == 1) {
            return part;
        }
        return new CatExpression(parts);
    }
}

// A chain of values joined by arithmetic operators. Every operand is pushed onto a
// MathResolver together with the operator preceding it (ADD for the first operand),
// and the resolver builds the resulting expression.
Expression math() :
{
    ArithmeticExpression.Operator operator = ArithmeticExpression.Operator.ADD;
    MathResolver resolver = new MathResolver();
    Expression operand;
}
{
    ( operand = value() { resolver.push(operator, operand); }
      ( ( <ADD> { operator = ArithmeticExpression.Operator.ADD; } |
          <DIV> { operator = ArithmeticExpression.Operator.DIV; } |
          <MOD> { operator = ArithmeticExpression.Operator.MOD; } |
          <MUL> { operator = ArithmeticExpression.Operator.MUL; } |
          <SUB> { operator = ArithmeticExpression.Operator.SUB; } )
        operand = value() { resolver.push(operator, operand); } )* )
    { return resolver.resolve(); }
}

// A single operand: any of the named expressions, a script, a literal value, or a
// parenthesized statement (wrapped in a ParenthesisExpression).
Expression value() :
{
    Expression ret;
}
{
    ( ret = attributeExp()         |
      ret = base64DecodeExp()      |
      ret = base64EncodeExp()      |
      ret = clearStateExp()        |
      ret = echoExp()              |
      ret = exactExp()             |
      ret = flattenExp()           |
      ret = forEachExp()           |
      ret = getFieldExp()          |
      ret = getVarExp()            |
      ret = guardExp()             |
      ret = hexDecodeExp()         |
      ret = hexEncodeExp()         |
      ret = hostNameExp()          |
      ret = ifThenExp()            |
      ret = indexExp()             |
      ret = inputExp()             |
      ret = joinExp()              |
      ret = lowerCaseExp()         |
      ret = ngramExp()             |
      ret = normalizeExp()         |
      ret = nowExp()               |
      ret = optimizePredicateExp() |
      ret = passthroughExp()       |
      ret = randomExp()            |
      ret = script()               |
      ret = selectInputExp()       |
      ret = setLanguageExp()       |
      ret = setValueExp()          |
      ret = setVarExp()            |
      ret = splitExp()             |
      ret = substringExp()         |
      ret = summaryExp()           |
      ret = switchExp()            |
      ret = thisExp()              |
      ret = tokenizeExp()          |
      ret = toArrayExp()           |
      ret = toByteExp()            |
      ret = toDoubleExp()          |
      ret = toFloatExp()           |
      ret = toIntExp()             |
      ret = toLongExp()            |
      ret = toPosExp()             |
      ret = toStringExp()          |
      ret = toWsetExp()            |
      ret = trimExp()              |
      ret = zcurveExp()            |
      ( <LPAREN> ret = statement() <RPAREN> { ret = new ParenthesisExpression(ret); } ) )
    { return ret; }
}

// 'attribute' with an optional field name; the parser's default field name is used
// when none is given.
Expression attributeExp() :
{
    String name = defaultFieldName;
}
{
    ( <ATTRIBUTE> [ name = fieldName() ] )
    { return new AttributeExpression(name); }
}

// 'base64decode'
Expression base64DecodeExp() : { }
{
    ( <BASE64_DECODE> )
    { return new Base64DecodeExpression(); }
}

// 'base64encode'
Expression base64EncodeExp() : { }
{
    ( <BASE64_ENCODE> )
    { return new Base64EncodeExpression(); }
}

// 'clear_state'
Expression clearStateExp() : { }
{
    ( <CLEAR_STATE> )
    { return new ClearStateExpression(); }
}

// 'echo'
Expression echoExp() : { }
{
    ( <ECHO> )
    { return new EchoExpression(); }
}

// 'exact'
Expression exactExp() : { }
{
    ( <EXACT> )
    { return new ExactExpression(); }
}

// 'flatten'
Expression flattenExp() : { }
{
    ( <FLATTEN> )
    { return new FlattenExpression(); }
}

// 'for_each { statement }'; newlines are allowed around the statement.
Expression forEachExp() :
{
    Expression val;
}
{
    ( <FOR_EACH> <LCURLY> nl() val = statement() nl() <RCURLY> )
    { return new ForEachExpression(val); }
}

// 'get_field name'
Expression getFieldExp() :
{
    String val;
}
{
    ( <GET_FIELD> val = identifier() )
    { return new GetFieldExpression(val); }
}

// 'get_var name'
Expression getVarExp() :
{
    String val;
}
{
    ( <GET_VAR> val = identifier() )
    { return new GetVarExpression(val); }
}

// 'guard { script }'
Expression guardExp() :
{
    Expression val;
}
{
    ( <GUARD> val = script() )
    { return new GuardExpression(val); }
}

// 'hexdecode'
Expression hexDecodeExp() : { }
{
    ( <HEX_DECODE> )
    { return new HexDecodeExpression(); }
}

// 'hexencode'
Expression hexEncodeExp() : { }
{
    ( <HEX_ENCODE> )
    { return new HexEncodeExpression(); }
}

// 'hostname'
Expression hostNameExp() : { }
{
    ( <HOST_NAME> )
    { return new HostNameExpression(); }
}

// 'if (lhs cmp rhs) { script } [ else { script } ]'. The else branch is passed on as
// null when it is absent.
Expression ifThenExp() :
{
    Expression lhs, rhs, ifTrue, ifFalse = null;
    IfThenExpression.Comparator cmp;
}
{
    ( <IF> <LPAREN> lhs = expression() cmp = ifThenCmp() rhs = expression() <RPAREN>
      ifTrue = script() [ <ELSE> ifFalse = script() ] )
    { return new IfThenExpression(lhs, cmp, rhs, ifTrue, ifFalse); }
}

// The comparison operator of an 'if' condition.
IfThenExpression.Comparator ifThenCmp() :
{
    IfThenExpression.Comparator val = null;
}
{
    ( <EQ> { val = IfThenExpression.Comparator.EQ; } |
      <NE> { val = IfThenExpression.Comparator.NE; } |
      <LE> { val = IfThenExpression.Comparator.LE; } |
      <LT> { val = IfThenExpression.Comparator.LT; } |
      <GE> { val = IfThenExpression.Comparator.GE; } |
      <GT> { val = IfThenExpression.Comparator.GT; } )
    { return val; }
}

// 'index' with an optional field name; defaults to the parser's default field name.
Expression indexExp() :
{
    String val = defaultFieldName;
}
{
    ( <INDEX> [ val = fieldName() ] )
    { return new IndexExpression(val); }
}

// 'input' with an optional name; defaults to the parser's default field name.
Expression inputExp() :
{
    String val = defaultFieldName;
}
{
    ( <INPUT> [ val = identifier() ] )
    { return new InputExpression(val); }
}

// 'join "string"'
Expression joinExp() :
{
    String val;
}
{
    ( <JOIN> val = string() )
    { return new JoinExpression(val); }
}

// 'lowercase'
Expression lowerCaseExp() : { }
{
    ( <LOWER_CASE> )
    { return new LowerCaseExpression(); }
}

// 'ngram size'
Expression ngramExp() :
{
    int gramSize;
}
{
    ( <NGRAM> gramSize = integer() )
    { return new NGramExpression(linguistics, gramSize); }
}

// 'normalize'
Expression normalizeExp() : { }
{
    ( <NORMALIZE> )
    { return new NormalizeExpression(linguistics); }
}

// 'now'
Expression nowExp() : { }
{
    ( <NOW> )
    { return new NowExpression(); }
}

// 'optimize_predicate'
Expression optimizePredicateExp() : { }
{
    ( <OPTIMIZE_PREDICATE> )
    { return new OptimizePredicateExpression(); }
}

// 'passthrough' with an optional field name; defaults to the parser's default field name.
Expression passthroughExp() :
{
    String val = defaultFieldName;
}
{
    ( <PASSTHROUGH> [ val = fieldName() ] )
    { return new PassthroughExpression(val); }
}

// 'random' with an optional integer argument, passed on as null when absent. Two
// tokens of lookahead are used to decide whether the integer argument is present.
Expression randomExp() :
{
    Integer val = null;
}
{
    ( <RANDOM> [ LOOKAHEAD(2) val = integer() ] )
    { return new RandomExpression(val); }
}

// 'select_input { name: statement; ... }' with one or more cases, kept in source order.
Expression selectInputExp() :
{
    List<Pair<String, Expression>> cases = new LinkedList<Pair<String, Expression>>();
    Expression exp;
    String str;
}
{
    ( <SELECT_INPUT> <LCURLY> nl() ( str = identifier() <COLON> exp = statement() <SCOLON> nl()
                                     { cases.add(new Pair<String, Expression>(str, exp)); } )+ <RCURLY> )
    { return new SelectInputExpression(cases); }
}

// 'set_language'
Expression setLanguageExp() : { }
{
    ( <SET_LANGUAGE> )
    { return new SetLanguageExpression(); }
}

// A literal numeric or string value.
Expression setValueExp() :
{
    FieldValue val;
}
{
    ( val = fieldValue() )
    { return new SetValueExpression(val); }
}

// 'set_var name'
Expression setVarExp() :
{
    String val;
}
{
    ( <SET_VAR> val = identifier() )
    { return new SetVarExpression(val); }
}

// 'split "string"'
Expression splitExp() :
{
    String val;
}
{
    ( <SPLIT> val = string() )
    { return new SplitExpression(val); }
}

// 'substring from to'
Expression substringExp() :
{
    int from, to;
}
{
    ( <SUBSTRING> from = integer() to = integer() )
    { return new SubstringExpression(from, to); }
}

// 'summary' with an optional field name; defaults to the parser's default field name.
Expression summaryExp() :
{
    String val = defaultFieldName;
}
{
    ( <SUMMARY> [ val = fieldName() ] )
    { return new SummaryExpression(val); }
}

// 'switch { case "string": statement; ... [ default: statement; ] }' with one or more
// cases, kept in source order. The default branch is passed on as null when absent.
Expression switchExp() :
{
    Map<String, Expression> cases = new LinkedHashMap<String, Expression>();
    Expression exp, defaultExp = null;
    String str;
}
{
    ( <SWITCH> <LCURLY> nl()
      ( <CASE> str = string() <COLON> exp = statement() { cases.put(str, exp); } <SCOLON> nl() )+
      [ <CASE_DEFAULT> <COLON> defaultExp = statement() <SCOLON> nl() ]
      <RCURLY> )
    { return new SwitchExpression(cases, defaultExp); }
}

// 'this'
Expression thisExp() : { }
{
    ( <THIS> )
    { return new ThisExpression(); }
}

// 'tokenize' with optional inline configuration; the parser's annotator configuration
// is used when no inline configuration is given.
Expression tokenizeExp() :
{
    AnnotatorConfig cfg = annotatorCfg;
}
{
    ( <TOKENIZE> [ cfg = tokenizeCfg() ] )
    { return new TokenizeExpression(linguistics, cfg); }
}

// Inline 'tokenize' configuration: one or more of 'stem[:"mode"]' and 'normalize',
// applied to a copy of the parser's annotator configuration. A 'stem' without a mode
// uses "SHORTEST" unless an earlier 'stem' in the same list named another mode.
AnnotatorConfig tokenizeCfg() :
{
    AnnotatorConfig val = new AnnotatorConfig(annotatorCfg);
    String str = "SHORTEST";
}
{
    ( <STEM> ( <COLON> str = string() ) ? { val.setStemMode(str); } |
      <NORMALIZE> { val.setRemoveAccents(true); } )+
    { return val; }
}

// 'to_array'
Expression toArrayExp() : { }
{
    ( <TO_ARRAY> )
    { return new ToArrayExpression(); }
}

// 'to_byte'
Expression toByteExp() : { }
{
    ( <TO_BYTE> )
    { return new ToByteExpression(); }
}

// 'to_double'
Expression toDoubleExp() : { }
{
    ( <TO_DOUBLE> )
    { return new ToDoubleExpression(); }
}

// 'to_float'
Expression toFloatExp() : { }
{
    ( <TO_FLOAT> )
    { return new ToFloatExpression(); }
}

// 'to_int'
Expression toIntExp() : { }
{
    ( <TO_INT> )
    { return new ToIntegerExpression(); }
}

// 'to_long'
Expression toLongExp() : { }
{
    ( <TO_LONG> )
    { return new ToLongExpression(); }
}

// 'to_pos'
Expression toPosExp() : { }
{
    ( <TO_POS> )
    { return new ToPositionExpression(); }
}

// 'to_string'
Expression toStringExp() : { }
{
    ( <TO_STRING> )
    { return new ToStringExpression(); }
}

// 'to_wset' followed by any number of 'create_if_non_existent' / 'remove_if_zero' flags.
Expression toWsetExp() :
{
    boolean createIfNonExistent = false;
    boolean removeIfZero = false;
}
{
    ( <TO_WSET> ( <CREATE_IF_NON_EXISTENT> { createIfNonExistent = true; } |
                  <REMOVE_IF_ZERO>         { removeIfZero = true;        } )* )
    { return new ToWsetExpression(createIfNonExistent, removeIfZero); }
}

// 'trim'
Expression trimExp() : { }
{
    ( <TRIM> )
    { return new TrimExpression(); }
}

// 'zcurve'
Expression zcurveExp() : { }
{
    ( <ZCURVE> )
    { return new ZCurveExpression(); }
}

// A name: either a quoted string (unescaped), a plain IDENTIFIER token, or any keyword
// token, so that keywords can be used as names without quoting. Every keyword token
// declared by the grammar must be listed here; <TO_BYTE> is included for that reason.
String identifier() :
{
    String val;
}
{
    ( val = string() |
      ( <ATTRIBUTE> |
        <BASE64_DECODE> |
        <BASE64_ENCODE> |
        <CASE> |
        <CASE_DEFAULT> |
        <CLEAR_STATE> |
        <CREATE_IF_NON_EXISTENT> |
        <ECHO> |
        <EXACT> |
        <ELSE> |
        <FLATTEN> |
        <FOR_EACH> |
        <GET_FIELD> |
        <GET_VAR> |
        <GUARD> |
        <HEX_DECODE> |
        <HEX_ENCODE> |
        <HOST_NAME> |
        <IDENTIFIER> |
        <IF> |
        <INDEX> |
        <INPUT> |
        <JOIN> |
        <LOWER_CASE> |
        <NGRAM> |
        <NORMALIZE> |
        <NOW> |
        <OPTIMIZE_PREDICATE> |
        <PASSTHROUGH> |
        <RANDOM> |
        <REMOVE_IF_ZERO> |
        <SELECT_INPUT> |
        <SET_LANGUAGE> |
        <SET_VAR> |
        <SPLIT> |
        <STEM> |
        <SUBSTRING> |
        <SUMMARY> |
        <SWITCH> |
        <THIS> |
        <TO_ARRAY> |
        <TO_BYTE> |
        <TO_DOUBLE> |
        <TO_FLOAT> |
        <TO_INT> |
        <TO_LONG> |
        <TO_POS> |
        <TO_STRING> |
        <TO_WSET> |
        <TOKENIZE> |
        <TRIM> |
        <ZCURVE> ) { val = token.image; } )
    { return val; }
}


// One or more identifiers joined by '.', returned as a single dotted string. Two tokens
// of lookahead are used before consuming a '.', so that a '.' which is not followed by
// an identifier is left for the enclosing production.
String fieldName() :
{
    StringBuilder builder = new StringBuilder();
    String str;
}
{
    ( str = identifier() { builder.append(str); } (
      LOOKAHEAD(2) <DOT> { builder.append(token.image); }
      str = identifier() { builder.append(str); } )* )
    { return builder.toString(); }
}

// A literal value: numeric or string.
FieldValue fieldValue() :
{
    FieldValue val;
}
{
    ( val = numericValue() | val = stringValue() )
    { return val; }
}

// An optionally signed numeric literal. A leading '-' is prepended to the token image
// before conversion; a leading '+' is accepted and dropped.
FieldValue numericValue() :
{
    FieldValue val;
    String pre = "";
}
{
    ( [ <ADD> | <SUB> { pre = "-"; } ]
      ( <DOUBLE>  { val = parseDouble(pre + token.image);  } |
        <FLOAT>   { val = parseFloat(pre + token.image);   } |
        <INTEGER> { val = parseInteger(pre + token.image); } |
        <LONG>    { val = parseLong(pre + token.image);    } ) )
    { return val; }
}

// A string literal as a field value.
FieldValue stringValue() :
{
    String val;
}
{
    ( val = string() )
    { return new StringFieldValue(val); }
}

// A quoted string literal: the surrounding quotes are removed and the remainder is
// passed through StringUtilities.unescape.
String string() : { }
{
    ( <STRING> )
    { return StringUtilities.unescape(token.image.substring(1, token.image.length() - 1)); }
}

// An optionally signed integer literal as an int. The INTEGER token matches decimal as
// well as hexadecimal ("0x" / "0X") literals, so the radix prefix is handled explicitly;
// Integer.decode is not used because it would read a leading '0' as octal.
// Integer.parseInt throws NumberFormatException if the value does not fit in an int.
int integer() :
{
    String pre = "";
    String image;
    int val;
}
{
    ( [ <ADD> | <SUB> { pre = "-"; } ]
      <INTEGER>
      {
          image = token.image;
          if (image.startsWith("0x") || image.startsWith("0X")) {
              val = Integer.parseInt(pre + image.substring(2), 16);
          } else {
              val = Integer.parseInt(pre + image);
          }
      } )
    { return val; }
}

// Zero or more newlines.
void nl() : { }
{
    ( <NL> )*
}