// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
// --------------------------------------------------------------------------------
//
// JavaCC options. When this file is changed, run "mvn generate-sources" to rebuild
// the parser classes.
//
// --------------------------------------------------------------------------------
options {
    UNICODE_INPUT    = true;
    CACHE_TOKENS     = false;
    DEBUG_PARSER     = false;
    ERROR_REPORTING  = true;
    FORCE_LA_CHECK   = true;
    USER_CHAR_STREAM = true;
}

// --------------------------------------------------------------------------------
//
// Parser body.
//
// --------------------------------------------------------------------------------
PARSER_BEGIN(ModelParser)

package ai.vespa.rankingexpression.importer.vespa.parser;

import java.io.File;
import java.io.Reader;
import java.io.StringReader;
import java.util.List;
import java.util.ArrayList;
import ai.vespa.rankingexpression.importer.ImportedModel;
import com.yahoo.io.IOUtils;
import com.yahoo.tensor.Tensor;
import com.yahoo.tensor.IndexedTensor;
import com.yahoo.tensor.MixedTensor;
import com.yahoo.tensor.TensorAddress;
import com.yahoo.tensor.TensorType;
import com.yahoo.tensor.serialization.JsonFormat;
import com.yahoo.searchlib.rankingexpression.RankingExpression;

/**
 * Parser of Vespa ML model files: Ranking expression functions enclosed in brackets.
 * Everything the production rules recognize (inputs, functions, constants) is
 * added to the ImportedModel given at construction.
 *
 * @author bratseth
 */
public class ModelParser {

    /** The model we are importing into */
    private ImportedModel model;

    /**
     * Creates a parser of a string. The string is wrapped in a SimpleCharStream
     * and handed to the stream-based constructor.
     *
     * @param input the model text to parse
     * @param model the model the parsed content is added to
     */
    public ModelParser(String input, ImportedModel model) {
        this(new SimpleCharStream(input), model);
    }

    /**
     * Creates a parser reading from a character stream.
     *
     * @param input the character stream to parse
     * @param model the model the parsed content is added to
     */
    public ModelParser(SimpleCharStream input, ImportedModel model) {
        // Delegates to a single-argument constructor that is not declared in this class body
        // (presumably the one JavaCC generates - confirm against the generated sources).
        this(input);
        this.model = model;
    }

}

PARSER_END(ModelParser)

// --------------------------------------------------------------------------------
//
// Token declarations.
//
// --------------------------------------------------------------------------------

// Declare white space characters. These do not include newline because it has
// special meaning in several of the production rules.
SKIP :
{
  " " | "\t" | "\r" | "\f"
}

// Declare all tokens to be recognized. When a word token is added it MUST be
// added to the identifier() production rule.
//
// The declaration order is significant: when several tokens match the same
// longest input, the one declared first wins. Tokens prefixed by '#' are private
// helpers that can only be referenced from other token definitions.
TOKEN :
{
  < NL: "\n" >
| < FUNCTION: "function" >
| < TENSOR_TYPE: "tensor" ("<" (~["<",">"])+ ">")? "(" (~["(",")"])* ")" >
| < TENSOR_VALUE_SL: "value" (" ")* ":" (" ")* ("{"<BRACE_SL_LEVEL_1>) ("\n")? >
| < TENSOR_VALUE_ML: "value" (<SEARCHLIB_SKIP>)? "{" (["\n"," "])* ("{"<BRACE_ML_LEVEL_1>) (["\n"," "])* "}" ("\n")? >
| < LBRACE: "{" >
| < RBRACE: "}" >
| < COLON: ":" >
| < DOT: "." >
| < COMMA: "," >
| < DOUBLE_KEYWORD: "double" >
| < INPUTS: "inputs" >
| < MODEL: "model" >
| < TYPE: "type" >
| < EXPRESSION_SL: "expression" (" ")* ":" (("{"<BRACE_SL_LEVEL_1>)|<BRACE_SL_CONTENT>)* ("\n")? >
| < EXPRESSION_ML: "expression" (<SEARCHLIB_SKIP>)? "{" (("{"<BRACE_ML_LEVEL_1>)|<BRACE_ML_CONTENT>)* "}" >
// Brace matching helpers: each level closes one "}" and may nest one level deeper,
// so at most three levels of nested braces are matched inside a value or expression.
| < #BRACE_SL_LEVEL_1: (("{"<BRACE_SL_LEVEL_2>)|<BRACE_SL_CONTENT>)* "}" >
| < #BRACE_SL_LEVEL_2: (("{"<BRACE_SL_LEVEL_3>)|<BRACE_SL_CONTENT>)* "}" >
| < #BRACE_SL_LEVEL_3: <BRACE_SL_CONTENT> "}" >
| < #BRACE_SL_CONTENT: (~["{","}","\n"])* >
| < #BRACE_ML_LEVEL_1: (("{"<BRACE_ML_LEVEL_2>)|<BRACE_ML_CONTENT>)* "}" >
| < #BRACE_ML_LEVEL_2: (("{"<BRACE_ML_LEVEL_3>)|<BRACE_ML_CONTENT>)* "}" >
| < #BRACE_ML_LEVEL_3: <BRACE_ML_CONTENT> "}" >
| < #BRACE_ML_CONTENT: (~["{","}"])* >
| < #SEARCHLIB_SKIP: ([" ","\f","\n","\r","\t"])+ >
| < CONSTANT: "constant" >
| < CONSTANTS: "constants" >
| < FILE: "file" >
| < URI: "uri" >
| < IDENTIFIER: ["a"-"z","A"-"Z", "_"] (["a"-"z","A"-"Z","0"-"9","_"])* >
| < DOUBLEQUOTEDSTRING: "\"" ( ~["\""] )* "\"" >
| < SINGLEQUOTEDSTRING: "'" ( ~["'"] )* "'" >
| < CONTEXT: ["a"-"z","A"-"Z"] (["a"-"z", "A"-"Z", "0"-"9"])* >
| < DOUBLE: ("-")? (["0"-"9"])+ "." (["0"-"9"])+ >
| < INTEGER: ("-")? (["0"-"9"])+ >
| < LONG: ("-")? (["0"-"9"])+"L" >
| < STRING: (["a"-"z","A"-"Z","_","0"-"9","."])+ >
| < FILE_PATH: ["a"-"z","A"-"Z", "_"] (["a"-"z","A"-"Z","0"-"9","_","-", "/", "."])+ >
| < HTTP: ["h","H"] ["t","T"] ["t","T"] ["p","P"] (["s","S"])? >
| < URI_PATH: ("//")? (["a"-"z","A"-"Z","0"-"9","_","-", "/", ".",":"])+ >
}

// Declare a special skip token for comments.
SPECIAL_TOKEN :
{
  < SINGLE_LINE_COMMENT: "#" (~["\n","\r"])* >
}


// --------------------------------------------------------------------------------
//
// Production rules.
//
// --------------------------------------------------------------------------------

/**
 * Consumes a complete model: the 'model' keyword, the model name and a brace-enclosed body.
 *
 * @throws IllegalArgumentException if the name of the model being imported into
 *         does not end with the name declared in the parsed text
 */
void model() :
{
    String name;
}
{
    (<NL>)*
    <MODEL>
    (<NL>)*
    name = identifier()
    (<NL>)*
    <LBRACE>
    modelContent()
    <RBRACE>
    (<NL>)*
    <EOF>
    {
        if ( ! model.name().endsWith(name))
            throw new IllegalArgumentException("Unexpected model name '" + model.name() +
                                               "': Model '" + name + "' must be saved in a file named '" + name + ".model'");
    }
}

/** Consumes the body of a model: any number of newlines, constants, functions and inputs, in any order. */
void modelContent() : {}
{
    (   <NL> |
        constants() |
        largeConstant() |
        function() |
        inputs() |
        input()
    )*
}

/** Consumes a brace-enclosed group of input declarations. */
void inputs() : {}
{
    <INPUTS> (<NL>)* <LBRACE> (<NL>)*
    ( input() (<NL>)* )*
    <RBRACE>
}

/** Declared input variables (aka features). All non-scalar inputs must be declared. */
void input() :
{
    String name;
    TensorType type;
}
{
    name = identifier() <COLON> type = tensorType("Input parameter '" + name + "'")
    { model.input(name, type); }
}

/**
 * A function: a name, a parenthesized parameter list and a brace-enclosed expression.
 * The parameter names are parsed and collected, but only the function name and the
 * expression are passed on to the model.
 */
void function() :
{
    String name, expression, parameter;
    List< String > parameters = new ArrayList< String >();
}
{
    ( <FUNCTION> name = identifier()
      "("
      [ parameter = identifier() { parameters.add(parameter); }
        ( <COMMA> parameter = identifier() { parameters.add(parameter); } )* ]
      ")"
      lbrace() expression = expression() (<NL>)* <RBRACE> )
    { model.expression(name, expression); }
}

/** Consumes the constants of this model. */
void constants() : {}
{
    <CONSTANTS> <LBRACE> (<NL>)*
    ( constant() (<NL>)* )*
    <RBRACE>
}

/** Consumes no input; returns the error message prefix used when parsing the named constant tensor. */
String constantTensorErrorMessage(String constantTensorName) : {}
{
    { return "For constant tensor '" + constantTensorName + "' in '" + model + "'"; }
}

/**
 * Consumes a single constant inside a constants block.
 * The value is either given inline, and added to the model as a small constant, or as a
 * file reference, in which case the file is read and decoded as JSON here and the result
 * is added to the model as a large constant.
 *
 * @throws IllegalArgumentException if a referenced constant file cannot be read or decoded
 */
void constant() :
{
    String name = null;
    TensorType type = TensorType.empty;
    Tensor value = null;
    String valuePath = null;
}
{
    (
        name = identifier() (<COLON>)?
        (
            LOOKAHEAD(4)
            ( ( type = valueType(name) )? (<COLON>)? (<NL>)* ( value = tensorValue(type) | valuePath = fileItem())
              {
                  if (value != null) {
                      model.smallConstant(name, value);
                  }
                  else {
                      try {
                          value = JsonFormat.decode(type, IOUtils.readFileBytes(model.relativeFile(valuePath, "constant '" + name + "'")));
                          model.largeConstant(name, value);
                      }
                      catch (Exception e) {
                          throw new IllegalArgumentException("Could not read constant '" + name + "'", e);
                      }
                  }
              }
            )
            | // Deprecated forms (TODO: Vespa > 8: Add warning ):
            ( constantValue(name) | constantTensor(name) )
        )
    )
}

// Deprecated form
/** Consumes a constant given as a single token and adds it to the model as a small constant. */
void constantValue(String name) :
{
    Token value;
}
{
    ( value = <DOUBLE> | value = <INTEGER> | value = <IDENTIFIER> )
    { model.smallConstant(name, Tensor.from(value.image)); }
}

// Deprecated form
/**
 * Consumes a brace-enclosed constant tensor given by a 'value' entry and an optional type entry,
 * and adds it to the model as a small constant. Without a type entry the tensor is created from
 * the value string alone.
 */
void constantTensor(String name) :
{
    String tensorString = "";
    TensorType type = null;
}
{
    <LBRACE> (<NL>)*
    (( tensorString = tensorValuePrefixedByValue() |
       type = tensorTypeWithPrefix(constantTensorErrorMessage(name)) ) (<NL>)* )*
    <RBRACE>
    {
        model.smallConstant(name, type != null ? Tensor.from(type, tensorString) : Tensor.from(tensorString));
    }
}

/** Consumes the declared type of a constant value: either a tensor type, or 'double' which maps to the empty tensor type. */
TensorType valueType(String name) :
{
    TensorType type;
}
{
    (
        ( type = tensorType("Type of " + name) )
        |
        ( <DOUBLE_KEYWORD> { type = TensorType.empty; } )
    )
    { return type; }
}

/**
 * Consumes a tensor type token and returns the type it specifies.
 *
 * @param errorMessage the prefix of the message of the exception thrown on an illegal type spec
 * @throws IllegalArgumentException if the token image is not a legal tensor type spec
 */
TensorType tensorType(String errorMessage) :
{
    String tensorTypeString;
}
{
    <TENSOR_TYPE> { tensorTypeString = token.image; }
    {
        TensorType tensorType;
        try {
            tensorType = TensorType.fromSpec(tensorTypeString);
        } catch (IllegalArgumentException e) {
            // Keep the original exception as the cause so its stack trace is not lost
            throw new IllegalArgumentException(errorMessage + ": Illegal tensor type spec: " + e.getMessage(), e);
        }
        return tensorType;
    }
}

/**
 * Parses a tensor written in a tensor literal form,
 * https://docs.vespa.ai/en/reference/tensor.html#tensor-literal-form
 *
 * @param type the type of the tensor to build
 * @throws IllegalArgumentException if the value is a plain number but the type has rank above zero
 */
Tensor tensorValue(TensorType type) :
{
    Tensor.Builder builder = Tensor.Builder.of(type);
    Number doubleValue = null;
}
{
    ( mappedTensorValue(builder) | indexedTensorValues(builder) | doubleValue = number() )
    {
        if (doubleValue != null) {
            if (type.rank() > 0)
                throw new IllegalArgumentException("A tensor of type " + type + " cannot be a number");
            builder.cell(doubleValue.doubleValue());
        }
        return builder.build();
    }
}

/** A mapped or mixed tensor value. */
void mappedTensorValue(Tensor.Builder builder) : {}
{
    "{"
    ( mappedTensorBlock(builder) )*
    ( <COMMA> (<NL>)* mappedTensorBlock(builder) )*
    "}"
}

/** Consumes one address followed by either a single cell value or a bracketed array of values. */
void mappedTensorBlock(Tensor.Builder builder) :
{
    TensorAddress mappedAddress;
}
{
    mappedAddress = tensorAddress(builder.type()) <COLON> (<NL>)*
    ( mappedTensorCellValue(mappedAddress, builder) | indexedTensorBlockValues(mappedAddress, builder) )
}

/** Consumes an array of values and adds it to the builder as the block at the given mapped address. */
void indexedTensorBlockValues(TensorAddress mappedAddress, Tensor.Builder builder) :
{
    List<Double> values = new ArrayList<Double>();
}
{
    arrayTensorValues(values)
    {
        // The builder is cast without a check: this form requires a MixedTensor.BoundBuilder
        MixedTensor.BoundBuilder boundBuilder = (MixedTensor.BoundBuilder)builder;
        double[] arrayValues = new double[values.size()];
        for (int i = 0; i < values.size(); i++ ) {
            arrayValues[i] = values.get(i);
        }
        boundBuilder.block(mappedAddress, arrayValues);
    }
}

/** Consumes an array of values and fills the builder with them. */
void indexedTensorValues(Tensor.Builder builder) :
{
    List<Double> values = new ArrayList<Double>();
}
{
    arrayTensorValues(values)
    {
        // The builder is cast without a check: this form requires an IndexedTensor.BoundBuilder
        IndexedTensor.BoundBuilder boundBuilder = (IndexedTensor.BoundBuilder)builder;
        double[] arrayValues = new double[values.size()];
        for (int i = 0; i < values.size(); i++ ) {
            arrayValues[i] = values.get(i);
        }
        boundBuilder.fill(arrayValues);
    }
}

/** Tensor array values. Using sub-bracketing for multiple dimensions is optional and therefore ignored here.
*/
void arrayTensorValues(List<Double> values) : {}
{
    // Nested brackets recurse into this rule with the same list, so all values end up flattened in 'values'
    "[" ( ( indexedTensorValue(values) | arrayTensorValues(values)) )*
        ( <COMMA> (<NL>)* ( indexedTensorValue(values) | arrayTensorValues(values)) )*
    "]"
}

/** Consumes a number and appends it, as a double, to the given list. */
void indexedTensorValue(List<Double> values) :
{
    Number value;
}
{
    value = number()
    { values.add(value.doubleValue()); }
}

/** Consumes a cell value and adds it to the builder at the given address. */
void mappedTensorCellValue(TensorAddress address, Tensor.Builder builder) :
{
    double value;
}
{
    value = tensorCellValue()
    { builder.cell(address, value); }
}

/**
 * Consumes a tensor address: either a single label, or a brace-enclosed list of
 * dimension:label elements.
 *
 * @param type the tensor type the partial address builder is created from
 */
TensorAddress tensorAddress(TensorType type) :
{
    TensorAddress.Builder builder = new TensorAddress.PartialBuilder(type);
    String label;
}
{
    (
        label = tensorAddressLabel() { builder.add(label); }
        |
        ( "{" ( tensorAddressElement(builder) )* ( <COMMA> tensorAddressElement(builder) )* "}" )
    )
    { return builder.build(); }
}

/** Consumes one dimension:label pair and adds it to the given address builder. */
void tensorAddressElement(TensorAddress.Builder builder) :
{
    String dimension;
    String label;
}
{
    dimension = identifier() <COLON> (<NL>)* label = tensorAddressLabel()
    { builder.add(dimension, label); }
}

/** Consumes an address label: an identifier, or a quoted string which is returned without its quotes. */
String tensorAddressLabel() :
{
    String label;
}
{
    ( label = identifier() | label = quotedString() )
    { return label; }
}

/** Consumes a number and returns it as a double. */
double tensorCellValue() :
{
    Number value;
}
{
    value = number()
    { return value.doubleValue(); }
}

/** Undocumented syntax for supplying a tensor constant value by a string prefixed by "value" */
String tensorValuePrefixedByValue() :
{
    String tensor;
}
{
    // Single-line form: everything after the first colon.
    // Multi-line form: everything between the first "{" and the last "}".
    ( <TENSOR_VALUE_SL> { tensor = token.image.substring(token.image.indexOf(":") + 1); } |
      <TENSOR_VALUE_ML> { tensor = token.image.substring(token.image.indexOf("{") + 1,
                                                         token.image.lastIndexOf("}")); } )
    { return tensor; }
}

/**
 * Consumes a tensor type entry and returns the type it specifies.
 *
 * @param errorMessage the prefix of the message of the exception thrown on an illegal type spec
 * @throws IllegalArgumentException if the tensor type token image is not a legal tensor type spec
 */
TensorType tensorTypeWithPrefix(String errorMessage) :
{
    String tensorTypeString;
}
{
    <TYPE> <COLON> <TENSOR_TYPE> { tensorTypeString = token.image; }
    {
        TensorType tensorType;
        try {
            tensorType = TensorType.fromSpec(tensorTypeString);
        } catch (IllegalArgumentException e) {
            // Keep the original exception as the cause so its stack trace is not lost
            throw new IllegalArgumentException(errorMessage + ": Illegal tensor type spec: " + e.getMessage(), e);
        }
        return tensorType;
    }
}

/** Consumes a large constant.
*/
// TODO: Remove on Vespa 9
void largeConstant() :
{
    String name;
    Tensor value;
}
{
    ( <CONSTANT> name = identifier() lbrace() value = largeConstantBody(name) <RBRACE> )
    { model.largeConstant(name, value); }
}

// TODO: Add support in ImportedModel for passing a large tensor through as a file/Uri pointer instead of reading it here
/**
 * Consumes the entries of a large constant body and returns the tensor read from the file given there.
 * The file is resolved relative to the model, read, and decoded as JSON using the declared type.
 *
 * @param name the name of the constant, used in error messages
 * @throws IllegalArgumentException if the constant file cannot be read or decoded
 */
Tensor largeConstantBody(String name) :
{
    String path = null;
    TensorType type = null;
}
{
    (   <FILE> <COLON> path = filePath()
//      | ( <URI> <COLON> path = uriPath() TODO
        | <TYPE> <COLON> type = tensorType("Constant '" + name + "'")
        | <NL>
    )+
    {
        try {
            return JsonFormat.decode(type, IOUtils.readFileBytes(model.relativeFile(path, "constant '" + name + "'")));
        }
        catch (Exception e) {
            throw new IllegalArgumentException("Could not read constant '" + name + "'", e);
        }
    }
}

/** Consumes a file path token and returns its image. */
String filePath() : { }
{
    ( <FILE_PATH> | <STRING> | <IDENTIFIER> )
    { return token.image; }
}

/** Consumes a uri path token and returns its image. */
String uriPath() : { }
{
    ( <URI_PATH> )
    { return token.image; }
}

/**
 * Consumes an expression token and returns its image, without the leading
 * 'expression:' prefix (single-line form) or the enclosing braces (multi-line form).
 */
String expression() :
{
    String exp;
}
{
    ( <EXPRESSION_SL> { exp = token.image.substring(token.image.indexOf(":") + 1); } |
      <EXPRESSION_ML> { exp = token.image.substring(token.image.indexOf("{") + 1,
                                                    token.image.lastIndexOf("}")); } )
    { return exp; }
}

/** Consumes an identifier. This must be kept in sync with all word tokens that should be parseable as identifiers. */
String identifier() : { }
{
    ( <IDENTIFIER> |
      <CONSTANT> |
      <CONSTANTS> |
      <DOUBLE_KEYWORD> |
      <FILE> |
      <FUNCTION> |
      <INPUTS> |
      <MODEL> |
      <TYPE> |
      <URI> )
    { return token.image; }
}

/** Consumes a floating-point, long or integer token and returns its numeric value. */
Number number() :
{
    Number num;
}
{
    ( num = floatValue() | num = longValue() )
    { return num; }
}

/** Consumes a long or integer token and returns its numeric value. */
long longValue() : { }
{
    ( <INTEGER> { return Long.parseLong(token.image); } |
      // A long token ends with an 'L' suffix, which is dropped before parsing
      <LONG>    { return Long.parseLong(token.image.substring(0, token.image.length()-1)); }
    )
}

/** Consumes a floating-point token and returns its numeric value. */
double floatValue() : { }
{
    <DOUBLE> { return Double.parseDouble(token.image); }
}

/** Consumes an opening brace with leading and trailing newline tokens.
*/
void lbrace() : { }
{
    (<NL>)* <LBRACE> (<NL>)*
}

/**
 * Consumes a file reference, that is 'file', a colon and a path, followed by any number of newlines.
 *
 * @return the path, as produced by com.yahoo.path.Path.fromString(...).getRelative()
 */
String fileItem() :
{
    String path;
}
{
    ( <FILE> <COLON> ( <FILE_PATH> | <STRING> | <IDENTIFIER> )
      { path = com.yahoo.path.Path.fromString(token.image).getRelative(); }
      (<NL>)* )
    { return path; }
}

/**
 * Consumes a quoted string token and returns the token image minus the quotes. This does not perform
 * unescaping of the content, it simply removes the first and last character of the image. The string may
 * be enclosed in either double or single quotes, and can contain anything but the quote character that
 * encloses it.
 *
 * @return the unquoted token image
 */
String quotedString() : { }
{
    ( <DOUBLEQUOTEDSTRING> | <SINGLEQUOTEDSTRING> )
    { return token.image.substring(1, token.image.length() - 1); }
}