Support native Vespa standalone models

author: Jon Bratseth <bratseth@verizonmedia.com> 2019-05-31 17:55:21 +0200
committer: Jon Bratseth <bratseth@verizonmedia.com> 2019-05-31 17:55:21 +0200
commit: 986c2da2986a2fc0de4895a8107c85e4d0f37fd3 (patch)
tree: 3d7934b9feb062b9d1d48f7d4f88734ab8fecd9b /model-integration/src/main/javacc/ModelParser.jj
parent: 470e70ea9fe12681bf0427497cf470ac76b9eb95 (diff)
1 files changed, 352 insertions, 0 deletions
diff --git a/model-integration/src/main/javacc/ModelParser.jj b/model-integration/src/main/javacc/ModelParser.jj
new file mode 100644
index 00000000000..7604259e850
--- /dev/null
+++ b/model-integration/src/main/javacc/ModelParser.jj
@@ -0,0 +1,352 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+// --------------------------------------------------------------------------------
+//
+// JavaCC options. When this file is changed, run "mvn generate-sources" to rebuild
+// the parser classes.
+//
+// --------------------------------------------------------------------------------
+options {
+    UNICODE_INPUT = true;
+    CACHE_TOKENS  = false;
+    DEBUG_PARSER = false;
+    ERROR_REPORTING = true;
+    FORCE_LA_CHECK = true;   // check for lookahead ambiguity at all choice points, also those with explicit lookahead
+    USER_CHAR_STREAM = true; // the token manager reads from a CharStream implementation supplied by the caller
+}
+
+// --------------------------------------------------------------------------------
+//
+// Parser body.
+//
+// --------------------------------------------------------------------------------
+PARSER_BEGIN(ModelParser)
+
+package ai.vespa.rankingexpression.importer.vespa.parser;
+
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.List;
+import java.util.ArrayList;
+import ai.vespa.rankingexpression.importer.ImportedModel;
+import com.yahoo.tensor.TensorType;
+import com.yahoo.tensor.Tensor;
+import com.yahoo.searchlib.rankingexpression.RankingExpression;
+
+/**
+ * Parser of Vespa ML model files: Ranking expression functions enclosed in brackets.
+ *
+ * @author bratseth
+ */
+public class ModelParser {
+
+    /** Receives the inputs, functions and constants read from the parsed text */
+    private ImportedModel model;
+
+    /** Creates a parser reading from the given character stream */
+    public ModelParser(SimpleCharStream input, ImportedModel model) {
+        this(input);
+        this.model = model;
+    }
+
+    /** Creates a parser reading from the given string */
+    public ModelParser(String input, ImportedModel model) {
+        this(new SimpleCharStream(input), model);
+    }
+
+}
+
+PARSER_END(ModelParser)
+
+
+// --------------------------------------------------------------------------------
+//
+// Token declarations.
+//
+// --------------------------------------------------------------------------------
+
+// Declare white space characters. These do not include newline because it has
+// special meaning in several of the production rules.
+SKIP :
+{
+  " " | "\t" | "\r" | "\f"  // space, tab, carriage return and form feed; "\n" is the NL token instead
+}
+
+// Declare all tokens to be recognized. When a word token is added it MUST be added to the identifier()
+// production rule. Names starting with "#" are private helper expressions which can only be used inside other tokens.
+TOKEN :
+{
+  < NL: "\n" >
+| < FUNCTION: "function" >
+| < TENSOR_TYPE: "tensor(" (~["(",")"])+ ")" >
+| < TENSOR_VALUE_SL: "value" (" ")* ":" (" ")* ("{"<BRACE_SL_LEVEL_1>) ("\n")? >
+| < TENSOR_VALUE_ML: "value" (<SEARCHLIB_SKIP>)? "{" (["\n"," "])* ("{"<BRACE_ML_LEVEL_1>) (["\n"," "])* "}" ("\n")? >
+| < LBRACE: "{" >
+| < RBRACE: "}" >
+| < COLON: ":" >
+| < DOT: "." >
+| < COMMA: "," >
+| < MODEL: "model" >
+| < TYPE: "type" >
+| < EXPRESSION_SL: "expression" (" ")* ":" (("{"<BRACE_SL_LEVEL_1>)|<BRACE_SL_CONTENT>)* ("\n")? >
+| < EXPRESSION_ML: "expression" (<SEARCHLIB_SKIP>)? "{" (("{"<BRACE_ML_LEVEL_1>)|<BRACE_ML_CONTENT>)* "}" >
+| < #BRACE_SL_LEVEL_1: (("{"<BRACE_SL_LEVEL_2>)|<BRACE_SL_CONTENT>)* "}" > // SL (single line): what follows a "{", up to its "}"
+| < #BRACE_SL_LEVEL_2: (("{"<BRACE_SL_LEVEL_3>)|<BRACE_SL_CONTENT>)* "}" >
+| < #BRACE_SL_LEVEL_3: <BRACE_SL_CONTENT> "}" >                            // innermost level: no further nested braces
+| < #BRACE_SL_CONTENT: (~["{","}","\n"])* >                                // anything but braces and newline
+| < #BRACE_ML_LEVEL_1: (("{"<BRACE_ML_LEVEL_2>)|<BRACE_ML_CONTENT>)* "}" > // ML (multi line): as SL, but may span lines
+| < #BRACE_ML_LEVEL_2: (("{"<BRACE_ML_LEVEL_3>)|<BRACE_ML_CONTENT>)* "}" >
+| < #BRACE_ML_LEVEL_3: <BRACE_ML_CONTENT> "}" >                            // innermost level: no further nested braces
+| < #BRACE_ML_CONTENT: (~["{","}"])* >                                     // anything but braces, newlines included
+| < #SEARCHLIB_SKIP: ([" ","\f","\n","\r","\t"])+ >
+| < CONSTANTS: "constants" >
+| < FILE: "file" >
+| < URI: "uri" >
+| < IDENTIFIER:           ["a"-"z","A"-"Z", "_"] (["a"-"z","A"-"Z","0"-"9","_"])* >
+| < CONTEXT: ["a"-"z","A"-"Z"] (["a"-"z", "A"-"Z", "0"-"9"])* > // NOTE(review): all matches also match IDENTIFIER, declared first, so this looks unreachable - confirm
+| < DOUBLE: ("-")? (["0"-"9"])+ "." (["0"-"9"])+ >
+| < STRING: (["a"-"z","A"-"Z","_","0"-"9","."])+ >
+| < FILE_PATH: ["a"-"z","A"-"Z", "_"] (["a"-"z","A"-"Z","0"-"9","_","-", "/", "."])+ >
+| < HTTP: ["h","H"] ["t","T"] ["t","T"] ["p","P"] (["s","S"])? >
+| < URI_PATH: <HTTP> <COLON> ("//")? (["a"-"z","A"-"Z","0"-"9","_","-", "/", ".",":"])+ >
+}
+
+// Declare a special token for comments: From "#" up to, but not including, the end of the line.
+SPECIAL_TOKEN :
+{
+  <SINGLE_LINE_COMMENT: "#" (~["\n","\r"])* >
+}
+
+
+// --------------------------------------------------------------------------------
+//
+// Production rules.
+//
+// --------------------------------------------------------------------------------
+
+/** Consumes a complete model: "model <name> { <content> }" followed by end of input. */
+void model() :
+{
+    String declaredName;
+}
+{
+    (<NL>)* <MODEL>
+    (<NL>)* declaredName = identifier()
+    (<NL>)* <LBRACE> modelContent() <RBRACE>
+    (<NL>)* <EOF>
+    {
+        // The name declared in the text must equal the name of the model we are importing into
+        boolean sameName = declaredName.equals(model.name());
+        if ( ! sameName)
+            throw new IllegalArgumentException("Model '" + declaredName + "' must be saved in a file named '" +
+                                               declaredName + ".model'");
+    }
+}
+
+/** Consumes what is between the braces of a model: newlines, inputs and functions in any order. */
+void modelContent() : { }
+{
+    // The three alternatives start with different tokens, so the order they are listed in does not matter
+    ( function() | input() | <NL> )*
+}
+
+/** Consumes "<name>: <tensor type>", declaring an input (aka feature). All non-scalar inputs must be declared. */
+void input() :
+{
+    String inputName;
+    TensorType inputType;
+}
+{
+    inputName = identifier() <COLON> inputType = tensorType("Input parameter '" + inputName + "'")
+    { model.input(inputName, inputType); }
+}
+
+/** Consumes "function <name>(<parameters>) { <expression> }" and adds the parsed expression to the model. */
+void function() :
+{
+    String name, expression, parameter;
+    List<String> parameters = new ArrayList<String>();
+}
+{
+    (  <FUNCTION> name = identifier()
+      "("
+          [ parameter = identifier()         { parameters.add(parameter); }
+          ( <COMMA> parameter = identifier() { parameters.add(parameter); } )* ]
+      ")"
+      lbrace() expression = expression() (<NL>)* <RBRACE> )
+    {
+        // NOTE(review): the parameter names collected above are not passed on to the model - confirm this is intended
+        try {
+            model.expression(name, new RankingExpression(expression));
+        } catch (com.yahoo.searchlib.rankingexpression.parser.ParseException e) {
+            throw new IllegalArgumentException("Could not parse function '" + name + "'", e);
+        }
+    }
+}
+
+/** Consumes a constant block of model. */
+/*
+void rankingConstant() :
+{
+    String name;
+    RankingConstant constant;
+}
+{
+    ( <CONSTANT> name = identifier()
+        {
+//            constant = new RankingConstant(name);
+        }
+      lbrace() (rankingConstantItem(constant) (<NL>)*)+ <RBRACE> )
+    {
+    }
+}
+*/
+
+/** Consumes a constant block. */
+/*
+void rankingConstantItem(RankingConstant constant) :
+{
+    String path = null;
+    TensorType type = null;
+}
+{
+    ( (<FILE> <COLON> path = filePath() { } (<NL>)*) { constant.setFileName(path); }
+      | (<URI> <COLON> path = uriPath() { } (<NL>)*) { constant.setUri(path); }
+      | type = tensorTypeWithPrefix(rankingConstantErrorMessage(constant.getName())) (<NL>)* { constant.setType(type); }
+    )
+    {
+        return null;
+    }
+}
+*/
+
+String rankingConstantErrorMessage(String name) : {} // Error message prefix for a ranking constant; consumes no tokens
+{
+    { return "For ranking constant '" + name + "'"; }
+}
+
+String filePath() : { Token path; } // Consumes a file path, string or identifier token and returns its text
+{
+    ( path = <FILE_PATH> | path = <STRING> | path = <IDENTIFIER> )
+    { return path.image; }
+}
+
+String uriPath() : { Token uri; } // Consumes a URI path token and returns its text
+{
+    uri = <URI_PATH>
+    { return uri.image; }
+}
+
+/** Consumes "constants { ... }" where each entry is a name followed by either a double or a tensor constant. */
+void constants(ImportedModel model) : // NOTE(review): this parameter shadows the model field and is not used here
+{
+    String constantName;
+}
+{
+    <CONSTANTS> <LBRACE> (<NL>)*
+    ( constantName = identifier()
+      ( constantDouble(constantName) | constantTensor(constantName) ) (<NL>)* )*
+    <RBRACE>
+}
+
+/** Consumes ": <double>" and adds the number to the model as a small constant with the given name. */
+void constantDouble(String name) :
+{ Token number; }
+{
+    <COLON> number = <DOUBLE>
+    { model.smallConstant(name, Tensor.from(Double.parseDouble(number.image))); }
+}
+
+/** Consumes "{ ... }" holding any number of value and type entries (the last of each is used) and adds the tensor to the model. */
+void constantTensor(String name) :
+{
+    String valueString = "";
+    TensorType declaredType = null;
+}
+{
+    <LBRACE> (<NL>)*
+      ( ( valueString = tensorValue()
+        | declaredType = tensorTypeWithPrefix(constantTensorErrorMessage(model.name(), name)) )
+        (<NL>)* )*
+    <RBRACE>
+    {
+        Tensor constant = declaredType == null ? Tensor.from(valueString) : Tensor.from(declaredType, valueString);
+        model.smallConstant(name, constant);
+    }
+}
+
+String constantTensorErrorMessage(String model, String constantTensorName) : {} // Error message prefix; consumes no tokens
+{
+    { return "For constant tensor '" + constantTensorName + "' in model '" + model + "'"; }
+}
+
+/** Consumes a tensor value token and returns what follows its first ":" (single line) or is inside its outer braces (multi line). */
+String tensorValue() :
+{
+    String image, value;
+}
+{
+    ( <TENSOR_VALUE_SL> { image = token.image;
+                          value = image.substring(image.indexOf(":") + 1); }
+    | <TENSOR_VALUE_ML> { image = token.image;
+                          value = image.substring(image.indexOf("{") + 1, image.lastIndexOf("}")); } )
+    { return value; }
+}
+
+/** Consumes "type: <tensor type>" and returns the parsed type. errorMessage is passed on to tensorType. */
+TensorType tensorTypeWithPrefix(String errorMessage) :
+{ TensorType parsedType; }
+{
+    <TYPE> <COLON>
+    parsedType = tensorType(errorMessage)
+    { return parsedType; }
+}
+
+/** Consumes a tensor type token and returns the parsed type. Throws IllegalArgumentException starting by errorMessage on an invalid spec. */
+TensorType tensorType(String errorMessage) :
+{
+    String tensorTypeString;
+}
+{
+    <TENSOR_TYPE> { tensorTypeString = token.image; }
+    {
+        try {
+            return TensorType.fromSpec(tensorTypeString);
+        } catch (IllegalArgumentException e) {
+            // Keep the original exception as the cause so its stack trace is not lost
+            throw new IllegalArgumentException(errorMessage + ": Illegal tensor type spec: " + e.getMessage(), e);
+        }
+    }
+}
+
+/** Consumes an expression token and returns what follows its first ":" (single line) or is inside its outer braces (multi line). */
+String expression() :
+{
+    String image, body;
+}
+{
+    ( <EXPRESSION_SL> { image = token.image; body = image.substring(image.indexOf(":") + 1); }
+    | <EXPRESSION_ML> { image = token.image;
+                        body = image.substring(image.indexOf("{") + 1, image.lastIndexOf("}")); } )
+    { return body; }
+}
+
+/** Consumes one of the tokens accepted as an identifier and returns its text. Keep in sync with the word tokens. */
+String identifier() : { Token word; }
+{
+    (
+        word = <IDENTIFIER>
+      | word = <DOUBLE>       // NOTE(review): DOUBLE is a numeric literal token in this grammar, not a word - confirm
+      | word = <FILE>
+      | word = <URI>
+      | word = <MODEL>
+      | word = <TYPE>
+      | word = <CONSTANTS>
+    )
+    { return word.image; }
+}
+
+/** Consumes an opening brace, along with any number of newline tokens before and after it. */
+void lbrace() : { }
+{
+    (<NL>)* <LBRACE> (<NL>)*
+}
author	Jon Bratseth <bratseth@verizonmedia.com>	2019-05-31 17:55:21 +0200
committer	Jon Bratseth <bratseth@verizonmedia.com>	2019-05-31 17:55:21 +0200
commit	986c2da2986a2fc0de4895a8107c85e4d0f37fd3 (patch)
tree	3d7934b9feb062b9d1d48f7d4f88734ab8fecd9b /model-integration/src/main/javacc/ModelParser.jj
parent	470e70ea9fe12681bf0427497cf470ac76b9eb95 (diff)