// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
// --------------------------------------------------------------------------------
//
// JavaCC options. When this file is changed, run "mvn generate-sources" to rebuild
// the parser classes.
//
// --------------------------------------------------------------------------------
options {
// Accept the full Unicode range in the input character stream.
UNICODE_INPUT = true;
CACHE_TOKENS = false;
// Generate a non-static (re-entrant) parser so multiple instances can coexist.
STATIC = false;
DEBUG_PARSER = false;
ERROR_REPORTING = true;
// Check every choice point for lookahead ambiguities at generation time.
FORCE_LA_CHECK = true;
// Use a user-supplied character stream class (see the SimpleCharStream
// constructors in the parser body) instead of a generated one.
USER_CHAR_STREAM = true;
}
// --------------------------------------------------------------------------------
//
// Parser body.
//
// --------------------------------------------------------------------------------
PARSER_BEGIN(SDParser)
package com.yahoo.searchdefinition.parser;
import com.yahoo.document.*;
import com.yahoo.documentmodel.*;
import com.yahoo.compress.Compressor;
import com.yahoo.compress.CompressionType;
import com.yahoo.searchdefinition.document.*;
import com.yahoo.searchdefinition.document.annotation.SDAnnotationType;
import com.yahoo.searchdefinition.document.annotation.TemporaryAnnotationReferenceDataType;
import com.yahoo.searchdefinition.RankingConstant;
import com.yahoo.searchdefinition.Index;
import com.yahoo.searchdefinition.RankProfile;
import com.yahoo.searchdefinition.DefaultRankProfile;
import com.yahoo.searchdefinition.RankProfileRegistry;
import com.yahoo.searchdefinition.RankProfile.MatchPhaseSettings;
import com.yahoo.searchdefinition.RankProfile.DiversitySettings;
import com.yahoo.searchdefinition.Search;
import com.yahoo.searchdefinition.UnproperSearch;
import com.yahoo.searchdefinition.UnrankedRankProfile;
import com.yahoo.searchdefinition.fieldoperation.*;
import com.yahoo.searchlib.rankingexpression.FeatureList;
import com.yahoo.searchlib.rankingexpression.evaluation.Value;
import com.yahoo.searchlib.rankingexpression.evaluation.TensorValue;
import com.yahoo.tensor.Tensor;
import com.yahoo.tensor.TensorType;
import com.yahoo.vespa.documentmodel.DocumentSummary;
import com.yahoo.vespa.documentmodel.SummaryField;
import com.yahoo.vespa.documentmodel.SummaryTransform;
import com.yahoo.config.model.test.MockApplicationPackage;
import com.yahoo.config.application.api.ApplicationPackage;
import com.yahoo.config.application.api.DeployLogger;
import com.yahoo.config.model.application.provider.BaseDeployLogger;
import com.yahoo.language.Linguistics;
import com.yahoo.language.simple.SimpleLinguistics;
import com.yahoo.search.query.ranking.Diversity;
import java.util.Map;
import java.util.LinkedHashMap;
import java.util.logging.Level;
import org.apache.commons.lang.StringUtils;
/**
* A search definition parser.
*
* @author Jon S Bratseth et al.
*/
public class SDParser {
// Document type manager used to resolve data types while parsing; set by the search() production.
private DocumentTypeManager docMan = null;
// Application package being deployed; defaults to an empty mock (test usage).
private ApplicationPackage app = MockApplicationPackage.createEmpty();
// Logger for deploy-time warnings; defaults to a plain base logger.
private DeployLogger deployLogger = new BaseDeployLogger();
// Registry receiving the rank profiles created while parsing.
private RankProfileRegistry rankProfileRegistry = new RankProfileRegistry();
/**
* Creates a parser of a string.
*
* @param input the string to parse
* @param deployLogger the logger to report deploy-time issues to
*/
public SDParser(String input, DeployLogger deployLogger) {
this(new SimpleCharStream(input), deployLogger);
}
/**
* Creates a parser of a character stream.
*
* @param stream the stream to parse
* @param deployLogger the logger to report deploy-time issues to
*/
public SDParser(SimpleCharStream stream, DeployLogger deployLogger) {
this(stream);
this.deployLogger = deployLogger;
}
/**
* Creates a parser of a character stream with an explicit application package and rank profile registry.
*
* @param stream the stream to parse
* @param deployLogger the logger to report deploy-time issues to
* @param applicationPackage the application package the parsed definition belongs to
* @param rankProfileRegistry the registry to add parsed rank profiles to
*/
public SDParser(SimpleCharStream stream, DeployLogger deployLogger, ApplicationPackage applicationPackage, RankProfileRegistry rankProfileRegistry) {
this(stream);
this.deployLogger = deployLogger;
this.app = applicationPackage;
this.rankProfileRegistry = rankProfileRegistry;
}
/**
* Consumes an indexing language script which will use the simple linguistics implementation
* for testing, by taking input from the current input stream.
*
* @param multiline Whether or not to allow multi-line expressions.
*/
private IndexingOperation newIndexingOperation(boolean multiline) throws ParseException {
return newIndexingOperation(multiline, new SimpleLinguistics());
}
/**
* Consumes an indexing language script from the current input stream.
*
* @param multiline Whether or not to allow multi-line expressions.
* @param linguistics What to use for tokenizing.
*/
private IndexingOperation newIndexingOperation(boolean multiline, Linguistics linguistics) throws ParseException {
SimpleCharStream input = (SimpleCharStream)token_source.input_stream;
if (token.next != null) {
// Push the already-fetched lookahead token back onto the stream so the
// indexing-language sub-parser sees it as unconsumed input.
input.backup(token.next.image.length());
}
try {
return IndexingOperation.fromStream(input, multiline, linguistics);
} finally {
// The sub-parser advanced the shared stream; clear this parser's cached
// lookahead so the next token is re-read from the stream.
token.next = null;
jj_ntk = -1;
}
}
/**
* Parses the given token image as a ranking expression feature list.
*
* @param image The token image to parse.
* @return The consumed feature list.
* @throws ParseException Thrown if the image could not be parsed.
*/
private FeatureList getFeatureList(String image) throws ParseException {
try {
return new FeatureList(image);
}
catch (com.yahoo.searchlib.rankingexpression.parser.ParseException e) {
// Rethrow as this parser's ParseException, adding stream position and keeping the cause.
throw (ParseException) new ParseException("Could not parse feature list '" + image + "' at line " +
token_source.input_stream.getBeginLine() + ", column " +
token_source.input_stream.getBeginColumn() + ".").initCause(e);
}
}
/**
* Sets the compression threshold in the given compression config.
*
* @param cfg The config to modify.
* @param val The compression threshold to set.
*/
private void setCompressionThreshold(CompressionConfig cfg, int val) {
cfg.threshold = val;
}
/**
* Sets the compression level in the given compression config.
*
* @param cfg The config to modify.
* @param val The compression level to set.
*/
private void setCompressionLevel(CompressionConfig cfg, int val) {
cfg.compressionLevel = val;
}
}
PARSER_END(SDParser)
// --------------------------------------------------------------------------------
//
// Token declarations.
//
// --------------------------------------------------------------------------------
// Declare white space characters. These do not include newline because it has
// special meaning in several of the production rules.
SKIP :
{
// All whitespace except newline; newline (NL) is a real token because several
// production rules are newline-sensitive.
" " | "\t" | "\r" | "\f"
}
// Declare all tokens to be recognized. When a word token is added it MUST be
// added to the identifier() production rule.
TOKEN :
{
// Newline is a real token (not skipped) — see the SKIP declaration above.
< NL: "\n" >
| < ANNOTATION: "annotation" >
| < ANNOTATIONREFERENCE: "annotationreference" >
| < SEARCH: "search" >
| < DIVERSITY: "diversity" >
| < MIN_GROUPS: "min-groups" >
| < CUTOFF_FACTOR: "cutoff-factor" >
| < CUTOFF_STRATEGY: "cutoff-strategy" >
| < LOOSE: "loose" >
| < STRICT: "strict" >
| < DOCUMENT: "document" >
| < STRUCT: "struct" >
| < INHERITS: "inherits" >
| < FIELD: "field" >
| < FIELDS: "fields" >
| < FIELDSET: "fieldset" >
| < STRUCTFIELD: "struct-field" >
| < IMPORT: "import" >
| < AS: "as" >
| < INDEXING: "indexing" >
| < SUMMARYTO: "summary-to" >
| < DOCUMENTSUMMARY: "document-summary" >
| < RANKTYPE: "rank-type" >
| < WEIGHT: "weight" >
| < TYPE: "type" >
| < INDEX: "index" >
| < RISE: "rise" >
| < MTOKEN: "token" >
| < TEXT: "text" >
| < WORD: "word" >
| < GRAM: "gram" >
| < GRAMSIZE: "gram-size" >
| < MAXLENGTH: "max-length" >
| < PREFIX: "prefix" >
| < SUBSTRING: "substring" >
| < SUFFIX: "suffix" >
| < CONSTANT: "constant">
| < RANKPROFILE: "rank-profile" >
| < RANKDEGRADATIONFREQ: "rank-degradation-frequency" >
| < RANKDEGRADATION: "rank-degradation" >
| < RPBINSIZE: "doc-frequency" >
| < RPBINLOW: "min-fullrank-docs">
| < RPPOSBINSIZE: "occurrences-per-doc" >
| < SUMMARY: "summary" >
| < FULL: "full" >
| < STATIC: "static" >
| < DYNAMIC: "dynamic" >
| < SSCONTEXTUAL: "contextual" >
| < SSOVERRIDE: "override" >
| < SSTITLE: "title" >
| < SSURL: "url" >
| < PROPERTIES: "properties" >
| < ATTRIBUTE: "attribute" >
| < SORTING: "sorting" >
| < ASCENDING: "ascending" >
| < DESCENDING: "descending" >
| < UCA: "uca" >
| < RAW: "raw" >
| < LOWERCASE: "lowercase" >
| < FUNCTION: "function" >
| < LOCALE: "locale" >
| < STRENGTH: "strength" >
| < PRIMARY: "primary" >
| < SECONDARY: "secondary" >
| < TERTIARY: "tertiary" >
| < QUATERNARY: "quaternary" >
| < IDENTICAL: "identical" >
| < STEMMING: "stemming" >
| < NORMALIZING: "normalizing" >
| < BOLDING: "bolding" >
| < BODY: "body" >
| < HEADER: "header" >
| < NONE: "none" >
| < ON: "on" >
| < OFF: "off" >
| < TRUE: "true" >
| < FALSE: "false" >
| < SYMMETRIC: "symmetric" >
| < QUERYCOMMAND: "query-command" >
| < ALIAS: "alias" >
| < MATCH: "match" >
| < RANK: "rank" >
| < LITERAL: "literal" >
| < EXACT: "exact" >
| < FILTER: "filter" >
| < NORMAL: "normal" >
| < EXACTTERMINATOR: "exact-terminator" >
| < INDEXINGREWRITE: "indexing-rewrite" >
| < IGNOREDEFAULTRANKFEATURES: "ignore-default-rank-features" >
| < ID: "id" >
| < SOURCE: "source" >
| < TO: "to" >
| < DIRECT: "direct" >
| < ALWAYS: "always" >
| < ONDEMAND: "on-demand" >
| < NEVER: "never" >
| < ENABLEBITVECTORS: "enable-bit-vectors" >
| < ENABLEONLYBITVECTOR: "enable-only-bit-vector" >
| < FASTACCESS: "fast-access" >
| < FASTSEARCH: "fast-search" >
| < HUGE: "huge" >
| < PREFETCH: "prefetch" >
| < NOPREFETCH: "no-prefetch" >
// Tensor type/value tokens.
// NOTE(review): the empty "()?" groups in TENSOR_VALUE_ML and the other *_ML
// tokens below look like stripped angle-bracketed token references (upstream
// uses a <SEARCHLIB_SKIP>-style reference there) — confirm against the
// original sd.javacc before regenerating the parser.
| < TENSOR_TYPE: "tensor(" (~["(",")"])+ ")" >
| < TENSOR_VALUE_SL: "value" (" ")* ":" (" ")* ("{") ("\n")? >
| < TENSOR_VALUE_ML: "value" ()? "{" (["\n"," "])* ("{") (["\n"," "])* "}" ("\n")? >
| < COMPRESSION: "compression" >
| < COMPRESSIONLEVEL: "level" >
| < COMPRESSIONTHRESHOLD: "threshold" >
| < LZ4: "lz4" >
| < USEDOCUMENT: "use-document" >
| < LBRACE: "{" >
| < RBRACE: "}" >
| < COLON: ":" >
| < DOT: "." >
| < COMMA: "," >
| < ARRAY: "array" >
| < WEIGHTEDSET: "weightedset" >
| < MAP: "map" >
| < REFERENCE: "reference" >
| < QUESTIONMARK: "?" >
| < CREATEIFNONEXISTENT: "create-if-nonexistent" >
| < REMOVEIFZERO: "remove-if-zero" >
| < MATCHPHASE: "match-phase" >
| < EVALUATION_POINT: "evaluation-point" >
| < PRE_POST_FILTER_TIPPING_POINT: "pre-post-filter-tipping-point" >
| < ORDER: "order" >
| < MAXFILTERCOVERAGE: "max-filter-coverage" >
| < MAXHITS: "max-hits" >
| < FIRSTPHASE: "first-phase" >
| < SECONDPHASE: "second-phase" >
| < MACRO: "macro" >
| < INLINE: "inline" >
| < ARITY: "arity" >
| < LOWERBOUND: "lower-bound" >
| < UPPERBOUND: "upper-bound" >
| < DENSEPOSTINGLISTTHRESHOLD: "dense-posting-list-threshold" >
// Single-line and multi-line block tokens that capture their raw content for
// later sub-parsing (summary-features, rank-features, expression).
// NOTE(review): the "(("{")|)*" fragments below look like stripped references
// to the #BRACE_*_LEVEL/CONTENT helper tokens — confirm against upstream.
| < SUMMARYFEATURES_SL: "summary-features" (" ")* ":" (~["}","\n"])* ("\n")? >
| < SUMMARYFEATURES_ML: "summary-features" ()? "{" (~["}"])* "}" >
| < RANKFEATURES_SL: "rank-features" (" ")* ":" (~["}","\n"])* ("\n")? >
| < RANKFEATURES_ML: "rank-features" ()? "{" (~["}"])* "}" >
| < EXPRESSION_SL: "expression" (" ")* ":" (("{")|)* ("\n")? >
| < EXPRESSION_ML: "expression" ()? "{" (("{")|)* "}" >
// Private (#) helper tokens for matching nested braces; not returned to the parser.
| < #BRACE_SL_LEVEL_1: (("{")|)* "}" >
| < #BRACE_SL_LEVEL_2: (("{")|)* "}" >
| < #BRACE_SL_LEVEL_3: "}" >
| < #BRACE_SL_CONTENT: (~["{","}","\n"])* >
| < #BRACE_ML_LEVEL_1: (("{")|)* "}" >
| < #BRACE_ML_LEVEL_2: (("{")|)* "}" >
| < #BRACE_ML_LEVEL_3: "}" >
| < #BRACE_ML_CONTENT: (~["{","}"])* >
| < #SEARCHLIB_SKIP: ([" ","\f","\n","\r","\t"])+ >
| < RANKPROPERTIES: "rank-properties" >
| < RERANKCOUNT: "rerank-count" >
| < NUMTHREADSPERSEARCH: "num-threads-per-search" >
| < MINHITSPERTHREAD: "min-hits-per-thread" >
| < NUMSEARCHPARTITIONS: "num-search-partitions" >
| < TERMWISELIMIT: "termwise-limit" >
| < KEEPRANKCOUNT: "keep-rank-count" >
| < RANKSCOREDROPLIMIT: "rank-score-drop-limit" >
| < CONSTANTS: "constants" >
| < FILE: "file" >
// General lexical classes; word tokens above take precedence over IDENTIFIER.
| < IDENTIFIER: ["a"-"z","A"-"Z", "_"] (["a"-"z","A"-"Z","0"-"9","_","-"])* >
| < QUOTEDSTRING: "\"" ( ~["\""] )* "\"" >
| < CONTEXT: ["a"-"z","A"-"Z"] (["a"-"z", "A"-"Z", "0"-"9"])* >
| < DOUBLE: ("-")? (["0"-"9"])+ "." (["0"-"9"])+ >
| < INTEGER: ("-")? (["0"-"9"])+ >
| < LONG: ("-")? (["0"-"9"])+"L" >
| < STRING: (["a"-"z","A"-"Z","_","0"-"9","."])+ >
| < FILE_PATH: ["a"-"z","A"-"Z", "_"] (["a"-"z","A"-"Z","0"-"9","_","-", "/", "."])+ >
| < LESSTHAN: "<" >
| < GREATERTHAN: ">" >
| < VARIABLE: "$" >
}
// Declare a special skip token for comments.
SPECIAL_TOKEN :
{
// NOTE(review): this special-token block is empty, yet the comment above says
// it declares comment tokens. Upstream grammars declare "#..." and "//..."
// comment tokens here; the content looks lost from this copy — confirm
// against the original sd.javacc.
}
// --------------------------------------------------------------------------------
//
// Production rules.
//
// --------------------------------------------------------------------------------
/**
* The rule consumes any search definition and returns the corresponding object. This is the only production that should
* ever consume leading newlines.
*
* @param docMan The document type manager used to resolve data types; stored on the parser.
* @param dir The directory containing the file being parsed.
* @return The search definition object.
*/
// NOTE(review): the empty "()" groups in this and the following productions
// appear where the upstream grammar references angle-bracketed tokens (e.g.
// (<NL>)*); that content looks stripped from this copy — confirm against the
// original sd.javacc before regenerating the parser.
Search search(DocumentTypeManager docMan, String dir) :
{
this.docMan = docMan;
Search search;
}
{
()* (search = rootSearch(dir) | search = rootDocument(dir))
{ return search; }
}
/**
* This rule consumes a proper search block. This and rootDocument() are the only rules that should ever consume
* trailing newline tokens.
*
* @param dir The directory containing the file being parsed.
* @return The search definition object.
*/
Search rootSearch(String dir) :
{
String name;
Search search;
}
{
( name = identifier() { if (!name.matches("[a-zA-Z_][a-zA-Z_:0-9]*")) {
// Names outside this pattern cannot be referenced from YQL+; warn but accept.
deployLogger.log(Level.WARNING, name + " can not be used in YQL+ expressions.");
}
search = new Search(name, app);
// Every search gets a default and an unranked rank profile up front.
rankProfileRegistry.addRankProfile(new DefaultRankProfile(search, rankProfileRegistry));
rankProfileRegistry.addRankProfile(new UnrankedRankProfile(search, rankProfileRegistry));}
lbrace() (rootSearchItem(search) ()*)* ()* )
{ return search; }
}
/**
* Consumes an element of a search block. This and rootSearch() are the only rules that should ever consume
* trailing newline tokens.
*
* @param search The search object to modify.
* @return Null.
*/
Object rootSearchItem(Search search) : { }
{
( document(search)
| documentSummary(search)
| field(null, search)
| index(search, null)
| rankingConstant(search)
| rankProfile(search)
| searchStemming(search)
| useDocument(search)
| structOutside(search)
| annotationOutside(search)
| fieldSet(search)
| importField(search) )
{ return null; }
}
/**
* Consumes a search definition that contains only documents to be used for inheritance, etc.
*
* @param dir The directory containing the file being parsed.
* @return The search definition object.
*/
// NOTE(review): the empty "()" groups likely stood for (<NL>)* in the upstream
// grammar — confirm against the original sd.javacc.
Search rootDocument(String dir) :
{
Search search = new UnproperSearch();
}
{
( (rootDocumentItem(search) ()*)* )
{ return search; }
}
/**
* Consumes a single item from within a root document node.
*
* @param search The search object to modify.
* @return Null.
*/
Object rootDocumentItem(Search search) : { }
{
( namedDocument(search) )
{ return null; }
}
/**
* Consumes a use-document statement. This currently does nothing.
*
* @param search The search object to modify.
*/
// NOTE(review): upstream this production starts with a keyword token (e.g.
// <USEDOCUMENT>); leading angle-bracketed token references appear stripped
// from this copy — confirm against the original sd.javacc.
void useDocument(Search search) : { }
{
identifier()
}
/**
* Consumes a document element. The name defaults to the search's name, but may be set.
*
* @param search The search object to add content to.
*/
void document(Search search) :
{
String name=search.getName();
SDDocumentType document;
}
{
// NOTE(review): the empty "()" groups below likely stood for (<NL>)* in the
// upstream grammar — confirm before regenerating the parser.
( (name = identifier())? ()* { document = new SDDocumentType(name, search); }
[ inheritsDocument(document) ()* ]
()* (documentBody(document, search) ()*)* )
{
search.addDocument(document);
}
}
/**
* Consumes a document element, explicitly named.
*
* @param search The search object to add content to.
*/
void namedDocument(Search search) :
{
String name;
SDDocumentType document;
}
{
( name = identifier() ()* { document = new SDDocumentType(name, search); }
[ inheritsDocument(document) ()* ]
()* (documentBody(document, search) ()*)* )
{
search.addDocument(document);
}
}
/**
* Consumes a document body block.
*
* @param document The document type to modify.
* @param search The search object to add content to.
* @return Null.
*/
Object documentBody(SDDocumentType document, Search search) :
{
}
{
( annotation(search, document)
| compression(document, null)
| headercfg(document)
| bodycfg(document)
| structInside(document, search)
| field(document, search) )
{ return null; }
}
/**
* Consumes a document header block; only a compression setting is accepted inside.
*
* @param document The document type to modify.
*/
// NOTE(review): upstream these two productions start with <HEADER>/<BODY>
// keyword tokens; the leading token references appear stripped from this
// copy — confirm against the original sd.javacc.
void headercfg(SDDocumentType document) : { }
{
lbrace() [compression(document, "header") ()*]
}
/**
* Consumes a document body configuration block; only a compression setting is accepted inside.
*
* @param document The document type to modify.
*/
void bodycfg(SDDocumentType document) : { }
{
lbrace() [compression(document, "body") ()*]
}
/**
* Consumes a compression block. This can be set in both document header and -body block.
*
* @param document The document type to modify.
* @param name The name of the document block to modify ("header", "body", or null for both).
*/
void compression(SDDocumentType document, String name) :
{
CompressionConfig cfg = new CompressionConfig(CompressionType.LZ4);
}
{
lbrace() (cfg = compressionItem(cfg) ()*)*
{
// A null name means the block was declared directly on the document, so the
// config applies to both the header and the body type.
if (name == null || name.equals("header")) {
document.getDocumentType().getHeaderType().setCompressionConfig(cfg);
}
if (name == null || name.equals("body")) {
document.getDocumentType().getBodyType().setCompressionConfig(cfg);
}
}
}
/**
* Consumes the body of a compression block.
*
* @param cfg The compression config to modify.
* @return The (possibly replaced) compression config.
*/
CompressionConfig compressionItem(CompressionConfig cfg) :
{
int val = -1;
}
{
// NOTE(review): each alternative below appears to have lost its leading
// keyword tokens (e.g. <TYPE> <COLON> <LZ4>, <COMPRESSIONTHRESHOLD> <COLON>,
// <COMPRESSIONLEVEL> <COLON>); as written the first alternative matches the
// empty string. Confirm against the original sd.javacc before regenerating.
( ( { cfg = new CompressionConfig(CompressionType.LZ4, cfg.compressionLevel, cfg.threshold); } )
| ( val = integer()) { setCompressionThreshold(cfg, val); }
| ( val = integer()) { setCompressionLevel(cfg, val); }
)
{
return cfg;
}
}
/**
* Consumes a document inheritance statement.
*
* @param document The document type to modify.
*/
// NOTE(review): upstream this production reads "<INHERITS> name ( <COMMA> name )*";
// the keyword and comma token references appear stripped from this copy —
// confirm against the original sd.javacc.
void inheritsDocument(SDDocumentType document) :
{
String name;
}
{
name = identifier() { document.inherit(new DataTypeName(name)); }
( name = identifier() { document.inherit(new DataTypeName(name)); } )*
}
/**
* Consumes a field block from within a document element.
*
* @param document The document type to modify, or null when the field is declared outside a document.
* @param search The search object to add content to.
*/
void field(SDDocumentType document, Search search) :
{
String name;
SDField field;
DataType type;
}
{
name = identifier() type = dataType()
{
// Field names colliding with reserved names are rejected up front.
if (name != null && com.yahoo.searchdefinition.Search.isReservedName(name.toLowerCase())) {
throw new IllegalArgumentException("Reserved name '" + name + "' can not be used as a field name.");
}
field = new TemporarySDField(name, type, true, document);
}
lbrace() (fieldBody(field, search, document) ()*)*
{
// Fields declared outside any document become "extra" fields on the search.
if (document != null) {
document.addField(field);
} else {
search.addExtraField(field);
}
}
}
void fieldSet(Search search) :
{
String name;
}
{