diff options
author | Jon Bratseth <bratseth@gmail.com> | 2022-01-10 19:24:03 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2022-01-10 19:24:03 +0100 |
commit | 0d095ccb083e66c99701bf0e2186cd0913227b58 (patch) | |
tree | 920508b9106035a9a26cb2f1be6badc2fb1c417f /container-search/src/main | |
parent | 75852e3ce2a075c73c0845a8000df4db4c1f7260 (diff) |
Stem by linguistics in rule bases
Also add a @language directive to stem in languages other than English.
Diffstat (limited to 'container-search/src/main')
13 files changed, 278 insertions, 233 deletions
diff --git a/container-search/src/main/java/com/yahoo/prelude/semantics/RuleBase.java b/container-search/src/main/java/com/yahoo/prelude/semantics/RuleBase.java index 2b8515b6db8..8e137d99951 100644 --- a/container-search/src/main/java/com/yahoo/prelude/semantics/RuleBase.java +++ b/container-search/src/main/java/com/yahoo/prelude/semantics/RuleBase.java @@ -1,19 +1,34 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.prelude.semantics; +import com.yahoo.language.Language; +import com.yahoo.language.Linguistics; +import com.yahoo.language.process.StemMode; +import com.yahoo.prelude.semantics.engine.RuleBaseLinguistics; +import com.yahoo.prelude.semantics.rule.CompositeCondition; +import com.yahoo.prelude.semantics.rule.Condition; +import com.yahoo.prelude.semantics.rule.NamedCondition; +import com.yahoo.prelude.semantics.rule.ProductionRule; +import com.yahoo.prelude.semantics.rule.SuperCondition; import com.yahoo.search.Query; import com.yahoo.prelude.querytransform.PhraseMatcher; import com.yahoo.prelude.semantics.engine.RuleEngine; import com.yahoo.prelude.semantics.parser.ParseException; -import com.yahoo.prelude.semantics.rule.*; import com.yahoo.protect.Validator; import java.io.File; -import java.util.*; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.ListIterator; +import java.util.Map; +import java.util.Set; +import java.util.StringTokenizer; /** - * A set of semantic production rules and named conditions used to analyze - * and rewrite queries + * A set of semantic production rules and named conditions used to analyze and rewrite queries * * @author bratseth */ @@ -26,7 +41,7 @@ public class RuleBase { private String source; /** The name of the automata file used, or null if none */ - protected String automataFileName = null; + private String automataFileName = null; 
/** * True if this rule base is default. @@ -61,29 +76,26 @@ public class RuleBase { */ private boolean usesAutomata = false; - /** Should we allow stemmed matches? */ - private boolean stemming = true; - - /** Creates an empty rule base. TODO: Disallow */ - public RuleBase() { - } + private RuleBaseLinguistics linguistics; /** Creates an empty rule base */ - public RuleBase(String name) { - setName(name); + public RuleBase(String name, Linguistics linguistics) { + this.name = name; + this.linguistics = new RuleBaseLinguistics(StemMode.BEST, Language.ENGLISH, linguistics); } /** - * Creates a rule base from a file + * Creates a rule base from file * - * @param ruleFile the rule file to read. The name of the file (minus path) becomes the rule base name + * @param ruleFile the rule file to read. The name of the file (minus path) becomes the rule base name. * @param automataFile the automata file, or null to not use an automata * @throws java.io.IOException if there is a problem reading one of the files * @throws ParseException if the rule file can not be parsed correctly * @throws RuleBaseException if the rule file contains inconsistencies */ - public static RuleBase createFromFile(String ruleFile, String automataFile) throws java.io.IOException, ParseException { - return new RuleImporter().importFile(ruleFile, automataFile); + public static RuleBase createFromFile(String ruleFile, String automataFile, Linguistics linguistics) + throws java.io.IOException, ParseException { + return new RuleImporter(linguistics).importFile(ruleFile, automataFile); } /** @@ -96,18 +108,13 @@ public class RuleBase { * @throws com.yahoo.prelude.semantics.parser.ParseException if the rule file can not be parsed correctly * @throws com.yahoo.prelude.semantics.RuleBaseException if the rule file contains inconsistencies */ - public static RuleBase createFromString(String name, String ruleString, String automataFile) throws java.io.IOException, ParseException { - RuleBase base = new 
RuleImporter().importString(ruleString, automataFile, new RuleBase()); + public static RuleBase createFromString(String name, String ruleString, String automataFile, Linguistics linguistics) + throws java.io.IOException, ParseException { + RuleBase base = new RuleImporter(linguistics).importString(ruleString, automataFile); base.setName(name); return base; } - /** Set to true to enable stemmed matches. True by default */ - public void setStemming(boolean stemming) { this.stemming = stemming; } - - /** Returns whether stemmed matches are allowed. True by default */ - public boolean getStemming() { return stemming; } - /** * <p>Include another rule base into this. This <b>transfers ownership</b> * of the given rule base - it can not be subsequently used for any purpose @@ -171,7 +178,7 @@ public class RuleBase { resolveSuper(condition, superCondition); } - private void resolveSuper(Condition condition,Condition superCondition) { + private void resolveSuper(Condition condition, Condition superCondition) { if (condition instanceof SuperCondition) { ((SuperCondition)condition).setCondition(superCondition); } @@ -336,7 +343,7 @@ public class RuleBase { // TODO: Values are not added right now protected void annotatePhrase(PhraseMatcher.Phrase phrase,Query query,int traceLevel) { - for (StringTokenizer tokens = new StringTokenizer(phrase.getData(),"|",false) ; tokens.hasMoreTokens(); ) { + for (StringTokenizer tokens = new StringTokenizer(phrase.getData(), "|", false); tokens.hasMoreTokens(); ) { String token = tokens.nextToken(); int semicolonIndex = token.indexOf(";"); String annotation = token; diff --git a/container-search/src/main/java/com/yahoo/prelude/semantics/RuleImporter.java b/container-search/src/main/java/com/yahoo/prelude/semantics/RuleImporter.java index 45569050882..acbf9a7ffb6 100644 --- a/container-search/src/main/java/com/yahoo/prelude/semantics/RuleImporter.java +++ b/container-search/src/main/java/com/yahoo/prelude/semantics/RuleImporter.java @@ -10,8 
+10,9 @@ import java.util.Arrays; import java.util.List; import com.yahoo.io.IOUtils; -import com.yahoo.io.reader.NamedReader; -import com.yahoo.prelude.semantics.parser.*; +import com.yahoo.language.Linguistics; +import com.yahoo.prelude.semantics.parser.ParseException; +import com.yahoo.prelude.semantics.parser.SemanticsParser; /** * Imports rule bases from various sources. @@ -24,51 +25,47 @@ import com.yahoo.prelude.semantics.parser.*; // rule bases included into others, while neither the rule base or the parser knows. public class RuleImporter { - /** - * If this is set, imported rule bases are looked up in this config - * otherwise, they are looked up as files - */ - private SemanticRulesConfig config; + /** If this is set, imported rule bases are looked up in this config otherwise, they are looked up as files. */ + private final SemanticRulesConfig config; - /** - * Ignore requests to read automata files. - * Useful to validate rule bases without having automatas present - */ - private boolean ignoreAutomatas; + /** Ignore requests to read automata files. Useful to validate rule bases without having automatas present. */ + private final boolean ignoreAutomatas; - /** - * Ignore requests to include files. - * Useful to validate rule bases one by one in config - */ - private boolean ignoreIncludes = false; + /** Ignore requests to include files. Useful to validate rule bases one by one in config. 
*/ + private final boolean ignoreIncludes; + + private Linguistics linguistics; /** Create a rule importer which will read from file */ - public RuleImporter() { - this(null, false); + public RuleImporter(Linguistics linguistics) { + this(null, false, linguistics); } /** Create a rule importer which will read from a config object */ - public RuleImporter(SemanticRulesConfig config) { - this(config, false); + public RuleImporter(SemanticRulesConfig config, Linguistics linguistics) { + this(config, false, linguistics); } - public RuleImporter(boolean ignoreAutomatas) { - this(null, ignoreAutomatas); + public RuleImporter(boolean ignoreAutomatas, Linguistics linguistics) { + this(null, ignoreAutomatas, linguistics); } - public RuleImporter(boolean ignoreAutomatas, boolean ignoreIncludes) { - this(null, ignoreAutomatas, ignoreIncludes); + public RuleImporter(boolean ignoreAutomatas, boolean ignoreIncludes, Linguistics linguistics) { + this(null, ignoreAutomatas, ignoreIncludes, linguistics); } - public RuleImporter(SemanticRulesConfig config, boolean ignoreAutomatas) { - this.config = config; - this.ignoreAutomatas = ignoreAutomatas; + public RuleImporter(SemanticRulesConfig config, boolean ignoreAutomatas, Linguistics linguistics) { + this(config, ignoreAutomatas, false, linguistics); } - public RuleImporter(SemanticRulesConfig config, boolean ignoreAutomatas, boolean ignoreIncludes) { + public RuleImporter(SemanticRulesConfig config, + boolean ignoreAutomatas, + boolean ignoreIncludes, + Linguistics linguistics) { this.config = config; this.ignoreAutomatas = ignoreAutomatas; this.ignoreIncludes = ignoreIncludes; + this.linguistics = linguistics; } /** @@ -91,33 +88,18 @@ public class RuleImporter { * @throws ParseException if the file does not contain a valid semantic rule set */ public RuleBase importFile(String fileName, String automataFile) throws IOException, ParseException { - return importFile(fileName, automataFile, null); - } - - /** - * Imports semantic 
rules from a file - * - * @param fileName the rule file to use - * @param automataFile the automata file to use, or null to not use any - * @param ruleBase an existing rule base to import these rules into, or null to create a new - * @throws java.io.IOException if the file can not be read for some reason - * @throws ParseException if the file does not contain a valid semantic rule set - */ - public RuleBase importFile(String fileName, String automataFile, RuleBase ruleBase) throws IOException, ParseException { - ruleBase = privateImportFile(fileName, automataFile, ruleBase); + var ruleBase = privateImportFile(fileName, automataFile); ruleBase.initialize(); return ruleBase; } - public RuleBase privateImportFile(String fileName, String automataFile, RuleBase ruleBase) throws IOException, ParseException { + public RuleBase privateImportFile(String fileName, String automataFile) throws IOException, ParseException { BufferedReader reader = null; try { reader = IOUtils.createReader(fileName, "utf-8"); File file = new File(fileName); String absoluteFileName = file.getAbsolutePath(); - if (ruleBase == null) - ruleBase = new RuleBase(); - ruleBase.setName(stripLastName(file.getName())); + var ruleBase = new RuleBase(stripLastName(file.getName()), linguistics); privateImportFromReader(reader, absoluteFileName, automataFile, ruleBase); return ruleBase; } @@ -157,18 +139,17 @@ public class RuleImporter { /** Returns an unitialized rule base */ private RuleBase privateImportFromDirectory(String ruleBaseName, RuleBase ruleBase) throws IOException, ParseException { - RuleBase include = new RuleBase(); String includeDir = new File(ruleBase.getSource()).getParentFile().getAbsolutePath(); if (!ruleBaseName.endsWith(".sr")) ruleBaseName = ruleBaseName + ".sr"; File importFile = new File(includeDir, ruleBaseName); if ( ! 
importFile.exists()) throw new IOException("No file named '" + shortenPath(importFile.getPath()) + "'"); - return privateImportFile(importFile.getPath(), null, include); + return privateImportFile(importFile.getPath(), null); } /** Returns an unitialized rule base */ - private RuleBase privateImportFromConfig(String ruleBaseName) throws IOException, ParseException { + private RuleBase privateImportFromConfig(String ruleBaseName) throws ParseException { SemanticRulesConfig.Rulebase ruleBaseConfig = findRuleBaseConfig(config,ruleBaseName); if (ruleBaseConfig == null) ruleBaseConfig = findRuleBaseConfig(config, stripLastName(ruleBaseName)); @@ -224,8 +205,7 @@ public class RuleImporter { /** Imports an unitialized rule base */ public RuleBase privateImportConfig(SemanticRulesConfig.Rulebase ruleBaseConfig) throws ParseException { if (config == null) throw new IllegalStateException("Must initialize with config if importing from config"); - RuleBase ruleBase = new RuleBase(); - ruleBase.setName(ruleBaseConfig.name()); + RuleBase ruleBase = new RuleBase(ruleBaseConfig.name(), linguistics); return privateImportFromReader(new StringReader(ruleBaseConfig.rules()), "semantic-rules.cfg", ruleBaseConfig.automata(),ruleBase); @@ -253,14 +233,10 @@ public class RuleImporter { /** Returns an unitialized rule base */ public RuleBase privateImportFromReader(Reader reader, String sourceName, String automataFile, RuleBase ruleBase) throws ParseException { try { - if (ruleBase == null) { - ruleBase = new RuleBase(); - if (sourceName == null) - sourceName = "anonymous"; - ruleBase.setName(sourceName); - } + if (ruleBase == null) + ruleBase = new RuleBase(sourceName == null ? 
"anonymous" : sourceName, linguistics); ruleBase.setSource(sourceName.replace('\\', '/')); - new SemanticsParser(reader).semanticRules(ruleBase, this); + new SemanticsParser(reader, linguistics).semanticRules(ruleBase, this); if (automataFile != null && !automataFile.isEmpty()) ruleBase.setAutomataFile(automataFile.replace('\\', '/')); return ruleBase; diff --git a/container-search/src/main/java/com/yahoo/prelude/semantics/SemanticSearcher.java b/container-search/src/main/java/com/yahoo/prelude/semantics/SemanticSearcher.java index f9d968a3a4d..a8167fd2001 100644 --- a/container-search/src/main/java/com/yahoo/prelude/semantics/SemanticSearcher.java +++ b/container-search/src/main/java/com/yahoo/prelude/semantics/SemanticSearcher.java @@ -4,6 +4,7 @@ package com.yahoo.prelude.semantics; import com.google.inject.Inject; import com.yahoo.component.chain.dependencies.After; import com.yahoo.component.chain.dependencies.Before; +import com.yahoo.language.Linguistics; import com.yahoo.prelude.ConfigurationException; import com.yahoo.search.Query; import com.yahoo.search.Result; @@ -13,7 +14,9 @@ import com.yahoo.search.result.ErrorMessage; import com.yahoo.search.searchchain.Execution; import com.yahoo.search.searchchain.PhaseNames; -import java.util.*; +import java.util.Arrays; +import java.util.List; +import java.util.Map; import static com.yahoo.prelude.querytransform.StemmingSearcher.STEMMING; @@ -38,7 +41,7 @@ public class SemanticSearcher extends Searcher { /** Creates a semantic searcher using the given default rule base */ public SemanticSearcher(RuleBase ruleBase) { - this(Collections.singletonList(ruleBase)); + this(List.of(ruleBase)); defaultRuleBase = ruleBase; } @@ -47,8 +50,8 @@ public class SemanticSearcher extends Searcher { } @Inject - public SemanticSearcher(SemanticRulesConfig config) { - this(toList(config)); + public SemanticSearcher(SemanticRulesConfig config, Linguistics linguistics) { + this(toList(config, linguistics)); } public 
SemanticSearcher(List<RuleBase> ruleBases) { @@ -59,9 +62,9 @@ public class SemanticSearcher extends Searcher { } } - private static List<RuleBase> toList(SemanticRulesConfig config) { + private static List<RuleBase> toList(SemanticRulesConfig config, Linguistics linguistics) { try { - RuleImporter ruleImporter = new RuleImporter(config); + RuleImporter ruleImporter = new RuleImporter(config, linguistics); List<RuleBase> ruleBaseList = new java.util.ArrayList<>(); for (SemanticRulesConfig.Rulebase ruleBaseConfig : config.rulebase()) { RuleBase ruleBase = ruleImporter.importConfig(ruleBaseConfig); diff --git a/container-search/src/main/java/com/yahoo/prelude/semantics/benchmark/RuleBaseBenchmark.java b/container-search/src/main/java/com/yahoo/prelude/semantics/benchmark/RuleBaseBenchmark.java index 938d12b271b..75b6e831983 100644 --- a/container-search/src/main/java/com/yahoo/prelude/semantics/benchmark/RuleBaseBenchmark.java +++ b/container-search/src/main/java/com/yahoo/prelude/semantics/benchmark/RuleBaseBenchmark.java @@ -9,6 +9,7 @@ import java.util.ArrayList; import java.util.Date; import java.util.Iterator; +import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.search.Query; import com.yahoo.prelude.semantics.RuleBase; import com.yahoo.prelude.semantics.RuleImporter; @@ -27,7 +28,7 @@ public class RuleBaseBenchmark { fsaFile = null; } } - RuleBase ruleBase = new RuleImporter().importFile(ruleBaseFile,fsaFile); + RuleBase ruleBase = new RuleImporter(new SimpleLinguistics()).importFile(ruleBaseFile, fsaFile); ArrayList<String> queries = new ArrayList<>(); BufferedReader reader = new BufferedReader(new FileReader(queryFile)); String line; @@ -35,7 +36,7 @@ public class RuleBaseBenchmark { queries.add(line); } Date start = new Date(); - for (int i=0;i<iterations;i++){ + for (int i=0; i<iterations; i++){ for (Iterator<String> iter = queries.iterator(); iter.hasNext(); ){ String queryString = iter.next(); Query query = new 
Query("?query="+queryString); @@ -43,7 +44,7 @@ public class RuleBaseBenchmark { } } Date end = new Date(); - long elapsed = end.getTime()-start.getTime(); + long elapsed = end.getTime() - start.getTime(); System.out.print("BENCHMARK: rulebase=" + ruleBaseFile + "\n fsa=" + fsaFile + "\n queries=" + queryFile + diff --git a/container-search/src/main/java/com/yahoo/prelude/semantics/engine/RuleBaseLinguistics.java b/container-search/src/main/java/com/yahoo/prelude/semantics/engine/RuleBaseLinguistics.java new file mode 100644 index 00000000000..c5519632d6d --- /dev/null +++ b/container-search/src/main/java/com/yahoo/prelude/semantics/engine/RuleBaseLinguistics.java @@ -0,0 +1,54 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.prelude.semantics.engine; + +import com.yahoo.language.Language; +import com.yahoo.language.Linguistics; +import com.yahoo.language.process.StemList; +import com.yahoo.language.process.StemMode; + +import java.util.List; +import java.util.Objects; + +/** + * Linguistics for a rule base + * + * @author bratseth + */ +public class RuleBaseLinguistics { + + private final StemMode stemMode; + private final Language language; + private final Linguistics linguistics; + + /** Creates a rule base with default settings */ + public RuleBaseLinguistics(Linguistics linguistics) { + this(StemMode.BEST, Language.ENGLISH, linguistics); + } + + + public RuleBaseLinguistics(StemMode stemMode, Language language, Linguistics linguistics) { + this.stemMode = Objects.requireNonNull(stemMode); + this.language = Objects.requireNonNull(language); + this.linguistics = Objects.requireNonNull(linguistics); + } + + public RuleBaseLinguistics withStemMode(StemMode stemMode) { + return new RuleBaseLinguistics(stemMode, language, linguistics); + } + + public RuleBaseLinguistics withLanguage(Language language) { + return new RuleBaseLinguistics(stemMode, language, linguistics); + } + + public 
Linguistics linguistics() { return linguistics; } + + /** Processes this term according to the linguistics of this rule base */ + public String process(String term) { + if (stemMode == StemMode.NONE) return term; + List<StemList> stems = linguistics.getStemmer().stem(term, StemMode.BEST, language); + if (stems.isEmpty()) return term; + if (stems.get(0).isEmpty()) return term; + return stems.get(0).get(0); + } + +} diff --git a/container-search/src/main/java/com/yahoo/prelude/semantics/engine/RuleEngine.java b/container-search/src/main/java/com/yahoo/prelude/semantics/engine/RuleEngine.java index e7ed05730cb..dd6610d1184 100644 --- a/container-search/src/main/java/com/yahoo/prelude/semantics/engine/RuleEngine.java +++ b/container-search/src/main/java/com/yahoo/prelude/semantics/engine/RuleEngine.java @@ -17,7 +17,7 @@ import java.util.ListIterator; */ public class RuleEngine { - private RuleBase rules; + private final RuleBase rules; public RuleEngine(RuleBase rules) { this.rules=rules; @@ -38,7 +38,6 @@ public class RuleEngine { boolean matchedAnything = false; Evaluation evaluation = new Evaluation(query, traceLevel); - evaluation.setStemming(rules.getStemming()); if (traceLevel >= 2) evaluation.trace(2,"Evaluating query '" + evaluation.getQuery().getModel().getQueryTree().getRoot() + "':"); for (ListIterator<ProductionRule> i = rules.ruleIterator(); i.hasNext(); ) { diff --git a/container-search/src/main/java/com/yahoo/prelude/semantics/rule/LiteralCondition.java b/container-search/src/main/java/com/yahoo/prelude/semantics/rule/LiteralCondition.java index 42bf0560726..b85dd892047 100644 --- a/container-search/src/main/java/com/yahoo/prelude/semantics/rule/LiteralCondition.java +++ b/container-search/src/main/java/com/yahoo/prelude/semantics/rule/LiteralCondition.java @@ -4,7 +4,7 @@ package com.yahoo.prelude.semantics.rule; import com.yahoo.prelude.semantics.engine.RuleEvaluation; /** - * A condition which is always true, and which has it's own value as return 
value + * A condition which is always true, and which has its own value as return value * * @author bratseth */ diff --git a/container-search/src/main/java/com/yahoo/prelude/semantics/rule/NamedCondition.java b/container-search/src/main/java/com/yahoo/prelude/semantics/rule/NamedCondition.java index b2592a36353..a267d274d5a 100644 --- a/container-search/src/main/java/com/yahoo/prelude/semantics/rule/NamedCondition.java +++ b/container-search/src/main/java/com/yahoo/prelude/semantics/rule/NamedCondition.java @@ -14,9 +14,9 @@ public class NamedCondition { private Condition condition; - public NamedCondition(String name,Condition condition) { - this.conditionName=name; - this.condition=condition; + public NamedCondition(String name, Condition condition) { + this.conditionName = name; + this.condition = condition; } public String getName() { return conditionName; } @@ -28,18 +28,18 @@ public class NamedCondition { public void setCondition(Condition condition) { this.condition = condition; } public boolean matches(RuleEvaluation e) { - if (e.getTraceLevel()>=3) { + if (e.getTraceLevel() >= 3) { e.trace(3,"Evaluating '" + this + "' at " + e.currentItem()); e.indentTrace(); } boolean matches=condition.matches(e); - if (e.getTraceLevel()>=3) { + if (e.getTraceLevel() >= 3) { e.unindentTrace(); if (matches) e.trace(3,"Matched '" + this + "' at " + e.previousItem()); - else if (e.getTraceLevel()>=4) + else if (e.getTraceLevel() >= 4) e.trace(4,"Did not match '" + this + "' at " + e.currentItem()); } return matches; diff --git a/container-search/src/main/java/com/yahoo/prelude/semantics/rule/NamespaceProduction.java b/container-search/src/main/java/com/yahoo/prelude/semantics/rule/NamespaceProduction.java index 099a8562ece..e6f32a83dd9 100644 --- a/container-search/src/main/java/com/yahoo/prelude/semantics/rule/NamespaceProduction.java +++ b/container-search/src/main/java/com/yahoo/prelude/semantics/rule/NamespaceProduction.java @@ -18,13 +18,13 @@ public class 
NamespaceProduction extends Production { private String key; /** The value to set in the namespace */ - private String value=null; + private String value; /** Creates a produced template term with no label and the default type */ - public NamespaceProduction(String namespace,String key,String value) { + public NamespaceProduction(String namespace, String key, String value) { setNamespace(namespace); - this.key=key; - this.value=value; + this.key = key; + this.value = value; } public String getNamespace() { return namespace; } @@ -44,7 +44,7 @@ public class NamespaceProduction extends Production { public void setValue(String value) { this.value = value; } - public void produce(RuleEvaluation e,int offset) { + public void produce(RuleEvaluation e, int offset) { e.getEvaluation().getQuery().properties().set(key, value); } diff --git a/container-search/src/main/java/com/yahoo/prelude/semantics/rule/ReferenceTermProduction.java b/container-search/src/main/java/com/yahoo/prelude/semantics/rule/ReferenceTermProduction.java index b36744dc397..af7abf325e7 100644 --- a/container-search/src/main/java/com/yahoo/prelude/semantics/rule/ReferenceTermProduction.java +++ b/container-search/src/main/java/com/yahoo/prelude/semantics/rule/ReferenceTermProduction.java @@ -12,7 +12,7 @@ import com.yahoo.prelude.semantics.engine.RuleEvaluation; import com.yahoo.protect.Validator; /** - * A term produced by a production rule which takes it's actual term value + * A term produced by a production rule which takes its actual term value * from one or more terms matched in the condition * * @author bratseth diff --git a/container-search/src/main/java/com/yahoo/prelude/semantics/rule/TermCondition.java b/container-search/src/main/java/com/yahoo/prelude/semantics/rule/TermCondition.java index 38d1fc9b83b..6e2d3de7d08 100644 --- a/container-search/src/main/java/com/yahoo/prelude/semantics/rule/TermCondition.java +++ 
b/container-search/src/main/java/com/yahoo/prelude/semantics/rule/TermCondition.java @@ -3,6 +3,7 @@ package com.yahoo.prelude.semantics.rule; import com.yahoo.prelude.query.TermItem; import com.yahoo.prelude.semantics.engine.NameSpace; +import com.yahoo.prelude.semantics.engine.RuleBaseLinguistics; import com.yahoo.prelude.semantics.engine.RuleEvaluation; /** @@ -12,39 +13,38 @@ import com.yahoo.prelude.semantics.engine.RuleEvaluation; */ public class TermCondition extends Condition { - private String term, termPlusS; + private final RuleBaseLinguistics linguistics; + private String originalTerm; + private String term; - /** Creates an invalid term */ - public TermCondition() { } - - public TermCondition(String term) { - this(null,term); + public TermCondition(String term, RuleBaseLinguistics linguistics) { + this(null, term, linguistics); } - public TermCondition(String label, String term) { + public TermCondition(String label, String term, RuleBaseLinguistics linguistics) { super(label); - this.term = term; - termPlusS = term + "s"; + this.linguistics = linguistics; + this.originalTerm = term; + this.term = linguistics.process(term); } public String getTerm() { return term; } public void setTerm(String term) { this.term = term; - termPlusS = term + "s"; } protected boolean doesMatch(RuleEvaluation e) { // TODO: Move this into the respective namespaces when query becomes one */ if (getNameSpace() != null) { NameSpace nameSpace = e.getEvaluation().getNameSpace(getNameSpace()); - return nameSpace.matches(term, e); + return nameSpace.matches(originalTerm, e); // No processing of terms in namespaces } else { if (e.currentItem() == null) return false; if ( ! 
labelMatches(e)) return false; - String matchedValue = termMatches(e.currentItem().getItem(), e.getEvaluation().getStemming()); + String matchedValue = termMatches(e.currentItem().getItem()); boolean matches = matchedValue!=null && labelMatches(e.currentItem().getItem(), e); if ((matches && !e.isInNegation() || (!matches && e.isInNegation()))) { e.addMatch(e.currentItem(), matchedValue); @@ -56,31 +56,9 @@ public class TermCondition extends Condition { } /** Returns a non-null replacement term if there is a match, null otherwise */ - private String termMatches(TermItem queryTerm, boolean stemming) { - String queryTermString = queryTerm.stringValue(); - - // The terms are the same - boolean matches = queryTermString.equals(term); - if (matches) return term; - - if (stemming) - if (termMatchesWithStemming(queryTermString)) return term; - - return null; - } - - private boolean termMatchesWithStemming(String queryTermString) { - if (queryTermString.length() < 3) return false; // Don't stem very short terms - - // The query term minus s is the same - boolean matches = queryTermString.equals(termPlusS); - if (matches) return true; - - // The query term plus s is the same - matches = term.equals(queryTermString + "s"); - if (matches) return true; - - return false; + private String termMatches(TermItem queryTerm) { + boolean matches = linguistics.process(queryTerm.stringValue()).equals(term); + return matches ? term : null; } public String toInnerString() { diff --git a/container-search/src/main/java/com/yahoo/search/result/FeatureData.java b/container-search/src/main/java/com/yahoo/search/result/FeatureData.java index 72a8b02a960..b1d64329927 100644 --- a/container-search/src/main/java/com/yahoo/search/result/FeatureData.java +++ b/container-search/src/main/java/com/yahoo/search/result/FeatureData.java @@ -23,6 +23,8 @@ import java.util.Set; /** * A wrapper for structured data representing feature values: A map of floats and tensors. 
* This class is immutable but not thread safe. + * + * @author bratseth */ public class FeatureData implements Inspectable, JsonProducer { diff --git a/container-search/src/main/javacc/com/yahoo/prelude/semantics/parser/SemanticsParser.jj b/container-search/src/main/javacc/com/yahoo/prelude/semantics/parser/SemanticsParser.jj index d79f78ef896..46117374e59 100644 --- a/container-search/src/main/javacc/com/yahoo/prelude/semantics/parser/SemanticsParser.jj +++ b/container-search/src/main/javacc/com/yahoo/prelude/semantics/parser/SemanticsParser.jj @@ -6,7 +6,6 @@ options { CACHE_TOKENS = true; DEBUG_PARSER = false; ERROR_REPORTING = true; - STATIC = false; UNICODE_INPUT = true; } @@ -15,12 +14,23 @@ PARSER_BEGIN(SemanticsParser) package com.yahoo.prelude.semantics.parser; import com.yahoo.javacc.UnicodeUtilities; +import com.yahoo.language.process.StemMode; +import com.yahoo.language.Linguistics; +import com.yahoo.language.Language; import com.yahoo.prelude.semantics.*; import com.yahoo.prelude.semantics.rule.*; +import com.yahoo.prelude.semantics.engine.RuleBaseLinguistics; import com.yahoo.prelude.query.TermType; public class SemanticsParser { + private RuleBaseLinguistics linguistics; + + public SemanticsParser(java.io.Reader stream, Linguistics linguistics) { + this(stream); + this.linguistics = new RuleBaseLinguistics(linguistics); + } + } PARSER_END(SemanticsParser) @@ -77,6 +87,7 @@ TOKEN : <SMALLER: "<"> | <SMALLEREQUALS: "<="> | <STEMMINGDIRECTIVE: "@stemming"> | + <LANGUAGEDIRECTIVE: "@language"> | <SUPERDIRECTIVE: "@super"> | <IDENTIFIER: (~[ "\u0000"-"\u002f","\u003a"-"\u003f","\u005b"-"\u005d","\u007b"-"\u00a7","\u00a9","\u00ab"-"\u00ae","\u00b0"-"\u00b3","\u00b6"-"\u00b7","\u00b9","\u00bb"-"\u00bf", @@ -114,16 +125,20 @@ RuleBase semanticRules(RuleBase rules,RuleImporter importer) : // ---------------------------------- Directive --------------------------------------- -RuleBase directive(RuleBase rules,RuleImporter importer) : +RuleBase 
directive(RuleBase rules, RuleImporter importer) : { String name; } { - ( includeDirective(rules,importer) | defaultDirective(rules) | automataDirective(rules,importer) | stemmingDirective(rules) ) + ( includeDirective(rules, importer) | + defaultDirective(rules) | + automataDirective(rules, importer) | + stemmingDirective(rules) | + languageDirective(rules) ) { return rules; } } -void includeDirective(RuleBase rules,RuleImporter importer) : +void includeDirective(RuleBase rules, RuleImporter importer) : { String name; } @@ -131,25 +146,24 @@ void includeDirective(RuleBase rules,RuleImporter importer) : <INCLUDEDIRECTIVE> <LEFTBRACE> name=stringOrLiteral() <RIGHTBRACE> (<SEMICOLON>)? { try { - importer.include(name,rules); + importer.include(name, rules); } catch (java.io.IOException e) { - ParseException ep=new ParseException("Could not read included rule base '" + - name + "'"); + ParseException ep=new ParseException("Could not read included rule base '" + name + "'"); ep.initCause(e); throw ep; } } } -void automataDirective(RuleBase rules,RuleImporter importer) : +void automataDirective(RuleBase rules, RuleImporter importer) : { String name; } { - <AUTOMATADIRECTIVE> <LEFTBRACE> name=stringOrLiteral() <RIGHTBRACE> (<SEMICOLON>)? + <AUTOMATADIRECTIVE> <LEFTBRACE> name = stringOrLiteral() <RIGHTBRACE> (<SEMICOLON>)? { - importer.setAutomata(rules,name); + importer.setAutomata(rules, name); } } @@ -168,9 +182,20 @@ void stemmingDirective(RuleBase rules) : String booleanString; } { - <STEMMINGDIRECTIVE> <LEFTBRACE> booleanString=stringOrLiteral() <RIGHTBRACE> (<SEMICOLON>)? + <STEMMINGDIRECTIVE> <LEFTBRACE> booleanString = stringOrLiteral() <RIGHTBRACE> (<SEMICOLON>)? + { + linguistics = linguistics.withStemMode(Boolean.parseBoolean(booleanString) ? StemMode.BEST : StemMode.NONE); + } +} + +void languageDirective(RuleBase rules) : +{ + String languageString; +} +{ + <LANGUAGEDIRECTIVE> <LEFTBRACE> languageString = stringOrLiteral() <RIGHTBRACE> (<SEMICOLON>)? 
{ - rules.setStemming(Boolean.parseBoolean(booleanString)); + linguistics = linguistics.withLanguage(Language.from(languageString)); } } @@ -183,10 +208,10 @@ void productionRule(RuleBase rules) : ProductionList production=null; } { - condition=topLevelCondition() rule=productionRuleType() ( production=productionList() )? <SEMICOLON> + condition = topLevelCondition() rule = productionRuleType() ( production = productionList() )? <SEMICOLON> { rule.setCondition(condition); - if (production!=null) rule.setProduction(production); + if (production != null) rule.setProduction(production); rules.addRule(rule); } } @@ -201,16 +226,16 @@ ProductionRule productionRuleType() : ProductionList productionList() : { - ProductionList productionList=new ProductionList(); + ProductionList productionList = new ProductionList(); Production production; int weight=100; } { - ( production=production() (<EXCLAMATION> weight=number())? + ( production = production() (<EXCLAMATION> weight = number())? { production.setWeight(weight); productionList.addProduction(production); - weight=100; + weight = 100; } (<NL>)* ) + { return productionList; } @@ -221,7 +246,7 @@ Production production() : Production production; } { - ( LOOKAHEAD(2) production=namespaceProduction() | production=termProduction() ) + ( LOOKAHEAD(2) production = namespaceProduction() | production = termProduction() ) { return production; } } @@ -229,12 +254,12 @@ TermProduction termProduction() : { TermProduction termProduction; TermType termType; - String label=null; + String label = null; } { - termType=termType() - ( LOOKAHEAD(2) label=label() )? - ( termProduction=nonphraseTermProduction() | termProduction=phraseProduction() ) + termType = termType() + ( LOOKAHEAD(2) label = label() )? 
+ ( termProduction = nonphraseTermProduction() | termProduction = phraseProduction() ) { termProduction.setLabel(label); @@ -248,8 +273,8 @@ TermProduction nonphraseTermProduction() : TermProduction termProduction; } { - ( termProduction=referenceTermProduction() | - termProduction=literalTermProduction() ) + ( termProduction = referenceTermProduction() | + termProduction = literalTermProduction() ) { return termProduction; } @@ -257,14 +282,14 @@ TermProduction nonphraseTermProduction() : LiteralPhraseProduction phraseProduction() : { - LiteralPhraseProduction phraseProduction=new LiteralPhraseProduction(); - String term=null; + LiteralPhraseProduction phraseProduction = new LiteralPhraseProduction(); + String term = null; } { <QUOTE> ( - term=identifier() + term = identifier() { phraseProduction.addTerm(term); } )+ <QUOTE> @@ -277,11 +302,11 @@ NamespaceProduction namespaceProduction() : { String namespace; String key; - String value=null; + String value = null; } { - namespace=identifier() <DOT> key=stringOrLiteral() <EQUALS> value=identifierOrLiteral() - { return new NamespaceProduction(namespace,key,value); } + namespace = identifier() <DOT> key = stringOrLiteral() <EQUALS> value = identifierOrLiteral() + { return new NamespaceProduction(namespace, key, value); } } ReferenceTermProduction referenceTermProduction() : @@ -289,7 +314,7 @@ ReferenceTermProduction referenceTermProduction() : String reference; } { - <LEFTSQUAREBRACKET> reference=referenceIdentifier() <RIGHTSQUAREBRACKET> + <LEFTSQUAREBRACKET> reference = referenceIdentifier() <RIGHTSQUAREBRACKET> { return new ReferenceTermProduction(reference); } } @@ -298,7 +323,7 @@ LiteralTermProduction literalTermProduction() : String literal; } { - literal=identifier() + literal = identifier() { return new LiteralTermProduction(literal); } } @@ -319,7 +344,7 @@ String referenceIdentifier() : String reference; } { - ( reference=identifier() { return reference; } ) + ( reference = identifier() { return reference; 
} ) | ( <ELLIPSIS> { return "..."; } ) } @@ -332,25 +357,25 @@ void namedCondition(RuleBase rules) : Condition condition; } { - <LEFTSQUAREBRACKET> conditionName=identifier() <RIGHTSQUAREBRACKET> <CONDITION> condition=topLevelCondition() <SEMICOLON> - { rules.addCondition(new NamedCondition(conditionName,condition)); } + <LEFTSQUAREBRACKET> conditionName = identifier() <RIGHTSQUAREBRACKET> <CONDITION> condition = topLevelCondition() <SEMICOLON> + { rules.addCondition(new NamedCondition(conditionName, condition)); } } Condition topLevelCondition() : { Condition condition; - boolean startAnchor=false; - boolean endAnchor=false; + boolean startAnchor = false; + boolean endAnchor = false; } { - ( <DOT> { startAnchor=true; } )? + ( <DOT> { startAnchor = true; } )? ( - LOOKAHEAD(3) condition=choiceCondition() | - LOOKAHEAD(3) condition=sequenceCondition() + LOOKAHEAD(3) condition = choiceCondition() | + LOOKAHEAD(3) condition = sequenceCondition() ) - ( LOOKAHEAD(2) <DOT> { endAnchor=true; } )? + ( LOOKAHEAD(2) <DOT> { endAnchor = true; } )? 
{ - condition.setAnchor(Condition.Anchor.create(startAnchor,endAnchor)); + condition.setAnchor(Condition.Anchor.create(startAnchor, endAnchor)); return condition; } } @@ -361,8 +386,8 @@ Condition condition() : } { ( - ( LOOKAHEAD(3) condition=choiceCondition() - | condition=terminalCondition() ) + ( LOOKAHEAD(3) condition = choiceCondition() + | condition = terminalCondition() ) { return condition; } @@ -374,8 +399,8 @@ Condition terminalOrSequenceCondition() : Condition condition; } { - ( LOOKAHEAD(3) condition=sequenceCondition() | - condition=terminalCondition() ) + ( LOOKAHEAD(3) condition = sequenceCondition() | + condition = terminalCondition() ) { return condition; } } @@ -384,20 +409,20 @@ Condition terminalCondition() : Condition condition; } { - ( condition=notCondition() | condition=terminalOrComparisonCondition() ) + ( condition = notCondition() | condition = terminalOrComparisonCondition() ) { return condition; } } Condition terminalOrComparisonCondition() : { - Condition condition,rightCondition; + Condition condition, rightCondition; String comparison; } { - condition=reallyTerminalCondition() - ( comparison=comparison() ( LOOKAHEAD(2) rightCondition=nestedCondition() | rightCondition=reallyTerminalCondition() ) -// ( comparison=comparison() rightCondition=condition() - { condition=new ComparisonCondition(condition,comparison,rightCondition); } + condition = reallyTerminalCondition() + ( comparison = comparison() ( LOOKAHEAD(2) rightCondition = nestedCondition() | rightCondition = reallyTerminalCondition() ) +// ( comparison = comparison() rightCondition = condition() + { condition = new ComparisonCondition(condition, comparison, rightCondition); } ) ? 
{ return condition; } @@ -405,10 +430,10 @@ Condition terminalOrComparisonCondition() : Condition reallyTerminalCondition() : { - String label=null; - String context=null; - String nameSpace=null; - Condition condition=null; + String label = null; + String context = null; + String nameSpace = null; + Condition condition = null; } { // This body looks like this to distinguish these two cases @@ -416,20 +441,20 @@ Condition reallyTerminalCondition() : // condition . (end anchor) ( LOOKAHEAD(8) ( - ( LOOKAHEAD(2) context=context() )? - ( nameSpace=nameSpace() ) - ( LOOKAHEAD(2) label=label() )? - condition=terminalConditionBody() + ( LOOKAHEAD(2) context = context() )? + ( nameSpace = nameSpace() ) + ( LOOKAHEAD(2) label = label() )? + condition = terminalConditionBody() ) | ( - ( LOOKAHEAD(2) context=context() )? - ( LOOKAHEAD(2) label=label() )? - condition=terminalConditionBody() + ( LOOKAHEAD(2) context = context() )? + ( LOOKAHEAD(2) label = label() )? + condition = terminalConditionBody() ) ) { - if (context!=null) + if (context != null) condition.setContextName(context); condition.setLabel(label); condition.setNameSpace(nameSpace); @@ -440,18 +465,18 @@ Condition reallyTerminalCondition() : Condition terminalConditionBody() : { - Condition condition=null; + Condition condition = null; } { ( - LOOKAHEAD(2) condition=conditionReference() | - condition=termCondition() | - condition=nestedCondition() | - condition=nonReferableEllipsisCondition() | - condition=referableEllipsisCondition() | - condition=superCondition() | - condition=literalCondition() | - condition=compositeItemCondition()) + LOOKAHEAD(2) condition = conditionReference() | + condition = termCondition() | + condition = nestedCondition() | + condition = nonReferableEllipsisCondition() | + condition = referableEllipsisCondition() | + condition = superCondition() | + condition = literalCondition() | + condition = compositeItemCondition()) { return condition; } } @@ -460,7 +485,7 @@ Condition 
notCondition() : Condition condition; } { - <EXCLAMATION> condition=terminalOrComparisonCondition() + <EXCLAMATION> condition = terminalOrComparisonCondition() { return new NotCondition(condition); } } @@ -470,7 +495,7 @@ ConditionReference conditionReference() : String conditionName; } { - <LEFTSQUAREBRACKET> conditionName=identifier() <RIGHTSQUAREBRACKET> + <LEFTSQUAREBRACKET> conditionName = identifier() <RIGHTSQUAREBRACKET> { return new ConditionReference(conditionName); } } @@ -494,23 +519,23 @@ Condition nestedCondition() : Condition condition; } { - <LEFTBRACE> condition=choiceCondition() <RIGHTBRACE> + <LEFTBRACE> condition = choiceCondition() <RIGHTBRACE> { return condition; } } Condition sequenceCondition() : { - SequenceCondition sequenceCondition=new SequenceCondition(); + SequenceCondition sequenceCondition = new SequenceCondition(); Condition condition; } { - condition=terminalCondition() + condition = terminalCondition() { sequenceCondition.addCondition(condition); } - ( LOOKAHEAD(2) condition=terminalCondition() + ( LOOKAHEAD(2) condition = terminalCondition() { sequenceCondition.addCondition(condition); } )* { - if (sequenceCondition.conditionSize()==1) + if (sequenceCondition.conditionSize() == 1) return sequenceCondition.removeCondition(0); else return sequenceCondition; @@ -519,17 +544,17 @@ Condition sequenceCondition() : Condition choiceCondition() : { - ChoiceCondition choiceCondition=new ChoiceCondition(); + ChoiceCondition choiceCondition = new ChoiceCondition(); Condition condition; } { - condition=terminalOrSequenceCondition() + condition = terminalOrSequenceCondition() { choiceCondition.addCondition(condition); } - ( LOOKAHEAD(3) (<NL>)* <COMMA> (<NL>)* condition=terminalOrSequenceCondition() + ( LOOKAHEAD(3) (<NL>)* <COMMA> (<NL>)* condition = terminalOrSequenceCondition() { choiceCondition.addCondition(condition); } ) * { - if (choiceCondition.conditionSize()==1) + if (choiceCondition.conditionSize() == 1) return 
choiceCondition.removeCondition(0); else return choiceCondition; @@ -542,7 +567,7 @@ TermCondition termCondition() : } { ( str = identifier() ) - { return new TermCondition(str); } + { return new TermCondition(str, linguistics); } } SuperCondition superCondition() : { } @@ -566,7 +591,7 @@ CompositeItemCondition compositeItemCondition() : CompositeItemCondition compositeItemCondition = new CompositeItemCondition(); } { - ( <QUOTE> ( condition=terminalConditionBody() { compositeItemCondition.addCondition(condition); } ) <QUOTE> ) + ( <QUOTE> ( condition = terminalConditionBody() { compositeItemCondition.addCondition(condition); } ) <QUOTE> ) { return compositeItemCondition; } } |