diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-05-04 22:55:00 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2021-05-04 22:55:00 +0200 |
commit | 4c1ac8706f9aa9a8dbf5d8a73ee87650cf6620c5 (patch) | |
tree | e3553d597f6998695ec10dfb5c5db17979d0513a /container-search/src/test/java/com/yahoo | |
parent | 8e1475c69c80e937cfa3eb47372c8008786196af (diff) |
Revert "Bratseth/special tokens"
Diffstat (limited to 'container-search/src/test/java/com/yahoo')
5 files changed, 90 insertions, 62 deletions
diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java index cef8ae1751c..6afea895f3a 100644 --- a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java +++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParseTestCase.java @@ -18,14 +18,16 @@ import com.yahoo.prelude.query.PhraseSegmentItem; import com.yahoo.prelude.query.PrefixItem; import com.yahoo.prelude.query.RankItem; import com.yahoo.prelude.query.SubstringItem; +import com.yahoo.prelude.query.SubstringItem; import com.yahoo.prelude.query.SuffixItem; import com.yahoo.prelude.query.TaggableItem; import com.yahoo.prelude.query.WordItem; -import com.yahoo.language.process.SpecialTokens; +import com.yahoo.prelude.query.parser.SpecialTokens; import com.yahoo.prelude.query.parser.TestLinguistics; import com.yahoo.search.Query; import org.junit.Test; +import java.util.Collections; import java.util.Iterator; import static org.junit.Assert.assertEquals; @@ -1637,7 +1639,7 @@ public class ParseTestCase { @Test public void testNonSpecialTokenParsing() { - ParsingTester customTester = new ParsingTester(SpecialTokens.empty()); + ParsingTester customTester = new ParsingTester(new SpecialTokens("default")); customTester.assertParsed("OR c or c with (AND tcp ip)", "c# or c++ with tcp/ip", Query.Type.ANY); } diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParsingTester.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParsingTester.java index fd7e4cbe0e6..17155fff5de 100644 --- a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParsingTester.java +++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/ParsingTester.java @@ -11,8 +11,8 @@ import com.yahoo.prelude.IndexFacts; import com.yahoo.prelude.IndexModel; import com.yahoo.prelude.query.Item; import com.yahoo.prelude.query.NullItem; -import com.yahoo.language.process.SpecialTokenRegistry; -import com.yahoo.language.process.SpecialTokens; +import com.yahoo.prelude.query.parser.SpecialTokenRegistry; +import com.yahoo.prelude.query.parser.SpecialTokens; import com.yahoo.search.Query; import com.yahoo.search.config.IndexInfoConfig; import com.yahoo.search.query.parser.Parsable; @@ -20,9 +20,6 @@ import com.yahoo.search.query.parser.Parser; import com.yahoo.search.query.parser.ParserEnvironment; import com.yahoo.search.query.parser.ParserFactory; -import java.util.ArrayList; -import java.util.List; - import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; @@ -35,7 +32,7 @@ import static org.junit.Assert.assertTrue; public class ParsingTester { private static final Linguistics linguistics = new SimpleLinguistics(); - private final IndexFacts indexFacts; + private IndexFacts indexFacts; private SpecialTokenRegistry tokenRegistry; public ParsingTester() { @@ -52,10 +49,11 @@ public class ParsingTester { public ParsingTester(IndexFacts indexFacts, SpecialTokens specialTokens) { indexFacts.freeze(); + specialTokens.freeze(); this.indexFacts = indexFacts; tokenRegistry = new SpecialTokenRegistry(); - tokenRegistry = new SpecialTokenRegistry(List.of(specialTokens)); + tokenRegistry.addSpecialTokens(specialTokens); } /** @@ -74,13 +72,13 @@ public class ParsingTester { * This can be used to add new tokens and passing the resulting special tokens to the constructor of this. */ public static SpecialTokens createSpecialTokens() { - List<SpecialTokens.Token> tokens = new ArrayList<>(); - tokens.add(new SpecialTokens.Token("c++")); - tokens.add(new SpecialTokens.Token(".net", "dotnet")); - tokens.add(new SpecialTokens.Token("tcp/ip")); - tokens.add(new SpecialTokens.Token("c#")); - tokens.add(new SpecialTokens.Token("special-token-fs","firstsecond")); - return new SpecialTokens("default", tokens); + SpecialTokens tokens = new SpecialTokens("default"); + tokens.addSpecialToken("c++", null); + tokens.addSpecialToken(".net", "dotnet"); + tokens.addSpecialToken("tcp/ip", null); + tokens.addSpecialToken("c#", null); + tokens.addSpecialToken("special-token-fs","firstsecond"); + return tokens; } /** diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/TokenizerTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/TokenizerTestCase.java index e10fbd71c72..aa2e9dbcf75 100644 --- a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/TokenizerTestCase.java +++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/TokenizerTestCase.java @@ -6,13 +6,12 @@ import com.yahoo.prelude.Index; import com.yahoo.prelude.IndexFacts; import com.yahoo.prelude.IndexModel; import com.yahoo.prelude.SearchDefinition; -import com.yahoo.language.process.SpecialTokenRegistry; -import com.yahoo.language.process.SpecialTokens; +import com.yahoo.prelude.query.parser.SpecialTokenRegistry; +import com.yahoo.prelude.query.parser.SpecialTokens; import com.yahoo.prelude.query.parser.Token; import com.yahoo.prelude.query.parser.Tokenizer; import org.junit.Test; -import java.util.ArrayList; import java.util.Collections; import java.util.List; @@ -40,11 +39,13 @@ import static org.junit.Assert.assertTrue; */ public class TokenizerTestCase { + private SpecialTokenRegistry defaultRegistry = new SpecialTokenRegistry("file:src/test/java/com/yahoo/prelude/query/parser/test/replacingtokens.cfg"); + @Test public void testPlainTokenization() { Tokenizer tokenizer = new Tokenizer(new SimpleLinguistics()); - tokenizer.setSpecialTokens(createSpecialTokens().getSpecialTokens("default")); + tokenizer.setSpecialTokens(createSpecialTokens()); List<?> tokens = tokenizer.tokenize("drive (to hwy88, 88) +or language:en ugcapi_1 & &a"); assertEquals(new Token(WORD, "drive"), tokens.get(0)); @@ -86,7 +87,7 @@ public class TokenizerTestCase { public void testOneSpecialToken() { Tokenizer tokenizer = new Tokenizer(new SimpleLinguistics()); - tokenizer.setSpecialTokens(createSpecialTokens().getSpecialTokens("default")); + tokenizer.setSpecialTokens(createSpecialTokens()); List<?> tokens = tokenizer.tokenize("c++ lovers, please apply"); assertEquals(new Token(WORD, "c++"), tokens.get(0)); @@ -96,7 +97,7 @@ public class TokenizerTestCase { public void testSpecialTokenCombination() { Tokenizer tokenizer = new Tokenizer(new SimpleLinguistics()); - tokenizer.setSpecialTokens(createSpecialTokens().getSpecialTokens("default")); + tokenizer.setSpecialTokens(createSpecialTokens()); List<?> tokens = tokenizer.tokenize("c#, c++ or .net know, not tcp/ip"); assertEquals(new Token(WORD, "c#"), tokens.get(0)); @@ -122,9 +123,10 @@ public class TokenizerTestCase { */ @Test public void testSpecialTokenCJK() { + assertEquals("Special tokens configured", 6, defaultRegistry.getSpecialTokens("default").size()); Tokenizer tokenizer = new Tokenizer(new SimpleLinguistics()); tokenizer.setSubstringSpecialTokens(true); - tokenizer.setSpecialTokens(createSpecialTokens().getSpecialTokens("replacing")); + tokenizer.setSpecialTokens(defaultRegistry.getSpecialTokens("default")); List<?> tokens = tokenizer.tokenize("fooc#bar,c++with spacebarknowknowknow,knowknownot know"); assertEquals(new Token(WORD, "foo"), tokens.get(0)); @@ -149,7 +151,7 @@ public class TokenizerTestCase { public void testSpecialTokenCaseInsensitive() { Tokenizer tokenizer = new Tokenizer(new SimpleLinguistics()); - tokenizer.setSpecialTokens(createSpecialTokens().getSpecialTokens("default")); + tokenizer.setSpecialTokens(createSpecialTokens()); List<?> tokens = tokenizer.tokenize("The AS/400 is great"); assertEquals(new Token(WORD, "The"), tokens.get(0)); @@ -165,7 +167,7 @@ public class TokenizerTestCase { public void testSpecialTokenNonMatch() { Tokenizer tokenizer = new Tokenizer(new SimpleLinguistics()); - tokenizer.setSpecialTokens(createSpecialTokens().getSpecialTokens("default")); + tokenizer.setSpecialTokens(createSpecialTokens()); List<?> tokens = tokenizer.tokenize("c++ c+ aS/400 i/o .net i/ooo ap.net"); assertEquals(new Token(WORD, "c++"), tokens.get(0)); @@ -188,9 +190,18 @@ public class TokenizerTestCase { @Test public void testSpecialTokenConfigurationDefault() { + String tokenFile = "file:src/test/java/com/yahoo/prelude/query/parser/test/specialtokens.cfg"; + + SpecialTokenRegistry r = new SpecialTokenRegistry(tokenFile); + assertEquals("Special tokens configured", 6, + r.getSpecialTokens("default").size()); + assertEquals("Special tokens configured", 4, + r.getSpecialTokens("other").size()); + Tokenizer tokenizer = new Tokenizer(new SimpleLinguistics()); - tokenizer.setSpecialTokens(createSpecialTokens().getSpecialTokens("default")); + tokenizer.setSpecialTokens( + r.getSpecialTokens("default")); List<?> tokens = tokenizer.tokenize( "with space, c++ or .... know, not b.s.d."); @@ -213,9 +224,18 @@ public class TokenizerTestCase { @Test public void testSpecialTokenConfigurationOther() { + String tokenFile = "file:src/test/java/com/yahoo/prelude/query/parser/test/specialtokens.cfg"; + + SpecialTokenRegistry r = new SpecialTokenRegistry(tokenFile); + assertEquals("Special tokens configured", 6, + r.getSpecialTokens("default").size()); + assertEquals("Special tokens configured", 4, + r.getSpecialTokens("other").size()); + Tokenizer tokenizer = new Tokenizer(new SimpleLinguistics()); - tokenizer.setSpecialTokens(createSpecialTokens().getSpecialTokens("other")); + tokenizer.setSpecialTokens( + r.getSpecialTokens("other")); List<?> tokens = tokenizer.tokenize( "with space,!!!*** [huh] or ------ " + "know, &&&%%% b.s.d."); @@ -247,9 +267,26 @@ public class TokenizerTestCase { } @Test + public void testSpecialTokenConfigurationMissing() { + String tokenFile = "file:source/bogus/specialtokens.cfg"; + + SpecialTokenRegistry r = new SpecialTokenRegistry(tokenFile); + + Tokenizer tokenizer = new Tokenizer(new SimpleLinguistics()); + + tokenizer.setSpecialTokens(r.getSpecialTokens("other")); + List<?> tokens = tokenizer.tokenize("c++"); + + assertEquals(new Token(WORD, "c"), tokens.get(0)); + assertEquals(new Token(PLUS, "+"), tokens.get(1)); + assertEquals(new Token(PLUS, "+"), tokens.get(2)); + } + + @Test public void testTokenReplacing() { + assertEquals("Special tokens configured", 6, defaultRegistry.getSpecialTokens("default").size()); Tokenizer tokenizer = new Tokenizer(new SimpleLinguistics()); - tokenizer.setSpecialTokens(createSpecialTokens().getSpecialTokens("replacing")); + tokenizer.setSpecialTokens(defaultRegistry.getSpecialTokens("default")); List<?> tokens = tokenizer.tokenize("with space, c++ or .... know, not b.s.d."); assertEquals(new Token(WORD, "with-space"), tokens.get(0)); @@ -708,7 +745,7 @@ public class TokenizerTestCase { public void testSingleQuoteAsWordCharacter() { Tokenizer tokenizer = new Tokenizer(new SimpleLinguistics()); - tokenizer.setSpecialTokens(createSpecialTokens().getSpecialTokens("default")); + tokenizer.setSpecialTokens(createSpecialTokens()); List<?> tokens = tokenizer.tokenize("drive (to hwy88, 88) +or language:en nalle:a'a ugcapi_1 'a' 'a a'"); assertEquals(new Token(WORD, "drive"), tokens.get(0)); @@ -744,38 +781,17 @@ public class TokenizerTestCase { assertEquals(new Token(WORD, "a'"), tokens.get(30)); } - private SpecialTokenRegistry createSpecialTokens() { - List<SpecialTokens.Token> tokens = new ArrayList<>(); - tokens.add(new SpecialTokens.Token("c+")); - tokens.add(new SpecialTokens.Token("c++")); - tokens.add(new SpecialTokens.Token(".net")); - tokens.add(new SpecialTokens.Token("tcp/ip")); - tokens.add(new SpecialTokens.Token("i/o")); - tokens.add(new SpecialTokens.Token("c#")); - tokens.add(new SpecialTokens.Token("AS/400")); - tokens.add(new SpecialTokens.Token("....")); - tokens.add(new SpecialTokens.Token("b.s.d.")); - tokens.add(new SpecialTokens.Token("with space")); - tokens.add(new SpecialTokens.Token("dvd\\xB1r")); - SpecialTokens defaultTokens = new SpecialTokens("default", tokens); - - tokens = new ArrayList<>(); - tokens.add(new SpecialTokens.Token("[huh]")); - tokens.add(new SpecialTokens.Token("&&&%%%")); - tokens.add(new SpecialTokens.Token("------")); - tokens.add(new SpecialTokens.Token("!!!***")); - SpecialTokens otherTokens = new SpecialTokens("other", tokens); - - tokens = new ArrayList<>(); - tokens.add(new SpecialTokens.Token("....")); - tokens.add(new SpecialTokens.Token("c++", "cpp")); - tokens.add(new SpecialTokens.Token("b.s.d.")); - tokens.add(new SpecialTokens.Token("with space", "with-space")); - tokens.add(new SpecialTokens.Token("c#")); - tokens.add(new SpecialTokens.Token("know", "knuwww")); - SpecialTokens replacingTokens = new SpecialTokens("replacing", tokens); - - return new SpecialTokenRegistry(List.of(defaultTokens, otherTokens, replacingTokens)); + private SpecialTokens createSpecialTokens() { + SpecialTokens tokens = new SpecialTokens("default"); + + tokens.addSpecialToken("c+", null); + tokens.addSpecialToken("c++", null); + tokens.addSpecialToken(".net", null); + tokens.addSpecialToken("tcp/ip", null); + tokens.addSpecialToken("i/o", null); + tokens.addSpecialToken("c#", null); + tokens.addSpecialToken("AS/400", null); + return tokens; } } diff --git a/container-search/src/test/java/com/yahoo/prelude/query/parser/test/replacingtokens.cfg b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/replacingtokens.cfg new file mode 100644 index 00000000000..6a189de0164 --- /dev/null +++ b/container-search/src/test/java/com/yahoo/prelude/query/parser/test/replacingtokens.cfg @@ -0,0 +1,12 @@ +tokenlist[1] +tokenlist[0].name default +tokenlist[0].tokens[6] +tokenlist[0].tokens[0].token .... +tokenlist[0].tokens[1].token c++ +tokenlist[0].tokens[1].replace cpp +tokenlist[0].tokens[2].token b.s.d. +tokenlist[0].tokens[3].token with space +tokenlist[0].tokens[3].replace with-space +tokenlist[0].tokens[4].token c# +tokenlist[0].tokens[5].token know +tokenlist[0].tokens[5].replace knuwww diff --git a/container-search/src/test/java/com/yahoo/search/query/rewrite/RewriterFeaturesTestCase.java b/container-search/src/test/java/com/yahoo/search/query/rewrite/RewriterFeaturesTestCase.java index 08146bbe069..5508c2a73a7 100644 --- a/container-search/src/test/java/com/yahoo/search/query/rewrite/RewriterFeaturesTestCase.java +++ b/container-search/src/test/java/com/yahoo/search/query/rewrite/RewriterFeaturesTestCase.java @@ -8,7 +8,7 @@ import org.junit.Test; import com.yahoo.prelude.query.AndItem; import com.yahoo.prelude.query.CompositeItem; import com.yahoo.prelude.query.Item; -import com.yahoo.language.process.SpecialTokenRegistry; +import com.yahoo.prelude.query.parser.SpecialTokenRegistry; import com.yahoo.search.Query; import com.yahoo.search.searchchain.Execution; import com.yahoo.search.searchchain.Execution.Context; |