diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-05-22 23:08:48 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-05-22 23:08:48 +0200 |
commit | cc60531ac22a7e9601055174a02a6e67c428f800 (patch) | |
tree | 8a1f336745c8ae2da36ca55501e1192ad111ac32 /linguistics/src/test/java | |
parent | 179a1d90ca76fa61bcbeb3967a58fd3e9b5e9654 (diff) |
Always treat each symbol as a separate token
Diffstat (limited to 'linguistics/src/test/java')
-rw-r--r-- | linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java | 11 | ||||
-rw-r--r-- | linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java | 17 |
2 files changed, 25 insertions, 3 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
index 6cefcfbf67a..a219efce3cd 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
@@ -49,6 +49,17 @@ public class GramSplitterTestCase {
     }
 
     @Test
+    public void testEmojis() {
+        String emoji1 = "\uD83D\uDD2A"; // 🔪
+        String emoji2 = "\uD83D\uDE00"; // 😀
+        assertGramSplit(emoji1, 2, "[" + emoji1+ "]");
+        assertGramSplit(emoji1 + emoji2, 2, "[" + emoji1 + ", " + emoji2 + "]");
+        assertGramSplit(emoji1 + "." + emoji2, 2, "[" + emoji1 + ", " + emoji2 + "]");
+        assertGramSplit("." + emoji1 + "." + emoji2 + ".", 2, "[" + emoji1 + ", " + emoji2 + "]");
+        assertGramSplit("foo" + emoji1 + "bar" + emoji2 + "baz", 2, "[fo, oo, " + emoji1 + ", ba, ar, " + emoji2 + ", ba, az]");
+    }
+
+    @Test
     public void testSpaceCornerCases() {
         // space corner cases
         assertGramSplit("e en e", 1, "[e, e, n, e]");
diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
index 1c2f7377bde..b4f080405bd 100644
--- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
@@ -1,10 +1,18 @@
 // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
 package com.yahoo.language.simple;
 
+import com.yahoo.language.Language;
 import com.yahoo.language.process.AbstractTokenizerTestCase;
 import com.yahoo.language.process.StemMode;
+import com.yahoo.language.process.Token;
 import org.junit.Test;
 
+import java.util.Iterator;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
+
 /**
  * @author Steinar Knutsen
  * @author bratseth
@@ -36,9 +44,12 @@ public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase {
     @Test
     public void testTokenizeEmojis() {
         TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.ALL);
-        String emoji = "\uD83D\uDD2A"; // 🔪
-        tester.assertTokens(emoji, emoji);
-        tester.assertTokens(emoji + "foo", emoji, "foo");
+
+        String emoji1 = "\uD83D\uDD2A"; // 🔪
+        String emoji2 = "\uD83D\uDE00"; // 😀
+        tester.assertTokens(emoji1, emoji1);
+        tester.assertTokens(emoji1 + "foo", emoji1, "foo");
+        tester.assertTokens(emoji1 + emoji2, emoji1, emoji2);
     }
 
 }