From 57c431c7b1c8f3f1086a8600d844db44979f72a0 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Wed, 14 Apr 2021 10:16:06 +0200 Subject: No functional changes --- .../opennlp/OpenNlpTokenizationTestCase.java | 63 +++++++++------------- 1 file changed, 26 insertions(+), 37 deletions(-) (limited to 'linguistics') diff --git a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java index cd27551cd9a..2239a62f840 100644 --- a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java @@ -9,7 +9,6 @@ import org.junit.Test; import java.util.ArrayList; -import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; @@ -33,61 +32,54 @@ public class OpenNlpTokenizationTestCase { @Test public void testTokenizer() { assertTokenize("This is a test, 123", - Arrays.asList("this", "is", "a", "test", "123"), - Arrays.asList("This", " ", "is", " ", "a", " ", "test", ",", " ", "123")); + List.of("this", "is", "a", "test", "123"), + List.of("This", " ", "is", " ", "a", " ", "test", ",", " ", "123")); } @Test public void testUnderScoreTokenization() { - assertTokenize("ugcapi_1", Language.ENGLISH, StemMode.SHORTEST, true, Arrays.asList("ugcapi", "1"), null); + assertTokenize("ugcapi_1", Language.ENGLISH, StemMode.SHORTEST, true, List.of("ugcapi", "1"), null); } @Test public void testPhrasesWithPunctuation() { assertTokenize("PHY_101.html a space/time or space-time course", Language.ENGLISH, StemMode.NONE, false, - Arrays.asList("phy", "101", "html", "a", "space", "time", "or", "space", "time", "course"), + List.of("phy", "101", "html", "a", "space", "time", "or", "space", "time", "course"), null); - assertTokenize("PHY_101.", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("phy", "101"), null); 
- assertTokenize("101.3", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("101", "3"), null); + assertTokenize("PHY_101.", Language.ENGLISH, StemMode.NONE, false, List.of("phy", "101"), null); + assertTokenize("101.3", Language.ENGLISH, StemMode.NONE, false, List.of("101", "3"), null); } @Test public void testDoubleWidthTokenization() { // "sony" assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.NONE, false, - Arrays.asList("sony"), null); + List.of("sony"), null); assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.SHORTEST, false, - Arrays.asList("sony"), null); + List.of("sony"), null); // "SONY" assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.NONE, false, - Arrays.asList("sony"), null); + List.of("sony"), null); assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.SHORTEST, false, - Arrays.asList("sony"), null); + List.of("sony"), null); // "on" assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.NONE, false, - Arrays.asList("on"), null); + List.of("on"), null); assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.SHORTEST, false, - Arrays.asList("on"), null); + List.of("on"), null); // "ON" assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.NONE, false, - Arrays.asList("on"), null); + List.of("on"), null); assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.SHORTEST, false, - Arrays.asList("on"), null); + List.of("on"), null); assertTokenize("наименование", Language.RUSSIAN, StemMode.SHORTEST, false, - Arrays.asList("наименован"), null); + List.of("наименован"), null); } @Test public void testLargeTextTokenization() { - StringBuilder sb = new StringBuilder(); - String s = "teststring "; - for (int i = 0; i < 100000; i++) { - sb.append(s); - } - - String input = sb.toString(); - + String input = "teststring ".repeat(100000); int numTokens = 0; List pos = new ArrayList<>(); for (Token t : tokenizer.tokenize(input, Language.ENGLISH, StemMode.NONE, false)) { 
@@ -103,11 +95,8 @@ public class OpenNlpTokenizationTestCase { @Test public void testLargeTokenGuard() { - StringBuilder str = new StringBuilder(); - for (int i = 0; i < 128 * 256; i++) { - str.append("ab"); - } - Iterator it = tokenizer.tokenize(str.toString(), Language.ENGLISH, StemMode.NONE, false).iterator(); + String input = "ab".repeat(128 * 256); + Iterator it = tokenizer.tokenize(input, Language.ENGLISH, StemMode.NONE, false).iterator(); assertTrue(it.hasNext()); assertNotNull(it.next().getTokenString()); assertFalse(it.hasNext()); @@ -203,15 +192,15 @@ public class OpenNlpTokenizationTestCase { } /** - *

<p>Compare the results of running an input string through the tokenizer with an "index" truth, and an optional - * "orig" truth.</p>

+ * Compare the results of running an input string through the tokenizer with an "index" truth, and an optional + * "orig" truth. * - * @param input The text to process, passed to tokenizer. - * @param language The language tag, passed to tokenizer. - * @param stemMode If stemMode != NONE, test will silently succeed if tokenizer does not do stemming. - * @param accentDrop Passed to the tokenizer. - * @param indexed Compared to the "TokenString" result from the tokenizer. - * @param orig Compared to the "Orig" result from the tokenizer. + * @param input the text to process, passed to tokenizer + * @param language the language tag, passed to tokenizer + * @param stemMode if stemMode != NONE, test will silently succeed if tokenizer does not do stemming + * @param accentDrop passed to the tokenizer + * @param indexed compared to the "TokenString" result from the tokenizer + * @param orig compared to the "Orig" result from the tokenizer */ private void assertTokenize(String input, Language language, StemMode stemMode, boolean accentDrop, List indexed, List orig) { -- cgit v1.2.3