No functional changes: use List.of and String.repeat in tests, drop the unused Arrays import, tidy Javadoc

author: Jon Bratseth <bratseth@gmail.com> 2021-04-14 10:16:06 +0200
committer: Jon Bratseth <bratseth@gmail.com> 2021-04-14 10:16:06 +0200
commit: 57c431c7b1c8f3f1086a8600d844db44979f72a0 (patch)
tree: b27e731e2fb63d6d1727ae2aa1ea9513eb2322d3 /linguistics
parent: 9ec6d6986ae64496cedc5a23fe2ddb8447eabcd4 (diff)
1 files changed, 26 insertions, 37 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index cd27551cd9a..2239a62f840 100644
--- a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -9,7 +9,6 @@ import org.junit.Test;
 
 
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.Iterator;
 import java.util.List;
 import java.util.NoSuchElementException;
@@ -33,61 +32,54 @@ public class OpenNlpTokenizationTestCase {
     @Test
     public void testTokenizer() {
         assertTokenize("This is a test, 123",
-                       Arrays.asList("this", "is", "a", "test", "123"),
-                       Arrays.asList("This", " ", "is", " ", "a", " ", "test", ",", " ", "123"));
+                       List.of("this", "is", "a", "test", "123"),
+                       List.of("This", " ", "is", " ", "a", " ", "test", ",", " ", "123"));
     }
 
     @Test
     public void testUnderScoreTokenization() {
-        assertTokenize("ugcapi_1", Language.ENGLISH, StemMode.SHORTEST, true, Arrays.asList("ugcapi", "1"), null);
+        assertTokenize("ugcapi_1", Language.ENGLISH, StemMode.SHORTEST, true, List.of("ugcapi", "1"), null);
     }
 
     @Test
     public void testPhrasesWithPunctuation() {
         assertTokenize("PHY_101.html a space/time or space-time course", Language.ENGLISH, StemMode.NONE,
                        false,
-                       Arrays.asList("phy", "101", "html", "a", "space", "time", "or", "space", "time", "course"),
+                       List.of("phy", "101", "html", "a", "space", "time", "or", "space", "time", "course"),
                        null);
-        assertTokenize("PHY_101.", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("phy", "101"), null);
-        assertTokenize("101.3", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("101", "3"), null);
+        assertTokenize("PHY_101.", Language.ENGLISH, StemMode.NONE, false, List.of("phy", "101"), null);
+        assertTokenize("101.3", Language.ENGLISH, StemMode.NONE, false, List.of("101", "3"), null);
     }
 
     @Test
     public void testDoubleWidthTokenization() {
         // "sony"
         assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.NONE, false,
-                       Arrays.asList("sony"), null);
+                       List.of("sony"), null);
         assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.SHORTEST, false,
-                       Arrays.asList("sony"), null);
+                       List.of("sony"), null);
         // "SONY"
         assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.NONE, false,
-                       Arrays.asList("sony"), null);
+                       List.of("sony"), null);
         assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.SHORTEST, false,
-                       Arrays.asList("sony"), null);
+                       List.of("sony"), null);
         // "on"
         assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.NONE, false,
-                       Arrays.asList("on"), null);
+                       List.of("on"), null);
         assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.SHORTEST, false,
-                       Arrays.asList("on"), null);
+                       List.of("on"), null);
         // "ON"
         assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.NONE, false,
-                       Arrays.asList("on"), null);
+                       List.of("on"), null);
         assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.SHORTEST, false,
-                       Arrays.asList("on"), null);
+                       List.of("on"), null);
         assertTokenize("наименование", Language.RUSSIAN, StemMode.SHORTEST, false,
-                Arrays.asList("наименован"), null);
+                       List.of("наименован"), null);
     }
 
     @Test
     public void testLargeTextTokenization() {
-        StringBuilder sb = new StringBuilder();
-        String s = "teststring ";
-        for (int i = 0; i < 100000; i++) {
-            sb.append(s);
-        }
-
-        String input = sb.toString();
-
+        String input = "teststring ".repeat(100000);
         int numTokens = 0;
         List<Long> pos = new ArrayList<>();
         for (Token t : tokenizer.tokenize(input, Language.ENGLISH, StemMode.NONE, false)) {
@@ -103,11 +95,8 @@ public class OpenNlpTokenizationTestCase {
 
     @Test
     public void testLargeTokenGuard() {
-        StringBuilder str = new StringBuilder();
-        for (int i = 0; i < 128 * 256; i++) {
-            str.append("ab");
-        }
-        Iterator<Token> it = tokenizer.tokenize(str.toString(), Language.ENGLISH, StemMode.NONE, false).iterator();
+        String input = "ab".repeat(128 * 256);
+        Iterator<Token> it = tokenizer.tokenize(input, Language.ENGLISH, StemMode.NONE, false).iterator();
         assertTrue(it.hasNext());
         assertNotNull(it.next().getTokenString());
         assertFalse(it.hasNext());
@@ -203,15 +192,15 @@ public class OpenNlpTokenizationTestCase {
     }
 
     /**
-     * <p>Compare the results of running an input string through the tokenizer with an "index" truth, and an optional
-     * "orig" truth.</p>
+     * Compare the results of running an input string through the tokenizer with an "index" truth, and an optional
+     * "orig" truth.
      *
-     * @param input      The text to process, passed to tokenizer.
-     * @param language   The language tag, passed to tokenizer.
-     * @param stemMode   If stemMode != NONE, test will silently succeed if tokenizer does not do stemming.
-     * @param accentDrop Passed to the tokenizer.
-     * @param indexed    Compared to the "TokenString" result from the tokenizer.
-     * @param orig       Compared to the "Orig" result from the tokenizer.
+     * @param input      the text to process, passed to tokenizer
+     * @param language   the language tag, passed to tokenizer
+     * @param stemMode   if stemMode != NONE, test will silently succeed if tokenizer does not do stemming
+     * @param accentDrop passed to the tokenizer
+     * @param indexed    compared to the "TokenString" result from the tokenizer
+     * @param orig       compared to the "Orig" result from the tokenizer
      */
     private void assertTokenize(String input, Language language, StemMode stemMode, boolean accentDrop,
                                 List<String> indexed, List<String> orig) {
author	Jon Bratseth <bratseth@gmail.com>	2021-04-14 10:16:06 +0200
committer	Jon Bratseth <bratseth@gmail.com>	2021-04-14 10:16:06 +0200
commit	57c431c7b1c8f3f1086a8600d844db44979f72a0 (patch)
tree	b27e731e2fb63d6d1727ae2aa1ea9513eb2322d3 /linguistics
parent	9ec6d6986ae64496cedc5a23fe2ddb8447eabcd4 (diff)