summary | refs | log | tree | commit | diff | stats
path: root/linguistics
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@gmail.com> 2021-04-14 10:16:06 +0200
committer Jon Bratseth <bratseth@gmail.com> 2021-04-14 10:16:06 +0200
commit 57c431c7b1c8f3f1086a8600d844db44979f72a0 (patch)
tree b27e731e2fb63d6d1727ae2aa1ea9513eb2322d3 /linguistics
parent 9ec6d6986ae64496cedc5a23fe2ddb8447eabcd4 (diff)
No functional changes
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java 63
1 files changed, 26 insertions, 37 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
index cd27551cd9a..2239a62f840 100644
--- a/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/opennlp/OpenNlpTokenizationTestCase.java
@@ -9,7 +9,6 @@ import org.junit.Test;
import java.util.ArrayList;
-import java.util.Arrays;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
@@ -33,61 +32,54 @@ public class OpenNlpTokenizationTestCase {
@Test
public void testTokenizer() {
assertTokenize("This is a test, 123",
- Arrays.asList("this", "is", "a", "test", "123"),
- Arrays.asList("This", " ", "is", " ", "a", " ", "test", ",", " ", "123"));
+ List.of("this", "is", "a", "test", "123"),
+ List.of("This", " ", "is", " ", "a", " ", "test", ",", " ", "123"));
}
@Test
public void testUnderScoreTokenization() {
- assertTokenize("ugcapi_1", Language.ENGLISH, StemMode.SHORTEST, true, Arrays.asList("ugcapi", "1"), null);
+ assertTokenize("ugcapi_1", Language.ENGLISH, StemMode.SHORTEST, true, List.of("ugcapi", "1"), null);
}
@Test
public void testPhrasesWithPunctuation() {
assertTokenize("PHY_101.html a space/time or space-time course", Language.ENGLISH, StemMode.NONE,
false,
- Arrays.asList("phy", "101", "html", "a", "space", "time", "or", "space", "time", "course"),
+ List.of("phy", "101", "html", "a", "space", "time", "or", "space", "time", "course"),
null);
- assertTokenize("PHY_101.", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("phy", "101"), null);
- assertTokenize("101.3", Language.ENGLISH, StemMode.NONE, false, Arrays.asList("101", "3"), null);
+ assertTokenize("PHY_101.", Language.ENGLISH, StemMode.NONE, false, List.of("phy", "101"), null);
+ assertTokenize("101.3", Language.ENGLISH, StemMode.NONE, false, List.of("101", "3"), null);
}
@Test
public void testDoubleWidthTokenization() {
// "sony"
assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.NONE, false,
- Arrays.asList("sony"), null);
+ List.of("sony"), null);
assertTokenize("\uFF53\uFF4F\uFF4E\uFF59", Language.ENGLISH, StemMode.SHORTEST, false,
- Arrays.asList("sony"), null);
+ List.of("sony"), null);
// "SONY"
assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.NONE, false,
- Arrays.asList("sony"), null);
+ List.of("sony"), null);
assertTokenize("\uFF33\uFF2F\uFF2E\uFF39", Language.ENGLISH, StemMode.SHORTEST, false,
- Arrays.asList("sony"), null);
+ List.of("sony"), null);
// "on"
assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.NONE, false,
- Arrays.asList("on"), null);
+ List.of("on"), null);
assertTokenize("\uFF4F\uFF4E", Language.ENGLISH, StemMode.SHORTEST, false,
- Arrays.asList("on"), null);
+ List.of("on"), null);
// "ON"
assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.NONE, false,
- Arrays.asList("on"), null);
+ List.of("on"), null);
assertTokenize("\uFF2F\uFF2E", Language.ENGLISH, StemMode.SHORTEST, false,
- Arrays.asList("on"), null);
+ List.of("on"), null);
assertTokenize("наименование", Language.RUSSIAN, StemMode.SHORTEST, false,
- Arrays.asList("наименован"), null);
+ List.of("наименован"), null);
}
@Test
public void testLargeTextTokenization() {
- StringBuilder sb = new StringBuilder();
- String s = "teststring ";
- for (int i = 0; i < 100000; i++) {
- sb.append(s);
- }
-
- String input = sb.toString();
-
+ String input = "teststring ".repeat(100000);
int numTokens = 0;
List<Long> pos = new ArrayList<>();
for (Token t : tokenizer.tokenize(input, Language.ENGLISH, StemMode.NONE, false)) {
@@ -103,11 +95,8 @@ public class OpenNlpTokenizationTestCase {
@Test
public void testLargeTokenGuard() {
- StringBuilder str = new StringBuilder();
- for (int i = 0; i < 128 * 256; i++) {
- str.append("ab");
- }
- Iterator<Token> it = tokenizer.tokenize(str.toString(), Language.ENGLISH, StemMode.NONE, false).iterator();
+ String input = "ab".repeat(128 * 256);
+ Iterator<Token> it = tokenizer.tokenize(input, Language.ENGLISH, StemMode.NONE, false).iterator();
assertTrue(it.hasNext());
assertNotNull(it.next().getTokenString());
assertFalse(it.hasNext());
@@ -203,15 +192,15 @@ public class OpenNlpTokenizationTestCase {
}
/**
- * <p>Compare the results of running an input string through the tokenizer with an "index" truth, and an optional
- * "orig" truth.</p>
+ * Compare the results of running an input string through the tokenizer with an "index" truth, and an optional
+ * "orig" truth.
*
- * @param input The text to process, passed to tokenizer.
- * @param language The language tag, passed to tokenizer.
- * @param stemMode If stemMode != NONE, test will silently succeed if tokenizer does not do stemming.
- * @param accentDrop Passed to the tokenizer.
- * @param indexed Compared to the "TokenString" result from the tokenizer.
- * @param orig Compared to the "Orig" result from the tokenizer.
+ * @param input the text to process, passed to tokenizer
+ * @param language the language tag, passed to tokenizer
+ * @param stemMode if stemMode != NONE, test will silently succeed if tokenizer does not do stemming
+ * @param accentDrop passed to the tokenizer
+ * @param indexed compared to the "TokenString" result from the tokenizer
+ * @param orig compared to the "Orig" result from the tokenizer
*/
private void assertTokenize(String input, Language language, StemMode stemMode, boolean accentDrop,
List<String> indexed, List<String> orig) {