about | summary | refs | log | tree | commit | diff | stats
path: root/linguistics/src/test/java/com/yahoo
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@gmail.com> 2020-06-25 14:09:24 +0200
committer Jon Bratseth <bratseth@gmail.com> 2020-06-25 14:09:24 +0200
commit 74bffb810050342bd32065a818e4f74b8cd7ce51 (patch)
tree f4e50acb6aee944f0176d049ee94ca4a3a0614c6 /linguistics/src/test/java/com/yahoo
parent 0680bf96a4bf17aec0b9fde98ac5369c0991f0fb (diff)
Surrogate aware gram splitting
Diffstat (limited to 'linguistics/src/test/java/com/yahoo')
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java 46
1 files changed, 37 insertions, 9 deletions
diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
index d862280550c..8fa23626193 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
@@ -4,9 +4,9 @@ package com.yahoo.language.process;
import com.yahoo.language.simple.SimpleLinguistics;
import org.junit.Test;
+import java.util.Arrays;
import java.util.Iterator;
-import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.*;
/**
@@ -113,6 +113,30 @@ public class GramSplitterTestCase {
"\u7345\u9069\u5e02]");
}
+ @Test
+ public void testSurrogatePairs() {
+ // A surrogate pair (two Java chars that together encode the single code point U+10000) representing a code point in the "letter" class
+ String s = "\uD800\uDC00";
+
+ assertGramSplits(s, 1, s);
+ assertGramSplits(s, 2, s);
+ assertGramSplits(s + s, 1, s, s);
+ assertGramSplits(s + s, 2, s + s);
+ assertGramSplits(s + s, 3, s + s);
+ assertGramSplits(s + " " + s + s + " " + s, 1, s, s, s, s);
+ assertGramSplits(s + " " + s + s + " " + s, 2, s, s + s, s);
+ assertGramSplits(s + " " + s + s + " " + s, 3, s, s + s, s);
+ assertGramSplits(" " + s + " " + s + s + " " + s + " ", 1, s, s, s, s);
+ assertGramSplits(" " + s + " " + s + s + " " + s + " ", 2, s, s + s, s);
+ assertGramSplits(" " + s + " " + s + s + " " + s + " ", 3, s, s + s, s);
+ assertGramSplits(" " + s + " " + s + s + " " + s + " ", 1, s, s, s, s); // NOTE(review): this call and the next two read identically to the three calls above - confirm whether different whitespace (e.g. double spaces) was intended
+ assertGramSplits(" " + s + " " + s + s + " " + s + " ", 2, s, s + s, s);
+ assertGramSplits(" " + s + " " + s + s + " " + s + " ", 3, s, s + s, s);
+ assertGramSplits(s + " " + s + " " + s, 4, s, s, s);
+ assertGramSplits(s + s + s + s, 3, s + s + s, s + s + s);
+ assertGramSplits(s + s + s + s + " " + s, 3, s + s + s, s + s + s, s);
+ }
+
@Test(expected = IllegalArgumentException.class)
public void testInvalidSplitSize() {
gramSplitter.split("en", 0);
@@ -128,23 +152,27 @@ public class GramSplitterTestCase {
String text = "en gul bille sang";
Iterator<GramSplitter.Gram> grams = gramSplitter.split(text, 3);
- assertThat(grams.next().extractFrom(text), is("en"));
+ assertEquals("en", grams.next().extractFrom(text));
assertTrue(grams.hasNext());
assertTrue(grams.hasNext());
- assertThat(grams.next().extractFrom(text), is("gul"));
- assertThat(grams.next().extractFrom(text), is("bil"));
- assertThat(grams.next().extractFrom(text), is("ill"));
- assertThat(grams.next().extractFrom(text), is("lle"));
+ assertEquals("gul", grams.next().extractFrom(text));
+ assertEquals("bil", grams.next().extractFrom(text));
+ assertEquals("ill", grams.next().extractFrom(text));
+ assertEquals("lle", grams.next().extractFrom(text));
assertTrue(grams.hasNext());
assertTrue(grams.hasNext());
- assertThat(grams.next().extractFrom(text), is("san"));
- assertThat(grams.next().extractFrom(text), is("ang"));
+ assertEquals("san", grams.next().extractFrom(text));
+ assertEquals("ang", grams.next().extractFrom(text));
assertFalse(grams.hasNext());
assertFalse(grams.hasNext());
}
+ private void assertGramSplits(String input, int gramSize, String ... expected) { // asserts that the expected strings equal, element by element and in order, the extracted list from splitting input with gramSize
+ assertEquals(Arrays.asList(expected), gramSplitter.split(input, gramSize).toExtractedList());
+ }
+
private void assertGramSplit(String input, int gramSize, String expected) {
- assertThat(gramSplitter.split(input, gramSize).toExtractedList().toString(), is(expected));
+ assertEquals(expected, gramSplitter.split(input, gramSize).toExtractedList().toString()); // expected is compared against the toString() rendering of the extracted list, not against the list itself
}
}