diff options
author | Jon Bratseth <bratseth@gmail.com> | 2020-06-25 14:09:24 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2020-06-25 14:09:24 +0200 |
commit | 74bffb810050342bd32065a818e4f74b8cd7ce51 (patch) | |
tree | f4e50acb6aee944f0176d049ee94ca4a3a0614c6 /linguistics/src/test/java/com/yahoo | |
parent | 0680bf96a4bf17aec0b9fde98ac5369c0991f0fb (diff) |
Surrogate aware gram splitting
Diffstat (limited to 'linguistics/src/test/java/com/yahoo')
-rw-r--r-- | linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java | 46 |
1 files changed, 37 insertions, 9 deletions
```diff
diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
index d862280550c..8fa23626193 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
@@ -4,9 +4,9 @@ package com.yahoo.language.process;
 import com.yahoo.language.simple.SimpleLinguistics;
 import org.junit.Test;
 
+import java.util.Arrays;
 import java.util.Iterator;
 
-import static org.hamcrest.CoreMatchers.is;
 import static org.junit.Assert.*;
 
 /**
@@ -113,6 +113,30 @@ public class GramSplitterTestCase {
                         "\u7345\u9069\u5e02]");
     }
 
+    @Test
+    public void testSurrogatePairs() {
+        // A surrogate pair representing a code point in the "letter" class
+        String s = "\uD800\uDC00";
+
+        assertGramSplits(s, 1, s);
+        assertGramSplits(s, 2, s);
+        assertGramSplits(s + s, 1, s, s);
+        assertGramSplits(s + s, 2, s + s);
+        assertGramSplits(s + s, 3, s + s);
+        assertGramSplits(s + " " + s + s + " " + s, 1, s, s, s, s);
+        assertGramSplits(s + " " + s + s + " " + s, 2, s, s + s, s);
+        assertGramSplits(s + " " + s + s + " " + s, 3, s, s + s, s);
+        assertGramSplits(" " + s + " " + s + s + " " + s + " ", 1, s, s, s, s);
+        assertGramSplits(" " + s + " " + s + s + " " + s + " ", 2, s, s + s, s);
+        assertGramSplits(" " + s + " " + s + s + " " + s + " ", 3, s, s + s, s);
+        assertGramSplits(" " + s + " " + s + s + " " + s + " ", 1, s, s, s, s);
+        assertGramSplits(" " + s + " " + s + s + " " + s + " ", 2, s, s + s, s);
+        assertGramSplits(" " + s + " " + s + s + " " + s + " ", 3, s, s + s, s);
+        assertGramSplits(s + " " + s + " " + s, 4, s, s, s);
+        assertGramSplits(s + s + s + s, 3, s + s + s, s + s + s);
+        assertGramSplits(s + s + s + s + " " + s, 3, s + s + s, s + s + s, s);
+    }
+
     @Test(expected = IllegalArgumentException.class)
     public void testInvalidSplitSize() {
         gramSplitter.split("en", 0);
@@ -128,23 +152,27 @@ public class GramSplitterTestCase {
         String text = "en gul bille sang";
         Iterator<GramSplitter.Gram> grams = gramSplitter.split(text, 3);
 
-        assertThat(grams.next().extractFrom(text), is("en"));
+        assertEquals("en", grams.next().extractFrom(text));
         assertTrue(grams.hasNext());
         assertTrue(grams.hasNext());
-        assertThat(grams.next().extractFrom(text), is("gul"));
-        assertThat(grams.next().extractFrom(text), is("bil"));
-        assertThat(grams.next().extractFrom(text), is("ill"));
-        assertThat(grams.next().extractFrom(text), is("lle"));
+        assertEquals("gul", grams.next().extractFrom(text));
+        assertEquals("bil", grams.next().extractFrom(text));
+        assertEquals("ill", grams.next().extractFrom(text));
+        assertEquals("lle", grams.next().extractFrom(text));
         assertTrue(grams.hasNext());
         assertTrue(grams.hasNext());
-        assertThat(grams.next().extractFrom(text), is("san"));
-        assertThat(grams.next().extractFrom(text), is("ang"));
+        assertEquals("san", grams.next().extractFrom(text));
+        assertEquals("ang", grams.next().extractFrom(text));
         assertFalse(grams.hasNext());
         assertFalse(grams.hasNext());
     }
 
+    private void assertGramSplits(String input, int gramSize, String ... expected) {
+        assertEquals(Arrays.asList(expected), gramSplitter.split(input, gramSize).toExtractedList());
+    }
+
     private void assertGramSplit(String input, int gramSize, String expected) {
-        assertThat(gramSplitter.split(input, gramSize).toExtractedList().toString(), is(expected));
+        assertEquals(expected, gramSplitter.split(input, gramSize).toExtractedList().toString());
     }
 
 }
```