diff options
author | Jon Bratseth <bratseth@gmail.com> | 2020-06-25 14:09:24 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2020-06-25 14:09:24 +0200 |
commit | 74bffb810050342bd32065a818e4f74b8cd7ce51 (patch) | |
tree | f4e50acb6aee944f0176d049ee94ca4a3a0614c6 | |
parent | 0680bf96a4bf17aec0b9fde98ac5369c0991f0fb (diff) |
Surrogate aware gram splitting
5 files changed, 130 insertions, 39 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/DocumentSelectionBuilder.java b/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/DocumentSelectionBuilder.java index 07b87a41b2f..1d5d2420c8f 100644 --- a/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/DocumentSelectionBuilder.java +++ b/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/DocumentSelectionBuilder.java @@ -69,12 +69,10 @@ public class DocumentSelectionBuilder { String globalSelection = elem.stringAttribute("selection"); if (globalSelection != null) { validateSelectionExpression(globalSelection, null); - StringBuilder global = new StringBuilder(); - global.append('(').append(globalSelection).append(") AND (") - .append(sb.toString()).append(')'); - return global.toString(); + return "(" + globalSelection + ") AND (" + sb + ")"; } } return sb.toString(); } + } diff --git a/document/src/main/java/com/yahoo/document/select/simple/SelectionParser.java b/document/src/main/java/com/yahoo/document/select/simple/SelectionParser.java index 0614731265b..c250d3bede1 100644 --- a/document/src/main/java/com/yahoo/document/select/simple/SelectionParser.java +++ b/document/src/main/java/com/yahoo/document/select/simple/SelectionParser.java @@ -8,8 +8,11 @@ import com.yahoo.document.select.rule.ExpressionNode; * @author baldersheim */ public class SelectionParser extends Parser { + private ExpressionNode node; + public ExpressionNode getNode() { return node; } + public boolean parse(CharSequence s) { boolean retval = false; IdSpecParser id = new IdSpecParser(); @@ -40,4 +43,5 @@ public class SelectionParser extends Parser { return retval; } + } diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java index d91338e3d3f..adf3e4ecaaa 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java @@ -65,10 +65,10 @@ public final class NGramExpression extends Expression { // annotate gram as a word term String gramString = gram.extractFrom(input.getString()); - typedSpan(gram.getStart(), gram.getLength(), TokenType.ALPHABETIC, spanList). + typedSpan(gram.getStart(), gram.getCodePointCount(), TokenType.ALPHABETIC, spanList). annotate(LinguisticsAnnotator.lowerCaseTermAnnotation(gramString, gramString)); - lastPosition = gram.getStart() + gram.getLength(); + lastPosition = gram.getStart() + gram.getCodePointCount(); } // handle punctuation at the end if (lastPosition < input.toString().length()) { diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java index aa7ae59edf9..8a255dd5370 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java +++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java @@ -49,12 +49,12 @@ public class GramSplitter { private final CharacterClasses characterClasses; /** Text to split */ - private final String input; + private final UnicodeString input; - /** Gram size */ + /** Gram size in code points */ private final int n; - /** Current index */ + /** Current position in the string */ private int i = 0; /** Whether the last thing that happened was being on a separator (including the start of the string) */ @@ -64,7 +64,7 @@ public class GramSplitter { private Gram nextGram = null; public GramSplitterIterator(String input, int n, CharacterClasses characterClasses) { - this.input = input; + this.input = new UnicodeString(input); this.n = n; this.characterClasses = characterClasses; } @@ -90,38 +90,40 @@ public class GramSplitter { private Gram findNext() { // Skip to next word character while (i < input.length() && !characterClasses.isLetterOrDigit(input.codePointAt(i))) { - i++; + i = input.next(i); isFirstAfterSeparator = true; } if (i >= input.length()) return null; - String gram = input.substring(i, Math.min(i + n, input.length())); - int nonWordChar = indexOfNonWordChar(gram); + UnicodeString gram = input.substring(i, n); + int nonWordChar = indexOfNonWordCodepoint(gram); if (nonWordChar == 0) throw new RuntimeException("Programming error"); if (nonWordChar > 0) - gram = gram.substring(0, nonWordChar); + gram = new UnicodeString(gram.toString().substring(0, nonWordChar)); - if (gram.length() == n) { // normal case: got a full length gram - i++; + if (gram.codePointCount() == n) { // normal case: got a full length gram + Gram g = new Gram(i, gram.codePointCount()); + i = input.next(i); isFirstAfterSeparator = false; - return new Gram(i - 1, gram.length()); + return g; } else { // gram is too short due either to a non-word separator or end of string if (isFirstAfterSeparator) { // make a gram anyway - i++; + Gram g = new Gram(i, gram.codePointCount()); + i = input.next(i); isFirstAfterSeparator = false; - return new Gram(i - 1, gram.length()); + return g; } else { // skip to next - i += gram.length() + 1; + i = input.skip(gram.codePointCount() + 1, i); isFirstAfterSeparator = true; return findNext(); } } } - private int indexOfNonWordChar(String s) { - for (int i = 0; i < s.length(); i++) { + private int indexOfNonWordCodepoint(UnicodeString s) { + for (int i = 0; i < s.length(); i = s.next(i)) { if ( ! characterClasses.isLetterOrDigit(s.codePointAt(i))) return i; } @@ -151,24 +153,29 @@ public class GramSplitter { */ public static final class Gram { - private int start, length; + private int start, codePointCount; - public Gram(int start, int length) { + public Gram(int start, int codePointCount) { this.start = start; - this.length = length; + this.codePointCount = codePointCount; } public int getStart() { return start; } - public int getLength() { - return length; + public int getCodePointCount() { + return codePointCount; } /** Returns this gram as a string from the input string */ public String extractFrom(String input) { - return input.substring(start, start + length); + return extractFrom(new UnicodeString(input)); + } + + /** Returns this gram as a string from the input string */ + public String extractFrom(UnicodeString input) { + return input.substring(start, codePointCount).toString(); } @Override @@ -177,7 +184,7 @@ public class GramSplitter { if ( ! (o instanceof Gram)) return false; Gram gram = (Gram)o; - if (length != gram.length) return false; + if (codePointCount != gram.codePointCount) return false; if (start != gram.start) return false; return true; } @@ -185,10 +192,64 @@ public class GramSplitter { @Override public int hashCode() { int result = start; - result = 31 * result + length; + result = 31 * result + codePointCount; return result; } } + /** + * A string wrapper with some convenience methods for dealing with UTF-16 surrogate pairs + * (a crime against humanity for which we'll be negatively impacted for at least the next million years). + */ + private static class UnicodeString { + + private final String s; + + public UnicodeString(String s) { + this.s = s; + } + + /** Substring in code point space */ + public UnicodeString substring(int start, int codePoints) { + int offset = s.offsetByCodePoints(start, Math.min(codePoints, s.codePointCount(start, s.length()))); + if (offset < 0) + return new UnicodeString(s.substring(start)); + else + return new UnicodeString(s.substring(start, offset)); + } + + /** Returns the position count code points after start (which may be past the end of the string) */ + public int skip(int codePointCount, int start) { + int index = start; + for (int i = 0; i < codePointCount; i++) { + index = next(index); + if (index > s.length()) break; + } + return index; + } + + /** Returns the index of the next code point after start (which may be past the end of the string) */ + public int next(int index) { + int next = index + 1; + if (next < s.length() && Character.isLowSurrogate(s.charAt(next))) + next++; + return next; + } + + /** Returns the number of positions (not code points) in this */ + public int length() { return s.length(); } + + /** Returns the number of code points in this */ + public int codePointCount() { return s.codePointCount(0, s.length()); } + + public int codePointAt(int index) { + return s.codePointAt(index); + } + + @Override + public String toString() { return s; } + + } + } diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java index d862280550c..8fa23626193 100644 --- a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java @@ -4,9 +4,9 @@ package com.yahoo.language.process; import com.yahoo.language.simple.SimpleLinguistics; import org.junit.Test; +import java.util.Arrays; import java.util.Iterator; -import static org.hamcrest.CoreMatchers.is; import static org.junit.Assert.*; /** @@ -113,6 +113,30 @@ public class GramSplitterTestCase { "\u7345\u9069\u5e02]"); } + @Test + public void testSurrogatePairs() { + // A surrogate pair representing a code point in the "letter" class + String s = "\uD800\uDC00"; + + assertGramSplits(s, 1, s); + assertGramSplits(s, 2, s); + assertGramSplits(s + s, 1, s, s); + assertGramSplits(s + s, 2, s + s); + assertGramSplits(s + s, 3, s + s); + assertGramSplits(s + " " + s + s + " " + s, 1, s, s, s, s); + assertGramSplits(s + " " + s + s + " " + s, 2, s, s + s, s); + assertGramSplits(s + " " + s + s + " " + s, 3, s, s + s, s); + assertGramSplits(" " + s + " " + s + s + " " + s + " ", 1, s, s, s, s); + assertGramSplits(" " + s + " " + s + s + " " + s + " ", 2, s, s + s, s); + assertGramSplits(" " + s + " " + s + s + " " + s + " ", 3, s, s + s, s); + assertGramSplits(" " + s + " " + s + s + " " + s + " ", 1, s, s, s, s); + assertGramSplits(" " + s + " " + s + s + " " + s + " ", 2, s, s + s, s); + assertGramSplits(" " + s + " " + s + s + " " + s + " ", 3, s, s + s, s); + assertGramSplits(s + " " + s + " " + s, 4, s, s, s); + assertGramSplits(s + s + s + s, 3, s + s + s, s + s + s); + assertGramSplits(s + s + s + s + " " + s, 3, s + s + s, s + s + s, s); + } + @Test(expected = IllegalArgumentException.class) public void testInvalidSplitSize() { gramSplitter.split("en", 0); @@ -128,23 +152,27 @@ public class GramSplitterTestCase { String text = "en gul bille sang"; Iterator<GramSplitter.Gram> grams = gramSplitter.split(text, 3); - assertThat(grams.next().extractFrom(text), is("en")); + assertEquals("en", grams.next().extractFrom(text)); assertTrue(grams.hasNext()); assertTrue(grams.hasNext()); - assertThat(grams.next().extractFrom(text), is("gul")); - assertThat(grams.next().extractFrom(text), is("bil")); - assertThat(grams.next().extractFrom(text), is("ill")); - assertThat(grams.next().extractFrom(text), is("lle")); + assertEquals("gul", grams.next().extractFrom(text)); + assertEquals("bil", grams.next().extractFrom(text)); + assertEquals("ill", grams.next().extractFrom(text)); + assertEquals("lle", grams.next().extractFrom(text)); assertTrue(grams.hasNext()); assertTrue(grams.hasNext()); - assertThat(grams.next().extractFrom(text), is("san")); - assertThat(grams.next().extractFrom(text), is("ang")); + assertEquals("san", grams.next().extractFrom(text)); + assertEquals("ang", grams.next().extractFrom(text)); assertFalse(grams.hasNext()); assertFalse(grams.hasNext()); } + private void assertGramSplits(String input, int gramSize, String ... expected) { + assertEquals(Arrays.asList(expected), gramSplitter.split(input, gramSize).toExtractedList()); + } + private void assertGramSplit(String input, int gramSize, String expected) { - assertThat(gramSplitter.split(input, gramSize).toExtractedList().toString(), is(expected)); + assertEquals(expected, gramSplitter.split(input, gramSize).toExtractedList().toString()); } } |