about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- config-model/src/main/java/com/yahoo/vespa/model/content/cluster/DocumentSelectionBuilder.java | 6
-rw-r--r-- document/src/main/java/com/yahoo/document/select/simple/SelectionParser.java | 4
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java | 4
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java | 109
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java | 46
5 files changed, 130 insertions, 39 deletions
diff --git a/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/DocumentSelectionBuilder.java b/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/DocumentSelectionBuilder.java
index 07b87a41b2f..1d5d2420c8f 100644
--- a/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/DocumentSelectionBuilder.java
+++ b/config-model/src/main/java/com/yahoo/vespa/model/content/cluster/DocumentSelectionBuilder.java
@@ -69,12 +69,10 @@ public class DocumentSelectionBuilder {
String globalSelection = elem.stringAttribute("selection");
if (globalSelection != null) {
validateSelectionExpression(globalSelection, null);
- StringBuilder global = new StringBuilder();
- global.append('(').append(globalSelection).append(") AND (")
- .append(sb.toString()).append(')');
- return global.toString();
+ return "(" + globalSelection + ") AND (" + sb + ")";
}
}
return sb.toString();
}
+
}
diff --git a/document/src/main/java/com/yahoo/document/select/simple/SelectionParser.java b/document/src/main/java/com/yahoo/document/select/simple/SelectionParser.java
index 0614731265b..c250d3bede1 100644
--- a/document/src/main/java/com/yahoo/document/select/simple/SelectionParser.java
+++ b/document/src/main/java/com/yahoo/document/select/simple/SelectionParser.java
@@ -8,8 +8,11 @@ import com.yahoo.document.select.rule.ExpressionNode;
* @author baldersheim
*/
public class SelectionParser extends Parser {
+
private ExpressionNode node;
+
public ExpressionNode getNode() { return node; }
+
public boolean parse(CharSequence s) {
boolean retval = false;
IdSpecParser id = new IdSpecParser();
@@ -40,4 +43,5 @@ public class SelectionParser extends Parser {
return retval;
}
+
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
index d91338e3d3f..adf3e4ecaaa 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NGramExpression.java
@@ -65,10 +65,10 @@ public final class NGramExpression extends Expression {
// annotate gram as a word term
String gramString = gram.extractFrom(input.getString());
- typedSpan(gram.getStart(), gram.getLength(), TokenType.ALPHABETIC, spanList).
+ typedSpan(gram.getStart(), gram.getCodePointCount(), TokenType.ALPHABETIC, spanList).
annotate(LinguisticsAnnotator.lowerCaseTermAnnotation(gramString, gramString));
- lastPosition = gram.getStart() + gram.getLength();
+ lastPosition = gram.getStart() + gram.getCodePointCount();
}
// handle punctuation at the end
if (lastPosition < input.toString().length()) {
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
index aa7ae59edf9..8a255dd5370 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
@@ -49,12 +49,12 @@ public class GramSplitter {
private final CharacterClasses characterClasses;
/** Text to split */
- private final String input;
+ private final UnicodeString input;
- /** Gram size */
+ /** Gram size in code points */
private final int n;
- /** Current index */
+ /** Current position in the string */
private int i = 0;
/** Whether the last thing that happened was being on a separator (including the start of the string) */
@@ -64,7 +64,7 @@ public class GramSplitter {
private Gram nextGram = null;
public GramSplitterIterator(String input, int n, CharacterClasses characterClasses) {
- this.input = input;
+ this.input = new UnicodeString(input);
this.n = n;
this.characterClasses = characterClasses;
}
@@ -90,38 +90,40 @@ public class GramSplitter {
private Gram findNext() {
// Skip to next word character
while (i < input.length() && !characterClasses.isLetterOrDigit(input.codePointAt(i))) {
- i++;
+ i = input.next(i);
isFirstAfterSeparator = true;
}
if (i >= input.length()) return null;
- String gram = input.substring(i, Math.min(i + n, input.length()));
- int nonWordChar = indexOfNonWordChar(gram);
+ UnicodeString gram = input.substring(i, n);
+ int nonWordChar = indexOfNonWordCodepoint(gram);
if (nonWordChar == 0) throw new RuntimeException("Programming error");
if (nonWordChar > 0)
- gram = gram.substring(0, nonWordChar);
+ gram = new UnicodeString(gram.toString().substring(0, nonWordChar));
- if (gram.length() == n) { // normal case: got a full length gram
- i++;
+ if (gram.codePointCount() == n) { // normal case: got a full length gram
+ Gram g = new Gram(i, gram.codePointCount());
+ i = input.next(i);
isFirstAfterSeparator = false;
- return new Gram(i - 1, gram.length());
+ return g;
}
else { // gram is too short due either to a non-word separator or end of string
if (isFirstAfterSeparator) { // make a gram anyway
- i++;
+ Gram g = new Gram(i, gram.codePointCount());
+ i = input.next(i);
isFirstAfterSeparator = false;
- return new Gram(i - 1, gram.length());
+ return g;
} else { // skip to next
- i += gram.length() + 1;
+ i = input.skip(gram.codePointCount() + 1, i);
isFirstAfterSeparator = true;
return findNext();
}
}
}
- private int indexOfNonWordChar(String s) {
- for (int i = 0; i < s.length(); i++) {
+ private int indexOfNonWordCodepoint(UnicodeString s) {
+ for (int i = 0; i < s.length(); i = s.next(i)) {
if ( ! characterClasses.isLetterOrDigit(s.codePointAt(i)))
return i;
}
@@ -151,24 +153,29 @@ public class GramSplitter {
*/
public static final class Gram {
- private int start, length;
+ private int start, codePointCount;
- public Gram(int start, int length) {
+ public Gram(int start, int codePointCount) {
this.start = start;
- this.length = length;
+ this.codePointCount = codePointCount;
}
public int getStart() {
return start;
}
- public int getLength() {
- return length;
+ public int getCodePointCount() {
+ return codePointCount;
}
/** Returns this gram as a string from the input string */
public String extractFrom(String input) {
- return input.substring(start, start + length);
+ return extractFrom(new UnicodeString(input));
+ }
+
+ /** Returns this gram as a string from the input string */
+ public String extractFrom(UnicodeString input) {
+ return input.substring(start, codePointCount).toString();
}
@Override
@@ -177,7 +184,7 @@ public class GramSplitter {
if ( ! (o instanceof Gram)) return false;
Gram gram = (Gram)o;
- if (length != gram.length) return false;
+ if (codePointCount != gram.codePointCount) return false;
if (start != gram.start) return false;
return true;
}
@@ -185,10 +192,64 @@ public class GramSplitter {
@Override
public int hashCode() {
int result = start;
- result = 31 * result + length;
+ result = 31 * result + codePointCount;
return result;
}
}
+ /**
+ * A string wrapper with some convenience methods for dealing with UTF-16 surrogate pairs
+ * (a crime against humanity for which we'll be negatively impacted for at least the next million years).
+ */
+ private static class UnicodeString {
+
+ private final String s;
+
+ public UnicodeString(String s) {
+ this.s = s;
+ }
+
+ /** Substring in code point space */
+ public UnicodeString substring(int start, int codePoints) {
+ int offset = s.offsetByCodePoints(start, Math.min(codePoints, s.codePointCount(start, s.length())));
+ if (offset < 0)
+ return new UnicodeString(s.substring(start));
+ else
+ return new UnicodeString(s.substring(start, offset));
+ }
+
+ /** Returns the position that is codePointCount code points after start (which may be past the end of the string) */
+ public int skip(int codePointCount, int start) {
+ int index = start;
+ for (int i = 0; i < codePointCount; i++) {
+ index = next(index);
+ if (index > s.length()) break;
+ }
+ return index;
+ }
+
+ /** Returns the index of the next code point after index (which may be past the end of the string) */
+ public int next(int index) {
+ int next = index + 1;
+ if (next < s.length() && Character.isLowSurrogate(s.charAt(next)))
+ next++;
+ return next;
+ }
+
+ /** Returns the number of positions (not code points) in this */
+ public int length() { return s.length(); }
+
+ /** Returns the number of code points in this */
+ public int codePointCount() { return s.codePointCount(0, s.length()); }
+
+ public int codePointAt(int index) {
+ return s.codePointAt(index);
+ }
+
+ @Override
+ public String toString() { return s; }
+
+ }
+
}
diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
index d862280550c..8fa23626193 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
@@ -4,9 +4,9 @@ package com.yahoo.language.process;
import com.yahoo.language.simple.SimpleLinguistics;
import org.junit.Test;
+import java.util.Arrays;
import java.util.Iterator;
-import static org.hamcrest.CoreMatchers.is;
import static org.junit.Assert.*;
/**
@@ -113,6 +113,30 @@ public class GramSplitterTestCase {
"\u7345\u9069\u5e02]");
}
+ @Test
+ public void testSurrogatePairs() {
+ // A surrogate pair representing a code point in the "letter" class
+ String s = "\uD800\uDC00";
+
+ assertGramSplits(s, 1, s);
+ assertGramSplits(s, 2, s);
+ assertGramSplits(s + s, 1, s, s);
+ assertGramSplits(s + s, 2, s + s);
+ assertGramSplits(s + s, 3, s + s);
+ assertGramSplits(s + " " + s + s + " " + s, 1, s, s, s, s);
+ assertGramSplits(s + " " + s + s + " " + s, 2, s, s + s, s);
+ assertGramSplits(s + " " + s + s + " " + s, 3, s, s + s, s);
+ assertGramSplits(" " + s + " " + s + s + " " + s + " ", 1, s, s, s, s);
+ assertGramSplits(" " + s + " " + s + s + " " + s + " ", 2, s, s + s, s);
+ assertGramSplits(" " + s + " " + s + s + " " + s + " ", 3, s, s + s, s);
+ assertGramSplits(" " + s + " " + s + s + " " + s + " ", 1, s, s, s, s);
+ assertGramSplits(" " + s + " " + s + s + " " + s + " ", 2, s, s + s, s);
+ assertGramSplits(" " + s + " " + s + s + " " + s + " ", 3, s, s + s, s);
+ assertGramSplits(s + " " + s + " " + s, 4, s, s, s);
+ assertGramSplits(s + s + s + s, 3, s + s + s, s + s + s);
+ assertGramSplits(s + s + s + s + " " + s, 3, s + s + s, s + s + s, s);
+ }
+
@Test(expected = IllegalArgumentException.class)
public void testInvalidSplitSize() {
gramSplitter.split("en", 0);
@@ -128,23 +152,27 @@ public class GramSplitterTestCase {
String text = "en gul bille sang";
Iterator<GramSplitter.Gram> grams = gramSplitter.split(text, 3);
- assertThat(grams.next().extractFrom(text), is("en"));
+ assertEquals("en", grams.next().extractFrom(text));
assertTrue(grams.hasNext());
assertTrue(grams.hasNext());
- assertThat(grams.next().extractFrom(text), is("gul"));
- assertThat(grams.next().extractFrom(text), is("bil"));
- assertThat(grams.next().extractFrom(text), is("ill"));
- assertThat(grams.next().extractFrom(text), is("lle"));
+ assertEquals("gul", grams.next().extractFrom(text));
+ assertEquals("bil", grams.next().extractFrom(text));
+ assertEquals("ill", grams.next().extractFrom(text));
+ assertEquals("lle", grams.next().extractFrom(text));
assertTrue(grams.hasNext());
assertTrue(grams.hasNext());
- assertThat(grams.next().extractFrom(text), is("san"));
- assertThat(grams.next().extractFrom(text), is("ang"));
+ assertEquals("san", grams.next().extractFrom(text));
+ assertEquals("ang", grams.next().extractFrom(text));
assertFalse(grams.hasNext());
assertFalse(grams.hasNext());
}
+ private void assertGramSplits(String input, int gramSize, String ... expected) {
+ assertEquals(Arrays.asList(expected), gramSplitter.split(input, gramSize).toExtractedList());
+ }
+
private void assertGramSplit(String input, int gramSize, String expected) {
- assertThat(gramSplitter.split(input, gramSize).toExtractedList().toString(), is(expected));
+ assertEquals(expected, gramSplitter.split(input, gramSize).toExtractedList().toString());
}
}