summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Henning Baldersheim <balder@yahoo-inc.com>, 2023-10-20 17:50:35 +0200
committer: GitHub <noreply@github.com>, 2023-10-20 17:50:35 +0200
commit: 46bd10fe1fbbbbf388155bf72e73ab26fbd0bfab (patch)
tree: 37c8e940004f64ac2fa0fe538749f36ce354db8b
parent: 2f60ec76050d005a04dc20b09ab7877bdd3abfb5 (diff)
parent: e228115788634d77f5b6354c12c1718252044860 (diff)
Merge pull request #29045 from vespa-engine/jonmv/fix-text-substring-in-tokenisation
Avoid cutting surrogate pairs when tokenising
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java | 3
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java | 2
-rw-r--r-- vespajlib/src/main/java/com/yahoo/text/Text.java | 9
-rw-r--r-- vespajlib/src/test/java/com/yahoo/text/TextTestCase.java | 5
4 files changed, 12 insertions, 7 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 61ee3069127..191d067effe 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -12,6 +12,7 @@ import com.yahoo.language.Linguistics;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import com.yahoo.language.process.Tokenizer;
+import com.yahoo.text.Text;
import java.util.HashMap;
import java.util.Map;
@@ -71,7 +72,7 @@ public class LinguisticsAnnotator {
Tokenizer tokenizer = factory.getTokenizer();
String input = (text.getString().length() <= config.getMaxTokenizeLength())
? text.getString()
- : text.getString().substring(0, config.getMaxTokenizeLength());
+ : Text.substringByCodepoints(text.getString(), 0, config.getMaxTokenizeLength());
Iterable<Token> tokens = tokenizer.tokenize(input, config.getLanguage(), config.getStemMode(),
config.getRemoveAccents());
TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
index 143acc174f0..5ad6a382abd 100644
--- a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
+++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
@@ -7,7 +7,7 @@ import java.util.Locale;
/**
* This class provides a case normalization operation to be used e.g. when
- * document search should be case insensitive.
+ * document search should be case-insensitive.
*
* @author Simon Thoresen Hult
*/
diff --git a/vespajlib/src/main/java/com/yahoo/text/Text.java b/vespajlib/src/main/java/com/yahoo/text/Text.java
index 7c835965a1a..fe931ef34a3 100644
--- a/vespajlib/src/main/java/com/yahoo/text/Text.java
+++ b/vespajlib/src/main/java/com/yahoo/text/Text.java
@@ -170,15 +170,15 @@ public final class Text {
}
/**
- * Returns a string which is never larger than the given number of characters.
+ * Returns a string which is never larger than the given number of code points.
* If the string is longer than the given length it will be truncated.
* If length is 4 or less the string will be truncated to length.
* If length is longer than 4, it will be truncated at length-4 with " ..." added at the end.
*/
public static String truncate(String s, int length) {
- if (s.length() <= length) return s;
- if (length <= 4) return s.substring(0, length);
- return s.substring(0, length - 4) + " ...";
+ if (s.codePointCount(0, s.length()) <= length) return s;
+ if (length <= 4) return substringByCodepoints(s, 0, length);
+ return substringByCodepoints(s, 0, length - 4) + " ...";
}
public static String substringByCodepoints(String s, int fromCP, int toCP) {
@@ -208,4 +208,5 @@ public final class Text {
public static String format(String format, Object... args) {
return String.format(Locale.US, format, args);
}
+
}
diff --git a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java
index f192f678c13..9bb4668b7cb 100644
--- a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java
+++ b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java
@@ -104,6 +104,9 @@ public class TextTestCase {
assertEquals("", Text.truncate("ab", 0));
assertEquals("ab c", Text.truncate("ab cde", 4));
assertEquals("a ...", Text.truncate("ab cde", 5));
+ assertEquals("abc ...", Text.truncate("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", 7));
+ assertEquals("abc\uD83D\uDE48 ...", Text.truncate("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", 8));
+ assertEquals("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", Text.truncate("abc\uD83D\uDE48\uD83D\uDE49\uD83D\uDE4Adef", 9));
}
@Test
@@ -152,6 +155,6 @@ public class TextTestCase {
sum = benchmarkIsValid(strings, 100000000);
diff = System.nanoTime() - start;
System.out.println("Validation num isValid = " + sum + ". Took " + diff + "ns");
-
}
+
}