about summary refs log tree commit diff stats
path: root/linguistics/src
diff options
context:
space:
mode:
author jonmv <venstad@gmail.com> 2022-12-06 13:12:00 +0100
committer jonmv <venstad@gmail.com> 2022-12-06 13:12:00 +0100
commit adc2b8c2fcea257e0a1df826dc301f014baecec0 (patch)
tree 9faf12ef9068eaffca9b6ed0647696dde3e41367 /linguistics/src
parent 87bb10ce865023969e89692896a6e0236e4fdc73 (diff)
Compute code points in whole string only when needed
Diffstat (limited to 'linguistics/src')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java 8
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java 15
2 files changed, 17 insertions, 6 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
index 6ee82a6fddb..83110c0021e 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
@@ -212,11 +212,9 @@ public class GramSplitter {
/** Substring in code point space */
public UnicodeString substring(int start, int codePoints) {
- int offset = s.offsetByCodePoints(start, Math.min(codePoints, s.codePointCount(start, s.length())));
- if (offset < 0)
- return new UnicodeString(s.substring(start));
- else
- return new UnicodeString(s.substring(start, offset));
+ int cps = codePoints * 2 <= s.length() - start ? codePoints
+ : Math.min(codePoints, s.codePointCount(start, s.length()));
+ return new UnicodeString(s.substring(start, s.offsetByCodePoints(start, cps)));
}
/** Returns the position count code points after start (which may be past the end of the string) */
diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
index fa8419e200f..6cefcfbf67a 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java
@@ -1,13 +1,17 @@
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.process;
+import com.yahoo.language.process.GramSplitter.Gram;
+import com.yahoo.language.process.GramSplitter.GramSplitterIterator;
import com.yahoo.language.simple.SimpleLinguistics;
import org.junit.Test;
import java.util.Arrays;
import java.util.Iterator;
-import static org.junit.Assert.*;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertTrue;
/**
* @author bratseth
@@ -168,6 +172,15 @@ public class GramSplitterTestCase {
}
@Test
+ public void testLongString() {
+ String input = "hey ho come 色 let's go, and then we go again!\n色色色".repeat(10_000);
+ for (GramSplitterIterator grams = new GramSplitter(new CharacterClasses()).split(input, 3); grams.hasNext(); ) {
+ Gram gram = grams.next();
+ gram.extractFrom(input);
+ }
+ }
+
+ @Test
public void testChineseComma() {
String text = "我喜欢红色、蓝色和紫色";
Iterator<GramSplitter.Gram> grams = gramSplitter.split(text, 2);