diff options
author | jonmv <venstad@gmail.com> | 2022-12-06 13:12:00 +0100 |
---|---|---|
committer | jonmv <venstad@gmail.com> | 2022-12-06 13:12:00 +0100 |
commit | adc2b8c2fcea257e0a1df826dc301f014baecec0 (patch) | |
tree | 9faf12ef9068eaffca9b6ed0647696dde3e41367 | |
parent | 87bb10ce865023969e89692896a6e0236e4fdc73 (diff) |
Compute code points in whole string only when needed
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java | 8 |
-rw-r--r-- | linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java | 15 |
2 files changed, 17 insertions, 6 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java index 6ee82a6fddb..83110c0021e 100644 --- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java +++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java @@ -212,11 +212,9 @@ public class GramSplitter { /** Substring in code point space */ public UnicodeString substring(int start, int codePoints) { - int offset = s.offsetByCodePoints(start, Math.min(codePoints, s.codePointCount(start, s.length()))); - if (offset < 0) - return new UnicodeString(s.substring(start)); - else - return new UnicodeString(s.substring(start, offset)); + int cps = codePoints * 2 <= s.length() - start ? codePoints + : Math.min(codePoints, s.codePointCount(start, s.length())); + return new UnicodeString(s.substring(start, s.offsetByCodePoints(start, cps))); } /** Returns the position count code points after start (which may be past the end of the string) */ diff --git a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java index fa8419e200f..6cefcfbf67a 100644 --- a/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/process/GramSplitterTestCase.java @@ -1,13 +1,17 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.language.process; +import com.yahoo.language.process.GramSplitter.Gram; +import com.yahoo.language.process.GramSplitter.GramSplitterIterator; import com.yahoo.language.simple.SimpleLinguistics; import org.junit.Test; import java.util.Arrays; import java.util.Iterator; -import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; /** * @author bratseth @@ -168,6 +172,15 @@ public class GramSplitterTestCase { } @Test + public void testLongString() { + String input = "hey ho come 色 let's go, and then we go again!\n色色色".repeat(10_000); + for (GramSplitterIterator grams = new GramSplitter(new CharacterClasses()).split(input, 3); grams.hasNext(); ) { + Gram gram = grams.next(); + gram.extractFrom(input); + } + } + + @Test public void testChineseComma() { String text = "我喜欢红色、蓝色和紫色"; Iterator<GramSplitter.Gram> grams = gramSplitter.split(text, 2); |