Slight algorithm simplification

author: Jon Bratseth <bratseth@gmail.com> 2021-09-14 21:30:19 +0200
committer: Jon Bratseth <bratseth@gmail.com> 2021-09-14 21:30:19 +0200
commit: 6c3b241781761d743bfe712836cef49c00d08c9e (patch)
tree: 2f3dd0921305fa06b4070c177e36f64828be0240 /linguistics
parent: 7188e5153f0eb60988fe7f187161ce30a81aecb8 (diff)
1 files changed, 3 insertions, 6 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java b/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java
index 24b2a03bf3c..bd895e18b80 100644
--- a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java
+++ b/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java
@@ -97,13 +97,10 @@ public class SentencePieceEncoder implements Segmenter {
             Trie.Node node = model.tokens.root;
             int characterPosition = start;
             while (node != null && characterPosition < input.length()) { // traverse the trie one character at the time from this position
-                node = node.children.get(input.charAt(characterPosition));
-                characterPosition++;
+                node = node.children.get(input.charAt(characterPosition++));
                 int length = characterPosition - start;
-                if (node != null) {
+                if (node != null && node.type != TokenType.unused) {
                     if (node.isToken()) {
-                        if (node.type == TokenType.unused) continue;
-
                         float score = node.type == TokenType.userDefined ? (length * model.maxScore - 0.1f) : node.score;
                         addSegment(TokenType.text, node.id, start, characterPosition, score, segmentEnds);
                     }
@@ -247,7 +244,7 @@ public class SentencePieceEncoder implements Segmenter {
             Float score;
             private final Map<Character, Node> children = new HashMap<>();
 
-            boolean isToken() { return score != null; }
+            boolean isToken() { return type != null; }
 
         }
author	Jon Bratseth <bratseth@gmail.com>	2021-09-14 21:30:19 +0200
committer	Jon Bratseth <bratseth@gmail.com>	2021-09-14 21:30:19 +0200
commit	6c3b241781761d743bfe712836cef49c00d08c9e (patch)
tree	2f3dd0921305fa06b4070c177e36f64828be0240 /linguistics
parent	7188e5153f0eb60988fe7f187161ce30a81aecb8 (diff)