summary | refs | log | tree | commit | diff | stats
path: root/linguistics
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@gmail.com> 2021-09-14 21:30:19 +0200
committer: Jon Bratseth <bratseth@gmail.com> 2021-09-14 21:30:19 +0200
commit: 6c3b241781761d743bfe712836cef49c00d08c9e (patch)
tree: 2f3dd0921305fa06b4070c177e36f64828be0240 /linguistics
parent: 7188e5153f0eb60988fe7f187161ce30a81aecb8 (diff)
Slight algorithm simplification
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java | 9
1 files changed, 3 insertions, 6 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java b/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java
index 24b2a03bf3c..bd895e18b80 100644
--- a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java
+++ b/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java
@@ -97,13 +97,10 @@ public class SentencePieceEncoder implements Segmenter {
Trie.Node node = model.tokens.root;
int characterPosition = start;
while (node != null && characterPosition < input.length()) { // traverse the trie one character at the time from this position
- node = node.children.get(input.charAt(characterPosition));
- characterPosition++;
+ node = node.children.get(input.charAt(characterPosition++));
int length = characterPosition - start;
- if (node != null) {
+ if (node != null && node.type != TokenType.unused) {
if (node.isToken()) {
- if (node.type == TokenType.unused) continue;
-
float score = node.type == TokenType.userDefined ? (length * model.maxScore - 0.1f) : node.score;
addSegment(TokenType.text, node.id, start, characterPosition, score, segmentEnds);
}
@@ -247,7 +244,7 @@ public class SentencePieceEncoder implements Segmenter {
Float score;
private final Map<Character, Node> children = new HashMap<>();
- boolean isToken() { return score != null; }
+ boolean isToken() { return type != null; }
}