diff options
author | Jon Bratseth <bratseth@gmail.com> | 2021-09-14 21:30:19 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2021-09-14 21:30:19 +0200 |
commit | 6c3b241781761d743bfe712836cef49c00d08c9e (patch) | |
tree | 2f3dd0921305fa06b4070c177e36f64828be0240 /linguistics | |
parent | 7188e5153f0eb60988fe7f187161ce30a81aecb8 (diff) |
Slight algorithm simplification
Diffstat (limited to 'linguistics')
-rw-r--r-- | linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java | 9 |
1 files changed, 3 insertions, 6 deletions
```diff
diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java b/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java
index 24b2a03bf3c..bd895e18b80 100644
--- a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java
+++ b/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java
@@ -97,13 +97,10 @@ public class SentencePieceEncoder implements Segmenter {
             Trie.Node node = model.tokens.root;
             int characterPosition = start;
             while (node != null && characterPosition < input.length()) { // traverse the trie one character at the time from this position
-                node = node.children.get(input.charAt(characterPosition));
-                characterPosition++;
+                node = node.children.get(input.charAt(characterPosition++));
                 int length = characterPosition - start;
-                if (node != null) {
+                if (node != null && node.type != TokenType.unused) {
                     if (node.isToken()) {
-                        if (node.type == TokenType.unused) continue;
-
                         float score = node.type == TokenType.userDefined ? (length * model.maxScore - 0.1f) : node.score;
                         addSegment(TokenType.text, node.id, start, characterPosition, score, segmentEnds);
                     }
@@ -247,7 +244,7 @@ public class SentencePieceEncoder implements Segmenter {
             Float score;
             private final Map<Character, Node> children = new HashMap<>();
 
-            boolean isToken() { return score != null; }
+            boolean isToken() { return type != null; }
 
         }
```