From 7188e5153f0eb60988fe7f187161ce30a81aecb8 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Tue, 14 Sep 2021 21:23:05 +0200 Subject: Slight algorithm simplification --- .../sentencepiece/SentencePieceEncoder.java | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) (limited to 'linguistics/src/main/java') diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java b/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java index 9509c1d070d..24b2a03bf3c 100644 --- a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java +++ b/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java @@ -96,23 +96,22 @@ public class SentencePieceEncoder implements Segmenter { while (start < input.length()) { // segment from this position to the end of the text Trie.Node node = model.tokens.root; int characterPosition = start; - boolean addedSingleCharacterSegment = false; - while (characterPosition < input.length()) { // traverse the trie one character at the time from this position + while (node != null && characterPosition < input.length()) { // traverse the trie one character at the time from this position node = node.children.get(input.charAt(characterPosition)); characterPosition++; - if (node == null) break; int length = characterPosition - start; - if (node.isToken()) { - if (node.type == TokenType.unused) continue; + if (node != null) { + if (node.isToken()) { + if (node.type == TokenType.unused) continue; - float score = node.type == TokenType.userDefined ? (length * model.maxScore - 0.1f) : node.score; - addSegment(TokenType.text, node.id, start, characterPosition, score, segmentEnds); + float score = node.type == TokenType.userDefined ? (length * model.maxScore - 0.1f) : node.score; + addSegment(TokenType.text, node.id, start, characterPosition, score, segmentEnds); + } + } + else if (length == 1) { // add an 'unknown' token of length 1 instead to make the next position reachable + addSegment(TokenType.unknown, 0, start, start + 1, unknownScore, segmentEnds); } - if (! addedSingleCharacterSegment && length == 1) - addedSingleCharacterSegment = true; } - if ( ! addedSingleCharacterSegment) // add an unknown 1 character token to be able to start from the next character - addSegment(TokenType.unknown, 0, start, start + 1, unknownScore, segmentEnds); start++; } -- cgit v1.2.3