summary refs log tree commit diff stats
path: root/linguistics
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@oath.com> 2021-09-14 21:47:26 +0200
committer GitHub <noreply@github.com> 2021-09-14 21:47:26 +0200
commit be09b42da7c43ace99ae56258f69e28c2ac08a6c (patch)
tree 9075aac91a40b6555aaf386fe8b6b3d4f2f3a657 /linguistics
parent 8fcb35fcde19bad6b4e8527404a185f1e95b4f6d (diff)
parent fa0dbe36983238c8e17ee6fb45037a09ed49bdad (diff)
Merge pull request #19131 from vespa-engine/bratseth/sp-simplify
Slight algorithm simplification
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java 20
1 files changed, 7 insertions, 13 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java b/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java
index 9509c1d070d..a755a9e6ff3 100644
--- a/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java
+++ b/linguistics/src/main/java/com/yahoo/language/sentencepiece/SentencePieceEncoder.java
@@ -96,23 +96,17 @@ public class SentencePieceEncoder implements Segmenter {
while (start < input.length()) { // segment from this position to the end of the text
Trie.Node node = model.tokens.root;
int characterPosition = start;
- boolean addedSingleCharacterSegment = false;
- while (characterPosition < input.length()) { // traverse the trie one character at the time from this position
- node = node.children.get(input.charAt(characterPosition));
- characterPosition++;
- if (node == null) break;
+ while (node != null && characterPosition < input.length()) { // traverse the trie one character at the time from this position
+ node = node.children.get(input.charAt(characterPosition++));
int length = characterPosition - start;
- if (node.isToken()) {
- if (node.type == TokenType.unused) continue;
-
+ if (node != null && node.isToken() && node.type != TokenType.unused) {
float score = node.type == TokenType.userDefined ? (length * model.maxScore - 0.1f) : node.score;
addSegment(TokenType.text, node.id, start, characterPosition, score, segmentEnds);
}
- if (! addedSingleCharacterSegment && length == 1)
- addedSingleCharacterSegment = true;
+ else if (length == 1) { // add an 'unknown' length 1 token to make the next position reachable
+ addSegment(TokenType.unknown, 0, start, start + 1, unknownScore, segmentEnds);
+ }
}
- if ( ! addedSingleCharacterSegment) // add an unknown 1 character token to be able to start from the next character
- addSegment(TokenType.unknown, 0, start, start + 1, unknownScore, segmentEnds);
start++;
}
@@ -248,7 +242,7 @@ public class SentencePieceEncoder implements Segmenter {
Float score;
private final Map<Character, Node> children = new HashMap<>();
- boolean isToken() { return score != null; }
+ boolean isToken() { return type != null; }
}