summaryrefslogtreecommitdiffstats
path: root/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/ResultBuilder.java
diff options
context:
space:
mode:
Diffstat (limited to 'linguistics-components/src/main/java/com/yahoo/language/sentencepiece/ResultBuilder.java')
-rw-r--r--linguistics-components/src/main/java/com/yahoo/language/sentencepiece/ResultBuilder.java47
1 files changed, 47 insertions, 0 deletions
diff --git a/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/ResultBuilder.java b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/ResultBuilder.java
new file mode 100644
index 00000000000..2141505374c
--- /dev/null
+++ b/linguistics-components/src/main/java/com/yahoo/language/sentencepiece/ResultBuilder.java
@@ -0,0 +1,47 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.language.sentencepiece;
+
+/**
+ * Builds a result from a sentencepiece tokenization by being called for each segment in reverse
+ *
+ * @param <RESULTTYPE> the type of result this produces
+ * @author bratseth
+ */
+abstract class ResultBuilder<RESULTTYPE> {
+
+ private final RESULTTYPE result;
+
+ ResultBuilder(RESULTTYPE result) {
+ this.result = result;
+ }
+
+ /** Called for each segment, starting from the last and working backwards */
+ abstract void add(int start, int end, SentencePieceAlgorithm.SegmentEnd[] segmentEnds);
+
+ RESULTTYPE result() {return result;}
+
+ void build(String input, SentencePieceAlgorithm.SegmentEnd[] segmentEnds, boolean collapseUnknowns) {
+ if (collapseUnknowns) {
+ int segmentEnd = input.length();
+ int collapsedSegmentEnd = segmentEnd;
+ while (segmentEnd > 0) {
+ if (segmentEnds[segmentEnd].type != TokenType.unknown ) {
+ if (collapsedSegmentEnd != segmentEnd) { // We have deferred an unknown collapsed segment
+ add(segmentEnd, collapsedSegmentEnd, segmentEnds);
+ }
+ add(segmentEnds[segmentEnd].segmentStart, segmentEnd, segmentEnds);
+ collapsedSegmentEnd = segmentEnds[segmentEnd].segmentStart;
+ }
+ segmentEnd = segmentEnds[segmentEnd].segmentStart;
+ }
+ }
+ else {
+ int segmentEnd = input.length();
+ while (segmentEnd > 0) {
+ add(segmentEnds[segmentEnd].segmentStart, segmentEnd, segmentEnds);
+ segmentEnd = segmentEnds[segmentEnd].segmentStart;
+ }
+ }
+ }
+
+}