summaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/sentencepiece/Model.java
blob: 74f300057dcdbfc660bf46b7f92ac05c2238f082 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.sentencepiece;

import com.yahoo.io.IOUtils;
import com.yahoo.language.Language;
import sentencepiece.SentencepieceModel;

import java.io.IOException;
import java.nio.file.Path;

/**
 * A SentencePiece model, read from a serialized {@code ModelProto} file.
 *
 * On construction every piece in the model is added to the {@link #tokens} trie,
 * and the lowest and highest piece scores seen are recorded.
 *
 * @author bratseth
 */
final class Model {

    /** The file this model was read from */
    final Path source;

    /** The language this model was created for */
    final Language language;

    /** The lowest score of any piece in this model, or Float.MAX_VALUE if the model has no pieces */
    final float minScore;

    /** The highest score of any piece in this model, or -Float.MAX_VALUE if the model has no pieces */
    final float maxScore;

    /** The pieces of this model, each added with its type, index in the model, text and score */
    final Trie tokens = new Trie();

    /**
     * Reads a model from a file.
     *
     * @param language the language to associate with this model
     * @param path the file containing the serialized model
     * @throws IllegalArgumentException if reading or parsing the file fails with an IOException,
     *                                  or if a piece has a type this class does not recognize
     */
    Model(Language language, Path path) {
        try {
            this.source = path;
            this.language = language;
            var sp = SentencepieceModel.ModelProto.parseFrom(IOUtils.readFileBytes(path.toFile()));
            float minScore = Float.MAX_VALUE;
            // Float.MIN_VALUE is the smallest *positive* float, so it cannot seed a maximum:
            // with only negative scores it would never be replaced. Use the most negative finite value.
            float maxScore = -Float.MAX_VALUE;
            for (int i = 0; i < sp.getPiecesCount(); i++) {
                var piece = sp.getPieces(i);
                float score = piece.getScore();
                tokens.add(toTokenType(piece.getType()), i, piece.getPiece(), score);
                minScore = Math.min(score, minScore);
                maxScore = Math.max(score, maxScore);
            }
            this.minScore = minScore;
            this.maxScore = maxScore;
        } catch (IOException e) {
            throw new IllegalArgumentException("Could not read a SentencePiece model from " + path, e);
        }
    }

    /**
     * Maps a piece type from the model file to the corresponding token type.
     *
     * @throws IllegalArgumentException if the piece type has no corresponding token type
     */
    private static TokenType toTokenType(SentencepieceModel.ModelProto.SentencePiece.Type type) {
        switch (type) {
            case USER_DEFINED : return TokenType.userDefined;
            case UNKNOWN : return TokenType.unknown;
            case NORMAL : return TokenType.text;
            case CONTROL : return TokenType.control;
            case UNUSED : return TokenType.unused;
            default : throw new IllegalArgumentException("Unknown token type " + type);
        }
    }

    @Override
    public String toString() {
        return "SentencePiece model for " + language + ": '" + source + "'";
    }

}