blob: 74f300057dcdbfc660bf46b7f92ac05c2238f082 (
plain) (
blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
|
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.sentencepiece;
import com.yahoo.io.IOUtils;
import com.yahoo.language.Language;
import sentencepiece.SentencepieceModel;
import java.io.IOException;
import java.nio.file.Path;
/**
 * A SentencePiece model, read from a serialized model file.
 *
 * Construction parses the file, adds every piece to a token trie and records
 * the range of piece scores seen. All fields are final and assigned during construction.
 *
 * @author bratseth
 */
final class Model {

    /** The file this model was read from */
    final Path source;

    /** The language passed in when this model was created */
    final Language language;

    /** The lowest score of any piece in this model, or Float.MAX_VALUE if the model has no pieces */
    final float minScore;

    /**
     * The highest score of any piece in this model, but never lower than Float.MIN_VALUE,
     * which is the value the search starts from (see the note in the constructor).
     */
    final float maxScore;

    /** All pieces of this model, each added with its token type, index, text and score */
    final Trie tokens = new Trie();

    /**
     * Reads the model at the given path and indexes all its pieces.
     *
     * @param language the language to associate with this model
     * @param path the file containing the serialized SentencePiece model
     * @throws IllegalArgumentException if reading or parsing the file fails with an IOException
     */
    Model(Language language, Path path) {
        try {
            this.source = path;
            this.language = language;
            var sp = SentencepieceModel.ModelProto.parseFrom(IOUtils.readFileBytes(path.toFile()));

            // NOTE(review): Float.MIN_VALUE is the smallest *positive* float, not the most negative one,
            // so maxScore ends up as Float.MIN_VALUE whenever every piece score is negative.
            // The reference C++ implementation appears to seed its max score the same way (FLT_MIN),
            // and changing the seed may alter tokenization results - confirm intent before changing.
            float minScore = Float.MAX_VALUE;
            float maxScore = Float.MIN_VALUE;
            for (int i = 0; i < sp.getPiecesCount(); i++) {
                var piece = sp.getPieces(i);
                // The position of the piece in the model is passed along as its index
                tokens.add(toTokenType(piece.getType()), i, piece.getPiece(), piece.getScore());
                minScore = Math.min(piece.getScore(), minScore);
                maxScore = Math.max(piece.getScore(), maxScore);
            }
            this.minScore = minScore;
            this.maxScore = maxScore;
        } catch (IOException e) {
            throw new IllegalArgumentException("Could not read a SentencePiece model from " + path, e);
        }
    }

    /**
     * Maps a piece type from the model file to the corresponding TokenType.
     *
     * @throws IllegalArgumentException if the type is not one of the handled types
     */
    private static TokenType toTokenType(SentencepieceModel.ModelProto.SentencePiece.Type type) {
        switch (type) {
            case USER_DEFINED : return TokenType.userDefined;
            case UNKNOWN : return TokenType.unknown;
            case NORMAL : return TokenType.text;
            case CONTROL : return TokenType.control;
            case UNUSED : return TokenType.unused;
            default : throw new IllegalArgumentException("Unknown token type " + type);
        }
    }

    /** Returns a description containing the language and source path of this model */
    @Override
    public String toString() {
        return "SentencePiece model for " + language + ": '" + source + "'";
    }

}
|