1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
|
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.sentencepiece;
import com.yahoo.config.FileReference;
import com.yahoo.language.Language;
import org.junit.Test;
/**
 * Exercises {@link SentencePieceEncoder} under different {@link SentencePieceConfig}
 * settings and checks the resulting segmentation through {@link SentencePieceTester}.
 *
 * @author bratseth
 */
public class SentencePieceConfigurationTest {

    /** Location of the English model used by these tests. */
    private static final String ENGLISH_MODEL = "src/test/models/sentencepiece/en.wiki.bpe.vs10000.model";

    /** Location of the Japanese model used by these tests. */
    private static final String JAPANESE_MODEL = "src/test/models/sentencepiece/ja.wiki.bpe.vs5000.model";

    /**
     * With only the English model registered (under the language "unknown") and no other
     * settings changed, the unrecognized run "KHJKJHHKJHHSH" is expected as a single segment.
     */
    @Test
    public void testEnglishTokenization() {
        SentencePieceConfig.Builder config = new SentencePieceConfig.Builder();
        addModel("unknown", ENGLISH_MODEL, config);

        SentencePieceTester tester = testerFor(config);
        tester.assertSegmented("this is another sentence", "▁this", "▁is", "▁another", "▁sentence");
        tester.assertSegmented("KHJKJHHKJHHSH hello", "▁", "KHJKJHHKJHHSH", "▁hel", "lo");
    }

    /**
     * With {@code collapseUnknowns(false)}, "KHJ" is expected to come out as one segment
     * per character rather than as a single segment.
     */
    @Test
    public void testNoCollapse() {
        SentencePieceConfig.Builder config = new SentencePieceConfig.Builder();
        addModel("unknown", ENGLISH_MODEL, config);
        config.collapseUnknowns(false);

        SentencePieceTester tester = testerFor(config);
        tester.assertSegmented("KHJ hello", "▁", "K", "H", "J", "▁hel", "lo");
    }

    /**
     * With scoring set to {@code highestScore}, "hello" is expected to be segmented as
     * "▁h", "el", "lo".
     */
    @Test
    public void testHighestScore() {
        SentencePieceConfig.Builder config = new SentencePieceConfig.Builder();
        addModel("unknown", ENGLISH_MODEL, config);
        config.scoring(SentencePieceConfig.Scoring.highestScore);

        SentencePieceTester tester = testerFor(config);
        tester.assertSegmented("hello", "▁h", "el", "lo");
    }

    /**
     * With one model registered for "ja" and one for "en", the same input ("hello") is
     * expected to segment differently depending on the language passed to the tester.
     */
    @Test
    public void testMultiLanguageTokenization() {
        SentencePieceConfig.Builder config = new SentencePieceConfig.Builder();
        addModel("ja", JAPANESE_MODEL, config);
        addModel("en", ENGLISH_MODEL, config);

        SentencePieceTester tester = testerFor(config);
        tester.assertSegmented(Language.JAPANESE, "いくつかの通常のテキスト", "▁", "いく", "つか", "の", "通常", "の", "テ", "キ", "スト");
        tester.assertSegmented(Language.ENGLISH, "hello", "▁hel", "lo");
        tester.assertSegmented(Language.JAPANESE, "hello", "▁h", "ell", "o");
    }

    /**
     * Builds the configuration and wraps an encoder created from it in a tester.
     *
     * @param config the builder holding the models and settings to test
     * @return a tester backed by an encoder built from {@code config}
     */
    private SentencePieceTester testerFor(SentencePieceConfig.Builder config) {
        SentencePieceEncoder encoder = new SentencePieceEncoder(config.build());
        return new SentencePieceTester(encoder);
    }

    /**
     * Registers a model with the given configuration builder.
     *
     * @param language the language string to set on the model
     * @param file     the model file path, wrapped in a {@link FileReference}
     * @param b        the configuration builder the model is added to
     */
    private void addModel(String language, String file, SentencePieceConfig.Builder b) {
        SentencePieceConfig.Model.Builder model = new SentencePieceConfig.Model.Builder();
        model.language(language);
        model.path(new FileReference(file));
        b.model(model);
    }

}
|