aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java
blob: 05a2e35f09f92e5ce98cf7d25dedc3795f4cb9ea (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.simple;

import com.yahoo.language.Language;
import com.yahoo.language.process.AbstractTokenizerTestCase;
import com.yahoo.language.process.StemMode;
import com.yahoo.language.process.Token;
import org.junit.Test;

import java.util.Iterator;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;

/**
 * Tokenization tests run through the {@code TokenizerTester} helper with
 * different stem modes: a mixed sample sentence (combining characters, digits
 * inside words, punctuation, a surrogate-pair character) and emoji input.
 *
 * @author Steinar Knutsen
 * @author bratseth
 */
public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase {

    /**
     * Sample sentence shared by the stemming and non-stemming tests. It starts with
     * 'a' followed by a combining ring above (U+030A) and contains a character
     * encoded as a surrogate pair.
     */
    private static final String SAMPLE_TEXT =
            "a\u030a tralalala n4lle. \uD800\uDFC8 (old Persian sign Auramazda, sorry if " +
            "anyone 1s offended by ancien7 gods.Running)";

    /** Without stemming, "offended" is expected to come back unchanged. */
    @Test
    public void testTokenizingNoStemming() {
        assertSampleTokens(StemMode.NONE, "offended");
    }

    /** With {@code StemMode.ALL}, "offended" is expected as "offend"; all other tokens are the same. */
    @Test
    public void testTokenizingStemming() {
        assertSampleTokens(StemMode.ALL, "offend");
    }

    /** Each emoji (a surrogate pair) is expected as its own token, alone, before text, or before another emoji. */
    @Test
    public void testTokenizeEmojis() {
        String knife = "\uD83D\uDD2A"; // 🔪
        String grin = "\uD83D\uDE00"; // 😀

        TokenizerTester emojiTester = new TokenizerTester().setStemMode(StemMode.ALL);
        emojiTester.assertTokens(knife, knife);
        emojiTester.assertTokens(knife + "foo", knife, "foo");
        emojiTester.assertTokens(knife + grin, knife, grin);
    }

    /**
     * Asserts the expected token sequence for {@link #SAMPLE_TEXT} under the given stem mode.
     * The expected tokens are lowercased, the leading "a" + combining ring is expected as the
     * precomposed U+00E5, and whitespace and punctuation are expected as separate tokens.
     *
     * @param stemMode      the stem mode to configure the tester with
     * @param offendedToken the expected token for the input word "offended", the only
     *                      expectation that differs between the callers
     */
    private void assertSampleTokens(StemMode stemMode, String offendedToken) {
        TokenizerTester sampleTester = new TokenizerTester().setStemMode(stemMode);
        sampleTester.assertTokens(
                SAMPLE_TEXT,
                "\u00E5", " ", "tralalala", " ", "n4lle", ".", " ", "\uD800\uDFC8", " ",
                "(", "old", " ", "persian", " ", "sign", " ", "auramazda", ",", " ",
                "sorry", " ", "if", " ", "anyone", " ", "1s", " ", offendedToken, " ",
                "by", " ", "ancien7", " ", "gods", ".", "running", ")");
    }

}