linguistics/src/main/java/com/yahoo/language/Linguistics.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122

// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language;

import com.yahoo.language.detect.Detector;
import com.yahoo.language.process.CharacterClasses;
import com.yahoo.language.process.GramSplitter;
import com.yahoo.language.process.LinguisticsContext;
import com.yahoo.language.process.Normalizer;
import com.yahoo.language.process.Segmenter;
import com.yahoo.language.process.Stemmer;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.process.Transformer;

/**
 * <p>Factory of linguistic processors. For technical reasons this provides more flexibility to provide separate
 * components for different operations than is needed in many cases; in particular the tokenizer should typically
 * stem, transform and normalize using the same operations as provided directly by this. A set of adaptors are
 * provided that makes this easy to achieve. Refer to the {@link com.yahoo.language.simple.SimpleLinguistics}
 * implementation to set this up.</p>
 *
 * <p>Thread safety: Instances of this factory type must be thread safe, but the processors
 * returned by the factory methods need not be. Clients should request separate processor instances
 * for each thread.</p>
 *
 * <p>Implementation note: the stemmer, tokenizer and segmenter each come as a pair of default methods
 * (with and without a {@link LinguisticsContext}) which delegate to each other. An implementation must
 * override at least one method of each pair; if neither is overridden the calls never terminate.</p>
 *
 * @author Mathias Mølster Lidal
 * @author Simon Thoresen Hult
 * @author bratseth
 */
public interface Linguistics {

    /** The kinds of components this factory supplies; one constant per factory method family. */
    enum Component {
        STEMMER,
        TOKENIZER,
        NORMALIZER,
        TRANSFORMER,
        SEGMENTER,
        DETECTOR,
        GRAM_SPLITTER,
        CHARACTER_CLASSES
    }

    /**
     * Prefer {@link #getStemmer(LinguisticsContext)}.
     * By default this delegates to that method with an empty context.
     */
    // TODO: Deprecate this
    default Stemmer getStemmer() {
        return getStemmer(LinguisticsContext.empty());
    }

    /**
     * Returns a thread-unsafe stemmer or lemmatizer.
     * This is used at query time to do stemming of search terms to indexes which contains text tokenized
     * with stemming turned on.
     * By default this ignores the context and delegates to {@link #getStemmer()},
     * so implementations must override at least one of the two.
     *
     * @param linguisticsContext the context in which the stemmer will be used
     */
    default Stemmer getStemmer(LinguisticsContext linguisticsContext) {
        return getStemmer();
    }

    /**
     * Prefer {@link #getTokenizer(LinguisticsContext)}.
     * By default this delegates to that method with an empty context.
     */
    default Tokenizer getTokenizer() {
        return getTokenizer(LinguisticsContext.empty());
    }

    /**
     * Returns a thread-unsafe tokenizer.
     * This is used at indexing time to produce an optionally stemmed and
     * transformed (accent normalized) stream of indexable tokens.
     * By default this ignores the context and delegates to {@link #getTokenizer()},
     * so implementations must override at least one of the two.
     *
     * @param context the context in which the tokenizer will be used
     */
    default Tokenizer getTokenizer(LinguisticsContext context) {
        return getTokenizer();
    }

    /** Returns a thread-unsafe normalizer. This is used at query time to cjk normalize query text. */
    Normalizer getNormalizer();

    /**
     * Returns a thread-unsafe transformer.
     * This is used at query time to do accent normalization of search terms to indexes which contains
     * text tokenized with accent normalization turned on.
     *
     * <p>Implementations are expected to override this. The default exists only so that implementers
     * still compile; it fails fast instead of calling itself, which previously recursed until the
     * stack overflowed.</p>
     *
     * @throws UnsupportedOperationException if the implementation does not supply a transformer
     */
    default Transformer getTransformer() {
        throw new UnsupportedOperationException(getClass().getName() + " does not provide a transformer");
    }

    /**
     * Prefer {@link #getSegmenter(LinguisticsContext)}.
     * By default this delegates to that method with an empty context.
     */
    // TODO: Deprecate this
    default Segmenter getSegmenter() {
        return getSegmenter(LinguisticsContext.empty());
    }

    /**
     * Returns a thread-unsafe segmenter.
     * This is used at query time to find the individual semantic components of search terms to indexes
     * tokenized with segmentation.
     * By default this ignores the context and delegates to {@link #getSegmenter()},
     * so implementations must override at least one of the two.
     *
     * @param context the context in which the segmenter will be used
     */
    default Segmenter getSegmenter(LinguisticsContext context) {
        return getSegmenter();
    }

    /**
     * Returns a thread-unsafe detector.
     * The language of the text is a parameter to other linguistic operations.
     * This is used to determine the language of a query or document field when not specified explicitly.
     */
    Detector getDetector();

    /**
     * Returns a thread-unsafe gram splitter.
     * This is used to split query or document text into fixed-length grams which allows matching without needing
     * or using segmented tokens.
     */
    GramSplitter getGramSplitter();

    /** Returns a thread-unsafe character classes instance. */
    CharacterClasses getCharacterClasses();

    /**
     * Checks if another instance is equivalent to this one.
     * Note that this is an overload taking a {@code Linguistics}; it does not override
     * {@link Object#equals(Object)}.
     *
     * @param other the instance to compare with this
     * @return true if the other instance is equivalent to this
     */
    boolean equals(Linguistics other);

}