linguistics/src/main/java/com/yahoo/language/process/SpecialTokens.java


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141

// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.process;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

import static com.yahoo.language.LinguisticsCase.toLowerCase;

/**
 * An immutable list of special tokens - strings which should override the normal tokenizer semantics
 * and be tokenized into a single token. Special tokens are case insensitive: they are lowercased
 * on creation, and input is lowercased before it is matched against them.
 *
 * @author bratseth
 */
public class SpecialTokens {

    private static final SpecialTokens empty = new SpecialTokens("(empty)", List.of());

    private final String name;

    /** The length of the longest special token in this, or 0 if this has no tokens */
    private final int maximumLength;

    /** The tokens of this, sorted by decreasing token length such that longer tokens are matched first */
    private final List<Token> tokens;

    private final Map<String, String> tokenMap;

    /**
     * Creates a named list of special tokens.
     *
     * @param name the name of this special tokens list
     * @param tokens the tokens of this list, which are copied by this constructor
     * @throws IllegalArgumentException if the length of any token changes when its case is changed
     * @throws IllegalStateException if two of the given tokens have the same token string, as the
     *         token map cannot hold duplicate keys
     */
    public SpecialTokens(String name, List<Token> tokens) {
        // Validate eagerly: Stream.peek is a lazy intermediate operation which never runs
        // without a terminal operation, so it cannot be used to validate here
        tokens.forEach(Token::validate);

        // Token's natural order is by decreasing length, so the longest candidate is tried first in tokenize
        List<Token> sortedTokens = new ArrayList<>(tokens);
        Collections.sort(sortedTokens);

        this.name = name;
        this.maximumLength = tokens.stream().mapToInt(token -> token.token().length()).max().orElse(0);
        this.tokens = List.copyOf(sortedTokens);
        this.tokenMap = tokens.stream().collect(Collectors.toUnmodifiableMap(Token::token, Token::replacement));
    }

    /** Returns the name of this special tokens list */
    public String name() {
        return name;
    }

    /**
     * Returns the tokens of this as an immutable map from token to replacement.
     * Tokens which do not have a replacement token maps to themselves.
     */
    public Map<String, String> asMap() { return tokenMap; }

    /**
     * Returns the special token starting at the start of the given string, or null if no
     * special token starts at this string. Tokens are tried in order of decreasing length,
     * so the longest matching token is returned.
     *
     * @param string the string to search for a special token at the start position
     * @param substring true to allow the special token to be followed by a character which does not
     *        mark the end of a token
     * @return the matching special token, or null if there is no match
     */
    public Token tokenize(String string, boolean substring) {
        // XXX detonator pattern token.length may be != the length of the
        // matching data in string, ref caseIndependentLength(String)

        // No token is longer than maximumLength, so only that much of the input needs to be lowercased
        String input = toLowerCase(string.substring(0, Math.min(string.length(), maximumLength)));
        for (Token special : tokens) {
            if ( ! input.startsWith(special.token())) continue;

            int tokenLength = special.token().length();
            if (string.length() == tokenLength || substring || tokenEndsAt(tokenLength, string))
                return special;
        }
        return null;
    }

    /**
     * Returns whether a token ending just before the given position is not continued by
     * a letter or digit in the given string.
     */
    private boolean tokenEndsAt(int position, String string) {
        // The match is done on lowercased input, whose length may differ from the original string:
        // Treat running past the end of the original as a token end rather than failing in charAt
        if (position >= string.length()) return true;
        return ! Character.isLetterOrDigit(string.charAt(position));
    }

    /** Returns a shared instance containing no special tokens */
    public static SpecialTokens empty() { return empty; }

    /** An immutable special token */
    public final static class Token implements Comparable<Token> {

        private final String token;
        private final String replacement;

        /** Creates a special token */
        public Token(String token) {
            this(token, null);
        }

        /**
         * Creates a special token which will be represented by the given replacement token.
         * Both strings are lowercased. If the replacement is null or contains only whitespace,
         * the token is used as its own replacement.
         */
        public Token(String token, String replacement) {
            this.token = toLowerCase(token);
            if (replacement == null || replacement.trim().isEmpty())
                this.replacement = this.token;
            else
                this.replacement = toLowerCase(replacement);
        }

        /** Returns the special token */
        public String token() { return token; }

        /** Returns the token to replace occurrences of this by, which equals token() unless this has a replacement. */
        public String replacement() { return replacement; }

        /**
         * Orders tokens by decreasing token length, such that longer tokens sort first.
         * Note that this is not consistent with equals: Different tokens of the same length compare as 0.
         */
        @Override
        public int compareTo(Token other) {
            return Integer.compare(other.token().length(), this.token().length());
        }

        /** Tokens are equal if their token strings are equal, regardless of replacement */
        @Override
        public boolean equals(Object other) {
            if (other == this) return true;
            if ( ! (other instanceof Token)) return false;
            return Objects.equals(this.token, ((Token)other).token);
        }

        @Override
        public int hashCode() { return token.hashCode(); }

        @Override
        public String toString() {
            return "token '" + token + "'" + (replacement.equals(token) ? "" : " replacement '" + replacement + "'");
        }

        /**
         * Verifies that the length of this token is the same in lower and upper case,
         * since matching assumes the token length equals the length of the matched input.
         *
         * @throws IllegalArgumentException if changing case changes the length of the token
         */
        private void validate() {
            // XXX not fool proof length test, should test codepoint by codepoint for mixed case user input? not even that will necessarily be 100% robust...
            String asLow = toLowerCase(token);
            // TODO: Put along with the global toLowerCase
            String asHigh = token.toUpperCase(Locale.ENGLISH);
            if (asLow.length() != token.length() || asHigh.length() != token.length()) {
                throw new IllegalArgumentException("Special token '" + token + "' has case sensitive length. " +
                                                   "Please report this to the Vespa team.");
            }
        }

    }

}