about | summary | refs | log | tree | commit | diff | stats
path: root/linguistics
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@vespa.ai> 2023-06-02 08:50:08 +0200
committer Jon Bratseth <bratseth@vespa.ai> 2023-06-02 08:50:08 +0200
commit b18703690547333d559f09f63f40ada4fed6f4d4 (patch)
tree 6329d27ff2a2b7ff357fbc65e93713e6ffdcc1da /linguistics
parent d799fb136d17e62cc13d7021d409618b58d6d60a (diff)
Don't remove indexable symbols when stemming
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/abi-spec.json 1
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/TokenType.java 9
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java 3
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java 8
-rw-r--r-- linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java 4
5 files changed, 17 insertions, 8 deletions
diff --git a/linguistics/abi-spec.json b/linguistics/abi-spec.json
index a6aa902c688..dc85a2e6f0b 100644
--- a/linguistics/abi-spec.json
+++ b/linguistics/abi-spec.json
@@ -724,6 +724,7 @@
"public static final enum com.yahoo.language.process.TokenType SYMBOL",
"public static final enum com.yahoo.language.process.TokenType ALPHABETIC",
"public static final enum com.yahoo.language.process.TokenType NUMERIC",
+ "public static final enum com.yahoo.language.process.TokenType INDEXABLE_SYMBOL",
"public static final enum com.yahoo.language.process.TokenType MARKER"
]
},
diff --git a/linguistics/src/main/java/com/yahoo/language/process/TokenType.java b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java
index 14c7e9bc144..6c3e0c2ab36 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/TokenType.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/TokenType.java
@@ -14,6 +14,7 @@ public enum TokenType {
SYMBOL(3),
ALPHABETIC(4),
NUMERIC(5),
+ INDEXABLE_SYMBOL(6),
MARKER(255);
private final int value;
@@ -34,10 +35,10 @@ public enum TokenType {
* @return whether this type of token can be indexed
*/
public boolean isIndexable() {
- switch (this) {
- case ALPHABETIC: case NUMERIC: return true;
- default: return false;
- }
+ return switch (this) {
+ case ALPHABETIC, NUMERIC, INDEXABLE_SYMBOL -> true;
+ default -> false;
+ };
}
/** Translates this from the int code representation returned from {@link #getValue} */
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java
index 5c321e4da9b..8a88ae8f005 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenType.java
@@ -31,8 +31,9 @@ public class SimpleTokenType {
case Character.MATH_SYMBOL:
case Character.CURRENCY_SYMBOL:
case Character.MODIFIER_SYMBOL:
- case Character.OTHER_SYMBOL:
return TokenType.SYMBOL;
+ case Character.OTHER_SYMBOL:
+ return TokenType.INDEXABLE_SYMBOL;
case Character.OTHER_NUMBER:
// "SUPERSCRIPT TWO",
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 2728249333e..d86ca30a632 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -67,7 +67,7 @@ public class SimpleTokenizer implements Tokenizer {
for (int prev = 0, next = Character.charCount(nextCode); next <= input.length(); ) {
nextCode = next < input.length() ? input.codePointAt(next) : SPACE_CODE;
TokenType nextType = SimpleTokenType.valueOf(nextCode);
- if (!prevType.isIndexable() || !nextType.isIndexable()) {
+ if (isAtTokenBoundary(prevType, nextType)) {
String original = input.substring(prev, next);
tokens.add(new SimpleToken(original).setOffset(prev)
.setType(tokenType)
@@ -84,6 +84,12 @@ public class SimpleTokenizer implements Tokenizer {
return tokens;
}
+ private boolean isAtTokenBoundary(TokenType prevType, TokenType nextType) {
+ // Always index each symbol as a token
+ if (prevType == TokenType.INDEXABLE_SYMBOL || nextType == TokenType.INDEXABLE_SYMBOL) return true;
+ return !prevType.isIndexable() || !nextType.isIndexable();
+ }
+
private TokenType determineType(TokenType tokenType, TokenType characterType) {
if (characterType == TokenType.ALPHABETIC) return TokenType.ALPHABETIC;
return tokenType;
diff --git a/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java
index 0ce9b327533..70a97cda7e3 100644
--- a/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java
+++ b/linguistics/src/test/java/com/yahoo/language/process/TokenTypeTestCase.java
@@ -23,9 +23,9 @@ public class TokenTypeTestCase {
}
@Test
- public void requireThatOnlyAlphaNumericsAreIndexable() {
+ public void testIsIndexable() {
for (TokenType type : TokenType.values()) {
- if (type == TokenType.ALPHABETIC || type == TokenType.NUMERIC) {
+ if (type == TokenType.ALPHABETIC || type == TokenType.NUMERIC || type == TokenType.INDEXABLE_SYMBOL) {
assertTrue(type.isIndexable());
} else {
assertFalse(type.isIndexable());