summary refs log tree commit diff stats
path: root/linguistics
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@vespa.ai> 2023-11-10 21:38:23 +0100
committer Jon Bratseth <bratseth@vespa.ai> 2023-11-10 21:38:23 +0100
commit 90965807bd8a6134fe92f7058b3b0a3287050c2a (patch)
tree ada0de730852649e4e648c0c4093e9288988ea77 /linguistics
parent d74701fe719494819eeb7f5c1af4b59a5c652df6 (diff)
Prefer first stem to original if non equal
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java 37
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java 2
2 files changed, 28 insertions, 11 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
index 6cc68c7ac14..809e9b8d133 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
@@ -15,35 +15,48 @@ import java.util.Objects;
public class SimpleToken implements Token {
private final List<Token> components = new ArrayList<>();
- private final String orig;
+ private final String original;
private TokenType type = TokenType.UNKNOWN;
private TokenScript script = TokenScript.UNKNOWN;
private String tokenString;
+ private List<String> stems = null; // Any additional stems after tokenString
private boolean specialToken = false;
private long offset = 0;
- public SimpleToken(String orig) {
- this(orig, null);
+ public SimpleToken(String original) {
+ this(original, (String)null);
}
- public SimpleToken(String orig, String tokenString) {
- this.orig = orig;
+ public SimpleToken(String original, String tokenString) {
+ this.original = original;
this.tokenString = tokenString;
}
+ /** Exposed as fromStems */
+ private SimpleToken(String original, List<String> stems) {
+ this.type = TokenType.ALPHABETIC; // Only type which may have stems
+ this.original = original;
+ this.tokenString = stems.get(0);
+ this.stems = List.copyOf(stems.subList(1, stems.size()));
+ }
+
@Override
public String getOrig() {
- return orig;
+ return original;
}
@Override
public int getNumStems() {
- return tokenString != null ? 1 : 0;
+ return (tokenString != null ? 1 : 0) + (stems != null ? stems.size() : 0);
}
@Override
public String getStem(int i) {
- return tokenString;
+ if (i == 0)
+ return tokenString;
+ if (stems != null && i-1 < stems.size())
+ return stems.get(i-1);
+ return tokenString; // TODO Vespa 9: throw new IllegalArgumentException() instead
}
@Override
@@ -131,12 +144,12 @@ public class SimpleToken implements Token {
@Override
public int hashCode() {
- return orig.hashCode();
+ return original.hashCode();
}
@Override
public String toString() {
- return "token '" + orig + "'";
+ return "token '" + original + "'";
}
public String toDetailString() {
@@ -171,4 +184,8 @@ public class SimpleToken implements Token {
return getType().isIndexable() && (getOrig().length() > 0);
}
+ public static SimpleToken fromStems(String original, List<String> stems) {
+ return new SimpleToken(original, stems);
+ }
+
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 98a84a48095..b72d2bd6d37 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -106,7 +106,7 @@ public class SimpleTokenizer implements Tokenizer {
String oldToken = token;
token = stemmer.stem(token);
String newToken = token;
- log.log(Level.FINEST, () -> "stem '" + oldToken+"' to '" + newToken+"'");
+ log.log(Level.FINEST, () -> "stem '" + oldToken + "' to '" + newToken + "'");
}
String result = token;
log.log(Level.FINEST, () -> "processed token is: " + result);