diff options
author | Jon Bratseth <bratseth@vespa.ai> | 2023-11-14 11:28:40 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@vespa.ai> | 2023-11-14 11:28:40 +0100 |
commit | 997896e40f47770b22a81b5ec8281d2e962ec4d9 (patch) | |
tree | 3784800283871e8f99b5579e3a8e545a11f86df1 /linguistics | |
parent | 29109450c8c2c98d969a711b8f6240bb5594c150 (diff) |
Revert "Merge pull request #29328 from vespa-engine/revert-29314-bratseth/casing-take-2"
This reverts commit a72e949533a46d665440a9c72ca2b8fb58f3a9c3, reversing
changes made to 944d635d00e165166508ef23399e9ed65a87a9c8.
Diffstat (limited to 'linguistics')
4 files changed, 30 insertions, 13 deletions
```diff
diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
index 5ad6a382abd..f0439a21fec 100644
--- a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
+++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
@@ -26,6 +26,7 @@ public class LinguisticsCase {
     public static String toLowerCase(String in) {
         // def is picked from http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#toLowerCase%28%29
         // Also, at the time of writing, English is the default language for queries
+        if (in == null) return null;
         return Lowercase.toLowerCase(in);
     }
 
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
index 33f5ee7e4bb..9178c2d7e09 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
@@ -189,9 +189,8 @@ public class GramSplitter {
         @Override
         public boolean equals(Object o) {
             if (this == o) return true;
-            if ( ! (o instanceof Gram)) return false;
+            if ( ! (o instanceof Gram gram)) return false;
 
-            Gram gram = (Gram)o;
             if (codePointCount != gram.codePointCount) return false;
             if (start != gram.start) return false;
             return true;
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
index 6cc68c7ac14..809e9b8d133 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
@@ -15,35 +15,48 @@ import java.util.Objects;
 public class SimpleToken implements Token {
 
     private final List<Token> components = new ArrayList<>();
-    private final String orig;
+    private final String original;
     private TokenType type = TokenType.UNKNOWN;
     private TokenScript script = TokenScript.UNKNOWN;
     private String tokenString;
+    private List<String> stems = null; // Any additional stems after tokenString
     private boolean specialToken = false;
     private long offset = 0;
 
-    public SimpleToken(String orig) {
-        this(orig, null);
+    public SimpleToken(String original) {
+        this(original, (String)null);
     }
 
-    public SimpleToken(String orig, String tokenString) {
-        this.orig = orig;
+    public SimpleToken(String original, String tokenString) {
+        this.original = original;
         this.tokenString = tokenString;
     }
 
+    /** Exposed as fromStems */
+    private SimpleToken(String original, List<String> stems) {
+        this.type = TokenType.ALPHABETIC; // Only type which may have stems
+        this.original = original;
+        this.tokenString = stems.get(0);
+        this.stems = List.copyOf(stems.subList(1, stems.size()));
+    }
+
     @Override
     public String getOrig() {
-        return orig;
+        return original;
     }
 
     @Override
     public int getNumStems() {
-        return tokenString != null ? 1 : 0;
+        return (tokenString != null ? 1 : 0) + (stems != null ? stems.size() : 0);
     }
 
     @Override
     public String getStem(int i) {
-        return tokenString;
+        if (i == 0)
+            return tokenString;
+        if (stems != null && i-1 < stems.size())
+            return stems.get(i-1);
+        return tokenString; // TODO Vespa 9: throw new IllegalArgumentException() instead
     }
 
     @Override
@@ -131,12 +144,12 @@ public class SimpleToken implements Token {
 
     @Override
     public int hashCode() {
-        return orig.hashCode();
+        return original.hashCode();
     }
 
     @Override
     public String toString() {
-        return "token '" + orig + "'";
+        return "token '" + original + "'";
     }
 
     public String toDetailString() {
@@ -171,4 +184,8 @@ public class SimpleToken implements Token {
         return getType().isIndexable() && (getOrig().length() > 0);
     }
 
+    public static SimpleToken fromStems(String original, List<String> stems) {
+        return new SimpleToken(original, stems);
+    }
+
 }
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 98a84a48095..b72d2bd6d37 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -106,7 +106,7 @@ public class SimpleTokenizer implements Tokenizer {
             String oldToken = token;
             token = stemmer.stem(token);
             String newToken = token;
-            log.log(Level.FINEST, () -> "stem '" + oldToken+"' to '" + newToken+"'");
+            log.log(Level.FINEST, () -> "stem '" + oldToken + "' to '" + newToken + "'");
         }
         String result = token;
         log.log(Level.FINEST, () -> "processed token is: " + result);
```