about | summary | refs | log | tree | commit | diff | stats
path: root/linguistics
diff options
context:
space:
mode:
author Harald Musum <musum@vespa.ai> 2023-11-13 21:34:45 +0100
committer GitHub <noreply@github.com> 2023-11-13 21:34:45 +0100
commit ef5be496bc4857c5923f566251dd527873b248bf (patch)
tree 657d51a4166d3f7cf40e04f0a5972f11d0261afd /linguistics
parent 944d635d00e165166508ef23399e9ed65a87a9c8 (diff)
Revert "Bratseth/casing take 2"
Diffstat (limited to 'linguistics')
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java 1
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java 3
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java 37
-rw-r--r-- linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java 2
4 files changed, 13 insertions, 30 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
index f0439a21fec..5ad6a382abd 100644
--- a/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
+++ b/linguistics/src/main/java/com/yahoo/language/LinguisticsCase.java
@@ -26,7 +26,6 @@ public class LinguisticsCase {
public static String toLowerCase(String in) {
// def is picked from http://docs.oracle.com/javase/6/docs/api/java/lang/String.html#toLowerCase%28%29
// Also, at the time of writing, English is the default language for queries
- if (in == null) return null;
return Lowercase.toLowerCase(in);
}
diff --git a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
index 9178c2d7e09..33f5ee7e4bb 100644
--- a/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
+++ b/linguistics/src/main/java/com/yahoo/language/process/GramSplitter.java
@@ -189,8 +189,9 @@ public class GramSplitter {
@Override
public boolean equals(Object o) {
if (this == o) return true;
- if ( ! (o instanceof Gram gram)) return false;
+ if ( ! (o instanceof Gram)) return false;
+ Gram gram = (Gram)o;
if (codePointCount != gram.codePointCount) return false;
if (start != gram.start) return false;
return true;
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
index 809e9b8d133..6cc68c7ac14 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleToken.java
@@ -15,48 +15,35 @@ import java.util.Objects;
public class SimpleToken implements Token {
private final List<Token> components = new ArrayList<>();
- private final String original;
+ private final String orig;
private TokenType type = TokenType.UNKNOWN;
private TokenScript script = TokenScript.UNKNOWN;
private String tokenString;
- private List<String> stems = null; // Any additional stems after tokenString
private boolean specialToken = false;
private long offset = 0;
- public SimpleToken(String original) {
- this(original, (String)null);
+ public SimpleToken(String orig) {
+ this(orig, null);
}
- public SimpleToken(String original, String tokenString) {
- this.original = original;
+ public SimpleToken(String orig, String tokenString) {
+ this.orig = orig;
this.tokenString = tokenString;
}
- /** Exposed as fromStems */
- private SimpleToken(String original, List<String> stems) {
- this.type = TokenType.ALPHABETIC; // Only type which may have stems
- this.original = original;
- this.tokenString = stems.get(0);
- this.stems = List.copyOf(stems.subList(1, stems.size()));
- }
-
@Override
public String getOrig() {
- return original;
+ return orig;
}
@Override
public int getNumStems() {
- return (tokenString != null ? 1 : 0) + (stems != null ? stems.size() : 0);
+ return tokenString != null ? 1 : 0;
}
@Override
public String getStem(int i) {
- if (i == 0)
- return tokenString;
- if (stems != null && i-1 < stems.size())
- return stems.get(i-1);
- return tokenString; // TODO Vespa 9: throw new IllegalArgumentException() instead
+ return tokenString;
}
@Override
@@ -144,12 +131,12 @@ public class SimpleToken implements Token {
@Override
public int hashCode() {
- return original.hashCode();
+ return orig.hashCode();
}
@Override
public String toString() {
- return "token '" + original + "'";
+ return "token '" + orig + "'";
}
public String toDetailString() {
@@ -184,8 +171,4 @@ public class SimpleToken implements Token {
return getType().isIndexable() && (getOrig().length() > 0);
}
- public static SimpleToken fromStems(String original, List<String> stems) {
- return new SimpleToken(original, stems);
- }
-
}
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index b72d2bd6d37..98a84a48095 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -106,7 +106,7 @@ public class SimpleTokenizer implements Tokenizer {
String oldToken = token;
token = stemmer.stem(token);
String newToken = token;
- log.log(Level.FINEST, () -> "stem '" + oldToken + "' to '" + newToken + "'");
+ log.log(Level.FINEST, () -> "stem '" + oldToken+"' to '" + newToken+"'");
}
String result = token;
log.log(Level.FINEST, () -> "processed token is: " + result);