aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/simple
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2021-05-04 17:23:27 +0200
committerJon Bratseth <bratseth@gmail.com>2021-05-04 17:23:27 +0200
commitaf59be1ed263f1476dd5df0a696f328a7de72ccd (patch)
treeeaa85feb51486663f486adbe21f2905c83c60a98 /linguistics/src/main/java/com/yahoo/language/simple
parentb399aa85883146aa3ba1396769d8e82c88877674 (diff)
Wire in (but don't use) SpecialTokens
Diffstat (limited to 'linguistics/src/main/java/com/yahoo/language/simple')
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java12
-rw-r--r--linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java27
2 files changed, 27 insertions, 12 deletions
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
index e1a04b2985d..44f6c74b206 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleLinguistics.java
@@ -11,10 +11,12 @@ import com.yahoo.language.process.GramSplitter;
import com.yahoo.language.process.Normalizer;
import com.yahoo.language.process.Segmenter;
import com.yahoo.language.process.SegmenterImpl;
+import com.yahoo.language.process.SpecialTokenRegistry;
import com.yahoo.language.process.Stemmer;
import com.yahoo.language.process.StemmerImpl;
import com.yahoo.language.process.Tokenizer;
import com.yahoo.language.process.Transformer;
+import com.yahoo.vespa.configdefinition.SpecialtokensConfig;
/**
* Factory of simple linguistic processor implementations.
@@ -31,21 +33,27 @@ public class SimpleLinguistics implements Linguistics {
private final Detector detector;
private final CharacterClasses characterClasses;
private final GramSplitter gramSplitter;
+ private final SpecialTokenRegistry specialTokenRegistry;
- @Inject
public SimpleLinguistics() {
+ this(new SpecialtokensConfig.Builder().build());
+ }
+
+ @Inject
+ public SimpleLinguistics(SpecialtokensConfig specialTokensConfig) {
this.normalizer = new SimpleNormalizer();
this.transformer = new SimpleTransformer();
this.detector = new SimpleDetector();
this.characterClasses = new CharacterClasses();
this.gramSplitter = new GramSplitter(characterClasses);
+ this.specialTokenRegistry = new SpecialTokenRegistry(specialTokensConfig);
}
@Override
public Stemmer getStemmer() { return new StemmerImpl(getTokenizer()); }
@Override
- public Tokenizer getTokenizer() { return new SimpleTokenizer(normalizer, transformer); }
+ public Tokenizer getTokenizer() { return new SimpleTokenizer(normalizer, transformer, specialTokenRegistry); }
@Override
public Normalizer getNormalizer() { return normalizer; }
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
index 7df432f496d..740307c0cca 100644
--- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
+++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java
@@ -23,11 +23,13 @@ import java.util.logging.Level;
*/
public class SimpleTokenizer implements Tokenizer {
+ private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName());
private final static int SPACE_CODE = 32;
+
private final Normalizer normalizer;
private final Transformer transformer;
private final KStemmer stemmer = new KStemmer();
- private static final Logger log = Logger.getLogger(SimpleTokenizer.class.getName());
+ private final SpecialTokenRegistry specialTokenRegistry;
public SimpleTokenizer() {
this(new SimpleNormalizer(), new SimpleTransformer());
@@ -38,8 +40,13 @@ public class SimpleTokenizer implements Tokenizer {
}
public SimpleTokenizer(Normalizer normalizer, Transformer transformer) {
+ this(normalizer, transformer, new SpecialTokenRegistry(List.of()));
+ }
+
+ public SimpleTokenizer(Normalizer normalizer, Transformer transformer, SpecialTokenRegistry specialTokenRegistry) {
this.normalizer = normalizer;
this.transformer = transformer;
+ this.specialTokenRegistry = specialTokenRegistry;
}
@Override
@@ -56,8 +63,8 @@ public class SimpleTokenizer implements Tokenizer {
String original = input.substring(prev, next);
String token = processToken(original, language, stemMode, removeAccents);
tokens.add(new SimpleToken(original).setOffset(prev)
- .setType(prevType)
- .setTokenString(token));
+ .setType(prevType)
+ .setTokenString(token));
prev = next;
prevType = nextType;
}
@@ -67,20 +74,20 @@ public class SimpleTokenizer implements Tokenizer {
}
private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) {
- final String original = token;
- log.log(Level.FINEST, () -> "processToken '"+original+"'");
+ String original = token;
+ log.log(Level.FINEST, () -> "processToken '" + original + "'");
token = normalizer.normalize(token);
token = LinguisticsCase.toLowerCase(token);
if (removeAccents)
token = transformer.accentDrop(token, language);
if (stemMode != StemMode.NONE) {
- final String oldToken = token;
+ String oldToken = token;
token = stemmer.stem(token);
- final String newToken = token;
- log.log(Level.FINEST, () -> "stem '"+oldToken+"' to '"+newToken+"'");
+ String newToken = token;
+ log.log(Level.FINEST, () -> "stem '" + oldToken+"' to '" + newToken+"'");
}
- final String result = token;
- log.log(Level.FINEST, () -> "processed token is: "+result);
+ String result = token;
+ log.log(Level.FINEST, () -> "processed token is: " + result);
return result;
}