about | summary | refs | log | tree | commit | diff | stats
path: root/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics
diff options
context:
space:
mode:
Diffstat (limited to 'indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics')
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java 14
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java 15
2 files changed, 15 insertions, 14 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
index 441ac711cc3..03efee5f271 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/AnnotatorConfig.java
@@ -13,7 +13,7 @@ public class AnnotatorConfig implements Cloneable {
private Language language;
private StemMode stemMode;
private boolean removeAccents;
- private int maxTermOccurences;
+ private int maxTermOccurrences;
private int maxTokenizeLength;
public static final int DEFAULT_MAX_TERM_OCCURRENCES;
@@ -29,7 +29,7 @@ public class AnnotatorConfig implements Cloneable {
language = Language.ENGLISH;
stemMode = StemMode.NONE;
removeAccents = false;
- maxTermOccurences = DEFAULT_MAX_TERM_OCCURRENCES;
+ maxTermOccurrences = DEFAULT_MAX_TERM_OCCURRENCES;
maxTokenizeLength = DEFAULT_MAX_TOKENIZE_LENGTH;
}
@@ -37,7 +37,7 @@ public class AnnotatorConfig implements Cloneable {
language = rhs.language;
stemMode = rhs.stemMode;
removeAccents = rhs.removeAccents;
- maxTermOccurences = rhs.maxTermOccurences;
+ maxTermOccurrences = rhs.maxTermOccurrences;
maxTokenizeLength = rhs.maxTokenizeLength;
}
@@ -74,11 +74,11 @@ public class AnnotatorConfig implements Cloneable {
}
public int getMaxTermOccurrences() {
- return maxTermOccurences;
+ return maxTermOccurrences;
}
public AnnotatorConfig setMaxTermOccurrences(int maxTermCount) {
- this.maxTermOccurences = maxTermCount;
+ this.maxTermOccurrences = maxTermCount;
return this;
}
@@ -110,7 +110,7 @@ public class AnnotatorConfig implements Cloneable {
if (removeAccents != rhs.removeAccents) {
return false;
}
- if (maxTermOccurences != rhs.maxTermOccurences) {
+ if (maxTermOccurrences != rhs.maxTermOccurrences) {
return false;
}
if (maxTokenizeLength != rhs.maxTokenizeLength) {
@@ -122,6 +122,6 @@ public class AnnotatorConfig implements Cloneable {
@Override
public int hashCode() {
return getClass().hashCode() + language.hashCode() + stemMode.hashCode() +
- Boolean.valueOf(removeAccents).hashCode() + maxTermOccurences + maxTokenizeLength;
+ Boolean.valueOf(removeAccents).hashCode() + maxTermOccurrences + maxTokenizeLength;
}
}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
index 879a6b2ce8e..18f09a72fc9 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/linguistics/LinguisticsAnnotator.java
@@ -28,6 +28,7 @@ import static com.yahoo.language.LinguisticsCase.toLowerCase;
public class LinguisticsAnnotator {
private final Linguistics factory;
+ private final LinguisticsContext linguisticsContext;
private final AnnotatorConfig config;
private static class TermOccurrences {
@@ -56,8 +57,9 @@ public class LinguisticsAnnotator {
* @param factory the linguistics factory to use when annotating
* @param config the linguistics config to use
*/
- public LinguisticsAnnotator(Linguistics factory, AnnotatorConfig config) {
+ public LinguisticsAnnotator(Linguistics factory, LinguisticsContext context, AnnotatorConfig config) {
this.factory = factory;
+ this.linguisticsContext = context;
this.config = config;
}
@@ -70,15 +72,14 @@ public class LinguisticsAnnotator {
public boolean annotate(StringFieldValue text, ExecutionContext context) {
if (text.getSpanTree(SpanTrees.LINGUISTICS) != null) return true; // Already annotated with LINGUISTICS.
- Tokenizer tokenizer = factory.getTokenizer();
+ Tokenizer tokenizer = factory.getTokenizer(linguisticsContext);
String input = (text.getString().length() <= config.getMaxTokenizeLength())
? text.getString()
: text.getString().substring(0, config.getMaxTokenizeLength());
Iterable<Token> tokens = tokenizer.tokenize(input,
config.getLanguage(),
config.getStemMode(),
- config.getRemoveAccents(),
- new LinguisticsContext(context.getDocumentType().getName()));
+ config.getRemoveAccents());
TermOccurrences termOccurrences = new TermOccurrences(config.getMaxTermOccurrences());
SpanTree tree = new SpanTree(SpanTrees.LINGUISTICS);
for (Token token : tokens)
@@ -93,9 +94,9 @@ public class LinguisticsAnnotator {
* Creates a TERM annotation which has the lowercase value as annotation (only) if it is different from the
* original.
*
- * @param termToLowerCase The term to lower case.
- * @param origTerm The original term.
- * @return the created TERM annotation.
+ * @param termToLowerCase the term to lower case
+ * @param origTerm the original term
+ * @return the created TERM annotation
*/
public static Annotation lowerCaseTermAnnotation(String termToLowerCase, String origTerm) {
String annotationValue = toLowerCase(termToLowerCase);