diff options
Diffstat (limited to 'opennlp-linguistics/src/main/java/com')
9 files changed, 414 insertions, 0 deletions
// File: opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/DefaultLanguageDetectorContextGenerator.java (new file)
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.opennlp;

import opennlp.tools.ngram.NGramCharModel;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;

import java.util.HashSet;
import java.util.Set;

/**
 * Avoids using the unnecessarily slow {@link NGramCharModel}.
 *
 * <p>The context of a document is the set of distinct, lower-cased character n-grams
 * (measured in code points) found in the normalized text.</p>
 *
 * @author jonmv
 */
public class DefaultLanguageDetectorContextGenerator extends opennlp.tools.langdetect.DefaultLanguageDetectorContextGenerator {

    /**
     * @param minLength   number of code points in the shortest n-gram to generate
     * @param maxLength   number of code points in the longest n-gram to generate
     * @param normalizers normalizers passed on to the OpenNLP superclass
     */
    public DefaultLanguageDetectorContextGenerator(int minLength, int maxLength, CharSequenceNormalizer... normalizers) {
        super(minLength, maxLength, normalizers);
    }

    /**
     * Returns the distinct n-grams of the given document, in no particular order.
     *
     * @param document the text to extract features from
     * @return every distinct n-gram of {@code minLength} to {@code maxLength} code points
     *         in the normalized, lower-cased document
     */
    @Override
    public String[] getContext(CharSequence document) {
        int[] normalized = normalizer.normalize(document).codePoints().map(Character::toLowerCase).toArray();
        Set<String> grams = new HashSet<>();
        for (int start = 0; start < normalized.length; start++) {
            // The bound is inclusive: a gram may end exactly at the last code point.
            // A strict "<" would drop every gram touching the final code point.
            for (int length = minLength; length <= maxLength && start + length <= normalized.length; length++) {
                grams.add(new String(normalized, start, length));
            }
        }
        return grams.toArray(new String[0]);
    }

}

// File: opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/LanguageDetectorFactory.java (new file)
// Copyright Yahoo.
Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.opennlp; + +import opennlp.tools.langdetect.LanguageDetectorContextGenerator; + +/** + * Overrides the UrlCharSequenceNormalizer, which has a bad regex, until fixed: https://issues.apache.org/jira/browse/OPENNLP-1350 + * + * @author jonmv + */ +@SuppressWarnings("unused") // Loaded by black magic: specified in properties in the loaded model. +public class LanguageDetectorFactory extends opennlp.tools.langdetect.LanguageDetectorFactory { + + @Override + public LanguageDetectorContextGenerator getContextGenerator() { + return new DefaultLanguageDetectorContextGenerator(1, 3, + VespaCharSequenceNormalizer.getInstance()); + } + +} diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java new file mode 100644 index 00000000000..d7a7d3a4744 --- /dev/null +++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpDetector.java @@ -0,0 +1,92 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
+package com.yahoo.language.opennlp; + +import com.yahoo.language.Language; +import com.yahoo.language.detect.Detection; +import com.yahoo.language.detect.Detector; +import com.yahoo.language.detect.Hint; +import com.yahoo.language.simple.SimpleDetector; +import opennlp.tools.langdetect.LanguageDetectorConfig; +import opennlp.tools.langdetect.LanguageDetectorME; +import opennlp.tools.langdetect.LanguageDetectorModel; + +import java.io.IOException; +import java.io.UncheckedIOException; +import java.nio.ByteBuffer; +import java.nio.charset.Charset; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; + +import static java.nio.charset.StandardCharsets.UTF_8; + +/** + * Detects text language using patched OpenNLP, with fallback to {@link SimpleDetector} for undetected CJK input. + * + * @author jonmv + */ +class OpenNlpDetector implements Detector { + + private static final Object monitor = new Object(); + private static LanguageDetectorModel model; + + private final SimpleDetector simple = new SimpleDetector(); + private final Map<String, Language> languagesByISO3 = new HashMap<>(); + private final LanguageDetectorME detector; + private final LanguageDetectorConfig config; + + OpenNlpDetector() { + detector = new LanguageDetectorME(loadModel()); + config = new LanguageDetectorConfig(); + config.setMinDiff(0.02); + config.setChunkSize(32); + config.setMaxLength(256); + for (Locale locale : Locale.getAvailableLocales()) { + Language language = Language.fromLocale(locale); + if (language != null) + languagesByISO3.put(locale.getISO3Language(), language); + } + } + + private static LanguageDetectorModel loadModel() { + synchronized (monitor) { + if (model == null) { + try { + model = new LanguageDetectorModel(OpenNlpDetector.class.getResourceAsStream("/models/langdetect-183.bin")); + } + catch (IOException e) { + throw new UncheckedIOException(e); + } + } + } + return model; + } + + @Override + public Detection detect(byte[] input, int offset, int 
length, Hint hint) { + Charset encoding = Charset.forName(simple.guessEncoding(input, offset, length)); + return new Detection(detectLanguage(new String(input, offset, length, encoding)), encoding.name(), false); + } + + @Override + public Detection detect(ByteBuffer input, Hint hint) { + if (input.hasArray()) + return detect(input.array(), input.arrayOffset() + input.position(), input.remaining(), hint); + + byte[] buffer = new byte[input.remaining()]; + input.get(buffer); + return detect(buffer, 0, buffer.length, hint); + } + + @Override + public Detection detect(String input, Hint hint) { + return new Detection(detectLanguage(input), UTF_8.name(), false); + } + + private Language detectLanguage(String input) { + var prediction = detector.probingPredictLanguages(input, config).getLanguages()[0]; + var result = prediction.getConfidence() > 0.02 ? languagesByISO3.get(prediction.getLang()) : null; + return result != null ? result : simple.guessLanguage(input.substring(0, Math.min(input.length(), 256))); + } + +} diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java new file mode 100644 index 00000000000..1d96d8a0cdf --- /dev/null +++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpLinguistics.java @@ -0,0 +1,36 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.opennlp; + +import com.yahoo.component.annotation.Inject; +import com.yahoo.language.Linguistics; +import com.yahoo.language.detect.Detector; +import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.simple.SimpleLinguistics; + +/** + * Returns a linguistics implementation based on OpenNlp. 
+ * + * @author bratseth + * @author jonmv + */ +public class OpenNlpLinguistics extends SimpleLinguistics { + + private final Detector detector; + + @Inject + public OpenNlpLinguistics() { + this.detector = new OpenNlpDetector(); + } + + @Override + public Tokenizer getTokenizer() { + return new OpenNlpTokenizer(getNormalizer(), getTransformer()); + } + + @Override + public Detector getDetector() { return detector; } + + @Override + public boolean equals(Linguistics other) { return (other instanceof OpenNlpLinguistics); } + +} diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java new file mode 100644 index 00000000000..8080dc92729 --- /dev/null +++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/OpenNlpTokenizer.java @@ -0,0 +1,99 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.opennlp; + +import com.yahoo.language.Language; +import com.yahoo.language.LinguisticsCase; +import com.yahoo.language.process.Normalizer; +import com.yahoo.language.process.SpecialTokenRegistry; +import com.yahoo.language.process.StemMode; +import com.yahoo.language.process.Token; +import com.yahoo.language.process.Tokenizer; +import com.yahoo.language.process.Transformer; +import com.yahoo.language.simple.SimpleNormalizer; +import com.yahoo.language.simple.SimpleTokenizer; +import com.yahoo.language.simple.SimpleTransformer; +import opennlp.tools.stemmer.Stemmer; +import opennlp.tools.stemmer.snowball.SnowballStemmer; + +import java.util.List; + +/** + * Tokenizer using OpenNlp + * + * @author matskin + * @author bratseth + */ +public class OpenNlpTokenizer implements Tokenizer { + + private final static int SPACE_CODE = 32; + private final Normalizer normalizer; + private final Transformer transformer; + private final SimpleTokenizer simpleTokenizer; + private 
final SpecialTokenRegistry specialTokenRegistry; + + public OpenNlpTokenizer() { + this(new SimpleNormalizer(), new SimpleTransformer()); + } + + public OpenNlpTokenizer(Normalizer normalizer, Transformer transformer) { + this(normalizer, transformer, new SpecialTokenRegistry(List.of())); + } + + public OpenNlpTokenizer(Normalizer normalizer, Transformer transformer, SpecialTokenRegistry specialTokenRegistry) { + this.normalizer = normalizer; + this.transformer = transformer; + this.specialTokenRegistry = specialTokenRegistry; + this.simpleTokenizer = new SimpleTokenizer(normalizer, transformer, specialTokenRegistry); + } + + @Override + public Iterable<Token> tokenize(String input, Language language, StemMode stemMode, boolean removeAccents) { + Stemmer stemmer = stemmerFor(language, stemMode); + if (stemmer == null) + return simpleTokenizer.tokenize(input, language, stemMode, removeAccents); + else + return simpleTokenizer.tokenize(input, token -> processToken(token, language, stemMode, removeAccents, stemmer)); + } + + private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents, + Stemmer stemmer) { + token = normalizer.normalize(token); + token = LinguisticsCase.toLowerCase(token); + if (removeAccents) + token = transformer.accentDrop(token, language); + if (stemMode != StemMode.NONE) + token = stemmer.stem(token).toString(); + return token; + } + + private Stemmer stemmerFor(Language language, StemMode stemMode) { + if (language == null || language == Language.ENGLISH || stemMode == StemMode.NONE) return null; + SnowballStemmer.ALGORITHM algorithm = algorithmFor(language); + if (algorithm == null) return null; + return new SnowballStemmer(algorithm); + } + + private SnowballStemmer.ALGORITHM algorithmFor(Language language) { + switch (language) { + case DANISH: return SnowballStemmer.ALGORITHM.DANISH; + case DUTCH: return SnowballStemmer.ALGORITHM.DUTCH; + case FINNISH: return SnowballStemmer.ALGORITHM.FINNISH; + case 
FRENCH: return SnowballStemmer.ALGORITHM.FRENCH; + case GERMAN: return SnowballStemmer.ALGORITHM.GERMAN; + case HUNGARIAN: return SnowballStemmer.ALGORITHM.HUNGARIAN; + case IRISH: return SnowballStemmer.ALGORITHM.IRISH; + case ITALIAN: return SnowballStemmer.ALGORITHM.ITALIAN; + case NORWEGIAN_BOKMAL: return SnowballStemmer.ALGORITHM.NORWEGIAN; + case NORWEGIAN_NYNORSK: return SnowballStemmer.ALGORITHM.NORWEGIAN; + case PORTUGUESE: return SnowballStemmer.ALGORITHM.PORTUGUESE; + case ROMANIAN: return SnowballStemmer.ALGORITHM.ROMANIAN; + case RUSSIAN: return SnowballStemmer.ALGORITHM.RUSSIAN; + case SPANISH: return SnowballStemmer.ALGORITHM.SPANISH; + case SWEDISH: return SnowballStemmer.ALGORITHM.SWEDISH; + case TURKISH: return SnowballStemmer.ALGORITHM.TURKISH; + case ENGLISH: return SnowballStemmer.ALGORITHM.ENGLISH; + default: return null; + } + } + +} diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java new file mode 100644 index 00000000000..883319e2f8b --- /dev/null +++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/UrlCharSequenceNormalizer.java @@ -0,0 +1,31 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.language.opennlp; + +import opennlp.tools.util.normalizer.CharSequenceNormalizer; + +import java.util.regex.Pattern; + +/** + * Modifies {@link opennlp.tools.util.normalizer.UrlCharSequenceNormalizer} to avoid the bad email regex. 
+ * + * @author jonmv + */ +public class UrlCharSequenceNormalizer implements CharSequenceNormalizer { + + private static final Pattern URL_REGEX = + Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+"); + private static final Pattern MAIL_REGEX = + Pattern.compile("(?<![-+_.0-9A-Za-z])[-+_.0-9A-Za-z]+@[-0-9A-Za-z]+[-.0-9A-Za-z]+"); + + private static final UrlCharSequenceNormalizer INSTANCE = new UrlCharSequenceNormalizer(); + + public static UrlCharSequenceNormalizer getInstance() { + return INSTANCE; + } + + public CharSequence normalize(CharSequence text) { + String modified = URL_REGEX.matcher(text).replaceAll(" "); + return MAIL_REGEX.matcher(modified).replaceAll(" "); + } + +} diff --git a/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java new file mode 100644 index 00000000000..df8f3fad520 --- /dev/null +++ b/opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/VespaCharSequenceNormalizer.java @@ -0,0 +1,51 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.language.opennlp;

import opennlp.tools.util.normalizer.CharSequenceNormalizer;

import java.util.function.IntConsumer;
import java.util.stream.IntStream;

/**
 * Simple normalizer
 *
 * <p>Keeps only word characters, lower-cased, with each run of other characters that is
 * followed by a word character turned into one space.</p>
 *
 * @author arnej
 */
public class VespaCharSequenceNormalizer implements CharSequenceNormalizer {

    private static final VespaCharSequenceNormalizer INSTANCE = new VespaCharSequenceNormalizer();

    /** Returns the shared instance of this normalizer. */
    public static VespaCharSequenceNormalizer getInstance() {
        return INSTANCE;
    }

    /**
     * Filter replacing sequences of non-letters with a single space.
     * The space is emitted lazily, just before the next word character, so trailing
     * non-letters produce nothing. Stateful: use a fresh instance per stream.
     */
    static class OnlyLetters implements IntStream.IntMapMultiConsumer {

        boolean addSpace = false;

        @Override
        public void accept(int codepoint, IntConsumer target) {
            if ( ! WordCharDetector.isWordChar(codepoint)) {
                addSpace = true; // Remember the gap; it is written when the next word char arrives.
                return;
            }
            if (addSpace) {
                target.accept(' ');
                addSpace = false;
            }
            target.accept(Character.toLowerCase(codepoint));
        }

    }

    /**
     * Returns the normalized form of the given text.
     *
     * @param text the text to normalize
     * @return the given instance if it is empty, otherwise a new sequence of its lower-cased
     *         word characters, with gaps collapsed to single spaces
     */
    @Override
    public CharSequence normalize(CharSequence text) {
        if (text.isEmpty()) return text;

        StringBuilder normalized = new StringBuilder();
        text.codePoints()
            .mapMulti(new OnlyLetters())
            .forEachOrdered(normalized::appendCodePoint);
        return normalized;
    }

}

// File: opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/WordCharDetector.java (new file)
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.opennlp;

/**
 * Decides which code points count as word characters.
 */
class WordCharDetector {

    /**
     * Returns whether the code point is a letter, that is, whether its Unicode general category is
     * one of lowercase, uppercase, titlecase, modifier or other letter.
     *
     * <p>Every other category is currently considered a non-word-char: all punctuation, symbol,
     * separator and mark categories, the number categories (decimal digit, letter number,
     * other number), and control, format, private use, surrogate and unassigned code points.</p>
     *
     * @param codepoint the Unicode code point to classify
     * @return true if the code point is in one of the five letter categories
     */
    public static boolean isWordChar(int codepoint) {
        return switch (Character.getType(codepoint)) {
            case Character.LOWERCASE_LETTER,
                 Character.OTHER_LETTER,
                 Character.TITLECASE_LETTER,
                 Character.UPPERCASE_LETTER,
                 Character.MODIFIER_LETTER -> true;
            default -> false;
        };
    }

}

// File: opennlp-linguistics/src/main/java/com/yahoo/language/opennlp/package-info.java (new file)
// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
@ExportPackage
package com.yahoo.language.opennlp;

import com.yahoo.osgi.annotation.ExportPackage;