diff options
author | MariusArhaug <mariusarhaug@hotmail.com> | 2024-04-03 15:14:33 +0200 |
---|---|---|
committer | MariusArhaug <mariusarhaug@hotmail.com> | 2024-04-03 16:14:37 +0200 |
commit | 80744246aff5cb9294496842ea27bf703e430c99 (patch) | |
tree | 0ef74371e57a2d3e2d6db1b8ed405646049f0ea5 | |
parent | 81dd10993cdbf1053926159d45b922ebd41e32df (diff) |
Add SimpleTokenScript to SimpleTokenizer
When parsing datasets such as WikiDumps into a significance model, we want
to keep only the words written in that language's script in the model.
Adding the script value to the tokens produced by our tokenizer lets us do
this filtering: for example, we can filter out non-Latin words when creating
an English significance model.
4 files changed, 124 insertions, 1 deletion
diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenScript.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenScript.java new file mode 100644 index 00000000000..a695d2e7f8d --- /dev/null +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenScript.java @@ -0,0 +1,71 @@ +package com.yahoo.language.simple; + +import com.yahoo.language.process.TokenScript; + +/** + * @author mariusarhaug + */ + +class SimpleTokenScript { + + static TokenScript valueOf(int codePoint) { + return switch(Character.UnicodeScript.of(codePoint)) + { + case COMMON -> TokenScript.COMMON; + case LATIN -> TokenScript.LATIN; + case GREEK -> TokenScript.GREEK; + case CYRILLIC -> TokenScript.CYRILLIC; + case ARMENIAN -> TokenScript.ARMENIAN; + case HEBREW -> TokenScript.HEBREW; + case ARABIC -> TokenScript.ARABIC; + case SYRIAC -> TokenScript.SYRIAC; + case THAANA -> TokenScript.THAANA; + case DEVANAGARI -> TokenScript.DEVANAGARI; + case GURMUKHI -> TokenScript.GURMUKHI; + case GUJARATI -> TokenScript.GUJARATI; + case ORIYA -> TokenScript.ORIYA; + case TAMIL -> TokenScript.TAMIL; + case TELUGU -> TokenScript.TELUGU; + case KANNADA -> TokenScript.KANNADA; + case MALAYALAM -> TokenScript.MALAYALAM; + case SINHALA -> TokenScript.SINHALA; + case THAI -> TokenScript.THAI; + case LAO -> TokenScript.LAO; + case TIBETAN -> TokenScript.TIBETAN; + case MYANMAR -> TokenScript.MYANMAR; + case GEORGIAN -> TokenScript.GEORGIAN; + case HANGUL -> TokenScript.HANGUL; + case ETHIOPIC -> TokenScript.ETHIOPIC; + case CHEROKEE -> TokenScript.CHEROKEE; + case OGHAM -> TokenScript.OGHAM; + case RUNIC -> TokenScript.RUNIC; + case KHMER -> TokenScript.KHMER; + case MONGOLIAN -> TokenScript.MONGOLIAN; + case HIRAGANA -> TokenScript.HIRAGANA; + case KATAKANA -> TokenScript.KATAKANA; + case HAN -> TokenScript.HAN; + case YI -> TokenScript.YI; + case GOTHIC -> TokenScript.GOTHIC; + case DESERET -> TokenScript.DESERET; + case INHERITED -> TokenScript.INHERITED; + case 
TAGALOG -> TokenScript.TAGALOG; + case HANUNOO -> TokenScript.HANUNOO; + case BUHID -> TokenScript.BUHID; + case TAGBANWA -> TokenScript.TAGBANWA; + case LIMBU -> TokenScript.LIMBU; + case UGARITIC -> TokenScript.UGARITIC; + case SHAVIAN -> TokenScript.SHAVIAN; + case OSMANYA -> TokenScript.OSMANYA; + case CYPRIOT -> TokenScript.CYPRIOT; + case BRAILLE -> TokenScript.BRAILLE; + case BUGINESE -> TokenScript.BUGINESE; + case COPTIC -> TokenScript.COPTIC; + case GLAGOLITIC -> TokenScript.GLAGOLITIC; + case KHAROSHTHI -> TokenScript.KHAROSHTHI; + case TIFINAGH -> TokenScript.TIFINAGH; + + default -> TokenScript.UNKNOWN; + }; + } +} + diff --git a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java index b72d2bd6d37..fb876f5f066 100644 --- a/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java +++ b/linguistics/src/main/java/com/yahoo/language/simple/SimpleTokenizer.java @@ -64,20 +64,28 @@ public class SimpleTokenizer implements Tokenizer { int nextCode = input.codePointAt(0); TokenType prevType = SimpleTokenType.valueOf(nextCode); TokenType tokenType = prevType; + TokenScript prevScript = SimpleTokenScript.valueOf(nextCode); + TokenScript tokenScript = prevScript; for (int prev = 0, next = Character.charCount(nextCode); next <= input.length(); ) { nextCode = next < input.length() ? 
input.codePointAt(next) : SPACE_CODE; TokenType nextType = SimpleTokenType.valueOf(nextCode); + TokenScript nextScript = SimpleTokenScript.valueOf(nextCode); + if (isAtTokenBoundary(prevType, nextType)) { String original = input.substring(prev, next); tokens.add(new SimpleToken(original).setOffset(prev) .setType(tokenType) - .setTokenString(tokenProcessor.apply(original))); + .setTokenString(tokenProcessor.apply(original)) + .setScript(tokenScript)); prev = next; prevType = nextType; + prevScript = nextScript; tokenType = prevType; + tokenScript = prevScript; } else { tokenType = determineType(tokenType, nextType); + tokenScript = determineScript(tokenScript, nextScript); } next += Character.charCount(nextCode); } @@ -95,6 +103,11 @@ public class SimpleTokenizer implements Tokenizer { return tokenType; } + private TokenScript determineScript(TokenScript tokenScript, TokenScript characterScript) { + if (characterScript == TokenScript.LATIN) return TokenScript.LATIN; + return tokenScript; + } + private String processToken(String token, Language language, StemMode stemMode, boolean removeAccents) { String original = token; log.log(Level.FINEST, () -> "processToken '" + original + "'"); diff --git a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java index 05a2e35f09f..ad8f990ef83 100644 --- a/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java +++ b/linguistics/src/test/java/com/yahoo/language/simple/SimpleTokenizerTestCase.java @@ -5,6 +5,7 @@ import com.yahoo.language.Language; import com.yahoo.language.process.AbstractTokenizerTestCase; import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Token; +import com.yahoo.language.process.TokenScript; import org.junit.Test; import java.util.Iterator; @@ -52,4 +53,21 @@ public class SimpleTokenizerTestCase extends AbstractTokenizerTestCase { 
tester.assertTokens(emoji1 + emoji2, emoji1, emoji2); } + @Test public void testTokenizeScripts() { + TokenizerTester tester = new TokenizerTester().setStemMode(StemMode.NONE); + + tester.assertTokenScripts("anyone is արևելահայերեն by ancient कार्य", + TokenScript.LATIN, + TokenScript.COMMON, + TokenScript.LATIN, + TokenScript.COMMON, + TokenScript.ARMENIAN, + TokenScript.COMMON, + TokenScript.LATIN, + TokenScript.COMMON, + TokenScript.LATIN, + TokenScript.COMMON, + TokenScript.DEVANAGARI); + } } + diff --git a/linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java b/linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java index a719b5e66b8..401b89f0696 100644 --- a/linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java +++ b/linguistics/src/test/java/com/yahoo/language/simple/TokenizerTester.java @@ -5,6 +5,7 @@ import com.yahoo.language.Language; import com.yahoo.language.Linguistics; import com.yahoo.language.process.StemMode; import com.yahoo.language.process.Token; +import com.yahoo.language.process.TokenScript; import java.util.ArrayList; import java.util.Arrays; @@ -30,6 +31,14 @@ public class TokenizerTester { assertEquals(Arrays.asList(expectedTokenStrings), actual); } + public void assertTokenScripts(String input, TokenScript... 
expectedTokenScripts) { + List<TokenScript> actual = new ArrayList<>(); + for (Token token : tokenize(input)) { + findTokenScripts(token, actual); + } + assertEquals(Arrays.asList(expectedTokenScripts), actual); + } + public List<String> findTokenStrings(Token token, List<String> out) { int numComponents = token.getNumComponents(); if (token.isSpecialToken() || numComponents == 0) { @@ -42,6 +51,18 @@ public class TokenizerTester { return out; } + public List<TokenScript> findTokenScripts(Token token, List<TokenScript> out) { + int numComponents = token.getNumComponents(); + if (token.isSpecialToken() || numComponents == 0) { + out.add(token.getScript()); + } else { + for (int i = 0; i < numComponents; ++i) { + findTokenScripts(token.getComponent(i), out); + } + } + return out; + } + public Iterable<Token> tokenize(String input) { return linguistics.getTokenizer().tokenize(input, language, stemMode, accentDrop); } |