diff options
author | Jon Bratseth <bratseth@oath.com> | 2018-04-06 14:18:42 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@oath.com> | 2018-04-06 14:18:42 +0200 |
commit | 981633f3f5d0ddab9a6b5de9881d478e5210e717 (patch) | |
tree | fd5ac0d996545ef3e1066376c93c3cc914ef87d8 /vespajlib | |
parent | 67e98ccb86f0fa4838a4b2112fd7c0c654972cab (diff) |
Add stripInvalidCharacters
Diffstat (limited to 'vespajlib')
-rw-r--r-- | vespajlib/src/main/java/com/yahoo/text/Text.java | 39 | ||||
-rw-r--r-- | vespajlib/src/test/java/com/yahoo/text/TextTestCase.java | 41 |
2 files changed, 70 insertions, 10 deletions
diff --git a/vespajlib/src/main/java/com/yahoo/text/Text.java b/vespajlib/src/main/java/com/yahoo/text/Text.java
index 0bdc2fb63bc..f3e1948649c 100644
--- a/vespajlib/src/main/java/com/yahoo/text/Text.java
+++ b/vespajlib/src/main/java/com/yahoo/text/Text.java
@@ -90,19 +90,38 @@ public final class Text {
      * Validates that the given string value only contains text characters and
      * returns the first illegal code point if one is found.
      */
-    public static OptionalInt validateTextString(String value) {
-        for (int i = 0; i < value.length(); i++) {
-            char theChar = value.charAt(i);
-            int codePoint = value.codePointAt(i);
-            if (Character.isHighSurrogate(theChar)) {
-                // Skip one char ahead, since codePointAt() consumes one more char in this case
-                ++i;
-            }
-            if (!Text.isTextCharacter(codePoint)) {
+    public static OptionalInt validateTextString(String string) {
+        for (int i = 0; i < string.length(); i++) {
+            int codePoint = string.codePointAt(i);
+            if ( ! Text.isTextCharacter(codePoint))
                 return OptionalInt.of(codePoint);
-            }
+
+            if (Character.isSupplementaryCodePoint(codePoint))
+                ++i; // codePointAt() consumed a surrogate pair: skip its low surrogate (a lone surrogate is one char)
         }
         return OptionalInt.empty();
     }
 
+    /**
+     * Returns a string where any invalid characters in the input string are replaced by spaces
+     */
+    public static String stripInvalidCharacters(String string) {
+        StringBuilder stripped = null; // lazy, as most strings will not need stripping
+        for (int i = 0; i < string.length(); i++) {
+            int codePoint = string.codePointAt(i);
+            if ( ! Text.isTextCharacter(codePoint)) { // only non-text code points are replaced
+                if (stripped == null)
+                    stripped = new StringBuilder(string.substring(0, i));
+                stripped.append(' ');
+            }
+            else if (stripped != null) {
+                stripped.appendCodePoint(codePoint);
+            }
+
+            if (Character.isSupplementaryCodePoint(codePoint))
+                ++i; // codePointAt() consumed a surrogate pair: skip its low surrogate (a lone surrogate is one char)
+        }
+        return stripped != null ? stripped.toString() : string;
+    }
+
 }
diff --git a/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java
new file mode 100644
index 00000000000..0c1cf9b4b30
--- /dev/null
+++ b/vespajlib/src/test/java/com/yahoo/text/TextTestCase.java
@@ -0,0 +1,41 @@
+package com.yahoo.text;
+
+import org.junit.Test;
+
+import java.util.OptionalInt;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+
+public class TextTestCase {
+
+    @Test
+    public void testValidateTextString() {
+        assertFalse(Text.validateTextString("valid").isPresent());
+        assertEquals(OptionalInt.of(1), Text.validateTextString("text\u0001text\u0003"));
+        assertEquals(OptionalInt.of(917503),
+                     Text.validateTextString(new StringBuilder().appendCodePoint(0xDFFFF).toString()));
+        assertEquals(OptionalInt.of(917503),
+                     Text.validateTextString(new StringBuilder("foo").appendCodePoint(0xDFFFF).toString()));
+        assertEquals(OptionalInt.of(917503),
+                     Text.validateTextString(new StringBuilder().appendCodePoint(0xDFFFF).append("foo").toString()));
+        assertEquals(OptionalInt.of(917503),
+                     Text.validateTextString(new StringBuilder("foo").appendCodePoint(0xDFFFF).append("foo").toString()));
+    }
+
+    @Test
+    public void testStripTextString() {
+        assertEquals("", Text.stripInvalidCharacters(""));
+        assertEquals("valid", Text.stripInvalidCharacters("valid"));
+        assertEquals("text text ", Text.stripInvalidCharacters("text\u0001text\u0003"));
+        assertEquals(" ",
+                     Text.stripInvalidCharacters(new StringBuilder().appendCodePoint(0xDFFFF).toString()));
+        assertEquals("foo ",
+                     Text.stripInvalidCharacters(new StringBuilder("foo").appendCodePoint(0xDFFFF).toString()));
+        assertEquals(" foo",
+                     Text.stripInvalidCharacters(new StringBuilder().appendCodePoint(0xDFFFF).append("foo").toString()));
+        assertEquals("foo foo",
+                     Text.stripInvalidCharacters(new StringBuilder("foo").appendCodePoint(0xDFFFF).append("foo").toString()));
+    }
+
+}