diff options
author | Bjørn Christian Seime <bjorn.christian@seime.no> | 2017-08-22 13:17:20 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2017-08-22 13:17:20 +0200 |
commit | 5aaaa2295c72dc8aab12bb7ae05c107f133c2124 (patch) | |
tree | 37348fc16aded1ed80da7c38784f1b9880926afd | |
parent | 125b90ed19ab1757d7e66ec240e8c8c5a2703f80 (diff) | |
parent | 1441c8ddee4f84cb6e6aa33b67b7eaf7927ce946 (diff) |
Merge pull request #3176 from vespa-engine/bratseth/refactor-character-filtering
Factor out string filtering method
3 files changed, 98 insertions, 143 deletions
diff --git a/document/src/main/java/com/yahoo/document/datatypes/StringFieldValue.java b/document/src/main/java/com/yahoo/document/datatypes/StringFieldValue.java index 7b8f96b3481..38a643992f1 100644 --- a/document/src/main/java/com/yahoo/document/datatypes/StringFieldValue.java +++ b/document/src/main/java/com/yahoo/document/datatypes/StringFieldValue.java @@ -11,6 +11,7 @@ import com.yahoo.document.serialization.FieldReader; import com.yahoo.document.serialization.FieldWriter; import com.yahoo.document.serialization.XmlSerializationHelper; import com.yahoo.document.serialization.XmlStream; +import com.yahoo.text.Text; import com.yahoo.vespa.objects.Ids; import java.util.Collection; @@ -20,6 +21,8 @@ import java.util.Map; /** * A StringFieldValue is a wrapper class that holds a String in {@link com.yahoo.document.Document}s and * other {@link com.yahoo.document.datatypes.FieldValue}s. + * + * String fields can only contain text characters, as defined by {@link Text#isTextCharacter(int)} * * @author Einar M R Rosenvinge */ @@ -34,32 +37,6 @@ public class StringFieldValue extends FieldValue { public static final int classId = registerClass(Ids.document + 15, StringFieldValue.class); private String value; private Map<String, SpanTree> spanTrees = null; - private static final boolean[] allowedAsciiChars = new boolean[0x80]; - - static { - allowedAsciiChars[0x0] = false; - allowedAsciiChars[0x1] = false; - allowedAsciiChars[0x2] = false; - allowedAsciiChars[0x3] = false; - allowedAsciiChars[0x4] = false; - allowedAsciiChars[0x5] = false; - allowedAsciiChars[0x6] = false; - allowedAsciiChars[0x7] = false; - allowedAsciiChars[0x8] = false; - allowedAsciiChars[0x9] = true; //tab - allowedAsciiChars[0xA] = true; //nl - allowedAsciiChars[0xB] = false; - allowedAsciiChars[0xC] = false; - allowedAsciiChars[0xD] = true; //cr - for (int i = 0xE; i < 0x20; i++) { - allowedAsciiChars[i] = false; - } - for (int i = 0x20; i < 0x7F; i++) { - allowedAsciiChars[i] = true; //printable 
ascii chars - } - allowedAsciiChars[0x7F] = true; //del - discouraged, but allowed - } - /** Creates a new StringFieldValue holding an empty String. */ public StringFieldValue() { @@ -70,6 +47,8 @@ public class StringFieldValue extends FieldValue { * Creates a new StringFieldValue with the given value. * * @param value the value to wrap. + * @throws IllegalArgumentException if the string contains non-text characters as defined by + * {@link Text#isTextCharacter(int)} */ public StringFieldValue(String value) { if (value==null) throw new IllegalArgumentException("Value cannot be null"); @@ -85,122 +64,9 @@ public class StringFieldValue extends FieldValue { ++i; } - //See http://www.w3.org/TR/2006/REC-xml11-20060816/#charsets - - if (codePoint < 0x80) { //ascii - if (allowedAsciiChars[codePoint]) { - continue; - } else { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - } - - //source cited above notes that 0x7F-0x84 and 0x86-0x9F are discouraged, but they are still allowed. 
- //see http://www.w3.org/International/questions/qa-controls - - if (codePoint < 0xFDD0) { - continue; - } - if (codePoint <= 0xFDDF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - - if (codePoint < 0x1FFFE) { - continue; - } - if (codePoint <= 0x1FFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0x2FFFE) { - continue; - } - if (codePoint <= 0x2FFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0x3FFFE) { - continue; - } - if (codePoint <= 0x3FFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0x4FFFE) { - continue; - } - if (codePoint <= 0x4FFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0x5FFFE) { - continue; - } - if (codePoint <= 0x5FFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0x6FFFE) { - continue; - } - if (codePoint <= 0x6FFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0x7FFFE) { - continue; - } - if (codePoint <= 0x7FFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0x8FFFE) { - continue; - } - if (codePoint <= 0x8FFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 
0x9FFFE) { - continue; - } - if (codePoint <= 0x9FFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0xAFFFE) { - continue; - } - if (codePoint <= 0xAFFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0xBFFFE) { - continue; - } - if (codePoint <= 0xBFFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0xCFFFE) { - continue; - } - if (codePoint <= 0xCFFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0xDFFFE) { - continue; - } - if (codePoint <= 0xDFFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0xEFFFE) { - continue; - } - if (codePoint <= 0xEFFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0xFFFFE) { - continue; - } - if (codePoint <= 0xFFFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } - if (codePoint < 0x10FFFE) { - continue; - } - if (codePoint <= 0x10FFFF) { - throw new IllegalArgumentException("StringFieldValue cannot contain code point 0x" + Integer.toHexString(codePoint).toUpperCase()); - } + if ( ! 
Text.isTextCharacter(codePoint)) + throw new IllegalArgumentException("A string field value cannot contain code point 0x" + + Integer.toHexString(codePoint).toUpperCase()); } this.value = value; } @@ -248,6 +114,8 @@ public class StringFieldValue extends FieldValue { * since they most certainly will not make sense for a new string value. * * @param o the new String to assign to this. An argument of null is equal to calling clear(). + * @throws IllegalArgumentException if the given argument is a string containing non-text characters as defined by + * {@link Text#isTextCharacter(int)} */ @Override public void assign(Object o) { @@ -313,7 +181,7 @@ public class StringFieldValue extends FieldValue { */ public SpanTree setSpanTree(SpanTree spanTree) { if (spanTrees == null) { - spanTrees = new HashMap(1); + spanTrees = new HashMap<>(1); } if (spanTrees.containsKey(spanTree.getName())) { throw new IllegalArgumentException("Span tree " + spanTree.getName() + " already exists."); diff --git a/vespajlib/src/main/java/com/yahoo/text/StringUtilities.java b/vespajlib/src/main/java/com/yahoo/text/StringUtilities.java index b9e96bdf850..370d079b3ec 100644 --- a/vespajlib/src/main/java/com/yahoo/text/StringUtilities.java +++ b/vespajlib/src/main/java/com/yahoo/text/StringUtilities.java @@ -16,6 +16,7 @@ import java.util.Set; * * @author Haakon Humberset */ +// TODO: Text utilities which are still needed should move to Text. This should be deprecated. public class StringUtilities { private static Charset UTF8 = Charset.forName("utf8"); diff --git a/vespajlib/src/main/java/com/yahoo/text/Text.java b/vespajlib/src/main/java/com/yahoo/text/Text.java new file mode 100644 index 00000000000..2b670e5d727 --- /dev/null +++ b/vespajlib/src/main/java/com/yahoo/text/Text.java @@ -0,0 +1,86 @@ +package com.yahoo.text; + +/** + * Text utility functions. 
+ * + * @author bratseth + */ +public final class Text { + + private static final boolean[] allowedAsciiChars = new boolean[0x80]; + + static { + allowedAsciiChars[0x0] = false; + allowedAsciiChars[0x1] = false; + allowedAsciiChars[0x2] = false; + allowedAsciiChars[0x3] = false; + allowedAsciiChars[0x4] = false; + allowedAsciiChars[0x5] = false; + allowedAsciiChars[0x6] = false; + allowedAsciiChars[0x7] = false; + allowedAsciiChars[0x8] = false; + allowedAsciiChars[0x9] = true; //tab + allowedAsciiChars[0xA] = true; //nl + allowedAsciiChars[0xB] = false; + allowedAsciiChars[0xC] = false; + allowedAsciiChars[0xD] = true; //cr + for (int i = 0xE; i < 0x20; i++) { + allowedAsciiChars[i] = false; + } + for (int i = 0x20; i < 0x7F; i++) { + allowedAsciiChars[i] = true; //printable ascii chars + } + allowedAsciiChars[0x7F] = true; //del - discouraged, but allowed + } + + /** No instantiation */ + private Text() {} + + /** + * Returns whether the given codepoint is a valid text character, potentially suitable for + * purposes such as indexing and display, see http://www.w3.org/TR/2006/REC-xml11-20060816/#charsets + */ + public static boolean isTextCharacter(int codepoint) { + // The link above notes that 0x7F-0x84 and 0x86-0x9F are discouraged, but they are still allowed - + // see http://www.w3.org/International/questions/qa-controls + + if (codepoint < 0x80) return allowedAsciiChars[codepoint]; + if (codepoint < 0xFDD0) return true; + if (codepoint <= 0xFDDF) return false; + if (codepoint < 0x1FFFE) return true; + if (codepoint <= 0x1FFFF) return false; + if (codepoint < 0x2FFFE) return true; + if (codepoint <= 0x2FFFF) return false; + if (codepoint < 0x3FFFE) return true; + if (codepoint <= 0x3FFFF) return false; + if (codepoint < 0x4FFFE) return true; + if (codepoint <= 0x4FFFF) return false; + if (codepoint < 0x5FFFE) return true; + if (codepoint <= 0x5FFFF) return false; + if (codepoint < 0x6FFFE) return true; + if (codepoint <= 0x6FFFF) return false; + if 
(codepoint < 0x7FFFE) return true; + if (codepoint <= 0x7FFFF) return false; + if (codepoint < 0x8FFFE) return true; + if (codepoint <= 0x8FFFF) return false; + if (codepoint < 0x9FFFE) return true; + if (codepoint <= 0x9FFFF) return false; + if (codepoint < 0xAFFFE) return true; + if (codepoint <= 0xAFFFF) return false; + if (codepoint < 0xBFFFE) return true; + if (codepoint <= 0xBFFFF) return false; + if (codepoint < 0xCFFFE) return true; + if (codepoint <= 0xCFFFF) return false; + if (codepoint < 0xDFFFE) return true; + if (codepoint <= 0xDFFFF) return false; + if (codepoint < 0xEFFFE) return true; + if (codepoint <= 0xEFFFF) return false; + if (codepoint < 0xFFFFE) return true; + if (codepoint <= 0xFFFFF) return false; + if (codepoint < 0x10FFFE) return true; + if (codepoint <= 0x10FFFF) return false; + + return true; + } + +} |