diff options
author | Henning Baldersheim <balder@yahoo-inc.com> | 2019-12-13 07:03:07 +0000 |
---|---|---|
committer | Henning Baldersheim <balder@yahoo-inc.com> | 2019-12-13 07:03:07 +0000 |
commit | 2bc18be3d47d16c4cf25f1bc36614cf6e6a25f7b (patch) | |
tree | 00e517a3763ae5be598407983563681f0834f6a0 /vespajlib | |
parent | b0098c4d582e071dae1957d38ce4dc2b915e5184 (diff) |
Add a faster, but simpler pattern matcher. Builds tensoraddresses 5 times+ faster than the regex one.
Diffstat (limited to 'vespajlib')
4 files changed, 129 insertions, 4 deletions
diff --git a/vespajlib/src/main/java/com/yahoo/tensor/TensorAddress.java b/vespajlib/src/main/java/com/yahoo/tensor/TensorAddress.java index f99645cd996..e705445c5a7 100644 --- a/vespajlib/src/main/java/com/yahoo/tensor/TensorAddress.java +++ b/vespajlib/src/main/java/com/yahoo/tensor/TensorAddress.java @@ -1,11 +1,14 @@ // Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.tensor; +import com.yahoo.text.Ascii7BitMatcher; + import java.util.Arrays; import java.util.Optional; -import java.util.regex.Pattern; import java.util.stream.Collectors; +import static com.yahoo.text.Ascii7BitMatcher.charsAndNumbers; + /** * An immutable address to a tensor cell. This simply supplies a value to each dimension * in a particular tensor type. By itself it is just a list of cell labels, it's meaning depends on its accompanying type. @@ -159,8 +162,8 @@ public abstract class TensorAddress implements Comparable<TensorAddress> { /** Supports building of a tensor address */ public static class Builder { - - static final private Pattern labelPattern = Pattern.compile("[-,A-Za-z0-9_@]([A-Z,a-z0-9_@$])*"); + static private final Ascii7BitMatcher labelMatcher = new Ascii7BitMatcher("-_@" + charsAndNumbers(), + "_@$" + charsAndNumbers()); private final TensorType type; private final String[] labels; @@ -205,7 +208,7 @@ public abstract class TensorAddress implements Comparable<TensorAddress> { static private void requireIdentifier(String s, String parameterName) { if (s == null) throw new IllegalArgumentException(parameterName + " can not be null"); - if ( ! labelPattern.matcher(s).matches()) + if ( ! labelMatcher.matches(s)) throw new IllegalArgumentException(parameterName + " must be an identifier or integer, not '" + s + "'"); } diff --git a/vespajlib/src/main/java/com/yahoo/text/Ascii7BitMatcher.java b/vespajlib/src/main/java/com/yahoo/text/Ascii7BitMatcher.java new file mode 100644 index 00000000000..b821d57de00 --- /dev/null +++ b/vespajlib/src/main/java/com/yahoo/text/Ascii7BitMatcher.java @@ -0,0 +1,64 @@ +// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.text; + +import java.util.BitSet; + +/** + * Fast replacement for regex based validators of simple expressions. + * It can take a list of legal characters for the the first character, + * and another list for the following. Limited to 7bit ascii. + * @author baldersheim + */ +public class Ascii7BitMatcher { + private final BitSet legalFirst; + private final BitSet legalRest; + private static BitSet createBitSet(String legal) { + BitSet legalChars = new BitSet(128); + for (int i=0; i < legal.length(); i++) { + char c = legal.charAt(i); + if (c < 128) { + legalChars.set(c); + } else { + throw new IllegalArgumentException("Char '" + c + "' at position " + i + " is not valid ascii 7 bit char"); + } + } + return legalChars; + } + public Ascii7BitMatcher(String legal) { + this(legal, legal); + } + public Ascii7BitMatcher(String legalFirstChar, String legalChars) { + legalFirst = createBitSet(legalFirstChar); + legalRest = createBitSet(legalChars); + } + private static boolean isAscii7Bit(char c) { return c < 128;} + private boolean isLegalFirst(char c) { + return isAscii7Bit(c) && legalFirst.get(c); + } + private boolean isLegalRest(char c) { + return isAscii7Bit(c) && legalRest.get(c); + } + public boolean matches(String s) { + if (s == null || s.isEmpty() || ! isLegalFirst(s.charAt(0))) return false; + for (int i = 1; i < s.length(); i++) { + if ( ! isLegalRest(s.charAt(i))) { + return false; + } + } + return true; + } + static public String charsAndNumbers() { + char [] chars = new char[26*2+10]; + int i = 0; + for (char c = 'A'; c <= 'Z'; c++) { + chars[i++] = c; + } + for (char c = 'a'; c <= 'z'; c++) { + chars[i++] = c; + } + for (char c = '0'; c <= '9'; c++) { + chars[i++] = c; + } + return new String(chars); + } +} diff --git a/vespajlib/src/test/java/com/yahoo/tensor/functions/DynamicTensorTestCase.java b/vespajlib/src/test/java/com/yahoo/tensor/functions/DynamicTensorTestCase.java index e16b7b90a1d..2a34bc11b76 100644 --- a/vespajlib/src/test/java/com/yahoo/tensor/functions/DynamicTensorTestCase.java +++ b/vespajlib/src/test/java/com/yahoo/tensor/functions/DynamicTensorTestCase.java @@ -6,6 +6,7 @@ import com.yahoo.tensor.TensorAddress; import com.yahoo.tensor.TensorType; import com.yahoo.tensor.evaluation.EvaluationContext; import com.yahoo.tensor.evaluation.Name; +import org.junit.Ignore; import org.junit.Test; import java.util.Collections; @@ -34,6 +35,19 @@ public class DynamicTensorTestCase { assertEquals("tensor(x{}):{{x:a}:5.0}", t2.toString()); } + @Ignore // Enable for benchmarking + public void benchMarkTensorAddressBuilder() { + long start = System.nanoTime(); + TensorType sparse = TensorType.fromSpec("tensor(x{})"); + for (int i=0; i < 10000; i++) { + TensorAddress.Builder builder = new TensorAddress.Builder(sparse); + for (int j=0; j < 1000; j++) { + builder.add("x", String.valueOf(j)); + } + } + System.out.println("Took " + (System.nanoTime() - start) + " ns"); + } + private static class Constant implements ScalarFunction<Name> { private final double value; diff --git a/vespajlib/src/test/java/com/yahoo/text/Ascii7BitMatcherTestCase.java b/vespajlib/src/test/java/com/yahoo/text/Ascii7BitMatcherTestCase.java new file mode 100644 index 00000000000..3f628b109f5 --- /dev/null +++ b/vespajlib/src/test/java/com/yahoo/text/Ascii7BitMatcherTestCase.java @@ -0,0 +1,44 @@ +package com.yahoo.text; + +import org.junit.Assert; +import org.junit.Test; + +import static org.junit.Assert.*; + +public class Ascii7BitMatcherTestCase { + @Test + public void testThatListedCharsAreLegal() { + assertTrue(new Ascii7BitMatcher("a").matches("aaaa")); + assertTrue(new Ascii7BitMatcher("ab").matches("abbbbbbbbb")); + assertTrue(new Ascii7BitMatcher("ab").matches("bbbbbbbbbba")); + assertTrue(new Ascii7BitMatcher("1").matches("1")); + } + @Test + public void requireThatNotListedCharsFail() { + assertFalse(new Ascii7BitMatcher("a").matches("b")); + } + + @Test + public void testThatLegalFirstAndRestPass() { + assertTrue(new Ascii7BitMatcher("a", "").matches("a")); + assertTrue(new Ascii7BitMatcher("a", "b").matches("abbbbbbbbb")); + assertTrue(new Ascii7BitMatcher("abc", "0123").matches("a123120")); + } + @Test + public void requireThatIllegalFirstOrSecondFail() { + assertFalse(new Ascii7BitMatcher("a", "").matches("aa")); + assertFalse(new Ascii7BitMatcher("a", "b").matches("aa")); + assertFalse(new Ascii7BitMatcher("", "a").matches("a")); + assertFalse(new Ascii7BitMatcher("a", "b").matches("bb")); + assertFalse(new Ascii7BitMatcher("a", "b").matches("abbbbbx")); + } + @Test + public void requireThatNonAsciiFailConstruction() { + try { + new Ascii7BitMatcher("aæb"); + Assert.fail("'æ' should not be allowed"); + } catch (IllegalArgumentException e) { + assertEquals("Char 'æ' at position 1 is not valid ascii 7 bit char", e.getMessage()); + } + } +} |