aboutsummaryrefslogtreecommitdiffstats
path: root/vespajlib
diff options
context:
space:
mode:
authorHenning Baldersheim <balder@yahoo-inc.com>2019-12-13 07:03:07 +0000
committerHenning Baldersheim <balder@yahoo-inc.com>2019-12-13 07:03:07 +0000
commit2bc18be3d47d16c4cf25f1bc36614cf6e6a25f7b (patch)
tree00e517a3763ae5be598407983563681f0834f6a0 /vespajlib
parentb0098c4d582e071dae1957d38ce4dc2b915e5184 (diff)
Add a faster, but simpler pattern matcher. Builds tensoraddresses 5 times+ faster than the regex one.
Diffstat (limited to 'vespajlib')
-rw-r--r--vespajlib/src/main/java/com/yahoo/tensor/TensorAddress.java11
-rw-r--r--vespajlib/src/main/java/com/yahoo/text/Ascii7BitMatcher.java64
-rw-r--r--vespajlib/src/test/java/com/yahoo/tensor/functions/DynamicTensorTestCase.java14
-rw-r--r--vespajlib/src/test/java/com/yahoo/text/Ascii7BitMatcherTestCase.java44
4 files changed, 129 insertions, 4 deletions
diff --git a/vespajlib/src/main/java/com/yahoo/tensor/TensorAddress.java b/vespajlib/src/main/java/com/yahoo/tensor/TensorAddress.java
index f99645cd996..e705445c5a7 100644
--- a/vespajlib/src/main/java/com/yahoo/tensor/TensorAddress.java
+++ b/vespajlib/src/main/java/com/yahoo/tensor/TensorAddress.java
@@ -1,11 +1,14 @@
// Copyright 2017 Yahoo Holdings. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.tensor;
+import com.yahoo.text.Ascii7BitMatcher;
+
import java.util.Arrays;
import java.util.Optional;
-import java.util.regex.Pattern;
import java.util.stream.Collectors;
+import static com.yahoo.text.Ascii7BitMatcher.charsAndNumbers;
+
/**
* An immutable address to a tensor cell. This simply supplies a value to each dimension
* in a particular tensor type. By itself it is just a list of cell labels, it's meaning depends on its accompanying type.
@@ -159,8 +162,8 @@ public abstract class TensorAddress implements Comparable<TensorAddress> {
/** Supports building of a tensor address */
public static class Builder {
-
- static final private Pattern labelPattern = Pattern.compile("[-,A-Za-z0-9_@]([A-Z,a-z0-9_@$])*");
+ static private final Ascii7BitMatcher labelMatcher = new Ascii7BitMatcher("-_@" + charsAndNumbers(),
+ "_@$" + charsAndNumbers());
private final TensorType type;
private final String[] labels;
@@ -205,7 +208,7 @@ public abstract class TensorAddress implements Comparable<TensorAddress> {
static private void requireIdentifier(String s, String parameterName) {
if (s == null)
throw new IllegalArgumentException(parameterName + " can not be null");
- if ( ! labelPattern.matcher(s).matches())
+ if ( ! labelMatcher.matches(s))
throw new IllegalArgumentException(parameterName + " must be an identifier or integer, not '" + s + "'");
}
diff --git a/vespajlib/src/main/java/com/yahoo/text/Ascii7BitMatcher.java b/vespajlib/src/main/java/com/yahoo/text/Ascii7BitMatcher.java
new file mode 100644
index 00000000000..b821d57de00
--- /dev/null
+++ b/vespajlib/src/main/java/com/yahoo/text/Ascii7BitMatcher.java
@@ -0,0 +1,64 @@
+// Copyright 2019 Oath Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.text;
+
+import java.util.BitSet;
+
+/**
+ * Fast replacement for regex based validators of simple expressions.
+ * It can take a list of legal characters for the the first character,
+ * and another list for the following. Limited to 7bit ascii.
+ * @author baldersheim
+ */
+public class Ascii7BitMatcher {
+ private final BitSet legalFirst;
+ private final BitSet legalRest;
+ private static BitSet createBitSet(String legal) {
+ BitSet legalChars = new BitSet(128);
+ for (int i=0; i < legal.length(); i++) {
+ char c = legal.charAt(i);
+ if (c < 128) {
+ legalChars.set(c);
+ } else {
+ throw new IllegalArgumentException("Char '" + c + "' at position " + i + " is not valid ascii 7 bit char");
+ }
+ }
+ return legalChars;
+ }
+ public Ascii7BitMatcher(String legal) {
+ this(legal, legal);
+ }
+ public Ascii7BitMatcher(String legalFirstChar, String legalChars) {
+ legalFirst = createBitSet(legalFirstChar);
+ legalRest = createBitSet(legalChars);
+ }
+ private static boolean isAscii7Bit(char c) { return c < 128;}
+ private boolean isLegalFirst(char c) {
+ return isAscii7Bit(c) && legalFirst.get(c);
+ }
+ private boolean isLegalRest(char c) {
+ return isAscii7Bit(c) && legalRest.get(c);
+ }
+ public boolean matches(String s) {
+ if (s == null || s.isEmpty() || ! isLegalFirst(s.charAt(0))) return false;
+ for (int i = 1; i < s.length(); i++) {
+ if ( ! isLegalRest(s.charAt(i))) {
+ return false;
+ }
+ }
+ return true;
+ }
+ static public String charsAndNumbers() {
+ char [] chars = new char[26*2+10];
+ int i = 0;
+ for (char c = 'A'; c <= 'Z'; c++) {
+ chars[i++] = c;
+ }
+ for (char c = 'a'; c <= 'z'; c++) {
+ chars[i++] = c;
+ }
+ for (char c = '0'; c <= '9'; c++) {
+ chars[i++] = c;
+ }
+ return new String(chars);
+ }
+}
diff --git a/vespajlib/src/test/java/com/yahoo/tensor/functions/DynamicTensorTestCase.java b/vespajlib/src/test/java/com/yahoo/tensor/functions/DynamicTensorTestCase.java
index e16b7b90a1d..2a34bc11b76 100644
--- a/vespajlib/src/test/java/com/yahoo/tensor/functions/DynamicTensorTestCase.java
+++ b/vespajlib/src/test/java/com/yahoo/tensor/functions/DynamicTensorTestCase.java
@@ -6,6 +6,7 @@ import com.yahoo.tensor.TensorAddress;
import com.yahoo.tensor.TensorType;
import com.yahoo.tensor.evaluation.EvaluationContext;
import com.yahoo.tensor.evaluation.Name;
+import org.junit.Ignore;
import org.junit.Test;
import java.util.Collections;
@@ -34,6 +35,19 @@ public class DynamicTensorTestCase {
assertEquals("tensor(x{}):{{x:a}:5.0}", t2.toString());
}
+ @Ignore // Enable for benchmarking
+ public void benchMarkTensorAddressBuilder() {
+ long start = System.nanoTime();
+ TensorType sparse = TensorType.fromSpec("tensor(x{})");
+ for (int i=0; i < 10000; i++) {
+ TensorAddress.Builder builder = new TensorAddress.Builder(sparse);
+ for (int j=0; j < 1000; j++) {
+ builder.add("x", String.valueOf(j));
+ }
+ }
+ System.out.println("Took " + (System.nanoTime() - start) + " ns");
+ }
+
private static class Constant implements ScalarFunction<Name> {
private final double value;
diff --git a/vespajlib/src/test/java/com/yahoo/text/Ascii7BitMatcherTestCase.java b/vespajlib/src/test/java/com/yahoo/text/Ascii7BitMatcherTestCase.java
new file mode 100644
index 00000000000..3f628b109f5
--- /dev/null
+++ b/vespajlib/src/test/java/com/yahoo/text/Ascii7BitMatcherTestCase.java
@@ -0,0 +1,44 @@
+package com.yahoo.text;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import static org.junit.Assert.*;
+
+public class Ascii7BitMatcherTestCase {
+ @Test
+ public void testThatListedCharsAreLegal() {
+ assertTrue(new Ascii7BitMatcher("a").matches("aaaa"));
+ assertTrue(new Ascii7BitMatcher("ab").matches("abbbbbbbbb"));
+ assertTrue(new Ascii7BitMatcher("ab").matches("bbbbbbbbbba"));
+ assertTrue(new Ascii7BitMatcher("1").matches("1"));
+ }
+ @Test
+ public void requireThatNotListedCharsFail() {
+ assertFalse(new Ascii7BitMatcher("a").matches("b"));
+ }
+
+ @Test
+ public void testThatLegalFirstAndRestPass() {
+ assertTrue(new Ascii7BitMatcher("a", "").matches("a"));
+ assertTrue(new Ascii7BitMatcher("a", "b").matches("abbbbbbbbb"));
+ assertTrue(new Ascii7BitMatcher("abc", "0123").matches("a123120"));
+ }
+ @Test
+ public void requireThatIllegalFirstOrSecondFail() {
+ assertFalse(new Ascii7BitMatcher("a", "").matches("aa"));
+ assertFalse(new Ascii7BitMatcher("a", "b").matches("aa"));
+ assertFalse(new Ascii7BitMatcher("", "a").matches("a"));
+ assertFalse(new Ascii7BitMatcher("a", "b").matches("bb"));
+ assertFalse(new Ascii7BitMatcher("a", "b").matches("abbbbbx"));
+ }
+ @Test
+ public void requireThatNonAsciiFailConstruction() {
+ try {
+ new Ascii7BitMatcher("aæb");
+ Assert.fail("'æ' should not be allowed");
+ } catch (IllegalArgumentException e) {
+ assertEquals("Char 'æ' at position 1 is not valid ascii 7 bit char", e.getMessage());
+ }
+ }
+}