summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Jon Bratseth <bratseth@gmail.com>, 2022-02-06 13:55:58 +0100
committer: GitHub <noreply@github.com>, 2022-02-06 13:55:58 +0100
commit: b7615a4a248be8809a2a5c9d3e95f18aa825dc82 (patch)
tree: 61d081189ebd9bbee5c1f4101986723009593346
parent: b84ef936b1cedce0b99f79e03b2fe25a8db5f7c3 (diff)
parent: 69ce8870e81669dd0fa56b9ec6487949b7b60ffb (diff)
Merge pull request #21073 from vespa-engine/bratseth/il-hash
Add hash function
-rw-r--r-- document/src/main/java/com/yahoo/document/datatypes/Array.java | 5
-rw-r--r-- indexinglanguage/pom.xml | 1
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java | 95
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HexEncodeExpression.java | 1
-rw-r--r-- indexinglanguage/src/main/javacc/IndexingParser.jj | 13
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java | 46
-rw-r--r-- vespajlib/src/main/java/com/yahoo/collections/BobHash.java | 51
7 files changed, 179 insertions, 33 deletions
diff --git a/document/src/main/java/com/yahoo/document/datatypes/Array.java b/document/src/main/java/com/yahoo/document/datatypes/Array.java
index 11a8eb7a350..672690bafad 100644
--- a/document/src/main/java/com/yahoo/document/datatypes/Array.java
+++ b/document/src/main/java/com/yahoo/document/datatypes/Array.java
@@ -21,7 +21,7 @@ import java.util.ListIterator;
import java.util.RandomAccess;
/**
- * FieldValue which encapsulates a Array value
+ * FieldValue which encapsulates an Array value
*
* @author Einar M R Rosenvinge
*/
@@ -42,8 +42,7 @@ public final class Array<T extends FieldValue> extends CollectionFieldValue<T> i
this(type);
for (T v : values) {
if (!((ArrayDataType)type).getNestedType().isValueCompatible(v)) {
- throw new IllegalArgumentException("FieldValue " + v +
- " is not compatible with " + type + ".");
+ throw new IllegalArgumentException("FieldValue " + v + " is not compatible with " + type + ".");
}
}
this.values.addAll(values);
diff --git a/indexinglanguage/pom.xml b/indexinglanguage/pom.xml
index efca7479faf..f9ee18a4602 100644
--- a/indexinglanguage/pom.xml
+++ b/indexinglanguage/pom.xml
@@ -47,7 +47,6 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
- <scope>test</scope>
</dependency>
</dependencies>
<build>
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java
new file mode 100644
index 00000000000..5b04720dad4
--- /dev/null
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java
@@ -0,0 +1,95 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.indexinglanguage.expressions;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import com.yahoo.document.ArrayDataType;
+import com.yahoo.document.DataType;
+import com.yahoo.document.DocumentType;
+import com.yahoo.document.Field;
+import com.yahoo.document.datatypes.IntegerFieldValue;
+import com.yahoo.document.datatypes.LongFieldValue;
+import com.yahoo.document.datatypes.StringFieldValue;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Hashes a string value to an int or a long, depending on the type of the field the statement writes to.
+ *
+ * @author bratseth
+ */
+public class HashExpression extends Expression {
+
+ private final HashFunction hasher = Hashing.sipHash24(); // Guava's SipHash-2-4 hash function
+
+ /** The type we hash into (int or long); assigned by setStatementOutput or doVerify, unset until then. */
+ private DataType targetType;
+
+ public HashExpression() {
+ super(DataType.STRING); // presumably declares STRING as the required input type - confirm in Expression
+ }
+
+ @Override
+ public void setStatementOutput(DocumentType documentType, Field field) {
+ if ( ! canStoreHash(field.getDataType()))
+ throw new IllegalArgumentException("Cannot use the hash function on an indexing statement for " +
+ field.getName() +
+ ": The hash function can only be used when the target field " +
+ "is int or long, not " + field.getDataType());
+ targetType = field.getDataType(); // remember the accepted type; doExecute branches on it
+ }
+
+ @Override
+ protected void doExecute(ExecutionContext context) {
+ StringFieldValue input = (StringFieldValue) context.getValue();
+ if (targetType.equals(DataType.INT))
+ context.setValue(new IntegerFieldValue(hashToInt(input.getString())));
+ else if (targetType.equals(DataType.LONG))
+ context.setValue(new LongFieldValue(hashToLong(input.getString())));
+ else
+ throw new IllegalStateException(); // unreachable: targetType is only assigned after canStoreHash accepted it
+ }
+
+ private int hashToInt(String value) {
+ return hasher.hashString(value, StandardCharsets.UTF_8).asInt();
+ }
+
+ private long hashToLong(String value) {
+ return hasher.hashString(value, StandardCharsets.UTF_8).asLong();
+ }
+
+ @Override
+ protected void doVerify(VerificationContext context) {
+ String outputField = context.getOutputField();
+ if (outputField == null)
+ throw new VerificationException(this, "No output field in this statement: " +
+ "Don't know what value to hash to.");
+ DataType outputFieldType = context.getInputType(this, outputField);
+ if ( ! canStoreHash(outputFieldType))
+ throw new VerificationException(this, "The type of the output field " + outputField +
+ " is not int or long but " + outputFieldType);
+ targetType = outputFieldType; // verification also (re)assigns the target type
+ context.setValueType(createdOutputType());
+ }
+
+ private boolean canStoreHash(DataType type) {
+ if (type.equals(DataType.INT)) return true;
+ if (type.equals(DataType.LONG)) return true;
+ return false;
+ }
+
+ @Override
+ public DataType createdOutputType() {
+ return targetType;
+ }
+
+ @Override
+ public String toString() { return "hash"; }
+
+ @Override
+ public int hashCode() { return 987; } // constant, consistent with equals: all HashExpression instances are equal
+
+ @Override
+ public boolean equals(Object o) { return o instanceof HashExpression; }
+
+}
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HexEncodeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HexEncodeExpression.java
index 5e7288b8ecc..ca2be7c3400 100644
--- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HexEncodeExpression.java
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HexEncodeExpression.java
@@ -13,6 +13,7 @@ public final class HexEncodeExpression extends Expression {
public HexEncodeExpression() {
super(DataType.LONG);
}
+
@Override
protected void doExecute(ExecutionContext context) {
long input = ((LongFieldValue) context.getValue()).getLong();
diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj
index bdbecadecd3..e6b21f7c07b 100644
--- a/indexinglanguage/src/main/javacc/IndexingParser.jj
+++ b/indexinglanguage/src/main/javacc/IndexingParser.jj
@@ -164,6 +164,7 @@ TOKEN :
<GET_FIELD: "get_field"> |
<GET_VAR: "get_var"> |
<GUARD: "guard"> |
+ <HASH: "hash"> |
<HEX_DECODE: "hexdecode"> |
<HEX_ENCODE: "hexencode"> |
<HOST_NAME: "hostname"> |
@@ -283,13 +284,14 @@ Expression value() :
val = base64EncodeExp() |
val = clearStateExp() |
val = echoExp() |
- val = embedExp() |
+ val = embedExp() |
val = exactExp() |
val = flattenExp() |
val = forEachExp() |
val = getFieldExp() |
val = getVarExp() |
val = guardExp() |
+ val = hashExp() |
val = hexDecodeExp() |
val = hexEncodeExp() |
val = hostNameExp() |
@@ -419,6 +421,12 @@ Expression guardExp() :
{ return new GuardExpression(val); }
}
+Expression hashExp() : { } // matches the bare HASH token; the expression takes no arguments
+{
+ ( <HASH> )
+ { return new HashExpression(); }
+}
+
Expression hexDecodeExp() : { }
{
( <HEX_DECODE> )
@@ -744,12 +752,13 @@ String identifier() :
<ECHO> |
<EXACT> |
<ELSE> |
- <EMBED> |
+ <EMBED> |
<FLATTEN> |
<FOR_EACH> |
<GET_FIELD> |
<GET_VAR> |
<GUARD> |
+ <HASH> |
<HEX_DECODE> |
<HEX_ENCODE> |
<HOST_NAME> |
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java
index f9a6f2225b3..778d95fcaef 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java
@@ -9,7 +9,6 @@ import com.yahoo.document.TensorDataType;
import com.yahoo.document.datatypes.BoolFieldValue;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.document.datatypes.TensorFieldValue;
-import com.yahoo.language.Language;
import com.yahoo.language.process.Embedder;
import com.yahoo.language.simple.SimpleLinguistics;
import com.yahoo.tensor.Tensor;
@@ -100,6 +99,50 @@ public class ScriptTestCase {
}
@Test
+ public void testIntHash() throws ParseException {
+ var expression = Expression.fromString("input myText | hash | attribute 'myInt'"); // hash feeding an int attribute
+
+ SimpleTestAdapter adapter = new SimpleTestAdapter();
+ adapter.createField(new Field("myText", DataType.STRING));
+ var intField = new Field("myInt", DataType.INT);
+ adapter.createField(intField);
+ adapter.setValue("myText", new StringFieldValue("input text"));
+ expression.setStatementOutput(new DocumentType("myDocument"), intField);
+
+ // Verification is necessary to resolve the output type before executing
+ VerificationContext verificationContext = new VerificationContext(adapter);
+ assertEquals(DataType.INT, expression.verify(verificationContext));
+
+ ExecutionContext context = new ExecutionContext(adapter);
+ context.setValue(new StringFieldValue("input text"));
+ expression.execute(context);
+ assertTrue(adapter.values.containsKey("myInt"));
+ assertEquals(-1425622096, adapter.values.get("myInt").getWrappedValue()); // pinned int hash of "input text"
+ }
+
+ @Test
+ public void testLongHash() throws ParseException {
+ var expression = Expression.fromString("input myText | hash | attribute 'myLong'"); // hash feeding a long attribute
+
+ SimpleTestAdapter adapter = new SimpleTestAdapter();
+ adapter.createField(new Field("myText", DataType.STRING));
+ var intField = new Field("myLong", DataType.LONG); // NOTE(review): named intField but holds the LONG field
+ adapter.createField(intField);
+ adapter.setValue("myText", new StringFieldValue("input text"));
+ expression.setStatementOutput(new DocumentType("myDocument"), intField);
+
+ // Verification is necessary to resolve the output type before executing
+ VerificationContext verificationContext = new VerificationContext(adapter);
+ assertEquals(DataType.LONG, expression.verify(verificationContext));
+
+ ExecutionContext context = new ExecutionContext(adapter);
+ context.setValue(new StringFieldValue("input text"));
+ expression.execute(context);
+ assertTrue(adapter.values.containsKey("myLong"));
+ assertEquals(7678158186624760752L, adapter.values.get("myLong").getWrappedValue()); // pinned long hash of "input text"
+ }
+
+ @Test
public void testEmbed() throws ParseException {
TensorType tensorType = TensorType.fromSpec("tensor(d[4])");
var expression = Expression.fromString("input myText | embed | attribute 'myTensor'",
@@ -120,7 +163,6 @@ public class ScriptTestCase {
ExecutionContext context = new ExecutionContext(adapter);
context.setValue(new StringFieldValue("input text"));
expression.execute(context);
- assertNotNull(context);
assertTrue(adapter.values.containsKey("myTensor"));
assertEquals(Tensor.from(tensorType, "[7,3,0,0]"),
((TensorFieldValue)adapter.values.get("myTensor")).getTensor().get());
diff --git a/vespajlib/src/main/java/com/yahoo/collections/BobHash.java b/vespajlib/src/main/java/com/yahoo/collections/BobHash.java
index d133af2ea84..3d1e82743cc 100644
--- a/vespajlib/src/main/java/com/yahoo/collections/BobHash.java
+++ b/vespajlib/src/main/java/com/yahoo/collections/BobHash.java
@@ -153,44 +153,45 @@ public class BobHash {
// handle the last 11 bytes
c += k.length;
switch (len) {
- // all the case statements fall through
- case 11:
- c += (unsign(k[offset + 10]) << 24);
+ // all the case statements fall through
+ case 11:
+ c += (unsign(k[offset + 10]) << 24);
- case 10:
- c += (unsign(k[offset + 9]) << 16);
+ case 10:
+ c += (unsign(k[offset + 9]) << 16);
- case 9:
- c += (unsign(k[offset + 8]) << 8);
+ case 9:
+ c += (unsign(k[offset + 8]) << 8);
- /* the first byte of c is reserved for the length */
- case 8:
- b += (unsign(k[offset + 7]) << 24);
+ /* the first byte of c is reserved for the length */
+ case 8:
+ b += (unsign(k[offset + 7]) << 24);
- case 7:
- b += (unsign(k[offset + 6]) << 16);
+ case 7:
+ b += (unsign(k[offset + 6]) << 16);
- case 6:
- b += (unsign(k[offset + 5]) << 8);
+ case 6:
+ b += (unsign(k[offset + 5]) << 8);
- case 5:
- b += unsign(k[offset + 4]);
+ case 5:
+ b += unsign(k[offset + 4]);
- case 4:
- a += (unsign(k[offset + 3]) << 24);
+ case 4:
+ a += (unsign(k[offset + 3]) << 24);
- case 3:
- a += (unsign(k[offset + 2]) << 16);
+ case 3:
+ a += (unsign(k[offset + 2]) << 16);
- case 2:
- a += (unsign(k[offset + 1]) << 8);
+ case 2:
+ a += (unsign(k[offset + 1]) << 8);
- case 1:
- a += unsign(k[offset + 0]);
+ case 1:
+ a += unsign(k[offset + 0]);
- /* case 0: nothing left to add */
+ /* case 0: nothing left to add */
}
abcBuffer = mix(a, b, c);
return abcBuffer[2];
}
+
}