summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@gmail.com> 2022-02-04 14:54:44 +0100
committer Jon Bratseth <bratseth@gmail.com> 2022-02-04 14:54:44 +0100
commit 7b992b88818a931665441c3fb0f0c16824116567 (patch)
tree 66d6ff52ed406bad1b0017deac104e96189a2c52
parent e67031fd521c68ca66fdf897f8c6b0fd5a395f45 (diff)
Add hash function
-rw-r--r-- indexinglanguage/pom.xml 1
-rw-r--r-- indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java 91
-rw-r--r-- indexinglanguage/src/main/javacc/IndexingParser.jj 13
-rw-r--r-- indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java 47
-rw-r--r-- vespajlib/src/main/java/com/yahoo/collections/BobHash.java 51
5 files changed, 174 insertions, 29 deletions
diff --git a/indexinglanguage/pom.xml b/indexinglanguage/pom.xml
index efca7479faf..f9ee18a4602 100644
--- a/indexinglanguage/pom.xml
+++ b/indexinglanguage/pom.xml
@@ -47,7 +47,6 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
- <scope>test</scope>
</dependency>
</dependencies>
<build>
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java
new file mode 100644
index 00000000000..c69ceda2210
--- /dev/null
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java
@@ -0,0 +1,91 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.indexinglanguage.expressions;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import com.yahoo.document.DataType;
+import com.yahoo.document.DocumentType;
+import com.yahoo.document.Field;
+import com.yahoo.document.datatypes.IntegerFieldValue;
+import com.yahoo.document.datatypes.LongFieldValue;
+import com.yahoo.document.datatypes.StringFieldValue;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Hashes a string value (as UTF-8, using SipHash-2-4) to a long or int (by type inference on the target value).
+ *
+ * @author bratseth
+ */
+public class HashExpression extends Expression {
+
+    private final HashFunction hasher = Hashing.sipHash24();
+
+    /** The type we are hashing into: int or long. Null until resolved by setStatementOutput or doVerify. */
+    private DataType targetType;
+
+    public HashExpression() {
+        super(DataType.STRING);
+    }
+
+    /** Takes the target type from the given output field, which must be of type int or long. */
+    @Override
+    public void setStatementOutput(DocumentType documentType, Field field) {
+        if (field.getDataType() != DataType.INT && field.getDataType() != DataType.LONG)
+            throw new IllegalArgumentException("Cannot use the hash function on an indexing statement for " +
+                                               field.getName() +
+                                               ": The hash function can only be used when the target field is int or long, not " +
+                                               field.getDataType());
+        targetType = field.getDataType();
+    }
+
+    /** Replaces the current string value of the context by its hash, as an int or long given by the target type. */
+    @Override
+    protected void doExecute(ExecutionContext context) {
+        StringFieldValue input = (StringFieldValue) context.getValue();
+        if (DataType.INT.equals(targetType))
+            context.setValue(new IntegerFieldValue(hashToInt(input.getString())));
+        else if (DataType.LONG.equals(targetType))
+            context.setValue(new LongFieldValue(hashToLong(input.getString())));
+        else // only reachable if the target type was never resolved (it is then null)
+            throw new IllegalStateException("Cannot hash to " + targetType + ": The target type must be int or long");
+    }
+
+    private int hashToInt(String value) {
+        return hasher.hashString(value, StandardCharsets.UTF_8).asInt();
+    }
+
+    private long hashToLong(String value) {
+        return hasher.hashString(value, StandardCharsets.UTF_8).asLong();
+    }
+
+    /** Verifies that the statement has an int or long output field, and takes the target type from it. */
+    @Override
+    protected void doVerify(VerificationContext context) {
+        String outputField = context.getOutputField();
+        if (outputField == null)
+            throw new VerificationException(this, "No output field in this statement: " +
+                                                  "Don't know what value to hash to.");
+        DataType outputFieldType = context.getInputType(this, outputField);
+        if (outputFieldType != DataType.INT && outputFieldType != DataType.LONG)
+            throw new VerificationException(this, "The type of the output field " + outputField +
+                                                  " is not an int or long but " + outputFieldType);
+        targetType = outputFieldType;
+        context.setValueType(createdOutputType());
+    }
+
+    @Override
+    public DataType createdOutputType() {
+        return targetType;
+    }
+
+    @Override
+    public String toString() { return "hash"; }
+
+    @Override
+    public int hashCode() { return 1; } // constant, consistent with equals: all hash expressions are equal
+
+    @Override
+    public boolean equals(Object o) { return o instanceof HashExpression; } // the target type is not compared
+
+}
diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj
index bdbecadecd3..e6b21f7c07b 100644
--- a/indexinglanguage/src/main/javacc/IndexingParser.jj
+++ b/indexinglanguage/src/main/javacc/IndexingParser.jj
@@ -164,6 +164,7 @@ TOKEN :
<GET_FIELD: "get_field"> |
<GET_VAR: "get_var"> |
<GUARD: "guard"> |
+ <HASH: "hash"> |
<HEX_DECODE: "hexdecode"> |
<HEX_ENCODE: "hexencode"> |
<HOST_NAME: "hostname"> |
@@ -283,13 +284,14 @@ Expression value() :
val = base64EncodeExp() |
val = clearStateExp() |
val = echoExp() |
- val = embedExp() |
+ val = embedExp() |
val = exactExp() |
val = flattenExp() |
val = forEachExp() |
val = getFieldExp() |
val = getVarExp() |
val = guardExp() |
+ val = hashExp() |
val = hexDecodeExp() |
val = hexEncodeExp() |
val = hostNameExp() |
@@ -419,6 +421,12 @@ Expression guardExp() :
{ return new GuardExpression(val); }
}
+Expression hashExp() : { }
+{
+ ( <HASH> )
+ { return new HashExpression(); } // the 'hash' keyword takes no arguments
+}
+
Expression hexDecodeExp() : { }
{
( <HEX_DECODE> )
@@ -744,12 +752,13 @@ String identifier() :
<ECHO> |
<EXACT> |
<ELSE> |
- <EMBED> |
+ <EMBED> |
<FLATTEN> |
<FOR_EACH> |
<GET_FIELD> |
<GET_VAR> |
<GUARD> |
+ <HASH> |
<HEX_DECODE> |
<HEX_ENCODE> |
<HOST_NAME> |
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java
index f9a6f2225b3..b8254f133bc 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java
@@ -9,7 +9,6 @@ import com.yahoo.document.TensorDataType;
import com.yahoo.document.datatypes.BoolFieldValue;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.document.datatypes.TensorFieldValue;
-import com.yahoo.language.Language;
import com.yahoo.language.process.Embedder;
import com.yahoo.language.simple.SimpleLinguistics;
import com.yahoo.tensor.Tensor;
@@ -100,6 +99,52 @@ public class ScriptTestCase {
}
@Test
+ public void testIntHash() throws ParseException { // hashes a string input into an int attribute
+ var expression = Expression.fromString("input myText | hash | attribute 'myInt'");
+
+ SimpleTestAdapter adapter = new SimpleTestAdapter();
+ adapter.createField(new Field("myText", DataType.STRING));
+ var intField = new Field("myInt", DataType.INT);
+ adapter.createField(intField);
+ adapter.setValue("myText", new StringFieldValue("input text"));
+ expression.setStatementOutput(new DocumentType("myDocument"), intField);
+
+ // Verification is necessary to resolve the output type, which should be int as the target field is an int
+ VerificationContext verificationContext = new VerificationContext(adapter);
+ assertEquals(DataType.INT, expression.verify(verificationContext));
+
+ ExecutionContext context = new ExecutionContext(adapter);
+ context.setValue(new StringFieldValue("input text"));
+ expression.execute(context);
+ assertNotNull(context);
+ assertTrue(adapter.values.containsKey("myInt"));
+ assertEquals(-1425622096, adapter.values.get("myInt").getWrappedValue()); // the expected int hash of "input text"
+ }
+
+    /** Hashes a string input into a long attribute and checks both the verified type and the written value. */
+    @Test
+    public void testLongHash() throws ParseException {
+        var expression = Expression.fromString("input myText | hash | attribute 'myLong'");
+        SimpleTestAdapter adapter = new SimpleTestAdapter();
+        adapter.createField(new Field("myText", DataType.STRING));
+        var longField = new Field("myLong", DataType.LONG);
+        adapter.createField(longField);
+        adapter.setValue("myText", new StringFieldValue("input text"));
+        expression.setStatementOutput(new DocumentType("myDocument"), longField);
+
+        // Necessary to resolve output type
+        VerificationContext verificationContext = new VerificationContext(adapter);
+        assertEquals(DataType.LONG, expression.verify(verificationContext));
+
+        ExecutionContext context = new ExecutionContext(adapter);
+        context.setValue(new StringFieldValue("input text"));
+        expression.execute(context);
+        assertNotNull(context);
+        assertTrue(adapter.values.containsKey("myLong"));
+        assertEquals(7678158186624760752L, adapter.values.get("myLong").getWrappedValue());
+    }
+
+ @Test
public void testEmbed() throws ParseException {
TensorType tensorType = TensorType.fromSpec("tensor(d[4])");
var expression = Expression.fromString("input myText | embed | attribute 'myTensor'",
diff --git a/vespajlib/src/main/java/com/yahoo/collections/BobHash.java b/vespajlib/src/main/java/com/yahoo/collections/BobHash.java
index d133af2ea84..3d1e82743cc 100644
--- a/vespajlib/src/main/java/com/yahoo/collections/BobHash.java
+++ b/vespajlib/src/main/java/com/yahoo/collections/BobHash.java
@@ -153,44 +153,45 @@ public class BobHash {
// handle the last 11 bytes
c += k.length;
switch (len) {
- // all the case statements fall through
- case 11:
- c += (unsign(k[offset + 10]) << 24);
+ // all the case statements fall through
+ case 11:
+ c += (unsign(k[offset + 10]) << 24);
- case 10:
- c += (unsign(k[offset + 9]) << 16);
+ case 10:
+ c += (unsign(k[offset + 9]) << 16);
- case 9:
- c += (unsign(k[offset + 8]) << 8);
+ case 9:
+ c += (unsign(k[offset + 8]) << 8);
- /* the first byte of c is reserved for the length */
- case 8:
- b += (unsign(k[offset + 7]) << 24);
+ /* the first byte of c is reserved for the length */
+ case 8:
+ b += (unsign(k[offset + 7]) << 24);
- case 7:
- b += (unsign(k[offset + 6]) << 16);
+ case 7:
+ b += (unsign(k[offset + 6]) << 16);
- case 6:
- b += (unsign(k[offset + 5]) << 8);
+ case 6:
+ b += (unsign(k[offset + 5]) << 8);
- case 5:
- b += unsign(k[offset + 4]);
+ case 5:
+ b += unsign(k[offset + 4]);
- case 4:
- a += (unsign(k[offset + 3]) << 24);
+ case 4:
+ a += (unsign(k[offset + 3]) << 24);
- case 3:
- a += (unsign(k[offset + 2]) << 16);
+ case 3:
+ a += (unsign(k[offset + 2]) << 16);
- case 2:
- a += (unsign(k[offset + 1]) << 8);
+ case 2:
+ a += (unsign(k[offset + 1]) << 8);
- case 1:
- a += unsign(k[offset + 0]);
+ case 1:
+ a += unsign(k[offset + 0]);
- /* case 0: nothing left to add */
+ /* case 0: nothing left to add */
}
abcBuffer = mix(a, b, c);
return abcBuffer[2];
}
+
}