aboutsummaryrefslogtreecommitdiffstats
path: root/indexinglanguage
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2022-02-04 14:54:44 +0100
committerJon Bratseth <bratseth@gmail.com>2022-02-04 14:54:44 +0100
commit7b992b88818a931665441c3fb0f0c16824116567 (patch)
tree66d6ff52ed406bad1b0017deac104e96189a2c52 /indexinglanguage
parente67031fd521c68ca66fdf897f8c6b0fd5a395f45 (diff)
Add hash function
Diffstat (limited to 'indexinglanguage')
-rw-r--r--indexinglanguage/pom.xml1
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java91
-rw-r--r--indexinglanguage/src/main/javacc/IndexingParser.jj13
-rw-r--r--indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java47
4 files changed, 148 insertions, 4 deletions
diff --git a/indexinglanguage/pom.xml b/indexinglanguage/pom.xml
index efca7479faf..f9ee18a4602 100644
--- a/indexinglanguage/pom.xml
+++ b/indexinglanguage/pom.xml
@@ -47,7 +47,6 @@
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
- <scope>test</scope>
</dependency>
</dependencies>
<build>
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java
new file mode 100644
index 00000000000..c69ceda2210
--- /dev/null
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java
@@ -0,0 +1,91 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.indexinglanguage.expressions;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import com.yahoo.document.DataType;
+import com.yahoo.document.DocumentType;
+import com.yahoo.document.Field;
+import com.yahoo.document.datatypes.IntegerFieldValue;
+import com.yahoo.document.datatypes.LongFieldValue;
+import com.yahoo.document.datatypes.StringFieldValue;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Hashes a string value to a long or int (by type inference on the target value).
+ *
+ * <p>The target type is taken from the output field of the indexing statement, either when
+ * {@code setStatementOutput} is called or during verification. It must be resolved before
+ * the expression is executed.
+ *
+ * @author bratseth
+ */
+public class HashExpression extends Expression {
+
+    private final HashFunction hasher = Hashing.sipHash24();
+
+    /** The type we are hashing into (int or long). Null until resolved from the output field. */
+    private DataType targetType;
+
+    /** Creates a hash expression which requires a string as input. */
+    public HashExpression() {
+        super(DataType.STRING);
+    }
+
+    /**
+     * Records the type of the field this statement writes to as the hash target type.
+     *
+     * @throws IllegalArgumentException if the field is neither int nor long
+     */
+    @Override
+    public void setStatementOutput(DocumentType documentType, Field field) {
+        if (field.getDataType() != DataType.INT && field.getDataType() != DataType.LONG)
+            throw new IllegalArgumentException("Cannot use the hash function on an indexing statement for " +
+                                               field.getName() +
+                                               ": The hash function can only be used when the target field is int or long, not " +
+                                               field.getDataType());
+        targetType = field.getDataType();
+    }
+
+    /**
+     * Replaces the current string value of the context by its hash, as an int or long
+     * depending on the resolved target type.
+     *
+     * @throws IllegalStateException if the target type has not been resolved
+     */
+    @Override
+    protected void doExecute(ExecutionContext context) {
+        StringFieldValue input = (StringFieldValue) context.getValue();
+        // Fail with an explanation rather than a bare NullPointerException below
+        if (targetType == null)
+            throw new IllegalStateException("The target type of hash has not been resolved: " +
+                                            "The statement output must be set or verified before executing");
+        if (targetType.equals(DataType.INT))
+            context.setValue(new IntegerFieldValue(hashToInt(input.getString())));
+        else if (targetType.equals(DataType.LONG))
+            context.setValue(new LongFieldValue(hashToLong(input.getString())));
+        else
+            throw new IllegalStateException(); // won't happen: targetType is only ever set to int or long
+    }
+
+    /** Returns the hash of the UTF-8 encoding of the given string, as an int. */
+    private int hashToInt(String value) {
+        return hasher.hashString(value, StandardCharsets.UTF_8).asInt();
+    }
+
+    /** Returns the hash of the UTF-8 encoding of the given string, as a long. */
+    private long hashToLong(String value) {
+        return hasher.hashString(value, StandardCharsets.UTF_8).asLong();
+    }
+
+    /**
+     * Resolves the target type from the output field of the statement being verified
+     * and sets it as the value type produced by this.
+     *
+     * @throws VerificationException if there is no output field, or it is neither int nor long
+     */
+    @Override
+    protected void doVerify(VerificationContext context) {
+        String outputField = context.getOutputField();
+        if (outputField == null)
+            throw new VerificationException(this, "No output field in this statement: " +
+                                                  "Don't know what value to hash to.");
+        DataType outputFieldType = context.getInputType(this, outputField);
+        if (outputFieldType != DataType.INT && outputFieldType != DataType.LONG)
+            // Report the offending type, not the field name a second time
+            throw new VerificationException(this, "The type of the output field " + outputField +
+                                                  " is not an int or long but " + outputFieldType);
+        targetType = outputFieldType;
+        context.setValueType(createdOutputType());
+    }
+
+    /** Returns the resolved target type (int or long), or null if it is not resolved yet. */
+    @Override
+    public DataType createdOutputType() {
+        return targetType;
+    }
+
+    /** Returns the indexing language keyword of this expression. */
+    @Override
+    public String toString() { return "hash"; }
+
+    // All hash expressions are equal, so a constant hash code is consistent with equals
+    @Override
+    public int hashCode() { return 1; }
+
+    @Override
+    public boolean equals(Object o) { return o instanceof HashExpression; }
+
+}
diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj
index bdbecadecd3..e6b21f7c07b 100644
--- a/indexinglanguage/src/main/javacc/IndexingParser.jj
+++ b/indexinglanguage/src/main/javacc/IndexingParser.jj
@@ -164,6 +164,7 @@ TOKEN :
<GET_FIELD: "get_field"> |
<GET_VAR: "get_var"> |
<GUARD: "guard"> |
+ <HASH: "hash"> |
<HEX_DECODE: "hexdecode"> |
<HEX_ENCODE: "hexencode"> |
<HOST_NAME: "hostname"> |
@@ -283,13 +284,14 @@ Expression value() :
val = base64EncodeExp() |
val = clearStateExp() |
val = echoExp() |
- val = embedExp() |
+ val = embedExp() |
val = exactExp() |
val = flattenExp() |
val = forEachExp() |
val = getFieldExp() |
val = getVarExp() |
val = guardExp() |
+ val = hashExp() |
val = hexDecodeExp() |
val = hexEncodeExp() |
val = hostNameExp() |
@@ -419,6 +421,12 @@ Expression guardExp() :
{ return new GuardExpression(val); }
}
+// Matches the 'hash' keyword, which takes no arguments, and produces a new HashExpression.
+Expression hashExp() : { }
+{
+ ( <HASH> )
+ { return new HashExpression(); }
+}
+
Expression hexDecodeExp() : { }
{
( <HEX_DECODE> )
@@ -744,12 +752,13 @@ String identifier() :
<ECHO> |
<EXACT> |
<ELSE> |
- <EMBED> |
+ <EMBED> |
<FLATTEN> |
<FOR_EACH> |
<GET_FIELD> |
<GET_VAR> |
<GUARD> |
+ <HASH> |
<HEX_DECODE> |
<HEX_ENCODE> |
<HOST_NAME> |
diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java
index f9a6f2225b3..b8254f133bc 100644
--- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java
+++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java
@@ -9,7 +9,6 @@ import com.yahoo.document.TensorDataType;
import com.yahoo.document.datatypes.BoolFieldValue;
import com.yahoo.document.datatypes.StringFieldValue;
import com.yahoo.document.datatypes.TensorFieldValue;
-import com.yahoo.language.Language;
import com.yahoo.language.process.Embedder;
import com.yahoo.language.simple.SimpleLinguistics;
import com.yahoo.tensor.Tensor;
@@ -100,6 +99,52 @@ public class ScriptTestCase {
}
@Test
+    public void testIntHash() throws ParseException {
+        // A document with a string source field and an int field receiving the hash
+        var hashField = new Field("myInt", DataType.INT);
+        var adapter = new SimpleTestAdapter();
+        adapter.createField(new Field("myText", DataType.STRING));
+        adapter.createField(hashField);
+        adapter.setValue("myText", new StringFieldValue("input text"));
+
+        var expression = Expression.fromString("input myText | hash | attribute 'myInt'");
+        expression.setStatementOutput(new DocumentType("myDocument"), hashField);
+
+        // Necessary to resolve output type
+        var verificationContext = new VerificationContext(adapter);
+        assertEquals(DataType.INT, expression.verify(verificationContext));
+
+        // Run the statement and check the hash written to the int attribute
+        var context = new ExecutionContext(adapter);
+        context.setValue(new StringFieldValue("input text"));
+        expression.execute(context);
+        assertNotNull(context);
+        assertTrue(adapter.values.containsKey("myInt"));
+        assertEquals(-1425622096, adapter.values.get("myInt").getWrappedValue());
+    }
+
+ @Test
+    public void testLongHash() throws ParseException {
+        // A document with a string source field and a long field receiving the hash
+        var longField = new Field("myLong", DataType.LONG);
+        var adapter = new SimpleTestAdapter();
+        adapter.createField(new Field("myText", DataType.STRING));
+        adapter.createField(longField);
+        adapter.setValue("myText", new StringFieldValue("input text"));
+
+        var expression = Expression.fromString("input myText | hash | attribute 'myLong'");
+        expression.setStatementOutput(new DocumentType("myDocument"), longField);
+
+        // Necessary to resolve output type
+        var verificationContext = new VerificationContext(adapter);
+        assertEquals(DataType.LONG, expression.verify(verificationContext));
+
+        // Run the statement and check the hash written to the long attribute
+        var context = new ExecutionContext(adapter);
+        context.setValue(new StringFieldValue("input text"));
+        expression.execute(context);
+        assertNotNull(context);
+        assertTrue(adapter.values.containsKey("myLong"));
+        assertEquals(7678158186624760752L, adapter.values.get("myLong").getWrappedValue());
+    }
+
+ @Test
public void testEmbed() throws ParseException {
TensorType tensorType = TensorType.fromSpec("tensor(d[4])");
var expression = Expression.fromString("input myText | embed | attribute 'myTensor'",