aboutsummaryrefslogtreecommitdiffstats
path: root/indexinglanguage/src/main
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@gmail.com>2022-02-04 14:54:44 +0100
committerJon Bratseth <bratseth@gmail.com>2022-02-04 14:54:44 +0100
commit7b992b88818a931665441c3fb0f0c16824116567 (patch)
tree66d6ff52ed406bad1b0017deac104e96189a2c52 /indexinglanguage/src/main
parente67031fd521c68ca66fdf897f8c6b0fd5a395f45 (diff)
Add hash function
Diffstat (limited to 'indexinglanguage/src/main')
-rw-r--r--indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java91
-rw-r--r--indexinglanguage/src/main/javacc/IndexingParser.jj13
2 files changed, 102 insertions, 2 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java
new file mode 100644
index 00000000000..c69ceda2210
--- /dev/null
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java
@@ -0,0 +1,91 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.indexinglanguage.expressions;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import com.yahoo.document.DataType;
+import com.yahoo.document.DocumentType;
+import com.yahoo.document.Field;
+import com.yahoo.document.datatypes.IntegerFieldValue;
+import com.yahoo.document.datatypes.LongFieldValue;
+import com.yahoo.document.datatypes.StringFieldValue;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Hashes a string value to a long or int (by type inference on the target value).
+ *
+ * @author bratseth
+ */
+public class HashExpression extends Expression {
+
+ private final HashFunction hasher = Hashing.sipHash24();
+
+ /** The destination the embedding will be written to on the form [schema name].[field name] */
+ private String destination;
+
+ /** The target type we are embedding into. */
+ private DataType targetType;
+
+ public HashExpression() {
+ super(DataType.STRING);
+ }
+
+ @Override
+ public void setStatementOutput(DocumentType documentType, Field field) {
+ if (field.getDataType() != DataType.INT && field.getDataType() != DataType.LONG)
+ throw new IllegalArgumentException("Cannot use the hash function on an indexing statement for " +
+ field.getName() +
+ ": The hash function can only be used when the target field is int or long, not " +
+ field.getDataType());
+ targetType = field.getDataType();
+ }
+
+ @Override
+ protected void doExecute(ExecutionContext context) {
+ StringFieldValue input = (StringFieldValue) context.getValue();
+ if (targetType.equals(DataType.INT))
+ context.setValue(new IntegerFieldValue(hashToInt(input.getString())));
+ else if (targetType.equals(DataType.LONG))
+ context.setValue(new LongFieldValue(hashToLong(input.getString())));
+ else
+ throw new IllegalStateException(); // won't happen
+ }
+
+ private int hashToInt(String value) {
+ return hasher.hashString(value, StandardCharsets.UTF_8).asInt();
+ }
+
+ private long hashToLong(String value) {
+ return hasher.hashString(value, StandardCharsets.UTF_8).asLong();
+ }
+
+ @Override
+ protected void doVerify(VerificationContext context) {
+ String outputField = context.getOutputField();
+ if (outputField == null)
+ throw new VerificationException(this, "No output field in this statement: " +
+ "Don't know what value to hash to.");
+ DataType outputFieldType = context.getInputType(this, outputField);
+ if (outputFieldType != DataType.INT && outputFieldType != DataType.LONG)
+ throw new VerificationException(this, "The type of the output field " + outputField +
+ " is not an int or long but " + outputField);
+ targetType = outputFieldType;
+ context.setValueType(createdOutputType());
+ }
+
+ @Override
+ public DataType createdOutputType() {
+ return targetType;
+ }
+
+ @Override
+ public String toString() { return "embed"; }
+
+ @Override
+ public int hashCode() { return 1; }
+
+ @Override
+ public boolean equals(Object o) { return o instanceof EmbedExpression; }
+
+}
diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj
index bdbecadecd3..e6b21f7c07b 100644
--- a/indexinglanguage/src/main/javacc/IndexingParser.jj
+++ b/indexinglanguage/src/main/javacc/IndexingParser.jj
@@ -164,6 +164,7 @@ TOKEN :
<GET_FIELD: "get_field"> |
<GET_VAR: "get_var"> |
<GUARD: "guard"> |
+ <HASH: "hash"> |
<HEX_DECODE: "hexdecode"> |
<HEX_ENCODE: "hexencode"> |
<HOST_NAME: "hostname"> |
@@ -283,13 +284,14 @@ Expression value() :
val = base64EncodeExp() |
val = clearStateExp() |
val = echoExp() |
- val = embedExp() |
+ val = embedExp() |
val = exactExp() |
val = flattenExp() |
val = forEachExp() |
val = getFieldExp() |
val = getVarExp() |
val = guardExp() |
+ val = hashExp() |
val = hexDecodeExp() |
val = hexEncodeExp() |
val = hostNameExp() |
@@ -419,6 +421,12 @@ Expression guardExp() :
{ return new GuardExpression(val); }
}
+Expression hashExp() : { }
+{
+ ( <HASH> )
+ { return new HashExpression(); }
+}
+
Expression hexDecodeExp() : { }
{
( <HEX_DECODE> )
@@ -744,12 +752,13 @@ String identifier() :
<ECHO> |
<EXACT> |
<ELSE> |
- <EMBED> |
+ <EMBED> |
<FLATTEN> |
<FOR_EACH> |
<GET_FIELD> |
<GET_VAR> |
<GUARD> |
+ <HASH> |
<HEX_DECODE> |
<HEX_ENCODE> |
<HOST_NAME> |