diff options
author | Jon Bratseth <bratseth@gmail.com> | 2022-02-04 14:54:44 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@gmail.com> | 2022-02-04 14:54:44 +0100 |
commit | 7b992b88818a931665441c3fb0f0c16824116567 (patch) | |
tree | 66d6ff52ed406bad1b0017deac104e96189a2c52 /indexinglanguage/src/main | |
parent | e67031fd521c68ca66fdf897f8c6b0fd5a395f45 (diff) |
Add hash function
Diffstat (limited to 'indexinglanguage/src/main')
-rw-r--r-- | indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java | 91 | ||||
-rw-r--r-- | indexinglanguage/src/main/javacc/IndexingParser.jj | 13 |
2 files changed, 102 insertions, 2 deletions
diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java
new file mode 100644
index 00000000000..c69ceda2210
--- /dev/null
+++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java
@@ -0,0 +1,107 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.indexinglanguage.expressions;
+
+import com.google.common.hash.HashFunction;
+import com.google.common.hash.Hashing;
+import com.yahoo.document.DataType;
+import com.yahoo.document.DocumentType;
+import com.yahoo.document.Field;
+import com.yahoo.document.datatypes.IntegerFieldValue;
+import com.yahoo.document.datatypes.LongFieldValue;
+import com.yahoo.document.datatypes.StringFieldValue;
+
+import java.nio.charset.StandardCharsets;
+
+/**
+ * Hashes a string value to a long or int (by type inference on the target value).
+ *
+ * @author bratseth
+ */
+public class HashExpression extends Expression {
+
+    /** The hash function applied to the UTF-8 encoding of the input string. */
+    private final HashFunction hasher = Hashing.sipHash24();
+
+    /** The type to hash into (int or long), taken from the output field; null until it has been set. */
+    private DataType targetType;
+
+    public HashExpression() {
+        super(DataType.STRING);
+    }
+
+    /**
+     * Uses the type of the field this statement outputs to as the target type of the hash.
+     *
+     * @throws IllegalArgumentException if the field is neither int nor long
+     */
+    @Override
+    public void setStatementOutput(DocumentType documentType, Field field) {
+        if ( ! isIntOrLong(field.getDataType()))
+            throw new IllegalArgumentException("Cannot use the hash function on an indexing statement for " +
+                                               field.getName() +
+                                               ": The hash function can only be used when the target field is int or long, not " +
+                                               field.getDataType());
+        targetType = field.getDataType();
+    }
+
+    /** Replaces the current string value by its hash, as an int or a long depending on the target type. */
+    @Override
+    protected void doExecute(ExecutionContext context) {
+        StringFieldValue input = (StringFieldValue) context.getValue();
+        if (DataType.INT.equals(targetType))
+            context.setValue(new IntegerFieldValue(hashToInt(input.getString())));
+        else if (DataType.LONG.equals(targetType))
+            context.setValue(new LongFieldValue(hashToLong(input.getString())));
+        else // no valid target type has been set by setStatementOutput or doVerify
+            throw new IllegalStateException("The hash target type must be int or long, but is " + targetType);
+    }
+
+    private int hashToInt(String value) {
+        return hasher.hashString(value, StandardCharsets.UTF_8).asInt();
+    }
+
+    private long hashToLong(String value) {
+        return hasher.hashString(value, StandardCharsets.UTF_8).asLong();
+    }
+
+    /**
+     * Verifies that this statement has an output field of type int or long,
+     * and makes that type both the target type and the value type of the context.
+     */
+    @Override
+    protected void doVerify(VerificationContext context) {
+        String outputField = context.getOutputField();
+        if (outputField == null)
+            throw new VerificationException(this, "No output field in this statement: " +
+                                                  "Don't know what value to hash to.");
+        DataType outputFieldType = context.getInputType(this, outputField);
+        if ( ! isIntOrLong(outputFieldType))
+            throw new VerificationException(this, "The type of the output field " + outputField +
+                                                  " is not an int or long but " + outputFieldType);
+        targetType = outputFieldType;
+        context.setValueType(createdOutputType());
+    }
+
+    /** Returns the target type of the hash, or null if it has not been set yet. */
+    @Override
+    public DataType createdOutputType() {
+        return targetType;
+    }
+
+    /** Returns whether the given type is int or long. Null is accepted, and is neither. */
+    private static boolean isIntOrLong(DataType type) {
+        return DataType.INT.equals(type) || DataType.LONG.equals(type);
+    }
+
+    @Override
+    public String toString() { return "hash"; }
+
+    /** Constant, which is consistent with equals: All hash expressions are equal. */
+    @Override
+    public int hashCode() { return 1; }
+
+    /** Returns true for any other hash expression: The inferred target type is not compared. */
+    @Override
+    public boolean equals(Object o) { return o instanceof HashExpression; }
+
+}
diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj
index bdbecadecd3..e6b21f7c07b 100644
--- a/indexinglanguage/src/main/javacc/IndexingParser.jj
+++ b/indexinglanguage/src/main/javacc/IndexingParser.jj
@@ -164,6 +164,7 @@ TOKEN :
     <GET_FIELD: "get_field"> |
     <GET_VAR: "get_var"> |
     <GUARD: "guard"> |
+    <HASH: "hash"> |
     <HEX_DECODE: "hexdecode"> |
     <HEX_ENCODE: "hexencode"> |
     <HOST_NAME: "hostname"> |
@@ -283,13 +284,14 @@ Expression value() :
             val = base64EncodeExp() |
             val = clearStateExp() |
             val = echoExp() |
-            val = embedExp() |
+            val = embedExp() |
             val = exactExp() |
             val = flattenExp() |
             val = forEachExp() |
             val = getFieldExp() |
             val = getVarExp() |
             val = guardExp() |
+            val = hashExp() |
             val = hexDecodeExp() |
             val = hexEncodeExp() |
             val = hostNameExp() |
@@ -419,6 +421,12 @@ Expression guardExp() :
     { return new GuardExpression(val); }
 }
 
+Expression hashExp() : { }
+{
+    ( <HASH> )
+    { return new HashExpression(); }
+}
+
 Expression hexDecodeExp() : { }
 {
     ( <HEX_DECODE> )
@@ -744,12 +752,13 @@ String identifier() :
       <ECHO> |
       <EXACT> |
       <ELSE> |
-      <EMBED> |
+      <EMBED> |
       <FLATTEN> |
       <FOR_EACH> |
       <GET_FIELD> |
       <GET_VAR> |
       <GUARD> |
+      <HASH> |
       <HEX_DECODE> |
       <HEX_ENCODE> |
       <HOST_NAME> |