From 7b992b88818a931665441c3fb0f0c16824116567 Mon Sep 17 00:00:00 2001 From: Jon Bratseth Date: Fri, 4 Feb 2022 14:54:44 +0100 Subject: Add hash function --- indexinglanguage/pom.xml | 1 - .../expressions/HashExpression.java | 91 ++++++++++++++++++++++ indexinglanguage/src/main/javacc/IndexingParser.jj | 13 +++- .../vespa/indexinglanguage/ScriptTestCase.java | 47 ++++++++++- 4 files changed, 148 insertions(+), 4 deletions(-) create mode 100644 indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java (limited to 'indexinglanguage') diff --git a/indexinglanguage/pom.xml b/indexinglanguage/pom.xml index efca7479faf..f9ee18a4602 100644 --- a/indexinglanguage/pom.xml +++ b/indexinglanguage/pom.xml @@ -47,7 +47,6 @@ com.google.guava guava - test diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java new file mode 100644 index 00000000000..c69ceda2210 --- /dev/null +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java @@ -0,0 +1,91 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.indexinglanguage.expressions; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import com.yahoo.document.DataType; +import com.yahoo.document.DocumentType; +import com.yahoo.document.Field; +import com.yahoo.document.datatypes.IntegerFieldValue; +import com.yahoo.document.datatypes.LongFieldValue; +import com.yahoo.document.datatypes.StringFieldValue; + +import java.nio.charset.StandardCharsets; + +/** + * Hashes a string value to a long or int (by type inference on the target value). + * + * @author bratseth + */ +public class HashExpression extends Expression { + + private final HashFunction hasher = Hashing.sipHash24(); + + /** The destination the embedding will be written to on the form [schema name].[field name] */ + private String destination; + + /** The target type we are embedding into. */ + private DataType targetType; + + public HashExpression() { + super(DataType.STRING); + } + + @Override + public void setStatementOutput(DocumentType documentType, Field field) { + if (field.getDataType() != DataType.INT && field.getDataType() != DataType.LONG) + throw new IllegalArgumentException("Cannot use the hash function on an indexing statement for " + + field.getName() + + ": The hash function can only be used when the target field is int or long, not " + + field.getDataType()); + targetType = field.getDataType(); + } + + @Override + protected void doExecute(ExecutionContext context) { + StringFieldValue input = (StringFieldValue) context.getValue(); + if (targetType.equals(DataType.INT)) + context.setValue(new IntegerFieldValue(hashToInt(input.getString()))); + else if (targetType.equals(DataType.LONG)) + context.setValue(new LongFieldValue(hashToLong(input.getString()))); + else + throw new IllegalStateException(); // won't happen + } + + private int hashToInt(String value) { + return hasher.hashString(value, StandardCharsets.UTF_8).asInt(); + } + + private long hashToLong(String value) { + return hasher.hashString(value, StandardCharsets.UTF_8).asLong(); + } + + @Override + protected void doVerify(VerificationContext context) { + String outputField = context.getOutputField(); + if (outputField == null) + throw new VerificationException(this, "No output field in this statement: " + + "Don't know what value to hash to."); + DataType outputFieldType = context.getInputType(this, outputField); + if (outputFieldType != DataType.INT && outputFieldType != DataType.LONG) + throw new VerificationException(this, "The type of the output field " + outputField + + " is not an int or long but " + outputField); + targetType = outputFieldType; + context.setValueType(createdOutputType()); + } + + @Override + public DataType createdOutputType() { + return targetType; + } + + @Override + public String toString() { return "embed"; } + + @Override + public int hashCode() { return 1; } + + @Override + public boolean equals(Object o) { return o instanceof EmbedExpression; } + +} diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj index bdbecadecd3..e6b21f7c07b 100644 --- a/indexinglanguage/src/main/javacc/IndexingParser.jj +++ b/indexinglanguage/src/main/javacc/IndexingParser.jj @@ -164,6 +164,7 @@ TOKEN : | | | + | | | | @@ -283,13 +284,14 @@ Expression value() : val = base64EncodeExp() | val = clearStateExp() | val = echoExp() | - val = embedExp() | + val = embedExp() | val = exactExp() | val = flattenExp() | val = forEachExp() | val = getFieldExp() | val = getVarExp() | val = guardExp() | + val = hashExp() | val = hexDecodeExp() | val = hexEncodeExp() | val = hostNameExp() | @@ -419,6 +421,12 @@ Expression guardExp() : { return new GuardExpression(val); } } +Expression hashExp() : { } +{ + ( ) + { return new HashExpression(); } +} + Expression hexDecodeExp() : { } { ( ) @@ -744,12 +752,13 @@ String identifier() : | | | - | + | | | | | | + | | | | diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java index f9a6f2225b3..b8254f133bc 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java @@ -9,7 +9,6 @@ import com.yahoo.document.TensorDataType; import com.yahoo.document.datatypes.BoolFieldValue; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.document.datatypes.TensorFieldValue; -import com.yahoo.language.Language; import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.tensor.Tensor; @@ -99,6 +98,52 @@ public class ScriptTestCase { assertEquals(new BoolFieldValue(true), output.getFieldValue("mybool")); } + @Test + public void testIntHash() throws ParseException { + var expression = Expression.fromString("input myText | hash | attribute 'myInt'"); + + SimpleTestAdapter adapter = new SimpleTestAdapter(); + adapter.createField(new Field("myText", DataType.STRING)); + var intField = new Field("myInt", DataType.INT); + adapter.createField(intField); + adapter.setValue("myText", new StringFieldValue("input text")); + expression.setStatementOutput(new DocumentType("myDocument"), intField); + + // Necessary to resolve output type + VerificationContext verificationContext = new VerificationContext(adapter); + assertEquals(DataType.INT, expression.verify(verificationContext)); + + ExecutionContext context = new ExecutionContext(adapter); + context.setValue(new StringFieldValue("input text")); + expression.execute(context); + assertNotNull(context); + assertTrue(adapter.values.containsKey("myInt")); + assertEquals(-1425622096, adapter.values.get("myInt").getWrappedValue()); + } + + @Test + public void testLongHash() throws ParseException { + var expression = Expression.fromString("input myText | hash | attribute 'myLong'"); + + SimpleTestAdapter adapter = new SimpleTestAdapter(); + adapter.createField(new Field("myText", DataType.STRING)); + var intField = new Field("myLong", DataType.LONG); + adapter.createField(intField); + adapter.setValue("myText", new StringFieldValue("input text")); + expression.setStatementOutput(new DocumentType("myDocument"), intField); + + // Necessary to resolve output type + VerificationContext verificationContext = new VerificationContext(adapter); + assertEquals(DataType.LONG, expression.verify(verificationContext)); + + ExecutionContext context = new ExecutionContext(adapter); + context.setValue(new StringFieldValue("input text")); + expression.execute(context); + assertNotNull(context); + assertTrue(adapter.values.containsKey("myLong")); + assertEquals(7678158186624760752L, adapter.values.get("myLong").getWrappedValue()); + } + @Test public void testEmbed() throws ParseException { TensorType tensorType = TensorType.fromSpec("tensor(d[4])"); -- cgit v1.2.3