diff options
7 files changed, 179 insertions, 33 deletions
diff --git a/document/src/main/java/com/yahoo/document/datatypes/Array.java b/document/src/main/java/com/yahoo/document/datatypes/Array.java index 11a8eb7a350..672690bafad 100644 --- a/document/src/main/java/com/yahoo/document/datatypes/Array.java +++ b/document/src/main/java/com/yahoo/document/datatypes/Array.java @@ -21,7 +21,7 @@ import java.util.ListIterator; import java.util.RandomAccess; /** - * FieldValue which encapsulates a Array value + * FieldValue which encapsulates an Array value * * @author Einar M R Rosenvinge */ @@ -42,8 +42,7 @@ public final class Array<T extends FieldValue> extends CollectionFieldValue<T> i this(type); for (T v : values) { if (!((ArrayDataType)type).getNestedType().isValueCompatible(v)) { - throw new IllegalArgumentException("FieldValue " + v + - " is not compatible with " + type + "."); + throw new IllegalArgumentException("FieldValue " + v + " is not compatible with " + type + "."); } } this.values.addAll(values); diff --git a/indexinglanguage/pom.xml b/indexinglanguage/pom.xml index efca7479faf..f9ee18a4602 100644 --- a/indexinglanguage/pom.xml +++ b/indexinglanguage/pom.xml @@ -47,7 +47,6 @@ <dependency> <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> - <scope>test</scope> </dependency> </dependencies> <build> diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java new file mode 100644 index 00000000000..5b04720dad4 --- /dev/null +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HashExpression.java @@ -0,0 +1,95 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.vespa.indexinglanguage.expressions; + +import com.google.common.hash.HashFunction; +import com.google.common.hash.Hashing; +import com.yahoo.document.ArrayDataType; +import com.yahoo.document.DataType; +import com.yahoo.document.DocumentType; +import com.yahoo.document.Field; +import com.yahoo.document.datatypes.IntegerFieldValue; +import com.yahoo.document.datatypes.LongFieldValue; +import com.yahoo.document.datatypes.StringFieldValue; + +import java.nio.charset.StandardCharsets; + +/** + * Hashes a string value to a long or int (by type inference on the target value). + * + * @author bratseth + */ +public class HashExpression extends Expression { + + private final HashFunction hasher = Hashing.sipHash24(); + + /** The target type we are hashing into. */ + private DataType targetType; + + public HashExpression() { + super(DataType.STRING); + } + + @Override + public void setStatementOutput(DocumentType documentType, Field field) { + if ( ! canStoreHash(field.getDataType())) + throw new IllegalArgumentException("Cannot use the hash function on an indexing statement for " + + field.getName() + + ": The hash function can only be used when the target field " + + "is int or long, not " + field.getDataType()); + targetType = field.getDataType(); + } + + @Override + protected void doExecute(ExecutionContext context) { + StringFieldValue input = (StringFieldValue) context.getValue(); + if (targetType.equals(DataType.INT)) + context.setValue(new IntegerFieldValue(hashToInt(input.getString()))); + else if (targetType.equals(DataType.LONG)) + context.setValue(new LongFieldValue(hashToLong(input.getString()))); + else + throw new IllegalStateException(); // won't happen + } + + private int hashToInt(String value) { + return hasher.hashString(value, StandardCharsets.UTF_8).asInt(); + } + + private long hashToLong(String value) { + return hasher.hashString(value, StandardCharsets.UTF_8).asLong(); + } + + @Override + protected void doVerify(VerificationContext context) { + String outputField = context.getOutputField(); + if (outputField == null) + throw new VerificationException(this, "No output field in this statement: " + + "Don't know what value to hash to."); + DataType outputFieldType = context.getInputType(this, outputField); + if ( ! canStoreHash(outputFieldType)) + throw new VerificationException(this, "The type of the output field " + outputField + + " is not int or long but " + outputFieldType); + targetType = outputFieldType; + context.setValueType(createdOutputType()); + } + + private boolean canStoreHash(DataType type) { + if (type.equals(DataType.INT)) return true; + if (type.equals(DataType.LONG)) return true; + return false; + } + + @Override + public DataType createdOutputType() { + return targetType; + } + + @Override + public String toString() { return "hash"; } + + @Override + public int hashCode() { return 987; } + + @Override + public boolean equals(Object o) { return o instanceof HashExpression; } + +} diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HexEncodeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HexEncodeExpression.java index 5e7288b8ecc..ca2be7c3400 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HexEncodeExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/HexEncodeExpression.java @@ -13,6 +13,7 @@ public final class HexEncodeExpression extends Expression { public HexEncodeExpression() { super(DataType.LONG); } + @Override protected void doExecute(ExecutionContext context) { long input = ((LongFieldValue) context.getValue()).getLong(); diff --git a/indexinglanguage/src/main/javacc/IndexingParser.jj b/indexinglanguage/src/main/javacc/IndexingParser.jj index bdbecadecd3..e6b21f7c07b 100644 --- a/indexinglanguage/src/main/javacc/IndexingParser.jj +++ b/indexinglanguage/src/main/javacc/IndexingParser.jj @@ -164,6 +164,7 @@ TOKEN : <GET_FIELD: "get_field"> | <GET_VAR: "get_var"> | <GUARD: "guard"> | + <HASH: "hash"> | <HEX_DECODE: "hexdecode"> | <HEX_ENCODE: "hexencode"> | <HOST_NAME: "hostname"> | @@ -283,13 +284,14 @@ Expression value() : val = base64EncodeExp() | val = clearStateExp() | val = echoExp() | - val = embedExp() | + val = embedExp() | val = exactExp() | val = flattenExp() | val = forEachExp() | val = getFieldExp() | val = getVarExp() | val = guardExp() | + val = hashExp() | val = hexDecodeExp() | val = hexEncodeExp() | val = hostNameExp() | @@ -419,6 +421,12 @@ Expression guardExp() : { return new GuardExpression(val); } } +Expression hashExp() : { } +{ + ( <HASH> ) + { return new HashExpression(); } +} + Expression hexDecodeExp() : { } { ( <HEX_DECODE> ) @@ -744,12 +752,13 @@ String identifier() : <ECHO> | <EXACT> | <ELSE> | - <EMBED> | + <EMBED> | <FLATTEN> | <FOR_EACH> | <GET_FIELD> | <GET_VAR> | <GUARD> | + <HASH> | <HEX_DECODE> | <HEX_ENCODE> | <HOST_NAME> | diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java index f9a6f2225b3..778d95fcaef 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/ScriptTestCase.java @@ -9,7 +9,6 @@ import com.yahoo.document.TensorDataType; import com.yahoo.document.datatypes.BoolFieldValue; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.document.datatypes.TensorFieldValue; -import com.yahoo.language.Language; import com.yahoo.language.process.Embedder; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.tensor.Tensor; @@ -100,6 +99,50 @@ public class ScriptTestCase { } @Test + public void testIntHash() throws ParseException { + var expression = Expression.fromString("input myText | hash | attribute 'myInt'"); + + SimpleTestAdapter adapter = new SimpleTestAdapter(); + adapter.createField(new Field("myText", DataType.STRING)); + var intField = new Field("myInt", DataType.INT); + adapter.createField(intField); + adapter.setValue("myText", new StringFieldValue("input text")); + expression.setStatementOutput(new DocumentType("myDocument"), intField); + + // Necessary to resolve output type + VerificationContext verificationContext = new VerificationContext(adapter); + assertEquals(DataType.INT, expression.verify(verificationContext)); + + ExecutionContext context = new ExecutionContext(adapter); + context.setValue(new StringFieldValue("input text")); + expression.execute(context); + assertTrue(adapter.values.containsKey("myInt")); + assertEquals(-1425622096, adapter.values.get("myInt").getWrappedValue()); + } + + @Test + public void testLongHash() throws ParseException { + var expression = Expression.fromString("input myText | hash | attribute 'myLong'"); + + SimpleTestAdapter adapter = new SimpleTestAdapter(); + adapter.createField(new Field("myText", DataType.STRING)); + var intField = new Field("myLong", DataType.LONG); + adapter.createField(intField); + adapter.setValue("myText", new StringFieldValue("input text")); + expression.setStatementOutput(new DocumentType("myDocument"), intField); + + // Necessary to resolve output type + VerificationContext verificationContext = new VerificationContext(adapter); + assertEquals(DataType.LONG, expression.verify(verificationContext)); + + ExecutionContext context = new ExecutionContext(adapter); + context.setValue(new StringFieldValue("input text")); + expression.execute(context); + assertTrue(adapter.values.containsKey("myLong")); + assertEquals(7678158186624760752L, adapter.values.get("myLong").getWrappedValue()); + } + + @Test public void testEmbed() throws ParseException { TensorType tensorType = TensorType.fromSpec("tensor(d[4])"); var expression = Expression.fromString("input myText | embed | attribute 'myTensor'", @@ -120,7 +163,6 @@ public class ScriptTestCase { ExecutionContext context = new ExecutionContext(adapter); context.setValue(new StringFieldValue("input text")); expression.execute(context); - assertNotNull(context); assertTrue(adapter.values.containsKey("myTensor")); assertEquals(Tensor.from(tensorType, "[7,3,0,0]"), ((TensorFieldValue)adapter.values.get("myTensor")).getTensor().get()); diff --git a/vespajlib/src/main/java/com/yahoo/collections/BobHash.java b/vespajlib/src/main/java/com/yahoo/collections/BobHash.java index d133af2ea84..3d1e82743cc 100644 --- a/vespajlib/src/main/java/com/yahoo/collections/BobHash.java +++ b/vespajlib/src/main/java/com/yahoo/collections/BobHash.java @@ -153,44 +153,45 @@ public class BobHash { // handle the last 11 bytes c += k.length; switch (len) { - // all the case statements fall through - case 11: - c += (unsign(k[offset + 10]) << 24); + // all the case statements fall through + case 11: + c += (unsign(k[offset + 10]) << 24); - case 10: - c += (unsign(k[offset + 9]) << 16); + case 10: + c += (unsign(k[offset + 9]) << 16); - case 9: - c += (unsign(k[offset + 8]) << 8); + case 9: + c += (unsign(k[offset + 8]) << 8); - /* the first byte of c is reserved for the length */ - case 8: - b += (unsign(k[offset + 7]) << 24); + /* the first byte of c is reserved for the length */ + case 8: + b += (unsign(k[offset + 7]) << 24); - case 7: - b += (unsign(k[offset + 6]) << 16); + case 7: + b += (unsign(k[offset + 6]) << 16); - case 6: - b += (unsign(k[offset + 5]) << 8); + case 6: + b += (unsign(k[offset + 5]) << 8); - case 5: - b += unsign(k[offset + 4]); + case 5: + b += unsign(k[offset + 4]); - case 4: - a += (unsign(k[offset + 3]) << 24); + case 4: + a += (unsign(k[offset + 3]) << 24); - case 3: - a += (unsign(k[offset + 2]) << 16); + case 3: + a += (unsign(k[offset + 2]) << 16); - case 2: - a += (unsign(k[offset + 1]) << 8); + case 2: + a += (unsign(k[offset + 1]) << 8); - case 1: - a += unsign(k[offset + 0]); + case 1: + a += unsign(k[offset + 0]); - /* case 0: nothing left to add */ + /* case 0: nothing left to add */ } abcBuffer = mix(a, b, c); return abcBuffer[2]; } + } |