diff options
author | Tor Egge <tegge@vespa.ai> | 2024-07-02 17:21:09 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2024-07-02 17:21:09 +0200 |
commit | d1b3b4c1e4f7102f77ac0101bf1001f28b29f089 (patch) | |
tree | 6725f21d55ac1fb1ae73f71044ac0646fd50f2f4 | |
parent | e0ff928bfef21b3b8f1e35edbcdc00152f77ac25 (diff) | |
parent | c700babbe23608e359e4ee660cf76e9282c7f775 (diff) |
Merge pull request #31813 from vespa-engine/toregge/add-document-frequency-to-query-items (v8.367.14)
Add document frequency to query items.
8 files changed, 91 insertions, 3 deletions
diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json index e4d64c83b49..a36a56e65e8 100644 --- a/container-search/abi-spec.json +++ b/container-search/abi-spec.json @@ -480,10 +480,30 @@ "public void setExplicitSignificance(boolean)", "public boolean hasExplicitSignificance()", "public double getSignificance()", + "public void setDocumentFrequency(com.yahoo.prelude.query.DocumentFrequency)", + "public java.util.Optional getDocumentFrequency()", "public boolean hasUniqueID()" ], "fields" : [ ] }, + "com.yahoo.prelude.query.DocumentFrequency" : { + "superClass" : "java.lang.Record", + "interfaces" : [ ], + "attributes" : [ + "public", + "final", + "record" + ], + "methods" : [ + "public void <init>(long, long)", + "public final java.lang.String toString()", + "public final int hashCode()", + "public final boolean equals(java.lang.Object)", + "public long frequency()", + "public long count()" + ], + "fields" : [ ] + }, "com.yahoo.prelude.query.DotProductItem" : { "superClass" : "com.yahoo.prelude.query.WeightedSetItem", "interfaces" : [ ], @@ -874,7 +894,8 @@ "protected com.yahoo.prelude.query.Item connectedBacklink", "protected double connectivity", "protected double significance", - "protected boolean explicitSignificance" + "protected boolean explicitSignificance", + "protected com.yahoo.prelude.query.DocumentFrequency documentFrequency" ] }, "com.yahoo.prelude.query.ItemHelper" : { @@ -1578,6 +1599,8 @@ "public void setExplicitSignificance(boolean)", "public boolean hasExplicitSignificance()", "public double getSignificance()", + "public void setDocumentFrequency(com.yahoo.prelude.query.DocumentFrequency)", + "public java.util.Optional getDocumentFrequency()", "public boolean hasUniqueID()" ], "fields" : [ ] @@ -1679,7 +1702,9 @@ "public abstract void setSignificance(double)", "public abstract boolean hasExplicitSignificance()", "public abstract void setExplicitSignificance(boolean)", - "public abstract double getSignificance()" + "public 
abstract double getSignificance()", + "public abstract void setDocumentFrequency(com.yahoo.prelude.query.DocumentFrequency)", + "public abstract java.util.Optional getDocumentFrequency()" ], "fields" : [ ] }, @@ -1703,6 +1728,8 @@ "public void setExplicitSignificance(boolean)", "public boolean hasExplicitSignificance()", "public double getSignificance()", + "public void setDocumentFrequency(com.yahoo.prelude.query.DocumentFrequency)", + "public java.util.Optional getDocumentFrequency()", "public boolean hasUniqueID()" ], "fields" : [ ] diff --git a/container-search/src/main/java/com/yahoo/prelude/query/CompositeTaggableItem.java b/container-search/src/main/java/com/yahoo/prelude/query/CompositeTaggableItem.java index 10dc817b2b0..9cad27d7209 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/CompositeTaggableItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/CompositeTaggableItem.java @@ -1,6 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.prelude.query; +import java.util.Optional; + /** * Common implementation for Item classes implementing the TaggableItem interface. * Note that this file exists in 3 copies that should be kept in sync: @@ -68,6 +70,12 @@ public abstract class CompositeTaggableItem extends CompositeItem implements Tag return significance; } + @Override + public void setDocumentFrequency(DocumentFrequency documentFrequency) { this.documentFrequency = documentFrequency; } + + @Override + public Optional<DocumentFrequency> getDocumentFrequency() { return Optional.ofNullable(documentFrequency); } + //Change access privilege from protected to public. 
public boolean hasUniqueID() { return super.hasUniqueID(); diff --git a/container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java b/container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java new file mode 100644 index 00000000000..da35914eaa4 --- /dev/null +++ b/container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java @@ -0,0 +1,16 @@ +// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.prelude.query; + +import com.yahoo.api.annotations.Beta; + +/** + * The expected number of documents matching an item given a corpus of + * multiple documents. This is the raw data used to calculate variants + * of idf, used as significance. + * + * @param frequency The number of documents in which an item occurs + * @param count The total number of documents in the corpus + */ +@Beta +public record DocumentFrequency(long frequency, long count) { +} diff --git a/container-search/src/main/java/com/yahoo/prelude/query/Item.java b/container-search/src/main/java/com/yahoo/prelude/query/Item.java index 099c546e3f0..6c82a2bea0e 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/Item.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/Item.java @@ -124,6 +124,8 @@ public abstract class Item implements Cloneable { protected double significance = 0; protected boolean explicitSignificance = false; + protected DocumentFrequency documentFrequency = null; + /** Whether this item is eligible for change by query rewriters (false) or should be kept as-is (true) */ private boolean isProtected; @@ -495,6 +497,8 @@ public abstract class Item implements Cloneable { discloser.addProperty("usePositionData", usePositionData); if (explicitSignificance) discloser.addProperty("significance", significance); + if (documentFrequency != null) + discloser.addProperty("documentFrequency", documentFrequency); if (weight != 100) 
discloser.addProperty("weight", weight); if (label != null) diff --git a/container-search/src/main/java/com/yahoo/prelude/query/SimpleTaggableItem.java b/container-search/src/main/java/com/yahoo/prelude/query/SimpleTaggableItem.java index 1718a4e7708..8d3eecd9f29 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/SimpleTaggableItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/SimpleTaggableItem.java @@ -1,6 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.prelude.query; +import java.util.Optional; + /** * Common implementation for Item classes implementing the TaggableItem interface. * Note that this file exist in 3 copies that should be kept in sync: @@ -68,6 +70,12 @@ public abstract class SimpleTaggableItem extends Item implements TaggableItem { return significance; } + @Override + public void setDocumentFrequency(DocumentFrequency documentFrequency) { this.documentFrequency = documentFrequency; } + + @Override + public Optional<DocumentFrequency> getDocumentFrequency() { return Optional.ofNullable(documentFrequency); } + //Change access privilege from protected to public. public boolean hasUniqueID() { return super.hasUniqueID(); diff --git a/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java b/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java index 1bfd75f8d27..a91fe29590e 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java @@ -1,5 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
package com.yahoo.prelude.query; +import com.yahoo.api.annotations.Beta; + +import java.util.Optional; /** * An interface used for anything which may be addressed using an external, @@ -44,4 +47,9 @@ public interface TaggableItem { void setExplicitSignificance(boolean significance); double getSignificance(); + @Beta + void setDocumentFrequency(DocumentFrequency documentFrequency); + + @Beta + Optional<DocumentFrequency> getDocumentFrequency(); } diff --git a/container-search/src/main/java/com/yahoo/prelude/query/TaggableSegmentItem.java b/container-search/src/main/java/com/yahoo/prelude/query/TaggableSegmentItem.java index 1cba588ce40..b3549e6868b 100644 --- a/container-search/src/main/java/com/yahoo/prelude/query/TaggableSegmentItem.java +++ b/container-search/src/main/java/com/yahoo/prelude/query/TaggableSegmentItem.java @@ -1,6 +1,8 @@ // Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.prelude.query; +import java.util.Optional; + /** * Common implementation for Item classes implementing the TaggableItem interface. * Note that this file exist in 3 copies that should be kept in sync: @@ -81,6 +83,12 @@ public abstract class TaggableSegmentItem extends SegmentItem implements Taggabl return significance; } + @Override + public void setDocumentFrequency(DocumentFrequency documentFrequency) { this.documentFrequency = documentFrequency; } + + @Override + public Optional<DocumentFrequency> getDocumentFrequency() { return Optional.ofNullable(documentFrequency); } + //Change access privilege from protected to public. 
@Override public boolean hasUniqueID() { diff --git a/container-search/src/test/java/com/yahoo/prelude/query/TaggableItemsTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/TaggableItemsTestCase.java index de77d6721f0..10d4523041c 100644 --- a/container-search/src/test/java/com/yahoo/prelude/query/TaggableItemsTestCase.java +++ b/container-search/src/test/java/com/yahoo/prelude/query/TaggableItemsTestCase.java @@ -99,7 +99,7 @@ public class TaggableItemsTestCase { .getDeclaredMethods(); final Method[] simple = SimpleTaggableItem.class.getDeclaredMethods(); final Method[] segment = TaggableSegmentItem.class.getDeclaredMethods(); - final int numberOfMethods = 10; + final int numberOfMethods = 12; assertEquals(numberOfMethods, composite.length); assertEquals(numberOfMethods, simple.length); assertEquals(numberOfMethods, segment.length); @@ -152,4 +152,13 @@ public class TaggableItemsTestCase { assertTrue(p.hasExplicitSignificance()); } + @Test + final void testSetDocumentFrequency() { + final PhraseSegmentItem p = new PhraseSegmentItem("farmyards", false, false); + assertFalse(p.getDocumentFrequency().isPresent()); + p.setDocumentFrequency(new DocumentFrequency(13, 100)); + assertTrue(p.getDocumentFrequency().isPresent()); + assertEquals(new DocumentFrequency(13, 100), p.getDocumentFrequency().get()); + } + } |