aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Tor Egge <tegge@vespa.ai> 2024-07-02 17:21:09 +0200
committer GitHub <noreply@github.com> 2024-07-02 17:21:09 +0200
commit d1b3b4c1e4f7102f77ac0101bf1001f28b29f089 (patch)
tree 6725f21d55ac1fb1ae73f71044ac0646fd50f2f4
parent e0ff928bfef21b3b8f1e35edbcdc00152f77ac25 (diff)
parent c700babbe23608e359e4ee660cf76e9282c7f775 (diff)
Merge pull request #31813 from vespa-engine/toregge/add-document-frequency-to-query-items v8.367.14
Add document frequency to query items.
-rw-r--r-- container-search/abi-spec.json 31
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/CompositeTaggableItem.java 8
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java 16
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/Item.java 4
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/SimpleTaggableItem.java 8
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java 8
-rw-r--r-- container-search/src/main/java/com/yahoo/prelude/query/TaggableSegmentItem.java 8
-rw-r--r-- container-search/src/test/java/com/yahoo/prelude/query/TaggableItemsTestCase.java 11
8 files changed, 91 insertions, 3 deletions
diff --git a/container-search/abi-spec.json b/container-search/abi-spec.json
index e4d64c83b49..a36a56e65e8 100644
--- a/container-search/abi-spec.json
+++ b/container-search/abi-spec.json
@@ -480,10 +480,30 @@
"public void setExplicitSignificance(boolean)",
"public boolean hasExplicitSignificance()",
"public double getSignificance()",
+ "public void setDocumentFrequency(com.yahoo.prelude.query.DocumentFrequency)",
+ "public java.util.Optional getDocumentFrequency()",
"public boolean hasUniqueID()"
],
"fields" : [ ]
},
+ "com.yahoo.prelude.query.DocumentFrequency" : {
+ "superClass" : "java.lang.Record",
+ "interfaces" : [ ],
+ "attributes" : [
+ "public",
+ "final",
+ "record"
+ ],
+ "methods" : [
+ "public void <init>(long, long)",
+ "public final java.lang.String toString()",
+ "public final int hashCode()",
+ "public final boolean equals(java.lang.Object)",
+ "public long frequency()",
+ "public long count()"
+ ],
+ "fields" : [ ]
+ },
"com.yahoo.prelude.query.DotProductItem" : {
"superClass" : "com.yahoo.prelude.query.WeightedSetItem",
"interfaces" : [ ],
@@ -874,7 +894,8 @@
"protected com.yahoo.prelude.query.Item connectedBacklink",
"protected double connectivity",
"protected double significance",
- "protected boolean explicitSignificance"
+ "protected boolean explicitSignificance",
+ "protected com.yahoo.prelude.query.DocumentFrequency documentFrequency"
]
},
"com.yahoo.prelude.query.ItemHelper" : {
@@ -1578,6 +1599,8 @@
"public void setExplicitSignificance(boolean)",
"public boolean hasExplicitSignificance()",
"public double getSignificance()",
+ "public void setDocumentFrequency(com.yahoo.prelude.query.DocumentFrequency)",
+ "public java.util.Optional getDocumentFrequency()",
"public boolean hasUniqueID()"
],
"fields" : [ ]
@@ -1679,7 +1702,9 @@
"public abstract void setSignificance(double)",
"public abstract boolean hasExplicitSignificance()",
"public abstract void setExplicitSignificance(boolean)",
- "public abstract double getSignificance()"
+ "public abstract double getSignificance()",
+ "public abstract void setDocumentFrequency(com.yahoo.prelude.query.DocumentFrequency)",
+ "public abstract java.util.Optional getDocumentFrequency()"
],
"fields" : [ ]
},
@@ -1703,6 +1728,8 @@
"public void setExplicitSignificance(boolean)",
"public boolean hasExplicitSignificance()",
"public double getSignificance()",
+ "public void setDocumentFrequency(com.yahoo.prelude.query.DocumentFrequency)",
+ "public java.util.Optional getDocumentFrequency()",
"public boolean hasUniqueID()"
],
"fields" : [ ]
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/CompositeTaggableItem.java b/container-search/src/main/java/com/yahoo/prelude/query/CompositeTaggableItem.java
index 10dc817b2b0..9cad27d7209 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/CompositeTaggableItem.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/CompositeTaggableItem.java
@@ -1,6 +1,8 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.prelude.query;
+import java.util.Optional;
+
/**
* Common implementation for Item classes implementing the TaggableItem interface.
* Note that this file exists in 3 copies that should be kept in sync:
@@ -68,6 +70,12 @@ public abstract class CompositeTaggableItem extends CompositeItem implements Tag
return significance;
}
+ /** Stores the given document frequency on this item, replacing any previous value; a null argument is stored as-is. */
+ @Override
+ public void setDocumentFrequency(DocumentFrequency documentFrequency) { this.documentFrequency = documentFrequency; }
+
+ /** Returns the stored document frequency wrapped in an Optional, which is empty when the stored value is null. */
+ @Override
+ public Optional<DocumentFrequency> getDocumentFrequency() { return Optional.ofNullable(documentFrequency); }
+
//Change access privilege from protected to public.
public boolean hasUniqueID() {
return super.hasUniqueID();
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java b/container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java
new file mode 100644
index 00000000000..da35914eaa4
--- /dev/null
+++ b/container-search/src/main/java/com/yahoo/prelude/query/DocumentFrequency.java
@@ -0,0 +1,16 @@
+// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.prelude.query;
+
+import com.yahoo.api.annotations.Beta;
+
+/**
+ * The expected number of documents matching an item given a corpus of
+ * multiple documents. This is the raw data used to calculate variants
+ * of idf, used as significance.
+ *
+ * <p>Both components are stored exactly as given: this record declares no
+ * compact constructor, so no validation is performed (for example, nothing
+ * checks that {@code frequency} does not exceed {@code count}).</p>
+ *
+ * @param frequency the number of documents in which an item occurs
+ * @param count the total number of documents in the corpus
+ */
+@Beta
+public record DocumentFrequency(long frequency, long count) {
+}
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/Item.java b/container-search/src/main/java/com/yahoo/prelude/query/Item.java
index 099c546e3f0..6c82a2bea0e 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/Item.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/Item.java
@@ -124,6 +124,8 @@ public abstract class Item implements Cloneable {
protected double significance = 0;
protected boolean explicitSignificance = false;
+ /** The document frequency of this item, or null (the initial value) when none has been assigned. */
+ protected DocumentFrequency documentFrequency = null;
+
/** Whether this item is eligible for change by query rewriters (false) or should be kept as-is (true) */
private boolean isProtected;
@@ -495,6 +497,8 @@ public abstract class Item implements Cloneable {
discloser.addProperty("usePositionData", usePositionData);
if (explicitSignificance)
discloser.addProperty("significance", significance);
+ if (documentFrequency != null)
+ discloser.addProperty("documentFrequency", documentFrequency);
if (weight != 100)
discloser.addProperty("weight", weight);
if (label != null)
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/SimpleTaggableItem.java b/container-search/src/main/java/com/yahoo/prelude/query/SimpleTaggableItem.java
index 1718a4e7708..8d3eecd9f29 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/SimpleTaggableItem.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/SimpleTaggableItem.java
@@ -1,6 +1,8 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.prelude.query;
+import java.util.Optional;
+
/**
* Common implementation for Item classes implementing the TaggableItem interface.
* Note that this file exists in 3 copies that should be kept in sync:
@@ -68,6 +70,12 @@ public abstract class SimpleTaggableItem extends Item implements TaggableItem {
return significance;
}
+ /** Stores the given document frequency on this item, replacing any previous value; a null argument is stored as-is. */
+ @Override
+ public void setDocumentFrequency(DocumentFrequency documentFrequency) { this.documentFrequency = documentFrequency; }
+
+ /** Returns the stored document frequency wrapped in an Optional, which is empty when the stored value is null. */
+ @Override
+ public Optional<DocumentFrequency> getDocumentFrequency() { return Optional.ofNullable(documentFrequency); }
+
//Change access privilege from protected to public.
public boolean hasUniqueID() {
return super.hasUniqueID();
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java b/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java
index 1bfd75f8d27..a91fe29590e 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/TaggableItem.java
@@ -1,5 +1,8 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.prelude.query;
+import com.yahoo.api.annotations.Beta;
+
+import java.util.Optional;
/**
* An interface used for anything which may be addressed using an external,
@@ -44,4 +47,9 @@ public interface TaggableItem {
void setExplicitSignificance(boolean significance);
double getSignificance();
+ /**
+ * Sets the document frequency of this item.
+ *
+ * @param documentFrequency the document frequency to associate with this item
+ */
+ @Beta
+ void setDocumentFrequency(DocumentFrequency documentFrequency);
+
+ /**
+ * Returns the document frequency of this item, if any.
+ *
+ * @return the document frequency; the taggable base classes (CompositeTaggableItem,
+ *         SimpleTaggableItem, TaggableSegmentItem) return an empty Optional when the
+ *         underlying value is null
+ */
+ @Beta
+ Optional<DocumentFrequency> getDocumentFrequency();
}
diff --git a/container-search/src/main/java/com/yahoo/prelude/query/TaggableSegmentItem.java b/container-search/src/main/java/com/yahoo/prelude/query/TaggableSegmentItem.java
index 1cba588ce40..b3549e6868b 100644
--- a/container-search/src/main/java/com/yahoo/prelude/query/TaggableSegmentItem.java
+++ b/container-search/src/main/java/com/yahoo/prelude/query/TaggableSegmentItem.java
@@ -1,6 +1,8 @@
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.prelude.query;
+import java.util.Optional;
+
/**
* Common implementation for Item classes implementing the TaggableItem interface.
* Note that this file exists in 3 copies that should be kept in sync:
@@ -81,6 +83,12 @@ public abstract class TaggableSegmentItem extends SegmentItem implements Taggabl
return significance;
}
+ /** Stores the given document frequency on this item, replacing any previous value; a null argument is stored as-is. */
+ @Override
+ public void setDocumentFrequency(DocumentFrequency documentFrequency) { this.documentFrequency = documentFrequency; }
+
+ /** Returns the stored document frequency wrapped in an Optional, which is empty when the stored value is null. */
+ @Override
+ public Optional<DocumentFrequency> getDocumentFrequency() { return Optional.ofNullable(documentFrequency); }
+
//Change access privilege from protected to public.
@Override
public boolean hasUniqueID() {
diff --git a/container-search/src/test/java/com/yahoo/prelude/query/TaggableItemsTestCase.java b/container-search/src/test/java/com/yahoo/prelude/query/TaggableItemsTestCase.java
index de77d6721f0..10d4523041c 100644
--- a/container-search/src/test/java/com/yahoo/prelude/query/TaggableItemsTestCase.java
+++ b/container-search/src/test/java/com/yahoo/prelude/query/TaggableItemsTestCase.java
@@ -99,7 +99,7 @@ public class TaggableItemsTestCase {
.getDeclaredMethods();
final Method[] simple = SimpleTaggableItem.class.getDeclaredMethods();
final Method[] segment = TaggableSegmentItem.class.getDeclaredMethods();
- final int numberOfMethods = 10;
+ final int numberOfMethods = 12;
assertEquals(numberOfMethods, composite.length);
assertEquals(numberOfMethods, simple.length);
assertEquals(numberOfMethods, segment.length);
@@ -152,4 +152,13 @@ public class TaggableItemsTestCase {
assertTrue(p.hasExplicitSignificance());
}
+ /**
+ * Checks the document frequency accessors on a PhraseSegmentItem: nothing is
+ * reported before a value is set, and afterwards the getter yields a value
+ * equal to the one that was set.
+ */
+ @Test
+ final void testSetDocumentFrequency() {
+ final PhraseSegmentItem p = new PhraseSegmentItem("farmyards", false, false);
+ // A freshly constructed item carries no document frequency.
+ assertFalse(p.getDocumentFrequency().isPresent());
+ p.setDocumentFrequency(new DocumentFrequency(13, 100));
+ assertTrue(p.getDocumentFrequency().isPresent());
+ // Comparison relies on the record's generated equals(), so a separately constructed instance matches.
+ assertEquals(new DocumentFrequency(13, 100), p.getDocumentFrequency().get());
+ }
+
}