summaryrefslogtreecommitdiffstats
path: root/predicate-search/src/main/java/com/yahoo/search/predicate/index/PredicateSearch.java
diff options
context:
space:
mode:
Diffstat (limited to 'predicate-search/src/main/java/com/yahoo/search/predicate/index/PredicateSearch.java')
-rw-r--r--predicate-search/src/main/java/com/yahoo/search/predicate/index/PredicateSearch.java281
1 files changed, 281 insertions, 0 deletions
diff --git a/predicate-search/src/main/java/com/yahoo/search/predicate/index/PredicateSearch.java b/predicate-search/src/main/java/com/yahoo/search/predicate/index/PredicateSearch.java
new file mode 100644
index 00000000000..c40bd944b7b
--- /dev/null
+++ b/predicate-search/src/main/java/com/yahoo/search/predicate/index/PredicateSearch.java
@@ -0,0 +1,281 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.search.predicate.index;
+
+import com.yahoo.search.predicate.Hit;
+import com.yahoo.search.predicate.SubqueryBitmap;
+import com.yahoo.search.predicate.utils.PrimitiveArraySorter;
+
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Optional;
+import java.util.Spliterator;
+import java.util.function.Consumer;
+import java.util.stream.Stream;
+import java.util.stream.StreamSupport;
+
+/**
+ * Implementation of the "Interval" boolean search algorithm.
+ *
+ * @author <a href="mailto:magnarn@yahoo-inc.com">Magnar Nedland</a>
+ * @author bjorncs
+ */
+public class PredicateSearch {
+ private final PostingList[] postingLists;
+ private final byte[] nPostingListsForDocument;
+ private final byte[] minFeatureIndex;
+ private final int[] docIds;
+ private final int[] intervals;
+ private final long[] subqueries;
+ private final long[] subqueryMarkers;
+ private final boolean[] visited;
+ private final short[] intervalEnds;
+
+ private short[] sortedIndexes;
+ private short[] sortedIndexesMergeBuffer;
+ private int nPostingLists;
+
+ /**
+ * Creates a search for a set of posting lists.
+ * @param postingLists Posting lists for the boolean variables that evaluate to true
+ * @param nPostingListsForDocument The number of posting list for each docId
+ * @param minFeatureIndex Index from docId to min-feature value.
+ * @param intervalEnds The interval end for each document.
+ * @param highestIntervalEnd The highest end value.
+ */
+ public PredicateSearch(
+ List<PostingList> postingLists, byte[] nPostingListsForDocument,
+ byte[] minFeatureIndex, short[] intervalEnds, int highestIntervalEnd) {
+ int size = postingLists.size();
+ this.nPostingListsForDocument = nPostingListsForDocument;
+ this.minFeatureIndex = minFeatureIndex;
+ this.nPostingLists = size;
+ this.postingLists = postingLists.toArray(new PostingList[postingLists.size()]);
+ this.sortedIndexes = new short[size];
+ this.sortedIndexesMergeBuffer = new short[size];
+ this.docIds = new int[size];
+ this.intervals = new int[size];
+ this.subqueries = new long[size];
+ this.subqueryMarkers = new long[highestIntervalEnd + 1];
+ this.visited = new boolean[highestIntervalEnd + 1];
+ this.intervalEnds = intervalEnds;
+
+ // Sort posting list array based on the underlying number of documents (largest first).
+ Arrays.sort(this.postingLists, (l, r) -> -Integer.compare(l.size(), r.size()));
+
+ for (short i = 0; i < size; ++i) {
+ PostingList postingList = this.postingLists[i];
+ sortedIndexes[i] = i;
+ docIds[i] = postingList.getDocId();
+ subqueries[i] = postingList.getSubquery();
+ }
+ // All posting lists start at beginId, so no need to sort yet.
+ }
+
+ /**
+ * @return A stream of Hit-objects from a lazy evaluation of the boolean search algorithm.
+ */
+ public Stream<Hit> stream() {
+ if (nPostingLists == 0) {
+ return Stream.empty();
+ }
+ return StreamSupport.stream(new PredicateSpliterator(), false);
+ }
+
+ private class PredicateSpliterator implements java.util.Spliterator<Hit> {
+ private int lastHit = -1;
+
+ @Override
+ public boolean tryAdvance(Consumer<? super Hit> action) {
+ Optional<Hit> optionalHit = seek(lastHit + 1);
+ optionalHit.ifPresent(hit -> {
+ lastHit = hit.getDocId();
+ action.accept(hit);
+ });
+ return optionalHit.isPresent();
+ }
+
+ @Override
+ public Spliterator<Hit> trySplit() {
+ return null;
+ }
+
+ @Override
+ public long estimateSize() {
+ return Long.MAX_VALUE;
+ }
+
+ @Override
+ public int characteristics() {
+ return ORDERED | DISTINCT | SORTED | NONNULL;
+ }
+
+ @Override
+ public Comparator<Hit> getComparator() {
+ return null;
+ }
+ }
+
+ private Optional<Hit> seek(int docId) {
+ boolean skippedToEnd = skipMinFeature(docId);
+ while (nPostingLists > 0 && !skippedToEnd) {
+ int docId0 = docIds[sortedIndexes[0]];
+ int minFeature = minFeatureIndex[docId0];
+ int k = minFeature > 0 ? minFeature - 1 : 0;
+ int intervalEnd = Short.toUnsignedInt(intervalEnds[docId0]);
+ if (k < nPostingLists) {
+ int docIdK = docIds[sortedIndexes[k]];
+ if (docId0 == docIdK) {
+ if (evaluateHit(docId0, k, intervalEnd)) {
+ return Optional.of(new Hit(docId0, subqueryMarkers[intervalEnd]));
+ }
+ }
+ }
+ skippedToEnd = skipMinFeature(docId0 + 1);
+ }
+ return Optional.empty();
+ }
+
+ private boolean skipMinFeature(int docId) {
+ int nDocuments = nPostingListsForDocument.length;
+ while (docId < nDocuments && minFeatureIndex[docId] > nPostingListsForDocument[docId]) {
+ ++docId;
+ }
+ if (docId < nDocuments) {
+ advanceAllTo(docId);
+ return false;
+ }
+ return true;
+ }
+
+ private boolean evaluateHit(int docId, int k, int intervalEnd) {
+ int candidates = k + 1;
+ for (int i = candidates; i < nPostingLists; ++i) {
+ if (docIds[sortedIndexes[i]] == docId) {
+ ++candidates;
+ } else {
+ break;
+ }
+ }
+
+ int nNoIntervalIterators = 0;
+ for (int i = 0; i < candidates; ++i) {
+ short index = sortedIndexes[i];
+ PostingList postingList = postingLists[index];
+ if (postingList.prepareIntervals()) {
+ intervals[index] = postingList.getInterval();
+ } else {
+ ++nNoIntervalIterators;
+ intervals[index] = 0xFFFFFFFF;
+ }
+ }
+ PrimitiveArraySorter.sort(sortedIndexes, 0, candidates, (a, b) -> Integer.compareUnsigned(intervals[a], intervals[b]));
+ candidates -= nNoIntervalIterators;
+
+ Arrays.fill(subqueryMarkers, 0, intervalEnd + 1, 0);
+ subqueryMarkers[0] = SubqueryBitmap.ALL_SUBQUERIES;
+ Arrays.fill(visited, 0, intervalEnd + 1, false);
+ visited[0] = true;
+ int highestEndSeen = 1;
+ for (int i = 0; i < candidates; ) {
+ int index = sortedIndexes[i];
+ int lastEnd = addInterval(index, highestEndSeen);
+ if (lastEnd == -1) {
+ return false;
+ }
+ highestEndSeen = Math.max(lastEnd, highestEndSeen);
+ PostingList postingList = postingLists[index];
+ if (postingList.nextInterval()) {
+ intervals[index] = postingList.getInterval();
+ restoreSortedOrder(i, candidates);
+ } else {
+ ++i;
+ }
+ }
+ return subqueryMarkers[intervalEnd] != 0;
+ }
+
+ private void restoreSortedOrder(int first, int last) {
+ short indexToMove = sortedIndexes[first];
+ long intervalToMove = Integer.toUnsignedLong(intervals[indexToMove]);
+ while (++first < last && intervalToMove > Integer.toUnsignedLong(intervals[sortedIndexes[first]])) {
+ sortedIndexes[first - 1] = sortedIndexes[first];
+ }
+ sortedIndexes[first - 1] = indexToMove;
+ }
+
+ /**
+ * Returns the end value of the interval,
+ * or -1 if the highest end value seen is less than the interval begin.
+ */
+ private int addInterval(int index, int highestEndSeen) {
+ int interval = intervals[index];
+ long subqueryBitMap = subqueries[index];
+ if (Interval.isZStar1Interval(interval)) {
+ int begin = Interval.getZStar1Begin(interval);
+ int end = Interval.getZStar1End(interval);
+ if (highestEndSeen < begin) return -1;
+ markSubquery(begin, end, ~subqueryMarkers[begin]);
+ return end;
+ } else {
+ int begin = Interval.getBegin(interval);
+ int end = Interval.getEnd(interval);
+ if (highestEndSeen < begin -1) return -1;
+ markSubquery(begin - 1, end, subqueryMarkers[begin - 1] & subqueryBitMap);
+ return end;
+ }
+ }
+
+ private void markSubquery(int begin, int end, long subqueryBitmap) {
+ if (visited[begin]) {
+ visited[end] = true;
+ subqueryMarkers[end] |= subqueryBitmap;
+ }
+ }
+
+ // Advances all posting lists to (or beyond) docId.
+ private void advanceAllTo(int docId) {
+ int i = 0;
+ int completedCount = 0;
+ for (; i < nPostingLists; ++i) {
+ if (docIds[sortedIndexes[i]] >= docId) {
+ break;
+ }
+ if (!advanceOneTo(docId, i)) {
+ ++completedCount;
+ }
+ }
+ // No need to sort if all posting lists are finished.
+ if (i > 0 && nPostingLists > completedCount) {
+ sortIndexes(i);
+ // Decrement the number of posting lists.
+ }
+ nPostingLists -= completedCount;
+ }
+
+ // Advances a single posting list to (or beyond) docId.
+ private boolean advanceOneTo(int docId, int index) {
+ int i = sortedIndexes[index];
+ PostingList postingList = postingLists[i];
+ if (postingList.nextDocument(docId - 1)) {
+ docIds[i] = postingList.getDocId();
+ return true;
+ }
+ docIds[i] = Integer.MAX_VALUE; // will be last after sorting.
+ return false;
+ }
+
+ private void sortIndexes(int numUpdated) {
+ // Sort the updated elements
+ boolean swapMergeBuffer =
+ PrimitiveArraySorter.sortAndMerge(sortedIndexes, sortedIndexesMergeBuffer, numUpdated, nPostingLists,
+ (a, b) -> Integer.compare(docIds[a], docIds[b]));
+ if (swapMergeBuffer) {
+ // Swap references
+ short[] temp = sortedIndexes;
+ sortedIndexes = sortedIndexesMergeBuffer;
+ sortedIndexesMergeBuffer = temp;
+ }
+
+ }
+}