diff options
Diffstat (limited to 'predicate-search/src/main/java/com/yahoo/search/predicate/index/conjunction/ConjunctionIndex.java')
-rw-r--r-- | predicate-search/src/main/java/com/yahoo/search/predicate/index/conjunction/ConjunctionIndex.java | 279 |
1 files changed, 279 insertions, 0 deletions
diff --git a/predicate-search/src/main/java/com/yahoo/search/predicate/index/conjunction/ConjunctionIndex.java b/predicate-search/src/main/java/com/yahoo/search/predicate/index/conjunction/ConjunctionIndex.java new file mode 100644 index 00000000000..c272cb5fb92 --- /dev/null +++ b/predicate-search/src/main/java/com/yahoo/search/predicate/index/conjunction/ConjunctionIndex.java @@ -0,0 +1,279 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.search.predicate.index.conjunction; + +import com.gs.collections.api.map.primitive.IntObjectMap; +import com.gs.collections.api.map.primitive.LongObjectMap; +import com.gs.collections.api.tuple.primitive.IntObjectPair; +import com.gs.collections.api.tuple.primitive.LongObjectPair; +import com.gs.collections.impl.map.mutable.primitive.IntObjectHashMap; +import com.gs.collections.impl.map.mutable.primitive.LongObjectHashMap; +import com.yahoo.document.predicate.FeatureConjunction; +import com.yahoo.search.predicate.PredicateQuery; +import com.yahoo.search.predicate.SubqueryBitmap; +import com.yahoo.search.predicate.serialization.SerializationHelper; +import com.yahoo.search.predicate.utils.PrimitiveArraySorter; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Optional; + +/** + * A searchable index of conjunctions (see {@link FeatureConjunction} / {@link IndexableFeatureConjunction}). + * Implements the algorithm described in the paper <a href="http://dl.acm.org/citation.cfm?id=1687633">Indexing Boolean Expressions</a>. + * + * @author <a href="mailto:magnarn@yahoo-inc.com">Magnar Nedland</a> + * @author bjorncs + */ +public class ConjunctionIndex { + // A map from K value to FeatureIndex + private final IntObjectMap<FeatureIndex> kIndex; + private final int[] zList; + private final long[] idMapping; + + public ConjunctionIndex(IntObjectMap<FeatureIndex> kIndex, int[] zList, long[] idMapping) { + this.kIndex = kIndex; + this.zList = zList; + this.idMapping = idMapping; + } + + public Searcher searcher() { + return new Searcher(); + } + + public void writeToOutputStream(DataOutputStream out) throws IOException { + SerializationHelper.writeIntArray(zList, out); + SerializationHelper.writeLongArray(idMapping, out); + out.writeInt(kIndex.size()); + for (IntObjectPair<FeatureIndex> p : kIndex.keyValuesView()) { + out.writeInt(p.getOne()); + p.getTwo().writeToOutputStream(out); + } + } + + public static ConjunctionIndex fromInputStream(DataInputStream in) throws IOException { + int[] zList = SerializationHelper.readIntArray(in); + long[] idMapping = SerializationHelper.readLongArray(in); + int kIndexSize = in.readInt(); + IntObjectHashMap<FeatureIndex> kIndex = new IntObjectHashMap<>(kIndexSize); + for (int i = 0; i < kIndexSize; i++) { + int key = in.readInt(); + kIndex.put(key, FeatureIndex.fromInputStream(in)); + } + kIndex.compact(); + return new ConjunctionIndex(kIndex, zList, idMapping); + } + + public static class FeatureIndex { + // Maps a feature id to conjunction id + private final LongObjectMap<int[]> map; + + public FeatureIndex(LongObjectMap<int[]> map) { + this.map = map; + } + + public Optional<int[]> getConjunctionIdsForFeature(long featureId) { + return Optional.ofNullable(map.get(featureId)); + } + + public void writeToOutputStream(DataOutputStream out) throws IOException { + out.writeInt(map.size()); + for (LongObjectPair<int[]> p : map.keyValuesView()) { + out.writeLong(p.getOne()); + SerializationHelper.writeIntArray(p.getTwo(), out); + } + } + + public static FeatureIndex fromInputStream(DataInputStream in) throws IOException { + int mapSize = in.readInt(); + LongObjectHashMap<int[]> map = new LongObjectHashMap<>(mapSize); + for (int i = 0; i < mapSize; i++) { + long key = in.readLong(); + map.put(key, SerializationHelper.readIntArray(in)); + } + map.compact(); + return new FeatureIndex(map); + } + } + + public class Searcher { + private final byte[] iteratorsPerConjunction; + + private Searcher() { + this.iteratorsPerConjunction = new byte[idMapping.length]; + } + + /** + * Retrieves a list of hits for the given query. + * + * @param query Specifies the boolean variables that are true. + * @return List of hits + */ + public List<ConjunctionHit> search(PredicateQuery query) { + List<ConjunctionHit> conjunctionHits = new ArrayList<>(); + int uniqueKeys = (int) query.getFeatures().stream().map(e -> e.key).distinct().count(); + for (int k = uniqueKeys; k >= 0; k--) { + List<ConjunctionIdIterator> iterators = new ArrayList<>(); + getFeatureIndex(k) + .ifPresent(featureIndex -> addFeatureIterators(query, featureIndex, iterators)); + if (k == 0 && zList.length > 0) { + iterators.add(new ConjunctionIdIterator(SubqueryBitmap.ALL_SUBQUERIES, zList)); + } + if (!iterators.isEmpty()) { + calculateIteratorsPerConjunction(iterators); + findMatchingConjunctions(k, iterators, conjunctionHits, iteratorsPerConjunction); + } + } + return conjunctionHits; + } + + private void calculateIteratorsPerConjunction(List<ConjunctionIdIterator> iterators) { + Arrays.fill(iteratorsPerConjunction, (byte)0); + for (ConjunctionIdIterator iterator : iterators) { + for (int id : iterator.getConjunctionIds()) { + if (ConjunctionId.isPositive(id)) { + ++iteratorsPerConjunction[id >>> 1]; + } + } + } + } + + private Optional<FeatureIndex> getFeatureIndex(int k) { + return Optional.ofNullable(kIndex.get(k)); + } + + private void addFeatureIterators(PredicateQuery query, FeatureIndex featureIndex, List<ConjunctionIdIterator> iterators) { + query.getFeatures().stream() + .map(e -> toSingleTermIterator(e, featureIndex)) + .filter(Optional::isPresent) + .map(Optional::get) + .forEach(iterators::add); + } + + private Optional<ConjunctionIdIterator> toSingleTermIterator(PredicateQuery.Feature feature, FeatureIndex featureIndex) { + return featureIndex.getConjunctionIdsForFeature(feature.featureHash) + .map(conjunctions -> new ConjunctionIdIterator(feature.subqueryBitmap, conjunctions)); + } + + private void findMatchingConjunctions(int k, List<ConjunctionIdIterator> iterators, List<ConjunctionHit> matchingIds, byte[] iteratorsPerConjunction) { + if (k == 0) { + k = 1; + } + int nextId = getNextId(0, k, iteratorsPerConjunction); + if (nextId == -1) { + return; // no hits + } + + int nIterators = iterators.size(); + if (nIterators < k) { + return; // No hits + } + short[] sortedIndexes = new short[nIterators]; + short[] sortedIndexesMergeBuffer = new short[nIterators]; + for (short i = 0; i < nIterators; ++i) { + sortedIndexes[i] = i; + } + + int[] currentIds = new int[nIterators]; + int nCompleted = initializeIterators(iterators, sortedIndexes, currentIds, nextId); + nIterators -= nCompleted; + + while (nIterators >= k) { + int id0 = currentIds[sortedIndexes[0]]; + int idK = currentIds[sortedIndexes[k - 1]]; + + // There should be at least k iterators for conjunction. + if (ConjunctionId.equals(id0, idK)) { + long matchingSubqueries = SubqueryBitmap.ALL_SUBQUERIES; + // Find first positive conjunction + int firstPositive = 0; + while (firstPositive < nIterators && !ConjunctionId.isPositive(currentIds[sortedIndexes[firstPositive]])) { + // AND in the complement of the bitmap for negative conjunctions. + matchingSubqueries &= ~iterators.get(sortedIndexes[firstPositive]).getSubqueryBitmap(); + ++firstPositive; + } + if (firstPositive + k <= nIterators) { + // Verify that at there are k positive iterators for the current conjunction. + id0 = currentIds[sortedIndexes[firstPositive]]; + idK = currentIds[sortedIndexes[firstPositive + k - 1]]; + if (id0 == idK) { // We know that id0 is positive conjunction + for (int i = firstPositive; i < firstPositive + k; i++) { + matchingSubqueries &= iterators.get(sortedIndexes[i]).getSubqueryBitmap(); + } + if (matchingSubqueries != 0) { + matchingIds.add(new ConjunctionHit(toExternalId(id0), matchingSubqueries)); + } + } + } + } + + // Advance iterators to next conjunction. + nextId = getNextId(ConjunctionId.nextId(id0), k, iteratorsPerConjunction); + if (nextId == -1) { + return; + } + int completed = 0; + int i; + for (i = 0; i < nIterators; ++i) { + short index = sortedIndexes[i]; + if (ConjunctionId.compare(currentIds[index], nextId) < 0) { + ConjunctionIdIterator iterator = iterators.get(index); + if (iterator.next(nextId)) { + currentIds[index] = iterator.getConjunctionId(); + } else { + currentIds[index] = Integer.MAX_VALUE; + ++completed; + } + } else { + break; + } + } + if (i > 0 && nIterators - completed >= k) { + boolean swapMergeBuffer = + PrimitiveArraySorter.sortAndMerge(sortedIndexes, sortedIndexesMergeBuffer, i, nIterators, + (a, b) -> Integer.compare(currentIds[a], currentIds[b])); + if (swapMergeBuffer) { + short[] temp = sortedIndexes; + sortedIndexes = sortedIndexesMergeBuffer; + sortedIndexesMergeBuffer = temp; + } + } + nIterators -= completed; + } + } + + private int initializeIterators(List<ConjunctionIdIterator> iterators, short[] sortedIndexes, int[] currentIds, int nextId) { + int nCompleted = 0; + int nIterators = iterators.size(); + for (int i = 0; i < nIterators; i++) { + ConjunctionIdIterator iterator = iterators.get(i); + if (iterator.next(nextId)) { + currentIds[i] = iterator.getConjunctionId(); + } else { + currentIds[i] = Integer.MAX_VALUE; + ++nCompleted; + + } + } + PrimitiveArraySorter.sort(sortedIndexes, (a, b) -> Integer.compare(currentIds[a], currentIds[b])); + return nCompleted; + } + + private int getNextId(int fromId, int k, byte[] iteratorsPerConjunction) { + int id = fromId >>> 1; + int nDocuments = iteratorsPerConjunction.length; + while (id < nDocuments && iteratorsPerConjunction[id] < k) { + ++id; + } + return id == nDocuments ? -1 : ((id << 1) | 1); + + } + + private long toExternalId(int internalId) { + return idMapping[internalId >>> 1]; + } + } +} |