diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /predicate-search/src/main/java/com/yahoo/search/predicate/PredicateIndexBuilder.java |
Publish
Diffstat (limited to 'predicate-search/src/main/java/com/yahoo/search/predicate/PredicateIndexBuilder.java')
-rw-r--r-- | predicate-search/src/main/java/com/yahoo/search/predicate/PredicateIndexBuilder.java | 269 |
1 files changed, 269 insertions, 0 deletions
diff --git a/predicate-search/src/main/java/com/yahoo/search/predicate/PredicateIndexBuilder.java b/predicate-search/src/main/java/com/yahoo/search/predicate/PredicateIndexBuilder.java new file mode 100644 index 00000000000..84940e54c02 --- /dev/null +++ b/predicate-search/src/main/java/com/yahoo/search/predicate/PredicateIndexBuilder.java @@ -0,0 +1,269 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.search.predicate; + +import com.google.common.annotations.Beta; +import com.google.common.base.Preconditions; +import com.google.common.primitives.Bytes; +import com.google.common.primitives.Ints; +import com.google.common.primitives.Shorts; +import com.yahoo.document.predicate.BooleanPredicate; +import com.yahoo.document.predicate.Predicate; +import com.yahoo.search.predicate.annotator.PredicateTreeAnnotations; +import com.yahoo.search.predicate.annotator.PredicateTreeAnnotator; +import com.yahoo.search.predicate.index.Feature; +import com.yahoo.search.predicate.index.Interval; +import com.yahoo.search.predicate.index.IntervalWithBounds; +import com.yahoo.search.predicate.index.Posting; +import com.yahoo.search.predicate.index.PredicateIntervalStore; +import com.yahoo.search.predicate.index.PredicateOptimizer; +import com.yahoo.search.predicate.index.SimpleIndex; +import com.yahoo.search.predicate.index.conjunction.ConjunctionIndexBuilder; +import com.yahoo.search.predicate.index.conjunction.IndexableFeatureConjunction; + +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeMap; + +import static java.util.stream.Collectors.joining; +import static java.util.stream.Collectors.toList; + +/** + * A builder for {@link PredicateIndex}. + * <p> + * When creating a PredicateIndexBuilder, you must specify an arity. This is used for + * range features, and is a trade-off of index size vs. query speed. Higher + * arities gives larger index but faster search. + * </p> + * <p> + * {@link #indexDocument(int, Predicate)} + * takes a document id and a predicate to insert into the index. + * Predicates should be specified using the predicate syntax described in the documentation. + * Create the {@link Predicate} objects using {@link Predicate#fromString(String)}. + * </p> + * <p> + * Use {@link #build()} to create an instance of {@link PredicateIndex}. + * </p> + * @author bjorncs + */ +@Beta +public class PredicateIndexBuilder { + + // Unique ids / mapping from internal to external id. LinkedHashSet as the insertion order is crucial. + private final Set<Integer> seenIds = new LinkedHashSet<>(); + private final List<Short> intervalEndsBuilder = new ArrayList<>(); + private final List<Byte> minFeatureIndexBuilder = new ArrayList<>(); + private final List<Integer> zeroConstraintDocuments = new ArrayList<>(); + private final SimpleIndex.Builder intervalIndexBuilder = new SimpleIndex.Builder(); + private final SimpleIndex.Builder boundsIndexBuilder = new SimpleIndex.Builder(); + private final SimpleIndex.Builder conjunctionIntervalIndexBuilder = new SimpleIndex.Builder(); + private final ConjunctionIndexBuilder conjunctionIndexBuilder = new ConjunctionIndexBuilder(); + private final PredicateIntervalStore.Builder intervalStoreBuilder; + private final PredicateOptimizer optimizer; + private final Config config; + private int documentIdCounter = 0; + private int nZStarDocuments = 0; + private int nZStarIntervals = 0; + private int highestIntervalEnd = 1; + + /** + * Creates a PredicateIndexBuilder with default upper and lower bounds. + * + * @param arity The arity to use when indexing range predicates. + * Small arity gives smaller index, but more expensive searches. + */ + public PredicateIndexBuilder(int arity) { + this(new Config.Builder().setArity(arity).build()); + } + + /** + * Creates a PredicateIndexBuilder. + * Limiting the range of possible values in range predicates reduces index size + * and increases search performance. + * + * @param arity The arity to use when indexing range predicates. + * Small arity gives smaller index, but more expensive searches. + * @param lowerBound The lower bound for the range of values used by range predicates. + * @param upperBound The upper bound for the range of values used by range predicates. + */ + public PredicateIndexBuilder(int arity, long lowerBound, long upperBound) { + this(new Config.Builder().setArity(arity).setLowerBound(lowerBound).setUpperBound(upperBound).build()); + } + + /** + * Creates a PredicateIndexBuilder based on a Config object. + * + * @param config Configuration for the PredicateIndexBuilder. + */ + public PredicateIndexBuilder(Config config) { + this.config = config; + this.optimizer = new PredicateOptimizer(config); + this.intervalStoreBuilder = new PredicateIntervalStore.Builder(); + } + + /** + * Indexes a predicate with the given id. + * + * @param docId A 32-bit document id, returned in the Hit objects when the predicate matches. + * @param predicate The predicate to index. + */ + public void indexDocument(int docId, Predicate predicate) { + if (documentIdCounter == Integer.MAX_VALUE) { + throw new IllegalStateException("Index is full, max number of documents is: " + Integer.MAX_VALUE); + } else if (seenIds.contains(docId)) { + throw new IllegalArgumentException("Document id is already in use: " + docId); + } else if (isNeverMatchingDocument(predicate)) { + return; + } + seenIds.add(docId); + predicate = optimizer.optimizePredicate(predicate); + int internalId = documentIdCounter++; + if (isAlwaysMatchingDocument(predicate)) { + indexZeroConstraintDocument(internalId); + } else { + indexDocument(internalId, PredicateTreeAnnotator.createPredicateTreeAnnotations(predicate)); + } + } + + private static boolean isAlwaysMatchingDocument(Predicate p) { + return p instanceof BooleanPredicate && ((BooleanPredicate) p).getValue(); + } + + private static boolean isNeverMatchingDocument(Predicate p) { + return p instanceof BooleanPredicate && !((BooleanPredicate) p).getValue(); + } + + private void indexZeroConstraintDocument(int docId) { + minFeatureIndexBuilder.add((byte) 0); + intervalEndsBuilder.add((short) Interval.ZERO_CONSTRAINT_RANGE); + zeroConstraintDocuments.add(docId); + } + + private void indexDocument(int docId, PredicateTreeAnnotations annotations) { + int minFeature = annotations.minFeature; + Preconditions.checkState(minFeature <= 0xFF, + "Predicate is too complex. Expected min-feature less than %d, was %d.", 0xFF, minFeature); + int intervalEnd = annotations.intervalEnd; + Preconditions.checkState(intervalEnd <= Interval.MAX_INTERVAL_END, + "Predicate is too complex. Expected min-feature less than %d, was %d.", + Interval.MAX_INTERVAL_END, intervalEnd); + highestIntervalEnd = Math.max(highestIntervalEnd, intervalEnd); + intervalEndsBuilder.add((short) intervalEnd); + minFeatureIndexBuilder.add((byte) minFeature); + indexDocumentFeatures(docId, annotations.intervalMap); + indexDocumentBoundsFeatures(docId, annotations.boundsMap); + indexDocumentConjunctions(docId, annotations.featureConjunctions); + aggregateZStarStatistics(annotations.intervalMap); + } + + private void aggregateZStarStatistics(Map<Long, List<Integer>> intervalMap) { + List<Integer> intervals = intervalMap.get(Feature.Z_STAR_COMPRESSED_ATTRIBUTE_HASH); + if (intervals != null) { + ++nZStarDocuments; + nZStarIntervals += intervals.size(); + } + } + + private void indexDocumentFeatures(int docId, Map<Long, List<Integer>> intervalMap) { + intervalMap.entrySet().stream() + .forEach(entry -> intervalIndexBuilder.insert(entry.getKey(), + new Posting(docId, + intervalStoreBuilder.insert(entry.getValue())))); + } + + private void indexDocumentBoundsFeatures(int docId, Map<Long, List<IntervalWithBounds>> boundsMap) { + boundsMap.entrySet().stream() + .forEach(entry -> boundsIndexBuilder.insert(entry.getKey(), + new Posting(docId, + intervalStoreBuilder.insert( + entry.getValue().stream().flatMap(IntervalWithBounds::stream).collect(toList()))))); + } + + private void indexDocumentConjunctions( + int docId, Map<IndexableFeatureConjunction, List<Integer>> featureConjunctions) { + for (Map.Entry<IndexableFeatureConjunction, List<Integer>> e : featureConjunctions.entrySet()) { + IndexableFeatureConjunction fc = e.getKey(); + List<Integer> intervals = e.getValue(); + Posting posting = new Posting(docId, intervalStoreBuilder.insert(intervals)); + conjunctionIntervalIndexBuilder.insert(fc.id, posting); + conjunctionIndexBuilder.indexConjunction(fc); + } + } + + public PredicateIndex build() { + return new PredicateIndex( + config, + Ints.toArray(seenIds), + Bytes.toArray(minFeatureIndexBuilder), + Shorts.toArray(intervalEndsBuilder), + highestIntervalEnd, + intervalIndexBuilder.build(), + boundsIndexBuilder.build(), + conjunctionIntervalIndexBuilder.build(), + intervalStoreBuilder.build(), + conjunctionIndexBuilder.build(), + Ints.toArray(zeroConstraintDocuments) + ); + } + + public int getZeroConstraintDocCount() { + return zeroConstraintDocuments.size(); + } + + /** + * Retrieve metrics about the current index. + * @return An object containing metrics. + */ + public PredicateIndexStats getStats() { + return new PredicateIndexStats(zeroConstraintDocuments, intervalIndexBuilder, + boundsIndexBuilder, intervalStoreBuilder, conjunctionIndexBuilder, nZStarDocuments, nZStarIntervals); + } + + /** + * A collection of metrics about the currently built {@link PredicateIndex}. + */ + public static class PredicateIndexStats { + private final Map<String, Object> metrics = new TreeMap<>(); + + public PredicateIndexStats( + List<Integer> zeroConstraintDocuments, + SimpleIndex.Builder intervalIndex, + SimpleIndex.Builder boundsIndex, + PredicateIntervalStore.Builder intervalStore, + ConjunctionIndexBuilder conjunctionIndex, + int nZStarDocuments, + int nZStarIntervals) { + Map<Integer, Integer> intervalStoreEntries = intervalStore.getEntriesForSize(); + metrics.put("Zero-constraint documents", zeroConstraintDocuments.size()); + metrics.put("Interval index keys", intervalIndex.getKeyCount()); + metrics.put("Interval index entries", intervalIndex.getEntryCount()); + metrics.put("Bounds index keys", boundsIndex.getKeyCount()); + metrics.put("Bounds index entries", boundsIndex.getEntryCount()); + metrics.put("Conjunction index feature count", conjunctionIndex.calculateFeatureCount()); + metrics.put("Conjunction index unique conjunction count", conjunctionIndex.getUniqueConjunctionCount()); + metrics.put("Conjunction index conjunction count", conjunctionIndex.getConjunctionsSeen()); + metrics.put("Conjunction index Z list size", conjunctionIndex.getZListSize()); + metrics.put("Interval store cache hits", intervalStore.getCacheHits()); + metrics.put("Interval store insert count", intervalStore.getTotalInserts()); + metrics.put("Interval store interval count", intervalStore.getNumberOfIntervals()); + metrics.put("Documents with ZStar intervals", nZStarDocuments); + metrics.put("Total ZStar intervals", nZStarIntervals); + intervalStoreEntries.entrySet().stream() + .filter(entry -> entry.getKey() != 0) + .forEach(entry -> metrics.put("Size " + entry.getKey() + " intervals", entry.getValue())); + } + + public void putValues(Map<String, Object> valueMap) { + valueMap.putAll(metrics); + } + + @Override + public String toString() { + return metrics.entrySet().stream() + .map(e -> String.format("%50s: %s", e.getKey(), e.getValue())) + .collect(joining("\n")); + } + } +} |