aboutsummaryrefslogtreecommitdiffstats
path: root/container-search/src/main/java/com/yahoo/search/grouping/UniqueGroupingSearcher.java
blob: 6d2b416700a2fb2608e9879d7c291847f34a2660 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.search.grouping;

import com.yahoo.component.chain.dependencies.After;
import com.yahoo.component.chain.dependencies.Before;
import java.util.logging.Level;
import com.yahoo.processing.request.CompoundName;
import com.yahoo.search.Query;
import com.yahoo.search.Result;
import com.yahoo.search.Searcher;
import com.yahoo.search.grouping.request.AllOperation;
import com.yahoo.search.grouping.request.AttributeValue;
import com.yahoo.search.grouping.request.CountAggregator;
import com.yahoo.search.grouping.request.EachOperation;
import com.yahoo.search.grouping.request.GroupingExpression;
import com.yahoo.search.grouping.request.GroupingOperation;
import com.yahoo.search.grouping.request.MaxAggregator;
import com.yahoo.search.grouping.request.MinAggregator;
import com.yahoo.search.grouping.request.NegFunction;
import com.yahoo.search.grouping.request.SummaryValue;
import com.yahoo.search.grouping.result.Group;
import com.yahoo.search.grouping.result.GroupList;
import com.yahoo.search.query.Sorting;
import com.yahoo.search.result.Hit;
import com.yahoo.search.result.HitOrderer;
import com.yahoo.search.searchchain.Execution;
import com.yahoo.search.searchchain.PhaseNames;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.logging.Logger;

/**
 * Implements 'unique' using a grouping expression.
 *
 * It doesn't work for multi-level sorting.
 *
 * @author andreer
 */
@After(PhaseNames.RAW_QUERY)
@Before(PhaseNames.TRANSFORMED_QUERY)
public class UniqueGroupingSearcher extends Searcher {

    public static final CompoundName PARAM_UNIQUE = CompoundName.from("unique");
    private static final Logger log = Logger.getLogger(UniqueGroupingSearcher.class.getName());
    private static final HitOrderer NOP_ORDERER = new HitOrderer() {

        @Override
        public void order(List<Hit> hits) {
            // The order of hits is given by the grouping framework, and should not be re-ordered when we copy the hits
            // from the groups to the base HitGroup in the result.
        }
    };
    static final String LABEL_COUNT = "uniqueCount";
    static final String LABEL_GROUPS = "uniqueGroups";
    static final String LABEL_HITS = "uniqueHits";

    /**
     * Implements the deprecated "unique" api for deduplication by using grouping. We create a grouping expression on
     * the field we wish to dedup on (which must be an attribute).
     * Total hits is calculated using the new count unique groups functionality.
     */
    @Override
    public Result search(Query query, Execution execution) {
        // Determine if we should remove duplicates
        String unique = query.properties().getString(PARAM_UNIQUE);
        if (unique == null || unique.trim().isEmpty()) {
            return execution.search(query);
        }
        query.trace("Performing deduping by attribute '" + unique + "'.", true, 3);
        return dedupe(query, execution, unique);
    }

    /**
     * Until we can use the grouping pagination features in 5.1, we'll have to support offset
     * by simply requesting and discarding hit #0 up to hit #offset.
     */
    private static Result dedupe(Query query, Execution execution, String dedupField) {
        Sorting sorting = query.getRanking().getSorting();
        if (sorting != null && sorting.fieldOrders().size() > 1) {
            query.trace("Can not use grouping for deduping with multi-level sorting.", 3);
            // To support this we'd have to generate a grouping expression with as many levels
            // as there are levels in the sort spec. This is probably too slow and costly that
            // we'd ever want to actually use it (and a bit harder to implement as well).
            return execution.search(query);
        }

        int hits = query.getHits();
        int offset = query.getOffset();
        int groupingHits = hits + offset;

        GroupingRequest groupingRequest = GroupingRequest.newInstance(query);
        groupingRequest.setRootOperation(
                buildGroupingExpression(
                        dedupField,
                        groupingHits,
                        query.getPresentation().getSummary(),
                        sorting));

        query.setHits(0);
        query.setOffset(0);
        Result result = execution.search(query);

        query = result.getQuery(); // query could have changed further down in the chain
        query.setHits(hits);
        query.setOffset(offset);

        Group root = groupingRequest.getResultGroup(result);
        if (null == root) {
            String msg = "Result group not found for deduping grouping request, returning empty result.";
            query.trace(msg, 3);
            log.log(Level.WARNING, msg);
            throw new IllegalStateException("Failed to produce deduped result set.");
        }
        result.hits().remove(root.getId().toString()); // hide our tracks

        GroupList resultGroups = root.getGroupList(dedupField);
        if (resultGroups == null) {
            query.trace("Deduping grouping request returned no hits, returning empty result.", 3);
            return result;
        }

        // Make sure that .addAll() doesn't re-order the hits we copy from the grouping
        // framework. The groups are already in the order they should be.
        result.hits().setOrderer(NOP_ORDERER);
        result.hits().addAll(getRequestedHits(resultGroups, offset, hits));

        Long countField = (Long) root.getField(LABEL_COUNT);
        long count = countField != null ? countField : 0;
        result.setTotalHitCount(count);

        return result;
    }

    /**
     * Create a hit ordering clause based on the sorting spec.
     *
     * @param sortingSpec A (single level!) sorting specification
     * @return a grouping expression which produces a sortable value
     */
    private static List<GroupingExpression> createHitOrderingClause(Sorting sortingSpec) {
        List<GroupingExpression> orderingClause = new ArrayList<>();
        for (Sorting.FieldOrder fieldOrder : sortingSpec.fieldOrders()) {
            Sorting.Order sortOrder = fieldOrder.getSortOrder();
            switch (sortOrder) {
                case ASCENDING, UNDEFINED ->
                    // When we want ascending order, the hit with the smallest value should come first (and be surfaced).
                        orderingClause.add(new MinAggregator(new AttributeValue(fieldOrder.getFieldName())));
                case DESCENDING ->
                    // When we sort in descending order, the hit with the largest value should come first (and be surfaced).
                        orderingClause.add(new NegFunction(new MaxAggregator(new AttributeValue(fieldOrder.getFieldName()))));
                default -> throw new UnsupportedOperationException("Can not handle sort order " + sortOrder + ".");
            }
        }
        return orderingClause;
    }

    /**
     * Create a hit ordering clause based on the sorting spec.
     *
     * @param sortingSpec A (single level!) sorting specification
     * @return a grouping expression which produces a sortable value
     */
    private static GroupingExpression createGroupOrderingClause(Sorting sortingSpec) {
        GroupingExpression groupingClause = null;
        for (Sorting.FieldOrder fieldOrder : sortingSpec.fieldOrders()) {
            Sorting.Order sortOrder = fieldOrder.getSortOrder();
            groupingClause = switch (sortOrder) {
                case ASCENDING, UNDEFINED -> new AttributeValue(fieldOrder.getFieldName());
                case DESCENDING ->
                    // To sort descending, just take the negative. This is the most common case
                        new NegFunction(new AttributeValue(fieldOrder.getFieldName()));
                default -> throw new UnsupportedOperationException("Can not handle sort order " + sortOrder + ".");
            };
        }
        return groupingClause;
    }

    /**
     * Retrieve the actually unique hits from the grouping results.
     *
     * @param resultGroups the results of the dedup grouping expression.
     * @param offset       the requested offset. Hits before this are discarded.
     * @param hits         the requested number of hits. Hits in excess of this are discarded.
     * @return A list of the actually requested hits, sorted as by the grouping expression.
     */
    private static List<Hit> getRequestedHits(GroupList resultGroups, int offset, int hits) {
        List<Hit> receivedHits = getAllHitsFromGroupingResult(resultGroups);
        if (receivedHits.size() <= offset) {
            return Collections.emptyList(); // There weren't any hits as far out as requested.
        }
        int lastRequestedHit = Math.min(offset + hits, receivedHits.size());
        return receivedHits.subList(offset, lastRequestedHit);
    }

    /**
     * Get all the hits returned by the grouping request. This might be more or less than the user requested.
     * This method handles the results from two different types of grouping expression, depending on whether
     * sorting was used for the query or not.
     *
     * @param resultGroups The result group of the dedup grouping request
     * @return A (correctly sorted) list of all the hits returned by the grouping expression.
     */
    private static List<Hit> getAllHitsFromGroupingResult(GroupList resultGroups) {
        List<Hit> hits = new ArrayList<>(resultGroups.size());
        for (Hit groupHit : resultGroups) {
            Group group = (Group)groupHit;
            GroupList sorted = group.getGroupList(LABEL_GROUPS);
            if (sorted != null) {
                group = (Group)sorted.iterator().next();
            }
            for (Hit hit : group.getHitList(LABEL_HITS)) {
                hits.add(hit);
            }
        }
        return hits;
    }

    static GroupingOperation buildGroupingExpression(String dedupField, int groupingHits, String summaryClass,
                                                     Sorting sortSpec) {
        if (sortSpec != null) {
            return buildGroupingExpressionWithSorting(dedupField, groupingHits, summaryClass, sortSpec);
        } else {
            return buildGroupingExpressionWithRanking(dedupField, groupingHits, summaryClass);
        }
    }

    /**
     * Create the grouping expression when ranking is used for ordering
     * (which is the default for grouping expressions, so ranking is not explicitly mentioned).
     * See unit test for examples
     */
    private static GroupingOperation buildGroupingExpressionWithRanking(String dedupField, int groupingHits,
                                                                        String summaryClass) {
        return new AllOperation()
            .setGroupBy(new AttributeValue(dedupField))
            .addOutput(new CountAggregator().setLabel(LABEL_COUNT))
            .setMax(groupingHits)
            .addChild(new EachOperation()
                .setMax(1)
                .addChild(new EachOperation()
                    .setLabel(LABEL_HITS)
                    .addOutput(summaryClass == null ? new SummaryValue() : new SummaryValue(summaryClass))));
    }

    /**
     * Create the grouping expression when sorting is used for ordering
     * This grouping expression is more complicated and probably quite a bit heavier to execute.
     * See unit test for examples
     */
    private static GroupingOperation buildGroupingExpressionWithSorting(String dedupField, int groupingHits,
                                                                        String summaryClass, Sorting sortSpec) {
        return new AllOperation()
            .setGroupBy(new AttributeValue(dedupField))
            .addOutput(new CountAggregator().setLabel(LABEL_COUNT))
            .setMax(groupingHits)
            .addOrderBy(createHitOrderingClause(sortSpec))
            .addChild(new EachOperation()
                .addChild(new AllOperation()
                    .setGroupBy(createGroupOrderingClause(sortSpec))
                    .addOrderBy(createHitOrderingClause(sortSpec))
                    .setMax(1)
                    .addChild(new EachOperation()
                        .setLabel(LABEL_GROUPS)
                        .addChild(new EachOperation()
                            .setLabel(LABEL_HITS)
                            .addOutput(summaryClass == null ? new SummaryValue() : new SummaryValue(summaryClass))))));
    }

}