aboutsummaryrefslogtreecommitdiffstats
path: root/predicate-search/src/main/java/com/yahoo/search/predicate/utils/TargetingQueryFileConverter.java
blob: fefa35a0e1ca773a659aed514c4710626fa3bd47 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.search.predicate.utils;

import com.google.common.net.UrlEscapers;
import com.yahoo.search.predicate.PredicateQuery;
import com.yahoo.search.predicate.serialization.PredicateQuerySerializer;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.nio.charset.StandardCharsets;
import java.util.*;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Stream;

import static java.util.stream.Collectors.joining;

/**
 * Converts a targeting query (the format provided by targeting team) into a file of Vespa queries formatted as URLs.
 *
 * The format is the following:
 * - Each line represents one bulk query (up to 64 subqueries)
 * - Each bulk query has a set of subqueries separated by ";"
 * - Each subquery is of the format: attrName\tattrValue\tsubqueryIndex\tisRangeTerm;
 * - Some attributes have no value.
 * - Value may contain ";"
 *
 * @author bjorncs
 */
public class TargetingQueryFileConverter {

    // Subqueries whose total number of terms (values plus ranges) exceeds this limit are dropped
    // by filterOutHugeSubqueries.
    private static final int MAX_NUMBER_OF_TERMS = 100;

    // The file formats that writeSubqueriesToFile can produce.
    private enum OutputFormat {JSON, YQL}

    // Not instantiable: the class only consists of static methods.
    private TargetingQueryFileConverter() {}

    /**
     * Converts the hard-coded input file into one JSON and one YQL query file.
     *
     * Reads at most {@code nQueries} lines, drops oversized subqueries, batches the rest and
     * writes the result in both output formats. The command line arguments are not used.
     *
     * @param args ignored
     * @throws IOException if the input cannot be read or an output file cannot be written
     */
    public static void main(String[] args) throws IOException {
        final int batchFactor = 64;
        final int nQueries = 123042;
        // Both output files share the same "<batch>b-<count>n.txt" suffix.
        final String outputSuffix = batchFactor + "b-" + nQueries + "n.txt";

        Subqueries parsed = parseRiseQueries(new File("test-data/rise-query2.txt"), nQueries);
        filterOutHugeSubqueries(parsed);
        List<Query> batched = batchSubqueries(parsed, batchFactor);

        File jsonOutput = new File("test-data/targeting-queries-json-" + outputSuffix);
        File yqlOutput = new File("test-data/targeting-queries-yql-" + outputSuffix);
        writeSubqueriesToFile(batched, jsonOutput, OutputFormat.JSON);
        writeSubqueriesToFile(batched, yqlOutput, OutputFormat.YQL);
    }


    /**
     * Writes the given queries to a file, one query per line, in the requested format.
     *
     * The file is always encoded as UTF-8 so that the output does not depend on the default
     * charset of the machine running the conversion. An existing file is overwritten.
     *
     * @param queries the queries to write
     * @param output the file to create or overwrite
     * @param outputFormat {@link OutputFormat#JSON} selects JSON output; any other value selects YQL
     * @throws IOException if the file cannot be opened or written
     */
    private static void writeSubqueriesToFile(List<Query> queries, File output, OutputFormat outputFormat)
            throws IOException {
        try (BufferedWriter writer = new BufferedWriter(
                new OutputStreamWriter(new FileOutputStream(output), StandardCharsets.UTF_8))) {
            if (outputFormat == OutputFormat.JSON) {
                writeJSONOutput(writer, queries);
            } else {
                writeYQLOutput(writer, queries);
            }
        }
    }

    /**
     * Serializes every query to JSON and writes it as one line each.
     *
     * @param writer destination for the JSON lines
     * @param queries the queries to serialize, written in list order
     * @throws IOException if writing fails
     */
    private static void writeJSONOutput(BufferedWriter writer, List<Query> queries) throws IOException {
        // A single serializer instance is reused for all queries.
        PredicateQuerySerializer jsonSerializer = new PredicateQuerySerializer();
        for (Query current : queries) {
            writer.append(jsonSerializer.toJSON(toPredicateQuery(current)));
            writer.append('\n');
        }
    }

    /**
     * Builds a {@link PredicateQuery} from the features of the given query.
     *
     * String-valued features are added first, then range features; each one is registered
     * together with the subquery bitmap it is grouped under.
     *
     * @param query the batched query to convert
     * @return a new predicate query holding all features of {@code query}
     */
    private static PredicateQuery toPredicateQuery(Query query) {
        PredicateQuery result = new PredicateQuery();
        query.valuesForSubqueries.forEach((bitmap, features) ->
                features.forEach(feature -> result.addFeature(feature.key, feature.strValue, bitmap)));
        query.rangesForSubqueries.forEach((bitmap, features) ->
                features.forEach(feature -> result.addRangeFeature(feature.key, feature.longValue, bitmap)));
        return result;
    }

    /**
     * Writes every query as a YQL search URL, one URL per line.
     *
     * @param writer destination for the URL lines
     * @param queries the queries to format, written in list order
     * @throws IOException if writing fails
     */
    private static void writeYQLOutput(BufferedWriter writer, List<Query> queries) throws IOException {
        for (Query current : queries) {
            String urlLine = toYqlString(current);
            writer.append(urlLine);
            writer.append('\n');
        }
    }

    /**
     * Formats a query as a search request path whose {@code yql} parameter holds a predicate query.
     *
     * The YQL statement passes the value map and the range map of the query to {@code predicate(...)}
     * and is URL form-parameter escaped before it is appended to the path.
     *
     * @param query the batched query to format
     * @return the request path and query string for this query
     */
    private static String toYqlString(Query query)  {
        String valueMap = createYqlFormatSubqueryMapString(query.valuesForSubqueries, query.isSingleQuery);
        String rangeMap = createYqlFormatSubqueryMapString(query.rangesForSubqueries, query.isSingleQuery);
        String yql = "select * from sources * where predicate(boolean, " + valueMap + ", " + rangeMap + ");";
        return "/search/?query&nocache&yql=" + UrlEscapers.urlFormParameterEscaper().escape(yql);
    }

    /**
     * Groups the parsed subqueries into batched queries.
     *
     * The subqueryBatchFactor determines the batch factor for each query. A maximum of 64 queries can be
     * batched into a single query (as subqueries), since each subquery occupies one bit of a long bitmap.
     *      0 => Do not batch and output plain queries (no subquery).
     *      1 => Do not batch, but output queries with single subquery.
     * Negative values are handled like 1.
     *
     * @param subqueries the parsed subqueries; they are consumed in ascending id order
     * @param subqueryBatchFactor number of subqueries to put into each query, at most 64
     * @return one query per batch, in the order the batches were formed
     * @throws IllegalArgumentException if {@code subqueryBatchFactor} is larger than 64
     */
    private static List<Query> batchSubqueries(Subqueries subqueries, int subqueryBatchFactor) {
        // A batch position above 63 would make "1L << position" wrap around and silently reuse the
        // bit of another subquery, so such batch factors are rejected up front.
        if (subqueryBatchFactor > Long.SIZE) {
            throw new IllegalArgumentException(
                    "subqueryBatchFactor must be at most " + Long.SIZE + ", got " + subqueryBatchFactor);
        }
        Iterator<Integer> iterator = subqueries.subqueries.iterator();
        List<Query> result = new ArrayList<>();
        while (iterator.hasNext()) {
            // Aggregate the subqueries that contains a given value: feature -> bitmap of batch positions.
            Map<Feature, Long> subqueriesForValue = new TreeMap<>();
            Map<Feature, Long> subqueriesForRange = new TreeMap<>();
            // Batch single to single subquery for batch factor 0.
            for (int i = 0; i < Math.max(1, subqueryBatchFactor) && iterator.hasNext(); ++i) {
                Integer subquery = iterator.next();
                registerSubqueryValues(i, subqueries.valuesForSubquery.get(subquery), subqueriesForValue);
                registerSubqueryValues(i, subqueries.rangesForSubquery.get(subquery), subqueriesForRange);
            }

            // Aggregate the values that are contained in a given set of subqueries: bitmap -> features.
            Query query = new Query(subqueryBatchFactor == 0);
            simplifyAndFillQueryValues(query.valuesForSubqueries, subqueriesForValue);
            simplifyAndFillQueryValues(query.rangesForSubqueries, subqueriesForRange);
            result.add(query);
        }
        return result;
    }

    /**
     * Marks every given feature as being part of the subquery at the given batch position.
     *
     * @param subquery position of the subquery within its batch; selects the bit to set
     * @param values the features of the subquery, or null if it has none (then nothing happens)
     * @param subqueriesForValue maps each feature to the bitmap of batch positions containing it; updated in place
     */
    private static void registerSubqueryValues(int subquery, Set<Feature> values, Map<Feature, Long> subqueriesForValue) {
        if (values == null) {
            return;
        }
        long subqueryBit = 1L << subquery;
        for (Feature feature : values) {
            // OR the bit into the bitmap already registered for this feature, if any.
            subqueriesForValue.merge(feature, subqueryBit, (existing, added) -> existing | added);
        }
    }

    /**
     * Inverts a feature-to-bitmap mapping into a bitmap-to-features mapping.
     *
     * @param queryValues receives, for each subquery bitmap, the set of features having that bitmap; updated in place
     * @param subqueriesForValue maps each feature to its subquery bitmap
     */
    private static void simplifyAndFillQueryValues(Map<Long, Set<Feature>> queryValues, Map<Feature, Long> subqueriesForValue) {
        subqueriesForValue.forEach((feature, subqueryBitmap) ->
                queryValues.computeIfAbsent(subqueryBitmap, bitmap -> new HashSet<>()).add(feature));
    }

    /**
     * Formats a bitmap-to-features mapping as a YQL map literal.
     *
     * For a single query the features of all entries are listed directly inside one pair of braces.
     * Otherwise each entry becomes {@code "0x<bitmap in hex>":{<features>}}.
     *
     * @param subqueriesForString features grouped by subquery bitmap
     * @param isSingleQuery whether to omit the subquery bitmaps
     * @return the map literal, {@code {}} when the mapping is empty
     */
    private static String createYqlFormatSubqueryMapString(Map<Long, Set<Feature>> subqueriesForString, boolean isSingleQuery) {
        StringJoiner entries = new StringJoiner(", ", "{", "}");
        for (Map.Entry<Long, Set<Feature>> entry : subqueriesForString.entrySet()) {
            StringJoiner features = isSingleQuery
                    ? new StringJoiner(", ")
                    : new StringJoiner(", ", "{", "}");
            for (Feature feature : entry.getValue()) {
                features.add(feature.asYqlString());
            }
            if (isSingleQuery) {
                entries.add(features.toString());
            } else {
                entries.add("\"0x" + Long.toHexString(entry.getKey()) + "\":" + features);
            }
        }
        return entries.toString();
    }

    /**
     * Parses the first lines of a targeting query file into subqueries.
     *
     * The file is decoded as UTF-8 regardless of the platform default charset. Lines are numbered
     * from 1, and that number is passed on as the query id of the line. Read failures are reported
     * as {@link IOException}, as declared, rather than wrapped in an unchecked exception.
     *
     * @param riseQueryFile the file to read, one bulk query per line
     * @param maxQueries maximum number of lines to parse; must not be negative
     * @return the subqueries of all parsed lines
     * @throws IOException if the file cannot be opened or read
     * @throws IllegalArgumentException if {@code maxQueries} is negative
     */
    private static Subqueries parseRiseQueries(File riseQueryFile, int maxQueries) throws IOException {
        if (maxQueries < 0) {
            throw new IllegalArgumentException("maxQueries must not be negative, got " + maxQueries);
        }
        try (BufferedReader reader = new BufferedReader(
                new InputStreamReader(new FileInputStream(riseQueryFile), StandardCharsets.UTF_8))) {
            Subqueries parsedSubqueries = new Subqueries();
            String riseQuery;
            // The bound is checked first so that no line beyond the limit is read.
            for (int queryId = 1; queryId <= maxQueries && (riseQuery = reader.readLine()) != null; ++queryId) {
                parseRiseQuery(parsedSubqueries, riseQuery, queryId);
            }
            return parsedSubqueries;
        }
    }

    /**
     * Removes every subquery whose number of values plus ranges exceeds {@code MAX_NUMBER_OF_TERMS}.
     *
     * A removed subquery disappears from the id set and from both feature maps.
     *
     * @param subqueries the parsed subqueries; modified in place
     */
    private static void filterOutHugeSubqueries(Subqueries subqueries) {
        subqueries.subqueries.removeIf(subqueryId -> {
            Set<Feature> values = subqueries.valuesForSubquery.get(subqueryId);
            Set<Feature> ranges = subqueries.rangesForSubquery.get(subqueryId);
            // A subquery may lack either kind of feature; a missing set counts as zero terms.
            int termCount = (values != null ? values.size() : 0) + (ranges != null ? ranges.size() : 0);
            if (termCount <= MAX_NUMBER_OF_TERMS) {
                return false;
            }
            subqueries.valuesForSubquery.remove(subqueryId);
            subqueries.rangesForSubquery.remove(subqueryId);
            return true;
        });
    }

    /**
     * Parses one line of the input file and registers its terms in {@code subqueries}.
     *
     * The line is a sequence of terms of the form {@code key\tvalue\tsubqueryIndex\tisRangeTerm},
     * each followed by ";" except possibly the last. The value may be empty. Key, value and index
     * are read with tab as the only delimiter, so a ";" inside them is not treated as a separator.
     *
     * @param subqueries receives the parsed features and subquery ids; modified in place
     * @param queryString the line to parse
     * @param queryId number of the line; used to derive the subquery ids of its terms
     * @throws NumberFormatException if the subquery index, or the value of a range term, is not a number
     * @throws NoSuchElementException if the line ends in the middle of a term
     */
    private static void parseRiseQuery(Subqueries subqueries, String queryString, int queryId) {
        // Delimiters are returned as tokens too; this is what makes an empty value detectable below.
        StringTokenizer subQueryTokenizer = new StringTokenizer(queryString, "\t", true);
        while (subQueryTokenizer.hasMoreTokens()) {
            // Switch the delimiter back to tab; the previous term left it set to ";".
            String key = subQueryTokenizer.nextToken("\t");
            subQueryTokenizer.nextToken();  // Consume the tab after the key
            String value = subQueryTokenizer.nextToken();
            if (value.equals("\t")) {
                // The token is the tab that follows the value, i.e. the value itself is empty.
                value = "";
            } else {
                subQueryTokenizer.nextToken();  // Consume the tab after the value
            }
            int subQueryIndex = Integer.parseInt(subQueryTokenizer.nextToken());
            subQueryTokenizer.nextToken();  // Consume the tab after the index
            // The last field of a term runs until the next ";" (or the end of the line).
            boolean isRangeTerm = Boolean.parseBoolean(subQueryTokenizer.nextToken(";"));
            if (subQueryTokenizer.hasMoreTokens()) {
                subQueryTokenizer.nextToken();  // Consume the ";" that ends the term
            }
            // Offsets the index by the line number so that ids from different lines do not collide
            // (presumably the index is always below 64 - verify against the input format).
            int subqueryId = subQueryIndex + 64 * queryId;
            if (isRangeTerm) {
                Set<Feature> rangeFeatures = subqueries.rangesForSubquery.computeIfAbsent(
                        subqueryId, (id) -> new HashSet<>());
                // Range terms carry a numeric value.
                rangeFeatures.add(new Feature(key, Long.parseLong(value)));
            } else {
                Set<Feature> features = subqueries.valuesForSubquery.computeIfAbsent(subqueryId, (id) -> new HashSet<>());
                features.add(new Feature(key, value));
            }
            subqueries.subqueries.add(subqueryId);
        }
    }

    /** The subqueries parsed from the input file, together with their features. */
    private static class Subqueries {
        // Ids of all parsed subqueries, iterated in ascending order.
        public final TreeSet<Integer> subqueries = new TreeSet<>();
        // String-valued (non-range) features per subquery id; has no entry for a subquery without any.
        public final Map<Integer, Set<Feature>> valuesForSubquery = new HashMap<>();
        // Range (long-valued) features per subquery id; has no entry for a subquery without any.
        public final Map<Integer, Set<Feature>> rangesForSubquery = new HashMap<>();
    }

    /** One output query: the features of a batch of subqueries, grouped by subquery bitmap. */
    private static class Query {
        // True when the query is formatted without subquery bitmaps (set for batch factor 0).
        public final boolean isSingleQuery;
        // Maps a subquery bitmap to the string-valued features that have exactly that bitmap.
        public final Map<Long, Set<Feature>> valuesForSubqueries = new TreeMap<>();
        // Maps a subquery bitmap to the range features that have exactly that bitmap.
        public final Map<Long, Set<Feature>> rangesForSubqueries = new TreeMap<>();

        public Query(boolean isSingleQuery) {
            this.isSingleQuery = isSingleQuery;
        }
    }

    /**
     * A query feature: a key paired with either a string value or a long (range) value.
     *
     * A feature created from a string has {@code longValue} 0; a feature created from a long has a
     * null {@code strValue}. Features are ordered by their YQL representation.
     */
    private static class Feature implements Comparable<Feature> {
        public final String key;
        private final String strValue;
        private final long longValue;

        /** Creates a string-valued feature. */
        public Feature(String key, String value) {
            this.key = key;
            this.strValue = value;
            this.longValue = 0;
        }

        /** Creates a long-valued (range) feature. */
        public Feature(String key, long value) {
            this.key = key;
            this.strValue = null;
            this.longValue = value;
        }

        /**
         * Formats the feature as a {@code "key":value} entry of a YQL map literal.
         *
         * Double quotes and backslashes in the key and in a string value are backslash-escaped so
         * that they cannot terminate the surrounding string literal. A long value is written as
         * plain ASCII digits followed by {@code l}, independent of the default locale.
         *
         * @return the YQL map entry for this feature
         */
        public String asYqlString() {
            String quotedKey = "\"" + escapeYqlString(key) + "\"";
            if (strValue != null) {
                return quotedKey + ":\"" + escapeYqlString(strValue) + "\"";
            }
            return quotedKey + ":" + longValue + "l";
        }

        // Backslash-escapes the characters that are special inside a double-quoted YQL string.
        private static String escapeYqlString(String raw) {
            // Backslashes first, so the ones added for the quotes are not escaped again.
            return raw.replace("\\", "\\\\").replace("\"", "\\\"");
        }

        @Override
        public boolean equals(Object o) {
            if (this == o) return true;
            if (!(o instanceof Feature)) return false;

            Feature other = (Feature) o;
            return longValue == other.longValue
                    && key.equals(other.key)
                    && Objects.equals(strValue, other.strValue);
        }

        @Override
        public int hashCode() {
            // Same formula as before, so hash-based iteration order is unaffected.
            int result = key.hashCode();
            result = 31 * result + Objects.hashCode(strValue);
            result = 31 * result + Long.hashCode(longValue);
            return result;
        }

        /** Orders features lexicographically by their YQL representation. */
        @Override
        public int compareTo(Feature o) {
            return asYqlString().compareTo(o.asYqlString());
        }
    }

}