summaryrefslogtreecommitdiffstats
path: root/container-search/src/main/java/com/yahoo/search/schema/internal/TensorConverter.java
blob: 4485e538e13cd5a9393f699f6f340b3b4d15b659 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.search.schema.internal;

import com.yahoo.language.Language;
import com.yahoo.language.process.Embedder;
import com.yahoo.processing.request.Properties;
import com.yahoo.tensor.Tensor;
import com.yahoo.tensor.TensorType;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A class which knows how to convert an Object value to a tensor of a given type.
 *
 * Values which already are tensors are passed through, strings of the form
 * <code>embed(...)</code> are handed to one of the embedders given at construction,
 * and any other string is handed to {@link Tensor#from(TensorType, String)}.
 *
 * @author bratseth
 */
public class TensorConverter {

    /** Matches <code>embedder-id, "quoted text"</code>: group 1 is the embedder id, group 2 the quoted text. */
    private static final Pattern embedderArgumentAndQuotedTextRegexp = Pattern.compile("^([A-Za-z0-9_@\\-.]+),\\s*([\"'].*[\"'])");

    /** Matches <code>embedder-id, @reference</code>: group 1 is the embedder id, group 2 the reference including '@'. */
    private static final Pattern embedderArgumentAndReferenceRegexp = Pattern.compile("^([A-Za-z0-9_@\\-.]+),\\s*(@.*)");

    /** The available embedders, by embedder id. */
    private final Map<String, Embedder> embedders;

    /**
     * Creates a converter using the given embedders.
     *
     * @param embedders the embedders available to embed(...) expressions, by id. The map is referenced, not copied.
     */
    public TensorConverter(Map<String, Embedder> embedders) {
        this.embedders = embedders;
    }

    /**
     * Converts a value to a tensor which is assignable to the given type.
     *
     * @param type the tensor type required of the result
     * @param key the key used to create the {@link Embedder.Context} passed to the embedder
     * @param value the value to convert: a Tensor, or a String
     * @param language the language set on the embedder context
     * @param contextValues the context passed along when resolving <code>@reference</code> arguments from properties
     * @param properties the properties <code>@reference</code> arguments are resolved from
     * @return the tensor, or null if the value is neither a Tensor nor a String
     * @throws IllegalArgumentException if the resulting tensor is not assignable to the given type,
     *                                  or if an embed(...) expression is malformed or can not be resolved
     */
    public Tensor convertTo(TensorType type, String key, Object value, Language language,
                            Map<String, String> contextValues, Properties properties) {
        var context = new Embedder.Context(key).setLanguage(language);
        Tensor tensor = toTensor(type, value, context, contextValues, properties);
        if (tensor == null) return null;
        if (! tensor.type().isAssignableTo(type))
            throw new IllegalArgumentException("Require a tensor of type " + type);
        return tensor;
    }

    /** Dispatches on the runtime type of the value. Returns null for values that are neither tensors nor strings. */
    private Tensor toTensor(TensorType type, Object value, Embedder.Context context, Map<String, String> contextValues,
                            Properties properties) {
        if (value instanceof Tensor) return (Tensor)value;
        if ( ! (value instanceof String)) return null;

        String string = (String)value;
        if (isEmbed(string)) return embed(string, type, context, contextValues, properties);
        return Tensor.from(type, string);
    }

    /** Returns whether the given string starts with <code>embed(</code>. */
    static boolean isEmbed(String value) {
        return value.startsWith("embed(");
    }

    /**
     * Evaluates an <code>embed(...)</code> expression.
     *
     * The argument is either <code>embedder-id, 'text'</code>, <code>embedder-id, @reference</code>,
     * or - only when exactly one embedder is available - just the text or reference to embed.
     *
     * @throws IllegalArgumentException if the expression does not end by ')', the embedder id is unknown,
     *                                  or no embedder id is given when it is required
     */
    private Tensor embed(String s, TensorType type, Embedder.Context embedderContext, Map<String, String> contextValues,
                         Properties properties) {
        if ( ! s.endsWith(")"))
            throw new IllegalArgumentException("Expected any string enclosed in embed(), but the argument does not end by ')'");
        String argument = s.substring("embed(".length(), s.length() - 1);
        Embedder embedder;
        String embedderId;

        // Check if arguments specifies an embedder with the format embed(embedder, "text to encode")
        // or embed(embedder, @reference)
        Matcher matcher = matchEmbedderIdAndText(argument);
        if (matcher != null) {
            embedderId = matcher.group(1);
            embedder = requireEmbedder(embedderId);
            argument = matcher.group(2);
        } else if (embedders.isEmpty()) {
            throw new IllegalArgumentException("No embedders provided");  // should never happen
        } else if (embedders.size() > 1) {
            String usage = "Usage: embed(embedder-id, 'text'). " + embedderIds(embedders);
            if (! argument.contains("\"") && ! argument.contains("'"))
                throw new IllegalArgumentException("Multiple embedders are provided but the string to embed is not quoted. " + usage);
            else
                throw new IllegalArgumentException("Multiple embedders are provided but no embedder id is given. " + usage);
        } else {
            // Exactly one embedder: It is used implicitly
            var entry = embedders.entrySet().iterator().next();
            embedderId = entry.getKey();
            embedder = entry.getValue();
        }
        return embedder.embed(resolve(argument, contextValues, properties), embedderContext.copy().setEmbedderId(embedderId), type);
    }

    /**
     * Returns a matcher which has matched the entire argument as "embedder-id, text-or-reference",
     * with the embedder id in group 1 and the text or reference in group 2, or null if the argument
     * does not have this form.
     */
    private static Matcher matchEmbedderIdAndText(String argument) {
        Matcher matcher = embedderArgumentAndQuotedTextRegexp.matcher(argument);
        if (matcher.matches()) return matcher;
        matcher = embedderArgumentAndReferenceRegexp.matcher(argument);
        if (matcher.matches()) return matcher;
        return null;
    }

    /**
     * Returns the embedder with the given id.
     *
     * @throws IllegalArgumentException if there is no embedder with this id
     */
    private Embedder requireEmbedder(String embedderId) {
        if ( ! embedders.containsKey(embedderId))
            throw new IllegalArgumentException("Can't find embedder '" + embedderId + "'. " + embedderIds(embedders));
        return embedders.get(embedderId);
    }

    /**
     * Returns the text to embed given an embed argument: The content between the quotes if the argument
     * is enclosed in a pair of single or double quotes, the referenced value if it starts by '@',
     * and the argument itself otherwise.
     */
    private static String resolve(String s, Map<String, String> contextValues, Properties properties) {
        if (isEnclosedBy('\'', s) || isEnclosedBy('"', s))
            return s.substring(1, s.length() - 1);
        if (s.startsWith("@"))
            return resolveReference(s, contextValues, properties);
        return s;
    }

    /**
     * Returns whether the string both starts and ends by the given quote character.
     * At least two characters are required: A lone quote character is both the first and last
     * character of the string, but is not a quoted string, and stripping "both" quotes from it would fail.
     */
    private static boolean isEnclosedBy(char quote, String s) {
        return s.length() >= 2 && s.charAt(0) == quote && s.charAt(s.length() - 1) == quote;
    }

    /**
     * Looks up the value of a <code>@key</code> reference in the given properties.
     *
     * @param s the reference, including the leading '@'
     * @return the string form of the referenced value
     * @throws IllegalArgumentException if the properties return null for the referenced key
     */
    private static String resolveReference(String s, Map<String, String> contextValues, Properties properties) {
        String referenceKey = s.substring(1);
        Object referencedValue = properties.get(referenceKey, contextValues);
        if (referencedValue == null)
            throw new IllegalArgumentException("Could not resolve query parameter reference '" + referenceKey +
                                               "' used in an embed() argument");
        return referencedValue.toString();
    }

    /** Returns a sentence listing the ids of the given embedders, quoted and sorted, for use in error messages. */
    private static String embedderIds(Map<String, Embedder> embedders) {
        List<String> quotedIds = new ArrayList<>(embedders.size());
        for (String id : embedders.keySet())
            quotedIds.add("'" + id + "'");
        quotedIds.sort(null);
        return "Available embedder ids are " + String.join(", ", quotedIds) + ".";
    }

}