aboutsummaryrefslogtreecommitdiffstats
path: root/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
blob: 7ed6f44261098a3054adc3ea40bb7d8f754bb379 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.significance.impl;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.yahoo.language.significance.DocumentFrequency;
import com.yahoo.language.significance.SignificanceModel;

import java.nio.file.Path;
import java.util.HashMap;

/**
 * A {@link SignificanceModel} whose data is read from a JSON model file at construction time.
 *
 * <p>The file is deserialized into a {@link SignificanceModelFile}; of its properties only
 * {@code corpus-size} and {@code frequencies} are retained by this class.</p>
 *
 * @author MariusArhaug
 */
public class DefaultSignificanceModel implements SignificanceModel {

    /** Mapper reused for every model load instead of building a new one per instance. */
    private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();

    private final long corpusSize;
    private final HashMap<String, Long> frequencies;
    private final Path path;

    /**
     * Java representation of the JSON model file.
     * Properties in the file that are not declared here are ignored during deserialization.
     */
    @JsonIgnoreProperties(ignoreUnknown = true)
    public static class SignificanceModelFile {
        private final String version;
        private final String id;
        private final String description;
        private final long corpusSize;
        private final String language;

        private final long wordCount;
        private final HashMap<String, Long> frequencies;

        /**
         * Creates a model file representation; used by Jackson when reading the JSON file.
         *
         * @param version     value of the {@code version} property
         * @param id          value of the {@code id} property
         * @param description value of the {@code description} property
         * @param corpusSize  value of the {@code corpus-size} property
         * @param language    value of the {@code language} property
         * @param wordCount   value of the {@code word-count} property
         * @param frequencies value of the {@code frequencies} property, a map from word to a count
         */
        @JsonCreator
        public SignificanceModelFile(
                @JsonProperty("version") String version,
                @JsonProperty("id") String id,
                @JsonProperty("description") String description,
                @JsonProperty("corpus-size") long corpusSize,
                @JsonProperty("language") String language,
                @JsonProperty("word-count") long wordCount,
                @JsonProperty("frequencies") HashMap<String, Long> frequencies) {
            this.version = version;
            this.id = id;
            this.description = description;
            this.corpusSize = corpusSize;
            this.language = language;
            this.wordCount = wordCount;
            this.frequencies = frequencies;
        }

        /** Returns the {@code version} property. */
        @JsonProperty("version")
        public String version() { return version; }

        /** Returns the {@code id} property. */
        @JsonProperty("id")
        public String id() { return id; }

        /** Returns the {@code description} property. */
        @JsonProperty("description")
        public String description() { return description; }

        /** Returns the {@code corpus-size} property. */
        @JsonProperty("corpus-size")
        public long corpusSize() { return corpusSize; }

        /** Returns the {@code language} property. */
        @JsonProperty("language")
        public String language() { return language; }

        /** Returns the {@code frequencies} property exactly as it was passed to the constructor. */
        @JsonProperty("frequencies")
        public HashMap<String, Long> frequencies() { return frequencies; }

        /** Returns the {@code word-count} property. */
        @JsonProperty("word-count")
        public long wordCount() { return wordCount; }

    }

    /**
     * Loads a significance model from the given JSON file.
     *
     * <p>A file without a {@code frequencies} property yields a model with no known words,
     * so every lookup returns the fallback frequency.</p>
     *
     * @param path location of the JSON model file
     * @throws RuntimeException if the file cannot be read or deserialized; the underlying
     *                          exception is attached as the cause
     */
    public DefaultSignificanceModel(Path path) {
        this.path = path;

        SignificanceModelFile model = load(path);
        this.corpusSize = model.corpusSize();

        // Guard against a missing "frequencies" property so lookups never hit a null map.
        HashMap<String, Long> loaded = model.frequencies();
        this.frequencies = (loaded != null) ? loaded : new HashMap<>();
    }

    /** Reads and deserializes the model file, wrapping any failure in a RuntimeException. */
    private static SignificanceModelFile load(Path path) {
        try {
            SignificanceModelFile model = OBJECT_MAPPER.readValue(path.toFile(), SignificanceModelFile.class);
            if (model == null) {
                // Seen when the document is the JSON literal null; fail with a descriptive cause.
                throw new IllegalArgumentException("Model file does not contain a JSON object");
            }
            return model;
        } catch (Exception e) {
            throw new RuntimeException("Failed to load model from " + path, e);
        }
    }

    /**
     * Returns the document frequency for a word, paired with the corpus size of this model.
     *
     * @param word the word to look up
     * @return the frequency stored for {@code word}, or a frequency of 1 when the word has
     *         no stored value
     */
    @Override
    public DocumentFrequency documentFrequency(String word) {
        // Single lookup; a key explicitly mapped to null is treated the same as an absent key.
        Long frequency = frequencies.get(word);
        if (frequency == null) {
            return new DocumentFrequency(1, corpusSize);
        }
        return new DocumentFrequency(frequency, corpusSize);
    }
}