about | summary | refs | log | tree | commit | diff | stats
path: root/linguistics/src/main/java/com/yahoo/language/significance/impl/DefaultSignificanceModel.java
blob: 3244b8373ad08859570ce04e83037067d393d0ef (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
// Copyright Vespa.ai. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.language.significance.impl;

import com.fasterxml.jackson.databind.ObjectMapper;
import com.yahoo.language.significance.DocumentFrequency;
import com.yahoo.language.significance.SignificanceModel;

import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;

/**
 * A {@link SignificanceModel} backed by an in-memory table mapping words to their
 * document frequency, together with a single corpus size that is reported with every lookup.
 *
 * <p>Both values are read once, at construction time, from a {@link DocumentFrequencyFile}.
 *
 * @author MariusArhaug
 */
public class DefaultSignificanceModel implements SignificanceModel {

    /** Frequency reported for a word that has no usable entry in the frequency table. */
    private static final long UNKNOWN_WORD_FREQUENCY = 1L;

    private final long corpusSize;
    private final HashMap<String, Long> frequencies;

    /** Identifier of this model; {@code null} when the model was created from a {@link Path}. */
    private final String id;

    /**
     * Creates a model from an already loaded frequency file.
     *
     * @param file supplies the word frequency table and the document count
     * @param id   the identifier returned by {@link #getId()}
     */
    public DefaultSignificanceModel(DocumentFrequencyFile file, String id) {
        this.frequencies = file.frequencies();
        this.corpusSize = file.documentCount();
        this.id = id;
    }

    /**
     * Creates a model by deserializing a {@link DocumentFrequencyFile} from the given path
     * with a Jackson {@link ObjectMapper}.
     *
     * <p>No identifier is available on this code path, so {@link #getId()} returns {@code null}
     * for models created this way.
     *
     * @param path location of the serialized frequency file
     * @throws RuntimeException wrapping the {@link IOException} if the file cannot be read or parsed
     */
    public DefaultSignificanceModel(Path path) {
        // Assigned explicitly so the field can be final; the value is the same as before (unset).
        this.id = null;
        ObjectMapper objectMapper = new ObjectMapper();
        try {
            var file = objectMapper.readValue(path.toFile(), DocumentFrequencyFile.class);
            this.frequencies = file.frequencies();
            this.corpusSize = file.documentCount();
        } catch (IOException e) {
            throw new RuntimeException("Failed to load model from " + path, e);
        }
    }

    /**
     * Looks up the document frequency of a word.
     *
     * @param word the word to look up
     * @return the stored frequency paired with the corpus size, or a frequency of 1 paired with
     *         the corpus size when the word is absent from the table or mapped to {@code null}
     */
    @Override
    public DocumentFrequency documentFrequency(String word) {
        // Single lookup: a null result covers both "no mapping" and "mapped to null",
        // so a null entry can no longer trigger a NullPointerException on unboxing.
        Long stored = frequencies.get(word);
        long frequency = (stored != null) ? stored : UNKNOWN_WORD_FREQUENCY;
        return new DocumentFrequency(frequency, corpusSize);
    }

    /**
     * Returns the identifier this model was created with.
     *
     * @return the model id, or {@code null} if the model was created from a {@link Path}
     */
    @Override
    public String getId() {
        return id;
    }

}