// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.fsa.conceptnet; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.nio.ByteBuffer; import java.nio.CharBuffer; import java.nio.ByteOrder; import java.nio.MappedByteBuffer; import java.nio.channels.FileChannel; import java.nio.channels.FileChannel.MapMode; import java.nio.charset.Charset; import com.yahoo.fsa.FSA; /** * Class for accessing the concept network automata. * * @author Peter Boros **/ public class ConceptNet { private FSA _fsa; private boolean _ok = false; private MappedByteBuffer _header; private MappedByteBuffer _index; private MappedByteBuffer _info; private MappedByteBuffer _catindex; private MappedByteBuffer _strings; private Charset _charset; public ConceptNet(String domain){ init(domain, "utf-8"); } public ConceptNet(String domain, String charsetname){ init(domain, charsetname); } public boolean isOk(){ return _ok; } private void init(String domain, String charsetname){ _charset = Charset.forName(charsetname); _fsa = new FSA(domain + ".fsa",charsetname); if(!_fsa.isOk()){ return; } FileInputStream file; try { file = new FileInputStream(domain + ".dat"); } catch (FileNotFoundException e) { System.out.print("ConceptNet data file " + domain + ".dat" + " not found.\n"); return; } try { _header = file.getChannel().map(MapMode.READ_ONLY,0,256); _header.order(ByteOrder.LITTLE_ENDIAN); if(h_magic()!=238579428){ System.out.print("ConceptNet bad magic " + h_magic() +"\n"); return; } _index = file.getChannel().map(MapMode.READ_ONLY, 256, 8*4*h_index_size()); _index.order(ByteOrder.LITTLE_ENDIAN); _info = file.getChannel().map(MapMode.READ_ONLY, 256+8*4*h_index_size(), 4*h_info_size()); _info.order(ByteOrder.LITTLE_ENDIAN); _catindex = file.getChannel().map(MapMode.READ_ONLY, 256+8*4*h_index_size()+4*h_info_size(), 4*h_catindex_size()); _catindex.order(ByteOrder.LITTLE_ENDIAN); _strings = file.getChannel().map(MapMode.READ_ONLY, 256+8*4*h_index_size()+4*h_info_size()+4*h_catindex_size(), h_strings_size()); _strings.order(ByteOrder.LITTLE_ENDIAN); _ok=true; } catch (IOException e) { System.out.print("ConceptNet IO exception.\n"); return; } } private int h_magic(){ return _header.getInt(0); } private int h_version(){ return _header.getInt(4); } private int h_checksum(){ return _header.getInt(8); } private int h_index_size(){ return _header.getInt(12); } private int h_info_size(){ return _header.getInt(16); } private int h_catindex_size(){ return _header.getInt(20); } private int h_strings_size(){ return _header.getInt(24); } private int h_max_freq(){ return _header.getInt(28); } private int h_max_cfreq(){ return _header.getInt(32); } private int h_max_qfreq(){ return _header.getInt(36); } private int h_max_sfreq(){ return _header.getInt(40); } private int h_max_efreq(){ return _header.getInt(44); } private int h_max_afreq(){ return _header.getInt(48); } private ByteBuffer encode(CharBuffer chrbuf){ return _charset.encode(chrbuf); } private String decode(ByteBuffer buf){ return _charset.decode(buf).toString(); } public int lookup(String unit) { FSA.State state = _fsa.getState(); // state.start(); // getState does this for us state.delta(unit); if(state.isFinal()){ return state.hash(); } return -1; } public String lookup(int idx) { if(!_ok || idx<0 || idx>=h_index_size()){ return null; } int termoffset = _index.getInt(4*8*idx); return getString(termoffset); } private String getString(int stringOffset){ if(_ok){ int length = 0; _strings.position(stringOffset); while(_strings.get()!=0){ length++; } ByteBuffer meta = ByteBuffer.allocate(length); _strings.position(stringOffset); _strings.get(meta.array(),0,length); return decode(meta); } return null; } public int frq(int idx) { if(!_ok || idx<0 || idx>=h_index_size()){ return -1; } return _index.getInt(4*8*idx+4); } public int cFrq(int idx) { if(!_ok || idx<0 || idx>=h_index_size()){ return -1; } return _index.getInt(4*8*idx+8); } public int qFrq(int idx) { if(!_ok || idx<0 || idx>=h_index_size()){ return -1; } return _index.getInt(4*8*idx+12); } public int sFrq(int idx) { if(!_ok || idx<0 || idx>=h_index_size()){ return -1; } return _index.getInt(4*8*idx+16); } public double score(int idx) { if(!_ok || idx<0 || idx>=h_index_size()){ return -1.0; } return 100.0*cFrq(idx)/qFrq(idx); } public double strength(int idx) { if(!_ok || idx<0 || idx>=h_index_size()){ return -1.0; } return 100.0*qFrq(idx)/sFrq(idx); } public int numExt(int idx) { if(idx<0 || idx>=h_index_size()){ return -1; } int offset = _index.getInt(4*8*idx+20); if(offset==0){ return 0; } return _info.getInt(4*offset); } public int ext(int idx, int i) { if(idx<0 || idx>=h_index_size()){ return -1; } int offset = _index.getInt(4*8*idx+20); if(offset==0){ return -1; } if(i>=_info.getInt(4*offset)){ return -1; } return _info.getInt(4*offset+4+8*i); } public int extFrq(int idx, int i) { if(idx<0 || idx>=h_index_size()){ return -1; } int offset = _index.getInt(4*8*idx+20); if(offset==0){ return -1; } if(i>=_info.getInt(4*offset)){ return -1; } return _info.getInt(4*offset+8+8*i); } public int numAssoc(int idx) { if(idx<0 || idx>=h_index_size()){ return -1; } int offset = _index.getInt(4*8*idx+24); if(offset==0){ return 0; } return _info.getInt(4*offset); } public int assoc(int idx, int i) { if(idx<0 || idx>=h_index_size()){ return -1; } int offset = _index.getInt(4*8*idx+24); if(offset==0){ return -1; } if(i>=_info.getInt(4*offset)){ return -1; } return _info.getInt(4*offset+4+8*i); } public int assocFrq(int idx, int i) { if(idx<0 || idx>=h_index_size()){ return -1; } int offset = _index.getInt(4*8*idx+24); if(offset==0){ return -1; } if(i>=_info.getInt(4*offset)){ return -1; } return _info.getInt(4*offset+8+8*i); } public int numCat(int idx) { if(idx<0 || idx>=h_index_size()){ return -1; } int offset = _index.getInt(4*8*idx+28); if(offset==0){ return 0; } return _info.getInt(4*offset); } public int cat(int idx, int i) { if(idx<0 || idx>=h_index_size()){ return -1; } int offset = _index.getInt(4*8*idx+28); if(offset==0){ return -1; } if(i>=_info.getInt(4*offset)){ return -1; } return _info.getInt(4*offset+4+8*i); } public String catName(int catidx) { if(!_ok || catidx<0 || catidx>=h_catindex_size()){ return null; } int catoffset = _catindex.getInt(4*catidx); return getString(catoffset); } //// test //// public static void main(String[] args) { String domain = "/home/gv/fsa/automata/us_main_20041002_20041008"; ConceptNet cn = new ConceptNet(domain); System.out.println("Loading ConceptNet domain "+domain+": "+cn.isOk()); int idx = cn.lookup("new york"); System.out.println(" lookup(\"new york\") -> "+idx); System.out.println(" lookup("+idx+") -> "+cn.lookup(idx)+"("+cn.score(idx)+","+cn.strength(idx)+")"); System.out.println(" extensions("+cn.numExt(idx)+"):"); for(int i=0;i<5 && i