diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /fsa/src/main/java |
Publish
Diffstat (limited to 'fsa/src/main/java')
9 files changed, 1981 insertions, 0 deletions
diff --git a/fsa/src/main/java/com/yahoo/fsa/FSA.java b/fsa/src/main/java/com/yahoo/fsa/FSA.java new file mode 100644 index 00000000000..6e352f3ddca --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/FSA.java @@ -0,0 +1,636 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.URL; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.CharBuffer; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel.MapMode; +import java.nio.charset.Charset; +import java.util.NoSuchElementException; + + +/** + * Finite-State Automaton. + * + * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a> + */ +public class FSA { + + /** + * Thread local state object used to traverse a Finite-State Automaton. + */ + public static class State { + + FSA fsa; + int state = 0; + int hash = 0; + + private State(FSA fsa) { + this.fsa = fsa; + start(); + } + + public void start(){ + state = fsa.start(); + hash = 0; + } + + public void delta(byte symbol) { + hash += fsa.hashDelta(state,symbol); + state = fsa.delta(state,symbol); + } + + /** Returns whether the given symbol would take us to a valid state, without changing the state */ + public boolean peekDelta(byte symbol) { + return fsa.delta(state,symbol)!=0; + } + + public boolean tryDelta(byte symbol) { + int lastHash=hash; + int lastState=state; + delta(symbol); + if (isValid()) return true; + + hash=lastHash; + state=lastState; + return false; + } + + public void delta(char chr){ + CharBuffer chrbuf = CharBuffer.allocate(1); + chrbuf.put(0,chr); + ByteBuffer buf = fsa.encode(chrbuf); + while(state >0 && buf.position()<buf.limit()){ + delta(buf.get()); + } + } + + /** Jumps ahead by string */ + public void delta(String string){ + ByteBuffer buf = fsa.encode(string); + while(state >0 && buf.position()<buf.limit()){ + delta(buf.get()); + } + } + + /** + * Jumps ahead by string if that puts us into a valid state, does nothing otherwise + * + * @return whether we jumped to a valid state (true) or di nothing (false) + */ + public boolean tryDelta(String string){ + int lastHash=hash; + int lastState=state; + delta(string); + if (isValid()) return true; + + hash=lastHash; + state=lastState; + return false; + } + + /** Jumps ahead by a word - if this is not the first word, it must be preceeded by space. */ + public void deltaWord(String string){ + if (state!=fsa.start()) { + delta((byte)' '); + } + delta(string); + } + + /** + * Tries to jump ahead by one word. If the given string is not the next complete valid word, nothing is done. + */ + public boolean tryDeltaWord(String string){ + int lastHash=hash; + int lastState=state; + tryDelta((byte)' '); + delta(string); + if (isValid() && peekDelta((byte)' ')) return true; + if (isFinal()) return true; + + hash=lastHash; + state=lastState; + return false; + } + + public boolean isFinal(){ + return fsa.isFinal(state); + } + + public boolean isStartState() { + return fsa.start() == state; + } + + public boolean isValid(){ + return state !=0; + } + + public ByteBuffer data(){ + return fsa.data(state); + } + + public String dataString(){ + return fsa.dataString(state); + } + + public int hash(){ + return hash; + } + + public ByteBuffer lookup(String str){ + start(); + delta(str); + return fsa.data(state); + } + + public boolean hasPerfectHash(){ + return fsa.hasPerfectHash(); + } + + } + + /** + * Class used to iterate over all accepted strings in the fsa. + */ + public static class Iterator implements java.util.Iterator<Iterator.Item> { + /** + * Internally, this class stores the state information for the iterator. + * Externally, it is used for accessing the data associated with the iterator position. + */ + public static class Item { + private FSA fsa; + private java.util.Stack<Byte> string; + private int symbol; + private int state; + private java.util.Stack<Integer> stack; + + /** + * Constructor + * @param fsa the FSA object the iterator is associated with. + * @param state the state used as start state. + */ + public Item(FSA fsa, int state) { + this.fsa = fsa; + this.string = new java.util.Stack(); + this.symbol = 0; + this.state = state; + this.stack = new java.util.Stack(); + } + + /** + * Copy constructor. (Does not copy the state stack) + */ + public Item(Item item) { + this.fsa = item.fsa; + this.string = new java.util.Stack(); + for (java.util.Iterator<Byte> itr = item.string.iterator(); itr.hasNext(); ) { + byte b = itr.next(); + this.string.push(b); + } + this.symbol = item.symbol; + this.state = item.state; + // no need to fill the stack as this constructor is used by Iterator::next() + this.stack = null; + } + + public String getString() { + ByteBuffer buffer = ByteBuffer.allocate(string.size()); + for (java.util.Iterator<Byte> itr = string.iterator(); itr.hasNext(); ) { + byte b = itr.next(); + buffer.put(b); + } + buffer.flip(); + return fsa.decode(buffer); + } + + public ByteBuffer getData() { + return fsa.data(state); + } + + public String getDataString() { + return fsa.dataString(state); + } + + public String toString() { + return "string: " + string + "(" + getString() + "), symbol: " + symbol + ", state: " + state; + } + } + + private Item item; + boolean useInitState = false; + + /** + * Constructor. + * @param state the state to create the iterator from. + */ + public Iterator(State state) { + item = new Item(state.fsa, state.state); + if (state.isFinal()) { + useInitState = true; + } else { + findNext(); + } + } + + private void findNext() { + int nextState; + int depth; + + if (item.symbol == 256 || item.fsa == null) { + throw new NoSuchElementException(); + } + + // flip the flag now that the first state has been returned + if (useInitState) { + useInitState = false; + } + + // try to find the next final state + for(;;) { + item.symbol++; + if (item.symbol < 256) { + byte symbol = (byte)item.symbol; + nextState = item.fsa.delta(item.state, (byte)item.symbol); + if (nextState != 0) { + item.string.push((byte)item.symbol); + item.stack.push(item.state); + item.state = nextState; + item.symbol = 0; + if (item.fsa.isFinal(nextState)) { + break; + } + } + } else { // backtrack + if ((depth = item.string.size()) > 0) { + byte b = item.string.pop(); // remove the last byte + item.symbol = b < 0 ? b + 256 : b; + item.state = item.stack.pop(); + } else { + item.state = 0; + break; + } + } + } + } + + public boolean hasNext() { + return item.state != 0 || useInitState; + } + + public Item next() { + Item retval = new Item(item); + findNext(); + return retval; + } + + public void remove() { + throw new UnsupportedOperationException(); + } + } + + public State getState(){ + return new State(this); + } + + /** + * Returns a new iterator to the start state. + */ + public Iterator iterator() { + return new Iterator(getState()); + } + + /** + * Returns a new iterator to the given state. + * @param state the state to create the iterator from. + */ + public Iterator iterator(State state) { + return new Iterator(state); + } + + private boolean _ok = false; + private MappedByteBuffer _header; + private MappedByteBuffer _symbol_tab; + private MappedByteBuffer _state_tab; + private MappedByteBuffer _data; + private MappedByteBuffer _phash; + private Charset _charset; + + /** + * Loads an FSA from a resource file name, which is resolved from the class path of the + * class loader of the given class. + * <p> + * This is useful for loading fsa's deployed within OSGi bundles. + * + * @param resourceFileName the name of the file, relative to any element on the classpath. + * For example, if the classpath contains resources/ and the file is resources/myfsa.fsa + * this argument should be myfsa.fsa + * @param loadingClass a class which provides the class loader to use for loading. Any class which is loaded + * from the same class path as the resource will do (e.g with OSGi - any class in the same bundle jar) + * @return the loaded FSA + * @throws RuntimeException if the class could not be loaded + */ + public static FSA loadFromResource(String resourceFileName,Class loadingClass) { + URL fsaUrl=loadingClass.getResource(resourceFileName); + if ( ! "file".equals(fsaUrl.getProtocol())) + throw new RuntimeException("Could not open non-file url '" + fsaUrl + "' as a file input stream: " + + "The classloader of " + loadingClass + "' does not return file urls"); + return new FSA(fsaUrl.getFile()); + } + + /** + * Loads an FSA from a file using utf-8 encoding + * + * @throws IllegalArgumentException if the file is not found + */ + public FSA(String filename) { + init(filename,"utf-8"); + } + + /** + * Loads an FSA from a file using the specified character encoding. + * + * @throws IllegalArgumentException if the file is not found + */ + public FSA(String filename, String charsetname) { + init(filename,charsetname); + } + + /** Loads an FSA from a file input stream using utf-8 encoding */ + public FSA(FileInputStream filename) { + init(filename,"utf-8"); + } + + /** Loads an FSA from a file input stream using the specified character encoding */ + public FSA(FileInputStream filename, String charsetname) { + init(filename,charsetname); + } + + private void init(String filename, String charsetname){ + try { + init(new FileInputStream(filename),charsetname); + } + catch (FileNotFoundException e) { + throw new IllegalArgumentException("Could not find FSA file '" + filename + "'",e); + } + catch (IOException e) { + throw new IllegalArgumentException("Could not read FSA file '" + filename + "'",e); + } + } + + private void init(FileInputStream file, String charsetname) { + try { + _charset = Charset.forName(charsetname); + + _header = file.getChannel().map(MapMode.READ_ONLY,0,256); + _header.order(ByteOrder.LITTLE_ENDIAN); + if (h_magic()!=2038637673) { + throw new IOException("Stream does not contain an FSA: Wrong file magic number " + h_magic()); + } + _symbol_tab = file.getChannel().map(MapMode.READ_ONLY, + 256,h_size()); + _symbol_tab.order(ByteOrder.LITTLE_ENDIAN); + _state_tab = file.getChannel().map(MapMode.READ_ONLY, + 256+h_size(),4*h_size()); + _state_tab.order(ByteOrder.LITTLE_ENDIAN); + _data = file.getChannel().map(MapMode.READ_ONLY, + 256+5*h_size(),h_data_size()); + _data.order(ByteOrder.LITTLE_ENDIAN); + if(h_has_phash()>0){ + _phash = file.getChannel().map(MapMode.READ_ONLY, + 256+5*h_size()+h_data_size(), + 4*h_size()); + _phash.order(ByteOrder.LITTLE_ENDIAN); + } + _ok=true; + } + catch (IOException e) { + throw new RuntimeException("IO error while reading FSA file",e); + } + } + + private int h_magic(){ + return _header.getInt(0); + } + private int h_version(){ + return _header.getInt(4); + } + private int h_checksum(){ + return _header.getInt(8); + } + private int h_size(){ + return _header.getInt(12); + } + private int h_start(){ + return _header.getInt(16); + } + private int h_data_size(){ + return _header.getInt(20); + } + private int h_data_type(){ + return _header.getInt(24); + } + private int h_fixed_data_size(){ + return _header.getInt(28); + } + private int h_has_phash(){ + return _header.getInt(32); + } + private int h_serial(){ + return _header.getInt(36); + } + private int getSymbol(int index){ + int symbol = _symbol_tab.get(index); + if(symbol<0){ + symbol += 256; + } + return symbol; + } + + private ByteBuffer encode(String str){ + return _charset.encode(str); + } + + private ByteBuffer encode(CharBuffer chrbuf){ + return _charset.encode(chrbuf); + } + + private String decode(ByteBuffer buf){ + return _charset.decode(buf).toString(); + } + + public boolean isOk(){ + return _ok; + } + + public boolean hasPerfectHash(){ + return _ok && h_has_phash()==1; + } + + public int version(){ + if(_ok){ + return h_version(); + } + return 0; + } + + public int serial(){ + if(_ok){ + return h_serial(); + } + return 0; + } + + protected int start(){ + if(_ok){ + return h_start(); + } + + return 0; + } + + protected int delta(int state, byte symbol){ + int s=symbol; + if(s<0){ + s+=256; + } + if(_ok && s>0 && s<255){ + if(getSymbol(state+s)==s){ + return _state_tab.getInt(4*(state+s)); + } + } + return 0; + } + + protected int hashDelta(int state, byte symbol){ + int s=symbol; + if(s<0){ + s+=256; + } + if(_ok && h_has_phash()==1 && s>0 && s<255){ + if(getSymbol(state+s)==s){ + return _phash.getInt(4*(state+s)); + } + } + return 0; + } + + protected boolean isFinal(int state){ + if(_ok){ + if(getSymbol(state+255)==255){ + return true; + } + } + return false; + } + + /** + * Retrieves data for the given state using the underlying fsa data buffer. + * @param state The fsa state to retrieve data from. + * @return A new buffer containing the data for the given state. + **/ + protected ByteBuffer data(int state) { + if(_ok && isFinal(state)){ + int offset = _state_tab.getInt(4*(state+255)); + int length; + if(h_data_type()==1){ + length = h_fixed_data_size(); + } + else{ + length = _data.getInt(offset); + offset += 4; + } + ByteBuffer meta = ByteBuffer.allocate(length); + meta.order(ByteOrder.LITTLE_ENDIAN); + byte[] dst = meta.array(); + for (int i = 0; i < length; ++i) { + dst[i] = _data.get(i + offset); + } + return meta; + } + return null; + } + + /** + * Retrieves data for the given state using the underlying fsa data buffer. + * @param state The fsa state to retrieve data from. + * @return A string representation of the data for the given state. + **/ + protected String dataString(int state) { + ByteBuffer meta = data(state); + if(meta!=null){ + // Remove trailing '\0' if it exists. This is usually the + // case for automata built with text format (makefsa -t) + String data = decode(meta); + if (data.endsWith("\0")) { + data = data.substring(0, data.length()-1); + } + return data; + } + return null; + } + + /** + * Convenience method that returns the metadata string in the fsa + * for the input lookup String, or null if the input string does + * not exist in the fsa. + * @param str The string to look up. + * @return Metadata string from the fsa. */ + public String lookup(String str){ + State s = getState(); + s.lookup(str); + return s.dataString(); + } + + + //// test //// + public static void main(String[] args) { + String test = "sour cherry"; + if (args.length >= 1) { + test = args[0]; + } + + String fsafile = "/home/gv/fsa/test/__testfsa__.__fsa__"; + //String fsafile = "/home/p13n/prelude/automata/query2dmozsegments.fsa"; + + FSA fsa = new FSA(fsafile); + + System.out.println("Loading FSA file "+fsafile+": "+fsa.isOk()); + System.out.println(" version: " + fsa.version()/1000000 + "." + + (fsa.version()/1000) % 1000 + "." + + fsa.version() % 1000); + System.out.println(" serial: " + fsa.serial()); + System.out.println(" phash: " + fsa.hasPerfectHash()); + + FSA.State s = fsa.getState(); + + s.start(); + for (int i=0; i < test.length(); i++) { + s.delta(test.charAt(i)); + } + System.out.println("\ndelta() char test " + test + ": " + + s.isFinal() + ", info: " + s.dataString() + + ", hash value: " + s.hash()); + + s.start(); + s.delta(test); + System.out.println("\ndelta() test " + test + ": " + + s.isFinal() + ", info: " + s.dataString() + + ", hash value: " + s.hash()); + + s.lookup(test); + String data = s.dataString(); + System.out.println("\nlookup() test \"" + test + "\": " + + (s.lookup(test) != null) + + ", info: " + data + ", hash value: " + s.hash()); + + String data2 = fsa.lookup(test); + System.out.println("\nFSA.lookup() test \"" + test + "\": " + data2); + } +} + + diff --git a/fsa/src/main/java/com/yahoo/fsa/MetaData.java b/fsa/src/main/java/com/yahoo/fsa/MetaData.java new file mode 100644 index 00000000000..fde868464c8 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/MetaData.java @@ -0,0 +1,217 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.ByteOrder; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.charset.Charset; + +import com.yahoo.fsa.FSA; + + +/** + * Class for accessing meta-data (dat-files) used by FSA applications. + * + * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a> + **/ +public class MetaData { + + private boolean _ok = false; + private MappedByteBuffer _header; + private MappedByteBuffer _data; + private Charset _charset; + + + public MetaData(String filename){ + init(filename, "utf-8"); + } + + public MetaData(String filename, String charsetname){ + init(filename, charsetname); + } + + public boolean isOk(){ + return _ok; + } + + private void init(String filename, String charsetname){ + + _charset = Charset.forName(charsetname); + + FileInputStream file; + try { + file = new FileInputStream(filename); + } + catch (FileNotFoundException e) { + System.out.print("MetaData file " + filename + " not found.\n"); + return; + } + + try { + _header = file.getChannel().map(MapMode.READ_ONLY,0,256); + _header.order(ByteOrder.LITTLE_ENDIAN); + if(h_magic()!=-2025936501){ + System.out.print("MetaData bad magic " + h_magic() +"\n"); + return; + } + _data = file.getChannel().map(MapMode.READ_ONLY, + 256, + h_size()); + _data.order(ByteOrder.LITTLE_ENDIAN); + _ok=true; + } + catch (IOException e) { + System.out.print("MetaData IO exception.\n"); + return; + } + } + + private int h_magic(){ + return _header.getInt(0); + } + private int h_version(){ + return _header.getInt(4); + } + private int h_checksum(){ + return _header.getInt(8); + } + private int h_size(){ + return _header.getInt(12); + } + private int h_reserved(int i){ + if(i<0||i>9){ + return 0; + } + return _header.getInt(16+4*i); + } + private int h_user(int i){ + if(i<0||i>49){ + return 0; + } + return _header.getInt(56+4*i); + } + + + private ByteBuffer encode(CharBuffer chrbuf){ + return _charset.encode(chrbuf); + } + + private String decode(ByteBuffer buf){ + return _charset.decode(buf).toString(); + } + + + public int user(int i){ + if(!_ok){ + return 0; + } + return h_user(i); + } + + public int getIntEntry(int idx) + { + if(_ok){ + return _data.getInt(idx*4); + } + else + return 0; + } + + public ByteBuffer getDirectRecordEntry(int idx, int size) + { + if(_ok){ + ByteBuffer meta = ByteBuffer.allocate(size); + meta.order(ByteOrder.LITTLE_ENDIAN); + _data.position(idx*size); + _data.get(meta.array(),0,size); + return meta; + } + else + return null; + } + + public ByteBuffer getIndirectRecordEntry(int idx, int size) + { + if(_ok){ + int offset = _data.getInt(idx*4); + ByteBuffer meta = ByteBuffer.allocate(size); + meta.order(ByteOrder.LITTLE_ENDIAN); + _data.position(offset); + _data.get(meta.array(),0,size); + return meta; + } + else + return null; + } + + public ByteBuffer getIndirectRecordEntry(int idx) + { + if(_ok){ + int offset = _data.getInt(idx*4); + int size = _data.getInt(offset); + ByteBuffer meta = ByteBuffer.allocate(size); + meta.order(ByteOrder.LITTLE_ENDIAN); + _data.position(offset+4); + _data.get(meta.array(),0,size); + return meta; + } + else + return null; + } + + public String getStringEntry(int stringOffset){ + if(_ok){ + int length = 0; + _data.position(stringOffset); + while(_data.get()!=0){ + length++; + } + ByteBuffer meta = ByteBuffer.allocate(length); + meta.order(ByteOrder.LITTLE_ENDIAN); + _data.position(stringOffset); + _data.get(meta.array(),0,length); + return decode(meta); + } + return null; + } + + public String[] getStringArrayEntry(int stringOffset, int numStrings){ + if(_ok && numStrings>0){ + String[] stringArray = new String[numStrings]; + int pos=stringOffset; + for(int i=0;i<numStrings;i++){ + int length = 0; + _data.position(pos); + while(_data.get()!=0){ + length++; + } + ByteBuffer meta = ByteBuffer.allocate(length); + meta.order(ByteOrder.LITTLE_ENDIAN); + _data.position(pos); + _data.get(meta.array(),0,length); + stringArray[i] = decode(meta); + pos += length+1; + } + return stringArray; + } + return null; + } + + //// test //// + public static void main(String[] args) { + String file = "dmozPred_2.dat"; + + MetaData metaData = new MetaData(file); + + System.out.println("Loading MetaData "+file+": "+metaData.isOk()); + } + + + +} diff --git a/fsa/src/main/java/com/yahoo/fsa/conceptnet/ConceptNet.java b/fsa/src/main/java/com/yahoo/fsa/conceptnet/ConceptNet.java new file mode 100644 index 00000000000..13cb93073d2 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/conceptnet/ConceptNet.java @@ -0,0 +1,384 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa.conceptnet; + +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.ByteOrder; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.charset.Charset; + +import com.yahoo.fsa.FSA; + + +/** + * Class for accessing the concept network automata. + * + * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a> + **/ +public class ConceptNet { + + private FSA _fsa; + private boolean _ok = false; + private MappedByteBuffer _header; + private MappedByteBuffer _index; + private MappedByteBuffer _info; + private MappedByteBuffer _catindex; + private MappedByteBuffer _strings; + private Charset _charset; + + + public ConceptNet(String domain){ + init(domain, "utf-8"); + } + + public ConceptNet(String domain, String charsetname){ + init(domain, charsetname); + } + + public boolean isOk(){ + return _ok; + } + + private void init(String domain, String charsetname){ + + _charset = Charset.forName(charsetname); + + _fsa = new FSA(domain + ".fsa",charsetname); + + if(!_fsa.isOk()){ + return; + } + + FileInputStream file; + try { + file = new FileInputStream(domain + ".dat"); + } + catch (FileNotFoundException e) { + System.out.print("ConceptNet data file " + domain + ".dat" + " not found.\n"); + return; + } + + try { + _header = file.getChannel().map(MapMode.READ_ONLY,0,256); + _header.order(ByteOrder.LITTLE_ENDIAN); + if(h_magic()!=238579428){ + System.out.print("ConceptNet bad magic " + h_magic() +"\n"); + return; + } + _index = file.getChannel().map(MapMode.READ_ONLY, + 256, + 8*4*h_index_size()); + _index.order(ByteOrder.LITTLE_ENDIAN); + _info = file.getChannel().map(MapMode.READ_ONLY, + 256+8*4*h_index_size(), + 4*h_info_size()); + _info.order(ByteOrder.LITTLE_ENDIAN); + _catindex = file.getChannel().map(MapMode.READ_ONLY, + 256+8*4*h_index_size()+4*h_info_size(), + 4*h_catindex_size()); + _catindex.order(ByteOrder.LITTLE_ENDIAN); + _strings = file.getChannel().map(MapMode.READ_ONLY, + 256+8*4*h_index_size()+4*h_info_size()+4*h_catindex_size(), + h_strings_size()); + _strings.order(ByteOrder.LITTLE_ENDIAN); + _ok=true; + } + catch (IOException e) { + System.out.print("ConceptNet IO exception.\n"); + return; + } + } + + private int h_magic(){ + return _header.getInt(0); + } + private int h_version(){ + return _header.getInt(4); + } + private int h_checksum(){ + return _header.getInt(8); + } + private int h_index_size(){ + return _header.getInt(12); + } + private int h_info_size(){ + return _header.getInt(16); + } + private int h_catindex_size(){ + return _header.getInt(20); + } + private int h_strings_size(){ + return _header.getInt(24); + } + private int h_max_freq(){ + return _header.getInt(28); + } + private int h_max_cfreq(){ + return _header.getInt(32); + } + private int h_max_qfreq(){ + return _header.getInt(36); + } + private int h_max_sfreq(){ + return _header.getInt(40); + } + private int h_max_efreq(){ + return _header.getInt(44); + } + private int h_max_afreq(){ + return _header.getInt(48); + } + + + private ByteBuffer encode(CharBuffer chrbuf){ + return _charset.encode(chrbuf); + } + + private String decode(ByteBuffer buf){ + return _charset.decode(buf).toString(); + } + + public int lookup(String unit) + { + FSA.State state = _fsa.getState(); + // state.start(); // getState does this for us + state.delta(unit); + if(state.isFinal()){ + return state.hash(); + } + return -1; + } + + public String lookup(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return null; + } + int termoffset = _index.getInt(4*8*idx); + return getString(termoffset); + } + + private String getString(int stringOffset){ + if(_ok){ + int length = 0; + _strings.position(stringOffset); + while(_strings.get()!=0){ + length++; + } + ByteBuffer meta = ByteBuffer.allocate(length); + _strings.position(stringOffset); + _strings.get(meta.array(),0,length); + return decode(meta); + } + return null; + } + + public int frq(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return -1; + } + return _index.getInt(4*8*idx+4); + } + + public int cFrq(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return -1; + } + return _index.getInt(4*8*idx+8); + } + + public int qFrq(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return -1; + } + return _index.getInt(4*8*idx+12); + } + + public int sFrq(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return -1; + } + return _index.getInt(4*8*idx+16); + } + + public double score(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return -1.0; + } + return 100.0*cFrq(idx)/qFrq(idx); + } + + public double strength(int idx) + { + if(!_ok || idx<0 || idx>=h_index_size()){ + return -1.0; + } + return 100.0*qFrq(idx)/sFrq(idx); + } + + public int numExt(int idx) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+20); + if(offset==0){ + return 0; + } + return _info.getInt(4*offset); + } + + public int ext(int idx, int i) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+20); + if(offset==0){ + return -1; + } + if(i>=_info.getInt(4*offset)){ + return -1; + } + return _info.getInt(4*offset+4+8*i); + } + + public int extFrq(int idx, int i) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+20); + if(offset==0){ + return -1; + } + if(i>=_info.getInt(4*offset)){ + return -1; + } + return _info.getInt(4*offset+8+8*i); + } + + public int numAssoc(int idx) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+24); + if(offset==0){ + return 0; + } + return _info.getInt(4*offset); + } + + public int assoc(int idx, int i) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+24); + if(offset==0){ + return -1; + } + if(i>=_info.getInt(4*offset)){ + return -1; + } + return _info.getInt(4*offset+4+8*i); + } + + public int assocFrq(int idx, int i) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+24); + if(offset==0){ + return -1; + } + if(i>=_info.getInt(4*offset)){ + return -1; + } + return _info.getInt(4*offset+8+8*i); + } + + public int numCat(int idx) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+28); + if(offset==0){ + return 0; + } + return _info.getInt(4*offset); + } + + public int cat(int idx, int i) + { + if(idx<0 || idx>=h_index_size()){ + return -1; + } + int offset = _index.getInt(4*8*idx+28); + if(offset==0){ + return -1; + } + if(i>=_info.getInt(4*offset)){ + return -1; + } + return _info.getInt(4*offset+4+8*i); + } + + public String catName(int catidx) + { + if(!_ok || catidx<0 || catidx>=h_catindex_size()){ + return null; + } + int catoffset = _catindex.getInt(4*catidx); + return getString(catoffset); + } + + //// test //// + public static void main(String[] args) { + String domain = "/home/gv/fsa/automata/us_main_20041002_20041008"; + + ConceptNet cn = new ConceptNet(domain); + + System.out.println("Loading ConceptNet domain "+domain+": "+cn.isOk()); + int idx = cn.lookup("new york"); + System.out.println(" lookup(\"new york\") -> "+idx); + System.out.println(" lookup("+idx+") -> "+cn.lookup(idx)+"("+cn.score(idx)+","+cn.strength(idx)+")"); + System.out.println(" extensions("+cn.numExt(idx)+"):"); + for(int i=0;i<5 && i<cn.numExt(idx);i++){ + System.out.println(" "+cn.lookup(cn.ext(idx,i))+","+cn.extFrq(idx,i)); + } + if(5<cn.numExt(idx)){ + System.out.println(" ..."); + } + System.out.println(" associations("+cn.numAssoc(idx)+"):"); + for(int i=0;i<5 && i<cn.numAssoc(idx);i++){ + System.out.println(" "+cn.lookup(cn.assoc(idx,i))+","+cn.assocFrq(idx,i)); + } + if(5<cn.numAssoc(idx)){ + System.out.println(" ..."); + } + System.out.println(" categories("+cn.numCat(idx)+"):"); + for(int i=0;i<5 && i<cn.numCat(idx);i++){ + System.out.println(" "+cn.catName(cn.cat(idx,i))); + } + if(5<cn.numCat(idx)){ + System.out.println(" ..."); + } + } + + + +} diff --git a/fsa/src/main/java/com/yahoo/fsa/package-info.java b/fsa/src/main/java/com/yahoo/fsa/package-info.java new file mode 100644 index 00000000000..94c7fd30603 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/package-info.java @@ -0,0 +1,7 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +@ExportPackage +@PublicApi +package com.yahoo.fsa; + +import com.yahoo.api.annotations.PublicApi; +import com.yahoo.osgi.annotation.ExportPackage; diff --git a/fsa/src/main/java/com/yahoo/fsa/segmenter/Segment.java b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segment.java new file mode 100644 index 00000000000..1e424372a66 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segment.java @@ -0,0 +1,42 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa.segmenter; + +/** + * Class encapsulation of a segment. + * + * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a> + */ +public class Segment { + + int _beg; + int _end; + int _conn; + + public Segment(int b, int e, int c) + { + _beg = b; + _end = e; + _conn = c; + } + + public int beg() + { + return _beg; + } + + public int end() + { + return _end; + } + + public int len() + { + return _end-_beg; + } + + public int conn() + { + return _conn; + } + +} diff --git a/fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java new file mode 100644 index 00000000000..80ccd791644 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java @@ -0,0 +1,137 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa.segmenter; + +import java.util.LinkedList; +import java.util.ListIterator; + +import com.yahoo.fsa.FSA; + +/** + * API for accessing the Segmenter automata. + * + * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a> + */ +public class Segmenter { + + private FSA _fsa; + + public Segmenter(FSA fsa) { + _fsa = fsa; + } + + public Segmenter(String filename) { + _fsa = new FSA(filename,"utf-8"); + } + + public Segmenter(String filename, String charsetname) { + _fsa = new FSA(filename,charsetname); + } + + public boolean isOk() + { + return _fsa.isOk(); + } + + public Segments segment(String input) + { + String[] tokens = input.split("\\s"); + return segment(tokens); + } + + private class Detector { + FSA.State _state; + int _index; + + public Detector(FSA.State s, int i) + { + _state = s; + _index = i; + } + + public FSA.State state() + { + return _state; + } + + public int index() + { + return _index; + } + } + + public Segments segment(String[] tokens) + { + Segments segments = new Segments(tokens); + LinkedList detectors = new LinkedList(); + + int i=0; + + + while(i<tokens.length){ + detectors.add(new Detector(_fsa.getState(),i)); + + ListIterator det_it = detectors.listIterator(); + while(det_it.hasNext()){ + Detector d = (Detector)det_it.next(); + d.state().deltaWord(tokens[i]); + if(d.state().isFinal()){ + segments.add(new Segment(d.index(),i+1,d.state().data().getInt(0))); + } + + if(!d.state().isValid()){ + det_it.remove(); + } + } + i++; + } + + return segments; + } + + //// test //// + public static void main(String[] args) { + String fsafile = "/home/gv/fsa/automata/segments.fsa"; + + Segmenter segmenter = new Segmenter(fsafile); + + System.out.println("Loading segmenter FSA file "+fsafile+": "+segmenter.isOk()); + + for(int a=0;a<1||a<args.length;a++){ + + String query; + if(a==args.length){ + query = "times square head"; + } + else { + query = args[a]; + } + System.out.println("processing query \""+query+"\""); + + Segments segments = segmenter.segment(query); + System.out.println("all segments:"); + for(int i=0; i<segments.size();i++){ + System.out.println(" "+i+": \""+segments.sgm(i)+"\","+segments.conn(i)); + } + + Segments best; + + best = segments.segmentation(Segments.SEGMENTATION_WEIGHTED); + System.out.print("best segments (weighted): "); + for(int i=0; i<best.size();i++){ + System.out.print("("+best.sgm(i)+")"); + } + System.out.println(); + + best = segments.segmentation(Segments.SEGMENTATION_RIGHTMOST_LONGEST); + System.out.print("best segments (rightmost_longest):"); + for(int i=0; i<best.size();i++){ + System.out.print("("+best.sgm(i)+")"); + } + System.out.println(); + + } + + } + +} + diff --git a/fsa/src/main/java/com/yahoo/fsa/segmenter/Segments.java b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segments.java new file mode 100644 index 00000000000..26752046f80 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segments.java @@ -0,0 +1,313 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa.segmenter; + +import java.util.LinkedList; + +/** + * Contains the segmentation() method. + * + * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a> + */ +public class Segments extends LinkedList { + + public final static int SEGMENTATION_WEIGHTED = 0; + public final static int SEGMENTATION_WEIGHTED_BIAS10 = 1; + public final static int SEGMENTATION_WEIGHTED_BIAS20 = 2; + public final static int SEGMENTATION_WEIGHTED_BIAS50 = 3; + public final static int SEGMENTATION_WEIGHTED_BIAS100 = 4; + public final static int SEGMENTATION_WEIGHTED_LEFTMOST = 5; + public final static int SEGMENTATION_WEIGHTED_RIGHTMOST = 6; + public final static int SEGMENTATION_WEIGHTED_LONGEST = 7; + public final static int SEGMENTATION_LEFTMOST_LONGEST = 8; + public final static int SEGMENTATION_LEFTMOST_WEIGHTED = 9; + public final static int SEGMENTATION_RIGHTMOST_LONGEST = 10; + public final static int SEGMENTATION_RIGHTMOST_WEIGHTED = 11; + public final static int SEGMENTATION_LONGEST_WEIGHTED = 12; + public final static int SEGMENTATION_LONGEST_LEFTMOST = 13; + public final static int SEGMENTATION_LONGEST_RIGHTMOST = 14; + public final static int SEGMENTATION_METHODS = 15; + + private String[] _tokens; + private int _size; + private int[][] _map; + + public Segments(String[] tokens) + { + _tokens = tokens; + _size = tokens.length; + _map = new int[_size+1][_size+1]; + for(int i=0; i<=_size; i++){ + for(int j=0; j<=_size; j++){ + _map[i][j]=-1; + } + } + } + + public void add(Segment s) + { + super.add(s); + _map[s.beg()][s.end()]=super.size()-1; + } + + private void addMissingSingles() + { + for(int i=0; i<_size; i++){ + if(_map[i][i+1]==-1){ + super.add(new Segment(i,i+1,0)); + _map[i][i+1]=super.size()-1; + } + } + } + + private void reMap() + { + for(int i=0; i<=_size; i++){ + for(int j=0; j<=_size; j++){ + _map[i][j]=-1; + } + } + for(int i=0; i<super.size(); i++){ + _map[beg(i)][end(i)] = i; + } + } + + public String sgm(int idx) + { + if(idx<0 || idx>=super.size()){ + return null; + } + String s = new String(_tokens[((Segment)(super.get(idx))).beg()]); + for(int i=((Segment)(super.get(idx))).beg()+1;i<((Segment)(super.get(idx))).end();i++){ + s += " " + _tokens[i]; + } + return s; + } + + public int beg(int idx) + { + if(idx<0 || idx>=super.size()){ + return -1; + } + return ((Segment)(super.get(idx))).beg(); + } + + public int end(int idx) + { + if(idx<0 || idx>=super.size()){ + return -1; + } + return ((Segment)(super.get(idx))).end(); + } + + public int len(int idx) + { + if(idx<0 || idx>=super.size()){ + return -1; + } + return ((Segment)(super.get(idx))).len(); + } + + public int conn(int idx) + { + if(idx<0 || idx>=super.size()){ + return -1; + } + return ((Segment)(super.get(idx))).conn(); + } + + public Segments segmentation(int method) + { + Segments smnt = new Segments(_tokens); + + addMissingSingles(); + + int maxsc, id, bestid=-1, bias=0, c, pos, bestval, temp=0, next=-1; + int[] maxScore = new int[super.size()]; + int[] nextid = new int[super.size()]; + for(int i=0;i<nextid.length;i++){ + nextid[i]=-1; + } + + switch(method){ + case SEGMENTATION_WEIGHTED_BIAS100: + bias+=50; + case SEGMENTATION_WEIGHTED_BIAS50: + bias+=30; + case SEGMENTATION_WEIGHTED_BIAS20: + bias+=10; + case SEGMENTATION_WEIGHTED_BIAS10: + bias+=10; + case SEGMENTATION_WEIGHTED: + bestid=-1; + for(int i=_tokens.length;i>=0;i--){ + bestid=-1;maxsc=0; + for(int j=i+1;j<=_tokens.length;j++){ + id=_map[i][j]; + if(id>=0 && maxScore[id]+1>maxsc) { + bestid=id; + maxsc=maxScore[id]+1; + } + } + if(maxsc>0){ + maxsc--; + } + for(int j=0;j<i;j++){ + id=_map[j][i]; + if(id>=0){ + nextid[id] = bestid; + c = conn(id); + if(i-j<=1){ + maxScore[id] = maxsc; + } + else if(bias>0){ + maxScore[id] = maxsc + ((100+(i-j-2)*bias)*c)/100; + } + else{ + maxScore[id] = maxsc + c; + } + } + } + } + id = bestid; + while(id!=-1){ + smnt.add(((Segment)(super.get(id)))); + id=nextid[id]; + } + break; + case SEGMENTATION_LEFTMOST_LONGEST: + case SEGMENTATION_LEFTMOST_WEIGHTED: + pos = 0; + while(pos<_tokens.length){ + bestid = -1; bestval = -1; + for(int i=pos+1;i<=_tokens.length;i++){ + id = _map[pos][i]; + if(id>=0 && + (method==SEGMENTATION_LEFTMOST_LONGEST || + (temp=(len(id)>1)? conn(id) :0)>bestval) ){ + bestid = id; + bestval = temp; + next = i; + } + } + smnt.add((Segment)(super.get(bestid))); + pos=next; + } + break; + case SEGMENTATION_RIGHTMOST_LONGEST: + case SEGMENTATION_RIGHTMOST_WEIGHTED: + pos = _tokens.length; + while(pos>0){ + bestid = -1; bestval = -1; + for(int i=pos-1;i>=0;i--){ + id = _map[i][pos]; + if(id>=0 && + (method==SEGMENTATION_RIGHTMOST_LONGEST || + (temp=(len(id)>1)? conn(id) :0)>bestval) ){ + bestid = id; + bestval = temp; + next = i; + } + } + smnt.addFirst(super.get(bestid)); + pos=next; + } + smnt.reMap(); + break; + case SEGMENTATION_LONGEST_WEIGHTED: + case SEGMENTATION_LONGEST_LEFTMOST: + case SEGMENTATION_LONGEST_RIGHTMOST: + case SEGMENTATION_WEIGHTED_LONGEST: + case SEGMENTATION_WEIGHTED_LEFTMOST: + case SEGMENTATION_WEIGHTED_RIGHTMOST: + buildSegmentationRecursive(method,smnt,0,_tokens.length); + break; + } + + return smnt; + } + + private void buildSegmentationRecursive(int method, Segments smnt, int b, int e) + { + int bestid, bestval1, bestval2, temp; + + bestid=-1;bestval1=-1;bestval2=-1; + for(int i=0;i<super.size();i++){ + if(b<=beg(i) && e>=end(i)){ + switch(method){ + case SEGMENTATION_LONGEST_WEIGHTED: + if(len(i)>bestval1 || + (len(i)==bestval1 && conn(i)>bestval2) ){ + bestid=i; + bestval1=len(i); + bestval2=conn(i); + } + break; + case SEGMENTATION_LONGEST_LEFTMOST: + if(len(i)>bestval1 || + (len(i)==bestval1 && beg(i)<bestval2) ){ + bestid=i; + bestval1=len(i); + bestval2=beg(i); + } + break; + case SEGMENTATION_LONGEST_RIGHTMOST: + if(len(i)>bestval1 || + (len(i)==bestval1 && end(i)>bestval2) ){ + bestid=i; + bestval1=len(i); + bestval2=end(i); + } + break; + case SEGMENTATION_WEIGHTED_LONGEST: + temp = (len(i)>1)?conn(i):0; + if(temp>bestval1 || + (temp==bestval1 && len(i)>bestval2) ){ + bestid=i; + bestval1=temp; + bestval2=len(i); + } + break; + case SEGMENTATION_WEIGHTED_LEFTMOST: + temp = (len(i)>1)? conn(i) :0; + if(temp>bestval1 || + (temp==bestval1 && beg(i)<bestval2) ){ + bestid=i; + bestval1=temp; + bestval2=beg(i); + } + break; + case SEGMENTATION_WEIGHTED_RIGHTMOST: + temp = len(i)>1?conn(i):0; + if(temp>bestval1 || + (temp==bestval1 && end(i)>bestval2) ){ + bestid=i; + bestval1=temp; + bestval2=end(i); + } + break; + default: // dummy defult pick first possible + if(bestid<0){ + bestid=i; + } + break; + } + } + } + if(bestid<0) { + return; // this should never happen, as all one-word segments are created + } + + if(b<beg(bestid)){ + buildSegmentationRecursive(method,smnt,b,beg(bestid)); + } + + // add segment + smnt.add((Segment)(super.get(bestid))); + + // check right side + if(e>end(bestid)){ + buildSegmentationRecursive(method,smnt,end(bestid),e); + } + } + +} diff --git a/fsa/src/main/java/com/yahoo/fsa/topicpredictor/PredictedTopic.java b/fsa/src/main/java/com/yahoo/fsa/topicpredictor/PredictedTopic.java new file mode 100644 index 00000000000..2dd0dcc9bb2 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/topicpredictor/PredictedTopic.java @@ -0,0 +1,65 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa.topicpredictor; + + +/** + * Class encapsulation of a predicted topic. A topic has a weight and + * a term vector string of topicSegments. + * + * @author gjoranv + **/ +public class PredictedTopic { + + private String topic = ""; + private double weight = 0.0; + private String vector = ""; + + + public PredictedTopic(String topic, double weight, String vector){ + this.topic = topic; + this.weight = weight; + this.vector = vector; + } + + public PredictedTopic(String topic, double weight){ + this(topic, weight, ""); + } + + + /** Returns the topic */ + public String getTopic() { return topic; } + + /** Returns the weight */ + public double getWeight() { return weight; } + + /** Returns the vector*/ + public String getVector() { return vector; } + + + /** Sets the weight */ + public void setWeight(double weight) { + this.weight = weight; + } + + /** Adds to the weight */ + public void addWeight(double weight) { + this.weight += weight; + } + + /** Sets the vector*/ + public void setVector(String vector) { + this.vector = vector; + } + + /** Compares this topic to another topic, according to weight descending */ + public int compareDescendWeight(Object o) { + PredictedTopic pt = (PredictedTopic)o; + + double wgt1 = getWeight(); + double wgt2 = pt.getWeight(); + if (wgt1 < wgt2) { return 1; } + if (wgt1 > wgt2) { return -1;} + return 0; + } + +} diff --git a/fsa/src/main/java/com/yahoo/fsa/topicpredictor/TopicPredictor.java b/fsa/src/main/java/com/yahoo/fsa/topicpredictor/TopicPredictor.java new file mode 100644 index 00000000000..177e879c6c8 --- /dev/null +++ b/fsa/src/main/java/com/yahoo/fsa/topicpredictor/TopicPredictor.java @@ -0,0 +1,180 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.fsa.topicpredictor; + +import java.util.logging.Logger; +import java.util.List; +import java.util.LinkedList; +import java.util.Iterator; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.ByteOrder; +import java.nio.MappedByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.FileChannel.MapMode; +import java.nio.charset.Charset; + +import com.yahoo.fsa.FSA; +import com.yahoo.fsa.MetaData; + + +/** + * Class for accessing the topic prediction automata. Look up the + * predicted topics for a term. Each topic has an attached weight and + * a term vector (topicSegments). + * + * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a> + **/ +public class TopicPredictor extends MetaData { + + private static final String packageName = "com.yahoo.fsa.topicpredictor"; + + private FSA fsa = null; + + public TopicPredictor(String fsafile, String datfile){ + this(fsafile, datfile, "utf-8"); + } + + public TopicPredictor(String fsafile, String datfile, + String charsetname) { + super(datfile, charsetname); + if (!isOk()) { + Logger.getLogger(packageName). + warning("Error initializing predictor with file " + datfile); + } + + // Init the segment->'topic index' FSA + fsa = new FSA(fsafile); + if (!fsa.isOk()) { + Logger.getLogger(packageName). + warning("Error initializing FSA with file " + fsafile); + } + } + + /** + * Returns a list of PredictedTopic objects, one for each topic + * the segment maps to. The returned list contains all topics, + * as opposed to the two-argument version. + * @param segment The segment string to find (all) topics for. + * @return (Linked)List of PredictedTopic objects. */ + public List getPredictedTopics(String segment) { + return getPredictedTopics(segment, 0); + } + + /** + * Returns a list of PredictedTopic objects, one for each topic + * the segment maps to. The returned list length is cut off at + * 'maxTopics' entries, maxTopics=0 returns all topics. + * @param segment The segment string to find topics for. + * @param maxTopics The max number of topics to return, 0 for all topics + * @return (Linked)List of PredictedTopic objects. */ + public List getPredictedTopics(String segment, int maxTopics) { + List predictedTopics = new LinkedList(); + + int segIdx = getSegmentIndex(segment); + int[][] topicArr = getTopicArray(segIdx, maxTopics); + int numTopics = topicArr.length; + int allTopics = getNumTopics(segIdx); + /*Logger.getLogger(packageName). + fine("Segment: '" + segment + "' has " + allTopics + + " topics in automaton, fetched " + numTopics); + */ + for(int i=0; i < numTopics; i++) { + int weight = topicArr[i][1]; + String[] topicInfo= getTopicInfo(topicArr[i][0]); + String topic = topicInfo[0]; + String vector= topicInfo[1]; + PredictedTopic pt = + new PredictedTopic(topic, (double)weight, vector); + predictedTopics.add(pt); + } + + return predictedTopics; + } + + /** + * Returns the index (hash value) of the input segment in the FSA. + * @param segment The segment string to find index for. + * @return Index for this segment in the FSA. */ + private int getSegmentIndex(String segment) { + FSA.State s = fsa.getState(); + s.delta(segment); + if (s.isFinal()) { + return s.hash(); + } + return -1; + } + + /** + * Returns the number of topics the FSA contains for the input + * segment. + * @return Number of topics for the segment. */ + private int getNumTopics(int segIdx) { + if (segIdx < 0) { + return 0; + } + ByteBuffer buf = getIndirectRecordEntry(segIdx, 4); + return buf.getInt(0); + } + + /** + * Reads the topics and other metadata for a segment from the + * (memory-mapped) metadata file. Returns the info in a + * two-dimensional array (one row per topic). + * @param segIdx The FSA index (hash value) for the segment. + * @param maxTopics Max number of topics to return, 0 for all topics. + * @return Number of topics for the segment. */ + private int[][] getTopicArray(int segIdx, int maxTopics) { + if (segIdx < 0) { + return new int[0][0]; + } + + int numTopics = getNumTopics(segIdx); + if ((maxTopics > 0) && (numTopics > maxTopics)) { + numTopics = maxTopics; + } + + int[][] topics = new int[numTopics][2]; + ByteBuffer buf = getIndirectRecordEntry(segIdx,4+8*numTopics); + for(int i=0; i<numTopics; i++){ + topics[i][0] = buf.getInt(4+8*i); + topics[i][1] = buf.getInt(8+8*i); + } + return topics; + } + + /** + * Returns the topic and vector strings from the internal meta + * data structure. + * @param topicId Topic start index in a two-dimensional array + * @return topic string at [0] and vector string at [1] */ + private String[] getTopicInfo(int topicId) { + return getStringArrayEntry(user(0) + topicId, 2); + } + + + //// test //// + public static void main(String[] args) { + String segment = "new york"; + if (args.length >= 1) { + segment = args[0]; + } + + String fsafile = "/home/gv/fsa/automata/dmozPred_2.fsa"; + String datfile = "/home/gv/fsa/automata/dmozPred_2.dat"; + + TopicPredictor predictor = new TopicPredictor(fsafile, datfile); + + List predictedTopics = predictor.getPredictedTopics(segment, 25); + Iterator i = predictedTopics.iterator(); + while (i.hasNext()) { + PredictedTopic topic = (PredictedTopic) i.next(); + System.out.println("\n topic=" + topic.getTopic()); + System.out.println(" weight=" + topic.getWeight()); + System.out.println(" vector=" + topic.getVector()); + } + } + +} |