summaryrefslogtreecommitdiffstats
path: root/fsa/src/main/java
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@yahoo-inc.com>2016-06-15 23:09:44 +0200
committerJon Bratseth <bratseth@yahoo-inc.com>2016-06-15 23:09:44 +0200
commit72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /fsa/src/main/java
Publish
Diffstat (limited to 'fsa/src/main/java')
-rw-r--r--fsa/src/main/java/com/yahoo/fsa/FSA.java636
-rw-r--r--fsa/src/main/java/com/yahoo/fsa/MetaData.java217
-rw-r--r--fsa/src/main/java/com/yahoo/fsa/conceptnet/ConceptNet.java384
-rw-r--r--fsa/src/main/java/com/yahoo/fsa/package-info.java7
-rw-r--r--fsa/src/main/java/com/yahoo/fsa/segmenter/Segment.java42
-rw-r--r--fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java137
-rw-r--r--fsa/src/main/java/com/yahoo/fsa/segmenter/Segments.java313
-rw-r--r--fsa/src/main/java/com/yahoo/fsa/topicpredictor/PredictedTopic.java65
-rw-r--r--fsa/src/main/java/com/yahoo/fsa/topicpredictor/TopicPredictor.java180
9 files changed, 1981 insertions, 0 deletions
diff --git a/fsa/src/main/java/com/yahoo/fsa/FSA.java b/fsa/src/main/java/com/yahoo/fsa/FSA.java
new file mode 100644
index 00000000000..6e352f3ddca
--- /dev/null
+++ b/fsa/src/main/java/com/yahoo/fsa/FSA.java
@@ -0,0 +1,636 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.fsa;
+
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.net.URL;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.nio.CharBuffer;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel.MapMode;
+import java.nio.charset.Charset;
+import java.util.NoSuchElementException;
+
+
+/**
+ * Finite-State Automaton.
+ *
+ * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a>
+ */
+public class FSA {
+
+ /**
+ * Thread local state object used to traverse a Finite-State Automaton.
+ */
+ public static class State {
+
+ FSA fsa;
+ int state = 0;
+ int hash = 0;
+
+ private State(FSA fsa) {
+ this.fsa = fsa;
+ start();
+ }
+
+ public void start(){
+ state = fsa.start();
+ hash = 0;
+ }
+
+ public void delta(byte symbol) {
+ hash += fsa.hashDelta(state,symbol);
+ state = fsa.delta(state,symbol);
+ }
+
+ /** Returns whether the given symbol would take us to a valid state, without changing the state */
+ public boolean peekDelta(byte symbol) {
+ return fsa.delta(state,symbol)!=0;
+ }
+
+ public boolean tryDelta(byte symbol) {
+ int lastHash=hash;
+ int lastState=state;
+ delta(symbol);
+ if (isValid()) return true;
+
+ hash=lastHash;
+ state=lastState;
+ return false;
+ }
+
+ public void delta(char chr){
+ CharBuffer chrbuf = CharBuffer.allocate(1);
+ chrbuf.put(0,chr);
+ ByteBuffer buf = fsa.encode(chrbuf);
+ while(state >0 && buf.position()<buf.limit()){
+ delta(buf.get());
+ }
+ }
+
+ /** Jumps ahead by string */
+ public void delta(String string){
+ ByteBuffer buf = fsa.encode(string);
+ while(state >0 && buf.position()<buf.limit()){
+ delta(buf.get());
+ }
+ }
+
+ /**
+ * Jumps ahead by string if that puts us into a valid state, does nothing otherwise
+ *
+ * @return whether we jumped to a valid state (true) or di nothing (false)
+ */
+ public boolean tryDelta(String string){
+ int lastHash=hash;
+ int lastState=state;
+ delta(string);
+ if (isValid()) return true;
+
+ hash=lastHash;
+ state=lastState;
+ return false;
+ }
+
+ /** Jumps ahead by a word - if this is not the first word, it must be preceeded by space. */
+ public void deltaWord(String string){
+ if (state!=fsa.start()) {
+ delta((byte)' ');
+ }
+ delta(string);
+ }
+
+ /**
+ * Tries to jump ahead by one word. If the given string is not the next complete valid word, nothing is done.
+ */
+ public boolean tryDeltaWord(String string){
+ int lastHash=hash;
+ int lastState=state;
+ tryDelta((byte)' ');
+ delta(string);
+ if (isValid() && peekDelta((byte)' ')) return true;
+ if (isFinal()) return true;
+
+ hash=lastHash;
+ state=lastState;
+ return false;
+ }
+
+ public boolean isFinal(){
+ return fsa.isFinal(state);
+ }
+
+ public boolean isStartState() {
+ return fsa.start() == state;
+ }
+
+ public boolean isValid(){
+ return state !=0;
+ }
+
+ public ByteBuffer data(){
+ return fsa.data(state);
+ }
+
+ public String dataString(){
+ return fsa.dataString(state);
+ }
+
+ public int hash(){
+ return hash;
+ }
+
+ public ByteBuffer lookup(String str){
+ start();
+ delta(str);
+ return fsa.data(state);
+ }
+
+ public boolean hasPerfectHash(){
+ return fsa.hasPerfectHash();
+ }
+
+ }
+
+ /**
+ * Class used to iterate over all accepted strings in the fsa.
+ */
+ public static class Iterator implements java.util.Iterator<Iterator.Item> {
+ /**
+ * Internally, this class stores the state information for the iterator.
+ * Externally, it is used for accessing the data associated with the iterator position.
+ */
+ public static class Item {
+ private FSA fsa;
+ private java.util.Stack<Byte> string;
+ private int symbol;
+ private int state;
+ private java.util.Stack<Integer> stack;
+
+ /**
+ * Constructor
+ * @param fsa the FSA object the iterator is associated with.
+ * @param state the state used as start state.
+ */
+ public Item(FSA fsa, int state) {
+ this.fsa = fsa;
+ this.string = new java.util.Stack();
+ this.symbol = 0;
+ this.state = state;
+ this.stack = new java.util.Stack();
+ }
+
+ /**
+ * Copy constructor. (Does not copy the state stack)
+ */
+ public Item(Item item) {
+ this.fsa = item.fsa;
+ this.string = new java.util.Stack();
+ for (java.util.Iterator<Byte> itr = item.string.iterator(); itr.hasNext(); ) {
+ byte b = itr.next();
+ this.string.push(b);
+ }
+ this.symbol = item.symbol;
+ this.state = item.state;
+ // no need to fill the stack as this constructor is used by Iterator::next()
+ this.stack = null;
+ }
+
+ public String getString() {
+ ByteBuffer buffer = ByteBuffer.allocate(string.size());
+ for (java.util.Iterator<Byte> itr = string.iterator(); itr.hasNext(); ) {
+ byte b = itr.next();
+ buffer.put(b);
+ }
+ buffer.flip();
+ return fsa.decode(buffer);
+ }
+
+ public ByteBuffer getData() {
+ return fsa.data(state);
+ }
+
+ public String getDataString() {
+ return fsa.dataString(state);
+ }
+
+ public String toString() {
+ return "string: " + string + "(" + getString() + "), symbol: " + symbol + ", state: " + state;
+ }
+ }
+
+ private Item item;
+ boolean useInitState = false;
+
+ /**
+ * Constructor.
+ * @param state the state to create the iterator from.
+ */
+ public Iterator(State state) {
+ item = new Item(state.fsa, state.state);
+ if (state.isFinal()) {
+ useInitState = true;
+ } else {
+ findNext();
+ }
+ }
+
+ private void findNext() {
+ int nextState;
+ int depth;
+
+ if (item.symbol == 256 || item.fsa == null) {
+ throw new NoSuchElementException();
+ }
+
+ // flip the flag now that the first state has been returned
+ if (useInitState) {
+ useInitState = false;
+ }
+
+ // try to find the next final state
+ for(;;) {
+ item.symbol++;
+ if (item.symbol < 256) {
+ byte symbol = (byte)item.symbol;
+ nextState = item.fsa.delta(item.state, (byte)item.symbol);
+ if (nextState != 0) {
+ item.string.push((byte)item.symbol);
+ item.stack.push(item.state);
+ item.state = nextState;
+ item.symbol = 0;
+ if (item.fsa.isFinal(nextState)) {
+ break;
+ }
+ }
+ } else { // backtrack
+ if ((depth = item.string.size()) > 0) {
+ byte b = item.string.pop(); // remove the last byte
+ item.symbol = b < 0 ? b + 256 : b;
+ item.state = item.stack.pop();
+ } else {
+ item.state = 0;
+ break;
+ }
+ }
+ }
+ }
+
+ public boolean hasNext() {
+ return item.state != 0 || useInitState;
+ }
+
+ public Item next() {
+ Item retval = new Item(item);
+ findNext();
+ return retval;
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ }
+
+ public State getState(){
+ return new State(this);
+ }
+
+ /**
+ * Returns a new iterator to the start state.
+ */
+ public Iterator iterator() {
+ return new Iterator(getState());
+ }
+
+ /**
+ * Returns a new iterator to the given state.
+ * @param state the state to create the iterator from.
+ */
+ public Iterator iterator(State state) {
+ return new Iterator(state);
+ }
+
+ private boolean _ok = false;
+ private MappedByteBuffer _header;
+ private MappedByteBuffer _symbol_tab;
+ private MappedByteBuffer _state_tab;
+ private MappedByteBuffer _data;
+ private MappedByteBuffer _phash;
+ private Charset _charset;
+
+ /**
+ * Loads an FSA from a resource file name, which is resolved from the class path of the
+ * class loader of the given class.
+ * <p>
+ * This is useful for loading fsa's deployed within OSGi bundles.
+ *
+ * @param resourceFileName the name of the file, relative to any element on the classpath.
+ * For example, if the classpath contains resources/ and the file is resources/myfsa.fsa
+ * this argument should be myfsa.fsa
+ * @param loadingClass a class which provides the class loader to use for loading. Any class which is loaded
+ * from the same class path as the resource will do (e.g with OSGi - any class in the same bundle jar)
+ * @return the loaded FSA
+ * @throws RuntimeException if the class could not be loaded
+ */
+ public static FSA loadFromResource(String resourceFileName,Class loadingClass) {
+ URL fsaUrl=loadingClass.getResource(resourceFileName);
+ if ( ! "file".equals(fsaUrl.getProtocol()))
+ throw new RuntimeException("Could not open non-file url '" + fsaUrl + "' as a file input stream: " +
+ "The classloader of " + loadingClass + "' does not return file urls");
+ return new FSA(fsaUrl.getFile());
+ }
+
+ /**
+ * Loads an FSA from a file using utf-8 encoding
+ *
+ * @throws IllegalArgumentException if the file is not found
+ */
+ public FSA(String filename) {
+ init(filename,"utf-8");
+ }
+
+ /**
+ * Loads an FSA from a file using the specified character encoding.
+ *
+ * @throws IllegalArgumentException if the file is not found
+ */
+ public FSA(String filename, String charsetname) {
+ init(filename,charsetname);
+ }
+
+ /** Loads an FSA from a file input stream using utf-8 encoding */
+ public FSA(FileInputStream filename) {
+ init(filename,"utf-8");
+ }
+
+ /** Loads an FSA from a file input stream using the specified character encoding */
+ public FSA(FileInputStream filename, String charsetname) {
+ init(filename,charsetname);
+ }
+
+ private void init(String filename, String charsetname){
+ try {
+ init(new FileInputStream(filename),charsetname);
+ }
+ catch (FileNotFoundException e) {
+ throw new IllegalArgumentException("Could not find FSA file '" + filename + "'",e);
+ }
+ catch (IOException e) {
+ throw new IllegalArgumentException("Could not read FSA file '" + filename + "'",e);
+ }
+ }
+
+ private void init(FileInputStream file, String charsetname) {
+ try {
+ _charset = Charset.forName(charsetname);
+
+ _header = file.getChannel().map(MapMode.READ_ONLY,0,256);
+ _header.order(ByteOrder.LITTLE_ENDIAN);
+ if (h_magic()!=2038637673) {
+ throw new IOException("Stream does not contain an FSA: Wrong file magic number " + h_magic());
+ }
+ _symbol_tab = file.getChannel().map(MapMode.READ_ONLY,
+ 256,h_size());
+ _symbol_tab.order(ByteOrder.LITTLE_ENDIAN);
+ _state_tab = file.getChannel().map(MapMode.READ_ONLY,
+ 256+h_size(),4*h_size());
+ _state_tab.order(ByteOrder.LITTLE_ENDIAN);
+ _data = file.getChannel().map(MapMode.READ_ONLY,
+ 256+5*h_size(),h_data_size());
+ _data.order(ByteOrder.LITTLE_ENDIAN);
+ if(h_has_phash()>0){
+ _phash = file.getChannel().map(MapMode.READ_ONLY,
+ 256+5*h_size()+h_data_size(),
+ 4*h_size());
+ _phash.order(ByteOrder.LITTLE_ENDIAN);
+ }
+ _ok=true;
+ }
+ catch (IOException e) {
+ throw new RuntimeException("IO error while reading FSA file",e);
+ }
+ }
+
+ private int h_magic(){
+ return _header.getInt(0);
+ }
+ private int h_version(){
+ return _header.getInt(4);
+ }
+ private int h_checksum(){
+ return _header.getInt(8);
+ }
+ private int h_size(){
+ return _header.getInt(12);
+ }
+ private int h_start(){
+ return _header.getInt(16);
+ }
+ private int h_data_size(){
+ return _header.getInt(20);
+ }
+ private int h_data_type(){
+ return _header.getInt(24);
+ }
+ private int h_fixed_data_size(){
+ return _header.getInt(28);
+ }
+ private int h_has_phash(){
+ return _header.getInt(32);
+ }
+ private int h_serial(){
+ return _header.getInt(36);
+ }
+ private int getSymbol(int index){
+ int symbol = _symbol_tab.get(index);
+ if(symbol<0){
+ symbol += 256;
+ }
+ return symbol;
+ }
+
+ private ByteBuffer encode(String str){
+ return _charset.encode(str);
+ }
+
+ private ByteBuffer encode(CharBuffer chrbuf){
+ return _charset.encode(chrbuf);
+ }
+
+ private String decode(ByteBuffer buf){
+ return _charset.decode(buf).toString();
+ }
+
+ public boolean isOk(){
+ return _ok;
+ }
+
+ public boolean hasPerfectHash(){
+ return _ok && h_has_phash()==1;
+ }
+
+ public int version(){
+ if(_ok){
+ return h_version();
+ }
+ return 0;
+ }
+
+ public int serial(){
+ if(_ok){
+ return h_serial();
+ }
+ return 0;
+ }
+
+ protected int start(){
+ if(_ok){
+ return h_start();
+ }
+
+ return 0;
+ }
+
+ protected int delta(int state, byte symbol){
+ int s=symbol;
+ if(s<0){
+ s+=256;
+ }
+ if(_ok && s>0 && s<255){
+ if(getSymbol(state+s)==s){
+ return _state_tab.getInt(4*(state+s));
+ }
+ }
+ return 0;
+ }
+
+ protected int hashDelta(int state, byte symbol){
+ int s=symbol;
+ if(s<0){
+ s+=256;
+ }
+ if(_ok && h_has_phash()==1 && s>0 && s<255){
+ if(getSymbol(state+s)==s){
+ return _phash.getInt(4*(state+s));
+ }
+ }
+ return 0;
+ }
+
+ protected boolean isFinal(int state){
+ if(_ok){
+ if(getSymbol(state+255)==255){
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Retrieves data for the given state using the underlying fsa data buffer.
+ * @param state The fsa state to retrieve data from.
+ * @return A new buffer containing the data for the given state.
+ **/
+ protected ByteBuffer data(int state) {
+ if(_ok && isFinal(state)){
+ int offset = _state_tab.getInt(4*(state+255));
+ int length;
+ if(h_data_type()==1){
+ length = h_fixed_data_size();
+ }
+ else{
+ length = _data.getInt(offset);
+ offset += 4;
+ }
+ ByteBuffer meta = ByteBuffer.allocate(length);
+ meta.order(ByteOrder.LITTLE_ENDIAN);
+ byte[] dst = meta.array();
+ for (int i = 0; i < length; ++i) {
+ dst[i] = _data.get(i + offset);
+ }
+ return meta;
+ }
+ return null;
+ }
+
+ /**
+ * Retrieves data for the given state using the underlying fsa data buffer.
+ * @param state The fsa state to retrieve data from.
+ * @return A string representation of the data for the given state.
+ **/
+ protected String dataString(int state) {
+ ByteBuffer meta = data(state);
+ if(meta!=null){
+ // Remove trailing '\0' if it exists. This is usually the
+ // case for automata built with text format (makefsa -t)
+ String data = decode(meta);
+ if (data.endsWith("\0")) {
+ data = data.substring(0, data.length()-1);
+ }
+ return data;
+ }
+ return null;
+ }
+
+ /**
+ * Convenience method that returns the metadata string in the fsa
+ * for the input lookup String, or null if the input string does
+ * not exist in the fsa.
+ * @param str The string to look up.
+ * @return Metadata string from the fsa. */
+ public String lookup(String str){
+ State s = getState();
+ s.lookup(str);
+ return s.dataString();
+ }
+
+
+ //// test ////
+ public static void main(String[] args) {
+ String test = "sour cherry";
+ if (args.length >= 1) {
+ test = args[0];
+ }
+
+ String fsafile = "/home/gv/fsa/test/__testfsa__.__fsa__";
+ //String fsafile = "/home/p13n/prelude/automata/query2dmozsegments.fsa";
+
+ FSA fsa = new FSA(fsafile);
+
+ System.out.println("Loading FSA file "+fsafile+": "+fsa.isOk());
+ System.out.println(" version: " + fsa.version()/1000000 + "." +
+ (fsa.version()/1000) % 1000 + "." +
+ fsa.version() % 1000);
+ System.out.println(" serial: " + fsa.serial());
+ System.out.println(" phash: " + fsa.hasPerfectHash());
+
+ FSA.State s = fsa.getState();
+
+ s.start();
+ for (int i=0; i < test.length(); i++) {
+ s.delta(test.charAt(i));
+ }
+ System.out.println("\ndelta() char test " + test + ": " +
+ s.isFinal() + ", info: " + s.dataString() +
+ ", hash value: " + s.hash());
+
+ s.start();
+ s.delta(test);
+ System.out.println("\ndelta() test " + test + ": " +
+ s.isFinal() + ", info: " + s.dataString() +
+ ", hash value: " + s.hash());
+
+ s.lookup(test);
+ String data = s.dataString();
+ System.out.println("\nlookup() test \"" + test + "\": " +
+ (s.lookup(test) != null) +
+ ", info: " + data + ", hash value: " + s.hash());
+
+ String data2 = fsa.lookup(test);
+ System.out.println("\nFSA.lookup() test \"" + test + "\": " + data2);
+ }
+}
+
+
diff --git a/fsa/src/main/java/com/yahoo/fsa/MetaData.java b/fsa/src/main/java/com/yahoo/fsa/MetaData.java
new file mode 100644
index 00000000000..fde868464c8
--- /dev/null
+++ b/fsa/src/main/java/com/yahoo/fsa/MetaData.java
@@ -0,0 +1,217 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.fsa;
+
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.FileChannel.MapMode;
+import java.nio.charset.Charset;
+
+import com.yahoo.fsa.FSA;
+
+
+/**
+ * Class for accessing meta-data (dat-files) used by FSA applications.
+ *
+ * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a>
+ **/
+public class MetaData {
+
+ private boolean _ok = false;
+ private MappedByteBuffer _header;
+ private MappedByteBuffer _data;
+ private Charset _charset;
+
+
+ public MetaData(String filename){
+ init(filename, "utf-8");
+ }
+
+ public MetaData(String filename, String charsetname){
+ init(filename, charsetname);
+ }
+
+ public boolean isOk(){
+ return _ok;
+ }
+
+ private void init(String filename, String charsetname){
+
+ _charset = Charset.forName(charsetname);
+
+ FileInputStream file;
+ try {
+ file = new FileInputStream(filename);
+ }
+ catch (FileNotFoundException e) {
+ System.out.print("MetaData file " + filename + " not found.\n");
+ return;
+ }
+
+ try {
+ _header = file.getChannel().map(MapMode.READ_ONLY,0,256);
+ _header.order(ByteOrder.LITTLE_ENDIAN);
+ if(h_magic()!=-2025936501){
+ System.out.print("MetaData bad magic " + h_magic() +"\n");
+ return;
+ }
+ _data = file.getChannel().map(MapMode.READ_ONLY,
+ 256,
+ h_size());
+ _data.order(ByteOrder.LITTLE_ENDIAN);
+ _ok=true;
+ }
+ catch (IOException e) {
+ System.out.print("MetaData IO exception.\n");
+ return;
+ }
+ }
+
+ private int h_magic(){
+ return _header.getInt(0);
+ }
+ private int h_version(){
+ return _header.getInt(4);
+ }
+ private int h_checksum(){
+ return _header.getInt(8);
+ }
+ private int h_size(){
+ return _header.getInt(12);
+ }
+ private int h_reserved(int i){
+ if(i<0||i>9){
+ return 0;
+ }
+ return _header.getInt(16+4*i);
+ }
+ private int h_user(int i){
+ if(i<0||i>49){
+ return 0;
+ }
+ return _header.getInt(56+4*i);
+ }
+
+
+ private ByteBuffer encode(CharBuffer chrbuf){
+ return _charset.encode(chrbuf);
+ }
+
+ private String decode(ByteBuffer buf){
+ return _charset.decode(buf).toString();
+ }
+
+
+ public int user(int i){
+ if(!_ok){
+ return 0;
+ }
+ return h_user(i);
+ }
+
+ public int getIntEntry(int idx)
+ {
+ if(_ok){
+ return _data.getInt(idx*4);
+ }
+ else
+ return 0;
+ }
+
+ public ByteBuffer getDirectRecordEntry(int idx, int size)
+ {
+ if(_ok){
+ ByteBuffer meta = ByteBuffer.allocate(size);
+ meta.order(ByteOrder.LITTLE_ENDIAN);
+ _data.position(idx*size);
+ _data.get(meta.array(),0,size);
+ return meta;
+ }
+ else
+ return null;
+ }
+
+ public ByteBuffer getIndirectRecordEntry(int idx, int size)
+ {
+ if(_ok){
+ int offset = _data.getInt(idx*4);
+ ByteBuffer meta = ByteBuffer.allocate(size);
+ meta.order(ByteOrder.LITTLE_ENDIAN);
+ _data.position(offset);
+ _data.get(meta.array(),0,size);
+ return meta;
+ }
+ else
+ return null;
+ }
+
+ public ByteBuffer getIndirectRecordEntry(int idx)
+ {
+ if(_ok){
+ int offset = _data.getInt(idx*4);
+ int size = _data.getInt(offset);
+ ByteBuffer meta = ByteBuffer.allocate(size);
+ meta.order(ByteOrder.LITTLE_ENDIAN);
+ _data.position(offset+4);
+ _data.get(meta.array(),0,size);
+ return meta;
+ }
+ else
+ return null;
+ }
+
+ public String getStringEntry(int stringOffset){
+ if(_ok){
+ int length = 0;
+ _data.position(stringOffset);
+ while(_data.get()!=0){
+ length++;
+ }
+ ByteBuffer meta = ByteBuffer.allocate(length);
+ meta.order(ByteOrder.LITTLE_ENDIAN);
+ _data.position(stringOffset);
+ _data.get(meta.array(),0,length);
+ return decode(meta);
+ }
+ return null;
+ }
+
+ public String[] getStringArrayEntry(int stringOffset, int numStrings){
+ if(_ok && numStrings>0){
+ String[] stringArray = new String[numStrings];
+ int pos=stringOffset;
+ for(int i=0;i<numStrings;i++){
+ int length = 0;
+ _data.position(pos);
+ while(_data.get()!=0){
+ length++;
+ }
+ ByteBuffer meta = ByteBuffer.allocate(length);
+ meta.order(ByteOrder.LITTLE_ENDIAN);
+ _data.position(pos);
+ _data.get(meta.array(),0,length);
+ stringArray[i] = decode(meta);
+ pos += length+1;
+ }
+ return stringArray;
+ }
+ return null;
+ }
+
+ //// test ////
+ public static void main(String[] args) {
+ String file = "dmozPred_2.dat";
+
+ MetaData metaData = new MetaData(file);
+
+ System.out.println("Loading MetaData "+file+": "+metaData.isOk());
+ }
+
+
+
+}
diff --git a/fsa/src/main/java/com/yahoo/fsa/conceptnet/ConceptNet.java b/fsa/src/main/java/com/yahoo/fsa/conceptnet/ConceptNet.java
new file mode 100644
index 00000000000..13cb93073d2
--- /dev/null
+++ b/fsa/src/main/java/com/yahoo/fsa/conceptnet/ConceptNet.java
@@ -0,0 +1,384 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.fsa.conceptnet;
+
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.FileChannel.MapMode;
+import java.nio.charset.Charset;
+
+import com.yahoo.fsa.FSA;
+
+
+/**
+ * Class for accessing the concept network automata.
+ *
+ * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a>
+ **/
+public class ConceptNet {
+
+ private FSA _fsa;
+ private boolean _ok = false;
+ private MappedByteBuffer _header;
+ private MappedByteBuffer _index;
+ private MappedByteBuffer _info;
+ private MappedByteBuffer _catindex;
+ private MappedByteBuffer _strings;
+ private Charset _charset;
+
+
+ public ConceptNet(String domain){
+ init(domain, "utf-8");
+ }
+
+ public ConceptNet(String domain, String charsetname){
+ init(domain, charsetname);
+ }
+
+ public boolean isOk(){
+ return _ok;
+ }
+
+ private void init(String domain, String charsetname){
+
+ _charset = Charset.forName(charsetname);
+
+ _fsa = new FSA(domain + ".fsa",charsetname);
+
+ if(!_fsa.isOk()){
+ return;
+ }
+
+ FileInputStream file;
+ try {
+ file = new FileInputStream(domain + ".dat");
+ }
+ catch (FileNotFoundException e) {
+ System.out.print("ConceptNet data file " + domain + ".dat" + " not found.\n");
+ return;
+ }
+
+ try {
+ _header = file.getChannel().map(MapMode.READ_ONLY,0,256);
+ _header.order(ByteOrder.LITTLE_ENDIAN);
+ if(h_magic()!=238579428){
+ System.out.print("ConceptNet bad magic " + h_magic() +"\n");
+ return;
+ }
+ _index = file.getChannel().map(MapMode.READ_ONLY,
+ 256,
+ 8*4*h_index_size());
+ _index.order(ByteOrder.LITTLE_ENDIAN);
+ _info = file.getChannel().map(MapMode.READ_ONLY,
+ 256+8*4*h_index_size(),
+ 4*h_info_size());
+ _info.order(ByteOrder.LITTLE_ENDIAN);
+ _catindex = file.getChannel().map(MapMode.READ_ONLY,
+ 256+8*4*h_index_size()+4*h_info_size(),
+ 4*h_catindex_size());
+ _catindex.order(ByteOrder.LITTLE_ENDIAN);
+ _strings = file.getChannel().map(MapMode.READ_ONLY,
+ 256+8*4*h_index_size()+4*h_info_size()+4*h_catindex_size(),
+ h_strings_size());
+ _strings.order(ByteOrder.LITTLE_ENDIAN);
+ _ok=true;
+ }
+ catch (IOException e) {
+ System.out.print("ConceptNet IO exception.\n");
+ return;
+ }
+ }
+
+ private int h_magic(){
+ return _header.getInt(0);
+ }
+ private int h_version(){
+ return _header.getInt(4);
+ }
+ private int h_checksum(){
+ return _header.getInt(8);
+ }
+ private int h_index_size(){
+ return _header.getInt(12);
+ }
+ private int h_info_size(){
+ return _header.getInt(16);
+ }
+ private int h_catindex_size(){
+ return _header.getInt(20);
+ }
+ private int h_strings_size(){
+ return _header.getInt(24);
+ }
+ private int h_max_freq(){
+ return _header.getInt(28);
+ }
+ private int h_max_cfreq(){
+ return _header.getInt(32);
+ }
+ private int h_max_qfreq(){
+ return _header.getInt(36);
+ }
+ private int h_max_sfreq(){
+ return _header.getInt(40);
+ }
+ private int h_max_efreq(){
+ return _header.getInt(44);
+ }
+ private int h_max_afreq(){
+ return _header.getInt(48);
+ }
+
+
+ private ByteBuffer encode(CharBuffer chrbuf){
+ return _charset.encode(chrbuf);
+ }
+
+ private String decode(ByteBuffer buf){
+ return _charset.decode(buf).toString();
+ }
+
+ public int lookup(String unit)
+ {
+ FSA.State state = _fsa.getState();
+ // state.start(); // getState does this for us
+ state.delta(unit);
+ if(state.isFinal()){
+ return state.hash();
+ }
+ return -1;
+ }
+
+ public String lookup(int idx)
+ {
+ if(!_ok || idx<0 || idx>=h_index_size()){
+ return null;
+ }
+ int termoffset = _index.getInt(4*8*idx);
+ return getString(termoffset);
+ }
+
+ private String getString(int stringOffset){
+ if(_ok){
+ int length = 0;
+ _strings.position(stringOffset);
+ while(_strings.get()!=0){
+ length++;
+ }
+ ByteBuffer meta = ByteBuffer.allocate(length);
+ _strings.position(stringOffset);
+ _strings.get(meta.array(),0,length);
+ return decode(meta);
+ }
+ return null;
+ }
+
+ public int frq(int idx)
+ {
+ if(!_ok || idx<0 || idx>=h_index_size()){
+ return -1;
+ }
+ return _index.getInt(4*8*idx+4);
+ }
+
+ public int cFrq(int idx)
+ {
+ if(!_ok || idx<0 || idx>=h_index_size()){
+ return -1;
+ }
+ return _index.getInt(4*8*idx+8);
+ }
+
+ public int qFrq(int idx)
+ {
+ if(!_ok || idx<0 || idx>=h_index_size()){
+ return -1;
+ }
+ return _index.getInt(4*8*idx+12);
+ }
+
+ public int sFrq(int idx)
+ {
+ if(!_ok || idx<0 || idx>=h_index_size()){
+ return -1;
+ }
+ return _index.getInt(4*8*idx+16);
+ }
+
+ public double score(int idx)
+ {
+ if(!_ok || idx<0 || idx>=h_index_size()){
+ return -1.0;
+ }
+ return 100.0*cFrq(idx)/qFrq(idx);
+ }
+
+ public double strength(int idx)
+ {
+ if(!_ok || idx<0 || idx>=h_index_size()){
+ return -1.0;
+ }
+ return 100.0*qFrq(idx)/sFrq(idx);
+ }
+
+ public int numExt(int idx)
+ {
+ if(idx<0 || idx>=h_index_size()){
+ return -1;
+ }
+ int offset = _index.getInt(4*8*idx+20);
+ if(offset==0){
+ return 0;
+ }
+ return _info.getInt(4*offset);
+ }
+
+ public int ext(int idx, int i)
+ {
+ if(idx<0 || idx>=h_index_size()){
+ return -1;
+ }
+ int offset = _index.getInt(4*8*idx+20);
+ if(offset==0){
+ return -1;
+ }
+ if(i>=_info.getInt(4*offset)){
+ return -1;
+ }
+ return _info.getInt(4*offset+4+8*i);
+ }
+
+ public int extFrq(int idx, int i)
+ {
+ if(idx<0 || idx>=h_index_size()){
+ return -1;
+ }
+ int offset = _index.getInt(4*8*idx+20);
+ if(offset==0){
+ return -1;
+ }
+ if(i>=_info.getInt(4*offset)){
+ return -1;
+ }
+ return _info.getInt(4*offset+8+8*i);
+ }
+
+ public int numAssoc(int idx)
+ {
+ if(idx<0 || idx>=h_index_size()){
+ return -1;
+ }
+ int offset = _index.getInt(4*8*idx+24);
+ if(offset==0){
+ return 0;
+ }
+ return _info.getInt(4*offset);
+ }
+
+ public int assoc(int idx, int i)
+ {
+ if(idx<0 || idx>=h_index_size()){
+ return -1;
+ }
+ int offset = _index.getInt(4*8*idx+24);
+ if(offset==0){
+ return -1;
+ }
+ if(i>=_info.getInt(4*offset)){
+ return -1;
+ }
+ return _info.getInt(4*offset+4+8*i);
+ }
+
+ public int assocFrq(int idx, int i)
+ {
+ if(idx<0 || idx>=h_index_size()){
+ return -1;
+ }
+ int offset = _index.getInt(4*8*idx+24);
+ if(offset==0){
+ return -1;
+ }
+ if(i>=_info.getInt(4*offset)){
+ return -1;
+ }
+ return _info.getInt(4*offset+8+8*i);
+ }
+
+ public int numCat(int idx)
+ {
+ if(idx<0 || idx>=h_index_size()){
+ return -1;
+ }
+ int offset = _index.getInt(4*8*idx+28);
+ if(offset==0){
+ return 0;
+ }
+ return _info.getInt(4*offset);
+ }
+
+ public int cat(int idx, int i)
+ {
+ if(idx<0 || idx>=h_index_size()){
+ return -1;
+ }
+ int offset = _index.getInt(4*8*idx+28);
+ if(offset==0){
+ return -1;
+ }
+ if(i>=_info.getInt(4*offset)){
+ return -1;
+ }
+ return _info.getInt(4*offset+4+8*i);
+ }
+
+ public String catName(int catidx)
+ {
+ if(!_ok || catidx<0 || catidx>=h_catindex_size()){
+ return null;
+ }
+ int catoffset = _catindex.getInt(4*catidx);
+ return getString(catoffset);
+ }
+
+ //// test ////
+ public static void main(String[] args) {
+ String domain = "/home/gv/fsa/automata/us_main_20041002_20041008";
+
+ ConceptNet cn = new ConceptNet(domain);
+
+ System.out.println("Loading ConceptNet domain "+domain+": "+cn.isOk());
+ int idx = cn.lookup("new york");
+ System.out.println(" lookup(\"new york\") -> "+idx);
+ System.out.println(" lookup("+idx+") -> "+cn.lookup(idx)+"("+cn.score(idx)+","+cn.strength(idx)+")");
+ System.out.println(" extensions("+cn.numExt(idx)+"):");
+ for(int i=0;i<5 && i<cn.numExt(idx);i++){
+ System.out.println(" "+cn.lookup(cn.ext(idx,i))+","+cn.extFrq(idx,i));
+ }
+ if(5<cn.numExt(idx)){
+ System.out.println(" ...");
+ }
+ System.out.println(" associations("+cn.numAssoc(idx)+"):");
+ for(int i=0;i<5 && i<cn.numAssoc(idx);i++){
+ System.out.println(" "+cn.lookup(cn.assoc(idx,i))+","+cn.assocFrq(idx,i));
+ }
+ if(5<cn.numAssoc(idx)){
+ System.out.println(" ...");
+ }
+ System.out.println(" categories("+cn.numCat(idx)+"):");
+ for(int i=0;i<5 && i<cn.numCat(idx);i++){
+ System.out.println(" "+cn.catName(cn.cat(idx,i)));
+ }
+ if(5<cn.numCat(idx)){
+ System.out.println(" ...");
+ }
+ }
+
+
+
+}
diff --git a/fsa/src/main/java/com/yahoo/fsa/package-info.java b/fsa/src/main/java/com/yahoo/fsa/package-info.java
new file mode 100644
index 00000000000..94c7fd30603
--- /dev/null
+++ b/fsa/src/main/java/com/yahoo/fsa/package-info.java
@@ -0,0 +1,7 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+@ExportPackage
+@PublicApi
+package com.yahoo.fsa;
+
+import com.yahoo.api.annotations.PublicApi;
+import com.yahoo.osgi.annotation.ExportPackage;
diff --git a/fsa/src/main/java/com/yahoo/fsa/segmenter/Segment.java b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segment.java
new file mode 100644
index 00000000000..1e424372a66
--- /dev/null
+++ b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segment.java
@@ -0,0 +1,42 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.fsa.segmenter;
+
+/**
+ * Class encapsulation of a segment.
+ *
+ * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a>
+ */
+public class Segment {
+
+ int _beg;
+ int _end;
+ int _conn;
+
+ public Segment(int b, int e, int c)
+ {
+ _beg = b;
+ _end = e;
+ _conn = c;
+ }
+
+ public int beg()
+ {
+ return _beg;
+ }
+
+ public int end()
+ {
+ return _end;
+ }
+
+ public int len()
+ {
+ return _end-_beg;
+ }
+
+ public int conn()
+ {
+ return _conn;
+ }
+
+}
diff --git a/fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java
new file mode 100644
index 00000000000..80ccd791644
--- /dev/null
+++ b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java
@@ -0,0 +1,137 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.fsa.segmenter;
+
+import java.util.LinkedList;
+import java.util.ListIterator;
+
+import com.yahoo.fsa.FSA;
+
+/**
+ * API for accessing the Segmenter automata.
+ *
+ * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a>
+ */
+public class Segmenter {
+
+ private FSA _fsa;
+
+ public Segmenter(FSA fsa) {
+ _fsa = fsa;
+ }
+
+ public Segmenter(String filename) {
+ _fsa = new FSA(filename,"utf-8");
+ }
+
+ public Segmenter(String filename, String charsetname) {
+ _fsa = new FSA(filename,charsetname);
+ }
+
+ public boolean isOk()
+ {
+ return _fsa.isOk();
+ }
+
+ public Segments segment(String input)
+ {
+ String[] tokens = input.split("\\s");
+ return segment(tokens);
+ }
+
+ private class Detector {
+ FSA.State _state;
+ int _index;
+
+ public Detector(FSA.State s, int i)
+ {
+ _state = s;
+ _index = i;
+ }
+
+ public FSA.State state()
+ {
+ return _state;
+ }
+
+ public int index()
+ {
+ return _index;
+ }
+ }
+
+ public Segments segment(String[] tokens)
+ {
+ Segments segments = new Segments(tokens);
+ LinkedList detectors = new LinkedList();
+
+ int i=0;
+
+
+ while(i<tokens.length){
+ detectors.add(new Detector(_fsa.getState(),i));
+
+ ListIterator det_it = detectors.listIterator();
+ while(det_it.hasNext()){
+ Detector d = (Detector)det_it.next();
+ d.state().deltaWord(tokens[i]);
+ if(d.state().isFinal()){
+ segments.add(new Segment(d.index(),i+1,d.state().data().getInt(0)));
+ }
+
+ if(!d.state().isValid()){
+ det_it.remove();
+ }
+ }
+ i++;
+ }
+
+ return segments;
+ }
+
+ //// test ////
+ public static void main(String[] args) {
+ String fsafile = "/home/gv/fsa/automata/segments.fsa";
+
+ Segmenter segmenter = new Segmenter(fsafile);
+
+ System.out.println("Loading segmenter FSA file "+fsafile+": "+segmenter.isOk());
+
+ for(int a=0;a<1||a<args.length;a++){
+
+ String query;
+ if(a==args.length){
+ query = "times square head";
+ }
+ else {
+ query = args[a];
+ }
+ System.out.println("processing query \""+query+"\"");
+
+ Segments segments = segmenter.segment(query);
+ System.out.println("all segments:");
+ for(int i=0; i<segments.size();i++){
+ System.out.println(" "+i+": \""+segments.sgm(i)+"\","+segments.conn(i));
+ }
+
+ Segments best;
+
+ best = segments.segmentation(Segments.SEGMENTATION_WEIGHTED);
+ System.out.print("best segments (weighted): ");
+ for(int i=0; i<best.size();i++){
+ System.out.print("("+best.sgm(i)+")");
+ }
+ System.out.println();
+
+ best = segments.segmentation(Segments.SEGMENTATION_RIGHTMOST_LONGEST);
+ System.out.print("best segments (rightmost_longest):");
+ for(int i=0; i<best.size();i++){
+ System.out.print("("+best.sgm(i)+")");
+ }
+ System.out.println();
+
+ }
+
+ }
+
+}
+
diff --git a/fsa/src/main/java/com/yahoo/fsa/segmenter/Segments.java b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segments.java
new file mode 100644
index 00000000000..26752046f80
--- /dev/null
+++ b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segments.java
@@ -0,0 +1,313 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.fsa.segmenter;
+
+import java.util.LinkedList;
+
+/**
+ * Contains the segmentation() method.
+ *
+ * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a>
+ */
+public class Segments extends LinkedList {
+
+ public final static int SEGMENTATION_WEIGHTED = 0;
+ public final static int SEGMENTATION_WEIGHTED_BIAS10 = 1;
+ public final static int SEGMENTATION_WEIGHTED_BIAS20 = 2;
+ public final static int SEGMENTATION_WEIGHTED_BIAS50 = 3;
+ public final static int SEGMENTATION_WEIGHTED_BIAS100 = 4;
+ public final static int SEGMENTATION_WEIGHTED_LEFTMOST = 5;
+ public final static int SEGMENTATION_WEIGHTED_RIGHTMOST = 6;
+ public final static int SEGMENTATION_WEIGHTED_LONGEST = 7;
+ public final static int SEGMENTATION_LEFTMOST_LONGEST = 8;
+ public final static int SEGMENTATION_LEFTMOST_WEIGHTED = 9;
+ public final static int SEGMENTATION_RIGHTMOST_LONGEST = 10;
+ public final static int SEGMENTATION_RIGHTMOST_WEIGHTED = 11;
+ public final static int SEGMENTATION_LONGEST_WEIGHTED = 12;
+ public final static int SEGMENTATION_LONGEST_LEFTMOST = 13;
+ public final static int SEGMENTATION_LONGEST_RIGHTMOST = 14;
+ public final static int SEGMENTATION_METHODS = 15;
+
+ private String[] _tokens;
+ private int _size;
+ private int[][] _map;
+
+ public Segments(String[] tokens)
+ {
+ _tokens = tokens;
+ _size = tokens.length;
+ _map = new int[_size+1][_size+1];
+ for(int i=0; i<=_size; i++){
+ for(int j=0; j<=_size; j++){
+ _map[i][j]=-1;
+ }
+ }
+ }
+
+ public void add(Segment s)
+ {
+ super.add(s);
+ _map[s.beg()][s.end()]=super.size()-1;
+ }
+
+ private void addMissingSingles()
+ {
+ for(int i=0; i<_size; i++){
+ if(_map[i][i+1]==-1){
+ super.add(new Segment(i,i+1,0));
+ _map[i][i+1]=super.size()-1;
+ }
+ }
+ }
+
+ private void reMap()
+ {
+ for(int i=0; i<=_size; i++){
+ for(int j=0; j<=_size; j++){
+ _map[i][j]=-1;
+ }
+ }
+ for(int i=0; i<super.size(); i++){
+ _map[beg(i)][end(i)] = i;
+ }
+ }
+
+ public String sgm(int idx)
+ {
+ if(idx<0 || idx>=super.size()){
+ return null;
+ }
+ String s = new String(_tokens[((Segment)(super.get(idx))).beg()]);
+ for(int i=((Segment)(super.get(idx))).beg()+1;i<((Segment)(super.get(idx))).end();i++){
+ s += " " + _tokens[i];
+ }
+ return s;
+ }
+
+ public int beg(int idx)
+ {
+ if(idx<0 || idx>=super.size()){
+ return -1;
+ }
+ return ((Segment)(super.get(idx))).beg();
+ }
+
+ public int end(int idx)
+ {
+ if(idx<0 || idx>=super.size()){
+ return -1;
+ }
+ return ((Segment)(super.get(idx))).end();
+ }
+
+ public int len(int idx)
+ {
+ if(idx<0 || idx>=super.size()){
+ return -1;
+ }
+ return ((Segment)(super.get(idx))).len();
+ }
+
+ public int conn(int idx)
+ {
+ if(idx<0 || idx>=super.size()){
+ return -1;
+ }
+ return ((Segment)(super.get(idx))).conn();
+ }
+
+ public Segments segmentation(int method)
+ {
+ Segments smnt = new Segments(_tokens);
+
+ addMissingSingles();
+
+ int maxsc, id, bestid=-1, bias=0, c, pos, bestval, temp=0, next=-1;
+ int[] maxScore = new int[super.size()];
+ int[] nextid = new int[super.size()];
+ for(int i=0;i<nextid.length;i++){
+ nextid[i]=-1;
+ }
+
+ switch(method){
+ case SEGMENTATION_WEIGHTED_BIAS100:
+ bias+=50;
+ case SEGMENTATION_WEIGHTED_BIAS50:
+ bias+=30;
+ case SEGMENTATION_WEIGHTED_BIAS20:
+ bias+=10;
+ case SEGMENTATION_WEIGHTED_BIAS10:
+ bias+=10;
+ case SEGMENTATION_WEIGHTED:
+ bestid=-1;
+ for(int i=_tokens.length;i>=0;i--){
+ bestid=-1;maxsc=0;
+ for(int j=i+1;j<=_tokens.length;j++){
+ id=_map[i][j];
+ if(id>=0 && maxScore[id]+1>maxsc) {
+ bestid=id;
+ maxsc=maxScore[id]+1;
+ }
+ }
+ if(maxsc>0){
+ maxsc--;
+ }
+ for(int j=0;j<i;j++){
+ id=_map[j][i];
+ if(id>=0){
+ nextid[id] = bestid;
+ c = conn(id);
+ if(i-j<=1){
+ maxScore[id] = maxsc;
+ }
+ else if(bias>0){
+ maxScore[id] = maxsc + ((100+(i-j-2)*bias)*c)/100;
+ }
+ else{
+ maxScore[id] = maxsc + c;
+ }
+ }
+ }
+ }
+ id = bestid;
+ while(id!=-1){
+ smnt.add(((Segment)(super.get(id))));
+ id=nextid[id];
+ }
+ break;
+ case SEGMENTATION_LEFTMOST_LONGEST:
+ case SEGMENTATION_LEFTMOST_WEIGHTED:
+ pos = 0;
+ while(pos<_tokens.length){
+ bestid = -1; bestval = -1;
+ for(int i=pos+1;i<=_tokens.length;i++){
+ id = _map[pos][i];
+ if(id>=0 &&
+ (method==SEGMENTATION_LEFTMOST_LONGEST ||
+ (temp=(len(id)>1)? conn(id) :0)>bestval) ){
+ bestid = id;
+ bestval = temp;
+ next = i;
+ }
+ }
+ smnt.add((Segment)(super.get(bestid)));
+ pos=next;
+ }
+ break;
+ case SEGMENTATION_RIGHTMOST_LONGEST:
+ case SEGMENTATION_RIGHTMOST_WEIGHTED:
+ pos = _tokens.length;
+ while(pos>0){
+ bestid = -1; bestval = -1;
+ for(int i=pos-1;i>=0;i--){
+ id = _map[i][pos];
+ if(id>=0 &&
+ (method==SEGMENTATION_RIGHTMOST_LONGEST ||
+ (temp=(len(id)>1)? conn(id) :0)>bestval) ){
+ bestid = id;
+ bestval = temp;
+ next = i;
+ }
+ }
+ smnt.addFirst(super.get(bestid));
+ pos=next;
+ }
+ smnt.reMap();
+ break;
+ case SEGMENTATION_LONGEST_WEIGHTED:
+ case SEGMENTATION_LONGEST_LEFTMOST:
+ case SEGMENTATION_LONGEST_RIGHTMOST:
+ case SEGMENTATION_WEIGHTED_LONGEST:
+ case SEGMENTATION_WEIGHTED_LEFTMOST:
+ case SEGMENTATION_WEIGHTED_RIGHTMOST:
+ buildSegmentationRecursive(method,smnt,0,_tokens.length);
+ break;
+ }
+
+ return smnt;
+ }
+
+ private void buildSegmentationRecursive(int method, Segments smnt, int b, int e)
+ {
+ int bestid, bestval1, bestval2, temp;
+
+ bestid=-1;bestval1=-1;bestval2=-1;
+ for(int i=0;i<super.size();i++){
+ if(b<=beg(i) && e>=end(i)){
+ switch(method){
+ case SEGMENTATION_LONGEST_WEIGHTED:
+ if(len(i)>bestval1 ||
+ (len(i)==bestval1 && conn(i)>bestval2) ){
+ bestid=i;
+ bestval1=len(i);
+ bestval2=conn(i);
+ }
+ break;
+ case SEGMENTATION_LONGEST_LEFTMOST:
+ if(len(i)>bestval1 ||
+ (len(i)==bestval1 && beg(i)<bestval2) ){
+ bestid=i;
+ bestval1=len(i);
+ bestval2=beg(i);
+ }
+ break;
+ case SEGMENTATION_LONGEST_RIGHTMOST:
+ if(len(i)>bestval1 ||
+ (len(i)==bestval1 && end(i)>bestval2) ){
+ bestid=i;
+ bestval1=len(i);
+ bestval2=end(i);
+ }
+ break;
+ case SEGMENTATION_WEIGHTED_LONGEST:
+ temp = (len(i)>1)?conn(i):0;
+ if(temp>bestval1 ||
+ (temp==bestval1 && len(i)>bestval2) ){
+ bestid=i;
+ bestval1=temp;
+ bestval2=len(i);
+ }
+ break;
+ case SEGMENTATION_WEIGHTED_LEFTMOST:
+ temp = (len(i)>1)? conn(i) :0;
+ if(temp>bestval1 ||
+ (temp==bestval1 && beg(i)<bestval2) ){
+ bestid=i;
+ bestval1=temp;
+ bestval2=beg(i);
+ }
+ break;
+ case SEGMENTATION_WEIGHTED_RIGHTMOST:
+ temp = len(i)>1?conn(i):0;
+ if(temp>bestval1 ||
+ (temp==bestval1 && end(i)>bestval2) ){
+ bestid=i;
+ bestval1=temp;
+ bestval2=end(i);
+ }
+ break;
+ default: // dummy defult pick first possible
+ if(bestid<0){
+ bestid=i;
+ }
+ break;
+ }
+ }
+ }
+ if(bestid<0) {
+ return; // this should never happen, as all one-word segments are created
+ }
+
+ if(b<beg(bestid)){
+ buildSegmentationRecursive(method,smnt,b,beg(bestid));
+ }
+
+ // add segment
+ smnt.add((Segment)(super.get(bestid)));
+
+ // check right side
+ if(e>end(bestid)){
+ buildSegmentationRecursive(method,smnt,end(bestid),e);
+ }
+ }
+
+}
diff --git a/fsa/src/main/java/com/yahoo/fsa/topicpredictor/PredictedTopic.java b/fsa/src/main/java/com/yahoo/fsa/topicpredictor/PredictedTopic.java
new file mode 100644
index 00000000000..2dd0dcc9bb2
--- /dev/null
+++ b/fsa/src/main/java/com/yahoo/fsa/topicpredictor/PredictedTopic.java
@@ -0,0 +1,65 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.fsa.topicpredictor;
+
+
+/**
+ * Class encapsulation of a predicted topic. A topic has a weight and
+ * a term vector string of topicSegments.
+ *
+ * @author gjoranv
+ **/
+public class PredictedTopic {
+
+ private String topic = "";
+ private double weight = 0.0;
+ private String vector = "";
+
+
+ public PredictedTopic(String topic, double weight, String vector){
+ this.topic = topic;
+ this.weight = weight;
+ this.vector = vector;
+ }
+
+ public PredictedTopic(String topic, double weight){
+ this(topic, weight, "");
+ }
+
+
+ /** Returns the topic */
+ public String getTopic() { return topic; }
+
+ /** Returns the weight */
+ public double getWeight() { return weight; }
+
+ /** Returns the vector*/
+ public String getVector() { return vector; }
+
+
+ /** Sets the weight */
+ public void setWeight(double weight) {
+ this.weight = weight;
+ }
+
+ /** Adds to the weight */
+ public void addWeight(double weight) {
+ this.weight += weight;
+ }
+
+ /** Sets the vector*/
+ public void setVector(String vector) {
+ this.vector = vector;
+ }
+
+ /** Compares this topic to another topic, according to weight descending */
+ public int compareDescendWeight(Object o) {
+ PredictedTopic pt = (PredictedTopic)o;
+
+ double wgt1 = getWeight();
+ double wgt2 = pt.getWeight();
+ if (wgt1 < wgt2) { return 1; }
+ if (wgt1 > wgt2) { return -1;}
+ return 0;
+ }
+
+}
diff --git a/fsa/src/main/java/com/yahoo/fsa/topicpredictor/TopicPredictor.java b/fsa/src/main/java/com/yahoo/fsa/topicpredictor/TopicPredictor.java
new file mode 100644
index 00000000000..177e879c6c8
--- /dev/null
+++ b/fsa/src/main/java/com/yahoo/fsa/topicpredictor/TopicPredictor.java
@@ -0,0 +1,180 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.fsa.topicpredictor;
+
+import java.util.logging.Logger;
+import java.util.List;
+import java.util.LinkedList;
+import java.util.Iterator;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.ByteOrder;
+import java.nio.MappedByteBuffer;
+import java.nio.channels.FileChannel;
+import java.nio.channels.FileChannel.MapMode;
+import java.nio.charset.Charset;
+
+import com.yahoo.fsa.FSA;
+import com.yahoo.fsa.MetaData;
+
+
+/**
+ * Class for accessing the topic prediction automata. Look up the
+ * predicted topics for a term. Each topic has an attached weight and
+ * a term vector (topicSegments).
+ *
+ * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a>
+ **/
+public class TopicPredictor extends MetaData {
+
+ private static final String packageName = "com.yahoo.fsa.topicpredictor";
+
+ private FSA fsa = null;
+
+ public TopicPredictor(String fsafile, String datfile){
+ this(fsafile, datfile, "utf-8");
+ }
+
+ public TopicPredictor(String fsafile, String datfile,
+ String charsetname) {
+ super(datfile, charsetname);
+ if (!isOk()) {
+ Logger.getLogger(packageName).
+ warning("Error initializing predictor with file " + datfile);
+ }
+
+ // Init the segment->'topic index' FSA
+ fsa = new FSA(fsafile);
+ if (!fsa.isOk()) {
+ Logger.getLogger(packageName).
+ warning("Error initializing FSA with file " + fsafile);
+ }
+ }
+
+ /**
+ * Returns a list of PredictedTopic objects, one for each topic
+ * the segment maps to. The returned list contains all topics,
+ * as opposed to the two-argument version.
+ * @param segment The segment string to find (all) topics for.
+ * @return (Linked)List of PredictedTopic objects. */
+ public List getPredictedTopics(String segment) {
+ return getPredictedTopics(segment, 0);
+ }
+
+ /**
+ * Returns a list of PredictedTopic objects, one for each topic
+ * the segment maps to. The returned list length is cut off at
+ * 'maxTopics' entries, maxTopics=0 returns all topics.
+ * @param segment The segment string to find topics for.
+ * @param maxTopics The max number of topics to return, 0 for all topics
+ * @return (Linked)List of PredictedTopic objects. */
+ public List getPredictedTopics(String segment, int maxTopics) {
+ List predictedTopics = new LinkedList();
+
+ int segIdx = getSegmentIndex(segment);
+ int[][] topicArr = getTopicArray(segIdx, maxTopics);
+ int numTopics = topicArr.length;
+ int allTopics = getNumTopics(segIdx);
+ /*Logger.getLogger(packageName).
+ fine("Segment: '" + segment + "' has " + allTopics +
+ " topics in automaton, fetched " + numTopics);
+ */
+ for(int i=0; i < numTopics; i++) {
+ int weight = topicArr[i][1];
+ String[] topicInfo= getTopicInfo(topicArr[i][0]);
+ String topic = topicInfo[0];
+ String vector= topicInfo[1];
+ PredictedTopic pt =
+ new PredictedTopic(topic, (double)weight, vector);
+ predictedTopics.add(pt);
+ }
+
+ return predictedTopics;
+ }
+
+ /**
+ * Returns the index (hash value) of the input segment in the FSA.
+ * @param segment The segment string to find index for.
+ * @return Index for this segment in the FSA. */
+ private int getSegmentIndex(String segment) {
+ FSA.State s = fsa.getState();
+ s.delta(segment);
+ if (s.isFinal()) {
+ return s.hash();
+ }
+ return -1;
+ }
+
+ /**
+ * Returns the number of topics the FSA contains for the input
+ * segment.
+ * @return Number of topics for the segment. */
+ private int getNumTopics(int segIdx) {
+ if (segIdx < 0) {
+ return 0;
+ }
+ ByteBuffer buf = getIndirectRecordEntry(segIdx, 4);
+ return buf.getInt(0);
+ }
+
+ /**
+ * Reads the topics and other metadata for a segment from the
+ * (memory-mapped) metadata file. Returns the info in a
+ * two-dimensional array (one row per topic).
+ * @param segIdx The FSA index (hash value) for the segment.
+ * @param maxTopics Max number of topics to return, 0 for all topics.
+ * @return Number of topics for the segment. */
+ private int[][] getTopicArray(int segIdx, int maxTopics) {
+ if (segIdx < 0) {
+ return new int[0][0];
+ }
+
+ int numTopics = getNumTopics(segIdx);
+ if ((maxTopics > 0) && (numTopics > maxTopics)) {
+ numTopics = maxTopics;
+ }
+
+ int[][] topics = new int[numTopics][2];
+ ByteBuffer buf = getIndirectRecordEntry(segIdx,4+8*numTopics);
+ for(int i=0; i<numTopics; i++){
+ topics[i][0] = buf.getInt(4+8*i);
+ topics[i][1] = buf.getInt(8+8*i);
+ }
+ return topics;
+ }
+
+ /**
+ * Returns the topic and vector strings from the internal meta
+ * data structure.
+ * @param topicId Topic start index in a two-dimensional array
+ * @return topic string at [0] and vector string at [1] */
+ private String[] getTopicInfo(int topicId) {
+ return getStringArrayEntry(user(0) + topicId, 2);
+ }
+
+
+ //// test ////
+ public static void main(String[] args) {
+ String segment = "new york";
+ if (args.length >= 1) {
+ segment = args[0];
+ }
+
+ String fsafile = "/home/gv/fsa/automata/dmozPred_2.fsa";
+ String datfile = "/home/gv/fsa/automata/dmozPred_2.dat";
+
+ TopicPredictor predictor = new TopicPredictor(fsafile, datfile);
+
+ List predictedTopics = predictor.getPredictedTopics(segment, 25);
+ Iterator i = predictedTopics.iterator();
+ while (i.hasNext()) {
+ PredictedTopic topic = (PredictedTopic) i.next();
+ System.out.println("\n topic=" + topic.getTopic());
+ System.out.println(" weight=" + topic.getWeight());
+ System.out.println(" vector=" + topic.getVector());
+ }
+ }
+
+}