summaryrefslogtreecommitdiffstats
path: root/fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java
diff options
context:
space:
mode:
Diffstat (limited to 'fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java')
-rw-r--r--fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java137
1 files changed, 137 insertions, 0 deletions
diff --git a/fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java
new file mode 100644
index 00000000000..80ccd791644
--- /dev/null
+++ b/fsa/src/main/java/com/yahoo/fsa/segmenter/Segmenter.java
@@ -0,0 +1,137 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.fsa.segmenter;
+
+import java.util.LinkedList;
+import java.util.ListIterator;
+
+import com.yahoo.fsa.FSA;
+
+/**
+ * API for accessing the Segmenter automata.
+ *
+ * @author <a href="mailto:boros@yahoo-inc.com">Peter Boros</a>
+ */
+public class Segmenter {
+
+ private FSA _fsa;
+
+ public Segmenter(FSA fsa) {
+ _fsa = fsa;
+ }
+
+ public Segmenter(String filename) {
+ _fsa = new FSA(filename,"utf-8");
+ }
+
+ public Segmenter(String filename, String charsetname) {
+ _fsa = new FSA(filename,charsetname);
+ }
+
+ public boolean isOk()
+ {
+ return _fsa.isOk();
+ }
+
+ public Segments segment(String input)
+ {
+ String[] tokens = input.split("\\s");
+ return segment(tokens);
+ }
+
+ private class Detector {
+ FSA.State _state;
+ int _index;
+
+ public Detector(FSA.State s, int i)
+ {
+ _state = s;
+ _index = i;
+ }
+
+ public FSA.State state()
+ {
+ return _state;
+ }
+
+ public int index()
+ {
+ return _index;
+ }
+ }
+
+ public Segments segment(String[] tokens)
+ {
+ Segments segments = new Segments(tokens);
+ LinkedList detectors = new LinkedList();
+
+ int i=0;
+
+
+ while(i<tokens.length){
+ detectors.add(new Detector(_fsa.getState(),i));
+
+ ListIterator det_it = detectors.listIterator();
+ while(det_it.hasNext()){
+ Detector d = (Detector)det_it.next();
+ d.state().deltaWord(tokens[i]);
+ if(d.state().isFinal()){
+ segments.add(new Segment(d.index(),i+1,d.state().data().getInt(0)));
+ }
+
+ if(!d.state().isValid()){
+ det_it.remove();
+ }
+ }
+ i++;
+ }
+
+ return segments;
+ }
+
+ //// test ////
+ public static void main(String[] args) {
+ String fsafile = "/home/gv/fsa/automata/segments.fsa";
+
+ Segmenter segmenter = new Segmenter(fsafile);
+
+ System.out.println("Loading segmenter FSA file "+fsafile+": "+segmenter.isOk());
+
+ for(int a=0;a<1||a<args.length;a++){
+
+ String query;
+ if(a==args.length){
+ query = "times square head";
+ }
+ else {
+ query = args[a];
+ }
+ System.out.println("processing query \""+query+"\"");
+
+ Segments segments = segmenter.segment(query);
+ System.out.println("all segments:");
+ for(int i=0; i<segments.size();i++){
+ System.out.println(" "+i+": \""+segments.sgm(i)+"\","+segments.conn(i));
+ }
+
+ Segments best;
+
+ best = segments.segmentation(Segments.SEGMENTATION_WEIGHTED);
+ System.out.print("best segments (weighted): ");
+ for(int i=0; i<best.size();i++){
+ System.out.print("("+best.sgm(i)+")");
+ }
+ System.out.println();
+
+ best = segments.segmentation(Segments.SEGMENTATION_RIGHTMOST_LONGEST);
+ System.out.print("best segments (rightmost_longest):");
+ for(int i=0; i<best.size();i++){
+ System.out.print("("+best.sgm(i)+")");
+ }
+ System.out.println();
+
+ }
+
+ }
+
+}
+