orig) {
assertTokenize(input, Language.ENGLISH, StemMode.NONE, false, indexed, orig);
}
/**
* Compare the results of running an input string through the tokenizer with an "index" truth, and an optional
* "orig" truth.
*
* @param input The text to process, passed to tokenizer.
* @param language The language tag, passed to tokenizer.
* @param stemMode Passed to the tokenizer.
* @param accentDrop Passed to the tokenizer.
* @param indexed Compared, in order, to the lower-cased "TokenString" of each indexable token; for a token with
* components, its indexable components are compared instead.
* @param orig Compared, in order, to the "Orig" result of each token from the tokenizer; if null, this check is skipped.
*/
private void assertTokenize(String input, Language language, StemMode stemMode, boolean accentDrop,
                            List indexed, List orig) {
    int i = 0; // position of the next expected entry in "indexed"
    int j = 0; // position of the next expected entry in "orig"
    for (Token token : tokenizer.tokenize(input, language, stemMode, accentDrop)) {
        if (token.getNumComponents() > 0) {
            // A token with components is checked through its direct components, not through itself.
            for (int comp = 0; comp < token.getNumComponents(); comp++) {
                Token t = token.getComponent(comp);
                if (t.getType().isIndexable()) {
                    assertElementAt("comp index: " + i, indexed, i++, toLowerCase(t.getTokenString()));
                }
            }
        } else if (token.getType().isIndexable()) {
            assertElementAt("exp index: " + i, indexed, i++, toLowerCase(token.getTokenString()));
        }
        // "orig" is optional and is matched once per top-level token, indexable or not.
        if (orig != null) {
            assertElementAt("orig index: " + j, orig, j++, token.getOrig());
        }
    }
    // Catches the opposite mismatch: fewer values produced than expected.
    assertEquals("indexed length", indexed.size(), i);
    if (orig != null) {
        assertEquals("orig length", orig.size(), j);
    }
}

/**
 * Asserts that {@code actual} equals {@code expected.get(index)}. If {@code index} is past the end of
 * {@code expected}, fails with an assertion error that names the surplus value instead of letting
 * {@code List.get} throw a context-free {@code IndexOutOfBoundsException}.
 *
 * @param message  Prefix used for the failure message.
 * @param expected The list of expected values.
 * @param index    The position in {@code expected} to compare against.
 * @param actual   The value produced by the tokenizer.
 */
private void assertElementAt(String message, List<?> expected, int index, Object actual) {
    if (index >= expected.size()) {
        throw new AssertionError(message + ": unexpected extra value '" + actual + "', only "
                                 + expected.size() + " expected");
    }
    assertEquals(message, expected.get(index), actual);
}
}