orig) {
assertTokenize(input, Language.ENGLISH, StemMode.NONE, false, indexed, orig);
}
/**
 * Runs an input string through the tokenizer and compares the result with an "indexed" truth and an optional
 * "orig" truth.
 *
 * <p>For a token that has components, each indexable component is compared with the next entry of
 * {@code indexed}; for a token without components, the token itself is compared if it is indexable.
 * Token strings are lower-cased before comparison. If {@code orig} is non-null, the original form of every
 * top-level token (indexable or not) is compared with the next entry of {@code orig}. Finally the number of
 * consumed entries must equal the size of each truth list.</p>
 *
 * @param input The text to process, passed to tokenizer.
 * @param language The language tag, passed to tokenizer.
 * @param stemMode Passed to the tokenizer.
 * @param accentDrop Passed to the tokenizer.
 * @param indexed Compared to the lower-cased "TokenString" results from the tokenizer.
 * @param orig Compared to the "Orig" results from the tokenizer; may be null to skip this comparison.
 */
private void assertTokenize(String input, Language language, StemMode stemMode, boolean accentDrop,
                            List indexed, List orig) {
    int i = 0;
    int j = 0;
    for (Token token : tokenizer.tokenize(input, language, stemMode, accentDrop)) {
        if (token.getNumComponents() > 0) {
            for (int comp = 0; comp < token.getNumComponents(); comp++) {
                Token t = token.getComponent(comp);
                if (t.getType().isIndexable()) {
                    String actual = toLowerCase(t.getTokenString());
                    assertWithinExpected("comp index: " + i, actual, i, indexed);
                    assertThat("comp index: " + i, actual, is(indexed.get(i++)));
                }
            }
        } else {
            if (token.getType().isIndexable()) {
                String actual = toLowerCase(token.getTokenString());
                assertWithinExpected("exp index: " + i, actual, i, indexed);
                assertThat("exp index: " + i, actual, is(indexed.get(i++)));
            }
        }
        if (orig != null) {
            assertWithinExpected("orig index: " + j, token.getOrig(), j, orig);
            assertThat("orig index: " + j, token.getOrig(), is(orig.get(j++)));
        }
    }
    assertThat("indexed length", i, is(indexed.size()));
    if (orig != null) {
        assertThat("orig length", j, is(orig.size()));
    }
}

/**
 * Fails with a descriptive assertion message, instead of letting the subsequent list lookup throw an
 * IndexOutOfBoundsException, when the tokenizer produced more values than the truth list contains.
 *
 * @param label Prefix identifying which comparison is being made.
 * @param actual The value produced by the tokenizer, included in the failure message.
 * @param index The position about to be read from {@code expected}.
 * @param expected The truth list being consumed.
 */
private void assertWithinExpected(String label, Object actual, int index, List<?> expected) {
    assertThat(label + ": unexpected extra value '" + actual + "', only " + expected.size() + " expected",
               index < expected.size(), is(true));
}
}