From 589923e168d45c76a230227c1a9dfebd2f5b6990 Mon Sep 17 00:00:00 2001 From: Arne Juul Date: Tue, 13 Jul 2021 11:45:59 +0000 Subject: try to trap spurious failure * we have seen spurious failures when verifying output from accent dropping; so far nothing reproducible, so add some extra logging and retry once if it happens (in case it's some kind of race-condition glitch). --- .../expressions/NormalizeExpression.java | 30 +++++++++++++++++++ .../expressions/NormalizeTestCase.java | 35 ++++++++++++++++++++++ 2 files changed, 65 insertions(+) diff --git a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NormalizeExpression.java b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NormalizeExpression.java index 9c6e698a154..78a7a858aba 100644 --- a/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NormalizeExpression.java +++ b/indexinglanguage/src/main/java/com/yahoo/vespa/indexinglanguage/expressions/NormalizeExpression.java @@ -6,12 +6,16 @@ import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.language.Linguistics; import com.yahoo.language.process.Transformer; +import java.util.logging.Logger; +import java.util.logging.Level; + /** * @author Simon Thoresen Hult */ public final class NormalizeExpression extends Expression { private final Linguistics linguistics; + private static final Logger logger = Logger.getLogger(NormalizeExpression.class.getName()); public NormalizeExpression(Linguistics linguistics) { super(DataType.STRING); @@ -22,9 +26,35 @@ public final class NormalizeExpression extends Expression { return linguistics; } + + private static String escape(String str) { + StringBuilder buf = new StringBuilder(); + for (char c : str.toCharArray()) { + if (c >= ' ') { + buf.append(c); + } else { + buf.append(String.format("U+%04X", (int)c)); + } + } + return buf.toString(); + } + @Override protected void doExecute(ExecutionContext context) { Transformer transformer = linguistics.getTransformer(); + var orig = String.valueOf(context.getValue()); + var lang = context.resolveLanguage(linguistics); + var transformed = transformer.accentDrop(orig, lang); + try { + context.setValue(new StringFieldValue(transformed)); + return; + } catch (IllegalArgumentException ex) { + String msg = ("bad normalize, \n" + + "original: >>> " + escape(orig) + " <<<\n" + + " -> accentDrop(" + lang + ") -> \n" + + "transformed: >>> " + escape(transformed) + " <<<"); + logger.log(Level.SEVERE, msg); + } context.setValue(new StringFieldValue(transformer.accentDrop(String.valueOf(context.getValue()), context.resolveLanguage(linguistics)))); } diff --git a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NormalizeTestCase.java b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NormalizeTestCase.java index 8b4f1a8b344..3ad1b129b4d 100644 --- a/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NormalizeTestCase.java +++ b/indexinglanguage/src/test/java/com/yahoo/vespa/indexinglanguage/expressions/NormalizeTestCase.java @@ -6,8 +6,10 @@ import com.yahoo.document.datatypes.FieldValue; import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.language.Language; import com.yahoo.language.Linguistics; +import com.yahoo.language.process.Transformer; import com.yahoo.language.simple.SimpleLinguistics; import com.yahoo.vespa.indexinglanguage.SimpleTestAdapter; + import org.junit.Test; import org.mockito.Mockito; @@ -56,4 +58,37 @@ public class NormalizeTestCase { assertTrue(val instanceof StringFieldValue); assertEquals("beyonce", ((StringFieldValue)val).getString()); } + + class MyMockTransformer implements Transformer { + boolean first = true; + @Override + public String accentDrop(String input, Language language) { + if (first) { + first = false; + return input.replace(' ', '\u0008'); + } else { + return input.replace(' ', '/'); + } + } + } + + class MyMockLinguistics extends SimpleLinguistics { + private Transformer transformer = new MyMockTransformer(); + @Override + public Transformer getTransformer() { + return transformer; + } + } + + @Test + public void requireThatBadNormalizeRetries() { + ExecutionContext ctx = new ExecutionContext(new SimpleTestAdapter()); + ctx.setLanguage(Language.ENGLISH); + ctx.setValue(new StringFieldValue("bad norm")); + var linguistics = new MyMockLinguistics(); + new NormalizeExpression(linguistics).execute(ctx); + FieldValue val = ctx.getValue(); + assertTrue(val instanceof StringFieldValue); + assertEquals("bad/norm", ((StringFieldValue)val).getString()); + } } -- cgit v1.2.3