diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2016-06-15 23:09:44 +0200 |
commit | 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch) | |
tree | 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /docprocs/src/main |
Publish
Diffstat (limited to 'docprocs/src/main')
4 files changed, 387 insertions, 0 deletions
diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/DocumentScript.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/DocumentScript.java new file mode 100644 index 00000000000..a367aec0cfb --- /dev/null +++ b/docprocs/src/main/java/com/yahoo/docprocs/indexing/DocumentScript.java @@ -0,0 +1,107 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.docprocs.indexing; + +import com.yahoo.document.Document; +import com.yahoo.document.DocumentUpdate; +import com.yahoo.document.Field; +import com.yahoo.document.annotation.SpanTrees; +import com.yahoo.document.datatypes.Array; +import com.yahoo.document.datatypes.FieldValue; +import com.yahoo.document.datatypes.MapFieldValue; +import com.yahoo.document.datatypes.StringFieldValue; +import com.yahoo.document.datatypes.Struct; +import com.yahoo.document.datatypes.StructuredFieldValue; +import com.yahoo.document.datatypes.WeightedSet; +import com.yahoo.document.fieldpathupdate.AssignFieldPathUpdate; +import com.yahoo.document.fieldpathupdate.FieldPathUpdate; +import com.yahoo.document.update.FieldUpdate; +import com.yahoo.document.update.MapValueUpdate; +import com.yahoo.document.update.ValueUpdate; +import com.yahoo.vespa.indexinglanguage.AdapterFactory; +import com.yahoo.vespa.indexinglanguage.expressions.Expression; + +import java.util.*; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a> + */ +public class DocumentScript { + + private final String documentType; + private final Set<String> inputFields; + private final Expression expression; + + public DocumentScript(String documentType, Collection<String> inputFields, Expression expression) { + this.documentType = documentType; + this.inputFields = new HashSet<>(inputFields); + this.expression = expression; + } + + public Expression getExpression() { return expression; } + public Document execute(AdapterFactory adapterFactory, Document document) { + for (Iterator<Map.Entry<Field, FieldValue>> it = document.iterator(); it.hasNext(); ) { + Map.Entry<Field, FieldValue> entry = it.next(); + requireThatFieldIsDeclaredInDocument(entry.getKey()); + removeAnyLinguisticsSpanTree(entry.getValue()); + } + return expression.execute(adapterFactory, document); + } + + public DocumentUpdate execute(AdapterFactory adapterFactory, DocumentUpdate update) { + for (FieldUpdate fieldUpdate : update.getFieldUpdates()) { + requireThatFieldIsDeclaredInDocument(fieldUpdate.getField()); + for (ValueUpdate<?> valueUpdate : fieldUpdate.getValueUpdates()) { + removeAnyLinguisticsSpanTree(valueUpdate); + } + } + for (FieldPathUpdate fieldUpdate : update.getFieldPathUpdates()) { + requireThatFieldIsDeclaredInDocument(fieldUpdate.getFieldPath().get(0).getFieldRef()); + if (fieldUpdate instanceof AssignFieldPathUpdate) { + removeAnyLinguisticsSpanTree(((AssignFieldPathUpdate)fieldUpdate).getFieldValue()); + } + } + return Expression.execute(expression, adapterFactory, update); + } + + private void requireThatFieldIsDeclaredInDocument(Field field) { + if (field != null && !inputFields.contains(field.getName())) { + throw new IllegalArgumentException("Field '" + field.getName() + "' is not part of the declared document " + + "type '" + documentType + "'."); + } + } + + private void removeAnyLinguisticsSpanTree(ValueUpdate<?> valueUpdate) { + if (valueUpdate instanceof MapValueUpdate) { + removeAnyLinguisticsSpanTree(((MapValueUpdate)valueUpdate).getUpdate()); + } else { + removeAnyLinguisticsSpanTree(valueUpdate.getValue()); + } + } + + private void removeAnyLinguisticsSpanTree(FieldValue value) { + if (value instanceof StringFieldValue) { + ((StringFieldValue)value).removeSpanTree(SpanTrees.LINGUISTICS); + } else if (value instanceof Array) { + Array<?> arr = (Array)value; + for (Object obj : arr.getValues()) { + removeAnyLinguisticsSpanTree((FieldValue)obj); + } + } else if (value instanceof WeightedSet) { + WeightedSet<?> wset = (WeightedSet)value; + for (Object obj : wset.keySet()) { + removeAnyLinguisticsSpanTree((FieldValue)obj); + } + } else if (value instanceof MapFieldValue) { + MapFieldValue<?,?> map = (MapFieldValue)value; + for (Map.Entry<?,?> entry : map.entrySet()) { + removeAnyLinguisticsSpanTree((FieldValue)entry.getKey()); + removeAnyLinguisticsSpanTree((FieldValue)entry.getValue()); + } + } else if (value instanceof StructuredFieldValue) { + StructuredFieldValue struct = (StructuredFieldValue)value; + for (Iterator<Map.Entry<Field, FieldValue>> it = struct.iterator(); it.hasNext();) { + removeAnyLinguisticsSpanTree(it.next().getValue()); + } + } + } +} diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/FastLogger.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/FastLogger.java new file mode 100644 index 00000000000..e990c9fb894 --- /dev/null +++ b/docprocs/src/main/java/com/yahoo/docprocs/indexing/FastLogger.java @@ -0,0 +1,32 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.docprocs.indexing; + +import java.util.logging.Level; +import java.util.logging.Logger; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a> + */ +class FastLogger { + + private final Logger log; + + private FastLogger(Logger log) { + this.log = log; + } + + public void log(Level level, String format, Object... args) { + if (!log.isLoggable(level)) { + return; + } + if (args.length > 0) { + log.log(level, String.format(format, args)); + } else { + log.log(level, format); + } + } + + public static FastLogger getLogger(String name) { + return new FastLogger(Logger.getLogger(name)); + } +} diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java new file mode 100644 index 00000000000..c6fe7b301e1 --- /dev/null +++ b/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java @@ -0,0 +1,120 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.docprocs.indexing; + +import java.util.ArrayList; +import java.util.List; +import com.google.inject.Inject; +import com.yahoo.component.chain.dependencies.After; +import com.yahoo.component.chain.dependencies.Before; +import com.yahoo.component.chain.dependencies.Provides; +import com.yahoo.docproc.DocumentProcessor; +import com.yahoo.docproc.Processing; +import com.yahoo.document.*; +import com.yahoo.document.config.DocumentmanagerConfig; +import com.yahoo.language.Linguistics; +import com.yahoo.log.LogLevel; +import com.yahoo.vespa.configdefinition.IlscriptsConfig; +import com.yahoo.vespa.indexinglanguage.AdapterFactory; +import com.yahoo.vespa.indexinglanguage.SimpleAdapterFactory; +import com.yahoo.vespa.indexinglanguage.expressions.Expression; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a> + */ +@Provides({ IndexingProcessor.PROVIDED_NAME }) +@Before({ IndexingProcessor.INDEXING_END }) +@After({ IndexingProcessor.INDEXING_START, "*" }) +public class IndexingProcessor extends DocumentProcessor { + + public final static String PROVIDED_NAME = "indexedDocument"; + public final static String INDEXING_START = "indexingStart"; + public final static String INDEXING_END = "indexingEnd"; + + private final static FastLogger log = FastLogger.getLogger(IndexingProcessor.class.getName()); + private final DocumentTypeManager docTypeMgr; + private final ScriptManager scriptMgr; + private final AdapterFactory adapterFactory; + + private class ExpressionSelector extends SimpleAdapterFactory.SelectExpression { + @Override + public Expression selectExpression(DocumentType documentType, String fieldName) { + return scriptMgr.getScript(documentType, fieldName).getExpression(); + } + } + + @Inject + public IndexingProcessor(DocumentmanagerConfig documentmanagerConfig, + IlscriptsConfig ilscriptsConfig, + Linguistics linguistics) { + docTypeMgr = DocumentTypeManagerConfigurer.configureNewManager(documentmanagerConfig); + scriptMgr = new ScriptManager(docTypeMgr, ilscriptsConfig, linguistics); + adapterFactory = new SimpleAdapterFactory(new ExpressionSelector()); + } + + @Override + public Progress process(Processing proc) { + if (proc.getDocumentOperations().isEmpty()) { + return Progress.DONE; + } + List<DocumentOperation> out = new ArrayList<>(proc.getDocumentOperations().size()); + for (DocumentOperation documentOperation : proc.getDocumentOperations()) { + if (documentOperation instanceof DocumentPut) { + processDocument((DocumentPut)documentOperation, out); + } else if (documentOperation instanceof DocumentUpdate) { + processUpdate((DocumentUpdate)documentOperation, out); + } else if (documentOperation instanceof DocumentRemove) { + processRemove((DocumentRemove)documentOperation, out); + } else if (documentOperation != null) { + throw new IllegalArgumentException("Document class " + documentOperation.getClass().getName() + " not supported."); + } else { + throw new IllegalArgumentException("Expected document, got null."); + } + } + proc.getDocumentOperations().clear(); + proc.getDocumentOperations().addAll(out); + return Progress.DONE; + } + + DocumentTypeManager getDocumentTypeManager() { + return docTypeMgr; + } + + private void processDocument(DocumentPut prev, List<DocumentOperation> out) { + DocumentScript script = scriptMgr.getScript(prev.getDocument().getDataType()); + if (script == null) { + log.log(LogLevel.DEBUG, "No indexing script for document '%s'.", prev.getId()); + out.add(prev); + return; + } + log.log(LogLevel.DEBUG, "Processing document '%s'.", prev.getId()); + Document next = script.execute(adapterFactory, prev.getDocument()); + if (next == null) { + log.log(LogLevel.DEBUG, "Document '" + prev.getId() + "' produced no output."); + return; + } + + out.add(new DocumentPut(prev, next)); + } + + private void processUpdate(DocumentUpdate prev, List<DocumentOperation> out) { + DocumentScript script = scriptMgr.getScript(prev.getType()); + if (script == null) { + log.log(LogLevel.DEBUG, "No indexing script for update '%s'.", prev.getId()); + out.add(prev); + return; + } + log.log(LogLevel.DEBUG, "Processing update '%s'.", prev.getId()); + DocumentUpdate next = script.execute(adapterFactory, prev); + if (next == null) { + log.log(LogLevel.DEBUG, "Update '" + prev.getId() + "' produced no output."); + return; + } + next.setCondition(prev.getCondition()); + out.add(next); + } + + private void processRemove(DocumentRemove prev, List<DocumentOperation> out) { + log.log(LogLevel.DEBUG, "Not processing remove '%s'.", prev.getId()); + out.add(prev); + } +} diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java new file mode 100644 index 00000000000..14bf5a0edf8 --- /dev/null +++ b/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java @@ -0,0 +1,128 @@ +// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.docprocs.indexing; + +import com.yahoo.document.DocumentType; +import com.yahoo.document.DocumentTypeManager; +import com.yahoo.language.Linguistics; +import com.yahoo.log.LogLevel; +import com.yahoo.vespa.configdefinition.IlscriptsConfig; +import com.yahoo.vespa.indexinglanguage.ScriptParserContext; +import com.yahoo.vespa.indexinglanguage.expressions.InputExpression; +import com.yahoo.vespa.indexinglanguage.expressions.ScriptExpression; +import com.yahoo.vespa.indexinglanguage.expressions.StatementExpression; +import com.yahoo.vespa.indexinglanguage.parser.IndexingInput; +import com.yahoo.vespa.indexinglanguage.parser.ParseException; + +import java.util.*; +import java.util.logging.Level; + +/** + * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a> + */ +public class ScriptManager { + + private static final FastLogger log = FastLogger.getLogger(ScriptManager.class.getName()); + private static final String FULL = "[all]"; + private final Map<String, Map<String, DocumentScript>> documentFieldScripts; + private final DocumentTypeManager docTypeMgr; + + public ScriptManager(DocumentTypeManager docTypeMgr, IlscriptsConfig config, Linguistics linguistics) { + this.docTypeMgr = docTypeMgr; + documentFieldScripts = createScriptsMap(docTypeMgr, config, linguistics); + } + + + private Map<String, DocumentScript> getScripts(DocumentType inputType) { + Map<String, DocumentScript> scripts = documentFieldScripts.get(inputType.getName()); + if (scripts != null) { + log.log(LogLevel.DEBUG, "Using script for type '%s'.", inputType.getName()); + return scripts; + } + for (Map.Entry<String, Map<String, DocumentScript>> entry : documentFieldScripts.entrySet()) { + if (inputType.inherits(docTypeMgr.getDocumentType(entry.getKey()))) { + log.log(LogLevel.DEBUG, "Using script of super-type '%s'.", entry.getKey()); + return entry.getValue(); + } + } + for (Map.Entry<String, Map<String, DocumentScript>> entry : documentFieldScripts.entrySet()) { + if (docTypeMgr.getDocumentType(entry.getKey()).inherits(inputType)) { + log.log(LogLevel.DEBUG, "Using script of sub-type '%s'.", entry.getKey()); + return entry.getValue(); + } + } + log.log(LogLevel.DEBUG, "No script for type '%s'.", inputType.getName()); + return null; + } + + public DocumentScript getScript(DocumentType inputType) { + return getScript(inputType, FULL); + } + + public DocumentScript getScript(DocumentType inputType, String inputFieldName) { + Map<String, DocumentScript> fieldScripts = getScripts(inputType); + if (fieldScripts != null) { + DocumentScript script = fieldScripts.get(inputFieldName); + if (script != null) { + log.log(LogLevel.DEBUG, "Using script for type '%s' and field '%s'.", inputType.getName(), inputFieldName); + return script; + } + } + return null; + } + + private static Map<String, Map<String, DocumentScript>> createScriptsMap(DocumentTypeManager docTypeMgr, + IlscriptsConfig config, + Linguistics linguistics) { + Map<String, Map<String, DocumentScript>> documentFieldScripts = new HashMap<>(config.ilscript().size()); + ScriptParserContext parserContext = new ScriptParserContext(linguistics); + parserContext.getAnnotatorConfig().setMaxTermOccurrences(config.maxtermoccurrences()); + + for (IlscriptsConfig.Ilscript ilscript : config.ilscript()) { + InputExpression.FieldPathOptimizer fieldPathOptimizer = new InputExpression.FieldPathOptimizer(docTypeMgr.getDocumentType(ilscript.doctype())); + List<StatementExpression> expressions = new ArrayList<>(ilscript.content().size()); + Map<String, DocumentScript> fieldScripts = new HashMap<>(ilscript.content().size()); + for (String content : ilscript.content()) { + expressions.add(parse(ilscript.doctype(), parserContext, content)); + StatementExpression statement = parse(ilscript.doctype(), parserContext, content); + InputExpression.InputFieldNameExtractor inputFieldNameExtractor = new InputExpression.InputFieldNameExtractor(); + statement.select(inputFieldNameExtractor, inputFieldNameExtractor); + statement.select(fieldPathOptimizer, fieldPathOptimizer); + if (inputFieldNameExtractor.getInputFieldNames().size() == 1) { + String fieldName = inputFieldNameExtractor.getInputFieldNames().get(0); + ScriptExpression script; + if (fieldScripts.containsKey(fieldName)) { + DocumentScript prev = fieldScripts.get(fieldName); + List<StatementExpression> appendedList = new ArrayList<>(((ScriptExpression)prev.getExpression()).asList()); + appendedList.add(statement); + script = new ScriptExpression(appendedList); + log.log(Level.FINE, "Appending script for field '" + fieldName + "' = " + statement); + log.log(Level.FINE, "Full script for field '" + fieldName + "' = " + appendedList); + } else { + script = new ScriptExpression(statement); + log.log(Level.FINE, "Setting script for field '" + fieldName + "' = " + statement); + } + DocumentScript documentScript = new DocumentScript(ilscript.doctype(), inputFieldNameExtractor.getInputFieldNames(), script); + fieldScripts.put(fieldName, documentScript); + } else { + log.log(Level.FINE, "Non single(" + inputFieldNameExtractor.getInputFieldNames().size() +") inputs = " + inputFieldNameExtractor.getInputFieldNames() + ". Script = " + statement); + } + } + + ScriptExpression script = new ScriptExpression(expressions); + script.select(fieldPathOptimizer, fieldPathOptimizer); + fieldScripts.put(FULL, new DocumentScript(ilscript.doctype(), ilscript.docfield(),script)); + documentFieldScripts.put(ilscript.doctype(), Collections.unmodifiableMap(fieldScripts)); + } + return Collections.unmodifiableMap(documentFieldScripts); + } + + private static StatementExpression parse(String docType, ScriptParserContext parserConfig, String content) { + parserConfig.setInputStream(new IndexingInput(content)); + try { + return StatementExpression.newInstance(parserConfig); + } catch (ParseException e) { + throw new IllegalArgumentException("Illegal indexing script for document type '" + + docType + "'; " + content, e); + } + } +} |