summary refs log tree commit diff stats
path: root/docprocs/src/main
diff options
context:
space:
mode:
author Jon Bratseth <bratseth@yahoo-inc.com> 2016-06-15 23:09:44 +0200
committer Jon Bratseth <bratseth@yahoo-inc.com> 2016-06-15 23:09:44 +0200
commit 72231250ed81e10d66bfe70701e64fa5fe50f712 (patch)
tree 2728bba1131a6f6e5bdf95afec7d7ff9358dac50 /docprocs/src/main
Publish
Diffstat (limited to 'docprocs/src/main')
-rw-r--r-- docprocs/src/main/java/com/yahoo/docprocs/indexing/DocumentScript.java 107
-rw-r--r-- docprocs/src/main/java/com/yahoo/docprocs/indexing/FastLogger.java 32
-rw-r--r-- docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java 120
-rw-r--r-- docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java 128
4 files changed, 387 insertions, 0 deletions
diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/DocumentScript.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/DocumentScript.java
new file mode 100644
index 00000000000..a367aec0cfb
--- /dev/null
+++ b/docprocs/src/main/java/com/yahoo/docprocs/indexing/DocumentScript.java
@@ -0,0 +1,107 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.docprocs.indexing;
+
+import com.yahoo.document.Document;
+import com.yahoo.document.DocumentUpdate;
+import com.yahoo.document.Field;
+import com.yahoo.document.annotation.SpanTrees;
+import com.yahoo.document.datatypes.Array;
+import com.yahoo.document.datatypes.FieldValue;
+import com.yahoo.document.datatypes.MapFieldValue;
+import com.yahoo.document.datatypes.StringFieldValue;
+import com.yahoo.document.datatypes.Struct;
+import com.yahoo.document.datatypes.StructuredFieldValue;
+import com.yahoo.document.datatypes.WeightedSet;
+import com.yahoo.document.fieldpathupdate.AssignFieldPathUpdate;
+import com.yahoo.document.fieldpathupdate.FieldPathUpdate;
+import com.yahoo.document.update.FieldUpdate;
+import com.yahoo.document.update.MapValueUpdate;
+import com.yahoo.document.update.ValueUpdate;
+import com.yahoo.vespa.indexinglanguage.AdapterFactory;
+import com.yahoo.vespa.indexinglanguage.expressions.Expression;
+
+import java.util.*;
+
+/**
+ * An indexing script tied to a single document type. Before the wrapped expression runs, each
+ * incoming field is checked against the declared input fields and any linguistics span tree
+ * found on a string value is removed.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+public class DocumentScript {
+
+    private final String documentType;
+    private final Set<String> inputFields;
+    private final Expression expression;
+
+    /**
+     * @param documentType name of the document type; only used when building error messages
+     * @param inputFields names of the fields accepted as input; copied into a private set
+     * @param expression the expression that {@code execute} delegates to
+     */
+    public DocumentScript(String documentType, Collection<String> inputFields, Expression expression) {
+        this.documentType = documentType;
+        this.inputFields = new HashSet<>(inputFields);
+        this.expression = expression;
+    }
+
+    /** Returns the expression given at construction time. */
+    public Expression getExpression() {
+        return expression;
+    }
+
+    /**
+     * Validates and cleans every field of the document, then runs the expression on it.
+     *
+     * @return the result of running the expression on the document
+     * @throws IllegalArgumentException if the document carries a field that is not a declared input
+     */
+    public Document execute(AdapterFactory adapterFactory, Document document) {
+        Iterator<Map.Entry<Field, FieldValue>> fields = document.iterator();
+        while (fields.hasNext()) {
+            Map.Entry<Field, FieldValue> field = fields.next();
+            requireThatFieldIsDeclaredInDocument(field.getKey());
+            removeAnyLinguisticsSpanTree(field.getValue());
+        }
+        return expression.execute(adapterFactory, document);
+    }
+
+    /**
+     * Validates and cleans every field update and field path update, then runs the expression
+     * on the update.
+     *
+     * @return the result of running the expression on the update
+     * @throws IllegalArgumentException if the update touches a field that is not a declared input
+     */
+    public DocumentUpdate execute(AdapterFactory adapterFactory, DocumentUpdate update) {
+        for (FieldUpdate fieldUpdate : update.getFieldUpdates()) {
+            requireThatFieldIsDeclaredInDocument(fieldUpdate.getField());
+            for (ValueUpdate<?> valueUpdate : fieldUpdate.getValueUpdates()) {
+                removeAnyLinguisticsSpanTree(valueUpdate);
+            }
+        }
+        for (FieldPathUpdate pathUpdate : update.getFieldPathUpdates()) {
+            // Only the first element of the path is checked against the declared inputs.
+            Field rootField = pathUpdate.getFieldPath().get(0).getFieldRef();
+            requireThatFieldIsDeclaredInDocument(rootField);
+            if (pathUpdate instanceof AssignFieldPathUpdate) {
+                AssignFieldPathUpdate assign = (AssignFieldPathUpdate)pathUpdate;
+                removeAnyLinguisticsSpanTree(assign.getFieldValue());
+            }
+        }
+        return Expression.execute(expression, adapterFactory, update);
+    }
+
+    /** Rejects any non-null field whose name is not among the declared input fields. */
+    private void requireThatFieldIsDeclaredInDocument(Field field) {
+        if (field == null || inputFields.contains(field.getName())) {
+            return;
+        }
+        throw new IllegalArgumentException("Field '" + field.getName() + "' is not part of the declared document " +
+                                           "type '" + documentType + "'.");
+    }
+
+    /** Unwraps a map value update to its nested update; otherwise cleans the carried value. */
+    private void removeAnyLinguisticsSpanTree(ValueUpdate<?> valueUpdate) {
+        if (valueUpdate instanceof MapValueUpdate) {
+            removeAnyLinguisticsSpanTree(((MapValueUpdate)valueUpdate).getUpdate());
+            return;
+        }
+        removeAnyLinguisticsSpanTree(valueUpdate.getValue());
+    }
+
+    /**
+     * Removes the linguistics span tree from a string value, descending into arrays, weighted
+     * sets, maps (keys and values) and structured values. Any other value, including null, is
+     * left untouched.
+     */
+    private void removeAnyLinguisticsSpanTree(FieldValue value) {
+        if (value instanceof StringFieldValue) {
+            ((StringFieldValue)value).removeSpanTree(SpanTrees.LINGUISTICS);
+            return;
+        }
+        if (value instanceof Array) {
+            Array<?> array = (Array)value;
+            for (Object element : array.getValues()) {
+                removeAnyLinguisticsSpanTree((FieldValue)element);
+            }
+            return;
+        }
+        if (value instanceof WeightedSet) {
+            WeightedSet<?> weightedSet = (WeightedSet)value;
+            for (Object key : weightedSet.keySet()) {
+                removeAnyLinguisticsSpanTree((FieldValue)key);
+            }
+            return;
+        }
+        if (value instanceof MapFieldValue) {
+            MapFieldValue<?,?> mapValue = (MapFieldValue)value;
+            for (Map.Entry<?,?> mapping : mapValue.entrySet()) {
+                removeAnyLinguisticsSpanTree((FieldValue)mapping.getKey());
+                removeAnyLinguisticsSpanTree((FieldValue)mapping.getValue());
+            }
+            return;
+        }
+        if (value instanceof StructuredFieldValue) {
+            Iterator<Map.Entry<Field, FieldValue>> members = ((StructuredFieldValue)value).iterator();
+            while (members.hasNext()) {
+                removeAnyLinguisticsSpanTree(members.next().getValue());
+            }
+        }
+    }
+}
diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/FastLogger.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/FastLogger.java
new file mode 100644
index 00000000000..e990c9fb894
--- /dev/null
+++ b/docprocs/src/main/java/com/yahoo/docprocs/indexing/FastLogger.java
@@ -0,0 +1,32 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.docprocs.indexing;
+
+import java.util.logging.Level;
+import java.util.logging.Logger;
+
+/**
+ * Thin wrapper around a {@link Logger} that only formats a message once the level is known to
+ * be loggable.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen Hult</a>
+ */
+class FastLogger {
+
+    private final Logger log;
+
+    private FastLogger(Logger log) {
+        this.log = log;
+    }
+
+    /** Creates a wrapper around the logger registered under the given name. */
+    public static FastLogger getLogger(String name) {
+        return new FastLogger(Logger.getLogger(name));
+    }
+
+    /**
+     * Logs a message if the given level is enabled on the wrapped logger.
+     *
+     * @param level the level to log at
+     * @param format a {@link String#format} pattern when arguments are given, otherwise the
+     *               literal message
+     * @param args the format arguments; when none are given, {@code format} is logged verbatim
+     */
+    public void log(Level level, String format, Object... args) {
+        if (log.isLoggable(level)) {
+            // Without arguments the text is not run through the formatter, so a stray '%' is harmless.
+            String message = (args.length == 0) ? format : String.format(format, args);
+            log.log(level, message);
+        }
+    }
+}
diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java
new file mode 100644
index 00000000000..c6fe7b301e1
--- /dev/null
+++ b/docprocs/src/main/java/com/yahoo/docprocs/indexing/IndexingProcessor.java
@@ -0,0 +1,120 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.docprocs.indexing;
+
+import java.util.ArrayList;
+import java.util.List;
+import com.google.inject.Inject;
+import com.yahoo.component.chain.dependencies.After;
+import com.yahoo.component.chain.dependencies.Before;
+import com.yahoo.component.chain.dependencies.Provides;
+import com.yahoo.docproc.DocumentProcessor;
+import com.yahoo.docproc.Processing;
+import com.yahoo.document.*;
+import com.yahoo.document.config.DocumentmanagerConfig;
+import com.yahoo.language.Linguistics;
+import com.yahoo.log.LogLevel;
+import com.yahoo.vespa.configdefinition.IlscriptsConfig;
+import com.yahoo.vespa.indexinglanguage.AdapterFactory;
+import com.yahoo.vespa.indexinglanguage.SimpleAdapterFactory;
+import com.yahoo.vespa.indexinglanguage.expressions.Expression;
+
+/**
+ * Document processor that runs the configured indexing script on every put and update in a
+ * processing. Removes, and operations whose document type has no script, are passed through
+ * unchanged; operations for which the script produces no output are dropped.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+@Provides({ IndexingProcessor.PROVIDED_NAME })
+@Before({ IndexingProcessor.INDEXING_END })
+@After({ IndexingProcessor.INDEXING_START, "*" })
+public class IndexingProcessor extends DocumentProcessor {
+
+    public final static String PROVIDED_NAME = "indexedDocument";
+    public final static String INDEXING_START = "indexingStart";
+    public final static String INDEXING_END = "indexingEnd";
+
+    private final static FastLogger log = FastLogger.getLogger(IndexingProcessor.class.getName());
+    private final DocumentTypeManager docTypeMgr;
+    private final ScriptManager scriptMgr;
+    private final AdapterFactory adapterFactory;
+
+    /** Resolves the expression to run for a single field of a document type. */
+    private class ExpressionSelector extends SimpleAdapterFactory.SelectExpression {
+        @Override
+        public Expression selectExpression(DocumentType documentType, String fieldName) {
+            // The script manager returns null when nothing is configured for this type/field, so
+            // the result must not be dereferenced blindly.
+            // NOTE(review): assumes SimpleAdapterFactory treats a null selection as "no
+            // field-specific expression" - confirm against SelectExpression's default.
+            DocumentScript script = scriptMgr.getScript(documentType, fieldName);
+            return script != null ? script.getExpression() : null;
+        }
+    }
+
+    @Inject
+    public IndexingProcessor(DocumentmanagerConfig documentmanagerConfig,
+                             IlscriptsConfig ilscriptsConfig,
+                             Linguistics linguistics) {
+        docTypeMgr = DocumentTypeManagerConfigurer.configureNewManager(documentmanagerConfig);
+        scriptMgr = new ScriptManager(docTypeMgr, ilscriptsConfig, linguistics);
+        adapterFactory = new SimpleAdapterFactory(new ExpressionSelector());
+    }
+
+    /**
+     * Replaces the operations of the processing with their indexed counterparts.
+     *
+     * @return always {@code Progress.DONE}
+     * @throws IllegalArgumentException if an operation is null or of an unsupported class; the
+     *                                  processing is left unmodified in that case
+     */
+    @Override
+    public Progress process(Processing proc) {
+        if (proc.getDocumentOperations().isEmpty()) {
+            return Progress.DONE;
+        }
+        // Results are collected on the side and swapped in at the end, so that a failure midway
+        // does not leave the processing half-converted.
+        List<DocumentOperation> out = new ArrayList<>(proc.getDocumentOperations().size());
+        for (DocumentOperation documentOperation : proc.getDocumentOperations()) {
+            if (documentOperation instanceof DocumentPut) {
+                processDocument((DocumentPut)documentOperation, out);
+            } else if (documentOperation instanceof DocumentUpdate) {
+                processUpdate((DocumentUpdate)documentOperation, out);
+            } else if (documentOperation instanceof DocumentRemove) {
+                processRemove((DocumentRemove)documentOperation, out);
+            } else if (documentOperation != null) {
+                throw new IllegalArgumentException("Document class " + documentOperation.getClass().getName() + " not supported.");
+            } else {
+                throw new IllegalArgumentException("Expected document, got null.");
+            }
+        }
+        proc.getDocumentOperations().clear();
+        proc.getDocumentOperations().addAll(out);
+        return Progress.DONE;
+    }
+
+    DocumentTypeManager getDocumentTypeManager() {
+        return docTypeMgr;
+    }
+
+    /** Runs the script of the document's type on a put; passes it through if there is no script. */
+    private void processDocument(DocumentPut prev, List<DocumentOperation> out) {
+        DocumentScript script = scriptMgr.getScript(prev.getDocument().getDataType());
+        if (script == null) {
+            log.log(LogLevel.DEBUG, "No indexing script for document '%s'.", prev.getId());
+            out.add(prev);
+            return;
+        }
+        log.log(LogLevel.DEBUG, "Processing document '%s'.", prev.getId());
+        Document next = script.execute(adapterFactory, prev.getDocument());
+        if (next == null) {
+            // Passed as a format argument so the message is only built when DEBUG is enabled.
+            log.log(LogLevel.DEBUG, "Document '%s' produced no output.", prev.getId());
+            return;
+        }
+
+        out.add(new DocumentPut(prev, next));
+    }
+
+    /** Runs the script of the update's type on an update; passes it through if there is no script. */
+    private void processUpdate(DocumentUpdate prev, List<DocumentOperation> out) {
+        DocumentScript script = scriptMgr.getScript(prev.getType());
+        if (script == null) {
+            log.log(LogLevel.DEBUG, "No indexing script for update '%s'.", prev.getId());
+            out.add(prev);
+            return;
+        }
+        log.log(LogLevel.DEBUG, "Processing update '%s'.", prev.getId());
+        DocumentUpdate next = script.execute(adapterFactory, prev);
+        if (next == null) {
+            // Passed as a format argument so the message is only built when DEBUG is enabled.
+            log.log(LogLevel.DEBUG, "Update '%s' produced no output.", prev.getId());
+            return;
+        }
+        // Carry the condition of the incoming update over to the one produced by the script.
+        next.setCondition(prev.getCondition());
+        out.add(next);
+    }
+
+    /** Removes are forwarded untouched. */
+    private void processRemove(DocumentRemove prev, List<DocumentOperation> out) {
+        log.log(LogLevel.DEBUG, "Not processing remove '%s'.", prev.getId());
+        out.add(prev);
+    }
+}
diff --git a/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java b/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java
new file mode 100644
index 00000000000..14bf5a0edf8
--- /dev/null
+++ b/docprocs/src/main/java/com/yahoo/docprocs/indexing/ScriptManager.java
@@ -0,0 +1,128 @@
+// Copyright 2016 Yahoo Inc. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.docprocs.indexing;
+
+import com.yahoo.document.DocumentType;
+import com.yahoo.document.DocumentTypeManager;
+import com.yahoo.language.Linguistics;
+import com.yahoo.log.LogLevel;
+import com.yahoo.vespa.configdefinition.IlscriptsConfig;
+import com.yahoo.vespa.indexinglanguage.ScriptParserContext;
+import com.yahoo.vespa.indexinglanguage.expressions.InputExpression;
+import com.yahoo.vespa.indexinglanguage.expressions.ScriptExpression;
+import com.yahoo.vespa.indexinglanguage.expressions.StatementExpression;
+import com.yahoo.vespa.indexinglanguage.parser.IndexingInput;
+import com.yahoo.vespa.indexinglanguage.parser.ParseException;
+
+import java.util.*;
+import java.util.logging.Level;
+
+/**
+ * Holds the indexing scripts from the ilscripts config, keyed first by document type name and
+ * then by input field name. The complete script of a type is stored under the key
+ * {@code "[all]"}.
+ *
+ * @author <a href="mailto:simon@yahoo-inc.com">Simon Thoresen</a>
+ */
+public class ScriptManager {
+
+    private static final FastLogger log = FastLogger.getLogger(ScriptManager.class.getName());
+    private static final String FULL = "[all]";
+    private final Map<String, Map<String, DocumentScript>> documentFieldScripts;
+    private final DocumentTypeManager docTypeMgr;
+
+    public ScriptManager(DocumentTypeManager docTypeMgr, IlscriptsConfig config, Linguistics linguistics) {
+        this.docTypeMgr = docTypeMgr;
+        documentFieldScripts = createScriptsMap(docTypeMgr, config, linguistics);
+    }
+
+
+    /**
+     * Finds the scripts to use for a document type: an exact name match first, then the scripts
+     * of any configured type the input type inherits, then those of any configured type that
+     * inherits the input type. The backing map is hash-based, so when several super- or
+     * sub-types qualify, which one wins is not defined by this class.
+     *
+     * @return the scripts by field name, or null if no configured type matches
+     */
+    private Map<String, DocumentScript> getScripts(DocumentType inputType) {
+        Map<String, DocumentScript> scripts = documentFieldScripts.get(inputType.getName());
+        if (scripts != null) {
+            log.log(LogLevel.DEBUG, "Using script for type '%s'.", inputType.getName());
+            return scripts;
+        }
+        for (Map.Entry<String, Map<String, DocumentScript>> entry : documentFieldScripts.entrySet()) {
+            if (inputType.inherits(docTypeMgr.getDocumentType(entry.getKey()))) {
+                log.log(LogLevel.DEBUG, "Using script of super-type '%s'.", entry.getKey());
+                return entry.getValue();
+            }
+        }
+        for (Map.Entry<String, Map<String, DocumentScript>> entry : documentFieldScripts.entrySet()) {
+            if (docTypeMgr.getDocumentType(entry.getKey()).inherits(inputType)) {
+                log.log(LogLevel.DEBUG, "Using script of sub-type '%s'.", entry.getKey());
+                return entry.getValue();
+            }
+        }
+        log.log(LogLevel.DEBUG, "No script for type '%s'.", inputType.getName());
+        return null;
+    }
+
+    /** Returns the complete script for the given type, or null if there is none. */
+    public DocumentScript getScript(DocumentType inputType) {
+        return getScript(inputType, FULL);
+    }
+
+    /**
+     * Returns the script registered for one input field of the given type.
+     *
+     * @return the script, or null if the type has no scripts or none for that field
+     */
+    public DocumentScript getScript(DocumentType inputType, String inputFieldName) {
+        Map<String, DocumentScript> fieldScripts = getScripts(inputType);
+        if (fieldScripts != null) {
+            DocumentScript script = fieldScripts.get(inputFieldName);
+            if (script != null) {
+                log.log(LogLevel.DEBUG, "Using script for type '%s' and field '%s'.", inputType.getName(), inputFieldName);
+                return script;
+            }
+        }
+        return null;
+    }
+
+    /**
+     * Parses every configured script and builds the type -> field -> script lookup.
+     * Statements reading exactly one input field are additionally collected into a per-field
+     * script; all statements of a type end up in its complete script.
+     *
+     * @return an unmodifiable map of unmodifiable per-type maps
+     * @throws IllegalArgumentException if a statement cannot be parsed
+     */
+    private static Map<String, Map<String, DocumentScript>> createScriptsMap(DocumentTypeManager docTypeMgr,
+                                                                             IlscriptsConfig config,
+                                                                             Linguistics linguistics) {
+        Map<String, Map<String, DocumentScript>> documentFieldScripts = new HashMap<>(config.ilscript().size());
+        ScriptParserContext parserContext = new ScriptParserContext(linguistics);
+        parserContext.getAnnotatorConfig().setMaxTermOccurrences(config.maxtermoccurrences());
+
+        for (IlscriptsConfig.Ilscript ilscript : config.ilscript()) {
+            InputExpression.FieldPathOptimizer fieldPathOptimizer = new InputExpression.FieldPathOptimizer(docTypeMgr.getDocumentType(ilscript.doctype()));
+            List<StatementExpression> expressions = new ArrayList<>(ilscript.content().size());
+            Map<String, DocumentScript> fieldScripts = new HashMap<>(ilscript.content().size());
+            for (String content : ilscript.content()) {
+                // The statement is parsed twice: one instance goes into the complete script, the
+                // other into the per-field script.
+                // NOTE(review): presumably so the two scripts do not share expression objects - confirm.
+                expressions.add(parse(ilscript.doctype(), parserContext, content));
+                StatementExpression statement = parse(ilscript.doctype(), parserContext, content);
+                InputExpression.InputFieldNameExtractor inputFieldNameExtractor = new InputExpression.InputFieldNameExtractor();
+                statement.select(inputFieldNameExtractor, inputFieldNameExtractor);
+                statement.select(fieldPathOptimizer, fieldPathOptimizer);
+                List<String> inputFieldNames = inputFieldNameExtractor.getInputFieldNames();
+                if (inputFieldNames.size() == 1) {
+                    String fieldName = inputFieldNames.get(0);
+                    ScriptExpression script;
+                    DocumentScript prev = fieldScripts.get(fieldName);
+                    if (prev != null) {
+                        // A script already exists for this field: append to a copy of its statements.
+                        List<StatementExpression> appendedList = new ArrayList<>(((ScriptExpression)prev.getExpression()).asList());
+                        appendedList.add(statement);
+                        script = new ScriptExpression(appendedList);
+                        // Format arguments keep the statement text from being rendered unless FINE is enabled.
+                        log.log(Level.FINE, "Appending script for field '%s' = %s", fieldName, statement);
+                        log.log(Level.FINE, "Full script for field '%s' = %s", fieldName, appendedList);
+                    } else {
+                        script = new ScriptExpression(statement);
+                        log.log(Level.FINE, "Setting script for field '%s' = %s", fieldName, statement);
+                    }
+                    DocumentScript documentScript = new DocumentScript(ilscript.doctype(), inputFieldNames, script);
+                    fieldScripts.put(fieldName, documentScript);
+                } else {
+                    log.log(Level.FINE, "Non single(%s) inputs = %s. Script = %s", inputFieldNames.size(), inputFieldNames, statement);
+                }
+            }
+
+            ScriptExpression script = new ScriptExpression(expressions);
+            script.select(fieldPathOptimizer, fieldPathOptimizer);
+            fieldScripts.put(FULL, new DocumentScript(ilscript.doctype(), ilscript.docfield(), script));
+            documentFieldScripts.put(ilscript.doctype(), Collections.unmodifiableMap(fieldScripts));
+        }
+        return Collections.unmodifiableMap(documentFieldScripts);
+    }
+
+    /**
+     * Parses one statement of an indexing script.
+     *
+     * @throws IllegalArgumentException wrapping the parse error, naming the document type and
+     *                                  the offending statement text
+     */
+    private static StatementExpression parse(String docType, ScriptParserContext parserConfig, String content) {
+        parserConfig.setInputStream(new IndexingInput(content));
+        try {
+            return StatementExpression.newInstance(parserConfig);
+        } catch (ParseException e) {
+            throw new IllegalArgumentException("Illegal indexing script for document type '" +
+                                               docType + "'; " + content, e);
+        }
+    }
+}