diff options
author | Tor Brede Vekterli <vekterli@yahooinc.com> | 2023-02-23 14:45:34 +0100 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@yahooinc.com> | 2023-02-23 15:18:20 +0100 |
commit | 7cb3323d0cf0c1c98f0c0e47cb88741f56c73eb9 (patch) | |
tree | bb2662cfc5777d812cceae7198962b39c14a59ff | |
parent | f66f816102ce0a7c3aaba72d1db61a83157259ed (diff) |
Add JSONL output support to `vespa-visit` CLI tool
JSONL output is enabled via the new `--jsonl` argument, which is mutually
exclusive with `--jsonoutput` and the (deprecated) `--xmloutput`.
3 files changed, 168 insertions, 72 deletions
diff --git a/vespaclient-java/src/main/java/com/yahoo/vespavisit/StdOutVisitorHandler.java b/vespaclient-java/src/main/java/com/yahoo/vespavisit/StdOutVisitorHandler.java index fc74cb6d899..288df7e470c 100644 --- a/vespaclient-java/src/main/java/com/yahoo/vespavisit/StdOutVisitorHandler.java +++ b/vespaclient-java/src/main/java/com/yahoo/vespavisit/StdOutVisitorHandler.java @@ -35,42 +35,45 @@ import java.util.logging.Logger; @SuppressWarnings("deprecation") public class StdOutVisitorHandler extends VdsVisitHandler { - private static final Logger log = Logger.getLogger( - StdOutVisitorHandler.class.getName()); - private final boolean printIds; - private final boolean indentXml; - private final int processTimeMilliSecs; - private final PrintStream out; - private final boolean jsonOutput; - private final boolean tensorShortForm; - private final boolean tensorDirectValues; + private static final Logger log = Logger.getLogger(StdOutVisitorHandler.class.getName()); - private final VisitorDataHandler dataHandler; + public enum OutputFormat { + JSONL, + JSON, + XML // Deprecated + } - public StdOutVisitorHandler(boolean printIds, boolean indentXml, - boolean showProgress, boolean showStatistics, boolean doStatistics, - boolean abortOnClusterDown, int processtime, boolean jsonOutput, - boolean tensorShortForm, - boolean tensorDirectValues) - { - this(printIds, indentXml, showProgress, showStatistics, doStatistics, abortOnClusterDown, processtime, - jsonOutput, tensorShortForm, tensorDirectValues, createStdOutPrintStream()); + // Explicitly _not_ a record since we want the fields to be mutable when building. 
+ public static class Params { + boolean printIds = false; + boolean indentXml = false; + boolean showProgress = false; + boolean showStatistics = false; + boolean doStatistics = false; + boolean abortOnClusterDown = false; + int processTimeMilliSecs = 0; + OutputFormat outputFormat = OutputFormat.JSON; + boolean tensorShortForm = false; // TODO Vespa 9: change default to true + boolean tensorDirectValues = false; // TODO Vespa 9: change default to true + + boolean usesJson() { + return outputFormat == OutputFormat.JSON || outputFormat == OutputFormat.JSONL; + } } - StdOutVisitorHandler(boolean printIds, boolean indentXml, - boolean showProgress, boolean showStatistics, boolean doStatistics, - boolean abortOnClusterDown, int processtime, boolean jsonOutput, - boolean tensorShortForm, boolean tensorDirectValues, PrintStream out) - { - super(showProgress, showStatistics, abortOnClusterDown); - this.printIds = printIds; - this.indentXml = indentXml; - this.processTimeMilliSecs = processtime; - this.jsonOutput = jsonOutput; - this.tensorShortForm = tensorShortForm; - this.tensorDirectValues = tensorDirectValues; + private final Params params; + private final PrintStream out; + private final VisitorDataHandler dataHandler; + + public StdOutVisitorHandler(Params params, PrintStream out) { + super(params.showProgress, params.showStatistics, params.abortOnClusterDown); + this.params = params; this.out = out; - this.dataHandler = new DataHandler(doStatistics); + this.dataHandler = new DataHandler(params.doStatistics); + } + + public StdOutVisitorHandler(Params params) { + this(params, createStdOutPrintStream()); } private static PrintStream createStdOutPrintStream() { @@ -128,9 +131,9 @@ public class StdOutVisitorHandler extends VdsVisitHandler { @Override public void onMessage(Message m, AckToken token) { - if (processTimeMilliSecs > 0) { + if (params.processTimeMilliSecs > 0) { try { - Thread.sleep(processTimeMilliSecs); + Thread.sleep(params.processTimeMilliSecs); } 
catch (InterruptedException e) {} } @@ -158,16 +161,15 @@ public class StdOutVisitorHandler extends VdsVisitHandler { System.err.print('\r'); } - if (printIds) { + if (params.printIds) { out.print(doc.getId()); out.print(" (Last modified at "); out.println(timestamp + ")"); } else { - if (jsonOutput) { + if (params.usesJson()) { writeJsonDocument(doc); } else { - out.print(doc.toXML( - indentXml ? " " : "")); + out.print(doc.toXML(params.indentXml ? " " : "")); } } } catch (Exception e) { @@ -179,7 +181,7 @@ public class StdOutVisitorHandler extends VdsVisitHandler { private void writeJsonDocument(Document doc) throws IOException { writeFeedStartOrRecordSeparator(); - out.write(JsonWriter.toByteArray(doc, tensorShortForm, tensorDirectValues)); + out.write(JsonWriter.toByteArray(doc, params.tensorShortForm, params.tensorDirectValues)); } @Override @@ -189,10 +191,10 @@ public class StdOutVisitorHandler extends VdsVisitHandler { System.err.print('\r'); } - if (printIds) { + if (params.printIds) { out.println(docId + " (Removed)"); } else { - if (jsonOutput) { + if (params.usesJson()) { writeJsonDocumentRemove(docId); } else { XmlStream stream = new XmlStream(); @@ -218,10 +220,12 @@ public class StdOutVisitorHandler extends VdsVisitHandler { private void writeFeedStartOrRecordSeparator() { if (first) { - out.println("["); + if (params.outputFormat == OutputFormat.JSON) { + out.println("["); + } first = false; } else { - out.println(","); + out.println((params.outputFormat == OutputFormat.JSON) ? 
"," : ""); } } @@ -259,7 +263,7 @@ public class StdOutVisitorHandler extends VdsVisitHandler { @Override public synchronized void onDone() { - if (jsonOutput && !printIds) { + if ((params.outputFormat == OutputFormat.JSON) && !params.printIds) { if (first) { out.print('['); } diff --git a/vespaclient-java/src/main/java/com/yahoo/vespavisit/VdsVisit.java b/vespaclient-java/src/main/java/com/yahoo/vespavisit/VdsVisit.java index a6e34055fbd..ceea7d320e9 100644 --- a/vespaclient-java/src/main/java/com/yahoo/vespavisit/VdsVisit.java +++ b/vespaclient-java/src/main/java/com/yahoo/vespavisit/VdsVisit.java @@ -24,7 +24,6 @@ import org.apache.commons.cli.Option; import org.apache.commons.cli.Options; import java.io.*; -import java.nio.charset.Charset; import java.nio.charset.StandardCharsets; import java.util.Map; import java.util.stream.Collectors; @@ -334,9 +333,15 @@ public class VdsVisit { .hasArg(false) .build()); + options.addOption(Option.builder() + .longOpt("jsonl") + .desc("Output documents as JSONL (JSON Lines format)") + .hasArg(false) + .build()); + options.addOption(Option.builder("x") .longOpt("xmloutput") - .desc("Output documents as XML") + .desc("Output documents as XML (deprecated)") .hasArg(false) .build()); @@ -370,6 +375,7 @@ public class VdsVisit { private int processTime = 0; private int fullTimeout = 7 * 24 * 60 * 60 * 1000; private boolean jsonOutput = true; + private boolean jsonLinesOutput = false; private boolean tensorShortForm = false; // TODO Vespa 9: change default to true private boolean tensorDirectValues = false; // TODO Vespa 9: change default to true @@ -437,10 +443,32 @@ public class VdsVisit { this.processTime = processTime; } + public boolean jsonOutput() { + return jsonOutput; + } + public void setJsonOutput(boolean jsonOutput) { this.jsonOutput = jsonOutput; } + public boolean jsonLinesOutput() { + return jsonLinesOutput; + } + + public void setJsonLinesOutput(boolean jsonLinesOutput) { + this.jsonLinesOutput = jsonLinesOutput; + } 
+ + public StdOutVisitorHandler.OutputFormat stdOutHandlerOutputFormat() { + if (jsonLinesOutput) { + return StdOutVisitorHandler.OutputFormat.JSONL; + } else if (jsonOutput) { + return StdOutVisitorHandler.OutputFormat.JSON; + } else { + return StdOutVisitorHandler.OutputFormat.XML; + } + } + public boolean tensorShortForm() { return tensorShortForm; } @@ -587,11 +615,18 @@ public class VdsVisit { } boolean jsonOutput = line.hasOption("jsonoutput"); - boolean xmlOutput = line.hasOption("xmloutput"); - if (jsonOutput && xmlOutput) { - throw new IllegalArgumentException("Cannot combine both xml and json output"); + boolean jsonl = line.hasOption("jsonl"); + boolean xmlOutput = line.hasOption("xmloutput"); + if ((jsonOutput || jsonl) && xmlOutput) { + throw new IllegalArgumentException("Cannot combine both XML and JSON output"); + } else if (jsonOutput && jsonl) { + throw new IllegalArgumentException("Cannot combine both JSON and JSONL output"); + } + if (jsonl) { + allParams.setJsonLinesOutput(true); + } else { + allParams.setJsonOutput(!xmlOutput); } - allParams.setJsonOutput(!xmlOutput); allParams.setVisitorParameters(params); return allParams; @@ -747,17 +782,18 @@ public class VdsVisit { VdsVisitHandler handler; - handler = new StdOutVisitorHandler( - params.isPrintIdsOnly(), - params.isVerbose(), - params.isVerbose(), - params.isVerbose(), - params.getStatisticsParts() != null, - params.getAbortOnClusterDown(), - params.getProcessTime(), - params.jsonOutput, - params.tensorShortForm, - params.tensorDirectValues); + var handlerParams = new StdOutVisitorHandler.Params(); + handlerParams.printIds = params.isPrintIdsOnly(); + handlerParams.indentXml = params.isVerbose(); + handlerParams.showProgress = params.isVerbose(); + handlerParams.showStatistics = params.isVerbose(); + handlerParams.doStatistics = params.getStatisticsParts() != null; + handlerParams.abortOnClusterDown = params.getAbortOnClusterDown(); + handlerParams.processTimeMilliSecs = 
params.getProcessTime(); + handlerParams.outputFormat = params.stdOutHandlerOutputFormat(); + handlerParams.tensorShortForm = params.tensorShortForm(); + handlerParams.tensorDirectValues = params.tensorDirectValues(); + handler = new StdOutVisitorHandler(handlerParams); if (visitorParameters.getResumeFileName() != null) { handler.setProgressFileName(visitorParameters.getResumeFileName()); diff --git a/vespaclient-java/src/test/java/com/yahoo/vespavisit/StdOutVisitorHandlerTest.java b/vespaclient-java/src/test/java/com/yahoo/vespavisit/StdOutVisitorHandlerTest.java index c1bbe8711a5..aa708b1fde9 100644 --- a/vespaclient-java/src/test/java/com/yahoo/vespavisit/StdOutVisitorHandlerTest.java +++ b/vespaclient-java/src/test/java/com/yahoo/vespavisit/StdOutVisitorHandlerTest.java @@ -1,15 +1,19 @@ // Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. package com.yahoo.vespavisit; +import com.yahoo.document.DataType; import com.yahoo.document.Document; +import com.yahoo.document.DocumentId; import com.yahoo.document.DocumentPut; import com.yahoo.document.DocumentType; import com.yahoo.document.TensorDataType; +import com.yahoo.document.datatypes.StringFieldValue; import com.yahoo.document.datatypes.TensorFieldValue; import com.yahoo.documentapi.AckToken; import com.yahoo.documentapi.VisitorControlSession; import com.yahoo.documentapi.VisitorDataHandler; import com.yahoo.documentapi.messagebus.protocol.PutDocumentMessage; +import com.yahoo.documentapi.messagebus.protocol.RemoveDocumentMessage; import com.yahoo.tensor.Tensor; import com.yahoo.tensor.TensorType; import org.junit.jupiter.api.Test; @@ -26,23 +30,27 @@ import static org.mockito.Mockito.mock; * @author bjorncs */ public class StdOutVisitorHandlerTest { - private boolean jsonOutput; - - public void initStdOutVisitorHandlerTest(boolean jsonOutput) { - this.jsonOutput = jsonOutput; - } public static Object[] data() { return new Object[]{true, false}; } + private 
static StdOutVisitorHandler.Params createHandlerParams(boolean jsonOutput, boolean tensorShortForm, boolean tensorDirectValues) { + var params = new StdOutVisitorHandler.Params(); + params.outputFormat = jsonOutput ? StdOutVisitorHandler.OutputFormat.JSON + : StdOutVisitorHandler.OutputFormat.XML; + params.tensorShortForm = tensorShortForm; + params.tensorDirectValues = tensorDirectValues; + return params; + } + @MethodSource("data") @ParameterizedTest(name = "jsonOutput={0}") void printing_ids_for_zero_documents_produces_empty_output(boolean jsonOutput) { - initStdOutVisitorHandlerTest(jsonOutput); ByteArrayOutputStream out = new ByteArrayOutputStream(); - StdOutVisitorHandler visitorHandler = - new StdOutVisitorHandler(/*printIds*/true, false, false, false, false, false, 0, jsonOutput, false, false, new PrintStream(out, true)); + var params = createHandlerParams(jsonOutput, false, false); + params.printIds = true; + StdOutVisitorHandler visitorHandler = new StdOutVisitorHandler(params, new PrintStream(out, true)); VisitorDataHandler dataHandler = visitorHandler.getDataHandler(); dataHandler.onDone(); String output = out.toString(); @@ -52,10 +60,9 @@ public class StdOutVisitorHandlerTest { @MethodSource("data") @ParameterizedTest(name = "jsonOutput={0}") void printing_zero_documents_produces_empty_output(boolean jsonOutput) { - initStdOutVisitorHandlerTest(jsonOutput); ByteArrayOutputStream out = new ByteArrayOutputStream(); StdOutVisitorHandler visitorHandler = - new StdOutVisitorHandler(/*printIds*/false, false, false, false, false, false, 0, jsonOutput, false, false, new PrintStream(out, true)); + new StdOutVisitorHandler(createHandlerParams(jsonOutput, false, false), new PrintStream(out, true)); VisitorDataHandler dataHandler = visitorHandler.getDataHandler(); dataHandler.onDone(); String expectedOutput = jsonOutput ? 
"[]" : ""; @@ -71,8 +78,7 @@ public class StdOutVisitorHandlerTest { var putMsg = new PutDocumentMessage(new DocumentPut(doc)); var out = new ByteArrayOutputStream(); - var visitorHandler = new StdOutVisitorHandler(/*printIds*/false, false, false, false, false, false, - 0, true, tensorShortForm, tensorDirectValues, new PrintStream(out, true)); + var visitorHandler = new StdOutVisitorHandler(createHandlerParams(true, tensorShortForm, tensorDirectValues), new PrintStream(out, true)); var dataHandler = visitorHandler.getDataHandler(); var controlSession = mock(VisitorControlSession.class); var ackToken = mock(AckToken.class); @@ -100,4 +106,54 @@ public class StdOutVisitorHandlerTest { do_test_json_tensor_fields_rendering(true, false, expectedOutput); } + private static PutDocumentMessage createPutWithDocAndValue(DocumentType docType, String docId, String fieldValue) { + var doc = new Document(docType, docId); + doc.setFieldValue("bar", new StringFieldValue(fieldValue)); + return new PutDocumentMessage(new DocumentPut(doc)); + } + + private static RemoveDocumentMessage createRemoveForDoc(String docId) { + return new RemoveDocumentMessage(new DocumentId(docId)); + } + + @MethodSource("data") + @ParameterizedTest(name = "jsonLinesFormat={0}") + void json_can_be_output_in_json_lines_format(boolean jsonLinesFormat) { + var docType = new DocumentType("foo"); + docType.addField("bar", DataType.STRING); + + var params = createHandlerParams(true, true, true); + params.outputFormat = jsonLinesFormat ? 
StdOutVisitorHandler.OutputFormat.JSONL + : StdOutVisitorHandler.OutputFormat.JSON; + + var out = new ByteArrayOutputStream(); + var visitorHandler = new StdOutVisitorHandler(params, new PrintStream(out, true)); + var dataHandler = visitorHandler.getDataHandler(); + var controlSession = mock(VisitorControlSession.class); + dataHandler.setSession(controlSession); + + dataHandler.onMessage(createPutWithDocAndValue(docType, "id:baz:foo::1", "fluffy\nbunnies"), mock(AckToken.class)); + dataHandler.onMessage(createRemoveForDoc("id:baz:foo::2"), mock(AckToken.class)); + dataHandler.onMessage(createPutWithDocAndValue(docType, "id:baz:foo::3", "\r\ncool fox\r\n"), mock(AckToken.class)); + dataHandler.onDone(); + + String output = out.toString().trim(); + if (jsonLinesFormat) { + // JSONL; no implicit start/end array chars or trailing commas after objects + var expected = """ + {"id":"id:baz:foo::1","fields":{"bar":"fluffy\\nbunnies"}} + {"remove":"id:baz:foo::2"} + {"id":"id:baz:foo::3","fields":{"bar":"\\r\\ncool fox\\r\\n"}}"""; + assertEquals(expected, output); + } else { + // non-JSONL; usual array of comma-separated objects form + var expected = """ + [ + {"id":"id:baz:foo::1","fields":{"bar":"fluffy\\nbunnies"}}, + {"remove":"id:baz:foo::2"}, + {"id":"id:baz:foo::3","fields":{"bar":"\\r\\ncool fox\\r\\n"}}]"""; + assertEquals(expected, output); + } + } + } |