44 files changed, 3 insertions(+), 3927 deletions(-)
diff --git a/.gitignore b/.gitignore
index c77fe07eee7..adc898a7266 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,12 +40,10 @@ Testing
 /build.ninja
 /rules.ninja
 *_test_app
-/hadoop/dependency-reduced-pom.xml
 /mvnw
 /mvnw.cmd
 /mvnwDebug
 /mvnwDebug.cmd
-/vespa-hadoop/dependency-reduced-pom.xml
 .preprocessed/
 .DS_Store
 install_manifest.txt
diff --git a/parent/pom.xml b/parent/pom.xml
index 549d66e37a0..034daac34a3 100644
--- a/parent/pom.xml
+++ b/parent/pom.xml
@@ -495,61 +495,6 @@
         <!-- No version property, as we don't want maven-dependency-plugin to alert about newer versions. -->
         <version>3.1.9</version>
       </dependency>
-      <dependency> <!-- Control netty-all version -->
-        <groupId>io.netty</groupId>
-        <artifactId>netty-all</artifactId>
-        <version>${netty.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>io.netty</groupId>
-        <artifactId>netty-buffer</artifactId>
-        <version>${netty.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>io.netty</groupId>
-        <artifactId>netty-common</artifactId>
-        <version>${netty.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>io.netty</groupId>
-        <artifactId>netty-codec</artifactId>
-        <version>${netty.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>io.netty</groupId>
-        <artifactId>netty-codec-http2</artifactId>
-        <version>${netty.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>io.netty</groupId>
-        <artifactId>netty-codec-http</artifactId>
-        <version>${netty.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>io.netty</groupId>
-        <artifactId>netty-transport</artifactId>
-        <version>${netty.version}</version>
-      </dependency>
-      <dependency>
-        <groupId>io.netty</groupId>
-        <artifactId>netty-transport-classes-epoll</artifactId>
-        <version>${netty.version}</version>
-      </dependency>
-      <dependency> <!-- Control netty-handler version -->
-        <groupId>io.netty</groupId>
-        <artifactId>netty-handler</artifactId>
-        <version>${netty.version}</version>
-      </dependency>
-      <dependency> <!-- Control netty-transport-native-epoll version -->
-        <groupId>io.netty</groupId>
-        <artifactId>netty-transport-native-epoll</artifactId>
-        <version>${netty.version}</version>
-      </dependency>
-      <dependency> <!-- Control netty-handler version -->
-        <groupId>io.netty</groupId>
-        <artifactId>netty-tcnative</artifactId>
-        <version>${netty-tcnative.version}</version>
-      </dependency>
       <dependency>
         <groupId>com.github.tomakehurst</groupId>
         <artifactId>wiremock-jre8-standalone</artifactId>
@@ -1075,104 +1020,6 @@
         <artifactId>json</artifactId>
         <version>${org.json.version}</version>
       </dependency>
-      <dependency> <!-- Due to hadoop-common pulling in 1.7.7 -->
-        <groupId>org.apache.avro</groupId>
-        <artifactId>avro</artifactId>
-        <version>${avro.version}</version>
-      </dependency>
-      <dependency> <!-- Due to hadoop-common pulling in 9.8.1 -->
-        <groupId>com.nimbusds</groupId>
-        <artifactId>nimbus-jose-jwt</artifactId>
-        <version>${nimbus.version}</version>
-      </dependency>
-      <dependency> <!-- Due to hadoop-common pulling in older version -->
-        <groupId>net.minidev</groupId>
-        <artifactId>json-smart</artifactId>
-        <version>${json-smart.version}</version>
-      </dependency>
-      <dependency>
-        <!-- Force fresh woodstox-core without security issue hadoop-3.3.4 -->
-        <groupId>com.fasterxml.woodstox</groupId>
-        <artifactId>woodstox-core</artifactId>
-        <version>${woodstox.version}</version>
-      </dependency>
-      <dependency>
-        <!-- Force fresh jersey-json without security issue hadoop-3.3.4 -->
-        <groupId>com.sun.jersey</groupId>
-        <artifactId>jersey-json</artifactId>
-        <version>${sun-jersey-json.version}</version>
-      </dependency>
-      <dependency>
-        <!-- Force fresh jettison without security issue hadoop-3.3.4 -->
-        <groupId>org.codehaus.jettison</groupId>
-        <artifactId>jettison</artifactId>
-        <version>${jettison.version}</version>
-      </dependency>
-      <dependency>
-        <!-- Transitive dependencies from pig-0.16 up-to-date -->
-        <groupId>tomcat</groupId>
-        <artifactId>jasper-runtime</artifactId>
-        <version>${tomcat-jasper.version}</version>
-      </dependency>
-      <dependency>
-        <!-- Transitive dependencies from pig-0.16 up-to-date -->
-        <groupId>tomcat</groupId>
-        <artifactId>jasper-compiler</artifactId>
-        <version>${tomcat-jasper.version}</version>
-      </dependency>
-      <!-- Hadoop dependencies -->
-      <dependency>
-        <groupId>org.apache.hadoop</groupId>
-        <artifactId>hadoop-common</artifactId>
-        <version>${hadoop.version}</version>
-        <exclusions>
-          <exclusion>
-            <groupId>log4j</groupId>
-            <artifactId>log4j</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>org.slf4j</groupId>
-            <artifactId>slf4j-log4j12</artifactId>
-          </exclusion>
-        </exclusions>
-      </dependency>
-      <dependency>
-        <groupId>org.apache.hadoop</groupId>
-        <artifactId>hadoop-client</artifactId>
-        <version>${hadoop.version}</version>
-        <exclusions>
-          <exclusion>
-            <groupId>log4j</groupId>
-            <artifactId>log4j</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>org.slf4j</groupId>
-            <artifactId>slf4j-log4j12</artifactId>
-          </exclusion>
-        </exclusions>
-      </dependency>
-      <dependency>
-        <groupId>org.apache.pig</groupId>
-        <artifactId>pig</artifactId>
-        <version>${pig.version}</version>
-        <classifier>h2</classifier>
-      </dependency>
-      <dependency>
-        <!-- Hadoop test dependency -->
-        <groupId>org.apache.hadoop</groupId>
-        <artifactId>hadoop-minicluster</artifactId>
-        <version>${hadoop.version}</version>
-        <exclusions>
-          <exclusion>
-            <groupId>log4j</groupId>
-            <artifactId>log4j</artifactId>
-          </exclusion>
-          <exclusion>
-            <groupId>org.slf4j</groupId>
-            <artifactId>slf4j-log4j12</artifactId>
-          </exclusion>
-        </exclusions>
-      </dependency>
     </dependencies>
   </dependencyManagement>
@@ -1207,15 +1054,11 @@
     <felix.version>7.0.1</felix.version>
     <felix.log.version>1.0.1</felix.log.version>
     <findbugs.version>3.0.2</findbugs.version> <!-- Should be kept in sync with guava -->
-    <groovy.version>3.0.13</groovy.version>
-    <hadoop.version>3.3.4</hadoop.version>
     <hdrhistogram.version>2.1.12</hdrhistogram.version>
-    <jettison.version>1.5.1</jettison.version>
     <jetty.version>9.4.49.v20220914</jetty.version>
     <jetty-alpn.version>1.1.3.v20160715</jetty-alpn.version>
     <jjwt.version>0.11.2</jjwt.version>
     <jna.version>5.11.0</jna.version>
-    <json-smart.version>2.4.8</json-smart.version>
     <junit.version>5.8.1</junit.version>
     <maven-archiver.version>3.5.2</maven-archiver.version>
     <maven-assembly-plugin.version>3.3.0</maven-assembly-plugin.version>
@@ -1235,23 +1078,16 @@
     <maven-shade-plugin.version>3.4.1</maven-shade-plugin.version>
     <maven-site-plugin.version>3.9.1</maven-site-plugin.version>
     <maven-source-plugin.version>3.2.1</maven-source-plugin.version>
-    <mockito.version>4.0.0</mockito.version>
-    <netty.version>4.1.84.Final</netty.version>
-    <netty-tcnative.version>2.0.54.Final</netty-tcnative.version>
-    <nimbus.version>9.25.6</nimbus.version>
+    <mockito.version>4.0.0</mockito.version>
     <onnxruntime.version>1.12.1</onnxruntime.version> <!-- WARNING: sync cloud-tenant-base-dependencies-enforcer/pom.xml -->
     <org.json.version>20220320</org.json.version>
     <org.lz4.version>1.8.0</org.lz4.version>
-    <pig.version>0.16.0</pig.version>
     <prometheus.client.version>0.6.0</prometheus.client.version>
     <protobuf.version>3.21.7</protobuf.version>
     <spifly.version>1.3.5</spifly.version>
-    <sun-jersey-json.version>1.19.4</sun-jersey-json.version>
     <surefire.version>2.22.2</surefire.version>
-    <tomcat-jasper.version>5.5.23</tomcat-jasper.version>
-    <wiremock.version>2.34.0</wiremock.version>
-    <woodstox.version>6.4.0</woodstox.version>
-    <zookeeper.client.version>3.8.0</zookeeper.client.version>
+    <wiremock.version>2.34.0</wiremock.version>
+    <zookeeper.client.version>3.8.0</zookeeper.client.version>

     <doclint>all</doclint>
     <test.hide>true</test.hide>
diff --git a/pom.xml b/pom.xml
--- a/pom.xml
+++ b/pom.xml
@@ -131,7 +131,6 @@
     <module>vespa-feed-client</module>
     <module>vespa-feed-client-api</module>
     <module>vespa-feed-client-cli</module>
-    <module>vespa-hadoop</module>
     <module>vespa-maven-plugin</module>
     <module>vespa-osgi-testrunner</module>
     <module>vespa-testrunner-components</module>
diff --git a/vespa-hadoop/OWNERS b/vespa-hadoop/OWNERS
deleted file mode 100644
index 6b09ce48bd4..00000000000
--- a/vespa-hadoop/OWNERS
+++ /dev/null
@@ -1 +0,0 @@
-lesters
diff --git a/vespa-hadoop/README b/vespa-hadoop/README
deleted file mode 100644
index 1b567b88c1d..00000000000
--- a/vespa-hadoop/README
+++ /dev/null
@@ -1,4 +0,0 @@
-The Vespa Hadoop client.
-
-Contains APIs for feeding and querying Vespa from the grid.
-
diff --git a/vespa-hadoop/abi-spec.json b/vespa-hadoop/abi-spec.json
deleted file mode 100644
index 9e26dfeeb6e..00000000000
--- a/vespa-hadoop/abi-spec.json
+++ /dev/null
@@ -1 +0,0 @@
-{}
\ No newline at end of file
diff --git a/vespa-hadoop/pom.xml b/vespa-hadoop/pom.xml
deleted file mode 100644
index 43f7c17967d..00000000000
--- a/vespa-hadoop/pom.xml
+++ /dev/null
@@ -1,166 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
-         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
-         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
-    <modelVersion>4.0.0</modelVersion>
-    <parent>
-        <groupId>com.yahoo.vespa</groupId>
-        <artifactId>parent</artifactId>
-        <version>8-SNAPSHOT</version>
-        <relativePath>../parent/pom.xml</relativePath>
-    </parent>
-    <artifactId>vespa-hadoop</artifactId>
-    <version>8-SNAPSHOT</version>
-    <name>${project.artifactId}</name>
-    <description>Integration tools between Vespa and Hadoop</description>
-
-    <properties>
-        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
-    </properties>
-
-    <dependencies>
-        <!-- Hadoop dependencies -->
-        <dependency>
-            <groupId>org.apache.hadoop</groupId>
-            <artifactId>hadoop-common</artifactId>
-            <scope>provided</scope>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.hadoop</groupId>
-            <artifactId>hadoop-client</artifactId>
-            <scope>provided</scope>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.pig</groupId>
-            <artifactId>pig</artifactId>
-            <classifier>h2</classifier>
-            <scope>provided</scope>
-        </dependency>
-
-        <!-- These are inherited from parent. Needed for correct versions on Hadoop. -->
-        <dependency>
-            <groupId>org.apache.httpcomponents</groupId>
-            <artifactId>httpclient</artifactId>
-            <scope>compile</scope>
-        </dependency>
-        <dependency>
-            <groupId>org.apache.httpcomponents</groupId>
-            <artifactId>httpcore</artifactId>
-            <scope>compile</scope>
-        </dependency>
-
-        <!-- Test dependencies -->
-        <dependency>
-            <groupId>org.apache.hadoop</groupId>
-            <artifactId>hadoop-minicluster</artifactId>
-            <scope>test</scope>
-        </dependency>
-        <dependency>
-            <!-- This is a HACK due to hadoop relying on mockito in NameNodeAdapter, but not providing it. Brum, brum !! -->
-            <groupId>org.mockito</groupId>
-            <artifactId>mockito-core</artifactId>
-            <scope>test</scope>
-        </dependency>
-
-        <dependency>
-            <groupId>org.junit.jupiter</groupId>
-            <artifactId>junit-jupiter</artifactId>
-            <scope>test</scope>
-        </dependency>
-        <!-- Vespa feeding dependencies -->
-        <dependency>
-            <groupId>com.yahoo.vespa</groupId>
-            <artifactId>vespa-feed-client</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-        <dependency>
-            <groupId>com.yahoo.vespa</groupId>
-            <artifactId>vespa-feed-client-api</artifactId>
-            <version>${project.version}</version>
-        </dependency>
-
-        <!-- Jackson dependencies used in this module -->
-        <dependency>
-            <groupId>com.fasterxml.jackson.core</groupId>
-            <artifactId>jackson-core</artifactId>
-        </dependency>
-        <dependency>
-            <groupId>com.fasterxml.jackson.core</groupId>
-            <artifactId>jackson-databind</artifactId>
-        </dependency>
-
-    </dependencies>
-
-    <build>
-        <plugins>
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-shade-plugin</artifactId>
-                <executions>
-                    <execution>
-                        <phase>package</phase>
-                        <goals>
-                            <goal>shade</goal>
-                        </goals>
-                        <configuration>
-                            <minimizeJar>false</minimizeJar>
-
-                            <relocations>
-                                <relocation>
-                                    <pattern>com.google</pattern>
-                                    <shadedPattern>shaded.vespa</shadedPattern>
-                                </relocation>
-                                <relocation>
-                                    <pattern>org.bouncycastle</pattern>
-                                    <shadedPattern>shaded.vespa.bouncycastle</shadedPattern>
-                                </relocation>
-                                <relocation>
-                                    <pattern>commons-codec</pattern>
-                                    <shadedPattern>shaded.vespa</shadedPattern>
-                                </relocation>
-                                <relocation>
-                                    <pattern>commons-logging</pattern>
-                                    <shadedPattern>shaded.vespa</shadedPattern>
-                                </relocation>
-                                <relocation>
-                                    <pattern>org.apache</pattern>
-                                    <shadedPattern>shaded.vespa</shadedPattern>
-                                    <excludes>
-                                        <exclude>org.apache.hadoop.**</exclude>
-                                        <exclude>org.apache.pig.**</exclude>
-                                    </excludes>
-                                </relocation>
-                                <relocation>
-                                    <pattern>com.fasterxml</pattern>
-                                    <shadedPattern>shaded.vespa</shadedPattern>
-                                </relocation>
-                                <relocation>
-                                    <pattern>org.codehaus</pattern>
-                                    <shadedPattern>shaded.vespa</shadedPattern>
-                                </relocation>
-                                <relocation>
-                                    <pattern>io.airlift</pattern>
-                                    <shadedPattern>shaded.vespa</shadedPattern>
-                                </relocation>
-                                <relocation>
-                                    <pattern>com.ctc.wstx</pattern>
-                                    <shadedPattern>shaded.vespa</shadedPattern>
-                                </relocation>
-                            </relocations>
-                        </configuration>
-                    </execution>
-                </executions>
-            </plugin>
-
-            <plugin>
-                <groupId>org.apache.maven.plugins</groupId>
-                <artifactId>maven-compiler-plugin</artifactId>
-                <configuration>
-                    <release>${vespaClients.jdk.releaseVersion}</release>
-                </configuration>
-            </plugin>
-        </plugins>
-
-    </build>
-</project>
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputCommitter.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputCommitter.java
deleted file mode 100644
index 6cb4ef45a96..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputCommitter.java
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce;
-
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.OutputCommitter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-
-import java.io.IOException;
-
-/**
- * The output committer describes the commit task output for a Map-Reduce
- * job. Not currently used, but is part of the Hadoop protocol since 2.7.
- *
- * @author lesters
- */
-public class VespaOutputCommitter extends OutputCommitter {
-    @Override
-    public void setupJob(JobContext jobContext) throws IOException {
-    }
-
-    @Override
-    public void setupTask(TaskAttemptContext taskAttemptContext) throws IOException {
-    }
-
-    @Override
-    public boolean needsTaskCommit(TaskAttemptContext taskAttemptContext) throws IOException {
-        return false;
-    }
-
-    @Override
-    public void commitTask(TaskAttemptContext taskAttemptContext) throws IOException {
-    }
-
-    @Override
-    public void abortTask(TaskAttemptContext taskAttemptContext) throws IOException {
-    }
-}
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputFormat.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputFormat.java
deleted file mode 100644
index e49a5e17970..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputFormat.java
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce;
-
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaCounters;
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.OutputCommitter;
-import org.apache.hadoop.mapreduce.OutputFormat;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-
-import java.io.IOException;
-import java.util.Properties;
-import java.util.logging.Logger;
-
-/**
- * An output specification for writing to Vespa instances in a Map-Reduce job.
- * Mainly returns an instance of a {@link VespaRecordWriter} that does the
- * actual feeding to Vespa.
- *
- * @author lesters
- */
-@SuppressWarnings("rawtypes")
-public class VespaOutputFormat extends OutputFormat {
-
-    private static final Logger log = Logger.getLogger(VespaOutputFormat.class.getName());
-
-    final Properties configOverride;
-
-    public VespaOutputFormat() {
-        super();
-        this.configOverride = null;
-    }
-
-    public VespaOutputFormat(Properties configOverride) {
-        super();
-        this.configOverride = configOverride;
-    }
-
-
-    @Override
-    @SuppressWarnings("deprecation")
-    public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException {
-        VespaCounters counters = VespaCounters.get(context);
-        VespaConfiguration configuration = VespaConfiguration.get(context.getConfiguration(), configOverride);
-        return new VespaRecordWriter(configuration, counters);
-    }
-
-
-    @Override
-    public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
-        return new VespaOutputCommitter();
-    }
-
-
-    @Override
-    public void checkOutputSpecs(JobContext jobContext) throws IOException, InterruptedException {
-    }
-
-}
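For context, a MapReduce job wired this output format in roughly the following way. This is a sketch against the removed classes, not a verbatim excerpt: the job name, endpoints and connection count are illustrative, and the vespa.feed.* keys are the ones defined in VespaConfiguration further down in this diff.

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.mapreduce.Job;
    import com.yahoo.vespa.hadoop.mapreduce.VespaOutputFormat;

    public class VespaFeedJobSketch {
        public static void main(String[] args) throws Exception {
            Configuration conf = new Configuration();
            conf.set("vespa.feed.endpoint", "feed-host-1,feed-host-2"); // VespaConfiguration.ENDPOINT; hosts illustrative
            conf.set("vespa.feed.connections", "4");                    // VespaConfiguration.CONNECTIONS
            Job job = Job.getInstance(conf, "vespa-feed");              // mapper and input setup omitted
            job.setOutputFormatClass(VespaOutputFormat.class);          // each emitted value is fed to Vespa as JSON
            System.exit(job.waitForCompletion(true) ? 0 : 1);
        }
    }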
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaRecordWriter.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaRecordWriter.java
deleted file mode 100644
index c450d7cdeef..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaRecordWriter.java
+++ /dev/null
@@ -1,153 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce;
-
-import ai.vespa.feed.client.FeedClient;
-import ai.vespa.feed.client.FeedClientBuilder;
-import ai.vespa.feed.client.JsonFeeder;
-import ai.vespa.feed.client.OperationParseException;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaCounters;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-
-import java.io.IOException;
-import java.net.URI;
-import java.time.Duration;
-import java.util.Arrays;
-import java.util.List;
-import java.util.concurrent.ThreadLocalRandom;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import static java.util.stream.Collectors.toList;
-
-/**
- * {@link VespaRecordWriter} sends the output <key, value> to one or more Vespa endpoints using vespa-feed-client.
- *
- * @author bjorncs
- */
-public class VespaRecordWriter extends RecordWriter<Object, Object> {
-
-    private final static Logger log = Logger.getLogger(VespaRecordWriter.class.getCanonicalName());
-
-    private final VespaCounters counters;
-    private final VespaConfiguration config;
-
-    private boolean initialized = false;
-    private JsonFeeder feeder;
-
-    protected VespaRecordWriter(VespaConfiguration config, VespaCounters counters) {
-        this.counters = counters;
-        this.config = config;
-    }
-
-    @Override
-    public void write(Object key, Object data) throws IOException {
-        initializeOnFirstWrite();
-        String json = data.toString().trim();
-        feeder.feedSingle(json)
-                .whenComplete((result, error) -> {
-                    if (error != null) {
-                        if (error instanceof OperationParseException) {
-                            counters.incrementDocumentsSkipped(1);
-                        } else {
-                            String msg = "Failed to feed single document: " + error;
-                            log.log(Level.WARNING, msg, error);
-                            counters.incrementDocumentsFailed(1);
-                        }
-                    } else {
-                        counters.incrementDocumentsOk(1);
-                    }
-                });
-        counters.incrementDocumentsSent(1);
-        if (counters.getDocumentsSent() % config.progressInterval() == 0) {
-            String progress = String.format("Feed progress: %d / %d / %d / %d (sent, ok, failed, skipped)",
-                    counters.getDocumentsSent(),
-                    counters.getDocumentsOk(),
-                    counters.getDocumentsFailed(),
-                    counters.getDocumentsSkipped());
-            log.info(progress);
-        }
-    }
-
-    @Override
-    public void close(TaskAttemptContext context) throws IOException {
-        if (feeder != null) {
-            feeder.close();
-            feeder = null;
-            initialized = false;
-        }
-    }
-
-    /** Override method to alter {@link FeedClient} configuration */
-    protected void onFeedClientInitialization(FeedClientBuilder builder) {}
-
-    private void initializeOnFirstWrite() {
-        if (initialized) return;
-        useRandomizedStartupDelayIfEnabled();
-        feeder = createJsonStreamFeeder();
-        initialized = true;
-    }
-
-    private void useRandomizedStartupDelayIfEnabled() {
-        if (!config.dryrun() && config.randomStartupSleepMs() > 0) {
-            int delay = ThreadLocalRandom.current().nextInt(config.randomStartupSleepMs());
-            log.info("Delaying startup by " + delay + " ms");
-            try {
-                Thread.sleep(delay);
-            } catch (Exception e) {}
-        }
-    }
-
-
-    private JsonFeeder createJsonStreamFeeder() {
-        FeedClient feedClient = createFeedClient();
-        JsonFeeder.Builder builder = JsonFeeder.builder(feedClient)
-                .withTimeout(Duration.ofMinutes(10));
-        if (config.route() != null) {
-            builder.withRoute(config.route());
-        }
-        return builder.build();
-
-    }
-
-    private FeedClient createFeedClient() {
-        List<URI> endpoints = endpointUris(config);
-        log.info("Using endpoints " + endpoints);
-        int streamsPerConnection = streamsPerConnection(config);
-        log.log(Level.INFO, "Using {0} max streams per connection", new Object[] {streamsPerConnection});
-        log.log(Level.INFO, "Using {0} connections", new Object[] {config.numConnections()});
-        FeedClientBuilder feedClientBuilder = FeedClientBuilder.create(endpoints)
-                .setConnectionsPerEndpoint(config.numConnections())
-                .setMaxStreamPerConnection(streamsPerConnection)
-                .setDryrun(config.dryrun())
-                .setRetryStrategy(retryStrategy(config));
-        if (config.proxyHost() != null) {
-            URI proxyUri = URI.create(String.format(
-                    "%s://%s:%d", config.proxyScheme(), config.proxyHost(), config.proxyPort()));
-            log.info("Using proxy " + proxyUri);
-            feedClientBuilder.setProxy(proxyUri);
-        }
-
-        onFeedClientInitialization(feedClientBuilder);
-        return feedClientBuilder.build();
-    }
-
-    private static FeedClient.RetryStrategy retryStrategy(VespaConfiguration config) {
-        int maxRetries = config.numRetries();
-        return new FeedClient.RetryStrategy() {
-            @Override public int retries() { return maxRetries; }
-        };
-    }
-
-    private static int streamsPerConnection(VespaConfiguration config) {
-        return Math.min(256, config.maxInFlightRequests() / config.numConnections());
-    }
-
-    private static List<URI> endpointUris(VespaConfiguration config) {
-        String scheme = config.useSSL().orElse(true) ? "https" : "http";
-        return Arrays.stream(config.endpoint().split(","))
-                .map(hostname -> URI.create(String.format("%s://%s:%d/", scheme, hostname, config.defaultPort())))
-                .collect(toList());
-    }
-}
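The writer above is a thin wrapper over vespa-feed-client. A minimal standalone sketch of the same feed path, using only calls that appear in the deleted code; the endpoint URI and document are illustrative:

    import ai.vespa.feed.client.FeedClient;
    import ai.vespa.feed.client.FeedClientBuilder;
    import ai.vespa.feed.client.JsonFeeder;

    import java.net.URI;
    import java.time.Duration;
    import java.util.List;

    public class FeedSketch {
        public static void main(String[] args) throws Exception {
            FeedClient client = FeedClientBuilder.create(List.of(URI.create("https://feed-host:4443/")))
                    .setConnectionsPerEndpoint(1)      // VespaConfiguration.CONNECTIONS default
                    .setMaxStreamPerConnection(128)    // capped at 256 by streamsPerConnection() above
                    .build();
            try (JsonFeeder feeder = JsonFeeder.builder(client).withTimeout(Duration.ofMinutes(10)).build()) {
                feeder.feedSingle("{\"put\":\"id:ns:music::1\",\"fields\":{\"title\":\"foo\"}}")
                      .whenComplete((result, error) -> { if (error != null) error.printStackTrace(); });
            } // close() drains pending operations before returning
        }
    }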
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaSimpleJsonInputFormat.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaSimpleJsonInputFormat.java
deleted file mode 100644
index f9bcba96a69..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaSimpleJsonInputFormat.java
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce;
-
-import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonFactoryBuilder;
-import com.fasterxml.jackson.core.JsonParser;
-import com.fasterxml.jackson.core.JsonToken;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.FileSplit;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-
-/**
- * Simple JSON reader which splits the input file along JSON object boundaries.
- *
- * There are two cases handled here:
- * 1. Each line contains a JSON object, i.e. { ... }
- * 2. The file contains an array of objects with arbitrary line breaks, i.e. [ {...}, {...} ]
- *
- * Not suitable for cases where you want to extract objects from some other arbitrary structure.
- *
- * TODO: Support config which points to a array in the JSON as start point for object extraction,
- * ala how it is done in VespaHttpClient.parseResultJson, i.e. support rootNode config.
- *
- * @author lesters
- */
-public class VespaSimpleJsonInputFormat extends FileInputFormat<Text, NullWritable> {
-
-    @Override
-    public RecordReader<Text, NullWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
-        return new VespaJsonRecordReader();
-    }
-
-    public static class VespaJsonRecordReader extends RecordReader<Text, NullWritable> {
-        private long remaining;
-        private JsonParser parser;
-        private Text currentKey;
-        private NullWritable currentValue = NullWritable.get();
-
-        @Override
-        public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
-            FileSplit fileSplit = (FileSplit) split;
-            FSDataInputStream stream = FileSystem.get(context.getConfiguration()).open(fileSplit.getPath());
-            if (fileSplit.getStart() != 0) {
-                stream.seek(fileSplit.getStart());
-            }
-
-            remaining = fileSplit.getLength();
-            JsonFactory factory = new JsonFactoryBuilder().disable(JsonFactory.Feature.CANONICALIZE_FIELD_NAMES).build();
-            parser = factory.createParser(new BufferedInputStream(stream));
-            parser.setCodec(new ObjectMapper());
-            parser.nextToken();
-            if (parser.currentToken() == JsonToken.START_ARRAY) {
-                parser.nextToken();
-            }
-        }
-
-        @Override
-        public boolean nextKeyValue() throws IOException, InterruptedException {
-            if (parser.currentToken() != JsonToken.START_OBJECT) {
-                return true;
-            }
-            currentKey = new Text(parser.readValueAsTree().toString());
-            parser.nextToken();
-            return false;
-        }
-
-        @Override
-        public Text getCurrentKey() throws IOException, InterruptedException {
-            return currentKey;
-        }
-
-        @Override
-        public NullWritable getCurrentValue() throws IOException, InterruptedException {
-            return currentValue;
-        }
-
-        @Override
-        public float getProgress() throws IOException, InterruptedException {
-            return parser.getCurrentLocation().getByteOffset() / remaining;
-        }
-
-        @Override
-        public void close() throws IOException {
-            parser.close();
-        }
-    }
-
-}
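Wiring the input format into a job looked roughly like this (a sketch; the input path is illustrative). The reader emits each JSON object as the record key, whether the file holds one object per line or a single top-level array:

    import org.apache.hadoop.fs.Path;
    import org.apache.hadoop.mapreduce.Job;
    import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
    import com.yahoo.vespa.hadoop.mapreduce.VespaSimpleJsonInputFormat;

    public class JsonInputSketch {
        public static void main(String[] args) throws Exception {
            Job job = Job.getInstance();
            job.setInputFormatClass(VespaSimpleJsonInputFormat.class); // key = JSON object, value = NullWritable
            FileInputFormat.addInputPath(job, new Path("/data/docs.json")); // illustrative path
        }
    }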
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/package-info.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/package-info.java
deleted file mode 100644
index 22a742566cd..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/package-info.java
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-/**
- * com.yahoo.vespa.hadoop.mapreduce contains classes and utilities
- * to enable feeding directly to Vespa endpoints from mapreduce.
- * It is a minimal layer over the Vespa HTTP client.
- *
- * NOTE: This is a PUBLIC API, but not annotated as such because this is not a bundle and
- * we don't want to introduce Vespa dependencies.
- */
-package com.yahoo.vespa.hadoop.mapreduce;
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/TupleTools.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/TupleTools.java
deleted file mode 100644
index 5147dc3496c..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/TupleTools.java
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce.util;
-
-import org.apache.pig.ResourceSchema;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-public class TupleTools {
-
-    private static final Pattern pattern = Pattern.compile("<([\\w]+)>");
-
-    public static Map<String, Object> tupleMap(Schema schema, Tuple tuple) throws IOException {
-        Map<String, Object> tupleMap = new HashMap<>((int)Math.ceil(tuple.size() / 0.75) + 1);
-        List<Schema.FieldSchema> schemas = schema.getFields();
-        for (int i = 0; i < schemas.size(); i++) {
-            Schema.FieldSchema field = schemas.get(i);
-            String alias = field.alias;
-            Object value = tuple.get(i);
-            if (value != null) {
-                tupleMap.put(alias, value);
-            }
-        }
-        return tupleMap;
-    }
-
-    public static Map<String, Object> tupleMap(ResourceSchema schema, Tuple tuple) throws IOException {
-        Map<String, Object> tupleMap = new HashMap<>((int)Math.ceil(tuple.size() / 0.75) + 1);
-        ResourceSchema.ResourceFieldSchema[] schemas = schema.getFields();
-        for (int i = 0; i < schemas.length; i++) {
-            ResourceSchema.ResourceFieldSchema field = schemas[i];
-            String alias = field.getName();
-            Object value = tuple.get(i);
-            if (value != null) {
-                tupleMap.put(alias, value);
-            }
-        }
-        return tupleMap;
-    }
-
-    public static String toString(Schema schema, Tuple tuple, String template) throws IOException {
-        return toString(tupleMap(schema, tuple), template);
-    }
-
-    public static String toString(Map<String,Object> fields, String template) {
-        if (template == null || template.length() == 0) {
-            return template;
-        }
-        if (fields == null || fields.size() == 0) {
-            return template;
-        }
-
-        Matcher m = pattern.matcher(template);
-        StringBuffer sb = new StringBuffer();
-        while (m.find()) {
-            Object value = fields.get(m.group(1));
-            String replacement = value != null ? value.toString() : m.group(0);
-            m.appendReplacement(sb, Matcher.quoteReplacement(replacement));
-        }
-        m.appendTail(sb);
-        return sb.toString();
-    }
-
-}
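TupleTools drives document-id templating: <name> tokens in a template are replaced by the corresponding field values, and unresolved tokens are left in place. A quick check of that behavior, directly against the class above (field names illustrative):

    import com.yahoo.vespa.hadoop.mapreduce.util.TupleTools;

    import java.util.Map;

    public class TemplateSketch {
        public static void main(String[] args) {
            Map<String, Object> fields = Map.of("artist", "bowie", "track", "heroes");
            String docId = TupleTools.toString(fields, "id:music:music::<artist>-<track>");
            System.out.println(docId); // id:music:music::bowie-heroes
        }
    }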
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaConfiguration.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaConfiguration.java
deleted file mode 100644
index ae0b6a58155..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaConfiguration.java
+++ /dev/null
@@ -1,194 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce.util;
-
-import org.apache.hadoop.conf.Configuration;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.Optional;
-import java.util.Properties;
-
-public class VespaConfiguration {
-
-    public static final String ENDPOINT = "vespa.feed.endpoint";
-    public static final String DEFAULT_PORT = "vespa.feed.defaultport";
-    public static final String USE_SSL = "vespa.feed.ssl";
-    public static final String PROXY_HOST = "vespa.feed.proxy.host";
-    public static final String PROXY_PORT = "vespa.feed.proxy.port";
-    public static final String PROXY_SCHEME = "vespa.feed.proxy.scheme";
-    public static final String DRYRUN = "vespa.feed.dryrun";
-    public static final String USE_COMPRESSION = "vespa.feed.usecompression";
-    public static final String PROGRESS_REPORT = "vespa.feed.progress.interval";
-    public static final String CONNECTIONS = "vespa.feed.connections";
-    public static final String THROTTLER_MIN_SIZE = "vespa.feed.throttler.min.size";
-    public static final String QUERY_CONNECTION_TIMEOUT = "vespa.query.connection.timeout";
-    public static final String ROUTE = "vespa.feed.route";
-    public static final String MAX_SLEEP_TIME_MS = "vespa.feed.max.sleep.time.ms";
-    public static final String MAX_IN_FLIGHT_REQUESTS = "vespa.feed.max.in.flight.requests";
-    public static final String RANDOM_STARTUP_SLEEP = "vespa.feed.random.startup.sleep.ms";
-    public static final String NUM_RETRIES = "vespa.feed.num.retries";
-
-    private final Configuration conf;
-    private final Properties override;
-
-    private VespaConfiguration(Configuration conf, Properties override) {
-        this.conf = conf;
-        this.override = override;
-    }
-
-
-    public static VespaConfiguration get(Configuration conf, Properties override) {
-        return new VespaConfiguration(conf, override);
-    }
-
-
-    public String endpoint() {
-        return getString(ENDPOINT);
-    }
-
-
-    public int defaultPort() {
-        return getInt(DEFAULT_PORT, 4080);
-    }
-
-
-    public Optional<Boolean> useSSL() {
-        String raw = getString(USE_SSL);
-        if (raw == null || raw.trim().isEmpty()) return Optional.empty();
-        return Optional.of(Boolean.parseBoolean(raw));
-    }
-
-
-    public String proxyHost() {
-        return getString(PROXY_HOST);
-    }
-
-
-    public int proxyPort() {
-        return getInt(PROXY_PORT, 4080);
-    }
-
-
-    public String proxyScheme() {
-        String raw = getString(PROXY_SCHEME);
-        if (raw == null) return "http";
-        return raw;
-    }
-
-
-    public boolean dryrun() {
-        return getBoolean(DRYRUN, false);
-    }
-
-
-    public boolean useCompression() {
-        return getBoolean(USE_COMPRESSION, true);
-    }
-
-
-    public int numConnections() {
-        return getInt(CONNECTIONS, 1);
-    }
-
-
-    public int throttlerMinSize() {
-        return getInt(THROTTLER_MIN_SIZE, 0);
-    }
-
-
-    public int queryConnectionTimeout() {
-        return getInt(QUERY_CONNECTION_TIMEOUT, 10000);
-    }
-
-
-    public String route() {
-        return getString(ROUTE);
-    }
-
-
-    public int maxSleepTimeMs() {
-        return getInt(MAX_SLEEP_TIME_MS, 10000);
-    }
-
-
-    public int maxInFlightRequests() {
-        return getInt(MAX_IN_FLIGHT_REQUESTS, 500);
-    }
-
-
-    public int randomStartupSleepMs() {
-        return getInt(RANDOM_STARTUP_SLEEP, 30000);
-    }
-
-
-    public int numRetries() {
-        return getInt(NUM_RETRIES, 100);
-    }
-
-
-    public int progressInterval() {
-        return getInt(PROGRESS_REPORT, 1000);
-    }
-
-    public String getString(String name) {
-        if (override != null && override.containsKey(name)) {
-            return override.getProperty(name);
-        }
-        return conf != null ? conf.get(name) : null;
-    }
-
-
-    public int getInt(String name, int defaultValue) {
-        if (override != null && override.containsKey(name)) {
-            return Integer.parseInt(override.getProperty(name));
-        }
-        return conf != null ? conf.getInt(name, defaultValue) : defaultValue;
-    }
-
-
-    public boolean getBoolean(String name, boolean defaultValue) {
-        if (override != null && override.containsKey(name)) {
-            return Boolean.parseBoolean(override.getProperty(name));
-        }
-        return conf != null ? conf.getBoolean(name, defaultValue) : defaultValue;
-
-    }
-
-    public static Properties loadProperties(String... params) {
-        Properties properties = new Properties();
-        if (params != null) {
-            for (String s : params) {
-                try {
-                    properties.load(new StringReader(s));
-                } catch (IOException e) {
-                    throw new IllegalArgumentException(e);
-                }
-            }
-        }
-        return properties;
-    }
-
-
-    public String toString() {
-        StringBuilder sb = new StringBuilder();
-        sb.append(ENDPOINT + ": " + endpoint() + "\n");
-        sb.append(DEFAULT_PORT + ": " + defaultPort() + "\n");
-        sb.append(USE_SSL + ": " + useSSL().map(Object::toString).orElse("<empty>") + "\n");
-        sb.append(PROXY_HOST + ": " + proxyHost() + "\n");
-        sb.append(PROXY_PORT + ": " + proxyPort() + "\n");
-        sb.append(PROXY_SCHEME + ": " + proxyScheme() + "\n");
-        sb.append(DRYRUN + ": " + dryrun() +"\n");
-        sb.append(USE_COMPRESSION + ": " + useCompression() +"\n");
-        sb.append(PROGRESS_REPORT + ": " + progressInterval() +"\n");
-        sb.append(CONNECTIONS + ": " + numConnections() +"\n");
-        sb.append(THROTTLER_MIN_SIZE + ": " + throttlerMinSize() +"\n");
-        sb.append(QUERY_CONNECTION_TIMEOUT + ": " + queryConnectionTimeout() +"\n");
-        sb.append(ROUTE + ": " + route() +"\n");
-        sb.append(MAX_SLEEP_TIME_MS + ": " + maxSleepTimeMs() +"\n");
-        sb.append(MAX_IN_FLIGHT_REQUESTS + ": " + maxInFlightRequests() +"\n");
-        sb.append(RANDOM_STARTUP_SLEEP + ": " + randomStartupSleepMs() +"\n");
-        sb.append(NUM_RETRIES + ": " + numRetries() +"\n");
-        return sb.toString();
    }
-
-}
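Resolution order in the class above: an explicit Properties override wins over the Hadoop Configuration, which wins over the built-in default. A sketch of that precedence (key strings as defined above):

    import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
    import org.apache.hadoop.conf.Configuration;

    import java.util.Properties;

    public class ConfigSketch {
        public static void main(String[] args) {
            Configuration conf = new Configuration();
            conf.set(VespaConfiguration.CONNECTIONS, "2");
            Properties override = VespaConfiguration.loadProperties("vespa.feed.connections=4\nvespa.feed.dryrun=true");
            VespaConfiguration config = VespaConfiguration.get(conf, override);
            System.out.println(config.numConnections()); // 4 -- the override beats the Configuration
            System.out.println(config.defaultPort());    // 4080 -- the built-in default
        }
    }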
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaCounters.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaCounters.java
deleted file mode 100644
index 63b4b6600fd..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaCounters.java
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce.util;
-
-import org.apache.hadoop.mapreduce.Counter;
-import org.apache.hadoop.mapreduce.Counters;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-
-import java.io.IOException;
-
-public class VespaCounters {
-
-    public static final String GROUP = "Vespa Feed Counters";
-    public static final String DOCS_OK = "Documents ok";
-    public static final String DOCS_SENT = "Documents sent";
-    public static final String DOCS_FAILED = "Documents failed";
-    public static final String DOCS_SKIPPED = "Documents skipped";
-
-    private final Counter documentsSent;
-    private final Counter documentsOk;
-    private final Counter documentsFailed;
-    private final Counter documentsSkipped;
-
-
-    private VespaCounters(Job job) throws IOException {
-        Counters counters = job.getCounters();
-        documentsSent = counters.findCounter(GROUP, DOCS_SENT);
-        documentsOk = counters.findCounter(GROUP, DOCS_OK);
-        documentsFailed = counters.findCounter(GROUP, DOCS_FAILED);
-        documentsSkipped = counters.findCounter(GROUP, DOCS_SKIPPED);
-    }
-
-
-    private VespaCounters(TaskAttemptContext context) {
-        documentsSent = context.getCounter(GROUP, DOCS_SENT);
-        documentsOk = context.getCounter(GROUP, DOCS_OK);
-        documentsFailed = context.getCounter(GROUP, DOCS_FAILED);
-        documentsSkipped = context.getCounter(GROUP, DOCS_SKIPPED);
-    }
-
-
-    private VespaCounters(org.apache.hadoop.mapred.Counters counters) {
-        documentsSent = counters.findCounter(GROUP, DOCS_SENT);
-        documentsOk = counters.findCounter(GROUP, DOCS_OK);
-        documentsFailed = counters.findCounter(GROUP, DOCS_FAILED);
-        documentsSkipped = counters.findCounter(GROUP, DOCS_SKIPPED);
-    }
-
-
-    public static VespaCounters get(Job job) throws IOException {
-        return new VespaCounters(job);
-    }
-
-
-    public static VespaCounters get(TaskAttemptContext context) {
-        return new VespaCounters(context);
-    }
-
-
-    public static VespaCounters get(org.apache.hadoop.mapred.Counters counters) {
-        return new VespaCounters(counters);
-
-    }
-
-
-    public long getDocumentsSent() {
-        return documentsSent.getValue();
-    }
-
-
-    public void incrementDocumentsSent(long incr) {
-        documentsSent.increment(incr);
-    }
-
-
-    public long getDocumentsOk() {
-        return documentsOk.getValue();
-    }
-
-
-    public void incrementDocumentsOk(long incr) {
-        documentsOk.increment(incr);
-    }
-
-
-    public long getDocumentsFailed() {
-        return documentsFailed.getValue();
-    }
-
-
-    public void incrementDocumentsFailed(long incr) {
-        documentsFailed.increment(incr);
-    }
-
-
-    public long getDocumentsSkipped() {
-        return documentsSkipped.getValue();
-    }
-
-
-    public void incrementDocumentsSkipped(long incr) {
-        documentsSkipped.increment(incr);
-    }
-
-}
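Reading the feed counters back after a job has completed is a one-liner per counter; a sketch assuming a finished Job handle:

    import com.yahoo.vespa.hadoop.mapreduce.util.VespaCounters;
    import org.apache.hadoop.mapreduce.Job;

    public class CounterSketch {
        static void report(Job job) throws java.io.IOException {
            VespaCounters counters = VespaCounters.get(job); // reads the "Vespa Feed Counters" group
            System.out.println("ok=" + counters.getDocumentsOk() + " failed=" + counters.getDocumentsFailed());
        }
    }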
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaHttpClient.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaHttpClient.java
deleted file mode 100644
index c7ed52a01c0..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaHttpClient.java
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce.util;
-
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import org.apache.http.HttpEntity;
-import org.apache.http.HttpHost;
-import org.apache.http.HttpResponse;
-import org.apache.http.client.HttpClient;
-import org.apache.http.client.config.RequestConfig;
-import org.apache.http.client.methods.HttpGet;
-import org.apache.http.impl.client.HttpClientBuilder;
-import org.apache.http.util.EntityUtils;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Scanner;
-
-public class VespaHttpClient {
-
-    private final HttpClient httpClient;
-
-    public VespaHttpClient() {
-        this(null);
-    }
-
-    public VespaHttpClient(VespaConfiguration configuration) {
-        httpClient = createClient(configuration);
-    }
-
-    public String get(String url) throws IOException {
-        HttpGet httpGet = new HttpGet(url);
-        HttpResponse httpResponse = httpClient.execute(httpGet);
-
-        HttpEntity entity = httpResponse.getEntity();
-        InputStream is = entity.getContent();
-
-        String result = "";
-        Scanner scanner = new Scanner(is, "UTF-8").useDelimiter("\\A");
-        if (scanner.hasNext()) {
-            result = scanner.next();
-        }
-        EntityUtils.consume(entity);
-
-        if (httpResponse.getStatusLine().getStatusCode() != 200) {
-            return null;
-        }
-
-        return result;
-    }
-
-    public JsonNode parseResultJson(String json, String rootNode) throws IOException {
-        if (json == null || json.isEmpty()) {
-            return null;
-        }
-        if (rootNode == null || rootNode.isEmpty()) {
-            return null;
-        }
-
-        ObjectMapper m = new ObjectMapper();
-        JsonNode node = m.readTree(json);
-        if (node != null) {
-            String[] path = rootNode.split("/");
-            for (String p : path) {
-                node = node.get(p);
-
-                if (node == null) {
-                    return null;
-                }
-
-                // if node is an array, return the first node that has the correct path
-                if (node.isArray()) {
-                    for (int i = 0; i < node.size(); ++i) {
-                        JsonNode n = node.get(i);
-                        if (n.has(p)) {
-                            node = n;
-                            break;
-                        }
-                    }
-                }
-
-            }
-        }
-        return node;
-    }
-
-    private HttpClient createClient(VespaConfiguration configuration) {
-        HttpClientBuilder clientBuilder = HttpClientBuilder.create();
-
-        RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
-        if (configuration != null) {
-            requestConfigBuilder.setSocketTimeout(configuration.queryConnectionTimeout());
-            requestConfigBuilder.setConnectTimeout(configuration.queryConnectionTimeout());
-            if (configuration.proxyHost() != null) {
-                requestConfigBuilder.setProxy(new HttpHost(configuration.proxyHost(), configuration.proxyPort()));
-            }
-        }
-        clientBuilder.setDefaultRequestConfig(requestConfigBuilder.build());
-        return clientBuilder.build();
-    }
-
-}
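parseResultJson above walks a '/'-separated path into the response and returns null if any step is missing. A sketch against a standard Vespa search response, where hits sit under root/children; the query URL is illustrative:

    import com.fasterxml.jackson.databind.JsonNode;
    import com.yahoo.vespa.hadoop.mapreduce.util.VespaHttpClient;

    public class QuerySketch {
        public static void main(String[] args) throws java.io.IOException {
            VespaHttpClient client = new VespaHttpClient();
            String response = client.get("http://query-host:8080/search/?query=title:foo"); // illustrative URL
            JsonNode hits = client.parseResultJson(response, "root/children");
            if (hits != null) System.out.println(hits.size()); // number of returned hits
        }
    }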
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaQuerySchema.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaQuerySchema.java
deleted file mode 100644
index cfaff44addb..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaQuerySchema.java
+++ /dev/null
@@ -1,114 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce.util;
-
-import com.fasterxml.jackson.databind.JsonNode;
-import org.apache.pig.data.DataType;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.data.TupleFactory;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.apache.pig.impl.util.Utils;
-import org.apache.pig.parser.ParserException;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-public class VespaQuerySchema implements Iterable<VespaQuerySchema.AliasTypePair> {
-
-    private final List<AliasTypePair> tupleSchema = new ArrayList<>();
-
-    public VespaQuerySchema(String schema) {
-        for (String e : schema.split(",")) {
-            String[] pair = e.split(":");
-            String alias = pair[0].trim();
-            String type = pair[1].trim();
-            tupleSchema.add(new AliasTypePair(alias, type));
-        }
-    }
-
-    public Tuple buildTuple(int rank, JsonNode hit) {
-        Tuple tuple = TupleFactory.getInstance().newTuple();
-
-        for (VespaQuerySchema.AliasTypePair tupleElement : tupleSchema) {
-            String alias = tupleElement.getAlias();
-            Byte type = DataType.findTypeByName(tupleElement.getType());
-
-            // reserved word
-            if ("rank".equals(alias)) {
-                tuple.append(rank);
-            } else {
-                JsonNode field = hit;
-                String[] path = alias.split("/"); // move outside
-                for (String p : path) {
-                    field = field.get(p);
-                    if (field == null) {
-                        type = DataType.NULL; // effectively skip field as it is not found
-                        break;
-                    }
-                }
-                switch (type) {
-                    case DataType.BOOLEAN:
-                        tuple.append(field.asBoolean());
-                        break;
-                    case DataType.INTEGER:
-                        tuple.append(field.asInt());
-                        break;
-                    case DataType.LONG:
-                        tuple.append(field.asLong());
-                        break;
-                    case DataType.FLOAT:
-                    case DataType.DOUBLE:
-                        tuple.append(field.asDouble());
-                        break;
-                    case DataType.DATETIME:
-                        tuple.append(field.asText());
-                        break;
-                    case DataType.CHARARRAY:
-                        tuple.append(field.asText());
-                        break;
-                    default:
-                        // the rest of the data types are currently not supported
                }
-            }
-        }
-        return tuple;
-    }
-
-    public static Schema getPigSchema(String schemaString) {
-        Schema schema = null;
-        schemaString = schemaString.replace("/", "_");
-        schemaString = "{(" + schemaString + ")}";
-        try {
-            schema = Utils.getSchemaFromString(schemaString);
-        } catch (ParserException e) {
-            e.printStackTrace();
-        }
-        return schema;
-    }
-
-    @Override
-    public Iterator<AliasTypePair> iterator() {
-        return tupleSchema.iterator();
-    }
-
-
-    public static class AliasTypePair {
-        private final String alias;
-        private final String type;
-
-        AliasTypePair(String alias, String type) {
-            this.alias = alias;
-            this.type = type;
-        }
-
-        public String getAlias() {
-            return alias;
-        }
-
-        public String getType() {
-            return type;
-        }
-
-    }
-
-}
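VespaQuerySchema maps a comma-separated alias:type list onto Pig tuples; "rank" is reserved for the hit's position, and a '/' in an alias is treated as a JSON path into the hit (and becomes '_' in the generated Pig schema). A sketch under those assumptions:

    import com.yahoo.vespa.hadoop.mapreduce.util.VespaQuerySchema;
    import org.apache.pig.impl.logicalLayer.schema.Schema;

    public class SchemaSketch {
        public static void main(String[] args) {
            String spec = "rank:int,title:chararray,fields/popularity:double"; // illustrative aliases
            VespaQuerySchema schema = new VespaQuerySchema(spec);   // used by buildTuple(rank, hit)
            Schema pigSchema = VespaQuerySchema.getPigSchema(spec); // {(rank:int,title:chararray,fields_popularity:double)}
            System.out.println(pigSchema);
        }
    }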
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/package-info.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/package-info.java
deleted file mode 100644
index 41c621d2877..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/package-info.java
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-/**
- * com.yahoo.vespa.hadoop contains classes and utilities
- * to enable feeding directly to Vespa endpoints from pig and mapreduce.
- * It is a minimal layer over the Vespa HTTP client.
- *
- * NOTE: This is a PUBLIC API, but not annotated as such because this is not a bundle and
- * we don't want to introduce Vespa dependencies.
- */
-package com.yahoo.vespa.hadoop;
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperation.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperation.java
deleted file mode 100644
index c95aa02215f..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperation.java
+++ /dev/null
@@ -1,669 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.pig;
-
-import com.fasterxml.jackson.core.JsonEncoding;
-import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonGenerator;
-import com.yahoo.vespa.hadoop.mapreduce.util.TupleTools;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
-import org.apache.pig.EvalFunc;
-import org.apache.pig.PigWarning;
-import org.apache.pig.data.DataBag;
-import org.apache.pig.data.DataByteArray;
-import org.apache.pig.data.DataType;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.tools.pigstats.PigStatusReporter;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.joda.time.DateTime;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.io.StringWriter;
-import java.io.UncheckedIOException;
-import java.math.BigDecimal;
-import java.math.BigInteger;
-import java.util.*;
-
-/**
- * A Pig UDF to convert simple Pig types into a valid Vespa JSON document format.
- *
- * @author lesters
- */
-public class VespaDocumentOperation extends EvalFunc<String> {
-
-    public enum Operation {
-        DOCUMENT,
-        PUT,
-        ID,
-        REMOVE,
-        UPDATE;
-
-        @Override
-        public String toString() {
-            return super.toString().toLowerCase();
-        }
-
-        public static Operation fromString(String text) {
-            for (Operation op : Operation.values()) {
-                if (op.toString().equalsIgnoreCase(text)) {
-                    return op;
-                }
-            }
-            throw new IllegalArgumentException("Unknown operation: " + text);
-        }
-
-        public static boolean valid(String text) {
-            for (Operation op : Operation.values()) {
-                if (op.toString().equalsIgnoreCase(text)) {
-                    return true;
-                }
-            }
-            return false;
-        }
-
-    }
-
-    private static final String PROPERTY_CREATE_IF_NON_EXISTENT = "create-if-non-existent";
-    private static final String PROPERTY_ID_TEMPLATE = "docid";
-    private static final String PROPERTY_OPERATION = "operation";
-    private static final String PROPERTY_VERBOSE = "verbose";
-    private static final String BAG_AS_MAP_FIELDS = "bag-as-map-fields";
-    private static final String SIMPLE_ARRAY_FIELDS = "simple-array-fields";
-    private static final String SIMPLE_OBJECT_FIELDS = "simple-object-fields";
-    private static final String CREATE_TENSOR_FIELDS = "create-tensor-fields";
-    private static final String REMOVE_TENSOR_FIELDS = "remove-tensor-fields";
-    private static final String UPDATE_TENSOR_FIELDS = "update-tensor-fields";
-    private static final String REMOVE_MAP_FIELDS = "remove-map-fields";
-    private static final String UPDATE_MAP_FIELDS = "update-map-fields";
-    private static final String EXCLUDE_FIELDS = "exclude-fields";
-    private static final String TESTSET_CONDITION = "condition";
-    private static final String PARTIAL_UPDATE_ASSIGN = "assign";
-    private static final String PARTIAL_UPDATE_ADD = "add";
-    private static final String PARTIAL_UPDATE_REMOVE = "remove";
-
-    private static Map<String, String> mapPartialOperationMap;
-
-    static {
-        mapPartialOperationMap = new HashMap<>();
-        mapPartialOperationMap.put(REMOVE_MAP_FIELDS, PARTIAL_UPDATE_REMOVE);
-        mapPartialOperationMap.put(UPDATE_MAP_FIELDS, PARTIAL_UPDATE_ASSIGN);
-    }
-
-    private static Map<String, String> partialOperationMap;
-
-    static {
-        partialOperationMap = new HashMap<>();
-        partialOperationMap.put(REMOVE_TENSOR_FIELDS, PARTIAL_UPDATE_REMOVE);
-        partialOperationMap.put(UPDATE_TENSOR_FIELDS, PARTIAL_UPDATE_ADD);
-        partialOperationMap.put(REMOVE_MAP_FIELDS, PARTIAL_UPDATE_REMOVE);
-        partialOperationMap.put(UPDATE_MAP_FIELDS, PARTIAL_UPDATE_ASSIGN);
-    }
-
-    private final boolean verbose;
-    private final String template;
-    private final Operation operation;
-    private final Properties properties;
-    private PigStatusReporter statusReporter;
-
-    public VespaDocumentOperation(String... params) {
-        statusReporter = PigStatusReporter.getInstance();
-        if (statusReporter != null) {
-            statusReporter.incrCounter("Vespa Document Operation Counters", "Document operation ok", 0);
-            statusReporter.incrCounter("Vespa Document Operation Counters", "Document operation failed", 0);
-        }
-        properties = VespaConfiguration.loadProperties(params);
-        template = properties.getProperty(PROPERTY_ID_TEMPLATE);
-        operation = Operation.fromString(properties.getProperty(PROPERTY_OPERATION, "put"));
-        verbose = Boolean.parseBoolean(properties.getProperty(PROPERTY_VERBOSE, "false"));
-    }
-
-    @Override
-    public String exec(Tuple tuple) throws IOException {
-        if (tuple == null || tuple.size() == 0) {
-            if (statusReporter != null) {
-                statusReporter.incrCounter("Vespa Document Operation Counters", "Document operation failed", 1);
-            }
-            return null;
-        }
-        if (template == null || template.length() == 0) {
-            if (statusReporter != null) {
-                statusReporter.incrCounter("Vespa Document Operation Counters", "Document operation failed", 1);
-            }
-            warnLog("No valid document id template found. Skipping.", PigWarning.UDF_WARNING_1);
-            return null;
-        }
-        if (operation == null) {
-            warnLog("No valid operation found. Skipping.", PigWarning.UDF_WARNING_2);
-            return null;
-        }
-
-        String json = null;
-
-        try {
-            if (reporter != null) {
-                reporter.progress();
-            }
-
-            Schema inputSchema = getInputSchema();
-            Map<String, Object> fields = TupleTools.tupleMap(inputSchema, tuple);
-            String docId = TupleTools.toString(fields, template);
-            if (verbose) {
-                System.out.println("Processing docId: "+ docId);
-            }
-            // create json
-            json = create(operation, docId, fields, properties, inputSchema);
-            if (json == null || json.length() == 0) {
-                warnLog("No valid document operation could be created.", PigWarning.UDF_WARNING_3);
-                return null;
-            }
-
-
-        } catch (Exception e) {
-            if (statusReporter != null) {
-                statusReporter.incrCounter("Vespa Document Operation Counters", "Document operation failed", 1);
-            }
-            StringBuilder sb = new StringBuilder();
-            sb.append("Caught exception processing input row: \n");
-            sb.append(tuple.toString());
-            sb.append("\nException: ");
-            sb.append(getStackTraceAsString(e));
-            warnLog(sb.toString(), PigWarning.UDF_WARNING_4);
-            return null;
-        }
-        if (statusReporter != null) {
-            statusReporter.incrCounter("Vespa Document Operation Counters", "Document operation ok", 1);
-        }
-        return json;
-    }
-
-
-    /**
-     * Create a JSON Vespa document operation given the supplied fields,
-     * operation and document id template.
-     *
-     * @param op     Operation (put, remove, update)
-     * @param docId  Document id
-     * @param fields Fields to put in document operation
-     * @return       A valid JSON Vespa document operation
-     * @throws IOException ...
- */ - public static String create(Operation op, String docId, Map<String, Object> fields, Properties properties, - Schema schema) throws IOException { - if (op == null) { - return null; - } - if (docId == null || docId.length() == 0) { - return null; - } - if (fields.isEmpty()) { - return null; - } - - // create json format - ByteArrayOutputStream out = new ByteArrayOutputStream(); - JsonGenerator g = new JsonFactory().createGenerator(out, JsonEncoding.UTF8); - g.writeStartObject(); - - g.writeStringField(op.toString(), docId); - - boolean createIfNonExistent = Boolean.parseBoolean(properties.getProperty(PROPERTY_CREATE_IF_NON_EXISTENT, "false")); - if (op == Operation.UPDATE && createIfNonExistent) { - writeField("create", true, DataType.BOOLEAN, g, properties, schema, op, 0); - } - String testSetConditionTemplate = properties.getProperty(TESTSET_CONDITION); - if (testSetConditionTemplate != null) { - String testSetCondition = TupleTools.toString(fields, testSetConditionTemplate); - writeField(TESTSET_CONDITION, testSetCondition, DataType.CHARARRAY, g, properties, schema, op, 0); - } - if (op != Operation.REMOVE) { - writeField("fields", fields, DataType.MAP, g, properties, schema, op, 0); - } - - g.writeEndObject(); - g.close(); - - return out.toString(); - } - - private static String getPartialOperation(Map<String, String> operationMap, String name, Properties properties) { - // This function checks if the property of the name falls into the map provided - // if yes, return the desired operation. if no, return null - // for example, input: - // operationMap map{"update-map-fields":"assign","remove-map-fields":"remove"} - // name date - // properties "update-map-fields":"date,month" - // output: assign - for (String label : operationMap.keySet()) { - if (properties.getProperty(label) != null) { - String[] p = properties.getProperty(label).split(","); - if (Arrays.asList(p).contains(name)) { - return operationMap.get(label); - } - } - } - return null; - } - - @SuppressWarnings("unchecked") - private static void writeField(String name, Object value, Byte type, JsonGenerator g, Properties properties, Schema schema, Operation op, int depth) throws IOException { - if (shouldWriteField(name, properties, depth)) { - String operation = getPartialOperation(mapPartialOperationMap, name, properties); - // check if the name has the property update-map-fields/remove-map-fields - // if yes, we need special treatments here as we need to loop through the tuple - // be aware the the operation here is not vespa operation such as "put" and "update" - // operation here are the field name we wish use to such as "assign" and "remove" - if (operation != null) { - writePartialUpdateAndRemoveMap(name, value, g, properties, schema, op, depth, operation); - } else { - g.writeFieldName(name); - if (shouldWritePartialUpdate(op, depth)) { - writePartialUpdate(value, type, g, name, properties, schema, op, depth); - } else { - writeValue(value, type, g, name, properties, schema, op, depth); - } - } - - } - } - - private static void writePartialUpdateAndRemoveMap(String name, Object value, JsonGenerator g, Properties properties, Schema schema, Operation op, int depth, String operation) throws IOException { - schema = (schema != null) ? schema.getField(0).schema : null; - // extract the key of map and keys in map for writing json when partial updating maps - Schema valueSchema = (schema != null) ? 
- // Data format: { (key: id, value: (abc,123,(123234,bbaa))) } - // The first element of each tuple in the bag is the key of the map entry to update; - // the second element is the new value for that entry. - DataBag bag = (DataBag) value; - for (Tuple element : bag) { - if (element.size() != 2) { - continue; - } - String k = (String) element.get(0); - Object v = element.get(1); - Byte t = DataType.findType(v); - if (t == DataType.TUPLE) { - g.writeFieldName(name + "{" + k + "}"); - if (operation.equals(PARTIAL_UPDATE_REMOVE)) { - g.writeStartObject(); - g.writeFieldName(PARTIAL_UPDATE_REMOVE); - g.writeNumber(0); - g.writeEndObject(); - } else { - writePartialUpdate(v, t, g, name, properties, valueSchema, op, depth); - } - } - } - } - - @SuppressWarnings("unchecked") - private static void writeValue(Object value, Byte type, JsonGenerator g, String name, Properties properties, Schema schema, Operation op, int depth) throws IOException { - switch (type) { - case DataType.UNKNOWN: - break; - case DataType.NULL: - g.writeNull(); - break; - case DataType.BOOLEAN: - g.writeBoolean((boolean) value); - break; - case DataType.INTEGER: - g.writeNumber((int) value); - break; - case DataType.LONG: - g.writeNumber((long) value); - break; - case DataType.FLOAT: - g.writeNumber((float) value); - break; - case DataType.DOUBLE: - g.writeNumber((double) value); - break; - case DataType.DATETIME: - g.writeNumber(((DateTime) value).getMillis()); - break; - case DataType.BYTEARRAY: - DataByteArray bytes = (DataByteArray) value; - String raw = Base64.getEncoder().encodeToString(bytes.get()); - g.writeString(raw); - break; - case DataType.CHARARRAY: - g.writeString((String) value); - break; - case DataType.BIGINTEGER: - g.writeNumber((BigInteger) value); - break; - case DataType.BIGDECIMAL: - g.writeNumber((BigDecimal) value); - break; - case DataType.MAP: - g.writeStartObject(); - Map<Object, Object> map = (Map<Object, Object>) value; - if (shouldCreateTensor(map, name, properties)) { - if (isRemoveTensor(name, properties)) { - writeRemoveTensor(map, g); - } else { - writeTensor(map, g); - } - } else { - for (Map.Entry<Object, Object> entry : map.entrySet()) { - String k = entry.getKey().toString(); - Object v = entry.getValue(); - Byte t = DataType.findType(v); - Schema fieldSchema = (schema != null) ? schema.getField(k).schema : null; - writeField(k, v, t, g, properties, fieldSchema, op, depth + 1); - } - } - g.writeEndObject(); - break; - case DataType.TUPLE: - Tuple tuple = (Tuple) value; - if (shouldWriteTupleAsMap(name, properties)) { - Map<String, Object> fields = TupleTools.tupleMap(schema, tuple); - writeValue(fields, DataType.MAP, g, name, properties, schema, op, depth); - } else { - boolean writeStartArray = shouldWriteTupleStart(tuple, name, properties); - if (writeStartArray) { - g.writeStartArray(); - } - for (Object v : tuple) { - writeValue(v, DataType.findType(v), g, name, properties, schema, op, depth); - } - if (writeStartArray) { - g.writeEndArray(); - } - } - break; - case DataType.BAG: - DataBag bag = (DataBag) value; - // Get the schema of the tuples in the bag - schema = (schema != null) ? schema.getField(0).schema : null; - if (shouldWriteBagAsMap(name, properties)) { - // When treating a bag as a map, the schema of the bag should be {(key, val), ...}: - // each tuple in the bag should have size 2, where the first element is the key and the second the value. - Schema valueSchema = (schema != null) ? schema.getField(1).schema : null;
- - g.writeStartObject(); - for (Tuple element : bag) { - if (element.size() != 2) { - continue; - } - String k = (String) element.get(0); - Object v = element.get(1); - Byte t = DataType.findType(v); - if (t == DataType.TUPLE) { - Map<String, Object> fields = TupleTools.tupleMap(valueSchema, (Tuple) v); - writeField(k, fields, DataType.MAP, g, properties, valueSchema, op, depth + 1); - } else { - writeField(k, v, t, g, properties, valueSchema, op, depth + 1); - } - } - g.writeEndObject(); - } else { - g.writeStartArray(); - for (Tuple t : bag) { - writeValue(t, DataType.TUPLE, g, name, properties, schema, op, depth); - } - g.writeEndArray(); - } - break; - } - - } - - private static boolean shouldWritePartialUpdate(Operation op, int depth) { - return op == Operation.UPDATE && depth == 1; - } - - private static void writePartialUpdate(Object value, Byte type, JsonGenerator g, String name, Properties properties, Schema schema, Operation op, int depth) throws IOException { - g.writeStartObject(); - // Check whether the field is covered by one of the partial operations we support on map/tensor structures. - // If it is not, we assume it is an update of the whole field and write "assign" here; - // if it is, we write the desired partial operation. - String operation = getPartialOperation(partialOperationMap, name, properties); - if (operation != null) { - g.writeFieldName(operation); - } else { - g.writeFieldName(PARTIAL_UPDATE_ASSIGN); - } - writeValue(value, type, g, name, properties, schema, op, depth); - g.writeEndObject(); - } - - private static boolean shouldWriteTupleStart(Tuple tuple, String name, Properties properties) { - if (tuple.size() > 1 || properties == null) { - return true; - } - String simpleArrayFields = properties.getProperty(SIMPLE_ARRAY_FIELDS); - if (simpleArrayFields == null) { - return true; - } - if (simpleArrayFields.equals("*")) { - return false; - } - String[] fields = simpleArrayFields.split(","); - for (String field : fields) { - if (field.trim().equalsIgnoreCase(name)) { - return false; - } - } - return true; - } - - private static boolean shouldWriteTupleAsMap(String name, Properties properties) { - // UPDATE_MAP_FIELDS is included here because, when updating a map, - // the second element of each tuple should be written as a map - if (properties == null) { - return false; - } - String addBagAsMapFields = properties.getProperty(UPDATE_MAP_FIELDS); - String simpleObjectFields = properties.getProperty(SIMPLE_OBJECT_FIELDS); - if (simpleObjectFields == null && addBagAsMapFields == null) { - return false; - } - if (addBagAsMapFields != null) { - if (addBagAsMapFields.equals("*")) { - return true; - } - String[] fields = addBagAsMapFields.split(","); - for (String field : fields) { - if (field.trim().equalsIgnoreCase(name)) { - return true; - } - } - - } - if (simpleObjectFields != null) { - if (simpleObjectFields.equals("*")) { - return true; - } - String[] fields = simpleObjectFields.split(","); - for (String field : fields) { - if (field.trim().equalsIgnoreCase(name)) { - return true; - } - } - } - return false; - } - - private static boolean shouldWriteBagAsMap(String name, Properties properties) { - if (properties == null) { - return false; - } - String bagAsMapFields = properties.getProperty(BAG_AS_MAP_FIELDS); - if (bagAsMapFields == null) { - return false; - } - if (bagAsMapFields.equals("*")) { - return true; - } - String[] fields = bagAsMapFields.split(","); - for (String field : fields) { - if (field.trim().equalsIgnoreCase(name)) { - return true; - } - } - return false; - }
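shouldWritePartialUpdate and writePartialUpdate together mean that with operation=update every depth-1 field gets wrapped in a single-key object. A small sketch, under the same Operation constant-name assumption as above:

```java
import com.yahoo.vespa.hadoop.pig.VespaDocumentOperation;
import com.yahoo.vespa.hadoop.pig.VespaDocumentOperation.Operation;

import java.util.Collections;
import java.util.Properties;

public class CreateUpdateSketch {
    public static void main(String[] args) throws Exception {
        // No partial-operation property matches "value", so writePartialUpdate falls
        // back to "assign"; expected output, matching requireThatUDFSupportsUpdateAssign
        // later in this patch:
        // {"update":"id:test:metrics::clicks","fields":{"value":{"assign":3}}}
        System.out.println(VespaDocumentOperation.create(
                Operation.UPDATE, "id:test:metrics::clicks",
                Collections.singletonMap("value", (Object) 3), new Properties(), null));
    }
}
```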
- - private static boolean shouldCreateTensor(Map<Object, Object> map, String name, Properties properties) { - if (properties == null) { - return false; - } - String createTensorFields = properties.getProperty(CREATE_TENSOR_FIELDS); - String addTensorFields = properties.getProperty(UPDATE_TENSOR_FIELDS); - String removeTensorFields = properties.getProperty(REMOVE_TENSOR_FIELDS); - - if (createTensorFields == null && addTensorFields == null && removeTensorFields == null) { - return false; - } - String[] fields; - if (createTensorFields != null) { - fields = createTensorFields.split(","); - for (String field : fields) { - if (field.trim().equalsIgnoreCase(name)) { - return true; - } - } - } - if (addTensorFields != null) { - fields = addTensorFields.split(","); - for (String field : fields) { - if (field.trim().equalsIgnoreCase(name)) { - return true; - } - } - } - if (removeTensorFields != null) { - fields = removeTensorFields.split(","); - for (String field : fields) { - if (field.trim().equalsIgnoreCase(name)) { - return true; - } - } - } - return false; - } - - private static boolean isRemoveTensor(String name, Properties properties) { - if (properties == null) { - return false; - } - String removeTensorFields = properties.getProperty(REMOVE_TENSOR_FIELDS); - if (removeTensorFields == null) { - return false; - } - String[] fields = removeTensorFields.split(","); - for (String field : fields) { - if (field.trim().equalsIgnoreCase(name)) { - return true; - } - } - return false; - } - - private static boolean shouldWriteField(String name, Properties properties, int depth) { - if (properties == null || depth != 1) { - return true; - } - String excludeFields = properties.getProperty(EXCLUDE_FIELDS); - if (excludeFields == null) { - return true; - } - String[] fields = excludeFields.split(","); - for (String field : fields) { - if (field.trim().equalsIgnoreCase(name)) { - return false; - } - } - return true; - } - - private static void writeTensor(Map<Object, Object> map, JsonGenerator g) throws IOException { - g.writeFieldName("cells"); - g.writeStartArray(); - for (Map.Entry<Object, Object> entry : map.entrySet()) { - String k = entry.getKey().toString(); - Double v = Double.parseDouble(entry.getValue().toString()); - - g.writeStartObject(); - - // Write address - g.writeFieldName("address"); - g.writeStartObject(); - - String[] dimensions = k.split(","); - for (String dimension : dimensions) { - if (dimension == null || dimension.isEmpty()) { - continue; - } - String[] address = dimension.split(":"); - if (address.length != 2) { - throw new IllegalArgumentException("Malformed cell address: " + dimension); - } - String dim = address[0]; - String label = address[1]; - if (dim == null || label == null || dim.isEmpty() || label.isEmpty()) { - throw new IllegalArgumentException("Malformed cell address: " + dimension); - } - g.writeFieldName(dim.trim()); - g.writeString(label.trim()); - } - g.writeEndObject(); - - // Write value - g.writeFieldName("value"); - g.writeNumber(v); - - g.writeEndObject(); - } - g.writeEndArray(); - } - - private static void writeRemoveTensor(Map<Object, Object> map, JsonGenerator g) throws IOException { - g.writeFieldName("addresses"); - g.writeStartArray(); - for (Map.Entry<Object, Object> entry : map.entrySet()) { - String k = entry.getKey().toString(); - String[] dimensions = k.split(","); - for (String dimension : dimensions) { - // Skip empty segments before opening the address object, so the generated JSON stays balanced - if (dimension == null || dimension.isEmpty()) { - continue; - } - g.writeStartObject(); - String[]
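writeTensor turns a Pig map whose keys are comma-separated dimension:label pairs into Vespa's cells JSON. A sketch, again assuming the Operation.PUT constant; the property key create-tensor-fields is the one used by the tests later in this patch:

```java
import com.yahoo.vespa.hadoop.pig.VespaDocumentOperation;
import com.yahoo.vespa.hadoop.pig.VespaDocumentOperation.Operation;

import java.util.Collections;
import java.util.Properties;

public class CreateTensorSketch {
    public static void main(String[] args) throws Exception {
        Properties props = new Properties();
        props.setProperty("create-tensor-fields", "tensor");
        // Expected output:
        // {"put":"id:test:doc::1","fields":{"tensor":{"cells":[
        //   {"address":{"x":"label1","y":"label2"},"value":2.0}]}}}
        System.out.println(VespaDocumentOperation.create(
                Operation.PUT, "id:test:doc::1",
                Collections.singletonMap("tensor",
                        (Object) Collections.singletonMap("x:label1,y:label2", 2.0)),
                props, null));
    }
}
```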
address = dimension.split(":"); - if (address.length != 2) { - throw new IllegalArgumentException("Malformed cell address: " + dimension); - } - String dim = address[0]; - String label = address[1]; - if (dim == null || label == null || dim.isEmpty() || label.isEmpty()) { - throw new IllegalArgumentException("Malformed cell address: " + dimension); - } - g.writeFieldName(dim.trim()); - g.writeString(label.trim()); - g.writeEndObject(); - // Write address - } - } - g.writeEndArray(); - } - - // copied from vespajlib for reducing dependency and building with JDK 8 - private static String getStackTraceAsString(Throwable throwable) { - try (StringWriter stringWriter = new StringWriter(); - PrintWriter printWriter = new PrintWriter(stringWriter, true)) { - throwable.printStackTrace(printWriter); - return stringWriter.getBuffer().toString(); - } catch (IOException e) { - throw new UncheckedIOException(e); - } - } - - // wrapper to emit logs - private void warnLog(String msg, PigWarning warning) { - warn(msg, warning); - System.err.println(msg); - } -} diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaQuery.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaQuery.java deleted file mode 100644 index 1d50f2909a2..00000000000 --- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaQuery.java +++ /dev/null @@ -1,114 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.hadoop.pig; - -import com.fasterxml.jackson.databind.JsonNode; -import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration; -import com.yahoo.vespa.hadoop.mapreduce.util.VespaHttpClient; -import com.yahoo.vespa.hadoop.mapreduce.util.VespaQuerySchema; -import com.yahoo.vespa.hadoop.mapreduce.util.TupleTools; -import org.apache.pig.EvalFunc; -import org.apache.pig.data.*; -import org.apache.pig.impl.logicalLayer.schema.Schema; -import org.apache.pig.impl.util.UDFContext; - -import java.io.IOException; -import java.util.*; - -/** - * A Pig UDF to run a query against a Vespa cluster and return the - * results. - * - * @author lesters - */ -public class VespaQuery extends EvalFunc<DataBag> { - - private final String PROPERTY_QUERY_TEMPLATE = "query"; - private final String PROPERTY_QUERY_SCHEMA = "schema"; - private final String PROPERTY_ROOT_NODE = "rootnode"; - - private final VespaConfiguration configuration; - private final Properties properties; - private final String queryTemplate; - private final String querySchema; - private final String queryRootNode; - - private VespaHttpClient httpClient; - - public VespaQuery() { - this(new String[0]); - } - - public VespaQuery(String... 
params) { - configuration = VespaConfiguration.get(UDFContext.getUDFContext().getJobConf(), null); - properties = VespaConfiguration.loadProperties(params); - - queryTemplate = properties.getProperty(PROPERTY_QUERY_TEMPLATE); - if (queryTemplate == null || queryTemplate.isEmpty()) { - throw new IllegalArgumentException("Query template cannot be empty"); - } - - querySchema = properties.getProperty(PROPERTY_QUERY_SCHEMA, "rank:int,id:chararray"); - queryRootNode = properties.getProperty(PROPERTY_ROOT_NODE, "root/children"); - } - - @Override - public DataBag exec(Tuple input) throws IOException { - if (input == null || input.size() == 0) { - return null; - } - JsonNode jsonResult = queryVespa(input); - if (jsonResult == null) { - return null; - } - return createPigRepresentation(jsonResult); - } - - @Override - public Schema outputSchema(Schema input) { - return VespaQuerySchema.getPigSchema(querySchema); - } - - - private JsonNode queryVespa(Tuple input) throws IOException { - String url = createVespaQueryUrl(input); - if (url == null) { - return null; - } - String result = executeVespaQuery(url); - return parseVespaResultJson(result); - } - - - private String createVespaQueryUrl(Tuple input) throws IOException { - return TupleTools.toString(getInputSchema(), input, queryTemplate); - } - - - private String executeVespaQuery(String url) throws IOException { - if (httpClient == null) { - httpClient = new VespaHttpClient(configuration); - } - return httpClient.get(url); - } - - private JsonNode parseVespaResultJson(String result) throws IOException { - return httpClient == null ? null : httpClient.parseResultJson(result, queryRootNode); - } - - private DataBag createPigRepresentation(JsonNode hits) { - DataBag bag = new SortedDataBag(null); - VespaQuerySchema querySchema = new VespaQuerySchema(this.querySchema); - - for (int rank = 0; rank < hits.size(); ++rank) { - JsonNode hit = hits.get(rank); - Tuple tuple = querySchema.buildTuple(rank, hit); - bag.add(tuple); - } - - return bag; - } - - - - -} diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaSimpleJsonLoader.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaSimpleJsonLoader.java deleted file mode 100644 index 9dc294ce243..00000000000 --- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaSimpleJsonLoader.java +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.hadoop.pig; - -import com.yahoo.vespa.hadoop.mapreduce.VespaSimpleJsonInputFormat; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.InputFormat; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.RecordReader; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.pig.LoadFunc; -import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit; -import org.apache.pig.data.Tuple; -import org.apache.pig.data.TupleFactory; - -import java.io.IOException; - -/** - * Simple JSON loader which loads either one JSON object per line or a - * multiline JSON consisting of objects in an array. - * - * Returns only the textual representation of the JSON object. 
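The VespaQuery UDF above is normally driven from a Pig script; mirroring the setup in VespaQueryTest further down in this patch, the same flow can be started from Java with a local PigServer. The script path and ENDPOINT value are placeholders:

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hdfs.HdfsConfiguration;
import org.apache.pig.ExecType;
import org.apache.pig.PigServer;

import java.util.HashMap;
import java.util.Map;

public class QuerySketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new HdfsConfiguration();
        Map<String, String> parameters = new HashMap<>();
        parameters.put("ENDPOINT", "http://localhost:8080"); // placeholder endpoint

        // Run the script locally and iterate the alias produced by VespaQuery
        PigServer ps = new PigServer(ExecType.LOCAL, conf);
        ps.setBatchOn();
        ps.registerScript("src/test/pig/query.pig", parameters); // script as in the tests
        ps.openIterator("recommendations").forEachRemaining(System.out::println);
    }
}
```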
- * - * @author lesters - */ -@SuppressWarnings("rawtypes") -public class VespaSimpleJsonLoader extends LoadFunc { - - private TupleFactory tupleFactory = TupleFactory.getInstance(); - private VespaSimpleJsonInputFormat.VespaJsonRecordReader recordReader; - - @Override - public void setLocation(String location, Job job) throws IOException { - FileInputFormat.setInputPaths(job, location); - } - - @Override - public InputFormat getInputFormat() throws IOException { - return new VespaSimpleJsonInputFormat(); - } - - @Override - public void prepareToRead(RecordReader reader, PigSplit split) throws IOException { - recordReader = (VespaSimpleJsonInputFormat.VespaJsonRecordReader) reader; - } - - @Override - public Tuple getNext() throws IOException { - try { - boolean done = recordReader.nextKeyValue(); - if (done) { - return null; - } - Text json = recordReader.getCurrentKey(); - if (json == null) { - return null; - } - return tupleFactory.newTuple(json.toString()); - - } catch (InterruptedException ex) { - return null; - } - } -} diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaStorage.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaStorage.java deleted file mode 100644 index a564dfac25d..00000000000 --- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaStorage.java +++ /dev/null @@ -1,178 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.hadoop.pig; - -import com.fasterxml.jackson.databind.ObjectMapper; -import com.yahoo.vespa.hadoop.mapreduce.VespaOutputFormat; -import com.yahoo.vespa.hadoop.mapreduce.util.TupleTools; -import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.OutputFormat; -import org.apache.hadoop.mapreduce.RecordWriter; -import org.apache.pig.ResourceSchema; -import org.apache.pig.StoreFunc; -import org.apache.pig.data.Tuple; -import org.apache.pig.impl.logicalLayer.schema.Schema; -import org.apache.pig.impl.util.UDFContext; - -import java.io.*; -import java.util.Base64; -import java.util.Map; -import java.util.Properties; - -/** - * A small Pig UDF wrapper around the Vespa http client for - * feeding data into a Vespa endpoint. - * - * @author lesters - */ -@SuppressWarnings("rawtypes") -public class VespaStorage extends StoreFunc { - - private final boolean createDocOp; - private final String template; - private final VespaDocumentOperation.Operation operation; - - private String signature = null; - private RecordWriter recordWriter = null; - private ResourceSchema resourceSchema = null; - - private static final String PROPERTY_CREATE_DOC_OP = "create-document-operation"; - private static final String PROPERTY_ID_TEMPLATE = "docid"; - private static final String PROPERTY_OPERATION = "operation"; - private static final String PROPERTY_RESOURCE_SCHEMA = "resource_schema"; - - Properties properties = new Properties(); - - public VespaStorage() { - createDocOp = false; - template = null; - operation = null; - } - - public VespaStorage(String... 
params) { - properties = VespaConfiguration.loadProperties(params); - createDocOp = Boolean.parseBoolean(properties.getProperty(PROPERTY_CREATE_DOC_OP, "false")); - operation = VespaDocumentOperation.Operation.fromString(properties.getProperty(PROPERTY_OPERATION, "put")); - template = properties.getProperty(PROPERTY_ID_TEMPLATE); - } - - - @Override - public OutputFormat getOutputFormat() throws IOException { - return new VespaOutputFormat(properties); - } - - - @Override - public void setStoreLocation(String endpoint, Job job) throws IOException { - properties.setProperty(VespaConfiguration.ENDPOINT, endpoint); - } - - - @Override - public void prepareToWrite(RecordWriter recordWriter) throws IOException { - this.recordWriter = recordWriter; - this.resourceSchema = getResourceSchema(); - } - - - @SuppressWarnings("unchecked") - @Override - public void putNext(Tuple tuple) throws IOException { - if (tuple == null || tuple.size() == 0) { - return; - } - - String data = null; - if (createDocOp) { - data = createDocumentOperation(tuple); - } else if (!tuple.isNull(0)) { - data = tuple.get(0).toString(); // assume single field with correctly formatted doc op. - } - - if (data == null || data.length() == 0) { - return; - } - - try { - recordWriter.write(0, data); - } catch (InterruptedException e) { - throw new IOException(e); - } - } - - - @Override - public void checkSchema(ResourceSchema resourceSchema) throws IOException { - setResourceSchema(resourceSchema); - } - - - @Override - public String relToAbsPathForStoreLocation(String endpoint, Path path) throws IOException { - return endpoint; - } - - - @Override - public void setStoreFuncUDFContextSignature(String s) { - this.signature = s; - } - - - @Override - public void cleanupOnFailure(String s, Job job) throws IOException { - } - - - @Override - public void cleanupOnSuccess(String s, Job job) throws IOException { - } - - - private ResourceSchema getResourceSchema() throws IOException { - Properties properties = getUDFProperties(); - return base64Deserialize(properties.getProperty(PROPERTY_RESOURCE_SCHEMA)); - } - - - private void setResourceSchema(ResourceSchema schema) throws IOException { - Properties properties = getUDFProperties(); - if (properties.getProperty(PROPERTY_RESOURCE_SCHEMA) == null) { - properties.setProperty(PROPERTY_RESOURCE_SCHEMA, base64Serialize(schema)); - } - } - - - private Properties getUDFProperties() { - String[] context = { signature }; - return UDFContext.getUDFContext().getUDFProperties(getClass(), context); - } - - - private String createDocumentOperation(Tuple tuple) throws IOException { - if (tuple == null || tuple.size() == 0) { - return null; - } - if (resourceSchema == null) { - return null; - } - - Map<String, Object> fields = TupleTools.tupleMap(resourceSchema, tuple); - String docId = TupleTools.toString(fields, template); - - Schema schema = Schema.getPigSchema(resourceSchema); - return VespaDocumentOperation.create(operation, docId, fields, properties, schema); - } - - public static String base64Serialize(ResourceSchema resourceSchema) throws IOException { - byte[] bytes = new ObjectMapper().writeValueAsBytes(resourceSchema); - return Base64.getEncoder().encodeToString(bytes); - } - - public static ResourceSchema base64Deserialize(String s) throws IOException { - byte[] data = Base64.getDecoder().decode(s); - return new ObjectMapper().readValue(data, ResourceSchema.class); - } -} diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/package-info.java 
b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/package-info.java deleted file mode 100644 index 686765ac047..00000000000 --- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/package-info.java +++ /dev/null @@ -1,10 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -/** - * com.yahoo.vespa.hadoop.pig contains classes and utilities - * to enable feeding directly to Vespa endpoints from pig. - * It is a minimal layer over the Vespa HTTP client. - * - * NOTE: This is a PUBLIC API, but not annotated as such because this is not a bundle and - * we don't want to introduce Vespa dependencies. - */ -package com.yahoo.vespa.hadoop.pig; diff --git a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/MapReduceTest.java b/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/MapReduceTest.java deleted file mode 100644 index d56cd818de2..00000000000 --- a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/MapReduceTest.java +++ /dev/null @@ -1,200 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.hadoop.pig; - -import com.fasterxml.jackson.core.JsonEncoding; -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonGenerator; -import com.yahoo.vespa.hadoop.mapreduce.VespaOutputFormat; -import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration; -import com.yahoo.vespa.hadoop.mapreduce.util.VespaCounters; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FSDataOutputStream; -import org.apache.hadoop.fs.FileSystem; -import org.apache.hadoop.fs.LocalFileSystem; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.hdfs.HdfsConfiguration; -import org.apache.hadoop.hdfs.MiniDFSCluster; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.Mapper; -import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.test.PathUtils; -import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.io.BufferedInputStream; -import java.io.ByteArrayOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.StringTokenizer; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertTrue; - -public class MapReduceTest { - - protected static File hdfsBaseDir; - protected static FileSystem hdfs; - protected static Configuration conf; - protected static MiniDFSCluster cluster; - - protected static Path metricsJsonPath; - protected static Path metricsCsvPath; - - @BeforeAll - public static void setUp() throws IOException { - hdfsBaseDir = new File(PathUtils.getTestDir(MapReduceTest.class).getCanonicalPath()); - - conf = new HdfsConfiguration(); - conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsBaseDir.getAbsolutePath()); - conf.set(VespaConfiguration.DRYRUN, "true"); - conf.set(VespaConfiguration.ENDPOINT, "endpoint-does-not-matter-in-dryrun"); - - cluster = new MiniDFSCluster.Builder(conf).build(); - hdfs = FileSystem.get(conf); - - metricsJsonPath = new Path("metrics_json"); - metricsCsvPath = new Path("metrics_csv"); - copyToHdfs("src/test/resources/operations_data.json", metricsJsonPath, "data"); - 
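One detail of VespaStorage above that is easy to miss: the Pig schema is shipped from the Pig frontend to the backend tasks as base64-encoded JSON in the UDF properties, via the two public helpers at the bottom of that class. A minimal round-trip sketch, assuming Pig's ResourceSchema(Schema) constructor and the standard Utils.getSchemaFromString helper:

```java
import com.yahoo.vespa.hadoop.pig.VespaStorage;
import org.apache.pig.ResourceSchema;
import org.apache.pig.impl.util.Utils;

public class SchemaRoundTrip {
    public static void main(String[] args) throws Exception {
        ResourceSchema schema = new ResourceSchema(
                Utils.getSchemaFromString("application:chararray,value:int"));
        String encoded = VespaStorage.base64Serialize(schema);   // Jackson JSON -> base64
        ResourceSchema decoded = VespaStorage.base64Deserialize(encoded);
        System.out.println(decoded);
    }
}
```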
copyToHdfs("src/test/resources/tabular_data.csv", metricsCsvPath, "data"); - } - - @AfterAll - public static void tearDown() throws IOException { - Path testDir = new Path(hdfsBaseDir.getParent()); - hdfs.delete(testDir, true); - cluster.shutdown(); - LocalFileSystem localFileSystem = FileSystem.getLocal(conf); - localFileSystem.delete(testDir, true); - } - - @Test - public void requireThatMapOnlyJobSucceeds() throws Exception { - Job job = Job.getInstance(conf); - job.setJarByClass(MapReduceTest.class); - job.setMapperClass(FeedMapper.class); - job.setOutputFormatClass(VespaOutputFormat.class); - job.setMapOutputValueClass(Text.class); - - FileInputFormat.setInputPaths(job, metricsJsonPath); - - boolean success = job.waitForCompletion(true); - assertTrue(success, "Job Failed"); - - VespaCounters counters = VespaCounters.get(job); - assertEquals(10, counters.getDocumentsSent()); - assertEquals(0, counters.getDocumentsFailed()); - assertEquals(10, counters.getDocumentsOk()); - } - - @Test - public void requireThatMapReduceJobSucceeds() throws Exception { - Job job = Job.getInstance(conf); - job.setJarByClass(MapReduceTest.class); - job.setMapperClass(FeedMapper.class); - job.setOutputFormatClass(VespaOutputFormat.class); - job.setMapOutputValueClass(Text.class); - job.setReducerClass(FeedReducer.class); - job.setNumReduceTasks(1); - - FileInputFormat.setInputPaths(job, metricsJsonPath); - - boolean success = job.waitForCompletion(true); - assertTrue(success, "Job Failed"); - - VespaCounters counters = VespaCounters.get(job); - assertEquals(10, counters.getDocumentsSent()); - assertEquals(0, counters.getDocumentsFailed()); - assertEquals(10, counters.getDocumentsOk()); - } - - - @Test - public void requireThatTransformMapJobSucceeds() throws Exception { - Job job = Job.getInstance(conf); - job.setJarByClass(MapReduceTest.class); - job.setMapperClass(ParsingMapper.class); - job.setOutputFormatClass(VespaOutputFormat.class); - job.setMapOutputValueClass(Text.class); - job.setReducerClass(FeedReducer.class); - job.setNumReduceTasks(1); - - FileInputFormat.setInputPaths(job, metricsCsvPath); - - boolean success = job.waitForCompletion(true); - assertTrue(success, "Job Failed"); - - VespaCounters counters = VespaCounters.get(job); - assertEquals(10, counters.getDocumentsSent()); - assertEquals(0, counters.getDocumentsFailed()); - assertEquals(10, counters.getDocumentsOk()); - assertEquals(0, counters.getDocumentsSkipped()); - } - - - private static void copyToHdfs(String localFile, Path hdfsDir, String hdfsName) throws IOException { - Path hdfsPath = new Path(hdfsDir, hdfsName); - FSDataOutputStream out = hdfs.create(hdfsPath); - - try (InputStream in = new BufferedInputStream(new FileInputStream(localFile))) { - int len; - byte[] buffer = new byte[1024]; - while ((len = in.read(buffer)) > 0) { - out.write(buffer, 0, len); - } - } finally { - out.close(); - } - } - - public static class FeedMapper extends Mapper<LongWritable, Text, LongWritable, Text> { - public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException { - context.write(key, value); - } - } - - public static class FeedReducer extends Reducer<Object, Text, LongWritable, Text> { - public void reduce(LongWritable key, Text value, Context context) throws IOException, InterruptedException { - context.write(key, value); - } - } - - public static class ParsingMapper extends Mapper<LongWritable, Text, LongWritable, Text> { - public void map(LongWritable key, Text value, Context context) throws 
IOException, InterruptedException { - String line = value.toString(); - if (line == null || line.length() == 0) - return; - - StringTokenizer tokenizer = new StringTokenizer(line); - long date = Long.parseLong(tokenizer.nextToken()); - String metricName = tokenizer.nextToken(); - long metricValue = Long.parseLong(tokenizer.nextToken()); - String application = tokenizer.nextToken(); - - String docid = "id:"+application+":metric::"+metricName+"-"+date; - - ByteArrayOutputStream out = new ByteArrayOutputStream(); - JsonGenerator g = new JsonFactory().createGenerator(out, JsonEncoding.UTF8); - - g.writeStartObject(); - g.writeObjectFieldStart("fields"); - g.writeNumberField("date", date); - g.writeStringField("name", metricName); - g.writeNumberField("value", metricValue); - g.writeStringField("application", application); - g.writeEndObject(); - g.writeStringField("put", docid); - g.writeEndObject(); - g.close(); - - context.write(key, new Text(out.toString())); - } - } - - -} diff --git a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperationTest.java b/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperationTest.java deleted file mode 100644 index ec20e82763c..00000000000 --- a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperationTest.java +++ /dev/null @@ -1,633 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.hadoop.pig; - -import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.ObjectMapper; -import org.apache.pig.data.BagFactory; -import org.apache.pig.data.DataBag; -import org.apache.pig.data.DataByteArray; -import org.apache.pig.data.DataType; -import org.apache.pig.data.SortedDataBag; -import org.apache.pig.data.Tuple; -import org.apache.pig.data.TupleFactory; -import org.apache.pig.impl.logicalLayer.FrontendException; -import org.apache.pig.impl.logicalLayer.schema.Schema; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.BeforeEach; -import org.junit.jupiter.api.Test; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.PrintStream; -import java.util.HashMap; -import java.util.Iterator; -import java.util.Map; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertNull; -import static org.junit.jupiter.api.Assertions.assertTrue; - -public class VespaDocumentOperationTest { - private final ByteArrayOutputStream outContent = new ByteArrayOutputStream(); - private final PrintStream originalOut = System.out; - - @BeforeEach - public void setUpStreams() { - System.setOut(new PrintStream(outContent)); - } - - @AfterEach - public void restoreStreams() { - System.setOut(originalOut); - } - @Test - public void requireThatUDFReturnsCorrectJson() throws Exception { - String json = getDocumentOperationJson("docid=id:<application>:metrics::<name>-<date>"); - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.path("fields"); - - // operation put is default - assertEquals("id:testapp:metrics::clicks-20160112", root.get("put").asText()); - assertEquals("testapp", fields.get("application").asText()); - assertEquals("clicks", fields.get("name").asText()); - assertEquals(3, fields.get("value").asInt()); - } - - - @Test - public void requireThatUDFSupportsUpdateAssign() throws IOException { - 
String json = getDocumentOperationJson("docid=id:<application>:metrics::<name>-<date>", "operation=update"); - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.path("fields"); - - assertEquals("id:testapp:metrics::clicks-20160112", root.get("update").asText()); - assertEquals("testapp", fields.get("application").get("assign").asText()); - assertEquals("clicks", fields.get("name").get("assign").asText()); - assertEquals(3, fields.get("value").get("assign").asInt()); - } - - @Test - public void requireThatUDFSupportsConditionalUpdateAssign() throws IOException { - String json = getDocumentOperationJson("docid=id:<application>:metrics::<name>-<date>", "operation=update", "condition=clicks < <value>"); - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.path("fields"); - - assertEquals("id:testapp:metrics::clicks-20160112", root.get("update").asText()); - assertEquals("clicks < 3", root.get("condition").asText()); - assertEquals("testapp", fields.get("application").get("assign").asText()); - assertEquals("clicks", fields.get("name").get("assign").asText()); - assertEquals(3, fields.get("value").get("assign").asInt()); - } - - @Test - public void requireThatUDFSupportsCreateIfNonExistent() throws IOException { - String json = getDocumentOperationJson("docid=id:<application>:metrics::<name>-<date>", "operation=update", - "create-if-non-existent=true"); - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.path("fields"); - - assertEquals("id:testapp:metrics::clicks-20160112", root.get("update").asText()); - assertTrue(root.get("create").asBoolean()); - assertEquals("testapp", fields.get("application").get("assign").asText()); - assertEquals("clicks", fields.get("name").get("assign").asText()); - assertEquals(3, fields.get("value").get("assign").asInt()); - } - - - @Test - public void requireThatUDFReturnsNullForMissingConfig() throws Exception { - String json = getDocumentOperationJson(); - assertNull(json); - } - - - @Test - public void requireThatUDFCorrectlyGeneratesRemoveBagAsMapOperation() throws Exception { - DataBag bag = BagFactory.getInstance().newDefaultBag(); - - Schema innerObjectSchema = new Schema(); - Tuple innerObjectTuple = TupleFactory.getInstance().newTuple(); - addToTuple("year", DataType.CHARARRAY, "2020", innerObjectSchema, innerObjectTuple); - addToTuple("month", DataType.INTEGER, 3, innerObjectSchema, innerObjectTuple); - - Schema objectSchema = new Schema(); - Tuple objectTuple = TupleFactory.getInstance().newTuple(); - addToTuple("key", DataType.CHARARRAY, "234566", objectSchema, objectTuple); - addToTuple("value", DataType.TUPLE, innerObjectTuple,innerObjectSchema,objectSchema, objectTuple); - - Schema bagSchema = new Schema(); - addToBagWithSchema("firstLayerTuple",DataType.TUPLE,objectTuple,objectSchema,bagSchema,bag); - - innerObjectSchema = new Schema(); - innerObjectTuple = TupleFactory.getInstance().newTuple(); - addToTuple("year", DataType.CHARARRAY, "2020", innerObjectSchema, innerObjectTuple); - addToTuple("month", DataType.INTEGER, 3, innerObjectSchema, innerObjectTuple); - - objectSchema = new Schema(); - objectTuple = TupleFactory.getInstance().newTuple(); - addToTuple("key", DataType.CHARARRAY, "123456", objectSchema, objectTuple); - addToTuple("value", DataType.TUPLE, innerObjectTuple,innerObjectSchema,objectSchema, objectTuple); - - 
addToBagWithSchema("firstLayerTuple",DataType.TUPLE,objectTuple,objectSchema,bagSchema,bag); - - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - addToTuple("bag", DataType.BAG, bag, bagSchema, schema, tuple); - addToTuple("id", DataType.CHARARRAY, "123", schema, tuple); - - VespaDocumentOperation docOp = new VespaDocumentOperation("docid=id", "remove-map-fields=bag","operation=update"); - docOp.setInputSchema(schema); - String json = docOp.exec(tuple); - - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.get("fields"); - assertEquals("{\"remove\":0}", fields.get("bag{123456}").toString()); - assertEquals("{\"remove\":0}", fields.get("bag{234566}").toString()); - - } - - @Test - public void requireThatUDFCorrectlyGeneratesAddBagAsMapOperation() throws Exception { - - DataBag bag = BagFactory.getInstance().newDefaultBag(); - - Schema innerObjectSchema = new Schema(); - Tuple innerObjectTuple = TupleFactory.getInstance().newTuple(); - addToTuple("year", DataType.CHARARRAY, "2020", innerObjectSchema, innerObjectTuple); - addToTuple("month", DataType.INTEGER, 3, innerObjectSchema, innerObjectTuple); - - Schema objectSchema = new Schema(); - Tuple objectTuple = TupleFactory.getInstance().newTuple(); - addToTuple("key", DataType.CHARARRAY, "123456", objectSchema, objectTuple); - addToTuple("value", DataType.TUPLE, innerObjectTuple,innerObjectSchema,objectSchema, objectTuple); - - Schema bagSchema = new Schema(); - addToBagWithSchema("firstLayerTuple",DataType.TUPLE,objectTuple,objectSchema,bagSchema,bag); - - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - addToTuple("bag", DataType.BAG, bag, bagSchema, schema, tuple); - addToTuple("id", DataType.CHARARRAY, "123", schema, tuple); - VespaDocumentOperation docOp = new VespaDocumentOperation("docid=id", "update-map-fields=bag","operation=update"); - docOp.setInputSchema(schema); - String json = docOp.exec(tuple); - - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - - JsonNode fields = root.get("fields"); - JsonNode value = fields.get("bag{123456}"); - JsonNode assign = value.get("assign"); - assertEquals("2020", assign.get("year").asText()); - assertEquals(3, assign.get("month").asInt()); - } - - @Test - public void requireThatUDFCorrectlyGeneratesAddTensorOperation() throws Exception { - - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - - // Please refer to the tensor format documentation - - Map<String, Double> tensor = new HashMap<String, Double>() {{ - put("x:label1,y:label2,z:label4", 2.0); - put("x:label3", 3.0); - }}; - - addToTuple("id", DataType.CHARARRAY, "123", schema, tuple); - addToTuple("tensor", DataType.MAP, tensor, schema, tuple); - - VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty", "update-tensor-fields=tensor","operation=update"); - docOp.setInputSchema(schema); - String json = docOp.exec(tuple); - - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.get("fields"); - JsonNode tensorValue = fields.get("tensor"); - JsonNode add = tensorValue.get("add"); - JsonNode cells = add.get("cells"); - Iterator<JsonNode> cellsIterator = cells.iterator(); - - JsonNode element = cellsIterator.next(); - assertEquals("label1", element.get("address").get("x").asText()); - assertEquals("label2", element.get("address").get("y").asText()); - assertEquals("label4", 
element.get("address").get("z").asText()); - assertEquals("2.0", element.get("value").toString()); - - element = cellsIterator.next(); - assertEquals("label3", element.get("address").get("x").asText()); - assertEquals("3.0", element.get("value").toString()); - } - - @Test - public void requireThatUDFCorrectlyGeneratesRemoveTensorOperation() throws Exception { - - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - - // Please refer to the tensor format documentation - - Map<String, Double> tensor = new HashMap<String, Double>() {{ - put("x:label1,y:label2,z:label4", 2.0); - put("x:label3", 3.0); - }}; - - addToTuple("id", DataType.CHARARRAY, "123", schema, tuple); - addToTuple("tensor", DataType.MAP, tensor, schema, tuple); - - VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty", "remove-tensor-fields=tensor","operation=update"); - docOp.setInputSchema(schema); - String json = docOp.exec(tuple); - - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.get("fields"); - JsonNode tensorValue = fields.get("tensor"); - JsonNode remove = tensorValue.get("remove"); - JsonNode address = remove.get("addresses"); - - Iterator<JsonNode> addressIterator = address.iterator(); - - JsonNode element = addressIterator.next(); - assertEquals("label1", element.get("x").asText()); - - element = addressIterator.next(); - assertEquals("label2", element.get("y").asText()); - - element = addressIterator.next(); - assertEquals("label4", element.get("z").asText()); - - element = addressIterator.next(); - assertEquals("label3", element.get("x").asText()); - } - - @Test - public void requireThatUDFReturnsNullWhenExceptionHappens() throws IOException { - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - - // broken DELTA format that would throw internally - Map<String, Double> tensor = new HashMap<String, Double>() {{ - put("xlabel1", 2.0); // missing : between 'x' and 'label1' - }}; - - addToTuple("id", DataType.CHARARRAY, "123", schema, tuple); - addToTuple("tensor", DataType.MAP, tensor, schema, tuple); - - VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty", "create-tensor-fields=tensor"); - docOp.setInputSchema(schema); - String json = docOp.exec(tuple); - - assertNull(json); - } - - @Test - public void requireThatUDFCorrectlyGeneratesRemoveOperation() throws Exception { - String json = getDocumentOperationJson("operation=remove", "docid=id:<application>:metrics::<name>-<date>"); - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.get("fields"); - - assertEquals("id:testapp:metrics::clicks-20160112", root.get("remove").asText()); - assertNull(fields); - } - - - @Test - public void requireThatUDFGeneratesComplexDataTypes() throws Exception { - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - - Tuple intTuple = TupleFactory.getInstance().newTuple(); - int[] intArray = {1, 2, 3}; - for (int i : intArray) { intTuple.append(i); } - - Tuple stringTuple = TupleFactory.getInstance().newTuple(); - String[] stringArray = {"a", "b", "c"}; - for (String s : stringArray) { stringTuple.append(s); } - - DataBag bag = new SortedDataBag(null); - bag.add(intTuple); - bag.add(stringTuple); - - Map<String, Object> innerMap = new HashMap<String, Object>() {{ - put("a", "string"); - put("tuple", intTuple); - }}; - - DataByteArray bytes = new DataByteArray("testdata".getBytes()); - - Map<String, 
Object> outerMap = new HashMap<String, Object>() {{ - put("string", "value"); - put("int", 3); - put("float", 3.145); - put("bool", true); - put("byte", bytes); - put("map", innerMap); - put("bag", bag); - }}; - - addToTuple("map", DataType.MAP, outerMap, schema, tuple); - - VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty"); - docOp.setInputSchema(schema); - String json = docOp.exec(tuple); - - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.get("fields"); - JsonNode map = fields.get("map"); - - assertEquals("value", map.get("string").asText()); - assertEquals(3, map.get("int").asInt()); - assertEquals(3.145, map.get("float").asDouble(), 1e-6); - assertTrue(map.get("bool").asBoolean()); - assertEquals("dGVzdGRhdGE=", map.get("byte").asText()); - - assertEquals("string", map.get("map").get("a").asText()); - for (int i = 0; i < intArray.length; ++i) { - assertEquals(intArray[i], map.get("map").get("tuple").get(i).asInt()); - } - - JsonNode bagField = map.get("bag"); - for (int i = 0; i < intArray.length; ++i) { - assertEquals(intArray[i], bagField.get(0).get(i).asInt()); - } - for (int i = 0; i < stringArray.length; ++i) { - assertEquals(stringArray[i], bagField.get(1).get(i).asText()); - } - } - - - @Test - public void requireThatSimpleArraysMustBeConfigured() throws Exception { - String[] stringArray = {"a", "b", "c"}; - JsonNode array = setupSimpleArrayOperation("array", stringArray, "docid=empty"); // simple arrays not configured - // json: [["a"], ["b"], ["c"]] - assertEquals("a", array.get(0).get(0).asText()); - assertEquals("b", array.get(1).get(0).asText()); - assertEquals("c", array.get(2).get(0).asText()); - } - - - @Test - public void requireThatSimpleArraysAreSupported() throws Exception { - String[] stringArray = {"a", "b", "c"}; - JsonNode array = setupSimpleArrayOperation("array", stringArray, "docid=empty", "simple-array-fields=array"); - // json: ["a", "b", "c"] - assertEquals("a", array.get(0).asText()); - assertEquals("b", array.get(1).asText()); - assertEquals("c", array.get(2).asText()); - } - - - @Test - public void requireThatSimpleArraysCanBeConfiguredWithWildcard() throws Exception { - String[] stringArray = {"a", "b", "c"}; - JsonNode array = setupSimpleArrayOperation("array", stringArray, "docid=empty", "simple-array-fields=*"); - // json: ["a", "b", "c"] - assertEquals("a", array.get(0).asText()); - assertEquals("b", array.get(1).asText()); - assertEquals("c", array.get(2).asText()); - } - - - @Test - public void requireThatMultipleSimpleArraysAreSupported() throws Exception { - String[] stringArray = {"a", "b", "c"}; - JsonNode array = setupSimpleArrayOperation("array", stringArray, "docid=empty", "simple-array-fields=empty,array"); - // json: ["a", "b", "c"] - assertEquals("a", array.get(0).asText()); - assertEquals("b", array.get(1).asText()); - assertEquals("c", array.get(2).asText()); - } - - - private JsonNode setupSimpleArrayOperation(String name, String[] array, String... 
params) throws IOException { - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - - DataBag bag = new SortedDataBag(null); - for (String s : array) { - Tuple stringTuple = TupleFactory.getInstance().newTuple(); - stringTuple.append(s); - bag.add(stringTuple); - } - addToTuple(name, DataType.BAG, bag, schema, tuple); - - VespaDocumentOperation docOp = new VespaDocumentOperation(params); - docOp.setInputSchema(schema); - String json = docOp.exec(tuple); - - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.get("fields"); - return fields.get(name); - } - - - @Test - public void requireThatUDFSupportsTensors() throws IOException { - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - - // Please refer to the tensor format documentation - - Map<String, Double> tensor = new HashMap<String, Double>() {{ - put("x:label1,y:label2,z:label4", 2.0); - put("x:label3", 3.0); - }}; - - addToTuple("id", DataType.CHARARRAY, "123", schema, tuple); - addToTuple("tensor", DataType.MAP, tensor, schema, tuple); - - VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty", "create-tensor-fields=tensor"); - docOp.setInputSchema(schema); - String json = docOp.exec(tuple); - - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.get("fields"); - JsonNode tensorNode = fields.get("tensor"); - JsonNode cells = tensorNode.get("cells"); - - assertEquals("label1", cells.get(0).get("address").get("x").asText()); - assertEquals("label2", cells.get(0).get("address").get("y").asText()); - assertEquals("label4", cells.get(0).get("address").get("z").asText()); - assertEquals("label3", cells.get(1).get("address").get("x").asText()); - - assertEquals(2.0, cells.get(0).get("value").asDouble(), 1e-6); - assertEquals(3.0, cells.get(1).get("value").asDouble(), 1e-6); - } - - - @Test - public void requireThatUDFCanExcludeFields() throws IOException { - String json = getDocumentOperationJson("docid=id:<application>:metrics::<name>-<date>", "exclude-fields=application,date"); - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.path("fields"); - - // 'application' and 'date' fields should not appear in JSON - assertNull(fields.get("application")); - assertNull(fields.get("date")); - assertNotNull(fields.get("name")); - assertNotNull(fields.get("value")); - } - - - private String getDocumentOperationJson(String... 
params) throws IOException { - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - - addToTuple("application", DataType.CHARARRAY, "testapp", schema, tuple); - addToTuple("name", DataType.CHARARRAY, "clicks", schema, tuple); - addToTuple("date", DataType.CHARARRAY, "20160112", schema, tuple); - addToTuple("value", DataType.CHARARRAY, 3, schema, tuple); - - VespaDocumentOperation docOp = new VespaDocumentOperation(params); - docOp.setInputSchema(schema); - return docOp.exec(tuple); - } - - - @Test - public void requireThatUDFSupportsSimpleObjectFields() throws IOException { - Schema objectSchema = new Schema(); - Tuple objectTuple = TupleFactory.getInstance().newTuple(); - addToTuple("id", DataType.LONG, 123456789L, objectSchema, objectTuple); - addToTuple("url", DataType.CHARARRAY, "example.com", objectSchema, objectTuple); - addToTuple("value", DataType.INTEGER, 123, objectSchema, objectTuple); - - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - addToTuple("object", DataType.TUPLE, objectTuple, objectSchema, schema, tuple); - - VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty", "simple-object-fields=object"); - docOp.setInputSchema(schema); - String json = docOp.exec(tuple); - - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.get("fields"); - JsonNode objectNode = fields.get("object"); - - assertEquals(123456789L, objectNode.get("id").asLong()); - assertEquals("example.com", objectNode.get("url").asText()); - assertEquals(123, objectNode.get("value").asInt()); - } - - - @Test - public void requireThatUDFSupportsBagAsMapFields() throws IOException { - DataBag bag = BagFactory.getInstance().newDefaultBag(); - - Schema objectSchema = new Schema(); - Tuple objectTuple = TupleFactory.getInstance().newTuple(); - addToTuple("key", DataType.CHARARRAY, "123456", objectSchema, objectTuple); - addToTuple("value", DataType.INTEGER, 123456, objectSchema, objectTuple); - bag.add(objectTuple); - - objectSchema = new Schema(); - objectTuple = TupleFactory.getInstance().newTuple(); - addToTuple("key", DataType.CHARARRAY, "234567", objectSchema, objectTuple); - addToTuple("value", DataType.INTEGER, 234567, objectSchema, objectTuple); - bag.add(objectTuple); - - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - addToTuple("bag", DataType.BAG, bag, objectSchema, schema, tuple); - - VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty", "bag-as-map-fields=bag"); - docOp.setInputSchema(schema); - String json = docOp.exec(tuple); - - ObjectMapper m = new ObjectMapper(); - JsonNode root = m.readTree(json); - JsonNode fields = root.get("fields"); - JsonNode bagNode = fields.get("bag"); - - assertEquals(123456, bagNode.get("123456").asInt()); - assertEquals(234567, bagNode.get("234567").asInt()); - } - - @Test - public void requireThatUDFPrintIdWhenVerbose() throws IOException { - DataBag bag = BagFactory.getInstance().newDefaultBag(); - - Schema objectSchema = new Schema(); - Tuple objectTuple = TupleFactory.getInstance().newTuple(); - addToTuple("key", DataType.CHARARRAY, "123456", objectSchema, objectTuple); - addToTuple("value", DataType.INTEGER, 123456, objectSchema, objectTuple); - bag.add(objectTuple); - - objectSchema = new Schema(); - objectTuple = TupleFactory.getInstance().newTuple(); - addToTuple("key", DataType.CHARARRAY, "234567", objectSchema, objectTuple); - addToTuple("value", DataType.INTEGER, 
234567, objectSchema, objectTuple); - bag.add(objectTuple); - - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - addToTuple("bag", DataType.BAG, bag, objectSchema, schema, tuple); - - VespaDocumentOperation docOp = new VespaDocumentOperation("docid=7654321", "bag-as-map-fields=bag","verbose=true"); - docOp.setInputSchema(schema); - String json = docOp.exec(tuple); - - assertTrue(outContent.toString().contains("Processing docId: 7654321")); - } - - @Test - public void requireThatUDFVerboseSetToFalseByDefault() throws IOException { - DataBag bag = BagFactory.getInstance().newDefaultBag(); - - Schema objectSchema = new Schema(); - Tuple objectTuple = TupleFactory.getInstance().newTuple(); - addToTuple("key", DataType.CHARARRAY, "123456", objectSchema, objectTuple); - addToTuple("value", DataType.INTEGER, 123456, objectSchema, objectTuple); - bag.add(objectTuple); - - objectSchema = new Schema(); - objectTuple = TupleFactory.getInstance().newTuple(); - addToTuple("key", DataType.CHARARRAY, "234567", objectSchema, objectTuple); - addToTuple("value", DataType.INTEGER, 234567, objectSchema, objectTuple); - bag.add(objectTuple); - - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - addToTuple("bag", DataType.BAG, bag, objectSchema, schema, tuple); - - VespaDocumentOperation docOp = new VespaDocumentOperation("docid=7654321", "bag-as-map-fields=bag"); - docOp.setInputSchema(schema); - String json = docOp.exec(tuple); - - assertEquals("", outContent.toString()); - } - - private void addToTuple(String alias, byte type, Object value, Schema schema, Tuple tuple) { - schema.add(new Schema.FieldSchema(alias, type)); - tuple.append(value); - } - - - private void addToTuple(String alias, byte type, Object value, Schema schemaInField, Schema schema, Tuple tuple) - throws FrontendException { - schema.add(new Schema.FieldSchema(alias, schemaInField, type)); - tuple.append(value); - } - - private void addToBagWithSchema(String alias, byte type, Tuple value, Schema schemaInField, Schema schema,DataBag bag) - throws FrontendException { - schema.add(new Schema.FieldSchema(alias, schemaInField, type)); - bag.add(value); - } -} diff --git a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaQueryTest.java b/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaQueryTest.java deleted file mode 100644 index a0b549a737f..00000000000 --- a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaQueryTest.java +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
-package com.yahoo.vespa.hadoop.pig; - -import com.sun.net.httpserver.HttpServer; -import com.yahoo.vespa.hadoop.util.MockQueryHandler; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hdfs.HdfsConfiguration; -import org.apache.pig.ExecType; -import org.apache.pig.PigServer; -import org.apache.pig.data.Tuple; -import org.junit.jupiter.api.Test; - -import java.net.InetSocketAddress; -import java.util.Arrays; -import java.util.HashMap; -import java.util.Iterator; -import java.util.List; -import java.util.Map; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -public class VespaQueryTest { - - @Test - public void requireThatQueriesAreReturnedCorrectly() throws Exception { - runQueryTest("src/test/pig/query.pig", createQueryHandler(""), 18901); - } - - @Test - public void requireThatQueriesAreReturnedCorrectlyWithAlternativeJsonRoot() throws Exception { - runQueryTest("src/test/pig/query_alt_root.pig", createQueryHandler("children"), 18902); - } - - private void runQueryTest(String script, MockQueryHandler queryHandler, int port) throws Exception { - final String endpoint = "http://localhost:" + port; - - HttpServer server = HttpServer.create(new InetSocketAddress(port), 0); - server.createContext("/", queryHandler); - server.start(); - - PigServer ps = setup(script, endpoint); - - Iterator<Tuple> recommendations = ps.openIterator("recommendations"); - while (recommendations.hasNext()) { - Tuple tuple = recommendations.next(); - - String userid = (String) tuple.get(0); - Integer rank = (Integer) tuple.get(1); - String docid = (String) tuple.get(2); - Double relevance = (Double) tuple.get(3); - String fieldId = (String) tuple.get(4); - String fieldContent = (String) tuple.get(5); - - MockQueryHandler.MockQueryHit hit = queryHandler.getHit(userid, rank); - assertEquals(docid, hit.id); - assertEquals(relevance, hit.relevance, 1e-3); - assertEquals(fieldId, hit.fieldId); - assertEquals(fieldContent, hit.fieldContent); - } - - if (server != null) { - server.stop(0); - } - - } - - private PigServer setup(String script, String endpoint) throws Exception { - Configuration conf = new HdfsConfiguration(); - Map<String, String> parameters = new HashMap<>(); - parameters.put("ENDPOINT", endpoint); - - PigServer ps = new PigServer(ExecType.LOCAL, conf); - ps.setBatchOn(); - ps.registerScript(script, parameters); - - return ps; - } - - private MockQueryHandler createQueryHandler(String childNode) { - MockQueryHandler queryHandler = new MockQueryHandler(childNode); - - List<String> userIds = Arrays.asList("5", "104", "313"); - - int hitsPerUser = 3; - for (int i = 0; i < hitsPerUser * userIds.size(); ++i) { - String id = "" + (i+1); - String userId = userIds.get(i / hitsPerUser); - queryHandler.newHit(). - setId("id::::" + id). - setRelevance(1.0 - (i % hitsPerUser) * 0.1). - setFieldSddocname("doctype"). - setFieldId("" + id). - setFieldDate("2016060" + id). - setFieldContent("Content for user " + userId + " hit " + i % hitsPerUser + "..."). - add(userId); - } - - return queryHandler; - } - -} diff --git a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaStorageTest.java b/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaStorageTest.java deleted file mode 100644 index 3183c770bc7..00000000000 --- a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaStorageTest.java +++ /dev/null @@ -1,106 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
-package com.yahoo.vespa.hadoop.pig; - -import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration; -import com.yahoo.vespa.hadoop.mapreduce.util.VespaCounters; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.hdfs.HdfsConfiguration; -import org.apache.hadoop.mapred.Counters; -import org.apache.pig.ExecType; -import org.apache.pig.PigServer; -import org.apache.pig.backend.executionengine.ExecJob; -import org.apache.pig.tools.pigstats.JobStats; -import org.apache.pig.tools.pigstats.PigStats; -import org.apache.pig.tools.pigstats.mapreduce.MRJobStats; -import org.junit.jupiter.api.Test; - -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; - -public class VespaStorageTest { - - @Test - public void requireThatPremadeOperationsFeedSucceeds() throws Exception { - assertAllDocumentsOk("src/test/pig/feed_operations.pig"); - } - - - @Test - public void requireThatPremadeMultilineOperationsFeedSucceeds() throws Exception { - assertAllDocumentsOk("src/test/pig/feed_multiline_operations.pig"); - } - - - @Test - public void requireThatPremadeOperationsWithJsonLoaderFeedSucceeds() throws Exception { - assertAllDocumentsOk("src/test/pig/feed_operations_with_json_loader.pig"); - } - - @Test - public void requireThatPremadeOperationsWithJsonLoaderFeedAndNonLegacyClientSucceeds() throws Exception { - Configuration conf = new HdfsConfiguration(); - conf.set(VespaConfiguration.USE_SSL, Boolean.TRUE.toString()); - assertAllDocumentsOk("src/test/pig/feed_operations_with_json_loader.pig", conf); - } - - @Test - public void requireThatCreateOperationsFeedSucceeds() throws Exception { - assertAllDocumentsOk("src/test/pig/feed_create_operations.pig"); - } - - - @Test - public void requireThatCreateOperationsShortFormFeedSucceeds() throws Exception { - assertAllDocumentsOk("src/test/pig/feed_create_operations_short_form.pig"); - } - - - @Test - public void requireThatFeedVisitDataSucceeds() throws Exception { - assertAllDocumentsOk("src/test/pig/feed_visit_data.pig"); - } - - - private PigServer setup(String script, Configuration conf) throws Exception { - if (conf == null) { - conf = new HdfsConfiguration(); - } - conf.setIfUnset(VespaConfiguration.DRYRUN, "true"); - conf.setIfUnset(VespaConfiguration.ENDPOINT, "dummy-endpoint"); - - // Parameter substitutions - can also be set by configuration - Map<String, String> parameters = new HashMap<>(); - parameters.put("ENDPOINT", "endpoint-does-not-matter-in-dryrun,another-endpoint-that-does-not-matter"); - - PigServer ps = new PigServer(ExecType.LOCAL, conf); - ps.setBatchOn(); - ps.registerScript(script, parameters); - - return ps; - } - - - private void assertAllDocumentsOk(String script) throws Exception { - assertAllDocumentsOk(script, null); - } - - - private void assertAllDocumentsOk(String script, Configuration conf) throws Exception { - PigServer ps = setup(script, conf); - List<ExecJob> jobs = ps.executeBatch(); - PigStats stats = jobs.get(0).getStatistics(); - for (JobStats js : stats.getJobGraph()) { - Counters hadoopCounters = ((MRJobStats)js).getHadoopCounters(); - assertNotNull(hadoopCounters); - VespaCounters counters = VespaCounters.get(hadoopCounters); - assertEquals(10, counters.getDocumentsSent()); - assertEquals(0, counters.getDocumentsFailed()); - assertEquals(10, counters.getDocumentsOk()); - } - } - -}
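VespaStorageTest, removed above, never contacts a real cluster: setup() forces VespaConfiguration.DRYRUN, so VespaStorage only counts documents, and the assertions then read those counters off the job stats. A condensed, self-contained sketch of that dry-run wiring, reusing the constants and script paths from the deleted test; the class name and parameter values are illustrative:

    import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.HdfsConfiguration;
    import org.apache.pig.ExecType;
    import org.apache.pig.PigServer;
    import java.util.Map;

    public class DryRunFeed {
        public static void main(String[] args) throws Exception {
            Configuration conf = new HdfsConfiguration();
            conf.setIfUnset(VespaConfiguration.DRYRUN, "true");             // count documents, do not feed
            conf.setIfUnset(VespaConfiguration.ENDPOINT, "dummy-endpoint"); // never contacted in dry-run
            PigServer ps = new PigServer(ExecType.LOCAL, conf);
            ps.setBatchOn();
            ps.registerScript("src/test/pig/feed_operations.pig", Map.of("ENDPOINT", "dummy"));
            ps.executeBatch(); // the deleted test then reads VespaCounters off the job statistics
        }
    }

diff --git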
a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/MockQueryHandler.java b/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/MockQueryHandler.java deleted file mode 100644 index 64c160ea14c..00000000000 --- a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/MockQueryHandler.java +++ /dev/null @@ -1,219 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -package com.yahoo.vespa.hadoop.util; - -import com.fasterxml.jackson.core.JsonEncoding; -import com.fasterxml.jackson.core.JsonFactory; -import com.fasterxml.jackson.core.JsonGenerator; -import com.sun.net.httpserver.HttpExchange; -import com.sun.net.httpserver.HttpHandler; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.io.OutputStream; -import java.net.URI; -import java.net.URLDecoder; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.List; -import java.util.Map; - -public class MockQueryHandler implements HttpHandler { - - private final Map<String, List<MockQueryHit>> hitMap; - private final String childNode; - - public MockQueryHandler(String childNode) { - this.hitMap = new HashMap<>(); - this.childNode = childNode; - } - - public void handle(HttpExchange t) throws IOException { - URI uri = t.getRequestURI(); - String query = uri.getQuery(); - String response = null; - - // Parse query - extract "query" element - if (query != null) { - String params[] = query.split("[&]"); - for (String param : params) { - int i = param.indexOf('='); - String name = param.substring(0, i); - String value = URLDecoder.decode(param.substring(i + 1), "UTF-8"); - - if ("query".equalsIgnoreCase(name)) { - response = getResponse(URLDecoder.decode(param.substring(i + 1), "UTF-8")); - } - } - } - - t.sendResponseHeaders(200, response == null ? 0 : response.length()); - OutputStream os = t.getResponseBody(); - os.write(response == null ? 
"".getBytes() : response.getBytes()); - os.close(); - - } - - public MockQueryHit getHit(String query, Integer rank) { - if (!hitMap.containsKey(query)) { - return null; - } - if (rank >= hitMap.get(query).size()) { - return null; - } - return hitMap.get(query).get(rank); - } - - public MockQueryHit newHit() { - return new MockQueryHit(this); - } - - public void addHit(String query, MockQueryHit hit) { - if (!hitMap.containsKey(query)) { - hitMap.put(query, new ArrayList<>()); - } - hitMap.get(query).add(hit); - } - - private String getResponse(String query) throws IOException { - List<MockQueryHit> hits = hitMap.get(query); - if (hits == null) { - return null; - } - - ByteArrayOutputStream out = new ByteArrayOutputStream(); - JsonGenerator g = new JsonFactory().createGenerator(out, JsonEncoding.UTF8); - - writeResultStart(g, hits.size()); - for (MockQueryHit hit : hits) { - writeHit(g, hit); - } - writeResultsEnd(g); - g.close(); - - return out.toString(); - } - - private void writeHit(JsonGenerator g, MockQueryHit hit) throws IOException { - g.writeStartObject(); - - g.writeFieldName("id"); - g.writeString(hit.id); - - g.writeFieldName("relevance"); - g.writeNumber(hit.relevance); - - g.writeFieldName("fields"); - g.writeStartObject(); - - g.writeFieldName("sddocname"); - g.writeString(hit.fieldSddocname); - - g.writeFieldName("date"); - g.writeString(hit.fieldDate); - - g.writeFieldName("content"); - g.writeString(hit.fieldContent); - - g.writeFieldName("id"); - g.writeString(hit.fieldId); - - g.writeEndObject(); - g.writeEndObject(); - } - - private void writeResultStart(JsonGenerator g, int count) throws IOException { - g.writeStartObject(); - g.writeFieldName("root"); - - g.writeStartObject(); - - g.writeFieldName("id"); - g.writeString("toplevel"); - - g.writeFieldName("relevance"); - g.writeNumber(1); - - g.writeFieldName("fields"); - g.writeStartObject(); - g.writeFieldName("totalCount"); - g.writeNumber(count); - g.writeEndObject(); - - g.writeFieldName("coverage"); - g.writeStartObject(); - g.writeFieldName("coverage"); - g.writeNumber(100); - // ... 
more stuff here usually - g.writeEndObject(); - - g.writeFieldName("children"); - g.writeStartArray(); - - if (!childNode.isEmpty()) { - g.writeStartObject(); - g.writeFieldName(childNode); - g.writeStartArray(); - } - } - - private void writeResultsEnd(JsonGenerator g) throws IOException { - if (!childNode.isEmpty()) { - g.writeEndArray(); - g.writeEndObject(); - } - g.writeEndArray(); - g.writeEndObject(); - g.writeEndObject(); - } - - public static class MockQueryHit { - - private final MockQueryHandler handler; - - public String id; - public Double relevance; - public String fieldSddocname; - public String fieldDate; - public String fieldContent; - public String fieldId; - - private MockQueryHit(MockQueryHandler handler) { - this.handler = handler; - } - - public void add(String query) { - handler.addHit(query, this); - } - - public MockQueryHit setId(String id) { - this.id = id; - return this; - } - - public MockQueryHit setRelevance(Double relevance) { - this.relevance = relevance; - return this; - } - - public MockQueryHit setFieldSddocname(String fieldSddocname) { - this.fieldSddocname = fieldSddocname; - return this; - } - - public MockQueryHit setFieldDate(String fieldDate) { - this.fieldDate = fieldDate; - return this; - } - - public MockQueryHit setFieldContent(String fieldContent) { - this.fieldContent = fieldContent; - return this; - } - - public MockQueryHit setFieldId(String fieldId) { - this.fieldId = fieldId; - return this; - } - } - -} diff --git a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/TupleToolsTest.java b/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/TupleToolsTest.java deleted file mode 100644 index b4ccbdf2183..00000000000 --- a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/TupleToolsTest.java +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
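MockQueryHandler, removed above, builds its fake query responses by streaming tokens with Jackson's JsonGenerator rather than assembling a tree. One detail worth noting: handle() passes response.length() (UTF-16 code units) to sendResponseHeaders but writes response.getBytes() (platform-encoded bytes); the two only agree for ASCII payloads like this test data. A minimal sketch of the same streaming pattern that sizes the response from the encoded bytes instead; the class name is illustrative:

    import com.fasterxml.jackson.core.JsonEncoding;
    import com.fasterxml.jackson.core.JsonFactory;
    import com.fasterxml.jackson.core.JsonGenerator;
    import java.io.ByteArrayOutputStream;
    import java.io.IOException;

    public class HitJson {
        // Streams one hit object, the same pattern as writeHit() above.
        static byte[] oneHit(String id, double relevance) throws IOException {
            ByteArrayOutputStream out = new ByteArrayOutputStream();
            try (JsonGenerator g = new JsonFactory().createGenerator(out, JsonEncoding.UTF8)) {
                g.writeStartObject();
                g.writeStringField("id", id);               // writeFieldName + writeString in one call
                g.writeNumberField("relevance", relevance);
                g.writeEndObject();
            }
            return out.toByteArray(); // a byte count, safe to pass to sendResponseHeaders
        }
    }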
-package com.yahoo.vespa.hadoop.util; - -import com.yahoo.vespa.hadoop.mapreduce.util.TupleTools; -import org.apache.pig.data.DataType; -import org.apache.pig.data.Tuple; -import org.apache.pig.data.TupleFactory; -import org.apache.pig.impl.logicalLayer.schema.Schema; -import org.junit.jupiter.api.Test; - -import java.io.IOException; - -import static org.junit.jupiter.api.Assertions.assertEquals; - -public class TupleToolsTest { - - @Test - public void requireThatTupleToStringHandlesSimpleTypes() throws IOException { - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - - addToTuple("id", DataType.CHARARRAY, "123", schema, tuple); - addToTuple("rank", DataType.INTEGER, 1, schema, tuple); - - String template = "Id is <id> and rank is <rank>"; - String result = TupleTools.toString(schema, tuple, template); - - assertEquals("Id is 123 and rank is 1", result); - } - - - private void addToTuple(String alias, byte type, Object value, Schema schema, Tuple tuple) { - schema.add(new Schema.FieldSchema(alias, type)); - tuple.append(value); - } - - @Test - public void requireThatTupleToStringHandlesStringCharacters() throws IOException { - Schema schema = new Schema(); - Tuple tuple = TupleFactory.getInstance().newTuple(); - - addToTuple("id", DataType.CHARARRAY, "_!@#$%^&*()", schema, tuple); - addToTuple("rank", DataType.INTEGER, 1, schema, tuple); - - String template = "Id is <id> and rank is <rank>"; - String result = TupleTools.toString(schema, tuple, template); - - assertEquals("Id is _!@#$%^&*() and rank is 1", result); - } - -} diff --git a/vespa-hadoop/src/test/pig/feed_create_operations.pig b/vespa-hadoop/src/test/pig/feed_create_operations.pig deleted file mode 100644 index 4583c095133..00000000000 --- a/vespa-hadoop/src/test/pig/feed_create_operations.pig +++ /dev/null @@ -1,24 +0,0 @@ --- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. --- REGISTER vespa-hadoop.jar -- Not needed in tests - --- Create valid Vespa put operations -DEFINE VespaPutOperation - com.yahoo.vespa.hadoop.pig.VespaDocumentOperation( - 'operation=put', - 'docid=id:<application>:metrics::<name>-<date>' - ); - --- By default, VespaStorage assumes it's feeding valid Vespa operations -DEFINE VespaStorage - com.yahoo.vespa.hadoop.pig.VespaStorage(); - --- Load tabular data -metrics = LOAD 'src/test/resources/tabular_data.csv' AS (date:chararray, name:chararray, value:int, application:chararray); - --- Transform tabular data to a Vespa document operation JSON format -metrics = FOREACH metrics GENERATE VespaPutOperation(*); - --- Store into Vespa -STORE metrics INTO '$ENDPOINT' USING VespaStorage(); - - diff --git a/vespa-hadoop/src/test/pig/feed_create_operations_short_form.pig b/vespa-hadoop/src/test/pig/feed_create_operations_short_form.pig deleted file mode 100644 index 0f0e63d843a..00000000000 --- a/vespa-hadoop/src/test/pig/feed_create_operations_short_form.pig +++ /dev/null @@ -1,19 +0,0 @@ --- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. --- REGISTER vespa-hadoop.jar -- Not needed in tests - --- Transform tabular data to a Vespa document operation JSON format --- as part of storing the data. 
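TupleToolsTest above pins down the contract of TupleTools.toString: aliases in angle brackets are replaced by the tuple values bound to them in the schema. A hypothetical re-implementation of just that substitution step (not the actual TupleTools code), useful for seeing why both test templates expand the way they do:

    import java.util.Map;
    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class TemplateSketch {
        private static final Pattern TOKEN = Pattern.compile("<(\\w+)>");

        // Hypothetical stand-in for TupleTools.toString: replace each <alias>
        // token with the value bound to that alias, leaving unknown tokens as-is.
        static String expand(String template, Map<String, Object> values) {
            Matcher m = TOKEN.matcher(template);
            StringBuilder sb = new StringBuilder();
            while (m.find()) {
                Object v = values.get(m.group(1));
                m.appendReplacement(sb, Matcher.quoteReplacement(v == null ? m.group() : v.toString()));
            }
            m.appendTail(sb);
            return sb.toString();
        }

        public static void main(String[] args) {
            System.out.println(expand("Id is <id> and rank is <rank>", Map.of("id", "123", "rank", 1)));
            // prints: Id is 123 and rank is 1, matching the first deleted test
        }
    }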
-DEFINE VespaStorage - com.yahoo.vespa.hadoop.pig.VespaStorage( - 'create-document-operation=true', - 'operation=put', - 'docid=id:<application>:metrics::<name>-<date>' - ); - --- Load tabular data -metrics = LOAD 'src/test/resources/tabular_data.csv' AS (date:chararray, name:chararray, value:int, application:chararray); - --- Store into Vespa -STORE metrics INTO '$ENDPOINT' USING VespaStorage(); - - diff --git a/vespa-hadoop/src/test/pig/feed_multiline_operations.pig b/vespa-hadoop/src/test/pig/feed_multiline_operations.pig deleted file mode 100644 index 1971270cbdc..00000000000 --- a/vespa-hadoop/src/test/pig/feed_multiline_operations.pig +++ /dev/null @@ -1,15 +0,0 @@ --- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. --- REGISTER vespa-hadoop.jar -- Not needed in tests - --- Define short name for VespaJsonLoader -DEFINE VespaJsonLoader com.yahoo.vespa.hadoop.pig.VespaSimpleJsonLoader(); - --- Define short name for VespaStorage -DEFINE VespaStorage com.yahoo.vespa.hadoop.pig.VespaStorage(); - --- Load data - one column for json data -metrics = LOAD 'src/test/resources/operations_multiline_data.json' USING VespaJsonLoader() AS (data:chararray); - --- Store into Vespa -STORE metrics INTO '$ENDPOINT' USING VespaStorage(); - diff --git a/vespa-hadoop/src/test/pig/feed_operations.pig b/vespa-hadoop/src/test/pig/feed_operations.pig deleted file mode 100644 index 48873fde87a..00000000000 --- a/vespa-hadoop/src/test/pig/feed_operations.pig +++ /dev/null @@ -1,11 +0,0 @@ --- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. --- REGISTER vespa-hadoop.jar -- Not needed in tests - --- Define short name for VespaStorage -DEFINE VespaStorage com.yahoo.vespa.hadoop.pig.VespaStorage(); - --- Load data - one column for json data -metrics = LOAD 'src/test/resources/operations_data.json' AS (data:chararray); - --- Store into Vespa -STORE metrics INTO '$ENDPOINT' USING VespaStorage(); diff --git a/vespa-hadoop/src/test/pig/feed_operations_with_json_loader.pig b/vespa-hadoop/src/test/pig/feed_operations_with_json_loader.pig deleted file mode 100644 index da58fe3c678..00000000000 --- a/vespa-hadoop/src/test/pig/feed_operations_with_json_loader.pig +++ /dev/null @@ -1,14 +0,0 @@ --- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. --- REGISTER vespa-hadoop.jar -- Not needed in tests - --- Define short name for VespaJsonLoader -DEFINE VespaJsonLoader com.yahoo.vespa.hadoop.pig.VespaSimpleJsonLoader(); - --- Define short name for VespaStorage -DEFINE VespaStorage com.yahoo.vespa.hadoop.pig.VespaStorage(); - --- Load data - one column for json data -metrics = LOAD 'src/test/resources/operations_data.json' USING VespaJsonLoader() AS (data:chararray); - --- Store into Vespa -STORE metrics INTO '$ENDPOINT' USING VespaStorage(); diff --git a/vespa-hadoop/src/test/pig/feed_operations_xml.pig b/vespa-hadoop/src/test/pig/feed_operations_xml.pig deleted file mode 100644 index 4e5057f4909..00000000000 --- a/vespa-hadoop/src/test/pig/feed_operations_xml.pig +++ /dev/null @@ -1,11 +0,0 @@ --- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
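The feed scripts above and below differ only in where the document operation is created: feed_create_operations.pig calls VespaDocumentOperation explicitly in a FOREACH, the short form below pushes the same options into VespaStorage, and the JSON-loader variants feed pre-made operations verbatim. For one row of tabular_data.csv, the docid template id:<application>:metrics::<name>-<date> describes JSON of the following shape; the sketch builds it with Jackson purely for illustration, since the UDF's own serialization code is not part of this diff:

    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.node.ObjectNode;

    public class PutOperationSketch {
        public static void main(String[] args) throws Exception {
            ObjectMapper mapper = new ObjectMapper();
            String date = "2015110414", name = "clicks", application = "testapp";
            int value = 1;
            ObjectNode op = mapper.createObjectNode();
            // docid template expanded with the row's column values
            op.put("put", "id:" + application + ":metrics::" + name + "-" + date);
            ObjectNode fields = op.putObject("fields");
            fields.put("date", date);
            fields.put("name", name);
            fields.put("value", value);
            fields.put("application", application);
            System.out.println(mapper.writeValueAsString(op));
        }
    }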
--- REGISTER vespa-hadoop.jar -- Not needed in tests - --- Define short name for VespaStorage -DEFINE VespaStorage com.yahoo.vespa.hadoop.pig.VespaStorage(); - --- Load data - one column for xml data -data = LOAD 'src/test/resources/operations_data.xml' AS (data:chararray); - --- Store into Vespa -STORE data INTO '$ENDPOINT' USING VespaStorage(); diff --git a/vespa-hadoop/src/test/pig/feed_visit_data.pig b/vespa-hadoop/src/test/pig/feed_visit_data.pig deleted file mode 100644 index 59d144b53dc..00000000000 --- a/vespa-hadoop/src/test/pig/feed_visit_data.pig +++ /dev/null @@ -1,12 +0,0 @@ --- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. --- REGISTER vespa-hadoop.jar -- Not needed in tests - --- Define short name for VespaStorage -DEFINE VespaStorage com.yahoo.vespa.hadoop.pig.VespaStorage(); - --- Load data - one column for json data -metrics = LOAD 'src/test/resources/visit_data.json' AS (data:chararray); - --- Store into Vespa -STORE metrics INTO '$ENDPOINT' USING VespaStorage(); - diff --git a/vespa-hadoop/src/test/pig/query.pig b/vespa-hadoop/src/test/pig/query.pig deleted file mode 100644 index 96caa5cd0c4..00000000000 --- a/vespa-hadoop/src/test/pig/query.pig +++ /dev/null @@ -1,19 +0,0 @@ --- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. --- REGISTER vespa-hadoop.jar -- Not needed in tests - --- Define Vespa query for retrieving blog posts -DEFINE BlogPostRecommendations - com.yahoo.vespa.hadoop.pig.VespaQuery( - 'query=$ENDPOINT/search?query=<userid>&hits=100', - 'schema=rank:int,id:chararray,relevance:double,fields/id:chararray,fields/content:chararray' - ); - --- Load data from a local file -users = LOAD 'src/test/resources/user_ids.csv' AS (userid:chararray); -users = FILTER users BY userid IS NOT null; - --- Run a set of queries against Vespa -recommendations = FOREACH users GENERATE userid, FLATTEN(BlogPostRecommendations(*)); - --- Output recommendations -DUMP recommendations; diff --git a/vespa-hadoop/src/test/pig/query_alt_root.pig b/vespa-hadoop/src/test/pig/query_alt_root.pig deleted file mode 100644 index 2884b4a600f..00000000000 --- a/vespa-hadoop/src/test/pig/query_alt_root.pig +++ /dev/null @@ -1,20 +0,0 @@ --- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
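query.pig above and query_alt_root.pig below are identical except for 'rootnode=root/children/children', which tells VespaQuery where in the response JSON the hit array lives. A hypothetical helper showing what such a slash-separated root path has to resolve, including stepping into arrays along the way; this is not the UDF's actual code:

    import com.fasterxml.jackson.databind.JsonNode;
    import com.fasterxml.jackson.databind.ObjectMapper;

    public class RootNodeSketch {
        // Walk a slash-separated path, descending into the first element
        // whenever the current node is an array.
        static JsonNode resolve(JsonNode node, String path) {
            for (String step : path.split("/")) {
                if (node.isArray()) node = node.path(0);
                node = node.path(step); // path() yields MissingNode, never null
            }
            return node;
        }

        public static void main(String[] args) throws Exception {
            String json = "{\"root\":{\"children\":[{\"children\":[{\"id\":\"doc1\"}]}]}}";
            JsonNode hits = resolve(new ObjectMapper().readTree(json), "root/children/children");
            System.out.println(hits); // [{"id":"doc1"}]
        }
    }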
--- REGISTER vespa-hadoop.jar -- Not needed in tests - --- Define Vespa query for retrieving blog posts -DEFINE BlogPostRecommendations - com.yahoo.vespa.hadoop.pig.VespaQuery( - 'query=$ENDPOINT/search?query=<userid>&hits=100', - 'rootnode=root/children/children', - 'schema=rank:int,id:chararray,relevance:double,fields/id:chararray,fields/content:chararray' - ); - --- Load data from a local file -users = LOAD 'src/test/resources/user_ids.csv' AS (userid:chararray); -users = FILTER users BY userid IS NOT null; - --- Run a set of queries against Vespa -recommendations = FOREACH users GENERATE userid, FLATTEN(BlogPostRecommendations(*)); - --- Output recommendations -DUMP recommendations; diff --git a/vespa-hadoop/src/test/resources/operations_data.json b/vespa-hadoop/src/test/resources/operations_data.json deleted file mode 100644 index 5af436dbfe7..00000000000 --- a/vespa-hadoop/src/test/resources/operations_data.json +++ /dev/null @@ -1,10 +0,0 @@ -{"put":"id:testapp:metric::clicks-2015110414","fields":{"date":"2015110414","name":"clicks","value":1,"application":"testapp"}} -{"fields":{"date":"2015110416","name":"clicks","value":5,"application":"testapp"},"put":"id:testapp:metric::clicks-2015110416"} -{"put":"id:testapp:metric::clicks-2015110415","fields":{"date":"2015110415","name":"clicks","value":2,"application":"testapp"}} -{"put":"id:testapp:metric::clicks-2015110417","fields":{"date":"2015110417","name":"clicks","value":3,"application":"testapp"}} -{"put":"id:testapp:metric::clicks-2015110418","fields":{"date":"2015110418","name":"clicks","value":6,"application":"testapp"}} -{"put":"id:testapp:metric::clicks-2015110419","fields":{"date":"2015110419","name":"clicks","value":3,"application":"testapp"}} -{"put":"id:testapp:metric::clicks-2015110420","fields":{"date":"2015110420","name":"clicks","value":4,"application":"testapp"}} -{"put":"id:testapp:metric::clicks-2015110421","fields":{"date":"2015110421","name":"clicks","value":2,"application":"testapp"}} -{"fields":{"date":"2015110422","name":"clicks","value":5,"application":"testapp"},"condition":"metrics==0","put":"id:testapp:metric::clicks-2015110422"} -{"put":"id:testapp:metric::clicks-2015110423","fields":{"date":"2015110423","name":"clicks","value":1,"application":"testapp"}} diff --git a/vespa-hadoop/src/test/resources/operations_data.xml b/vespa-hadoop/src/test/resources/operations_data.xml deleted file mode 100644 index db02b6bee73..00000000000 --- a/vespa-hadoop/src/test/resources/operations_data.xml +++ /dev/null @@ -1,14 +0,0 @@ -<?xml version="1.0" encoding="utf-8"?> -<!-- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. 
--> -<vespafeed> - <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/a-ha/Scoundrel+Days"> <url>http://music.yahoo.com/a-ha/Scoundrel+Days</url> <title><![CDATA[Scoundrel Days]]></title> <artist><![CDATA[a-ha]]></artist> <year>0</year> <popularity>290</popularity> </document> - <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Accept/Restless+And+Wild"> <url>http://music.yahoo.com/Accept/Restless+And+Wild</url> <title><![CDATA[Restless And Wild]]></title> <artist><![CDATA[Accept]]></artist> <year>0</year> <popularity>75</popularity> </document> - <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Accept/Staying+A+Life"> <url>http://music.yahoo.com/Accept/Staying+A+Life</url> <title><![CDATA[Staying A Life]]></title> <artist><![CDATA[Accept]]></artist> <year>1985</year> <popularity>77</popularity> </document> - <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Alice+In+Chains/Dirt"> <url>http://music.yahoo.com/Alice+In+Chains/Dirt</url> <title><![CDATA[Dirt]]></title> <artist><![CDATA[Alice In Chains]]></artist> <year>1992</year> <popularity>114</popularity> </document> - <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Alice+In+Chains/Live"> <url>http://music.yahoo.com/Alice+In+Chains/Live</url> <title><![CDATA[Live]]></title> <artist><![CDATA[Alice In Chains]]></artist> <year>1990</year> <popularity>363</popularity> </document> - <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Amy+MacDonald/This+Is+The+Life"> <url>http://music.yahoo.com/Amy+MacDonald/This+Is+The+Life</url> <title><![CDATA[This Is The Life]]></title> <artist><![CDATA[Amy MacDonald]]></artist> <year>2007</year> <popularity>355</popularity> </document> - <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Ane+Brun/Duets"> <url>http://music.yahoo.com/Ane+Brun/Duets</url> <title><![CDATA[Duets]]></title> <artist><![CDATA[Ane Brun]]></artist> <year>0</year> <popularity>255</popularity> </document> - <update documenttype="music" documentid="id:music:music::http://music.yahoo.com/bobdylan/BestOf"><assign field="title">The Best of Bob Dylan</assign><add field="tracks"><item>Man Of Constant Sorrow</item></add></update> - <remove documentid="id:music:music::http://music.yahoo.com/Aqpop/Beautifully+Smart" /> - <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Annuals/Be+He+Me"> <url>http://music.yahoo.com/Annuals/Be+He+Me</url> <title><![CDATA[Be He Me]]></title> <artist><![CDATA[Annuals]]></artist> <year>0</year> <popularity>207</popularity> </document> -</vespafeed> diff --git a/vespa-hadoop/src/test/resources/operations_multiline_data.json b/vespa-hadoop/src/test/resources/operations_multiline_data.json deleted file mode 100644 index 2b51698d9b7..00000000000 --- a/vespa-hadoop/src/test/resources/operations_multiline_data.json +++ /dev/null @@ -1,93 +0,0 @@ -[ - { - "put": "id:testapp:metric::clicks-2015110414", - "fields": { - "date": "2015110414", - "name": "clicks", - "value": 1, - "application": "testapp" - } - }, - { - "fields": { - "date": "2015110416", - "name": "clicks", - "value": 5, - "application": "testapp" - }, - "put": "id:testapp:metric::clicks-2015110416" - }, - { - "put": "id:testapp:metric::clicks-2015110415", - "fields": { - "date": "2015110415", - "name": "clicks", - "value": 2, - "application": "testapp" - } - }, - { - "put": 
"id:testapp:metric::clicks-2015110417", - "fields": { - "date": "2015110417", - "name": "clicks", - "value": 3, - "application": "testapp" - } - }, - { - "put": "id:testapp:metric::clicks-2015110418", - "fields": { - "date": "2015110418", - "name": "clicks", - "value": 6, - "application": "testapp" - } - }, - { - "put": "id:testapp:metric::clicks-2015110419", - "fields": { - "date": "2015110419", - "name": "clicks", - "value": 3, - "application": "testapp" - } - }, - { - "put": "id:testapp:metric::clicks-2015110420", - "fields": { - "date": "2015110420", - "name": "clicks", - "value": 4, - "application": "testapp" - } - }, - { - "put": "id:testapp:metric::clicks-2015110421", - "fields": { - "date": "2015110421", - "name": "clicks", - "value": 2, - "application": "testapp" - } - }, - { - "fields": { - "date": "2015110422", - "name": "clicks", - "value": 5, - "application": "testapp" - }, - "condition": "metrics==0", - "put": "id:testapp:metric::clicks-2015110422" - }, - { - "put": "id:testapp:metric::clicks-2015110423", - "fields": { - "date": "2015110423", - "name": "clicks", - "value": 1, - "application": "testapp" - } - } -] diff --git a/vespa-hadoop/src/test/resources/tabular_data.csv b/vespa-hadoop/src/test/resources/tabular_data.csv deleted file mode 100644 index 541597998e9..00000000000 --- a/vespa-hadoop/src/test/resources/tabular_data.csv +++ /dev/null @@ -1,11 +0,0 @@ -2015110414 clicks 1 testapp -2015110415 clicks 2 testapp -2015110416 clicks 5 testapp -2015110417 clicks 3 testapp -2015110418 clicks 6 testapp -2015110419 clicks 3 testapp -2015110420 clicks 4 testapp -2015110421 clicks 2 testapp -2015110422 clicks 5 testapp -2015110423 clicks 1 testapp - diff --git a/vespa-hadoop/src/test/resources/user_ids.csv b/vespa-hadoop/src/test/resources/user_ids.csv deleted file mode 100644 index 5875a3b9a7c..00000000000 --- a/vespa-hadoop/src/test/resources/user_ids.csv +++ /dev/null @@ -1,4 +0,0 @@ -5 -104 -313 - diff --git a/vespa-hadoop/src/test/resources/visit_data.json b/vespa-hadoop/src/test/resources/visit_data.json deleted file mode 100644 index 947b9326cc8..00000000000 --- a/vespa-hadoop/src/test/resources/visit_data.json +++ /dev/null @@ -1,10 +0,0 @@ -{"id":"id:testapp:metric::clicks-2015110414","fields":{"date":"2015110414","name":"clicks","value":1,"application":"testapp"}} -{"id":"id:testapp:metric::clicks-2015110415","fields":{"date":"2015110415","name":"clicks","value":2,"application":"testapp"}} -{"id":"id:testapp:metric::clicks-2015110416","fields":{"date":"2015110416","name":"clicks","value":4,"application":"testapp"}} -{"id":"id:testapp:metric::clicks-2015110417","fields":{"date":"2015110417","name":"clicks","value":3,"application":"testapp"}} -{"id":"id:testapp:metric::clicks-2015110418","fields":{"date":"2015110418","name":"clicks","value":6,"application":"testapp"}} -{"id":"id:testapp:metric::clicks-2015110419","fields":{"date":"2015110419","name":"clicks","value":3,"application":"testapp"}} -{"id":"id:testapp:metric::clicks-2015110420","fields":{"date":"2015110420","name":"clicks","value":4,"application":"testapp"}} -{"id":"id:testapp:metric::clicks-2015110421","fields":{"date":"2015110421","name":"clicks","value":2,"application":"testapp"}} -{"id":"id:testapp:metric::clicks-2015110422","fields":{"date":"2015110422","name":"clicks","value":7,"application":"testapp"}} -{"id":"id:testapp:metric::clicks-2015110423","fields":{"date":"2015110423","name":"clicks","value":1,"application":"testapp"}}
\ No newline at end of file
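The two line-oriented fixtures above differ only in their top-level key: visit_data.json uses "id" (visit output) while operations_data.json uses "put" (feed operations), with the same "fields" payload shape. A small Jackson sketch of bridging the two; the input line is taken from the fixture, the class name is illustrative:

    import com.fasterxml.jackson.databind.ObjectMapper;
    import com.fasterxml.jackson.databind.node.ObjectNode;

    public class VisitToPut {
        public static void main(String[] args) throws Exception {
            ObjectMapper mapper = new ObjectMapper();
            String visitLine = "{\"id\":\"id:testapp:metric::clicks-2015110414\","
                    + "\"fields\":{\"date\":\"2015110414\",\"name\":\"clicks\",\"value\":1,\"application\":\"testapp\"}}";
            ObjectNode doc = (ObjectNode) mapper.readTree(visitLine);
            doc.set("put", doc.remove("id")); // same document, rekeyed as a feed operation
            System.out.println(mapper.writeValueAsString(doc));
        }
    }

Key order in the rewritten object is irrelevant to the feed, as the mixed orderings within operations_data.json itself show.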