author     Henning Baldersheim <balder@yahoo-inc.com>  2022-11-06 16:24:56 +0000
committer  Henning Baldersheim <balder@yahoo-inc.com>  2022-11-06 16:24:56 +0000
commit     1acd63b6492fed791c6e1aa7c4daa437f0de30b3 (patch)
tree       5109495f9b9003124a87445334890eee1d5da0c5
parent     3b6df7e9ae9950744c6b8401026388f483f9c3a2 (diff)
No longer provide a repackaged hadoop/pig library
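The removed classes were a thin layer over vespa-feed-client, which remains a module in this build (see the pom.xml diff below). As a hedged sketch of the migration path, feeding a single JSON document operation directly with vespa-feed-client looks roughly like the following, mirroring what the deleted VespaRecordWriter did per record; the endpoint, document id and field name are placeholders, not taken from this commit:

import ai.vespa.feed.client.FeedClient;
import ai.vespa.feed.client.FeedClientBuilder;
import ai.vespa.feed.client.JsonFeeder;

import java.net.URI;
import java.time.Duration;
import java.util.List;

public class DirectFeedSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder endpoint; the deleted writer built this list from vespa.feed.endpoint.
        FeedClient client = FeedClientBuilder.create(List.of(URI.create("https://localhost:4443/"))).build();
        try (JsonFeeder feeder = JsonFeeder.builder(client).withTimeout(Duration.ofMinutes(10)).build()) {
            // One Vespa document operation in JSON, as the writer received it per record.
            feeder.feedSingle("{\"put\": \"id:ns:doc::1\", \"fields\": {\"title\": \"hello\"}}")
                  .whenComplete((result, error) -> {
                      if (error != null) error.printStackTrace();
                  });
        } // JsonFeeder.close() drains operations still in flight
    }
}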
-rw-r--r--  .gitignore | 2
-rw-r--r--  parent/pom.xml | 170
-rw-r--r--  pom.xml | 1
-rw-r--r--  vespa-hadoop/OWNERS | 1
-rw-r--r--  vespa-hadoop/README | 4
-rw-r--r--  vespa-hadoop/abi-spec.json | 1
-rw-r--r--  vespa-hadoop/pom.xml | 166
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputCommitter.java | 37
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputFormat.java | 60
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaRecordWriter.java | 153
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaSimpleJsonInputFormat.java | 99
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/package-info.java | 10
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/TupleTools.java | 70
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaConfiguration.java | 194
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaCounters.java | 105
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaHttpClient.java | 102
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaQuerySchema.java | 114
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/package-info.java | 10
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperation.java | 669
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaQuery.java | 114
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaSimpleJsonLoader.java | 63
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaStorage.java | 178
-rw-r--r--  vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/package-info.java | 10
-rw-r--r--  vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/MapReduceTest.java | 200
-rw-r--r--  vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperationTest.java | 633
-rw-r--r--  vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaQueryTest.java | 101
-rw-r--r--  vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaStorageTest.java | 106
-rw-r--r--  vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/MockQueryHandler.java | 219
-rw-r--r--  vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/TupleToolsTest.java | 51
-rw-r--r--  vespa-hadoop/src/test/pig/feed_create_operations.pig | 24
-rw-r--r--  vespa-hadoop/src/test/pig/feed_create_operations_short_form.pig | 19
-rw-r--r--  vespa-hadoop/src/test/pig/feed_multiline_operations.pig | 15
-rw-r--r--  vespa-hadoop/src/test/pig/feed_operations.pig | 11
-rw-r--r--  vespa-hadoop/src/test/pig/feed_operations_with_json_loader.pig | 14
-rw-r--r--  vespa-hadoop/src/test/pig/feed_operations_xml.pig | 11
-rw-r--r--  vespa-hadoop/src/test/pig/feed_visit_data.pig | 12
-rw-r--r--  vespa-hadoop/src/test/pig/query.pig | 19
-rw-r--r--  vespa-hadoop/src/test/pig/query_alt_root.pig | 20
-rw-r--r--  vespa-hadoop/src/test/resources/operations_data.json | 10
-rw-r--r--  vespa-hadoop/src/test/resources/operations_data.xml | 14
-rw-r--r--  vespa-hadoop/src/test/resources/operations_multiline_data.json | 93
-rw-r--r--  vespa-hadoop/src/test/resources/tabular_data.csv | 11
-rw-r--r--  vespa-hadoop/src/test/resources/user_ids.csv | 4
-rw-r--r--  vespa-hadoop/src/test/resources/visit_data.json | 10
44 files changed, 3 insertions, 3927 deletions
diff --git a/.gitignore b/.gitignore
index c77fe07eee7..adc898a7266 100644
--- a/.gitignore
+++ b/.gitignore
@@ -40,12 +40,10 @@ Testing
/build.ninja
/rules.ninja
*_test_app
-/hadoop/dependency-reduced-pom.xml
/mvnw
/mvnw.cmd
/mvnwDebug
/mvnwDebug.cmd
-/vespa-hadoop/dependency-reduced-pom.xml
.preprocessed/
.DS_Store
install_manifest.txt
diff --git a/parent/pom.xml b/parent/pom.xml
index 549d66e37a0..034daac34a3 100644
--- a/parent/pom.xml
+++ b/parent/pom.xml
@@ -495,61 +495,6 @@
<!-- No version property, as we don't want maven-dependency-plugin to alert about newer versions. -->
<version>3.1.9</version>
</dependency>
- <dependency> <!-- Control netty-all version -->
- <groupId>io.netty</groupId>
- <artifactId>netty-all</artifactId>
- <version>${netty.version}</version>
- </dependency>
- <dependency>
- <groupId>io.netty</groupId>
- <artifactId>netty-buffer</artifactId>
- <version>${netty.version}</version>
- </dependency>
- <dependency>
- <groupId>io.netty</groupId>
- <artifactId>netty-common</artifactId>
- <version>${netty.version}</version>
- </dependency>
- <dependency>
- <groupId>io.netty</groupId>
- <artifactId>netty-codec</artifactId>
- <version>${netty.version}</version>
- </dependency>
- <dependency>
- <groupId>io.netty</groupId>
- <artifactId>netty-codec-http2</artifactId>
- <version>${netty.version}</version>
- </dependency>
- <dependency>
- <groupId>io.netty</groupId>
- <artifactId>netty-codec-http</artifactId>
- <version>${netty.version}</version>
- </dependency>
- <dependency>
- <groupId>io.netty</groupId>
- <artifactId>netty-transport</artifactId>
- <version>${netty.version}</version>
- </dependency>
- <dependency>
- <groupId>io.netty</groupId>
- <artifactId>netty-transport-classes-epoll</artifactId>
- <version>${netty.version}</version>
- </dependency>
- <dependency> <!-- Control netty-handler version -->
- <groupId>io.netty</groupId>
- <artifactId>netty-handler</artifactId>
- <version>${netty.version}</version>
- </dependency>
- <dependency> <!-- Control netty-transport-native-epoll version -->
- <groupId>io.netty</groupId>
- <artifactId>netty-transport-native-epoll</artifactId>
- <version>${netty.version}</version>
- </dependency>
- <dependency> <!-- Control netty-handler version -->
- <groupId>io.netty</groupId>
- <artifactId>netty-tcnative</artifactId>
- <version>${netty-tcnative.version}</version>
- </dependency>
<dependency>
<groupId>com.github.tomakehurst</groupId>
<artifactId>wiremock-jre8-standalone</artifactId>
@@ -1075,104 +1020,6 @@
<artifactId>json</artifactId>
<version>${org.json.version}</version>
</dependency>
- <dependency> <!-- Due to hadoop-common pulling in 1.7.7 -->
- <groupId>org.apache.avro</groupId>
- <artifactId>avro</artifactId>
- <version>${avro.version}</version>
- </dependency>
- <dependency> <!-- Due to hadoop-common pulling in 9.8.1 -->
- <groupId>com.nimbusds</groupId>
- <artifactId>nimbus-jose-jwt</artifactId>
- <version>${nimbus.version}</version>
- </dependency>
- <dependency> <!-- Due to hadoop-common pulling in older version -->
- <groupId>net.minidev</groupId>
- <artifactId>json-smart</artifactId>
- <version>${json-smart.version}</version>
- </dependency>
- <dependency>
- <!-- Force fresh woodstox-core without security issue hadoop-3.3.4 -->
- <groupId>com.fasterxml.woodstox</groupId>
- <artifactId>woodstox-core</artifactId>
- <version>${woodstox.version}</version>
- </dependency>
- <dependency>
- <!-- Force fresh jersey-json without security issue hadoop-3.3.4 -->
- <groupId>com.sun.jersey</groupId>
- <artifactId>jersey-json</artifactId>
- <version>${sun-jersey-json.version}</version>
- </dependency>
- <dependency>
- <!-- Force fresh jettison without security issue hadoop-3.3.4 -->
- <groupId>org.codehaus.jettison</groupId>
- <artifactId>jettison</artifactId>
- <version>${jettison.version}</version>
- </dependency>
- <dependency>
- <!-- Transitive dependencies from pig-0.16 up-to-date -->
- <groupId>tomcat</groupId>
- <artifactId>jasper-runtime</artifactId>
- <version>${tomcat-jasper.version}</version>
- </dependency>
- <dependency>
- <!-- Transitive dependencies from pig-0.16 up-to-date -->
- <groupId>tomcat</groupId>
- <artifactId>jasper-compiler</artifactId>
- <version>${tomcat-jasper.version}</version>
- </dependency>
- <!-- Hadoop dependencies -->
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-common</artifactId>
- <version>${hadoop.version}</version>
- <exclusions>
- <exclusion>
- <groupId>log4j</groupId>
- <artifactId>log4j</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-client</artifactId>
- <version>${hadoop.version}</version>
- <exclusions>
- <exclusion>
- <groupId>log4j</groupId>
- <artifactId>log4j</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
- <dependency>
- <groupId>org.apache.pig</groupId>
- <artifactId>pig</artifactId>
- <version>${pig.version}</version>
- <classifier>h2</classifier>
- </dependency>
- <dependency>
- <!-- Hadoop test dependency -->
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-minicluster</artifactId>
- <version>${hadoop.version}</version>
- <exclusions>
- <exclusion>
- <groupId>log4j</groupId>
- <artifactId>log4j</artifactId>
- </exclusion>
- <exclusion>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- </exclusion>
- </exclusions>
- </dependency>
</dependencies>
</dependencyManagement>
@@ -1207,15 +1054,11 @@
<felix.version>7.0.1</felix.version>
<felix.log.version>1.0.1</felix.log.version>
<findbugs.version>3.0.2</findbugs.version> <!-- Should be kept in sync with guava -->
- <groovy.version>3.0.13</groovy.version>
- <hadoop.version>3.3.4</hadoop.version>
<hdrhistogram.version>2.1.12</hdrhistogram.version>
- <jettison.version>1.5.1</jettison.version>
<jetty.version>9.4.49.v20220914</jetty.version>
<jetty-alpn.version>1.1.3.v20160715</jetty-alpn.version>
<jjwt.version>0.11.2</jjwt.version>
<jna.version>5.11.0</jna.version>
- <json-smart.version>2.4.8</json-smart.version>
<junit.version>5.8.1</junit.version>
<maven-archiver.version>3.5.2</maven-archiver.version>
<maven-assembly-plugin.version>3.3.0</maven-assembly-plugin.version>
@@ -1235,23 +1078,16 @@
<maven-shade-plugin.version>3.4.1</maven-shade-plugin.version>
<maven-site-plugin.version>3.9.1</maven-site-plugin.version>
<maven-source-plugin.version>3.2.1</maven-source-plugin.version>
- <mockito.version>4.0.0</mockito.version>
- <netty.version>4.1.84.Final</netty.version>
- <netty-tcnative.version>2.0.54.Final</netty-tcnative.version>
- <nimbus.version>9.25.6</nimbus.version>
+ <mockito.version>4.0.0</mockito.version>
<onnxruntime.version>1.12.1</onnxruntime.version> <!-- WARNING: sync cloud-tenant-base-dependencies-enforcer/pom.xml -->
<org.json.version>20220320</org.json.version>
<org.lz4.version>1.8.0</org.lz4.version>
- <pig.version>0.16.0</pig.version>
<prometheus.client.version>0.6.0</prometheus.client.version>
<protobuf.version>3.21.7</protobuf.version>
<spifly.version>1.3.5</spifly.version>
- <sun-jersey-json.version>1.19.4</sun-jersey-json.version>
<surefire.version>2.22.2</surefire.version>
- <tomcat-jasper.version>5.5.23</tomcat-jasper.version>
- <wiremock.version>2.34.0</wiremock.version>
- <woodstox.version>6.4.0</woodstox.version>
- <zookeeper.client.version>3.8.0</zookeeper.client.version>
+ <wiremock.version>2.34.0</wiremock.version>
+ <zookeeper.client.version>3.8.0</zookeeper.client.version>
<doclint>all</doclint>
<test.hide>true</test.hide>
diff --git a/pom.xml b/pom.xml
index de19e9585c0..da105771d8d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -131,7 +131,6 @@
<module>vespa-feed-client</module>
<module>vespa-feed-client-api</module>
<module>vespa-feed-client-cli</module>
- <module>vespa-hadoop</module>
<module>vespa-maven-plugin</module>
<module>vespa-osgi-testrunner</module>
<module>vespa-testrunner-components</module>
diff --git a/vespa-hadoop/OWNERS b/vespa-hadoop/OWNERS
deleted file mode 100644
index 6b09ce48bd4..00000000000
--- a/vespa-hadoop/OWNERS
+++ /dev/null
@@ -1 +0,0 @@
-lesters
diff --git a/vespa-hadoop/README b/vespa-hadoop/README
deleted file mode 100644
index 1b567b88c1d..00000000000
--- a/vespa-hadoop/README
+++ /dev/null
@@ -1,4 +0,0 @@
-The Vespa Hadoop client.
-
-Contains APIs for feeding and querying Vespa from the grid.
-
diff --git a/vespa-hadoop/abi-spec.json b/vespa-hadoop/abi-spec.json
deleted file mode 100644
index 9e26dfeeb6e..00000000000
--- a/vespa-hadoop/abi-spec.json
+++ /dev/null
@@ -1 +0,0 @@
-{}
\ No newline at end of file
diff --git a/vespa-hadoop/pom.xml b/vespa-hadoop/pom.xml
deleted file mode 100644
index 43f7c17967d..00000000000
--- a/vespa-hadoop/pom.xml
+++ /dev/null
@@ -1,166 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
-<project xmlns="http://maven.apache.org/POM/4.0.0"
- xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
- <modelVersion>4.0.0</modelVersion>
- <parent>
- <groupId>com.yahoo.vespa</groupId>
- <artifactId>parent</artifactId>
- <version>8-SNAPSHOT</version>
- <relativePath>../parent/pom.xml</relativePath>
- </parent>
- <artifactId>vespa-hadoop</artifactId>
- <version>8-SNAPSHOT</version>
- <name>${project.artifactId}</name>
- <description>Integration tools between Vespa and Hadoop</description>
-
- <properties>
- <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- </properties>
-
- <dependencies>
- <!-- Hadoop dependencies -->
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-common</artifactId>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-client</artifactId>
- <scope>provided</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.pig</groupId>
- <artifactId>pig</artifactId>
- <classifier>h2</classifier>
- <scope>provided</scope>
- </dependency>
-
- <!-- These are inherited from parent. Needed for correct versions on Hadoop. -->
- <dependency>
- <groupId>org.apache.httpcomponents</groupId>
- <artifactId>httpclient</artifactId>
- <scope>compile</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.httpcomponents</groupId>
- <artifactId>httpcore</artifactId>
- <scope>compile</scope>
- </dependency>
-
- <!-- Test dependencies -->
- <dependency>
- <groupId>org.apache.hadoop</groupId>
- <artifactId>hadoop-minicluster</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <!-- This is a HACK due to hadoop relying on mockito in NameNodeAdapter, but not providing it. Brum, brum !! -->
- <groupId>org.mockito</groupId>
- <artifactId>mockito-core</artifactId>
- <scope>test</scope>
- </dependency>
-
- <dependency>
- <groupId>org.junit.jupiter</groupId>
- <artifactId>junit-jupiter</artifactId>
- <scope>test</scope>
- </dependency>
- <!-- Vespa feeding dependencies -->
- <dependency>
- <groupId>com.yahoo.vespa</groupId>
- <artifactId>vespa-feed-client</artifactId>
- <version>${project.version}</version>
- </dependency>
- <dependency>
- <groupId>com.yahoo.vespa</groupId>
- <artifactId>vespa-feed-client-api</artifactId>
- <version>${project.version}</version>
- </dependency>
-
- <!-- Jackson dependencies used in this module -->
- <dependency>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-core</artifactId>
- </dependency>
- <dependency>
- <groupId>com.fasterxml.jackson.core</groupId>
- <artifactId>jackson-databind</artifactId>
- </dependency>
-
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-shade-plugin</artifactId>
- <executions>
- <execution>
- <phase>package</phase>
- <goals>
- <goal>shade</goal>
- </goals>
- <configuration>
- <minimizeJar>false</minimizeJar>
-
- <relocations>
- <relocation>
- <pattern>com.google</pattern>
- <shadedPattern>shaded.vespa</shadedPattern>
- </relocation>
- <relocation>
- <pattern>org.bouncycastle</pattern>
- <shadedPattern>shaded.vespa.bouncycastle</shadedPattern>
- </relocation>
- <relocation>
- <pattern>commons-codec</pattern>
- <shadedPattern>shaded.vespa</shadedPattern>
- </relocation>
- <relocation>
- <pattern>commons-logging</pattern>
- <shadedPattern>shaded.vespa</shadedPattern>
- </relocation>
- <relocation>
- <pattern>org.apache</pattern>
- <shadedPattern>shaded.vespa</shadedPattern>
- <excludes>
- <exclude>org.apache.hadoop.**</exclude>
- <exclude>org.apache.pig.**</exclude>
- </excludes>
- </relocation>
- <relocation>
- <pattern>com.fasterxml</pattern>
- <shadedPattern>shaded.vespa</shadedPattern>
- </relocation>
- <relocation>
- <pattern>org.codehaus</pattern>
- <shadedPattern>shaded.vespa</shadedPattern>
- </relocation>
- <relocation>
- <pattern>io.airlift</pattern>
- <shadedPattern>shaded.vespa</shadedPattern>
- </relocation>
- <relocation>
- <pattern>com.ctc.wstx</pattern>
- <shadedPattern>shaded.vespa</shadedPattern>
- </relocation>
- </relocations>
- </configuration>
- </execution>
- </executions>
- </plugin>
-
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-compiler-plugin</artifactId>
- <configuration>
- <release>${vespaClients.jdk.releaseVersion}</release>
- </configuration>
- </plugin>
- </plugins>
-
- </build>
-</project>
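For readers unfamiliar with the deleted shading setup: each <relocation> rule above rewrote a package prefix inside the fat jar so that the bundled third-party copies could not collide with the versions Hadoop itself puts on the classpath. A standalone sketch of what one rule does to a class name (the class is illustrative, not from this commit):

public class RelocationSketch {
    public static void main(String[] args) {
        // The rule <pattern>com.google</pattern> -> <shadedPattern>shaded.vespa</shadedPattern>
        // rewrites every bundled class, and every reference to it, under that prefix:
        String before = "com.google.common.collect.ImmutableList";
        String after = before.replaceFirst("^com\\.google", "shaded.vespa");
        System.out.println(after); // prints: shaded.vespa.common.collect.ImmutableList
    }
}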
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputCommitter.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputCommitter.java
deleted file mode 100644
index 6cb4ef45a96..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputCommitter.java
+++ /dev/null
@@ -1,37 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce;
-
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.OutputCommitter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-
-import java.io.IOException;
-
-/**
- * The output committer describes the commit task output for a Map-Reduce
- * job. Not currently used, but is part of the Hadoop protocol since 2.7.
- *
- * @author lesters
- */
-public class VespaOutputCommitter extends OutputCommitter {
- @Override
- public void setupJob(JobContext jobContext) throws IOException {
- }
-
- @Override
- public void setupTask(TaskAttemptContext taskAttemptContext) throws IOException {
- }
-
- @Override
- public boolean needsTaskCommit(TaskAttemptContext taskAttemptContext) throws IOException {
- return false;
- }
-
- @Override
- public void commitTask(TaskAttemptContext taskAttemptContext) throws IOException {
- }
-
- @Override
- public void abortTask(TaskAttemptContext taskAttemptContext) throws IOException {
- }
-}
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputFormat.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputFormat.java
deleted file mode 100644
index e49a5e17970..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaOutputFormat.java
+++ /dev/null
@@ -1,60 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce;
-
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaCounters;
-import org.apache.hadoop.mapreduce.JobContext;
-import org.apache.hadoop.mapreduce.OutputCommitter;
-import org.apache.hadoop.mapreduce.OutputFormat;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-
-import java.io.IOException;
-import java.util.Properties;
-import java.util.logging.Logger;
-
-/**
- * An output specification for writing to Vespa instances in a Map-Reduce job.
- * Mainly returns an instance of a {@link VespaRecordWriter} that does the
- * actual feeding to Vespa.
- *
- * @author lesters
- */
-@SuppressWarnings("rawtypes")
-public class VespaOutputFormat extends OutputFormat {
-
- private static final Logger log = Logger.getLogger(VespaOutputFormat.class.getName());
-
- final Properties configOverride;
-
- public VespaOutputFormat() {
- super();
- this.configOverride = null;
- }
-
- public VespaOutputFormat(Properties configOverride) {
- super();
- this.configOverride = configOverride;
- }
-
-
- @Override
- @SuppressWarnings("deprecation")
- public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException {
- VespaCounters counters = VespaCounters.get(context);
- VespaConfiguration configuration = VespaConfiguration.get(context.getConfiguration(), configOverride);
- return new VespaRecordWriter(configuration, counters);
- }
-
-
- @Override
- public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
- return new VespaOutputCommitter();
- }
-
-
- @Override
- public void checkOutputSpecs(JobContext jobContext) throws IOException, InterruptedException {
- }
-
-}
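For context, a job selected this output format through the standard MapReduce configuration; a minimal sketch, assuming the configuration keys defined in VespaConfiguration further down (the endpoint value is a placeholder):

import com.yahoo.vespa.hadoop.mapreduce.VespaOutputFormat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class JobSetupSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("vespa.feed.endpoint", "host1,host2"); // comma-separated hostnames (placeholder)
        conf.set("vespa.feed.connections", "2");
        Job job = Job.getInstance(conf, "feed-to-vespa");
        job.setOutputFormatClass(VespaOutputFormat.class);
        // ... set the input format, mapper and input paths, then job.waitForCompletion(true)
    }
}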
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaRecordWriter.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaRecordWriter.java
deleted file mode 100644
index c450d7cdeef..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaRecordWriter.java
+++ /dev/null
@@ -1,153 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce;
-
-import ai.vespa.feed.client.FeedClient;
-import ai.vespa.feed.client.FeedClientBuilder;
-import ai.vespa.feed.client.JsonFeeder;
-import ai.vespa.feed.client.OperationParseException;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaCounters;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-
-import java.io.IOException;
-import java.net.URI;
-import java.time.Duration;
-import java.util.Arrays;
-import java.util.List;
-import java.util.concurrent.ThreadLocalRandom;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import static java.util.stream.Collectors.toList;
-
-/**
- * {@link VespaRecordWriter} sends the output &lt;key, value&gt; to one or more Vespa endpoints using vespa-feed-client.
- *
- * @author bjorncs
- */
-public class VespaRecordWriter extends RecordWriter<Object, Object> {
-
- private final static Logger log = Logger.getLogger(VespaRecordWriter.class.getCanonicalName());
-
- private final VespaCounters counters;
- private final VespaConfiguration config;
-
- private boolean initialized = false;
- private JsonFeeder feeder;
-
- protected VespaRecordWriter(VespaConfiguration config, VespaCounters counters) {
- this.counters = counters;
- this.config = config;
- }
-
- @Override
- public void write(Object key, Object data) throws IOException {
- initializeOnFirstWrite();
- String json = data.toString().trim();
- feeder.feedSingle(json)
- .whenComplete((result, error) -> {
- if (error != null) {
- if (error instanceof OperationParseException) {
- counters.incrementDocumentsSkipped(1);
- } else {
- String msg = "Failed to feed single document: " + error;
- log.log(Level.WARNING, msg, error);
- counters.incrementDocumentsFailed(1);
- }
- } else {
- counters.incrementDocumentsOk(1);
- }
- });
- counters.incrementDocumentsSent(1);
- if (counters.getDocumentsSent() % config.progressInterval() == 0) {
- String progress = String.format("Feed progress: %d / %d / %d / %d (sent, ok, failed, skipped)",
- counters.getDocumentsSent(),
- counters.getDocumentsOk(),
- counters.getDocumentsFailed(),
- counters.getDocumentsSkipped());
- log.info(progress);
- }
- }
-
- @Override
- public void close(TaskAttemptContext context) throws IOException {
- if (feeder != null) {
- feeder.close();
- feeder = null;
- initialized = false;
- }
- }
-
- /** Override method to alter {@link FeedClient} configuration */
- protected void onFeedClientInitialization(FeedClientBuilder builder) {}
-
- private void initializeOnFirstWrite() {
- if (initialized) return;
- useRandomizedStartupDelayIfEnabled();
- feeder = createJsonStreamFeeder();
- initialized = true;
- }
-
- private void useRandomizedStartupDelayIfEnabled() {
- if (!config.dryrun() && config.randomStartupSleepMs() > 0) {
- int delay = ThreadLocalRandom.current().nextInt(config.randomStartupSleepMs());
- log.info("Delaying startup by " + delay + " ms");
- try {
- Thread.sleep(delay);
- } catch (Exception e) {}
- }
- }
-
-
- private JsonFeeder createJsonStreamFeeder() {
- FeedClient feedClient = createFeedClient();
- JsonFeeder.Builder builder = JsonFeeder.builder(feedClient)
- .withTimeout(Duration.ofMinutes(10));
- if (config.route() != null) {
- builder.withRoute(config.route());
- }
- return builder.build();
-
- }
-
- private FeedClient createFeedClient() {
- List<URI> endpoints = endpointUris(config);
- log.info("Using endpoints " + endpoints);
- int streamsPerConnection = streamsPerConnection(config);
- log.log(Level.INFO, "Using {0} max streams per connection", new Object[] {streamsPerConnection});
- log.log(Level.INFO, "Using {0} connections", new Object[] {config.numConnections()});
- FeedClientBuilder feedClientBuilder = FeedClientBuilder.create(endpoints)
- .setConnectionsPerEndpoint(config.numConnections())
- .setMaxStreamPerConnection(streamsPerConnection)
- .setDryrun(config.dryrun())
- .setRetryStrategy(retryStrategy(config));
- if (config.proxyHost() != null) {
- URI proxyUri = URI.create(String.format(
- "%s://%s:%d", config.proxyScheme(), config.proxyHost(), config.proxyPort()));
- log.info("Using proxy " + proxyUri);
- feedClientBuilder.setProxy(proxyUri);
- }
-
- onFeedClientInitialization(feedClientBuilder);
- return feedClientBuilder.build();
- }
-
- private static FeedClient.RetryStrategy retryStrategy(VespaConfiguration config) {
- int maxRetries = config.numRetries();
- return new FeedClient.RetryStrategy() {
- @Override public int retries() { return maxRetries; }
- };
- }
-
- private static int streamsPerConnection(VespaConfiguration config) {
- return Math.min(256, config.maxInFlightRequests() / config.numConnections());
- }
-
- private static List<URI> endpointUris(VespaConfiguration config) {
- String scheme = config.useSSL().orElse(true) ? "https" : "http";
- return Arrays.stream(config.endpoint().split(","))
- .map(hostname -> URI.create(String.format("%s://%s:%d/", scheme, hostname, config.defaultPort())))
- .collect(toList());
- }
-}
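To make the endpoint handling above concrete: endpointUris splits the comma-separated vespa.feed.endpoint value and builds one URI per host, defaulting to https when vespa.feed.ssl is unset. A standalone sketch of that derivation, using the 4080 default port from VespaConfiguration below (hosts are placeholders):

import java.net.URI;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class EndpointSketch {
    public static void main(String[] args) {
        String endpoint = "host1,host2"; // vespa.feed.endpoint (placeholder hosts)
        String scheme = "https";         // useSSL() falls back to true
        int defaultPort = 4080;          // vespa.feed.defaultport default
        List<URI> endpoints = Arrays.stream(endpoint.split(","))
                .map(host -> URI.create(String.format("%s://%s:%d/", scheme, host, defaultPort)))
                .collect(Collectors.toList());
        System.out.println(endpoints); // prints: [https://host1:4080/, https://host2:4080/]
    }
}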
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaSimpleJsonInputFormat.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaSimpleJsonInputFormat.java
deleted file mode 100644
index f9bcba96a69..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/VespaSimpleJsonInputFormat.java
+++ /dev/null
@@ -1,99 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce;
-
-import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonFactoryBuilder;
-import com.fasterxml.jackson.core.JsonParser;
-import com.fasterxml.jackson.core.JsonToken;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import org.apache.hadoop.fs.FSDataInputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.io.NullWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.InputSplit;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.mapreduce.lib.input.FileSplit;
-
-import java.io.BufferedInputStream;
-import java.io.IOException;
-
-/**
- * Simple JSON reader which splits the input file along JSON object boundaries.
- *
- * There are two cases handled here:
- * 1. Each line contains a JSON object, i.e. { ... }
- * 2. The file contains an array of objects with arbitrary line breaks, i.e. [ {...}, {...} ]
- *
- * Not suitable for cases where you want to extract objects from some other arbitrary structure.
- *
- * TODO: Support config which points to an array in the JSON as the start point for object extraction,
- * as is done in VespaHttpClient.parseResultJson, i.e. support rootNode config.
- *
- * @author lesters
- */
-public class VespaSimpleJsonInputFormat extends FileInputFormat<Text, NullWritable> {
-
- @Override
- public RecordReader<Text, NullWritable> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
- return new VespaJsonRecordReader();
- }
-
- public static class VespaJsonRecordReader extends RecordReader<Text, NullWritable> {
- private long remaining;
- private JsonParser parser;
- private Text currentKey;
- private NullWritable currentValue = NullWritable.get();
-
- @Override
- public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
- FileSplit fileSplit = (FileSplit) split;
- FSDataInputStream stream = FileSystem.get(context.getConfiguration()).open(fileSplit.getPath());
- if (fileSplit.getStart() != 0) {
- stream.seek(fileSplit.getStart());
- }
-
- remaining = fileSplit.getLength();
- JsonFactory factory = new JsonFactoryBuilder().disable(JsonFactory.Feature.CANONICALIZE_FIELD_NAMES).build();
- parser = factory.createParser(new BufferedInputStream(stream));
- parser.setCodec(new ObjectMapper());
- parser.nextToken();
- if (parser.currentToken() == JsonToken.START_ARRAY) {
- parser.nextToken();
- }
- }
-
- @Override
- public boolean nextKeyValue() throws IOException, InterruptedException {
- if (parser.currentToken() != JsonToken.START_OBJECT) {
- return false;
- }
- currentKey = new Text(parser.readValueAsTree().toString());
- parser.nextToken();
- return true;
- }
-
- @Override
- public Text getCurrentKey() throws IOException, InterruptedException {
- return currentKey;
- }
-
- @Override
- public NullWritable getCurrentValue() throws IOException, InterruptedException {
- return currentValue;
- }
-
- @Override
- public float getProgress() throws IOException, InterruptedException {
- return (float) parser.getCurrentLocation().getByteOffset() / remaining;
- }
-
- @Override
- public void close() throws IOException {
- parser.close();
- }
- }
-
-}
-
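The record reader above leans on Jackson's streaming parser; a self-contained sketch of the same pattern applied to the array form of the input (the sample data is made up):

import com.fasterxml.jackson.core.JsonFactory;
import com.fasterxml.jackson.core.JsonParser;
import com.fasterxml.jackson.core.JsonToken;
import com.fasterxml.jackson.databind.ObjectMapper;

public class JsonSplitSketch {
    public static void main(String[] args) throws Exception {
        String input = "[{\"id\": 1}, {\"id\": 2}]"; // made-up sample
        JsonParser parser = new JsonFactory().createParser(input);
        parser.setCodec(new ObjectMapper()); // required for readValueAsTree()
        parser.nextToken();
        if (parser.currentToken() == JsonToken.START_ARRAY) parser.nextToken(); // step into the array
        while (parser.currentToken() == JsonToken.START_OBJECT) {
            System.out.println(parser.readValueAsTree().toString()); // one record per JSON object
            parser.nextToken();
        }
    }
}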
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/package-info.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/package-info.java
deleted file mode 100644
index 22a742566cd..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/package-info.java
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-/**
- * com.yahoo.vespa.hadoop.mapreduce contains classes and utilities
- * to enable feeding directly to Vespa endpoints from mapreduce.
- * It is a minimal layer over the Vespa HTTP client.
- *
- * NOTE: This is a PUBLIC API, but not annotated as such because this is not a bundle and
- * we don't want to introduce Vespa dependencies.
- */
-package com.yahoo.vespa.hadoop.mapreduce;
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/TupleTools.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/TupleTools.java
deleted file mode 100644
index 5147dc3496c..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/TupleTools.java
+++ /dev/null
@@ -1,70 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce.util;
-
-import org.apache.pig.ResourceSchema;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
-
-import java.io.IOException;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-public class TupleTools {
-
- private static final Pattern pattern = Pattern.compile("<([\\w]+)>");
-
- public static Map<String, Object> tupleMap(Schema schema, Tuple tuple) throws IOException {
- Map<String, Object> tupleMap = new HashMap<>((int)Math.ceil(tuple.size() / 0.75) + 1);
- List<Schema.FieldSchema> schemas = schema.getFields();
- for (int i = 0; i < schemas.size(); i++) {
- Schema.FieldSchema field = schemas.get(i);
- String alias = field.alias;
- Object value = tuple.get(i);
- if (value != null) {
- tupleMap.put(alias, value);
- }
- }
- return tupleMap;
- }
-
- public static Map<String, Object> tupleMap(ResourceSchema schema, Tuple tuple) throws IOException {
- Map<String, Object> tupleMap = new HashMap<>((int)Math.ceil(tuple.size() / 0.75) + 1);
- ResourceSchema.ResourceFieldSchema[] schemas = schema.getFields();
- for (int i = 0; i < schemas.length; i++) {
- ResourceSchema.ResourceFieldSchema field = schemas[i];
- String alias = field.getName();
- Object value = tuple.get(i);
- if (value != null) {
- tupleMap.put(alias, value);
- }
- }
- return tupleMap;
- }
-
- public static String toString(Schema schema, Tuple tuple, String template) throws IOException {
- return toString(tupleMap(schema, tuple), template);
- }
-
- public static String toString(Map<String,Object> fields, String template) {
- if (template == null || template.length() == 0) {
- return template;
- }
- if (fields == null || fields.size() == 0) {
- return template;
- }
-
- Matcher m = pattern.matcher(template);
- StringBuffer sb = new StringBuffer();
- while (m.find()) {
- Object value = fields.get(m.group(1));
- String replacement = value != null ? value.toString() : m.group(0);
- m.appendReplacement(sb, Matcher.quoteReplacement(replacement));
- }
- m.appendTail(sb);
- return sb.toString();
- }
-
-}
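TupleTools.toString substitutes <name> placeholders in a template with values from the field map, leaving unresolved placeholders untouched; a standalone sketch of that substitution (the field names are made up):

import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class TemplateSketch {
    public static void main(String[] args) {
        Pattern pattern = Pattern.compile("<(\\w+)>");
        Map<String, Object> fields = Map.of("namespace", "music", "id", 42);
        String template = "id:<namespace>:song::<id>";
        Matcher m = pattern.matcher(template);
        StringBuffer sb = new StringBuffer();
        while (m.find()) {
            Object value = fields.get(m.group(1));
            String replacement = value != null ? value.toString() : m.group(0); // keep unresolved placeholders
            m.appendReplacement(sb, Matcher.quoteReplacement(replacement));
        }
        m.appendTail(sb);
        System.out.println(sb); // prints: id:music:song::42
    }
}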
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaConfiguration.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaConfiguration.java
deleted file mode 100644
index ae0b6a58155..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaConfiguration.java
+++ /dev/null
@@ -1,194 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce.util;
-
-import org.apache.hadoop.conf.Configuration;
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.Optional;
-import java.util.Properties;
-
-public class VespaConfiguration {
-
- public static final String ENDPOINT = "vespa.feed.endpoint";
- public static final String DEFAULT_PORT = "vespa.feed.defaultport";
- public static final String USE_SSL = "vespa.feed.ssl";
- public static final String PROXY_HOST = "vespa.feed.proxy.host";
- public static final String PROXY_PORT = "vespa.feed.proxy.port";
- public static final String PROXY_SCHEME = "vespa.feed.proxy.scheme";
- public static final String DRYRUN = "vespa.feed.dryrun";
- public static final String USE_COMPRESSION = "vespa.feed.usecompression";
- public static final String PROGRESS_REPORT = "vespa.feed.progress.interval";
- public static final String CONNECTIONS = "vespa.feed.connections";
- public static final String THROTTLER_MIN_SIZE = "vespa.feed.throttler.min.size";
- public static final String QUERY_CONNECTION_TIMEOUT = "vespa.query.connection.timeout";
- public static final String ROUTE = "vespa.feed.route";
- public static final String MAX_SLEEP_TIME_MS = "vespa.feed.max.sleep.time.ms";
- public static final String MAX_IN_FLIGHT_REQUESTS = "vespa.feed.max.in.flight.requests";
- public static final String RANDOM_STARTUP_SLEEP = "vespa.feed.random.startup.sleep.ms";
- public static final String NUM_RETRIES = "vespa.feed.num.retries";
-
- private final Configuration conf;
- private final Properties override;
-
- private VespaConfiguration(Configuration conf, Properties override) {
- this.conf = conf;
- this.override = override;
- }
-
-
- public static VespaConfiguration get(Configuration conf, Properties override) {
- return new VespaConfiguration(conf, override);
- }
-
-
- public String endpoint() {
- return getString(ENDPOINT);
- }
-
-
- public int defaultPort() {
- return getInt(DEFAULT_PORT, 4080);
- }
-
-
- public Optional<Boolean> useSSL() {
- String raw = getString(USE_SSL);
- if (raw == null || raw.trim().isEmpty()) return Optional.empty();
- return Optional.of(Boolean.parseBoolean(raw));
- }
-
-
- public String proxyHost() {
- return getString(PROXY_HOST);
- }
-
-
- public int proxyPort() {
- return getInt(PROXY_PORT, 4080);
- }
-
-
- public String proxyScheme() {
- String raw = getString(PROXY_SCHEME);
- if (raw == null) return "http";
- return raw;
- }
-
-
- public boolean dryrun() {
- return getBoolean(DRYRUN, false);
- }
-
-
- public boolean useCompression() {
- return getBoolean(USE_COMPRESSION, true);
- }
-
-
- public int numConnections() {
- return getInt(CONNECTIONS, 1);
- }
-
-
- public int throttlerMinSize() {
- return getInt(THROTTLER_MIN_SIZE, 0);
- }
-
-
- public int queryConnectionTimeout() {
- return getInt(QUERY_CONNECTION_TIMEOUT, 10000);
- }
-
-
- public String route() {
- return getString(ROUTE);
- }
-
-
- public int maxSleepTimeMs() {
- return getInt(MAX_SLEEP_TIME_MS, 10000);
- }
-
-
- public int maxInFlightRequests() {
- return getInt(MAX_IN_FLIGHT_REQUESTS, 500);
- }
-
-
- public int randomStartupSleepMs() {
- return getInt(RANDOM_STARTUP_SLEEP, 30000);
- }
-
-
- public int numRetries() {
- return getInt(NUM_RETRIES, 100);
- }
-
-
- public int progressInterval() {
- return getInt(PROGRESS_REPORT, 1000);
- }
-
- public String getString(String name) {
- if (override != null && override.containsKey(name)) {
- return override.getProperty(name);
- }
- return conf != null ? conf.get(name) : null;
- }
-
-
- public int getInt(String name, int defaultValue) {
- if (override != null && override.containsKey(name)) {
- return Integer.parseInt(override.getProperty(name));
- }
- return conf != null ? conf.getInt(name, defaultValue) : defaultValue;
- }
-
-
- public boolean getBoolean(String name, boolean defaultValue) {
- if (override != null && override.containsKey(name)) {
- return Boolean.parseBoolean(override.getProperty(name));
- }
- return conf != null ? conf.getBoolean(name, defaultValue) : defaultValue;
-
- }
-
- public static Properties loadProperties(String... params) {
- Properties properties = new Properties();
- if (params != null) {
- for (String s : params) {
- try {
- properties.load(new StringReader(s));
- } catch (IOException e) {
- throw new IllegalArgumentException(e);
- }
- }
- }
- return properties;
- }
-
-
- public String toString() {
- StringBuilder sb = new StringBuilder();
- sb.append(ENDPOINT + ": " + endpoint() + "\n");
- sb.append(DEFAULT_PORT + ": " + defaultPort() + "\n");
- sb.append(USE_SSL + ": " + useSSL().map(Object::toString).orElse("<empty>") + "\n");
- sb.append(PROXY_HOST + ": " + proxyHost() + "\n");
- sb.append(PROXY_PORT + ": " + proxyPort() + "\n");
- sb.append(PROXY_SCHEME + ": " + proxyScheme() + "\n");
- sb.append(DRYRUN + ": " + dryrun() +"\n");
- sb.append(USE_COMPRESSION + ": " + useCompression() +"\n");
- sb.append(PROGRESS_REPORT + ": " + progressInterval() +"\n");
- sb.append(CONNECTIONS + ": " + numConnections() +"\n");
- sb.append(THROTTLER_MIN_SIZE + ": " + throttlerMinSize() +"\n");
- sb.append(QUERY_CONNECTION_TIMEOUT + ": " + queryConnectionTimeout() +"\n");
- sb.append(ROUTE + ": " + route() +"\n");
- sb.append(MAX_SLEEP_TIME_MS + ": " + maxSleepTimeMs() +"\n");
- sb.append(MAX_IN_FLIGHT_REQUESTS + ": " + maxInFlightRequests() +"\n");
- sb.append(RANDOM_STARTUP_SLEEP + ": " + randomStartupSleepMs() +"\n");
- sb.append(NUM_RETRIES + ": " + numRetries() +"\n");
- return sb.toString();
- }
-
-}
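Override properties win over the Hadoop Configuration in every getter above; a minimal sketch of that precedence:

import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
import org.apache.hadoop.conf.Configuration;

import java.util.Properties;

public class ConfigPrecedenceSketch {
    public static void main(String[] args) {
        Configuration conf = new Configuration();
        conf.set("vespa.feed.endpoint", "from-hadoop-conf");
        // loadProperties() parses each argument as a java.util.Properties stream
        Properties override = VespaConfiguration.loadProperties("vespa.feed.endpoint=from-override");
        VespaConfiguration config = VespaConfiguration.get(conf, override);
        System.out.println(config.endpoint()); // prints: from-override
    }
}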
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaCounters.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaCounters.java
deleted file mode 100644
index 63b4b6600fd..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaCounters.java
+++ /dev/null
@@ -1,105 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce.util;
-
-import org.apache.hadoop.mapreduce.Counter;
-import org.apache.hadoop.mapreduce.Counters;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.TaskAttemptContext;
-
-import java.io.IOException;
-
-public class VespaCounters {
-
- public static final String GROUP = "Vespa Feed Counters";
- public static final String DOCS_OK = "Documents ok";
- public static final String DOCS_SENT = "Documents sent";
- public static final String DOCS_FAILED = "Documents failed";
- public static final String DOCS_SKIPPED = "Documents skipped";
-
- private final Counter documentsSent;
- private final Counter documentsOk;
- private final Counter documentsFailed;
- private final Counter documentsSkipped;
-
-
- private VespaCounters(Job job) throws IOException {
- Counters counters = job.getCounters();
- documentsSent = counters.findCounter(GROUP, DOCS_SENT);
- documentsOk = counters.findCounter(GROUP, DOCS_OK);
- documentsFailed = counters.findCounter(GROUP, DOCS_FAILED);
- documentsSkipped = counters.findCounter(GROUP, DOCS_SKIPPED);
- }
-
-
- private VespaCounters(TaskAttemptContext context) {
- documentsSent = context.getCounter(GROUP, DOCS_SENT);
- documentsOk = context.getCounter(GROUP, DOCS_OK);
- documentsFailed = context.getCounter(GROUP, DOCS_FAILED);
- documentsSkipped = context.getCounter(GROUP, DOCS_SKIPPED);
- }
-
-
- private VespaCounters(org.apache.hadoop.mapred.Counters counters) {
- documentsSent = counters.findCounter(GROUP, DOCS_SENT);
- documentsOk = counters.findCounter(GROUP, DOCS_OK);
- documentsFailed = counters.findCounter(GROUP, DOCS_FAILED);
- documentsSkipped = counters.findCounter(GROUP, DOCS_SKIPPED);
- }
-
-
- public static VespaCounters get(Job job) throws IOException {
- return new VespaCounters(job);
- }
-
-
- public static VespaCounters get(TaskAttemptContext context) {
- return new VespaCounters(context);
- }
-
-
- public static VespaCounters get(org.apache.hadoop.mapred.Counters counters) {
- return new VespaCounters(counters);
-
- }
-
-
- public long getDocumentsSent() {
- return documentsSent.getValue();
- }
-
-
- public void incrementDocumentsSent(long incr) {
- documentsSent.increment(incr);
- }
-
-
- public long getDocumentsOk() {
- return documentsOk.getValue();
- }
-
-
- public void incrementDocumentsOk(long incr) {
- documentsOk.increment(incr);
- }
-
-
- public long getDocumentsFailed() {
- return documentsFailed.getValue();
- }
-
-
- public void incrementDocumentsFailed(long incr) {
- documentsFailed.increment(incr);
- }
-
-
- public long getDocumentsSkipped() {
- return documentsSkipped.getValue();
- }
-
-
- public void incrementDocumentsSkipped(long incr) {
- documentsSkipped.increment(incr);
- }
-
-}
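These counters aggregate across all tasks; a short sketch of reading them once a job has completed (the Job instance is assumed to exist):

import com.yahoo.vespa.hadoop.mapreduce.util.VespaCounters;
import org.apache.hadoop.mapreduce.Job;

import java.io.IOException;

public class CountersSketch {
    static void printFeedStats(Job job) throws IOException {
        VespaCounters counters = VespaCounters.get(job);
        System.out.println("sent=" + counters.getDocumentsSent()
                + " ok=" + counters.getDocumentsOk()
                + " failed=" + counters.getDocumentsFailed()
                + " skipped=" + counters.getDocumentsSkipped());
    }
}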
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaHttpClient.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaHttpClient.java
deleted file mode 100644
index c7ed52a01c0..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaHttpClient.java
+++ /dev/null
@@ -1,102 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce.util;
-
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import org.apache.http.HttpEntity;
-import org.apache.http.HttpHost;
-import org.apache.http.HttpResponse;
-import org.apache.http.client.HttpClient;
-import org.apache.http.client.config.RequestConfig;
-import org.apache.http.client.methods.HttpGet;
-import org.apache.http.impl.client.HttpClientBuilder;
-import org.apache.http.util.EntityUtils;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Scanner;
-
-public class VespaHttpClient {
-
- private final HttpClient httpClient;
-
- public VespaHttpClient() {
- this(null);
- }
-
- public VespaHttpClient(VespaConfiguration configuration) {
- httpClient = createClient(configuration);
- }
-
- public String get(String url) throws IOException {
- HttpGet httpGet = new HttpGet(url);
- HttpResponse httpResponse = httpClient.execute(httpGet);
-
- HttpEntity entity = httpResponse.getEntity();
- InputStream is = entity.getContent();
-
- String result = "";
- Scanner scanner = new Scanner(is, "UTF-8").useDelimiter("\\A");
- if (scanner.hasNext()) {
- result = scanner.next();
- }
- EntityUtils.consume(entity);
-
- if (httpResponse.getStatusLine().getStatusCode() != 200) {
- return null;
- }
-
- return result;
- }
-
- public JsonNode parseResultJson(String json, String rootNode) throws IOException {
- if (json == null || json.isEmpty()) {
- return null;
- }
- if (rootNode == null || rootNode.isEmpty()) {
- return null;
- }
-
- ObjectMapper m = new ObjectMapper();
- JsonNode node = m.readTree(json);
- if (node != null) {
- String[] path = rootNode.split("/");
- for (String p : path) {
- node = node.get(p);
-
- if (node == null) {
- return null;
- }
-
- // if node is an array, return the first node that has the correct path
- if (node.isArray()) {
- for (int i = 0; i < node.size(); ++i) {
- JsonNode n = node.get(i);
- if (n.has(p)) {
- node = n;
- break;
- }
- }
- }
-
- }
- }
- return node;
- }
-
- private HttpClient createClient(VespaConfiguration configuration) {
- HttpClientBuilder clientBuilder = HttpClientBuilder.create();
-
- RequestConfig.Builder requestConfigBuilder = RequestConfig.custom();
- if (configuration != null) {
- requestConfigBuilder.setSocketTimeout(configuration.queryConnectionTimeout());
- requestConfigBuilder.setConnectTimeout(configuration.queryConnectionTimeout());
- if (configuration.proxyHost() != null) {
- requestConfigBuilder.setProxy(new HttpHost(configuration.proxyHost(), configuration.proxyPort()));
- }
- }
- clientBuilder.setDefaultRequestConfig(requestConfigBuilder.build());
- return clientBuilder.build();
- }
-
-}
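parseResultJson walks a slash-separated path into the response, and when a step yields an array it descends into the first element that itself contains that field; an illustrative run against a made-up query result:

import com.fasterxml.jackson.databind.JsonNode;
import com.yahoo.vespa.hadoop.mapreduce.util.VespaHttpClient;

public class ParsePathSketch {
    public static void main(String[] args) throws Exception {
        // Made-up JSON shaped like a Vespa query response.
        String json = "{\"root\": {\"children\": [{\"id\": \"hit-1\", \"relevance\": 0.9}]}}";
        JsonNode hits = new VespaHttpClient().parseResultJson(json, "root/children");
        System.out.println(hits); // prints: [{"id":"hit-1","relevance":0.9}]
    }
}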
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaQuerySchema.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaQuerySchema.java
deleted file mode 100644
index cfaff44addb..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/mapreduce/util/VespaQuerySchema.java
+++ /dev/null
@@ -1,114 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.mapreduce.util;
-
-import com.fasterxml.jackson.databind.JsonNode;
-import org.apache.pig.data.DataType;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.data.TupleFactory;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.apache.pig.impl.util.Utils;
-import org.apache.pig.parser.ParserException;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-public class VespaQuerySchema implements Iterable<VespaQuerySchema.AliasTypePair> {
-
- private final List<AliasTypePair> tupleSchema = new ArrayList<>();
-
- public VespaQuerySchema(String schema) {
- for (String e : schema.split(",")) {
- String[] pair = e.split(":");
- String alias = pair[0].trim();
- String type = pair[1].trim();
- tupleSchema.add(new AliasTypePair(alias, type));
- }
- }
-
- public Tuple buildTuple(int rank, JsonNode hit) {
- Tuple tuple = TupleFactory.getInstance().newTuple();
-
- for (VespaQuerySchema.AliasTypePair tupleElement : tupleSchema) {
- String alias = tupleElement.getAlias();
- Byte type = DataType.findTypeByName(tupleElement.getType());
-
- // reserved word
- if ("rank".equals(alias)) {
- tuple.append(rank);
- } else {
- JsonNode field = hit;
- String[] path = alias.split("/"); // move outside
- for (String p : path) {
- field = field.get(p);
- if (field == null) {
- type = DataType.NULL; // effectively skip field as it is not found
- break;
- }
- }
- switch (type) {
- case DataType.BOOLEAN:
- tuple.append(field.asBoolean());
- break;
- case DataType.INTEGER:
- tuple.append(field.asInt());
- break;
- case DataType.LONG:
- tuple.append(field.asLong());
- break;
- case DataType.FLOAT:
- case DataType.DOUBLE:
- tuple.append(field.asDouble());
- break;
- case DataType.DATETIME:
- tuple.append(field.asText());
- break;
- case DataType.CHARARRAY:
- tuple.append(field.asText());
- break;
- default:
- // the rest of the data types are currently not supported
- }
- }
- }
- return tuple;
- }
-
- public static Schema getPigSchema(String schemaString) {
- Schema schema = null;
- schemaString = schemaString.replace("/", "_");
- schemaString = "{(" + schemaString + ")}";
- try {
- schema = Utils.getSchemaFromString(schemaString);
- } catch (ParserException e) {
- e.printStackTrace();
- }
- return schema;
- }
-
- @Override
- public Iterator<AliasTypePair> iterator() {
- return tupleSchema.iterator();
- }
-
-
- public static class AliasTypePair {
- private final String alias;
- private final String type;
-
- AliasTypePair(String alias, String type) {
- this.alias = alias;
- this.type = type;
- }
-
- public String getAlias() {
- return alias;
- }
-
- public String getType() {
- return type;
- }
-
- }
-
-}
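The schema string parsed above maps aliases (slash-separated paths into each hit, or the reserved word rank) to Pig type names; a small sketch of iterating a parsed schema (the schema string is made up):

import com.yahoo.vespa.hadoop.mapreduce.util.VespaQuerySchema;

public class QuerySchemaSketch {
    public static void main(String[] args) {
        VespaQuerySchema schema = new VespaQuerySchema("rank:int,fields/title:chararray");
        for (VespaQuerySchema.AliasTypePair pair : schema) {
            System.out.println(pair.getAlias() + " -> " + pair.getType());
        }
        // prints: rank -> int, then fields/title -> chararray
    }
}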
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/package-info.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/package-info.java
deleted file mode 100644
index 41c621d2877..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/package-info.java
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-/**
- * com.yahoo.vespa.hadoop contains classes and utilities
- * to enable feeding directly to Vespa endpoints from pig and mapreduce.
- * It is a minimal layer over the Vespa HTTP client.
- *
- * NOTE: This is a PUBLIC API, but not annotated as such because this is not a bundle and
- * we don't want to introduce Vespa dependencies.
- */
-package com.yahoo.vespa.hadoop;
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperation.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperation.java
deleted file mode 100644
index c95aa02215f..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperation.java
+++ /dev/null
@@ -1,669 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.pig;
-
-import com.fasterxml.jackson.core.JsonEncoding;
-import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonGenerator;
-import com.yahoo.vespa.hadoop.mapreduce.util.TupleTools;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
-import org.apache.pig.EvalFunc;
-import org.apache.pig.PigWarning;
-import org.apache.pig.data.DataBag;
-import org.apache.pig.data.DataByteArray;
-import org.apache.pig.data.DataType;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.tools.pigstats.PigStatusReporter;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.joda.time.DateTime;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.PrintWriter;
-import java.io.StringWriter;
-import java.io.UncheckedIOException;
-import java.math.BigDecimal;
-import java.math.BigInteger;
-import java.util.*;
-
-/**
- * A Pig UDF to convert simple Pig types into a valid Vespa JSON document format.
- *
- * @author lesters
- */
-public class VespaDocumentOperation extends EvalFunc<String> {
-
- public enum Operation {
- DOCUMENT,
- PUT,
- ID,
- REMOVE,
- UPDATE;
-
- @Override
- public String toString() {
- return super.toString().toLowerCase();
- }
-
- public static Operation fromString(String text) {
- for (Operation op : Operation.values()) {
- if (op.toString().equalsIgnoreCase(text)) {
- return op;
- }
- }
- throw new IllegalArgumentException("Unknown operation: " + text);
- }
-
- public static boolean valid(String text) {
- for (Operation op : Operation.values()) {
- if (op.toString().equalsIgnoreCase(text)) {
- return true;
- }
- }
- return false;
- }
-
- }
-
- private static final String PROPERTY_CREATE_IF_NON_EXISTENT = "create-if-non-existent";
- private static final String PROPERTY_ID_TEMPLATE = "docid";
- private static final String PROPERTY_OPERATION = "operation";
- private static final String PROPERTY_VERBOSE = "verbose";
- private static final String BAG_AS_MAP_FIELDS = "bag-as-map-fields";
- private static final String SIMPLE_ARRAY_FIELDS = "simple-array-fields";
- private static final String SIMPLE_OBJECT_FIELDS = "simple-object-fields";
- private static final String CREATE_TENSOR_FIELDS = "create-tensor-fields";
- private static final String REMOVE_TENSOR_FIELDS = "remove-tensor-fields";
- private static final String UPDATE_TENSOR_FIELDS = "update-tensor-fields";
- private static final String REMOVE_MAP_FIELDS = "remove-map-fields";
- private static final String UPDATE_MAP_FIELDS = "update-map-fields";
- private static final String EXCLUDE_FIELDS = "exclude-fields";
- private static final String TESTSET_CONDITION = "condition";
- private static final String PARTIAL_UPDATE_ASSIGN = "assign";
- private static final String PARTIAL_UPDATE_ADD = "add";
- private static final String PARTIAL_UPDATE_REMOVE = "remove";
-
- private static Map<String, String> mapPartialOperationMap;
-
- static {
- mapPartialOperationMap = new HashMap<>();
- mapPartialOperationMap.put(REMOVE_MAP_FIELDS, PARTIAL_UPDATE_REMOVE);
- mapPartialOperationMap.put(UPDATE_MAP_FIELDS, PARTIAL_UPDATE_ASSIGN);
- }
-
- private static Map<String, String> partialOperationMap;
-
- static {
- partialOperationMap = new HashMap<>();
- partialOperationMap.put(REMOVE_TENSOR_FIELDS, PARTIAL_UPDATE_REMOVE);
- partialOperationMap.put(UPDATE_TENSOR_FIELDS, PARTIAL_UPDATE_ADD);
- partialOperationMap.put(REMOVE_MAP_FIELDS, PARTIAL_UPDATE_REMOVE);
- partialOperationMap.put(UPDATE_MAP_FIELDS, PARTIAL_UPDATE_ASSIGN);
- }
-
- private final boolean verbose;
- private final String template;
- private final Operation operation;
- private final Properties properties;
- private PigStatusReporter statusReporter;
-
- public VespaDocumentOperation(String... params) {
- statusReporter = PigStatusReporter.getInstance();
- if (statusReporter != null) {
- statusReporter.incrCounter("Vespa Document Operation Counters", "Document operation ok", 0);
- statusReporter.incrCounter("Vespa Document Operation Counters", "Document operation failed", 0);
- }
- properties = VespaConfiguration.loadProperties(params);
- template = properties.getProperty(PROPERTY_ID_TEMPLATE);
- operation = Operation.fromString(properties.getProperty(PROPERTY_OPERATION, "put"));
- verbose = Boolean.parseBoolean(properties.getProperty(PROPERTY_VERBOSE, "false"));
- }
-
- @Override
- public String exec(Tuple tuple) throws IOException {
- if (tuple == null || tuple.size() == 0) {
- if (statusReporter != null) {
- statusReporter.incrCounter("Vespa Document Operation Counters", "Document operation failed", 1);
- }
- return null;
- }
- if (template == null || template.length() == 0) {
- if (statusReporter != null) {
- statusReporter.incrCounter("Vespa Document Operation Counters", "Document operation failed", 1);
- }
- warnLog("No valid document id template found. Skipping.", PigWarning.UDF_WARNING_1);
- return null;
- }
- if (operation == null) {
- warnLog("No valid operation found. Skipping.", PigWarning.UDF_WARNING_2);
- return null;
- }
-
- String json = null;
-
- try {
- if (reporter != null) {
- reporter.progress();
- }
-
- Schema inputSchema = getInputSchema();
- Map<String, Object> fields = TupleTools.tupleMap(inputSchema, tuple);
- String docId = TupleTools.toString(fields, template);
- if (verbose) {
- System.out.println("Processing docId: "+ docId);
- }
- // create json
- json = create(operation, docId, fields, properties, inputSchema);
- if (json == null || json.length() == 0) {
- warnLog("No valid document operation could be created.", PigWarning.UDF_WARNING_3);
- return null;
- }
-
-
- } catch (Exception e) {
- if (statusReporter != null) {
- statusReporter.incrCounter("Vespa Document Operation Counters", "Document operation failed", 1);
- }
- StringBuilder sb = new StringBuilder();
- sb.append("Caught exception processing input row: \n");
- sb.append(tuple.toString());
- sb.append("\nException: ");
- sb.append(getStackTraceAsString(e));
- warnLog(sb.toString(), PigWarning.UDF_WARNING_4);
- return null;
- }
- if (statusReporter != null) {
- statusReporter.incrCounter("Vespa Document Operation Counters", "Document operation ok", 1);
- }
- return json;
- }
-
-
- /**
- * Create a JSON Vespa document operation given the supplied fields,
- * operation and document id template.
- *
- * @param op Operation (put, remove, update)
- * @param docId Document id
- * @param fields Fields to put in document operation
- * @param properties Configuration properties for the operation
- * @param schema Pig schema of the input fields
- * @return A valid JSON Vespa document operation
- * @throws IOException if the JSON operation could not be generated
- */
- public static String create(Operation op, String docId, Map<String, Object> fields, Properties properties,
- Schema schema) throws IOException {
- if (op == null) {
- return null;
- }
- if (docId == null || docId.length() == 0) {
- return null;
- }
- if (fields.isEmpty()) {
- return null;
- }
-
- // create json format
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- JsonGenerator g = new JsonFactory().createGenerator(out, JsonEncoding.UTF8);
- g.writeStartObject();
-
- g.writeStringField(op.toString(), docId);
-
- boolean createIfNonExistent = Boolean.parseBoolean(properties.getProperty(PROPERTY_CREATE_IF_NON_EXISTENT, "false"));
- if (op == Operation.UPDATE && createIfNonExistent) {
- writeField("create", true, DataType.BOOLEAN, g, properties, schema, op, 0);
- }
- String testSetConditionTemplate = properties.getProperty(TESTSET_CONDITION);
- if (testSetConditionTemplate != null) {
- String testSetCondition = TupleTools.toString(fields, testSetConditionTemplate);
- writeField(TESTSET_CONDITION, testSetCondition, DataType.CHARARRAY, g, properties, schema, op, 0);
- }
- if (op != Operation.REMOVE) {
- writeField("fields", fields, DataType.MAP, g, properties, schema, op, 0);
- }
-
- g.writeEndObject();
- g.close();
-
- return out.toString();
- }
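-
- // Minimal usage sketch (hypothetical id and field values, no Pig involved):
- // Map<String, Object> fields = new HashMap<>();
- // fields.put("title", "hello");
- // String json = create(Operation.UPDATE, "id:ns:type::1", fields, new Properties(), null);
- // // -> {"update":"id:ns:type::1","fields":{"title":{"assign":"hello"}}}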
-
- private static String getPartialOperation(Map<String, String> operationMap, String name, Properties properties) {
- // Checks whether the given field name is listed under any property in the
- // provided operation map; if so, returns the corresponding partial operation,
- // otherwise returns null. For example, given:
- // operationMap: {"update-map-fields": "assign", "remove-map-fields": "remove"}
- // name: "date"
- // properties: {"update-map-fields": "date,month"}
- // the result is "assign".
- for (String label : operationMap.keySet()) {
- if (properties.getProperty(label) != null) {
- String[] p = properties.getProperty(label).split(",");
- if (Arrays.asList(p).contains(name)) {
- return operationMap.get(label);
- }
- }
- }
- return null;
- }
-
- @SuppressWarnings("unchecked")
- private static void writeField(String name, Object value, Byte type, JsonGenerator g, Properties properties, Schema schema, Operation op, int depth) throws IOException {
- if (shouldWriteField(name, properties, depth)) {
- String operation = getPartialOperation(mapPartialOperationMap, name, properties);
- // check whether the field is listed under update-map-fields/remove-map-fields;
- // if so, it needs special treatment: we loop through the tuples of the bag.
- // Note that the operation here is not a Vespa document operation such as "put"
- // or "update", but the partial update directive to write, such as "assign" or "remove"
- if (operation != null) {
- writePartialUpdateAndRemoveMap(name, value, g, properties, schema, op, depth, operation);
- } else {
- g.writeFieldName(name);
- if (shouldWritePartialUpdate(op, depth)) {
- writePartialUpdate(value, type, g, name, properties, schema, op, depth);
- } else {
- writeValue(value, type, g, name, properties, schema, op, depth);
- }
- }
-
- }
- }
-
- private static void writePartialUpdateAndRemoveMap(String name, Object value, JsonGenerator g, Properties properties, Schema schema, Operation op, int depth, String operation) throws IOException {
- schema = (schema != null) ? schema.getField(0).schema : null;
- // extract the schemas of the map key and the map value, used when writing
- // the JSON for partial updates of maps
- Schema valueSchema = (schema != null) ? schema.getField(1).schema : null;
- // expected data format: { (key: id, value: (abc,123,(123234,bbaa))) }
- // the first element of each tuple in the bag is the key of the map entry to update;
- // the second element is the new value for that entry
- DataBag bag = (DataBag) value;
- for (Tuple element : bag) {
- if (element.size() != 2) {
- continue;
- }
- String k = (String) element.get(0);
- Object v = element.get(1);
- Byte t = DataType.findType(v);
- if (t == DataType.TUPLE) {
- g.writeFieldName(name + "{" + k + "}");
- if (operation.equals(PARTIAL_UPDATE_REMOVE)) {
- g.writeStartObject();
- g.writeFieldName(PARTIAL_UPDATE_REMOVE);
- g.writeNumber(0);
- g.writeEndObject();
- } else {
- writePartialUpdate(v, t, g, name, properties, valueSchema, op, depth);
- }
- }
- }
- }
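-
- // Example (from the unit tests): with "remove-map-fields=bag" and a bag entry
- // keyed "123456", this writes the field "bag{123456}" with value {"remove":0};
- // with "update-map-fields=bag" the tuple value is written as {"assign":{...}}.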
-
- @SuppressWarnings("unchecked")
- private static void writeValue(Object value, Byte type, JsonGenerator g, String name, Properties properties, Schema schema, Operation op, int depth) throws IOException {
- switch (type) {
- case DataType.UNKNOWN:
- break;
- case DataType.NULL:
- g.writeNull();
- break;
- case DataType.BOOLEAN:
- g.writeBoolean((boolean) value);
- break;
- case DataType.INTEGER:
- g.writeNumber((int) value);
- break;
- case DataType.LONG:
- g.writeNumber((long) value);
- break;
- case DataType.FLOAT:
- g.writeNumber((float) value);
- break;
- case DataType.DOUBLE:
- g.writeNumber((double) value);
- break;
- case DataType.DATETIME:
- g.writeNumber(((DateTime) value).getMillis());
- break;
- case DataType.BYTEARRAY:
- DataByteArray bytes = (DataByteArray) value;
- String raw = Base64.getEncoder().encodeToString(bytes.get());
- g.writeString(raw);
- break;
- case DataType.CHARARRAY:
- g.writeString((String) value);
- break;
- case DataType.BIGINTEGER:
- g.writeNumber((BigInteger) value);
- break;
- case DataType.BIGDECIMAL:
- g.writeNumber((BigDecimal) value);
- break;
- case DataType.MAP:
- g.writeStartObject();
- Map<Object, Object> map = (Map<Object, Object>) value;
- if (shouldCreateTensor(map, name, properties)) {
- if (isRemoveTensor(name, properties)) {
- writeRemoveTensor(map, g);
- } else {
- writeTensor(map, g);
- }
- } else {
- for (Map.Entry<Object, Object> entry : map.entrySet()) {
- String k = entry.getKey().toString();
- Object v = entry.getValue();
- Byte t = DataType.findType(v);
- Schema fieldSchema = (schema != null) ? schema.getField(k).schema : null;
- writeField(k, v, t, g, properties, fieldSchema, op, depth + 1);
- }
- }
- g.writeEndObject();
- break;
- case DataType.TUPLE:
- Tuple tuple = (Tuple) value;
- if (shouldWriteTupleAsMap(name, properties)) {
- Map<String, Object> fields = TupleTools.tupleMap(schema, tuple);
- writeValue(fields, DataType.MAP, g, name, properties, schema, op, depth);
- } else {
- boolean writeStartArray = shouldWriteTupleStart(tuple, name, properties);
- if (writeStartArray) {
- g.writeStartArray();
- }
- for (Object v : tuple) {
- writeValue(v, DataType.findType(v), g, name, properties, schema, op, depth);
- }
- if (writeStartArray) {
- g.writeEndArray();
- }
- }
- break;
- case DataType.BAG:
- DataBag bag = (DataBag) value;
- // get the schema of the tuple in bag
- schema = (schema != null) ? schema.getField(0).schema : null;
- if (shouldWriteBagAsMap(name, properties)) {
- // when treating a bag as a map, the bag schema should be {(key, value), ...};
- // each tuple in the bag must have exactly two elements: the key and the value
- Schema valueSchema = (schema != null) ? schema.getField(1).schema : null;
-
- g.writeStartObject();
- for (Tuple element : bag) {
- if (element.size() != 2) {
- continue;
- }
- String k = (String) element.get(0);
- Object v = element.get(1);
- Byte t = DataType.findType(v);
- if (t == DataType.TUPLE) {
- Map<String, Object> fields = TupleTools.tupleMap(valueSchema, (Tuple) v);
- writeField(k, fields, DataType.MAP, g, properties, valueSchema, op, depth + 1);
- } else {
- writeField(k, v, t, g, properties, valueSchema, op, depth + 1);
- }
- }
- g.writeEndObject();
- } else {
- g.writeStartArray();
- for (Tuple t : bag) {
- writeValue(t, DataType.TUPLE, g, name, properties, schema, op, depth);
- }
- g.writeEndArray();
- }
- break;
- }
-
- }
-
- private static boolean shouldWritePartialUpdate(Operation op, int depth) {
- return op == Operation.UPDATE && depth == 1;
- }
-
- private static void writePartialUpdate(Object value, Byte type, JsonGenerator g, String name, Properties properties, Schema schema, Operation op, int depth) throws IOException {
- g.writeStartObject();
- // check whether this field uses one of the four partial operations for
- // map/tensor structures; if not, assume a whole-field update and write "assign",
- // otherwise write the configured partial operation
- String operation = getPartialOperation(partialOperationMap, name, properties);
- if (operation != null) {
- g.writeFieldName(operation);
- } else {
- g.writeFieldName(PARTIAL_UPDATE_ASSIGN);
- }
- writeValue(value, type, g, name, properties, schema, op, depth);
- g.writeEndObject();
- }
-
- private static boolean shouldWriteTupleStart(Tuple tuple, String name, Properties properties) {
- if (tuple.size() > 1 || properties == null) {
- return true;
- }
- String simpleArrayFields = properties.getProperty(SIMPLE_ARRAY_FIELDS);
- if (simpleArrayFields == null) {
- return true;
- }
- if (simpleArrayFields.equals("*")) {
- return false;
- }
- String[] fields = simpleArrayFields.split(",");
- for (String field : fields) {
- if (field.trim().equalsIgnoreCase(name)) {
- return false;
- }
- }
- return true;
- }
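-
- // Example (from the unit tests): a bag of single-element tuples {(a),(b),(c)}
- // is rendered as [["a"],["b"],["c"]] by default; with "simple-array-fields=<name>"
- // (or "*") the per-tuple arrays are dropped, giving ["a","b","c"].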
-
- private static boolean shouldWriteTupleAsMap(String name, Properties properties) {
- // UPDATE_MAP_FIELDS is included here because when updating a map,
- // the second element of each tuple should be written as a map
- if (properties == null) {
- return false;
- }
- String addBagAsMapFields = properties.getProperty(UPDATE_MAP_FIELDS);
- String simpleObjectFields = properties.getProperty(SIMPLE_OBJECT_FIELDS);
- if (simpleObjectFields == null && addBagAsMapFields == null) {
- return false;
- }
- if (addBagAsMapFields != null) {
- if (addBagAsMapFields.equals("*")) {
- return true;
- }
- String[] fields = addBagAsMapFields.split(",");
- for (String field : fields) {
- if (field.trim().equalsIgnoreCase(name)) {
- return true;
- }
- }
-
- }
- if (simpleObjectFields != null) {
- if (simpleObjectFields.equals("*")) {
- return true;
- }
- String[] fields = simpleObjectFields.split(",");
- for (String field : fields) {
- if (field.trim().equalsIgnoreCase(name)) {
- return true;
- }
- }
- }
- return false;
- }
-
- private static boolean shouldWriteBagAsMap(String name, Properties properties) {
- if (properties == null) {
- return false;
- }
- String bagAsMapFields = properties.getProperty(BAG_AS_MAP_FIELDS);
- if (bagAsMapFields == null) {
- return false;
- }
- if (bagAsMapFields.equals("*")) {
- return true;
- }
- String[] fields = bagAsMapFields.split(",");
- for (String field : fields) {
- if (field.trim().equalsIgnoreCase(name)) {
- return true;
- }
- }
- return false;
- }
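-
- // Example (from the unit tests): with "bag-as-map-fields=bag", the bag
- // {(123456,123456),(234567,234567)} is written as the JSON object
- // {"123456":123456,"234567":234567} instead of an array of arrays.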
-
- private static boolean shouldCreateTensor(Map<Object, Object> map, String name, Properties properties) {
- if (properties == null) {
- return false;
- }
- String createTensorFields = properties.getProperty(CREATE_TENSOR_FIELDS);
- String addTensorFields = properties.getProperty(UPDATE_TENSOR_FIELDS);
- String removeTensorFields = properties.getProperty(REMOVE_TENSOR_FIELDS);
-
- if (createTensorFields == null && addTensorFields == null && removeTensorFields == null) {
- return false;
- }
- String[] fields;
- if (createTensorFields != null) {
- fields = createTensorFields.split(",");
- for (String field : fields) {
- if (field.trim().equalsIgnoreCase(name)) {
- return true;
- }
- }
- }
- if (addTensorFields != null) {
- fields = addTensorFields.split(",");
- for (String field : fields) {
- if (field.trim().equalsIgnoreCase(name)) {
- return true;
- }
- }
- }
- if (removeTensorFields != null) {
- fields = removeTensorFields.split(",");
- for (String field : fields) {
- if (field.trim().equalsIgnoreCase(name)) {
- return true;
- }
- }
- }
- return false;
- }
-
- private static boolean isRemoveTensor(String name, Properties properties) {
- if (properties == null) {
- return false;
- }
- String removeTensorFields = properties.getProperty(REMOVE_TENSOR_FIELDS);
- if (removeTensorFields == null) {
- return false;
- }
- String[] fields = removeTensorFields.split(",");
- for (String field : fields) {
- if (field.trim().equalsIgnoreCase(name)) {
- return true;
- }
- }
- return false;
- }
-
- private static boolean shouldWriteField(String name, Properties properties, int depth) {
- if (properties == null || depth != 1) {
- return true;
- }
- String excludeFields = properties.getProperty(EXCLUDE_FIELDS);
- if (excludeFields == null) {
- return true;
- }
- String[] fields = excludeFields.split(",");
- for (String field : fields) {
- if (field.trim().equalsIgnoreCase(name)) {
- return false;
- }
- }
- return true;
- }
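-
- // Example (from the unit tests): with "exclude-fields=application,date", the
- // "application" and "date" fields are dropped from the top level of "fields";
- // fields at other depths are always written.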
-
- private static void writeTensor(Map<Object, Object> map, JsonGenerator g) throws IOException {
- g.writeFieldName("cells");
- g.writeStartArray();
- for (Map.Entry<Object, Object> entry : map.entrySet()) {
- String k = entry.getKey().toString();
- Double v = Double.parseDouble(entry.getValue().toString());
-
- g.writeStartObject();
-
- // Write address
- g.writeFieldName("address");
- g.writeStartObject();
-
- String[] dimensions = k.split(",");
- for (String dimension : dimensions) {
- if (dimension == null || dimension.isEmpty()) {
- continue;
- }
- String[] address = dimension.split(":");
- if (address.length != 2) {
- throw new IllegalArgumentException("Malformed cell address: " + dimension);
- }
- String dim = address[0];
- String label = address[1];
- if (dim == null || label == null || dim.isEmpty() || label.isEmpty()) {
- throw new IllegalArgumentException("Malformed cell address: " + dimension);
- }
- g.writeFieldName(dim.trim());
- g.writeString(label.trim());
- }
- g.writeEndObject();
-
- // Write value
- g.writeFieldName("value");
- g.writeNumber(v);
-
- g.writeEndObject();
- }
- g.writeEndArray();
- }
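-
- // Example (from the unit tests): the map {"x:label1,y:label2": 2.0} is rendered as
- // "cells":[{"address":{"x":"label1","y":"label2"},"value":2.0}]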
-
- private static void writeRemoveTensor(Map<Object, Object> map, JsonGenerator g) throws IOException {
- g.writeFieldName("addresses");
- g.writeStartArray();
- for (Map.Entry<Object, Object> entry : map.entrySet()) {
- String k = entry.getKey().toString();
- String[] dimensions = k.split(",");
- for (String dimension : dimensions) {
- if (dimension == null || dimension.isEmpty()) {
- continue;
- }
- String[] address = dimension.split(":");
- if (address.length != 2) {
- throw new IllegalArgumentException("Malformed cell address: " + dimension);
- }
- String dim = address[0];
- String label = address[1];
- if (dim == null || label == null || dim.isEmpty() || label.isEmpty()) {
- throw new IllegalArgumentException("Malformed cell address: " + dimension);
- }
- // write one address object per dimension
- g.writeStartObject();
- g.writeFieldName(dim.trim());
- g.writeString(label.trim());
- g.writeEndObject();
- }
- }
- g.writeEndArray();
- }
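-
- // Example (from the unit tests): the key "x:label1,y:label2" produces one
- // address object per dimension: "addresses":[{"x":"label1"},{"y":"label2"}]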
-
- // copied from vespajlib to reduce dependencies and allow building with JDK 8
- private static String getStackTraceAsString(Throwable throwable) {
- try (StringWriter stringWriter = new StringWriter();
- PrintWriter printWriter = new PrintWriter(stringWriter, true)) {
- throwable.printStackTrace(printWriter);
- return stringWriter.getBuffer().toString();
- } catch (IOException e) {
- throw new UncheckedIOException(e);
- }
- }
-
- // wrapper to emit logs
- private void warnLog(String msg, PigWarning warning) {
- warn(msg, warning);
- System.err.println(msg);
- }
-}
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaQuery.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaQuery.java
deleted file mode 100644
index 1d50f2909a2..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaQuery.java
+++ /dev/null
@@ -1,114 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.pig;
-
-import com.fasterxml.jackson.databind.JsonNode;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaHttpClient;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaQuerySchema;
-import com.yahoo.vespa.hadoop.mapreduce.util.TupleTools;
-import org.apache.pig.EvalFunc;
-import org.apache.pig.data.*;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.apache.pig.impl.util.UDFContext;
-
-import java.io.IOException;
-import java.util.*;
-
-/**
- * A Pig UDF to run a query against a Vespa cluster and return the
- * results.
- *
- * @author lesters
- */
-public class VespaQuery extends EvalFunc<DataBag> {
-
- private static final String PROPERTY_QUERY_TEMPLATE = "query";
- private static final String PROPERTY_QUERY_SCHEMA = "schema";
- private static final String PROPERTY_ROOT_NODE = "rootnode";
-
- private final VespaConfiguration configuration;
- private final Properties properties;
- private final String queryTemplate;
- private final String querySchema;
- private final String queryRootNode;
-
- private VespaHttpClient httpClient;
-
- public VespaQuery() {
- this(new String[0]);
- }
-
- public VespaQuery(String... params) {
- configuration = VespaConfiguration.get(UDFContext.getUDFContext().getJobConf(), null);
- properties = VespaConfiguration.loadProperties(params);
-
- queryTemplate = properties.getProperty(PROPERTY_QUERY_TEMPLATE);
- if (queryTemplate == null || queryTemplate.isEmpty()) {
- throw new IllegalArgumentException("Query template cannot be empty");
- }
-
- querySchema = properties.getProperty(PROPERTY_QUERY_SCHEMA, "rank:int,id:chararray");
- queryRootNode = properties.getProperty(PROPERTY_ROOT_NODE, "root/children");
- }
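-
- // Typical Pig usage (a sketch; endpoint and field names are placeholders,
- // see src/test/pig/query.pig for a working example):
- // DEFINE VespaQueryUDF com.yahoo.vespa.hadoop.pig.VespaQuery('query=http://<endpoint>/search/?query=<userid>', 'schema=rank:int,id:chararray');
- // results = FOREACH users GENERATE FLATTEN(VespaQueryUDF(userid));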
-
- @Override
- public DataBag exec(Tuple input) throws IOException {
- if (input == null || input.size() == 0) {
- return null;
- }
- JsonNode jsonResult = queryVespa(input);
- if (jsonResult == null) {
- return null;
- }
- return createPigRepresentation(jsonResult);
- }
-
- @Override
- public Schema outputSchema(Schema input) {
- return VespaQuerySchema.getPigSchema(querySchema);
- }
-
-
- private JsonNode queryVespa(Tuple input) throws IOException {
- String url = createVespaQueryUrl(input);
- if (url == null) {
- return null;
- }
- String result = executeVespaQuery(url);
- return parseVespaResultJson(result);
- }
-
-
- private String createVespaQueryUrl(Tuple input) throws IOException {
- return TupleTools.toString(getInputSchema(), input, queryTemplate);
- }
-
-
- private String executeVespaQuery(String url) throws IOException {
- if (httpClient == null) {
- httpClient = new VespaHttpClient(configuration);
- }
- return httpClient.get(url);
- }
-
- private JsonNode parseVespaResultJson(String result) throws IOException {
- return httpClient == null ? null : httpClient.parseResultJson(result, queryRootNode);
- }
-
- private DataBag createPigRepresentation(JsonNode hits) {
- DataBag bag = new SortedDataBag(null);
- VespaQuerySchema querySchema = new VespaQuerySchema(this.querySchema);
-
- for (int rank = 0; rank < hits.size(); ++rank) {
- JsonNode hit = hits.get(rank);
- Tuple tuple = querySchema.buildTuple(rank, hit);
- bag.add(tuple);
- }
-
- return bag;
- }
-}
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaSimpleJsonLoader.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaSimpleJsonLoader.java
deleted file mode 100644
index 9dc294ce243..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaSimpleJsonLoader.java
+++ /dev/null
@@ -1,63 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.pig;
-
-import com.yahoo.vespa.hadoop.mapreduce.VespaSimpleJsonInputFormat;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.InputFormat;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.RecordReader;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.pig.LoadFunc;
-import org.apache.pig.backend.hadoop.executionengine.mapReduceLayer.PigSplit;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.data.TupleFactory;
-
-import java.io.IOException;
-
-/**
- * Simple JSON loader which loads either one JSON object per line or a
- * multiline JSON consisting of objects in an array.
- *
- * Returns only the textual representation of the JSON object.
- *
- * @author lesters
- */
-@SuppressWarnings("rawtypes")
-public class VespaSimpleJsonLoader extends LoadFunc {
-
- private TupleFactory tupleFactory = TupleFactory.getInstance();
- private VespaSimpleJsonInputFormat.VespaJsonRecordReader recordReader;
-
- @Override
- public void setLocation(String location, Job job) throws IOException {
- FileInputFormat.setInputPaths(job, location);
- }
-
- @Override
- public InputFormat getInputFormat() throws IOException {
- return new VespaSimpleJsonInputFormat();
- }
-
- @Override
- public void prepareToRead(RecordReader reader, PigSplit split) throws IOException {
- recordReader = (VespaSimpleJsonInputFormat.VespaJsonRecordReader) reader;
- }
-
- @Override
- public Tuple getNext() throws IOException {
- try {
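- // note: a true return from nextKeyValue() is treated as end of input here;
- // the custom VespaJsonRecordReader is assumed to follow this inverted convention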
- boolean done = recordReader.nextKeyValue();
- if (done) {
- return null;
- }
- Text json = recordReader.getCurrentKey();
- if (json == null) {
- return null;
- }
- return tupleFactory.newTuple(json.toString());
-
- } catch (InterruptedException ex) {
- return null;
- }
- }
-}
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaStorage.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaStorage.java
deleted file mode 100644
index a564dfac25d..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/VespaStorage.java
+++ /dev/null
@@ -1,178 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.pig;
-
-import com.fasterxml.jackson.databind.ObjectMapper;
-import com.yahoo.vespa.hadoop.mapreduce.VespaOutputFormat;
-import com.yahoo.vespa.hadoop.mapreduce.util.TupleTools;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.OutputFormat;
-import org.apache.hadoop.mapreduce.RecordWriter;
-import org.apache.pig.ResourceSchema;
-import org.apache.pig.StoreFunc;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.apache.pig.impl.util.UDFContext;
-
-import java.io.*;
-import java.util.Base64;
-import java.util.Map;
-import java.util.Properties;
-
-/**
- * A small Pig UDF wrapper around the Vespa http client for
- * feeding data into a Vespa endpoint.
- *
- * @author lesters
- */
-@SuppressWarnings("rawtypes")
-public class VespaStorage extends StoreFunc {
-
- private final boolean createDocOp;
- private final String template;
- private final VespaDocumentOperation.Operation operation;
-
- private String signature = null;
- private RecordWriter recordWriter = null;
- private ResourceSchema resourceSchema = null;
-
- private static final String PROPERTY_CREATE_DOC_OP = "create-document-operation";
- private static final String PROPERTY_ID_TEMPLATE = "docid";
- private static final String PROPERTY_OPERATION = "operation";
- private static final String PROPERTY_RESOURCE_SCHEMA = "resource_schema";
-
- Properties properties = new Properties();
-
- public VespaStorage() {
- createDocOp = false;
- template = null;
- operation = null;
- }
-
- public VespaStorage(String... params) {
- properties = VespaConfiguration.loadProperties(params);
- createDocOp = Boolean.parseBoolean(properties.getProperty(PROPERTY_CREATE_DOC_OP, "false"));
- operation = VespaDocumentOperation.Operation.fromString(properties.getProperty(PROPERTY_OPERATION, "put"));
- template = properties.getProperty(PROPERTY_ID_TEMPLATE);
- }
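-
- // Typical Pig usage (a sketch; the endpoint is a placeholder, see
- // src/test/pig/feed_operations.pig for working examples):
- // STORE data INTO '$ENDPOINT' USING com.yahoo.vespa.hadoop.pig.VespaStorage();
- // or, letting the UDF build document operations from the input fields:
- // STORE data INTO '$ENDPOINT' USING com.yahoo.vespa.hadoop.pig.VespaStorage('create-document-operation=true', 'operation=put', 'docid=id:<application>:metrics::<name>-<date>');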
-
-
- @Override
- public OutputFormat getOutputFormat() throws IOException {
- return new VespaOutputFormat(properties);
- }
-
-
- @Override
- public void setStoreLocation(String endpoint, Job job) throws IOException {
- properties.setProperty(VespaConfiguration.ENDPOINT, endpoint);
- }
-
-
- @Override
- public void prepareToWrite(RecordWriter recordWriter) throws IOException {
- this.recordWriter = recordWriter;
- this.resourceSchema = getResourceSchema();
- }
-
-
- @SuppressWarnings("unchecked")
- @Override
- public void putNext(Tuple tuple) throws IOException {
- if (tuple == null || tuple.size() == 0) {
- return;
- }
-
- String data = null;
- if (createDocOp) {
- data = createDocumentOperation(tuple);
- } else if (!tuple.isNull(0)) {
- data = tuple.get(0).toString(); // assume single field with correctly formatted doc op.
- }
-
- if (data == null || data.length() == 0) {
- return;
- }
-
- try {
- recordWriter.write(0, data);
- } catch (InterruptedException e) {
- throw new IOException(e);
- }
- }
-
-
- @Override
- public void checkSchema(ResourceSchema resourceSchema) throws IOException {
- setResourceSchema(resourceSchema);
- }
-
-
- @Override
- public String relToAbsPathForStoreLocation(String endpoint, Path path) throws IOException {
- return endpoint;
- }
-
-
- @Override
- public void setStoreFuncUDFContextSignature(String s) {
- this.signature = s;
- }
-
-
- @Override
- public void cleanupOnFailure(String s, Job job) throws IOException {
- }
-
-
- @Override
- public void cleanupOnSuccess(String s, Job job) throws IOException {
- }
-
-
- private ResourceSchema getResourceSchema() throws IOException {
- Properties properties = getUDFProperties();
- return base64Deserialize(properties.getProperty(PROPERTY_RESOURCE_SCHEMA));
- }
-
-
- private void setResourceSchema(ResourceSchema schema) throws IOException {
- Properties properties = getUDFProperties();
- if (properties.getProperty(PROPERTY_RESOURCE_SCHEMA) == null) {
- properties.setProperty(PROPERTY_RESOURCE_SCHEMA, base64Serialize(schema));
- }
- }
-
-
- private Properties getUDFProperties() {
- String[] context = { signature };
- return UDFContext.getUDFContext().getUDFProperties(getClass(), context);
- }
-
-
- private String createDocumentOperation(Tuple tuple) throws IOException {
- if (tuple == null || tuple.size() == 0) {
- return null;
- }
- if (resourceSchema == null) {
- return null;
- }
-
- Map<String, Object> fields = TupleTools.tupleMap(resourceSchema, tuple);
- String docId = TupleTools.toString(fields, template);
-
- Schema schema = Schema.getPigSchema(resourceSchema);
- return VespaDocumentOperation.create(operation, docId, fields, properties, schema);
- }
-
- public static String base64Serialize(ResourceSchema resourceSchema) throws IOException {
- byte[] bytes = new ObjectMapper().writeValueAsBytes(resourceSchema);
- return Base64.getEncoder().encodeToString(bytes);
- }
-
- public static ResourceSchema base64Deserialize(String s) throws IOException {
- byte[] data = Base64.getDecoder().decode(s);
- return new ObjectMapper().readValue(data, ResourceSchema.class);
- }
-}
diff --git a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/package-info.java b/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/package-info.java
deleted file mode 100644
index 686765ac047..00000000000
--- a/vespa-hadoop/src/main/java/com/yahoo/vespa/hadoop/pig/package-info.java
+++ /dev/null
@@ -1,10 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-/**
- * com.yahoo.vespa.hadoop.pig contains classes and utilities
- * to enable feeding directly to Vespa endpoints from Pig.
- * It is a minimal layer over the Vespa HTTP client.
- *
- * NOTE: This is a PUBLIC API, but not annotated as such because this is not a bundle and
- * we don't want to introduce Vespa dependencies.
- */
-package com.yahoo.vespa.hadoop.pig;
diff --git a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/MapReduceTest.java b/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/MapReduceTest.java
deleted file mode 100644
index d56cd818de2..00000000000
--- a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/MapReduceTest.java
+++ /dev/null
@@ -1,200 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.pig;
-
-import com.fasterxml.jackson.core.JsonEncoding;
-import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonGenerator;
-import com.yahoo.vespa.hadoop.mapreduce.VespaOutputFormat;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaCounters;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.fs.FSDataOutputStream;
-import org.apache.hadoop.fs.FileSystem;
-import org.apache.hadoop.fs.LocalFileSystem;
-import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.hdfs.HdfsConfiguration;
-import org.apache.hadoop.hdfs.MiniDFSCluster;
-import org.apache.hadoop.io.LongWritable;
-import org.apache.hadoop.io.Text;
-import org.apache.hadoop.mapreduce.Job;
-import org.apache.hadoop.mapreduce.Mapper;
-import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
-import org.apache.hadoop.test.PathUtils;
-import org.junit.jupiter.api.AfterAll;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-
-import java.io.BufferedInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.StringTokenizer;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-
-public class MapReduceTest {
-
- protected static File hdfsBaseDir;
- protected static FileSystem hdfs;
- protected static Configuration conf;
- protected static MiniDFSCluster cluster;
-
- protected static Path metricsJsonPath;
- protected static Path metricsCsvPath;
-
- @BeforeAll
- public static void setUp() throws IOException {
- hdfsBaseDir = new File(PathUtils.getTestDir(MapReduceTest.class).getCanonicalPath());
-
- conf = new HdfsConfiguration();
- conf.set(MiniDFSCluster.HDFS_MINIDFS_BASEDIR, hdfsBaseDir.getAbsolutePath());
- conf.set(VespaConfiguration.DRYRUN, "true");
- conf.set(VespaConfiguration.ENDPOINT, "endpoint-does-not-matter-in-dryrun");
-
- cluster = new MiniDFSCluster.Builder(conf).build();
- hdfs = FileSystem.get(conf);
-
- metricsJsonPath = new Path("metrics_json");
- metricsCsvPath = new Path("metrics_csv");
- copyToHdfs("src/test/resources/operations_data.json", metricsJsonPath, "data");
- copyToHdfs("src/test/resources/tabular_data.csv", metricsCsvPath, "data");
- }
-
- @AfterAll
- public static void tearDown() throws IOException {
- Path testDir = new Path(hdfsBaseDir.getParent());
- hdfs.delete(testDir, true);
- cluster.shutdown();
- LocalFileSystem localFileSystem = FileSystem.getLocal(conf);
- localFileSystem.delete(testDir, true);
- }
-
- @Test
- public void requireThatMapOnlyJobSucceeds() throws Exception {
- Job job = Job.getInstance(conf);
- job.setJarByClass(MapReduceTest.class);
- job.setMapperClass(FeedMapper.class);
- job.setOutputFormatClass(VespaOutputFormat.class);
- job.setMapOutputValueClass(Text.class);
-
- FileInputFormat.setInputPaths(job, metricsJsonPath);
-
- boolean success = job.waitForCompletion(true);
- assertTrue(success, "Job Failed");
-
- VespaCounters counters = VespaCounters.get(job);
- assertEquals(10, counters.getDocumentsSent());
- assertEquals(0, counters.getDocumentsFailed());
- assertEquals(10, counters.getDocumentsOk());
- }
-
- @Test
- public void requireThatMapReduceJobSucceeds() throws Exception {
- Job job = Job.getInstance(conf);
- job.setJarByClass(MapReduceTest.class);
- job.setMapperClass(FeedMapper.class);
- job.setOutputFormatClass(VespaOutputFormat.class);
- job.setMapOutputValueClass(Text.class);
- job.setReducerClass(FeedReducer.class);
- job.setNumReduceTasks(1);
-
- FileInputFormat.setInputPaths(job, metricsJsonPath);
-
- boolean success = job.waitForCompletion(true);
- assertTrue(success, "Job Failed");
-
- VespaCounters counters = VespaCounters.get(job);
- assertEquals(10, counters.getDocumentsSent());
- assertEquals(0, counters.getDocumentsFailed());
- assertEquals(10, counters.getDocumentsOk());
- }
-
-
- @Test
- public void requireThatTransformMapJobSucceeds() throws Exception {
- Job job = Job.getInstance(conf);
- job.setJarByClass(MapReduceTest.class);
- job.setMapperClass(ParsingMapper.class);
- job.setOutputFormatClass(VespaOutputFormat.class);
- job.setMapOutputValueClass(Text.class);
- job.setReducerClass(FeedReducer.class);
- job.setNumReduceTasks(1);
-
- FileInputFormat.setInputPaths(job, metricsCsvPath);
-
- boolean success = job.waitForCompletion(true);
- assertTrue(success, "Job Failed");
-
- VespaCounters counters = VespaCounters.get(job);
- assertEquals(10, counters.getDocumentsSent());
- assertEquals(0, counters.getDocumentsFailed());
- assertEquals(10, counters.getDocumentsOk());
- assertEquals(0, counters.getDocumentsSkipped());
- }
-
-
- private static void copyToHdfs(String localFile, Path hdfsDir, String hdfsName) throws IOException {
- Path hdfsPath = new Path(hdfsDir, hdfsName);
- try (InputStream in = new BufferedInputStream(new FileInputStream(localFile));
- FSDataOutputStream out = hdfs.create(hdfsPath)) {
- int len;
- byte[] buffer = new byte[1024];
- while ((len = in.read(buffer)) > 0) {
- out.write(buffer, 0, len);
- }
- }
- }
-
- public static class FeedMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
- public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
- context.write(key, value);
- }
- }
-
- public static class FeedReducer extends Reducer<Object, Text, LongWritable, Text> {
- @Override
- public void reduce(Object key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
- // pass each value through unchanged; the key is the mapper's LongWritable offset.
- // (the previous signature did not override Reducer.reduce and was never invoked)
- for (Text value : values) {
- context.write((LongWritable) key, value);
- }
- }
- }
-
- public static class ParsingMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
- public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
- String line = value.toString();
- if (line == null || line.length() == 0)
- return;
-
- StringTokenizer tokenizer = new StringTokenizer(line);
- long date = Long.parseLong(tokenizer.nextToken());
- String metricName = tokenizer.nextToken();
- long metricValue = Long.parseLong(tokenizer.nextToken());
- String application = tokenizer.nextToken();
-
- String docid = "id:"+application+":metric::"+metricName+"-"+date;
-
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- JsonGenerator g = new JsonFactory().createGenerator(out, JsonEncoding.UTF8);
-
- g.writeStartObject();
- g.writeObjectFieldStart("fields");
- g.writeNumberField("date", date);
- g.writeStringField("name", metricName);
- g.writeNumberField("value", metricValue);
- g.writeStringField("application", application);
- g.writeEndObject();
- g.writeStringField("put", docid);
- g.writeEndObject();
- g.close();
-
- context.write(key, new Text(out.toString()));
- }
- }
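-
- // Illustrative output of ParsingMapper for one CSV row (values depend on the input data):
- // {"fields":{"date":20160112,"name":"clicks","value":3,"application":"testapp"},"put":"id:testapp:metric::clicks-20160112"}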
-
-
-}
diff --git a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperationTest.java b/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperationTest.java
deleted file mode 100644
index ec20e82763c..00000000000
--- a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaDocumentOperationTest.java
+++ /dev/null
@@ -1,633 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.pig;
-
-import com.fasterxml.jackson.databind.JsonNode;
-import com.fasterxml.jackson.databind.ObjectMapper;
-import org.apache.pig.data.BagFactory;
-import org.apache.pig.data.DataBag;
-import org.apache.pig.data.DataByteArray;
-import org.apache.pig.data.DataType;
-import org.apache.pig.data.SortedDataBag;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.data.TupleFactory;
-import org.apache.pig.impl.logicalLayer.FrontendException;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.junit.jupiter.api.AfterEach;
-import org.junit.jupiter.api.BeforeEach;
-import org.junit.jupiter.api.Test;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.PrintStream;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-import static org.junit.jupiter.api.Assertions.assertNull;
-import static org.junit.jupiter.api.Assertions.assertTrue;
-
-public class VespaDocumentOperationTest {
- private final ByteArrayOutputStream outContent = new ByteArrayOutputStream();
- private final PrintStream originalOut = System.out;
-
- @BeforeEach
- public void setUpStreams() {
- System.setOut(new PrintStream(outContent));
- }
-
- @AfterEach
- public void restoreStreams() {
- System.setOut(originalOut);
- }
- @Test
- public void requireThatUDFReturnsCorrectJson() throws Exception {
- String json = getDocumentOperationJson("docid=id:<application>:metrics::<name>-<date>");
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.path("fields");
-
- // operation put is default
- assertEquals("id:testapp:metrics::clicks-20160112", root.get("put").asText());
- assertEquals("testapp", fields.get("application").asText());
- assertEquals("clicks", fields.get("name").asText());
- assertEquals(3, fields.get("value").asInt());
- }
-
-
- @Test
- public void requireThatUDFSupportsUpdateAssign() throws IOException {
- String json = getDocumentOperationJson("docid=id:<application>:metrics::<name>-<date>", "operation=update");
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.path("fields");
-
- assertEquals("id:testapp:metrics::clicks-20160112", root.get("update").asText());
- assertEquals("testapp", fields.get("application").get("assign").asText());
- assertEquals("clicks", fields.get("name").get("assign").asText());
- assertEquals(3, fields.get("value").get("assign").asInt());
- }
-
- @Test
- public void requireThatUDFSupportsConditionalUpdateAssign() throws IOException {
- String json = getDocumentOperationJson("docid=id:<application>:metrics::<name>-<date>", "operation=update", "condition=clicks < <value>");
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.path("fields");
-
- assertEquals("id:testapp:metrics::clicks-20160112", root.get("update").asText());
- assertEquals("clicks < 3", root.get("condition").asText());
- assertEquals("testapp", fields.get("application").get("assign").asText());
- assertEquals("clicks", fields.get("name").get("assign").asText());
- assertEquals(3, fields.get("value").get("assign").asInt());
- }
-
- @Test
- public void requireThatUDFSupportsCreateIfNonExistent() throws IOException {
- String json = getDocumentOperationJson("docid=id:<application>:metrics::<name>-<date>", "operation=update",
- "create-if-non-existent=true");
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.path("fields");
-
- assertEquals("id:testapp:metrics::clicks-20160112", root.get("update").asText());
- assertTrue(root.get("create").asBoolean());
- assertEquals("testapp", fields.get("application").get("assign").asText());
- assertEquals("clicks", fields.get("name").get("assign").asText());
- assertEquals(3, fields.get("value").get("assign").asInt());
- }
-
-
- @Test
- public void requireThatUDFReturnsNullForMissingConfig() throws Exception {
- String json = getDocumentOperationJson();
- assertNull(json);
- }
-
-
- @Test
- public void requireThatUDFCorrectlyGeneratesRemoveBagAsMapOperation() throws Exception {
- DataBag bag = BagFactory.getInstance().newDefaultBag();
-
- Schema innerObjectSchema = new Schema();
- Tuple innerObjectTuple = TupleFactory.getInstance().newTuple();
- addToTuple("year", DataType.CHARARRAY, "2020", innerObjectSchema, innerObjectTuple);
- addToTuple("month", DataType.INTEGER, 3, innerObjectSchema, innerObjectTuple);
-
- Schema objectSchema = new Schema();
- Tuple objectTuple = TupleFactory.getInstance().newTuple();
- addToTuple("key", DataType.CHARARRAY, "234566", objectSchema, objectTuple);
- addToTuple("value", DataType.TUPLE, innerObjectTuple,innerObjectSchema,objectSchema, objectTuple);
-
- Schema bagSchema = new Schema();
- addToBagWithSchema("firstLayerTuple",DataType.TUPLE,objectTuple,objectSchema,bagSchema,bag);
-
- innerObjectSchema = new Schema();
- innerObjectTuple = TupleFactory.getInstance().newTuple();
- addToTuple("year", DataType.CHARARRAY, "2020", innerObjectSchema, innerObjectTuple);
- addToTuple("month", DataType.INTEGER, 3, innerObjectSchema, innerObjectTuple);
-
- objectSchema = new Schema();
- objectTuple = TupleFactory.getInstance().newTuple();
- addToTuple("key", DataType.CHARARRAY, "123456", objectSchema, objectTuple);
- addToTuple("value", DataType.TUPLE, innerObjectTuple,innerObjectSchema,objectSchema, objectTuple);
-
- addToBagWithSchema("firstLayerTuple",DataType.TUPLE,objectTuple,objectSchema,bagSchema,bag);
-
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
- addToTuple("bag", DataType.BAG, bag, bagSchema, schema, tuple);
- addToTuple("id", DataType.CHARARRAY, "123", schema, tuple);
-
- VespaDocumentOperation docOp = new VespaDocumentOperation("docid=id", "remove-map-fields=bag","operation=update");
- docOp.setInputSchema(schema);
- String json = docOp.exec(tuple);
-
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.get("fields");
- assertEquals("{\"remove\":0}", fields.get("bag{123456}").toString());
- assertEquals("{\"remove\":0}", fields.get("bag{234566}").toString());
-
- }
-
- @Test
- public void requireThatUDFCorrectlyGeneratesAddBagAsMapOperation() throws Exception {
-
- DataBag bag = BagFactory.getInstance().newDefaultBag();
-
- Schema innerObjectSchema = new Schema();
- Tuple innerObjectTuple = TupleFactory.getInstance().newTuple();
- addToTuple("year", DataType.CHARARRAY, "2020", innerObjectSchema, innerObjectTuple);
- addToTuple("month", DataType.INTEGER, 3, innerObjectSchema, innerObjectTuple);
-
- Schema objectSchema = new Schema();
- Tuple objectTuple = TupleFactory.getInstance().newTuple();
- addToTuple("key", DataType.CHARARRAY, "123456", objectSchema, objectTuple);
- addToTuple("value", DataType.TUPLE, innerObjectTuple,innerObjectSchema,objectSchema, objectTuple);
-
- Schema bagSchema = new Schema();
- addToBagWithSchema("firstLayerTuple",DataType.TUPLE,objectTuple,objectSchema,bagSchema,bag);
-
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
- addToTuple("bag", DataType.BAG, bag, bagSchema, schema, tuple);
- addToTuple("id", DataType.CHARARRAY, "123", schema, tuple);
- VespaDocumentOperation docOp = new VespaDocumentOperation("docid=id", "update-map-fields=bag","operation=update");
- docOp.setInputSchema(schema);
- String json = docOp.exec(tuple);
-
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
-
- JsonNode fields = root.get("fields");
- JsonNode value = fields.get("bag{123456}");
- JsonNode assign = value.get("assign");
- assertEquals("2020", assign.get("year").asText());
- assertEquals(3, assign.get("month").asInt());
- }
-
- @Test
- public void requireThatUDFCorrectlyGeneratesAddTensorOperation() throws Exception {
-
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
-
- // Please refer to the tensor format documentation
-
- Map<String, Double> tensor = new HashMap<String, Double>() {{
- put("x:label1,y:label2,z:label4", 2.0);
- put("x:label3", 3.0);
- }};
-
- addToTuple("id", DataType.CHARARRAY, "123", schema, tuple);
- addToTuple("tensor", DataType.MAP, tensor, schema, tuple);
-
- VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty", "update-tensor-fields=tensor","operation=update");
- docOp.setInputSchema(schema);
- String json = docOp.exec(tuple);
-
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.get("fields");
- JsonNode tensorValue = fields.get("tensor");
- JsonNode add = tensorValue.get("add");
- JsonNode cells = add.get("cells");
- Iterator<JsonNode> cellsIterator = cells.iterator();
-
- JsonNode element = cellsIterator.next();
- assertEquals("label1", element.get("address").get("x").asText());
- assertEquals("label2", element.get("address").get("y").asText());
- assertEquals("label4", element.get("address").get("z").asText());
- assertEquals("2.0", element.get("value").toString());
-
- element = cellsIterator.next();
- assertEquals("label3", element.get("address").get("x").asText());
- assertEquals("3.0", element.get("value").toString());
- }
-
- @Test
- public void requireThatUDFCorrectlyGeneratesRemoveTensorOperation() throws Exception {
-
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
-
- // Please refer to the tensor format documentation
-
- Map<String, Double> tensor = new HashMap<String, Double>() {{
- put("x:label1,y:label2,z:label4", 2.0);
- put("x:label3", 3.0);
- }};
-
- addToTuple("id", DataType.CHARARRAY, "123", schema, tuple);
- addToTuple("tensor", DataType.MAP, tensor, schema, tuple);
-
- VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty", "remove-tensor-fields=tensor","operation=update");
- docOp.setInputSchema(schema);
- String json = docOp.exec(tuple);
-
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.get("fields");
- JsonNode tensorValue = fields.get("tensor");
- JsonNode remove = tensorValue.get("remove");
- JsonNode address = remove.get("addresses");
-
- Iterator<JsonNode> addressIterator = address.iterator();
-
- JsonNode element = addressIterator.next();
- assertEquals("label1", element.get("x").asText());
-
- element = addressIterator.next();
- assertEquals("label2", element.get("y").asText());
-
- element = addressIterator.next();
- assertEquals("label4", element.get("z").asText());
-
- element = addressIterator.next();
- assertEquals("label3", element.get("x").asText());
- }
-
- @Test
- public void requireThatUDFReturnsNullWhenExceptionHappens() throws IOException {
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
-
- // broken tensor cell address that makes tensor creation throw internally
- Map<String, Double> tensor = new HashMap<String, Double>() {{
- put("xlabel1", 2.0); // missing : between 'x' and 'label1'
- }};
-
- addToTuple("id", DataType.CHARARRAY, "123", schema, tuple);
- addToTuple("tensor", DataType.MAP, tensor, schema, tuple);
-
- VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty", "create-tensor-fields=tensor");
- docOp.setInputSchema(schema);
- String json = docOp.exec(tuple);
-
- assertNull(json);
- }
-
- @Test
- public void requireThatUDFCorrectlyGeneratesRemoveOperation() throws Exception {
- String json = getDocumentOperationJson("operation=remove", "docid=id:<application>:metrics::<name>-<date>");
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.get("fields");
-
- assertEquals("id:testapp:metrics::clicks-20160112", root.get("remove").asText());
- assertNull(fields);
- }
-
-
- @Test
- public void requireThatUDFGeneratesComplexDataTypes() throws Exception {
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
-
- Tuple intTuple = TupleFactory.getInstance().newTuple();
- int[] intArray = {1, 2, 3};
- for (int i : intArray) { intTuple.append(i); }
-
- Tuple stringTuple = TupleFactory.getInstance().newTuple();
- String[] stringArray = {"a", "b", "c"};
- for (String s : stringArray) { stringTuple.append(s); }
-
- DataBag bag = new SortedDataBag(null);
- bag.add(intTuple);
- bag.add(stringTuple);
-
- Map<String, Object> innerMap = new HashMap<String, Object>() {{
- put("a", "string");
- put("tuple", intTuple);
- }};
-
- DataByteArray bytes = new DataByteArray("testdata".getBytes());
-
- Map<String, Object> outerMap = new HashMap<String, Object>() {{
- put("string", "value");
- put("int", 3);
- put("float", 3.145);
- put("bool", true);
- put("byte", bytes);
- put("map", innerMap);
- put("bag", bag);
- }};
-
- addToTuple("map", DataType.MAP, outerMap, schema, tuple);
-
- VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty");
- docOp.setInputSchema(schema);
- String json = docOp.exec(tuple);
-
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.get("fields");
- JsonNode map = fields.get("map");
-
- assertEquals("value", map.get("string").asText());
- assertEquals(3, map.get("int").asInt());
- assertEquals(3.145, map.get("float").asDouble(), 1e-6);
- assertTrue(map.get("bool").asBoolean());
- assertEquals("dGVzdGRhdGE=", map.get("byte").asText());
-
- assertEquals("string", map.get("map").get("a").asText());
- for (int i = 0; i < intArray.length; ++i) {
- assertEquals(intArray[i], map.get("map").get("tuple").get(i).asInt());
- }
-
- JsonNode bagField = map.get("bag");
- for (int i = 0; i < intArray.length; ++i) {
- assertEquals(intArray[i], bagField.get(0).get(i).asInt());
- }
- for (int i = 0; i < stringArray.length; ++i) {
- assertEquals(stringArray[i], bagField.get(1).get(i).asText());
- }
- }
-
-
- @Test
- public void requireThatSimpleArraysMustBeConfigured() throws Exception {
- String[] stringArray = {"a", "b", "c"};
- JsonNode array = setupSimpleArrayOperation("array", stringArray, "docid=empty"); // simple arrays not configured
- // json: [["a"], ["b"], ["c"]]
- assertEquals("a", array.get(0).get(0).asText());
- assertEquals("b", array.get(1).get(0).asText());
- assertEquals("c", array.get(2).get(0).asText());
- }
-
-
- @Test
- public void requireThatSimpleArraysAreSupported() throws Exception {
- String[] stringArray = {"a", "b", "c"};
- JsonNode array = setupSimpleArrayOperation("array", stringArray, "docid=empty", "simple-array-fields=array");
- // json: ["a", "b", "c"]
- assertEquals("a", array.get(0).asText());
- assertEquals("b", array.get(1).asText());
- assertEquals("c", array.get(2).asText());
- }
-
-
- @Test
- public void requireThatSimpleArraysCanBeConfiguredWithWildcard() throws Exception {
- String[] stringArray = {"a", "b", "c"};
- JsonNode array = setupSimpleArrayOperation("array", stringArray, "docid=empty", "simple-array-fields=*");
- // json: ["a", "b", "c"]
- assertEquals("a", array.get(0).asText());
- assertEquals("b", array.get(1).asText());
- assertEquals("c", array.get(2).asText());
- }
-
-
- @Test
- public void requireThatMultipleSimpleArraysAreSupported() throws Exception {
- String[] stringArray = {"a", "b", "c"};
- JsonNode array = setupSimpleArrayOperation("array", stringArray, "docid=empty", "simple-array-fields=empty,array");
- // json: ["a", "b", "c"]
- assertEquals("a", array.get(0).asText());
- assertEquals("b", array.get(1).asText());
- assertEquals("c", array.get(2).asText());
- }
-
-
- private JsonNode setupSimpleArrayOperation(String name, String[] array, String... params) throws IOException {
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
-
- DataBag bag = new SortedDataBag(null);
- for (String s : array) {
- Tuple stringTuple = TupleFactory.getInstance().newTuple();
- stringTuple.append(s);
- bag.add(stringTuple);
- }
- addToTuple(name, DataType.BAG, bag, schema, tuple);
-
- VespaDocumentOperation docOp = new VespaDocumentOperation(params);
- docOp.setInputSchema(schema);
- String json = docOp.exec(tuple);
-
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.get("fields");
- return fields.get(name);
- }
-
-
- @Test
- public void requireThatUDFSupportsTensors() throws IOException {
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
-
- // Please refer to the tensor format documentation
-
- Map<String, Double> tensor = new HashMap<String, Double>() {{
- put("x:label1,y:label2,z:label4", 2.0);
- put("x:label3", 3.0);
- }};
-
- addToTuple("id", DataType.CHARARRAY, "123", schema, tuple);
- addToTuple("tensor", DataType.MAP, tensor, schema, tuple);
-
- VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty", "create-tensor-fields=tensor");
- docOp.setInputSchema(schema);
- String json = docOp.exec(tuple);
-
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.get("fields");
- JsonNode tensorNode = fields.get("tensor");
- JsonNode cells = tensorNode.get("cells");
-
- assertEquals("label1", cells.get(0).get("address").get("x").asText());
- assertEquals("label2", cells.get(0).get("address").get("y").asText());
- assertEquals("label4", cells.get(0).get("address").get("z").asText());
- assertEquals("label3", cells.get(1).get("address").get("x").asText());
-
- assertEquals(2.0, cells.get(0).get("value").asDouble(), 1e-6);
- assertEquals(3.0, cells.get(1).get("value").asDouble(), 1e-6);
- }
-
-
- @Test
- public void requireThatUDFCanExcludeFields() throws IOException {
- String json = getDocumentOperationJson("docid=id:<application>:metrics::<name>-<date>", "exclude-fields=application,date");
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.path("fields");
-
- // 'application' and 'date' fields should not appear in JSON
- assertNull(fields.get("application"));
- assertNull(fields.get("date"));
- assertNotNull(fields.get("name"));
- assertNotNull(fields.get("value"));
- }
-
-
- private String getDocumentOperationJson(String... params) throws IOException {
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
-
- addToTuple("application", DataType.CHARARRAY, "testapp", schema, tuple);
- addToTuple("name", DataType.CHARARRAY, "clicks", schema, tuple);
- addToTuple("date", DataType.CHARARRAY, "20160112", schema, tuple);
- addToTuple("value", DataType.CHARARRAY, 3, schema, tuple);
-
- VespaDocumentOperation docOp = new VespaDocumentOperation(params);
- docOp.setInputSchema(schema);
- return docOp.exec(tuple);
- }
-
-
- @Test
- public void requireThatUDFSupportsSimpleObjectFields() throws IOException {
- Schema objectSchema = new Schema();
- Tuple objectTuple = TupleFactory.getInstance().newTuple();
- addToTuple("id", DataType.LONG, 123456789L, objectSchema, objectTuple);
- addToTuple("url", DataType.CHARARRAY, "example.com", objectSchema, objectTuple);
- addToTuple("value", DataType.INTEGER, 123, objectSchema, objectTuple);
-
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
- addToTuple("object", DataType.TUPLE, objectTuple, objectSchema, schema, tuple);
-
- VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty", "simple-object-fields=object");
- docOp.setInputSchema(schema);
- String json = docOp.exec(tuple);
-
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.get("fields");
- JsonNode objectNode = fields.get("object");
-
- assertEquals(123456789L, objectNode.get("id").asLong());
- assertEquals("example.com", objectNode.get("url").asText());
- assertEquals(123, objectNode.get("value").asInt());
- }
-
-
- @Test
- public void requireThatUDFSupportsBagAsMapFields() throws IOException {
- DataBag bag = BagFactory.getInstance().newDefaultBag();
-
- Schema objectSchema = new Schema();
- Tuple objectTuple = TupleFactory.getInstance().newTuple();
- addToTuple("key", DataType.CHARARRAY, "123456", objectSchema, objectTuple);
- addToTuple("value", DataType.INTEGER, 123456, objectSchema, objectTuple);
- bag.add(objectTuple);
-
- objectSchema = new Schema();
- objectTuple = TupleFactory.getInstance().newTuple();
- addToTuple("key", DataType.CHARARRAY, "234567", objectSchema, objectTuple);
- addToTuple("value", DataType.INTEGER, 234567, objectSchema, objectTuple);
- bag.add(objectTuple);
-
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
- addToTuple("bag", DataType.BAG, bag, objectSchema, schema, tuple);
-
- VespaDocumentOperation docOp = new VespaDocumentOperation("docid=empty", "bag-as-map-fields=bag");
- docOp.setInputSchema(schema);
- String json = docOp.exec(tuple);
-
- ObjectMapper m = new ObjectMapper();
- JsonNode root = m.readTree(json);
- JsonNode fields = root.get("fields");
- JsonNode bagNode = fields.get("bag");
-
- assertEquals(123456, bagNode.get("123456").asInt());
- assertEquals(234567, bagNode.get("234567").asInt());
- }
-
- @Test
- public void requireThatUDFPrintIdWhenVerbose() throws IOException {
- DataBag bag = BagFactory.getInstance().newDefaultBag();
-
- Schema objectSchema = new Schema();
- Tuple objectTuple = TupleFactory.getInstance().newTuple();
- addToTuple("key", DataType.CHARARRAY, "123456", objectSchema, objectTuple);
- addToTuple("value", DataType.INTEGER, 123456, objectSchema, objectTuple);
- bag.add(objectTuple);
-
- objectSchema = new Schema();
- objectTuple = TupleFactory.getInstance().newTuple();
- addToTuple("key", DataType.CHARARRAY, "234567", objectSchema, objectTuple);
- addToTuple("value", DataType.INTEGER, 234567, objectSchema, objectTuple);
- bag.add(objectTuple);
-
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
- addToTuple("bag", DataType.BAG, bag, objectSchema, schema, tuple);
-
- VespaDocumentOperation docOp = new VespaDocumentOperation("docid=7654321", "bag-as-map-fields=bag","verbose=true");
- docOp.setInputSchema(schema);
- String json = docOp.exec(tuple);
-
- assertTrue(outContent.toString().contains("Processing docId: 7654321"));
- }
-
- @Test
- public void requireThatUDFVerboseSetToFalseByDefault() throws IOException {
- DataBag bag = BagFactory.getInstance().newDefaultBag();
-
- Schema objectSchema = new Schema();
- Tuple objectTuple = TupleFactory.getInstance().newTuple();
- addToTuple("key", DataType.CHARARRAY, "123456", objectSchema, objectTuple);
- addToTuple("value", DataType.INTEGER, 123456, objectSchema, objectTuple);
- bag.add(objectTuple);
-
- objectSchema = new Schema();
- objectTuple = TupleFactory.getInstance().newTuple();
- addToTuple("key", DataType.CHARARRAY, "234567", objectSchema, objectTuple);
- addToTuple("value", DataType.INTEGER, 234567, objectSchema, objectTuple);
- bag.add(objectTuple);
-
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
- addToTuple("bag", DataType.BAG, bag, objectSchema, schema, tuple);
-
- VespaDocumentOperation docOp = new VespaDocumentOperation("docid=7654321", "bag-as-map-fields=bag");
- docOp.setInputSchema(schema);
- String json = docOp.exec(tuple);
-
- assertEquals("", outContent.toString());
- }
-
- private void addToTuple(String alias, byte type, Object value, Schema schema, Tuple tuple) {
- schema.add(new Schema.FieldSchema(alias, type));
- tuple.append(value);
- }
-
-
- private void addToTuple(String alias, byte type, Object value, Schema schemaInField, Schema schema, Tuple tuple)
- throws FrontendException {
- schema.add(new Schema.FieldSchema(alias, schemaInField, type));
- tuple.append(value);
- }
-
- private void addToBagWithSchema(String alias, byte type, Tuple value, Schema schemaInField, Schema schema,DataBag bag)
- throws FrontendException {
- schema.add(new Schema.FieldSchema(alias, schemaInField, type));
- bag.add(value);
- }
-}
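
For reference, a minimal sketch of how the removed VespaDocumentOperation UDF could be driven outside Pig, mirroring the deleted tests above. The docid template is taken from the test scripts in this commit; the class name and standalone main are hypothetical usage, not part of the original module:

    import com.yahoo.vespa.hadoop.pig.VespaDocumentOperation;
    import org.apache.pig.data.DataType;
    import org.apache.pig.data.Tuple;
    import org.apache.pig.data.TupleFactory;
    import org.apache.pig.impl.logicalLayer.schema.Schema;

    public class VespaDocumentOperationSketch {
        public static void main(String[] args) throws Exception {
            // Build a one-row schema/tuple, as the deleted tests did.
            Schema schema = new Schema();
            Tuple tuple = TupleFactory.getInstance().newTuple();
            schema.add(new Schema.FieldSchema("application", DataType.CHARARRAY));
            tuple.append("testapp");
            schema.add(new Schema.FieldSchema("name", DataType.CHARARRAY));
            tuple.append("clicks");
            schema.add(new Schema.FieldSchema("date", DataType.CHARARRAY));
            tuple.append("20160112");

            // The UDF renders the docid template from the tuple and emits one JSON operation.
            VespaDocumentOperation op = new VespaDocumentOperation(
                    "operation=put", "docid=id:<application>:metrics::<name>-<date>");
            op.setInputSchema(schema);
            System.out.println(op.exec(tuple));
            // e.g. {"put":"id:testapp:metrics::clicks-20160112","fields":{...}}
        }
    }
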
diff --git a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaQueryTest.java b/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaQueryTest.java
deleted file mode 100644
index a0b549a737f..00000000000
--- a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaQueryTest.java
+++ /dev/null
@@ -1,101 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.pig;
-
-import com.sun.net.httpserver.HttpServer;
-import com.yahoo.vespa.hadoop.util.MockQueryHandler;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hdfs.HdfsConfiguration;
-import org.apache.pig.ExecType;
-import org.apache.pig.PigServer;
-import org.apache.pig.data.Tuple;
-import org.junit.jupiter.api.Test;
-
-import java.net.InetSocketAddress;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-public class VespaQueryTest {
-
- @Test
- public void requireThatQueriesAreReturnedCorrectly() throws Exception {
- runQueryTest("src/test/pig/query.pig", createQueryHandler(""), 18901);
- }
-
- @Test
- public void requireThatQueriesAreReturnedCorrectlyWithAlternativeJsonRoot() throws Exception {
- runQueryTest("src/test/pig/query_alt_root.pig", createQueryHandler("children"), 18902);
- }
-
- private void runQueryTest(String script, MockQueryHandler queryHandler, int port) throws Exception {
- final String endpoint = "http://localhost:" + port;
-
- HttpServer server = HttpServer.create(new InetSocketAddress(port), 0);
- server.createContext("/", queryHandler);
- server.start();
-
- PigServer ps = setup(script, endpoint);
-
- Iterator<Tuple> recommendations = ps.openIterator("recommendations");
- while (recommendations.hasNext()) {
- Tuple tuple = recommendations.next();
-
- String userid = (String) tuple.get(0);
- Integer rank = (Integer) tuple.get(1);
- String docid = (String) tuple.get(2);
- Double relevance = (Double) tuple.get(3);
- String fieldId = (String) tuple.get(4);
- String fieldContent = (String) tuple.get(5);
-
- MockQueryHandler.MockQueryHit hit = queryHandler.getHit(userid, rank);
- assertEquals(docid, hit.id);
- assertEquals(relevance, hit.relevance, 1e-3);
- assertEquals(fieldId, hit.fieldId);
- assertEquals(fieldContent, hit.fieldContent);
- }
-
- server.stop(0);
-
- }
-
- private PigServer setup(String script, String endpoint) throws Exception {
- Configuration conf = new HdfsConfiguration();
- Map<String, String> parameters = new HashMap<>();
- parameters.put("ENDPOINT", endpoint);
-
- PigServer ps = new PigServer(ExecType.LOCAL, conf);
- ps.setBatchOn();
- ps.registerScript(script, parameters);
-
- return ps;
- }
-
- private MockQueryHandler createQueryHandler(String childNode) {
- MockQueryHandler queryHandler = new MockQueryHandler(childNode);
-
- List<String> userIds = Arrays.asList("5", "104", "313");
-
- int hitsPerUser = 3;
- for (int i = 0; i < hitsPerUser * userIds.size(); ++i) {
- String id = "" + (i+1);
- String userId = userIds.get(i / hitsPerUser);
- queryHandler.newHit().
- setId("id::::" + id).
- setRelevance(1.0 - (i % hitsPerUser) * 0.1).
- setFieldSddocname("doctype").
- setFieldId("" + id).
- setFieldDate("2016060" + id).
- setFieldContent("Content for user " + userId + " hit " + i % hitsPerUser + "...").
- add(userId);
- }
-
- return queryHandler;
- }
-
-}
diff --git a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaStorageTest.java b/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaStorageTest.java
deleted file mode 100644
index 3183c770bc7..00000000000
--- a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/pig/VespaStorageTest.java
+++ /dev/null
@@ -1,106 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.pig;
-
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
-import com.yahoo.vespa.hadoop.mapreduce.util.VespaCounters;
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.hdfs.HdfsConfiguration;
-import org.apache.hadoop.mapred.Counters;
-import org.apache.pig.ExecType;
-import org.apache.pig.PigServer;
-import org.apache.pig.backend.executionengine.ExecJob;
-import org.apache.pig.tools.pigstats.JobStats;
-import org.apache.pig.tools.pigstats.PigStats;
-import org.apache.pig.tools.pigstats.mapreduce.MRJobStats;
-import org.junit.jupiter.api.Test;
-
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNotNull;
-
-public class VespaStorageTest {
-
- @Test
- public void requireThatPremadeOperationsFeedSucceeds() throws Exception {
- assertAllDocumentsOk("src/test/pig/feed_operations.pig");
- }
-
-
- @Test
- public void requireThatPremadeMultilineOperationsFeedSucceeds() throws Exception {
- assertAllDocumentsOk("src/test/pig/feed_multiline_operations.pig");
- }
-
-
- @Test
- public void requireThatPremadeOperationsWithJsonLoaderFeedSucceeds() throws Exception {
- assertAllDocumentsOk("src/test/pig/feed_operations_with_json_loader.pig");
- }
-
- @Test
- public void requireThatPremadeOperationsWithJsonLoaderFeedAndNonLegacyClientSucceeds() throws Exception {
- Configuration conf = new HdfsConfiguration();
- conf.set(VespaConfiguration.USE_SSL, Boolean.TRUE.toString());
- assertAllDocumentsOk("src/test/pig/feed_operations_with_json_loader.pig", conf);
- }
-
- @Test
- public void requireThatCreateOperationsFeedSucceeds() throws Exception {
- assertAllDocumentsOk("src/test/pig/feed_create_operations.pig");
- }
-
-
- @Test
- public void requireThatCreateOperationsShortFormFeedSucceeds() throws Exception {
- assertAllDocumentsOk("src/test/pig/feed_create_operations_short_form.pig");
- }
-
-
- @Test
- public void requireThatFeedVisitDataSucceeds() throws Exception {
- assertAllDocumentsOk("src/test/pig/feed_visit_data.pig");
- }
-
-
- private PigServer setup(String script, Configuration conf) throws Exception {
- if (conf == null) {
- conf = new HdfsConfiguration();
- }
- conf.setIfUnset(VespaConfiguration.DRYRUN, "true");
- conf.setIfUnset(VespaConfiguration.ENDPOINT, "dummy-endpoint");
-
- // Parameter substitutions - can also be set by configuration
- Map<String, String> parameters = new HashMap<>();
- parameters.put("ENDPOINT", "endpoint-does-not-matter-in-dryrun,another-endpoint-that-does-not-matter");
-
- PigServer ps = new PigServer(ExecType.LOCAL, conf);
- ps.setBatchOn();
- ps.registerScript(script, parameters);
-
- return ps;
- }
-
-
- private void assertAllDocumentsOk(String script) throws Exception {
- assertAllDocumentsOk(script, null);
- }
-
-
- private void assertAllDocumentsOk(String script, Configuration conf) throws Exception {
- PigServer ps = setup(script, conf);
- List<ExecJob> jobs = ps.executeBatch();
- PigStats stats = jobs.get(0).getStatistics();
- for (JobStats js : stats.getJobGraph()) {
- Counters hadoopCounters = ((MRJobStats)js).getHadoopCounters();
- assertNotNull(hadoopCounters);
- VespaCounters counters = VespaCounters.get(hadoopCounters);
- assertEquals(10, counters.getDocumentsSent());
- assertEquals(0, counters.getDocumentsFailed());
- assertEquals(10, counters.getDocumentsOk());
- }
- }
-
-}
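
The dry-run wiring above is what let these feed tests run without a live Vespa endpoint. A minimal sketch of the same configuration outside a test, assuming only the VespaConfiguration property names that appear in this file (DRYRUN, ENDPOINT, USE_SSL); the class name is hypothetical:

    import com.yahoo.vespa.hadoop.mapreduce.util.VespaConfiguration;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.HdfsConfiguration;

    public class DryRunConfigSketch {
        public static void main(String[] args) {
            Configuration conf = new HdfsConfiguration();
            conf.set(VespaConfiguration.DRYRUN, "true");        // count documents, send nothing
            conf.set(VespaConfiguration.ENDPOINT, "dummy-endpoint");
            conf.set(VespaConfiguration.USE_SSL, Boolean.TRUE.toString());
        }
    }
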
diff --git a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/MockQueryHandler.java b/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/MockQueryHandler.java
deleted file mode 100644
index 64c160ea14c..00000000000
--- a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/MockQueryHandler.java
+++ /dev/null
@@ -1,219 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.util;
-
-import com.fasterxml.jackson.core.JsonEncoding;
-import com.fasterxml.jackson.core.JsonFactory;
-import com.fasterxml.jackson.core.JsonGenerator;
-import com.sun.net.httpserver.HttpExchange;
-import com.sun.net.httpserver.HttpHandler;
-
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-import java.io.OutputStream;
-import java.net.URI;
-import java.net.URLDecoder;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-
-public class MockQueryHandler implements HttpHandler {
-
- private final Map<String, List<MockQueryHit>> hitMap;
- private final String childNode;
-
- public MockQueryHandler(String childNode) {
- this.hitMap = new HashMap<>();
- this.childNode = childNode;
- }
-
- public void handle(HttpExchange t) throws IOException {
- URI uri = t.getRequestURI();
- String query = uri.getQuery();
- String response = null;
-
- // Parse query - extract "query" element
- if (query != null) {
- String params[] = query.split("[&]");
- for (String param : params) {
- int i = param.indexOf('=');
- String name = param.substring(0, i);
- String value = URLDecoder.decode(param.substring(i + 1), "UTF-8");
-
- if ("query".equalsIgnoreCase(name)) {
- response = getResponse(value);
- }
- }
- }
-
- byte[] body = response == null ? new byte[0] : response.getBytes("UTF-8");
- t.sendResponseHeaders(200, body.length);
- OutputStream os = t.getResponseBody();
- os.write(body);
- os.close();
-
- }
-
- public MockQueryHit getHit(String query, Integer rank) {
- if (!hitMap.containsKey(query)) {
- return null;
- }
- if (rank >= hitMap.get(query).size()) {
- return null;
- }
- return hitMap.get(query).get(rank);
- }
-
- public MockQueryHit newHit() {
- return new MockQueryHit(this);
- }
-
- public void addHit(String query, MockQueryHit hit) {
- if (!hitMap.containsKey(query)) {
- hitMap.put(query, new ArrayList<>());
- }
- hitMap.get(query).add(hit);
- }
-
- private String getResponse(String query) throws IOException {
- List<MockQueryHit> hits = hitMap.get(query);
- if (hits == null) {
- return null;
- }
-
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- JsonGenerator g = new JsonFactory().createGenerator(out, JsonEncoding.UTF8);
-
- writeResultStart(g, hits.size());
- for (MockQueryHit hit : hits) {
- writeHit(g, hit);
- }
- writeResultsEnd(g);
- g.close();
-
- return out.toString("UTF-8");
- }
-
- private void writeHit(JsonGenerator g, MockQueryHit hit) throws IOException {
- g.writeStartObject();
-
- g.writeFieldName("id");
- g.writeString(hit.id);
-
- g.writeFieldName("relevance");
- g.writeNumber(hit.relevance);
-
- g.writeFieldName("fields");
- g.writeStartObject();
-
- g.writeFieldName("sddocname");
- g.writeString(hit.fieldSddocname);
-
- g.writeFieldName("date");
- g.writeString(hit.fieldDate);
-
- g.writeFieldName("content");
- g.writeString(hit.fieldContent);
-
- g.writeFieldName("id");
- g.writeString(hit.fieldId);
-
- g.writeEndObject();
- g.writeEndObject();
- }
-
- private void writeResultStart(JsonGenerator g, int count) throws IOException {
- g.writeStartObject();
- g.writeFieldName("root");
-
- g.writeStartObject();
-
- g.writeFieldName("id");
- g.writeString("toplevel");
-
- g.writeFieldName("relevance");
- g.writeNumber(1);
-
- g.writeFieldName("fields");
- g.writeStartObject();
- g.writeFieldName("totalCount");
- g.writeNumber(count);
- g.writeEndObject();
-
- g.writeFieldName("coverage");
- g.writeStartObject();
- g.writeFieldName("coverage");
- g.writeNumber(100);
- // ... more stuff here usually
- g.writeEndObject();
-
- g.writeFieldName("children");
- g.writeStartArray();
-
- if (!childNode.isEmpty()) {
- g.writeStartObject();
- g.writeFieldName(childNode);
- g.writeStartArray();
- }
- }
-
- private void writeResultsEnd(JsonGenerator g) throws IOException {
- if (!childNode.isEmpty()) {
- g.writeEndArray();
- g.writeEndObject();
- }
- g.writeEndArray();
- g.writeEndObject();
- g.writeEndObject();
- }
-
- public static class MockQueryHit {
-
- private final MockQueryHandler handler;
-
- public String id;
- public Double relevance;
- public String fieldSddocname;
- public String fieldDate;
- public String fieldContent;
- public String fieldId;
-
- private MockQueryHit(MockQueryHandler handler) {
- this.handler = handler;
- }
-
- public void add(String query) {
- handler.addHit(query, this);
- }
-
- public MockQueryHit setId(String id) {
- this.id = id;
- return this;
- }
-
- public MockQueryHit setRelevance(Double relevance) {
- this.relevance = relevance;
- return this;
- }
-
- public MockQueryHit setFieldSddocname(String fieldSddocname) {
- this.fieldSddocname = fieldSddocname;
- return this;
- }
-
- public MockQueryHit setFieldDate(String fieldDate) {
- this.fieldDate = fieldDate;
- return this;
- }
-
- public MockQueryHit setFieldContent(String fieldContent) {
- this.fieldContent = fieldContent;
- return this;
- }
-
- public MockQueryHit setFieldId(String fieldId) {
- this.fieldId = fieldId;
- return this;
- }
- }
-
-}
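
A minimal sketch of how this handler was exercised by VespaQueryTest above, shown standalone. The port and hit values are hypothetical; the builder calls are the ones defined in this class:

    import com.sun.net.httpserver.HttpServer;
    import java.net.InetSocketAddress;

    public class MockQueryHandlerSketch {
        public static void main(String[] args) throws Exception {
            MockQueryHandler handler = new MockQueryHandler("");
            handler.newHit()
                   .setId("id::::1")
                   .setRelevance(1.0)
                   .setFieldSddocname("doctype")
                   .setFieldId("1")
                   .setFieldDate("20160601")
                   .setFieldContent("Content for user 5 hit 0...")
                   .add("5");

            HttpServer server = HttpServer.create(new InetSocketAddress(18903), 0);
            server.createContext("/", handler);
            server.start();
            // GET http://localhost:18903/?query=5 now returns the mocked Vespa result JSON.
            server.stop(0);
        }
    }
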
diff --git a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/TupleToolsTest.java b/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/TupleToolsTest.java
deleted file mode 100644
index b4ccbdf2183..00000000000
--- a/vespa-hadoop/src/test/java/com/yahoo/vespa/hadoop/util/TupleToolsTest.java
+++ /dev/null
@@ -1,51 +0,0 @@
-// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
-package com.yahoo.vespa.hadoop.util;
-
-import com.yahoo.vespa.hadoop.mapreduce.util.TupleTools;
-import org.apache.pig.data.DataType;
-import org.apache.pig.data.Tuple;
-import org.apache.pig.data.TupleFactory;
-import org.apache.pig.impl.logicalLayer.schema.Schema;
-import org.junit.jupiter.api.Test;
-
-import java.io.IOException;
-
-import static org.junit.jupiter.api.Assertions.assertEquals;
-
-public class TupleToolsTest {
-
- @Test
- public void requireThatTupleToStringHandlesSimpleTypes() throws IOException {
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
-
- addToTuple("id", DataType.CHARARRAY, "123", schema, tuple);
- addToTuple("rank", DataType.INTEGER, 1, schema, tuple);
-
- String template = "Id is <id> and rank is <rank>";
- String result = TupleTools.toString(schema, tuple, template);
-
- assertEquals("Id is 123 and rank is 1", result);
- }
-
-
- private void addToTuple(String alias, byte type, Object value, Schema schema, Tuple tuple) {
- schema.add(new Schema.FieldSchema(alias, type));
- tuple.append(value);
- }
-
- @Test
- public void requireThatTupleToStringHandlesStringCharacters() throws IOException {
- Schema schema = new Schema();
- Tuple tuple = TupleFactory.getInstance().newTuple();
-
- addToTuple("id", DataType.CHARARRAY, "_!@#$%^&*()", schema, tuple);
- addToTuple("rank", DataType.INTEGER, 1, schema, tuple);
-
- String template = "Id is <id> and rank is <rank>";
- String result = TupleTools.toString(schema, tuple, template);
-
- assertEquals("Id is _!@#$%^&*() and rank is 1", result);
- }
-
-}
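
The <field> substitution tested here is the same template mechanism behind the docid and query strings in the Pig scripts below (e.g. 'docid=id:<application>:metrics::<name>-<date>'). A sketch under that assumption; the class name is hypothetical:

    import com.yahoo.vespa.hadoop.mapreduce.util.TupleTools;
    import org.apache.pig.data.DataType;
    import org.apache.pig.data.Tuple;
    import org.apache.pig.data.TupleFactory;
    import org.apache.pig.impl.logicalLayer.schema.Schema;

    public class TupleToolsSketch {
        public static void main(String[] args) throws Exception {
            Schema schema = new Schema();
            Tuple tuple = TupleFactory.getInstance().newTuple();
            schema.add(new Schema.FieldSchema("application", DataType.CHARARRAY));
            tuple.append("testapp");
            schema.add(new Schema.FieldSchema("name", DataType.CHARARRAY));
            tuple.append("clicks");

            // Each <alias> in the template is replaced by that field's value.
            String docid = TupleTools.toString(schema, tuple, "id:<application>:metrics::<name>");
            System.out.println(docid);   // id:testapp:metrics::clicks
        }
    }
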
diff --git a/vespa-hadoop/src/test/pig/feed_create_operations.pig b/vespa-hadoop/src/test/pig/feed_create_operations.pig
deleted file mode 100644
index 4583c095133..00000000000
--- a/vespa-hadoop/src/test/pig/feed_create_operations.pig
+++ /dev/null
@@ -1,24 +0,0 @@
--- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
--- REGISTER vespa-hadoop.jar -- Not needed in tests
-
--- Create valid Vespa put operations
-DEFINE VespaPutOperation
- com.yahoo.vespa.hadoop.pig.VespaDocumentOperation(
- 'operation=put',
- 'docid=id:<application>:metrics::<name>-<date>'
- );
-
--- By default, VespaStorage assumes it's feeding valid Vespa operations
-DEFINE VespaStorage
- com.yahoo.vespa.hadoop.pig.VespaStorage();
-
--- Load tabular data
-metrics = LOAD 'src/test/resources/tabular_data.csv' AS (date:chararray, name:chararray, value:int, application:chararray);
-
--- Transform tabular data to a Vespa document operation JSON format
-metrics = FOREACH metrics GENERATE VespaPutOperation(*);
-
--- Store into Vespa
-STORE metrics INTO '$ENDPOINT' USING VespaStorage();
-
-
diff --git a/vespa-hadoop/src/test/pig/feed_create_operations_short_form.pig b/vespa-hadoop/src/test/pig/feed_create_operations_short_form.pig
deleted file mode 100644
index 0f0e63d843a..00000000000
--- a/vespa-hadoop/src/test/pig/feed_create_operations_short_form.pig
+++ /dev/null
@@ -1,19 +0,0 @@
--- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
--- REGISTER vespa-hadoop.jar -- Not needed in tests
-
--- Transform tabular data to a Vespa document operation JSON format
--- as part of storing the data.
-DEFINE VespaStorage
- com.yahoo.vespa.hadoop.pig.VespaStorage(
- 'create-document-operation=true',
- 'operation=put',
- 'docid=id:<application>:metrics::<name>-<date>'
- );
-
--- Load tabular data
-metrics = LOAD 'src/test/resources/tabular_data.csv' AS (date:chararray, name:chararray, value:int, application:chararray);
-
--- Store into Vespa
-STORE metrics INTO '$ENDPOINT' USING VespaStorage();
-
-
diff --git a/vespa-hadoop/src/test/pig/feed_multiline_operations.pig b/vespa-hadoop/src/test/pig/feed_multiline_operations.pig
deleted file mode 100644
index 1971270cbdc..00000000000
--- a/vespa-hadoop/src/test/pig/feed_multiline_operations.pig
+++ /dev/null
@@ -1,15 +0,0 @@
--- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
--- REGISTER vespa-hadoop.jar -- Not needed in tests
-
--- Define short name for VespaJsonLoader
-DEFINE VespaJsonLoader com.yahoo.vespa.hadoop.pig.VespaSimpleJsonLoader();
-
--- Define short name for VespaStorage
-DEFINE VespaStorage com.yahoo.vespa.hadoop.pig.VespaStorage();
-
--- Load data - one column for json data
-metrics = LOAD 'src/test/resources/operations_multiline_data.json' USING VespaJsonLoader() AS (data:chararray);
-
--- Store into Vespa
-STORE metrics INTO '$ENDPOINT' USING VespaStorage();
-
diff --git a/vespa-hadoop/src/test/pig/feed_operations.pig b/vespa-hadoop/src/test/pig/feed_operations.pig
deleted file mode 100644
index 48873fde87a..00000000000
--- a/vespa-hadoop/src/test/pig/feed_operations.pig
+++ /dev/null
@@ -1,11 +0,0 @@
--- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
--- REGISTER vespa-hadoop.jar -- Not needed in tests
-
--- Define short name for VespaStorage
-DEFINE VespaStorage com.yahoo.vespa.hadoop.pig.VespaStorage();
-
--- Load data - one column for json data
-metrics = LOAD 'src/test/resources/operations_data.json' AS (data:chararray);
-
--- Store into Vespa
-STORE metrics INTO '$ENDPOINT' USING VespaStorage();
diff --git a/vespa-hadoop/src/test/pig/feed_operations_with_json_loader.pig b/vespa-hadoop/src/test/pig/feed_operations_with_json_loader.pig
deleted file mode 100644
index da58fe3c678..00000000000
--- a/vespa-hadoop/src/test/pig/feed_operations_with_json_loader.pig
+++ /dev/null
@@ -1,14 +0,0 @@
--- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
--- REGISTER vespa-hadoop.jar -- Not needed in tests
-
--- Define short name for VespaJsonLoader
-DEFINE VespaJsonLoader com.yahoo.vespa.hadoop.pig.VespaSimpleJsonLoader();
-
--- Define short name for VespaStorage
-DEFINE VespaStorage com.yahoo.vespa.hadoop.pig.VespaStorage();
-
--- Load data - one column for json data
-metrics = LOAD 'src/test/resources/operations_data.json' USING VespaJsonLoader() AS (data:chararray);
-
--- Store into Vespa
-STORE metrics INTO '$ENDPOINT' USING VespaStorage();
diff --git a/vespa-hadoop/src/test/pig/feed_operations_xml.pig b/vespa-hadoop/src/test/pig/feed_operations_xml.pig
deleted file mode 100644
index 4e5057f4909..00000000000
--- a/vespa-hadoop/src/test/pig/feed_operations_xml.pig
+++ /dev/null
@@ -1,11 +0,0 @@
--- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
--- REGISTER vespa-hadoop.jar -- Not needed in tests
-
--- Define short name for VespaStorage
-DEFINE VespaStorage com.yahoo.vespa.hadoop.pig.VespaStorage();
-
--- Load data - one column for xml data
-data = LOAD 'src/test/resources/operations_data.xml' AS (data:chararray);
-
--- Store into Vespa
-STORE data INTO '$ENDPOINT' USING VespaStorage();
diff --git a/vespa-hadoop/src/test/pig/feed_visit_data.pig b/vespa-hadoop/src/test/pig/feed_visit_data.pig
deleted file mode 100644
index 59d144b53dc..00000000000
--- a/vespa-hadoop/src/test/pig/feed_visit_data.pig
+++ /dev/null
@@ -1,12 +0,0 @@
--- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
--- REGISTER vespa-hadoop.jar -- Not needed in tests
-
--- Define short name for VespaStorage
-DEFINE VespaStorage com.yahoo.vespa.hadoop.pig.VespaStorage();
-
--- Load data - one column for json data
-metrics = LOAD 'src/test/resources/visit_data.json' AS (data:chararray);
-
--- Store into Vespa
-STORE metrics INTO '$ENDPOINT' USING VespaStorage();
-
diff --git a/vespa-hadoop/src/test/pig/query.pig b/vespa-hadoop/src/test/pig/query.pig
deleted file mode 100644
index 96caa5cd0c4..00000000000
--- a/vespa-hadoop/src/test/pig/query.pig
+++ /dev/null
@@ -1,19 +0,0 @@
--- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
--- REGISTER vespa-hadoop.jar -- Not needed in tests
-
--- Define Vespa query for retrieving blog posts
-DEFINE BlogPostRecommendations
- com.yahoo.vespa.hadoop.pig.VespaQuery(
- 'query=$ENDPOINT/search?query=<userid>&hits=100',
- 'schema=rank:int,id:chararray,relevance:double,fields/id:chararray,fields/content:chararray'
- );
-
--- Load data from a local file
-users = LOAD 'src/test/resources/user_ids.csv' AS (userid:chararray);
-users = FILTER users BY userid IS NOT null;
-
--- Run a set of queries against Vespa
-recommendations = FOREACH users GENERATE userid, FLATTEN(BlogPostRecommendations(*));
-
--- Output recommendations
-DUMP recommendations;
diff --git a/vespa-hadoop/src/test/pig/query_alt_root.pig b/vespa-hadoop/src/test/pig/query_alt_root.pig
deleted file mode 100644
index 2884b4a600f..00000000000
--- a/vespa-hadoop/src/test/pig/query_alt_root.pig
+++ /dev/null
@@ -1,20 +0,0 @@
--- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
--- REGISTER vespa-hadoop.jar -- Not needed in tests
-
--- Define Vespa query for retrieving blog posts
-DEFINE BlogPostRecommendations
- com.yahoo.vespa.hadoop.pig.VespaQuery(
- 'query=$ENDPOINT/search?query=<userid>&hits=100',
- 'rootnode=root/children/children',
- 'schema=rank:int,id:chararray,relevance:double,fields/id:chararray,fields/content:chararray'
- );
-
--- Load data from a local file
-users = LOAD 'src/test/resources/user_ids.csv' AS (userid:chararray);
-users = FILTER users BY userid IS NOT null;
-
--- Run a set of queries against Vespa
-recommendations = FOREACH users GENERATE userid, FLATTEN(BlogPostRecommendations(*));
-
--- Output recommendations
-DUMP recommendations;
diff --git a/vespa-hadoop/src/test/resources/operations_data.json b/vespa-hadoop/src/test/resources/operations_data.json
deleted file mode 100644
index 5af436dbfe7..00000000000
--- a/vespa-hadoop/src/test/resources/operations_data.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{"put":"id:testapp:metric::clicks-2015110414","fields":{"date":"2015110414","name":"clicks","value":1,"application":"testapp"}}
-{"fields":{"date":"2015110416","name":"clicks","value":5,"application":"testapp"},"put":"id:testapp:metric::clicks-2015110416"}
-{"put":"id:testapp:metric::clicks-2015110415","fields":{"date":"2015110415","name":"clicks","value":2,"application":"testapp"}}
-{"put":"id:testapp:metric::clicks-2015110417","fields":{"date":"2015110417","name":"clicks","value":3,"application":"testapp"}}
-{"put":"id:testapp:metric::clicks-2015110418","fields":{"date":"2015110418","name":"clicks","value":6,"application":"testapp"}}
-{"put":"id:testapp:metric::clicks-2015110419","fields":{"date":"2015110419","name":"clicks","value":3,"application":"testapp"}}
-{"put":"id:testapp:metric::clicks-2015110420","fields":{"date":"2015110420","name":"clicks","value":4,"application":"testapp"}}
-{"put":"id:testapp:metric::clicks-2015110421","fields":{"date":"2015110421","name":"clicks","value":2,"application":"testapp"}}
-{"fields":{"date":"2015110422","name":"clicks","value":5,"application":"testapp"},"condition":"metrics==0","put":"id:testapp:metric::clicks-2015110422"}
-{"put":"id:testapp:metric::clicks-2015110423","fields":{"date":"2015110423","name":"clicks","value":1,"application":"testapp"}}
diff --git a/vespa-hadoop/src/test/resources/operations_data.xml b/vespa-hadoop/src/test/resources/operations_data.xml
deleted file mode 100644
index db02b6bee73..00000000000
--- a/vespa-hadoop/src/test/resources/operations_data.xml
+++ /dev/null
@@ -1,14 +0,0 @@
-<?xml version="1.0" encoding="utf-8"?>
-<!-- Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. -->
-<vespafeed>
- <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/a-ha/Scoundrel+Days"> <url>http://music.yahoo.com/a-ha/Scoundrel+Days</url> <title><![CDATA[Scoundrel Days]]></title> <artist><![CDATA[a-ha]]></artist> <year>0</year> <popularity>290</popularity> </document>
- <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Accept/Restless+And+Wild"> <url>http://music.yahoo.com/Accept/Restless+And+Wild</url> <title><![CDATA[Restless And Wild]]></title> <artist><![CDATA[Accept]]></artist> <year>0</year> <popularity>75</popularity> </document>
- <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Accept/Staying+A+Life"> <url>http://music.yahoo.com/Accept/Staying+A+Life</url> <title><![CDATA[Staying A Life]]></title> <artist><![CDATA[Accept]]></artist> <year>1985</year> <popularity>77</popularity> </document>
- <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Alice+In+Chains/Dirt"> <url>http://music.yahoo.com/Alice+In+Chains/Dirt</url> <title><![CDATA[Dirt]]></title> <artist><![CDATA[Alice In Chains]]></artist> <year>1992</year> <popularity>114</popularity> </document>
- <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Alice+In+Chains/Live"> <url>http://music.yahoo.com/Alice+In+Chains/Live</url> <title><![CDATA[Live]]></title> <artist><![CDATA[Alice In Chains]]></artist> <year>1990</year> <popularity>363</popularity> </document>
- <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Amy+MacDonald/This+Is+The+Life"> <url>http://music.yahoo.com/Amy+MacDonald/This+Is+The+Life</url> <title><![CDATA[This Is The Life]]></title> <artist><![CDATA[Amy MacDonald]]></artist> <year>2007</year> <popularity>355</popularity> </document>
- <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Ane+Brun/Duets"> <url>http://music.yahoo.com/Ane+Brun/Duets</url> <title><![CDATA[Duets]]></title> <artist><![CDATA[Ane Brun]]></artist> <year>0</year> <popularity>255</popularity> </document>
- <update documenttype="music" documentid="id:music:music::http://music.yahoo.com/bobdylan/BestOf"><assign field="title">The Best of Bob Dylan</assign><add field="tracks"><item>Man Of Constant Sorrow</item></add></update>
- <remove documentid="id:music:music::http://music.yahoo.com/Aqpop/Beautifully+Smart" />
- <document documenttype="music" documentid="id:music:music::http://music.yahoo.com/Annuals/Be+He+Me"> <url>http://music.yahoo.com/Annuals/Be+He+Me</url> <title><![CDATA[Be He Me]]></title> <artist><![CDATA[Annuals]]></artist> <year>0</year> <popularity>207</popularity> </document>
-</vespafeed>
diff --git a/vespa-hadoop/src/test/resources/operations_multiline_data.json b/vespa-hadoop/src/test/resources/operations_multiline_data.json
deleted file mode 100644
index 2b51698d9b7..00000000000
--- a/vespa-hadoop/src/test/resources/operations_multiline_data.json
+++ /dev/null
@@ -1,93 +0,0 @@
-[
- {
- "put": "id:testapp:metric::clicks-2015110414",
- "fields": {
- "date": "2015110414",
- "name": "clicks",
- "value": 1,
- "application": "testapp"
- }
- },
- {
- "fields": {
- "date": "2015110416",
- "name": "clicks",
- "value": 5,
- "application": "testapp"
- },
- "put": "id:testapp:metric::clicks-2015110416"
- },
- {
- "put": "id:testapp:metric::clicks-2015110415",
- "fields": {
- "date": "2015110415",
- "name": "clicks",
- "value": 2,
- "application": "testapp"
- }
- },
- {
- "put": "id:testapp:metric::clicks-2015110417",
- "fields": {
- "date": "2015110417",
- "name": "clicks",
- "value": 3,
- "application": "testapp"
- }
- },
- {
- "put": "id:testapp:metric::clicks-2015110418",
- "fields": {
- "date": "2015110418",
- "name": "clicks",
- "value": 6,
- "application": "testapp"
- }
- },
- {
- "put": "id:testapp:metric::clicks-2015110419",
- "fields": {
- "date": "2015110419",
- "name": "clicks",
- "value": 3,
- "application": "testapp"
- }
- },
- {
- "put": "id:testapp:metric::clicks-2015110420",
- "fields": {
- "date": "2015110420",
- "name": "clicks",
- "value": 4,
- "application": "testapp"
- }
- },
- {
- "put": "id:testapp:metric::clicks-2015110421",
- "fields": {
- "date": "2015110421",
- "name": "clicks",
- "value": 2,
- "application": "testapp"
- }
- },
- {
- "fields": {
- "date": "2015110422",
- "name": "clicks",
- "value": 5,
- "application": "testapp"
- },
- "condition": "metrics==0",
- "put": "id:testapp:metric::clicks-2015110422"
- },
- {
- "put": "id:testapp:metric::clicks-2015110423",
- "fields": {
- "date": "2015110423",
- "name": "clicks",
- "value": 1,
- "application": "testapp"
- }
- }
-]
diff --git a/vespa-hadoop/src/test/resources/tabular_data.csv b/vespa-hadoop/src/test/resources/tabular_data.csv
deleted file mode 100644
index 541597998e9..00000000000
--- a/vespa-hadoop/src/test/resources/tabular_data.csv
+++ /dev/null
@@ -1,11 +0,0 @@
-2015110414 clicks 1 testapp
-2015110415 clicks 2 testapp
-2015110416 clicks 5 testapp
-2015110417 clicks 3 testapp
-2015110418 clicks 6 testapp
-2015110419 clicks 3 testapp
-2015110420 clicks 4 testapp
-2015110421 clicks 2 testapp
-2015110422 clicks 5 testapp
-2015110423 clicks 1 testapp
-
diff --git a/vespa-hadoop/src/test/resources/user_ids.csv b/vespa-hadoop/src/test/resources/user_ids.csv
deleted file mode 100644
index 5875a3b9a7c..00000000000
--- a/vespa-hadoop/src/test/resources/user_ids.csv
+++ /dev/null
@@ -1,4 +0,0 @@
-5
-104
-313
-
diff --git a/vespa-hadoop/src/test/resources/visit_data.json b/vespa-hadoop/src/test/resources/visit_data.json
deleted file mode 100644
index 947b9326cc8..00000000000
--- a/vespa-hadoop/src/test/resources/visit_data.json
+++ /dev/null
@@ -1,10 +0,0 @@
-{"id":"id:testapp:metric::clicks-2015110414","fields":{"date":"2015110414","name":"clicks","value":1,"application":"testapp"}}
-{"id":"id:testapp:metric::clicks-2015110415","fields":{"date":"2015110415","name":"clicks","value":2,"application":"testapp"}}
-{"id":"id:testapp:metric::clicks-2015110416","fields":{"date":"2015110416","name":"clicks","value":4,"application":"testapp"}}
-{"id":"id:testapp:metric::clicks-2015110417","fields":{"date":"2015110417","name":"clicks","value":3,"application":"testapp"}}
-{"id":"id:testapp:metric::clicks-2015110418","fields":{"date":"2015110418","name":"clicks","value":6,"application":"testapp"}}
-{"id":"id:testapp:metric::clicks-2015110419","fields":{"date":"2015110419","name":"clicks","value":3,"application":"testapp"}}
-{"id":"id:testapp:metric::clicks-2015110420","fields":{"date":"2015110420","name":"clicks","value":4,"application":"testapp"}}
-{"id":"id:testapp:metric::clicks-2015110421","fields":{"date":"2015110421","name":"clicks","value":2,"application":"testapp"}}
-{"id":"id:testapp:metric::clicks-2015110422","fields":{"date":"2015110422","name":"clicks","value":7,"application":"testapp"}}
-{"id":"id:testapp:metric::clicks-2015110423","fields":{"date":"2015110423","name":"clicks","value":1,"application":"testapp"}} \ No newline at end of file