summaryrefslogtreecommitdiffstats
path: root/vespajlib
diff options
context:
space:
mode:
authorJon Bratseth <bratseth@yahoo-inc.com>2017-01-06 17:09:05 +0100
committerJon Bratseth <bratseth@yahoo-inc.com>2017-01-06 17:09:05 +0100
commit21d75fade74539121c0283fb154efdc09fa886f3 (patch)
treecb432c9203b92896b177c3762e2c42bd383a65b5 /vespajlib
parent27cdc6fb1fddf7127710312d8fd881a9aec8100f (diff)
Faster indexed general join algorithm
Diffstat (limited to 'vespajlib')
-rw-r--r--vespajlib/src/main/java/com/yahoo/tensor/DimensionSizes.java3
-rw-r--r--vespajlib/src/main/java/com/yahoo/tensor/IndexedTensor.java15
-rw-r--r--vespajlib/src/main/java/com/yahoo/tensor/PartialAddress.java61
-rw-r--r--vespajlib/src/main/java/com/yahoo/tensor/functions/Concat.java24
-rw-r--r--vespajlib/src/main/java/com/yahoo/tensor/functions/Join.java95
-rw-r--r--vespajlib/src/test/java/com/yahoo/tensor/TensorFunctionBenchmark.java20
-rw-r--r--vespajlib/src/test/java/com/yahoo/tensor/functions/JoinTestCase.java11
7 files changed, 199 insertions, 30 deletions
diff --git a/vespajlib/src/main/java/com/yahoo/tensor/DimensionSizes.java b/vespajlib/src/main/java/com/yahoo/tensor/DimensionSizes.java
index 76340bb7d8f..daa85cc51e4 100644
--- a/vespajlib/src/main/java/com/yahoo/tensor/DimensionSizes.java
+++ b/vespajlib/src/main/java/com/yahoo/tensor/DimensionSizes.java
@@ -1,5 +1,7 @@
package com.yahoo.tensor;
+import com.google.common.annotations.Beta;
+
import java.util.Arrays;
/**
@@ -7,6 +9,7 @@ import java.util.Arrays;
*
* @author bratseth
*/
+@Beta
public final class DimensionSizes {
private final int[] sizes;
diff --git a/vespajlib/src/main/java/com/yahoo/tensor/IndexedTensor.java b/vespajlib/src/main/java/com/yahoo/tensor/IndexedTensor.java
index d69cf65ee8d..9315922f57a 100644
--- a/vespajlib/src/main/java/com/yahoo/tensor/IndexedTensor.java
+++ b/vespajlib/src/main/java/com/yahoo/tensor/IndexedTensor.java
@@ -52,6 +52,21 @@ public class IndexedTensor implements Tensor {
return new CellIterator();
}
+ /** Returns an iterator over all the cells in this tensor which matches the given partial address */
+ // TODO: Move up to Tensor and create a mixed tensor which can implement it (and subspace iterators) efficiently
+ public SubspaceIterator cellIterator(PartialAddress partialAddress, DimensionSizes iterationSizes) {
+ int[] startAddress = new int[type().dimensions().size()];
+ List<Integer> iterateDimensions = new ArrayList<>();
+ for (int i = 0; i < type().dimensions().size(); i++) {
+ int partialAddressLabel = partialAddress.intLabel(type.dimensions().get(i).name());
+ if (partialAddressLabel >= 0) // iterate at this label
+ startAddress[i] = partialAddressLabel;
+ else // iterate over this dimension
+ iterateDimensions.add(i);
+ }
+ return new SubspaceIterator(iterateDimensions, startAddress, iterationSizes);
+ }
+
/**
* Returns an iterator over the values of this.
* Values are returned in order of increasing indexes in each dimension, increasing
diff --git a/vespajlib/src/main/java/com/yahoo/tensor/PartialAddress.java b/vespajlib/src/main/java/com/yahoo/tensor/PartialAddress.java
new file mode 100644
index 00000000000..463b7f3e99f
--- /dev/null
+++ b/vespajlib/src/main/java/com/yahoo/tensor/PartialAddress.java
@@ -0,0 +1,61 @@
+package com.yahoo.tensor;
+
+import com.google.common.annotations.Beta;
+
+/**
+ * An address to a subset of a tensors' cells, specifying a label for some but not necessarily all of the tensors
+ * dimensions.
+ *
+ * @author bratseth
+ */
+// Implementation notes:
+// - These are created in inner (though not inner-most) loops so they are implemented with minimal allocation.
+// We also avoid non-essential error checking.
+// - We can add support for string labels later without breaking the API
+@Beta
+public class PartialAddress {
+
+ // Two arrays which contains corresponding dimension=label pairs.
+ // The sizes of these are always equal.
+ private final String[] dimensionNames;
+ private final int[] labels;
+
+ private PartialAddress(Builder builder) {
+ this.dimensionNames = builder.dimensionNames;
+ this.labels = builder.labels;
+ builder.dimensionNames = null; // invalidate builder to safely take over array ownership
+ builder.labels = null;
+ }
+
+ /** Returns the int label of this dimension, or -1 if no label is specified for it */
+ int intLabel(String dimensionName) {
+ for (int i = 0; i < dimensionNames.length; i++)
+ if (dimensionNames[i].equals(dimensionName))
+ return labels[i];
+ return -1;
+ }
+
+ public static class Builder {
+
+ private String[] dimensionNames;
+ private int[] labels;
+ private int index = 0;
+
+ public Builder(int size) {
+ dimensionNames = new String[size];
+ labels = new int[size];
+ }
+
+ public void add(String dimensionName, int label) {
+ dimensionNames[index] = dimensionName;
+ labels[index] = label;
+ index++;
+ }
+
+ public PartialAddress build() {
+ return new PartialAddress(this);
+ }
+
+ }
+
+}
diff --git a/vespajlib/src/main/java/com/yahoo/tensor/functions/Concat.java b/vespajlib/src/main/java/com/yahoo/tensor/functions/Concat.java
index 05999ff1240..fda6e8ef86c 100644
--- a/vespajlib/src/main/java/com/yahoo/tensor/functions/Concat.java
+++ b/vespajlib/src/main/java/com/yahoo/tensor/functions/Concat.java
@@ -83,7 +83,7 @@ public class Concat extends PrimitiveTensorFunction {
for (Iterator<IndexedTensor.SubspaceIterator> ib = b.subspaceIterator(otherADimensions); ib.hasNext();) {
IndexedTensor.SubspaceIterator ibSubspace = ib.next();
while (ibSubspace.hasNext()) {
- java.util.Map.Entry<TensorAddress, Double> bCell = ibSubspace.next(); // TODO: Create Cell convenience subclass for Map.Entry
+ Tensor.Cell bCell = ibSubspace.next();
TensorAddress combinedAddress = combineAddresses(aAddress, aToIndexes, bCell.getKey(), bToIndexes,
concatType, offset, dimension);
if (combinedAddress == null) continue; // incompatible
@@ -125,21 +125,21 @@ public class Concat extends PrimitiveTensorFunction {
/** Returns the concrete (not type) dimension sizes resulting from combining a and b */
private DimensionSizes concatSize(TensorType concatType, IndexedTensor a, IndexedTensor b, String concatDimension) {
- DimensionSizes.Builder joinedSizes = new DimensionSizes.Builder(concatType.dimensions().size());
- for (int i = 0; i < joinedSizes.dimensions(); i++) {
+ DimensionSizes.Builder concatSizes = new DimensionSizes.Builder(concatType.dimensions().size());
+ for (int i = 0; i < concatSizes.dimensions(); i++) {
String currentDimension = concatType.dimensions().get(i).name();
int aSize = a.type().indexOfDimension(currentDimension).map(d -> a.dimensionSizes().size(d)).orElse(0);
int bSize = b.type().indexOfDimension(currentDimension).map(d -> b.dimensionSizes().size(d)).orElse(0);
if (currentDimension.equals(concatDimension))
- joinedSizes.set(i, aSize + bSize);
+ concatSizes.set(i, aSize + bSize);
else if (aSize != 0 && bSize != 0 && aSize!=bSize )
throw new IllegalArgumentException("Dimension " + currentDimension + " must be of the same size when " +
"concatenating " + a.type() + " and " + b.type() + " along dimension " +
concatDimension + ", but was " + aSize + " and " + bSize);
else
- joinedSizes.set(i, Math.max(aSize, bSize));
+ concatSizes.set(i, Math.max(aSize, bSize));
}
- return joinedSizes.build();
+ return concatSizes.build();
}
/**
@@ -150,13 +150,13 @@ public class Concat extends PrimitiveTensorFunction {
*/
private TensorAddress combineAddresses(TensorAddress a, int[] aToIndexes, TensorAddress b, int[] bToIndexes,
TensorType concatType, int concatOffset, String concatDimension) {
- int[] joinedLabels = new int[concatType.dimensions().size()];
- Arrays.fill(joinedLabels, -1);
+ int[] combinedLabels = new int[concatType.dimensions().size()];
+ Arrays.fill(combinedLabels, -1);
int concatDimensionIndex = concatType.indexOfDimension(concatDimension).get();
- mapContent(a, joinedLabels, aToIndexes, concatDimensionIndex, concatOffset); // note: This sets a nonsensical value in the concat dimension
- boolean compatible = mapContent(b, joinedLabels, bToIndexes, concatDimensionIndex, concatOffset); // ... which is overwritten by the right value here
+ mapContent(a, combinedLabels, aToIndexes, concatDimensionIndex, concatOffset); // note: This sets a nonsensical value in the concat dimension
+ boolean compatible = mapContent(b, combinedLabels, bToIndexes, concatDimensionIndex, concatOffset); // ... which is overwritten by the right value here
if ( ! compatible) return null;
- return TensorAddress.of(joinedLabels);
+ return TensorAddress.of(combinedLabels);
}
/**
@@ -166,7 +166,7 @@ public class Concat extends PrimitiveTensorFunction {
* fromType.dimensions().get(i).name.equals(toType.dimensions().get(n).name())
* If some dimension in fromType is not present in toType, the corresponding index will be -1
*/
- // TODO: Stolen from join - put on TensorType?
+ // TODO: Stolen from join
private int[] mapIndexes(TensorType fromType, TensorType toType) {
int[] toIndexes = new int[fromType.dimensions().size()];
for (int i = 0; i < fromType.dimensions().size(); i++)
diff --git a/vespajlib/src/main/java/com/yahoo/tensor/functions/Join.java b/vespajlib/src/main/java/com/yahoo/tensor/functions/Join.java
index 23865e1cc1c..ceade39ce42 100644
--- a/vespajlib/src/main/java/com/yahoo/tensor/functions/Join.java
+++ b/vespajlib/src/main/java/com/yahoo/tensor/functions/Join.java
@@ -2,8 +2,10 @@ package com.yahoo.tensor.functions;
import com.google.common.annotations.Beta;
import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Sets;
import com.yahoo.tensor.DimensionSizes;
import com.yahoo.tensor.IndexedTensor;
+import com.yahoo.tensor.PartialAddress;
import com.yahoo.tensor.Tensor;
import com.yahoo.tensor.TensorAddress;
import com.yahoo.tensor.TensorType;
@@ -156,16 +158,20 @@ public class Join extends PrimitiveTensorFunction {
}
}
- private DimensionSizes joinedSize(TensorType joinedType, IndexedTensor subspace, IndexedTensor superspace) {
- DimensionSizes.Builder b = new DimensionSizes.Builder(joinedType.dimensions().size());
- for (int i = 0; i < b.dimensions(); i++) {
- Optional<Integer> subspaceIndex = subspace.type().indexOfDimension(joinedType.dimensions().get(i).name());
- if (subspaceIndex.isPresent())
- b.set(i, Math.min(superspace.dimensionSizes().size(i), subspace.dimensionSizes().size(subspaceIndex.get())));
- else
- b.set(i, superspace.dimensionSizes().size(i));
+ private DimensionSizes joinedSize(TensorType joinedType, IndexedTensor a, IndexedTensor b) {
+ DimensionSizes.Builder builder = new DimensionSizes.Builder(joinedType.dimensions().size());
+ for (int i = 0; i < builder.dimensions(); i++) {
+ String dimensionName = joinedType.dimensions().get(i).name();
+ Optional<Integer> aIndex = a.type().indexOfDimension(dimensionName);
+ Optional<Integer> bIndex = b.type().indexOfDimension(dimensionName);
+ if (aIndex.isPresent() && bIndex.isPresent())
+ builder.set(i, Math.min(b.dimensionSizes().size(bIndex.get()), a.dimensionSizes().size(aIndex.get())));
+ else if (aIndex.isPresent())
+ builder.set(i, a.dimensionSizes().size(aIndex.get()));
+ else if (bIndex.isPresent())
+ builder.set(i, b.dimensionSizes().size(bIndex.get()));
}
- return b.build();
+ return builder.build();
}
private Tensor generalSubspaceJoin(Tensor subspace, Tensor superspace, TensorType joinedType, boolean reversedArgumentOrder) {
@@ -200,6 +206,69 @@ public class Join extends PrimitiveTensorFunction {
/** Slow join which works for any two tensors */
private Tensor generalJoin(Tensor a, Tensor b, TensorType joinedType) {
+ if (a instanceof IndexedTensor && b instanceof IndexedTensor)
+ return indexedGeneralJoin((IndexedTensor) a, (IndexedTensor) b, joinedType);
+ else
+ return mappedGeneralJoin(a, b, joinedType);
+ }
+
+ private Tensor indexedGeneralJoin(IndexedTensor a, IndexedTensor b, TensorType joinedType) {
+ DimensionSizes joinedSize = joinedSize(joinedType, a, b);
+ Tensor.Builder builder = Tensor.Builder.of(joinedType, joinedSize);
+ int[] aToIndexes = mapIndexes(a.type(), joinedType);
+ int[] bToIndexes = mapIndexes(b.type(), joinedType);
+ joinTo(a, b, joinedType, joinedSize, aToIndexes, bToIndexes, false, builder);
+ joinTo(b, a, joinedType, joinedSize, bToIndexes, aToIndexes, true, builder);
+ return builder.build();
+ }
+
+ private void joinTo(IndexedTensor a, IndexedTensor b, TensorType joinedType, DimensionSizes joinedSize,
+ int[] aToIndexes, int[] bToIndexes, boolean reversedOrder, Tensor.Builder builder) {
+ Set<String> sharedDimensions = Sets.intersection(a.type().dimensionNames(), b.type().dimensionNames());
+ Set<String> dimensionsOnlyInA = Sets.difference(a.type().dimensionNames(), b.type().dimensionNames());
+
+ DimensionSizes aIterateSize = joinedSizeOf(a.type(), joinedType, joinedSize);
+ DimensionSizes bIterateSize = joinedSizeOf(b.type(), joinedType, joinedSize);
+
+ // for each combination of dimensions only in a
+ for (Iterator<IndexedTensor.SubspaceIterator> ia = a.subspaceIterator(dimensionsOnlyInA, aIterateSize); ia.hasNext(); ) {
+ IndexedTensor.SubspaceIterator aSubspace = ia.next();
+ // for each combination of dimensions in a which is also in b
+ while (aSubspace.hasNext()) {
+ Tensor.Cell aCell = aSubspace.next();
+ PartialAddress matchingBCells = partialAddress(a.type(), aSubspace.address(), sharedDimensions);
+ // for each matching combination of dimensions ony in b
+ for (IndexedTensor.SubspaceIterator bSubspace = b.cellIterator(matchingBCells, bIterateSize); bSubspace.hasNext(); ) {
+ Tensor.Cell bCell = bSubspace.next();
+ TensorAddress joinedAddress = joinAddresses(aCell.getKey(), aToIndexes, bCell.getKey(), bToIndexes, joinedType);
+ double joinedValue = reversedOrder ? combinator.applyAsDouble(bCell.getValue(), aCell.getValue())
+ : combinator.applyAsDouble(aCell.getValue(), bCell.getValue());
+ builder.cell(joinedAddress, joinedValue);
+ }
+ }
+ }
+ }
+
+ private PartialAddress partialAddress(TensorType addressType, TensorAddress address, Set<String> retainDimensions) {
+ PartialAddress.Builder builder = new PartialAddress.Builder(retainDimensions.size());
+ for (int i = 0; i < addressType.dimensions().size(); i++)
+ if (retainDimensions.contains(addressType.dimensions().get(i).name()))
+ builder.add(addressType.dimensions().get(i).name(), address.intLabel(i));
+ return builder.build();
+ }
+
+ /** Returns the sizes from the joined sizes which are present in the type argument */
+ private DimensionSizes joinedSizeOf(TensorType type, TensorType joinedType, DimensionSizes joinedSizes) {
+ DimensionSizes.Builder builder = new DimensionSizes.Builder(type.dimensions().size());
+ int dimensionIndex = 0;
+ for (int i = 0; i < joinedType.dimensions().size(); i++) {
+ if (type.dimensionNames().contains(joinedType.dimensions().get(i).name()))
+ builder.set(dimensionIndex++, joinedSizes.size(i));
+ }
+ return builder.build();
+ }
+
+ private Tensor mappedGeneralJoin(Tensor a, Tensor b, TensorType joinedType) {
int[] aToIndexes = mapIndexes(a.type(), joinedType);
int[] bToIndexes = mapIndexes(b.type(), joinedType);
Tensor.Builder builder = Tensor.Builder.of(joinedType);
@@ -207,8 +276,8 @@ public class Join extends PrimitiveTensorFunction {
Map.Entry<TensorAddress, Double> aCell = aIterator.next();
for (Iterator<Tensor.Cell> bIterator = b.cellIterator(); bIterator.hasNext(); ) {
Map.Entry<TensorAddress, Double> bCell = bIterator.next();
- TensorAddress combinedAddress = combineAddresses(aCell.getKey(), aToIndexes,
- bCell.getKey(), bToIndexes, joinedType);
+ TensorAddress combinedAddress = joinAddresses(aCell.getKey(), aToIndexes,
+ bCell.getKey(), bToIndexes, joinedType);
if (combinedAddress == null) continue; // not combinable
builder.cell(combinedAddress, combinator.applyAsDouble(aCell.getValue(), bCell.getValue()));
}
@@ -230,8 +299,8 @@ public class Join extends PrimitiveTensorFunction {
return toIndexes;
}
- private TensorAddress combineAddresses(TensorAddress a, int[] aToIndexes, TensorAddress b, int[] bToIndexes,
- TensorType joinedType) {
+ private TensorAddress joinAddresses(TensorAddress a, int[] aToIndexes, TensorAddress b, int[] bToIndexes,
+ TensorType joinedType) {
String[] joinedLabels = new String[joinedType.dimensions().size()];
mapContent(a, joinedLabels, aToIndexes);
boolean compatible = mapContent(b, joinedLabels, bToIndexes);
diff --git a/vespajlib/src/test/java/com/yahoo/tensor/TensorFunctionBenchmark.java b/vespajlib/src/test/java/com/yahoo/tensor/TensorFunctionBenchmark.java
index de7e49c46e4..2f060239eb1 100644
--- a/vespajlib/src/test/java/com/yahoo/tensor/TensorFunctionBenchmark.java
+++ b/vespajlib/src/test/java/com/yahoo/tensor/TensorFunctionBenchmark.java
@@ -106,41 +106,51 @@ public class TensorFunctionBenchmark {
// ---------------- Mapped with extra space (sidesteps current special-case optimizations):
// 410 ms
+ System.gc();
time = new TensorFunctionBenchmark().benchmark(20, vectors(100, 300, TensorType.Dimension.Type.mapped), TensorType.Dimension.Type.mapped, true);
System.out.printf("Mapped vectors, x space time per join: %1$8.3f ms\n", time);
// 770 ms
+ System.gc();
time = new TensorFunctionBenchmark().benchmark(20, matrix(100, 300, TensorType.Dimension.Type.mapped), TensorType.Dimension.Type.mapped, true);
System.out.printf("Mapped matrix, x space time per join: %1$8.3f ms\n", time);
// ---------------- Mapped:
// 2.6 ms
+ System.gc();
time = new TensorFunctionBenchmark().benchmark(5000, vectors(100, 300, TensorType.Dimension.Type.mapped), TensorType.Dimension.Type.mapped, false);
System.out.printf("Mapped vectors, time per join: %1$8.3f ms\n", time);
// 6.8 ms
- time = new TensorFunctionBenchmark().benchmark(500, matrix(100, 300, TensorType.Dimension.Type.mapped), TensorType.Dimension.Type.mapped, false);
+ System.gc();
+ time = new TensorFunctionBenchmark().benchmark(1000, matrix(100, 300, TensorType.Dimension.Type.mapped), TensorType.Dimension.Type.mapped, false);
System.out.printf("Mapped matrix, time per join: %1$8.3f ms\n", time);
// ---------------- Indexed (unbound) with extra space (sidesteps current special-case optimizations):
- // 1600 ms
- time = new TensorFunctionBenchmark().benchmark(20, vectors(100, 300, TensorType.Dimension.Type.indexedUnbound), TensorType.Dimension.Type.indexedUnbound, true);
+ // 30 ms
+ System.gc();
+ time = new TensorFunctionBenchmark().benchmark(500, vectors(100, 300, TensorType.Dimension.Type.indexedUnbound), TensorType.Dimension.Type.indexedUnbound, true);
System.out.printf("Indexed vectors, x space time per join: %1$8.3f ms\n", time);
- // 1800 ms
- time = new TensorFunctionBenchmark().benchmark(20, matrix(100, 300, TensorType.Dimension.Type.indexedUnbound), TensorType.Dimension.Type.indexedUnbound, true);
+ // 27 ms
+ System.gc();
+ time = new TensorFunctionBenchmark().benchmark(500, matrix(100, 300, TensorType.Dimension.Type.indexedUnbound), TensorType.Dimension.Type.indexedUnbound, true);
System.out.printf("Indexed matrix, x space time per join: %1$8.3f ms\n", time);
// ---------------- Indexed unbound:
// 0.14 ms
+ System.gc();
time = new TensorFunctionBenchmark().benchmark(50000, vectors(100, 300, TensorType.Dimension.Type.indexedUnbound), TensorType.Dimension.Type.indexedUnbound, false);
System.out.printf("Indexed unbound vectors, time per join: %1$8.3f ms\n", time);
// 0.14 ms
+ System.gc();
time = new TensorFunctionBenchmark().benchmark(50000, matrix(100, 300, TensorType.Dimension.Type.indexedUnbound), TensorType.Dimension.Type.indexedUnbound, false);
System.out.printf("Indexed unbound matrix, time per join: %1$8.3f ms\n", time);
// ---------------- Indexed bound:
// 0.14 ms
+ System.gc();
time = new TensorFunctionBenchmark().benchmark(50000, vectors(100, 300, TensorType.Dimension.Type.indexedBound), TensorType.Dimension.Type.indexedBound, false);
System.out.printf("Indexed bound vectors, time per join: %1$8.3f ms\n", time);
// 0.14 ms
+ System.gc();
time = new TensorFunctionBenchmark().benchmark(50000, matrix(100, 300, TensorType.Dimension.Type.indexedBound), TensorType.Dimension.Type.indexedBound, false);
System.out.printf("Indexed bound matrix, time per join: %1$8.3f ms\n", time);
}
diff --git a/vespajlib/src/test/java/com/yahoo/tensor/functions/JoinTestCase.java b/vespajlib/src/test/java/com/yahoo/tensor/functions/JoinTestCase.java
index f2b55c74066..bc2d1f21717 100644
--- a/vespajlib/src/test/java/com/yahoo/tensor/functions/JoinTestCase.java
+++ b/vespajlib/src/test/java/com/yahoo/tensor/functions/JoinTestCase.java
@@ -34,4 +34,15 @@ public class JoinTestCase {
t2.divide(t1));
}
+ @Test
+ public void testGeneralJoin() {
+ assertEquals(Tensor.from("tensor(x[],y[]):{ {x:0,y:0}:1, {x:1,y:0}:2, {x:2,y:0}:3 }"),
+ Tensor.from("tensor(x[]):{ {x:0}:2, {x:1}:4, {x:2}:6 }")
+ .divide(Tensor.from("tensor(y[]):{{y:0}:2}")));
+
+ assertEquals(Tensor.from("tensor(x[],y[],z[]):{ {x:0,y:0,z:0}:3, {x:1,y:0,z:0}:4, {x:0,y:1,z:0}:5, {x:1,y:1,z:0}:6 }"),
+ Tensor.from("tensor(x[],y[]):{ {x:0,y:0}:6, {x:1,y:0}:8, {x:0,y:1}:20, {x:1,y:1}:24 }")
+ .divide(Tensor.from("tensor(y[],z[]):{ {y:0,z:0}:2, {y:1,z:0}:4, {y:2,z:0}:6 }")));
+ }
+
}