diff options
author | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-01-06 17:09:05 +0100 |
---|---|---|
committer | Jon Bratseth <bratseth@yahoo-inc.com> | 2017-01-06 17:09:05 +0100 |
commit | 21d75fade74539121c0283fb154efdc09fa886f3 (patch) | |
tree | cb432c9203b92896b177c3762e2c42bd383a65b5 | |
parent | 27cdc6fb1fddf7127710312d8fd881a9aec8100f (diff) |
Faster indexed general join algorithm
7 files changed, 199 insertions, 30 deletions
diff --git a/vespajlib/src/main/java/com/yahoo/tensor/DimensionSizes.java b/vespajlib/src/main/java/com/yahoo/tensor/DimensionSizes.java index 76340bb7d8f..daa85cc51e4 100644 --- a/vespajlib/src/main/java/com/yahoo/tensor/DimensionSizes.java +++ b/vespajlib/src/main/java/com/yahoo/tensor/DimensionSizes.java @@ -1,5 +1,7 @@ package com.yahoo.tensor; +import com.google.common.annotations.Beta; + import java.util.Arrays; /** @@ -7,6 +9,7 @@ import java.util.Arrays; * * @author bratseth */ +@Beta public final class DimensionSizes { private final int[] sizes; diff --git a/vespajlib/src/main/java/com/yahoo/tensor/IndexedTensor.java b/vespajlib/src/main/java/com/yahoo/tensor/IndexedTensor.java index d69cf65ee8d..9315922f57a 100644 --- a/vespajlib/src/main/java/com/yahoo/tensor/IndexedTensor.java +++ b/vespajlib/src/main/java/com/yahoo/tensor/IndexedTensor.java @@ -52,6 +52,21 @@ public class IndexedTensor implements Tensor { return new CellIterator(); } + /** Returns an iterator over all the cells in this tensor which matches the given partial address */ + // TODO: Move up to Tensor and create a mixed tensor which can implement it (and subspace iterators) efficiently + public SubspaceIterator cellIterator(PartialAddress partialAddress, DimensionSizes iterationSizes) { + int[] startAddress = new int[type().dimensions().size()]; + List<Integer> iterateDimensions = new ArrayList<>(); + for (int i = 0; i < type().dimensions().size(); i++) { + int partialAddressLabel = partialAddress.intLabel(type.dimensions().get(i).name()); + if (partialAddressLabel >= 0) // iterate at this label + startAddress[i] = partialAddressLabel; + else // iterate over this dimension + iterateDimensions.add(i); + } + return new SubspaceIterator(iterateDimensions, startAddress, iterationSizes); + } + /** * Returns an iterator over the values of this. * Values are returned in order of increasing indexes in each dimension, increasing diff --git a/vespajlib/src/main/java/com/yahoo/tensor/PartialAddress.java b/vespajlib/src/main/java/com/yahoo/tensor/PartialAddress.java new file mode 100644 index 00000000000..463b7f3e99f --- /dev/null +++ b/vespajlib/src/main/java/com/yahoo/tensor/PartialAddress.java @@ -0,0 +1,61 @@ +package com.yahoo.tensor; + +import com.google.common.annotations.Beta; + +/** + * An address to a subset of a tensors' cells, specifying a label for some but not necessarily all of the tensors + * dimensions. + * + * @author bratseth + */ +// Implementation notes: +// - These are created in inner (though not inner-most) loops so they are implemented with minimal allocation. +// We also avoid non-essential error checking. +// - We can add support for string labels later without breaking the API +@Beta +public class PartialAddress { + + // Two arrays which contains corresponding dimension=label pairs. + // The sizes of these are always equal. + private final String[] dimensionNames; + private final int[] labels; + + private PartialAddress(Builder builder) { + this.dimensionNames = builder.dimensionNames; + this.labels = builder.labels; + builder.dimensionNames = null; // invalidate builder to safely take over array ownership + builder.labels = null; + } + + /** Returns the int label of this dimension, or -1 if no label is specified for it */ + int intLabel(String dimensionName) { + for (int i = 0; i < dimensionNames.length; i++) + if (dimensionNames[i].equals(dimensionName)) + return labels[i]; + return -1; + } + + public static class Builder { + + private String[] dimensionNames; + private int[] labels; + private int index = 0; + + public Builder(int size) { + dimensionNames = new String[size]; + labels = new int[size]; + } + + public void add(String dimensionName, int label) { + dimensionNames[index] = dimensionName; + labels[index] = label; + index++; + } + + public PartialAddress build() { + return new PartialAddress(this); + } + + } + +} diff --git a/vespajlib/src/main/java/com/yahoo/tensor/functions/Concat.java b/vespajlib/src/main/java/com/yahoo/tensor/functions/Concat.java index 05999ff1240..fda6e8ef86c 100644 --- a/vespajlib/src/main/java/com/yahoo/tensor/functions/Concat.java +++ b/vespajlib/src/main/java/com/yahoo/tensor/functions/Concat.java @@ -83,7 +83,7 @@ public class Concat extends PrimitiveTensorFunction { for (Iterator<IndexedTensor.SubspaceIterator> ib = b.subspaceIterator(otherADimensions); ib.hasNext();) { IndexedTensor.SubspaceIterator ibSubspace = ib.next(); while (ibSubspace.hasNext()) { - java.util.Map.Entry<TensorAddress, Double> bCell = ibSubspace.next(); // TODO: Create Cell convenience subclass for Map.Entry + Tensor.Cell bCell = ibSubspace.next(); TensorAddress combinedAddress = combineAddresses(aAddress, aToIndexes, bCell.getKey(), bToIndexes, concatType, offset, dimension); if (combinedAddress == null) continue; // incompatible @@ -125,21 +125,21 @@ public class Concat extends PrimitiveTensorFunction { /** Returns the concrete (not type) dimension sizes resulting from combining a and b */ private DimensionSizes concatSize(TensorType concatType, IndexedTensor a, IndexedTensor b, String concatDimension) { - DimensionSizes.Builder joinedSizes = new DimensionSizes.Builder(concatType.dimensions().size()); - for (int i = 0; i < joinedSizes.dimensions(); i++) { + DimensionSizes.Builder concatSizes = new DimensionSizes.Builder(concatType.dimensions().size()); + for (int i = 0; i < concatSizes.dimensions(); i++) { String currentDimension = concatType.dimensions().get(i).name(); int aSize = a.type().indexOfDimension(currentDimension).map(d -> a.dimensionSizes().size(d)).orElse(0); int bSize = b.type().indexOfDimension(currentDimension).map(d -> b.dimensionSizes().size(d)).orElse(0); if (currentDimension.equals(concatDimension)) - joinedSizes.set(i, aSize + bSize); + concatSizes.set(i, aSize + bSize); else if (aSize != 0 && bSize != 0 && aSize!=bSize ) throw new IllegalArgumentException("Dimension " + currentDimension + " must be of the same size when " + "concatenating " + a.type() + " and " + b.type() + " along dimension " + concatDimension + ", but was " + aSize + " and " + bSize); else - joinedSizes.set(i, Math.max(aSize, bSize)); + concatSizes.set(i, Math.max(aSize, bSize)); } - return joinedSizes.build(); + return concatSizes.build(); } /** @@ -150,13 +150,13 @@ public class Concat extends PrimitiveTensorFunction { */ private TensorAddress combineAddresses(TensorAddress a, int[] aToIndexes, TensorAddress b, int[] bToIndexes, TensorType concatType, int concatOffset, String concatDimension) { - int[] joinedLabels = new int[concatType.dimensions().size()]; - Arrays.fill(joinedLabels, -1); + int[] combinedLabels = new int[concatType.dimensions().size()]; + Arrays.fill(combinedLabels, -1); int concatDimensionIndex = concatType.indexOfDimension(concatDimension).get(); - mapContent(a, joinedLabels, aToIndexes, concatDimensionIndex, concatOffset); // note: This sets a nonsensical value in the concat dimension - boolean compatible = mapContent(b, joinedLabels, bToIndexes, concatDimensionIndex, concatOffset); // ... which is overwritten by the right value here + mapContent(a, combinedLabels, aToIndexes, concatDimensionIndex, concatOffset); // note: This sets a nonsensical value in the concat dimension + boolean compatible = mapContent(b, combinedLabels, bToIndexes, concatDimensionIndex, concatOffset); // ... which is overwritten by the right value here if ( ! compatible) return null; - return TensorAddress.of(joinedLabels); + return TensorAddress.of(combinedLabels); } /** @@ -166,7 +166,7 @@ public class Concat extends PrimitiveTensorFunction { * fromType.dimensions().get(i).name.equals(toType.dimensions().get(n).name()) * If some dimension in fromType is not present in toType, the corresponding index will be -1 */ - // TODO: Stolen from join - put on TensorType? + // TODO: Stolen from join private int[] mapIndexes(TensorType fromType, TensorType toType) { int[] toIndexes = new int[fromType.dimensions().size()]; for (int i = 0; i < fromType.dimensions().size(); i++) diff --git a/vespajlib/src/main/java/com/yahoo/tensor/functions/Join.java b/vespajlib/src/main/java/com/yahoo/tensor/functions/Join.java index 23865e1cc1c..ceade39ce42 100644 --- a/vespajlib/src/main/java/com/yahoo/tensor/functions/Join.java +++ b/vespajlib/src/main/java/com/yahoo/tensor/functions/Join.java @@ -2,8 +2,10 @@ package com.yahoo.tensor.functions; import com.google.common.annotations.Beta; import com.google.common.collect.ImmutableList; +import com.google.common.collect.Sets; import com.yahoo.tensor.DimensionSizes; import com.yahoo.tensor.IndexedTensor; +import com.yahoo.tensor.PartialAddress; import com.yahoo.tensor.Tensor; import com.yahoo.tensor.TensorAddress; import com.yahoo.tensor.TensorType; @@ -156,16 +158,20 @@ public class Join extends PrimitiveTensorFunction { } } - private DimensionSizes joinedSize(TensorType joinedType, IndexedTensor subspace, IndexedTensor superspace) { - DimensionSizes.Builder b = new DimensionSizes.Builder(joinedType.dimensions().size()); - for (int i = 0; i < b.dimensions(); i++) { - Optional<Integer> subspaceIndex = subspace.type().indexOfDimension(joinedType.dimensions().get(i).name()); - if (subspaceIndex.isPresent()) - b.set(i, Math.min(superspace.dimensionSizes().size(i), subspace.dimensionSizes().size(subspaceIndex.get()))); - else - b.set(i, superspace.dimensionSizes().size(i)); + private DimensionSizes joinedSize(TensorType joinedType, IndexedTensor a, IndexedTensor b) { + DimensionSizes.Builder builder = new DimensionSizes.Builder(joinedType.dimensions().size()); + for (int i = 0; i < builder.dimensions(); i++) { + String dimensionName = joinedType.dimensions().get(i).name(); + Optional<Integer> aIndex = a.type().indexOfDimension(dimensionName); + Optional<Integer> bIndex = b.type().indexOfDimension(dimensionName); + if (aIndex.isPresent() && bIndex.isPresent()) + builder.set(i, Math.min(b.dimensionSizes().size(bIndex.get()), a.dimensionSizes().size(aIndex.get()))); + else if (aIndex.isPresent()) + builder.set(i, a.dimensionSizes().size(aIndex.get())); + else if (bIndex.isPresent()) + builder.set(i, b.dimensionSizes().size(bIndex.get())); } - return b.build(); + return builder.build(); } private Tensor generalSubspaceJoin(Tensor subspace, Tensor superspace, TensorType joinedType, boolean reversedArgumentOrder) { @@ -200,6 +206,69 @@ public class Join extends PrimitiveTensorFunction { /** Slow join which works for any two tensors */ private Tensor generalJoin(Tensor a, Tensor b, TensorType joinedType) { + if (a instanceof IndexedTensor && b instanceof IndexedTensor) + return indexedGeneralJoin((IndexedTensor) a, (IndexedTensor) b, joinedType); + else + return mappedGeneralJoin(a, b, joinedType); + } + + private Tensor indexedGeneralJoin(IndexedTensor a, IndexedTensor b, TensorType joinedType) { + DimensionSizes joinedSize = joinedSize(joinedType, a, b); + Tensor.Builder builder = Tensor.Builder.of(joinedType, joinedSize); + int[] aToIndexes = mapIndexes(a.type(), joinedType); + int[] bToIndexes = mapIndexes(b.type(), joinedType); + joinTo(a, b, joinedType, joinedSize, aToIndexes, bToIndexes, false, builder); + joinTo(b, a, joinedType, joinedSize, bToIndexes, aToIndexes, true, builder); + return builder.build(); + } + + private void joinTo(IndexedTensor a, IndexedTensor b, TensorType joinedType, DimensionSizes joinedSize, + int[] aToIndexes, int[] bToIndexes, boolean reversedOrder, Tensor.Builder builder) { + Set<String> sharedDimensions = Sets.intersection(a.type().dimensionNames(), b.type().dimensionNames()); + Set<String> dimensionsOnlyInA = Sets.difference(a.type().dimensionNames(), b.type().dimensionNames()); + + DimensionSizes aIterateSize = joinedSizeOf(a.type(), joinedType, joinedSize); + DimensionSizes bIterateSize = joinedSizeOf(b.type(), joinedType, joinedSize); + + // for each combination of dimensions only in a + for (Iterator<IndexedTensor.SubspaceIterator> ia = a.subspaceIterator(dimensionsOnlyInA, aIterateSize); ia.hasNext(); ) { + IndexedTensor.SubspaceIterator aSubspace = ia.next(); + // for each combination of dimensions in a which is also in b + while (aSubspace.hasNext()) { + Tensor.Cell aCell = aSubspace.next(); + PartialAddress matchingBCells = partialAddress(a.type(), aSubspace.address(), sharedDimensions); + // for each matching combination of dimensions ony in b + for (IndexedTensor.SubspaceIterator bSubspace = b.cellIterator(matchingBCells, bIterateSize); bSubspace.hasNext(); ) { + Tensor.Cell bCell = bSubspace.next(); + TensorAddress joinedAddress = joinAddresses(aCell.getKey(), aToIndexes, bCell.getKey(), bToIndexes, joinedType); + double joinedValue = reversedOrder ? combinator.applyAsDouble(bCell.getValue(), aCell.getValue()) + : combinator.applyAsDouble(aCell.getValue(), bCell.getValue()); + builder.cell(joinedAddress, joinedValue); + } + } + } + } + + private PartialAddress partialAddress(TensorType addressType, TensorAddress address, Set<String> retainDimensions) { + PartialAddress.Builder builder = new PartialAddress.Builder(retainDimensions.size()); + for (int i = 0; i < addressType.dimensions().size(); i++) + if (retainDimensions.contains(addressType.dimensions().get(i).name())) + builder.add(addressType.dimensions().get(i).name(), address.intLabel(i)); + return builder.build(); + } + + /** Returns the sizes from the joined sizes which are present in the type argument */ + private DimensionSizes joinedSizeOf(TensorType type, TensorType joinedType, DimensionSizes joinedSizes) { + DimensionSizes.Builder builder = new DimensionSizes.Builder(type.dimensions().size()); + int dimensionIndex = 0; + for (int i = 0; i < joinedType.dimensions().size(); i++) { + if (type.dimensionNames().contains(joinedType.dimensions().get(i).name())) + builder.set(dimensionIndex++, joinedSizes.size(i)); + } + return builder.build(); + } + + private Tensor mappedGeneralJoin(Tensor a, Tensor b, TensorType joinedType) { int[] aToIndexes = mapIndexes(a.type(), joinedType); int[] bToIndexes = mapIndexes(b.type(), joinedType); Tensor.Builder builder = Tensor.Builder.of(joinedType); @@ -207,8 +276,8 @@ public class Join extends PrimitiveTensorFunction { Map.Entry<TensorAddress, Double> aCell = aIterator.next(); for (Iterator<Tensor.Cell> bIterator = b.cellIterator(); bIterator.hasNext(); ) { Map.Entry<TensorAddress, Double> bCell = bIterator.next(); - TensorAddress combinedAddress = combineAddresses(aCell.getKey(), aToIndexes, - bCell.getKey(), bToIndexes, joinedType); + TensorAddress combinedAddress = joinAddresses(aCell.getKey(), aToIndexes, + bCell.getKey(), bToIndexes, joinedType); if (combinedAddress == null) continue; // not combinable builder.cell(combinedAddress, combinator.applyAsDouble(aCell.getValue(), bCell.getValue())); } @@ -230,8 +299,8 @@ public class Join extends PrimitiveTensorFunction { return toIndexes; } - private TensorAddress combineAddresses(TensorAddress a, int[] aToIndexes, TensorAddress b, int[] bToIndexes, - TensorType joinedType) { + private TensorAddress joinAddresses(TensorAddress a, int[] aToIndexes, TensorAddress b, int[] bToIndexes, + TensorType joinedType) { String[] joinedLabels = new String[joinedType.dimensions().size()]; mapContent(a, joinedLabels, aToIndexes); boolean compatible = mapContent(b, joinedLabels, bToIndexes); diff --git a/vespajlib/src/test/java/com/yahoo/tensor/TensorFunctionBenchmark.java b/vespajlib/src/test/java/com/yahoo/tensor/TensorFunctionBenchmark.java index de7e49c46e4..2f060239eb1 100644 --- a/vespajlib/src/test/java/com/yahoo/tensor/TensorFunctionBenchmark.java +++ b/vespajlib/src/test/java/com/yahoo/tensor/TensorFunctionBenchmark.java @@ -106,41 +106,51 @@ public class TensorFunctionBenchmark { // ---------------- Mapped with extra space (sidesteps current special-case optimizations): // 410 ms + System.gc(); time = new TensorFunctionBenchmark().benchmark(20, vectors(100, 300, TensorType.Dimension.Type.mapped), TensorType.Dimension.Type.mapped, true); System.out.printf("Mapped vectors, x space time per join: %1$8.3f ms\n", time); // 770 ms + System.gc(); time = new TensorFunctionBenchmark().benchmark(20, matrix(100, 300, TensorType.Dimension.Type.mapped), TensorType.Dimension.Type.mapped, true); System.out.printf("Mapped matrix, x space time per join: %1$8.3f ms\n", time); // ---------------- Mapped: // 2.6 ms + System.gc(); time = new TensorFunctionBenchmark().benchmark(5000, vectors(100, 300, TensorType.Dimension.Type.mapped), TensorType.Dimension.Type.mapped, false); System.out.printf("Mapped vectors, time per join: %1$8.3f ms\n", time); // 6.8 ms - time = new TensorFunctionBenchmark().benchmark(500, matrix(100, 300, TensorType.Dimension.Type.mapped), TensorType.Dimension.Type.mapped, false); + System.gc(); + time = new TensorFunctionBenchmark().benchmark(1000, matrix(100, 300, TensorType.Dimension.Type.mapped), TensorType.Dimension.Type.mapped, false); System.out.printf("Mapped matrix, time per join: %1$8.3f ms\n", time); // ---------------- Indexed (unbound) with extra space (sidesteps current special-case optimizations): - // 1600 ms - time = new TensorFunctionBenchmark().benchmark(20, vectors(100, 300, TensorType.Dimension.Type.indexedUnbound), TensorType.Dimension.Type.indexedUnbound, true); + // 30 ms + System.gc(); + time = new TensorFunctionBenchmark().benchmark(500, vectors(100, 300, TensorType.Dimension.Type.indexedUnbound), TensorType.Dimension.Type.indexedUnbound, true); System.out.printf("Indexed vectors, x space time per join: %1$8.3f ms\n", time); - // 1800 ms - time = new TensorFunctionBenchmark().benchmark(20, matrix(100, 300, TensorType.Dimension.Type.indexedUnbound), TensorType.Dimension.Type.indexedUnbound, true); + // 27 ms + System.gc(); + time = new TensorFunctionBenchmark().benchmark(500, matrix(100, 300, TensorType.Dimension.Type.indexedUnbound), TensorType.Dimension.Type.indexedUnbound, true); System.out.printf("Indexed matrix, x space time per join: %1$8.3f ms\n", time); // ---------------- Indexed unbound: // 0.14 ms + System.gc(); time = new TensorFunctionBenchmark().benchmark(50000, vectors(100, 300, TensorType.Dimension.Type.indexedUnbound), TensorType.Dimension.Type.indexedUnbound, false); System.out.printf("Indexed unbound vectors, time per join: %1$8.3f ms\n", time); // 0.14 ms + System.gc(); time = new TensorFunctionBenchmark().benchmark(50000, matrix(100, 300, TensorType.Dimension.Type.indexedUnbound), TensorType.Dimension.Type.indexedUnbound, false); System.out.printf("Indexed unbound matrix, time per join: %1$8.3f ms\n", time); // ---------------- Indexed bound: // 0.14 ms + System.gc(); time = new TensorFunctionBenchmark().benchmark(50000, vectors(100, 300, TensorType.Dimension.Type.indexedBound), TensorType.Dimension.Type.indexedBound, false); System.out.printf("Indexed bound vectors, time per join: %1$8.3f ms\n", time); // 0.14 ms + System.gc(); time = new TensorFunctionBenchmark().benchmark(50000, matrix(100, 300, TensorType.Dimension.Type.indexedBound), TensorType.Dimension.Type.indexedBound, false); System.out.printf("Indexed bound matrix, time per join: %1$8.3f ms\n", time); } diff --git a/vespajlib/src/test/java/com/yahoo/tensor/functions/JoinTestCase.java b/vespajlib/src/test/java/com/yahoo/tensor/functions/JoinTestCase.java index f2b55c74066..bc2d1f21717 100644 --- a/vespajlib/src/test/java/com/yahoo/tensor/functions/JoinTestCase.java +++ b/vespajlib/src/test/java/com/yahoo/tensor/functions/JoinTestCase.java @@ -34,4 +34,15 @@ public class JoinTestCase { t2.divide(t1)); } + @Test + public void testGeneralJoin() { + assertEquals(Tensor.from("tensor(x[],y[]):{ {x:0,y:0}:1, {x:1,y:0}:2, {x:2,y:0}:3 }"), + Tensor.from("tensor(x[]):{ {x:0}:2, {x:1}:4, {x:2}:6 }") + .divide(Tensor.from("tensor(y[]):{{y:0}:2}"))); + + assertEquals(Tensor.from("tensor(x[],y[],z[]):{ {x:0,y:0,z:0}:3, {x:1,y:0,z:0}:4, {x:0,y:1,z:0}:5, {x:1,y:1,z:0}:6 }"), + Tensor.from("tensor(x[],y[]):{ {x:0,y:0}:6, {x:1,y:0}:8, {x:0,y:1}:20, {x:1,y:1}:24 }") + .divide(Tensor.from("tensor(y[],z[]):{ {y:0,z:0}:2, {y:1,z:0}:4, {y:2,z:0}:6 }"))); + } + } |