diff options
author | Martin Polden <mpolden@mpolden.no> | 2022-04-04 09:25:52 +0200 |
---|---|---|
committer | Martin Polden <mpolden@mpolden.no> | 2022-04-04 15:26:43 +0200 |
commit | a72efb9ee5f82b57ceae028fa00841f70755a205 (patch) | |
tree | be707fc34a4e4871698bcbe51a48677b9438a32a | |
parent | 02151a078dd5f45defaa42b19bf127bc2c999944 (diff) |
Implement ArchiveStreamReader
-rw-r--r-- | cloud-tenant-base-dependencies-enforcer/pom.xml | 1 | ||||
-rw-r--r-- | container-dev/pom.xml | 4 | ||||
-rw-r--r-- | container-test/pom.xml | 5 | ||||
-rw-r--r-- | vespajlib/pom.xml | 5 | ||||
-rw-r--r-- | vespajlib/src/main/java/com/yahoo/compress/ArchiveStreamReader.java | 194 | ||||
-rw-r--r-- | vespajlib/src/test/java/com/yahoo/compress/ArchiveStreamReaderTest.java | 62 |
6 files changed, 271 insertions, 0 deletions
diff --git a/cloud-tenant-base-dependencies-enforcer/pom.xml b/cloud-tenant-base-dependencies-enforcer/pom.xml index 21e5de31534..b5281050459 100644 --- a/cloud-tenant-base-dependencies-enforcer/pom.xml +++ b/cloud-tenant-base-dependencies-enforcer/pom.xml @@ -236,6 +236,7 @@ <include>org.antlr:antlr-runtime:3.5.2:jar:test</include> <include>org.antlr:antlr4-runtime:4.9.3:jar:test</include> <include>org.apache.commons:commons-exec:1.3:jar:test</include> + <include>org.apache.commons:commons-compress:1.21:jar:test</include> <include>org.apache.commons:commons-math3:3.6.1:jar:test</include> <include>org.apache.httpcomponents.client5:httpclient5:${httpclient5.version}:jar:test</include> <include>org.apache.httpcomponents.core5:httpcore5:${httpclient5.version}:jar:test</include> diff --git a/container-dev/pom.xml b/container-dev/pom.xml index 034081f4620..6268e1e6fb4 100644 --- a/container-dev/pom.xml +++ b/container-dev/pom.xml @@ -121,6 +121,10 @@ <groupId>io.airlift</groupId> <artifactId>aircompressor</artifactId> </exclusion> + <exclusion> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + </exclusion> </exclusions> </dependency> <dependency> diff --git a/container-test/pom.xml b/container-test/pom.xml index 7c739faad26..d4ea39fa3c9 100644 --- a/container-test/pom.xml +++ b/container-test/pom.xml @@ -92,5 +92,10 @@ <artifactId>aircompressor</artifactId> <scope>compile</scope> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + <scope>compile</scope> + </dependency> </dependencies> </project> diff --git a/vespajlib/pom.xml b/vespajlib/pom.xml index b4898b14b86..ed6ae3678f4 100644 --- a/vespajlib/pom.xml +++ b/vespajlib/pom.xml @@ -36,6 +36,11 @@ <artifactId>aircompressor</artifactId> <scope>compile</scope> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-compress</artifactId> + <scope>compile</scope> + </dependency> <!-- provided scope --> <dependency> diff --git a/vespajlib/src/main/java/com/yahoo/compress/ArchiveStreamReader.java b/vespajlib/src/main/java/com/yahoo/compress/ArchiveStreamReader.java new file mode 100644 index 00000000000..9f6bd5b1bb8 --- /dev/null +++ b/vespajlib/src/main/java/com/yahoo/compress/ArchiveStreamReader.java @@ -0,0 +1,194 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.compress; + +import com.yahoo.path.Path; +import com.yahoo.yolean.Exceptions; +import org.apache.commons.compress.archivers.ArchiveEntry; +import org.apache.commons.compress.archivers.ArchiveInputStream; +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.io.UncheckedIOException; +import java.util.Objects; +import java.util.OptionalLong; +import java.util.function.Predicate; +import java.util.zip.GZIPInputStream; + +/** + * Helper class for safely reading files from a compressed archive. + * + * @author mpolden + */ +public class ArchiveStreamReader implements AutoCloseable { + + private final ArchiveInputStream archiveInputStream; + private final Options options; + + private long totalRead = 0; + + private ArchiveStreamReader(ArchiveInputStream archiveInputStream, Options options) { + this.archiveInputStream = Objects.requireNonNull(archiveInputStream); + this.options = Objects.requireNonNull(options); + } + + /** Create reader for an inputStream containing a tar.gz file */ + public static ArchiveStreamReader ofTarGzip(InputStream inputStream, Options options) { + return new ArchiveStreamReader(new TarArchiveInputStream(Exceptions.uncheck(() -> new GZIPInputStream(inputStream))), options); + } + + /** Create reader for an inputStream containing a ZIP file */ + public static ArchiveStreamReader ofZip(InputStream inputStream, Options options) { + return new ArchiveStreamReader(new ZipArchiveInputStream(inputStream), options); + } + + /** + * Read the next file in this archive and write it to given outputStream. Returns information about the read archive + * file, or null if there are no more files to read. + */ + public ArchiveFile readNextTo(OutputStream outputStream) { + ArchiveEntry entry; + try { + while ((entry = archiveInputStream.getNextEntry()) != null) { + Path path = Path.fromString(requireNormalized(entry.getName())); + if (isSymlink(entry)) throw new IllegalArgumentException("Archive entry " + path + " is a symbolic link, which is disallowed"); + if (!options.pathPredicate.test(path.toString())) continue; + + long size = 0; + byte[] buffer = new byte[2048]; + int read; + while ((read = archiveInputStream.read(buffer)) != -1) { + totalRead += read; + size += read; + if (totalRead > options.sizeLimit) throw new IllegalArgumentException("Total size of archive exceeds size limit"); + if (read > options.entrySizeLimit) { + if (!options.truncateEntry) throw new IllegalArgumentException("Size of entry " + path + " exceeded entry size limit"); + } else { + outputStream.write(buffer, 0, read); + } + } + return new ArchiveFile(path, crc32(entry), size); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } + return null; + } + + @Override + public void close() { + Exceptions.uncheck(archiveInputStream::close); + } + + /** Information about a file extracted from a compressed archive */ + public static class ArchiveFile { + + private final Path path; + private final OptionalLong crc32; + private final long size; + + public ArchiveFile(Path name, OptionalLong crc32, long size) { + this.path = Objects.requireNonNull(name); + this.crc32 = Objects.requireNonNull(crc32); + if (crc32.isPresent()) { + requireNonNegative("crc32", crc32.getAsLong()); + } + this.size = requireNonNegative("size", size); + } + + /** The path of this file inside its containing archive */ + public Path path() { + return path; + } + + /** The CRC-32 checksum of this file, if any */ + public OptionalLong crc32() { + return crc32; + } + + /** The decompressed size of this file */ + public long size() { + return size; + } + + private static long requireNonNegative(String field, long n) { + if (n < 0) throw new IllegalArgumentException(field + " cannot be negative, got " + n); + return n; + } + + } + + /** Get the CRC-32 checksum of given archive entry, if any */ + private static OptionalLong crc32(ArchiveEntry entry) { + long crc32 = -1; + if (entry instanceof ZipArchiveEntry) { + crc32 = ((ZipArchiveEntry) entry).getCrc(); + } + return crc32 > -1 ? OptionalLong.of(crc32) : OptionalLong.empty(); + } + + private static boolean isSymlink(ArchiveEntry entry) { + if (entry instanceof ZipArchiveEntry) return ((ZipArchiveEntry) entry).isUnixSymlink(); + if (entry instanceof TarArchiveEntry) return ((TarArchiveEntry) entry).isSymbolicLink(); + // TODO: Add workaround for the poor symlink handling in compress library + throw new IllegalArgumentException("Unsupported archive entry " + entry.getClass().getSimpleName() + ", cannot check for symbolic link"); + } + + private static String requireNormalized(String name) { + for (var part : name.split("/")) { + if (part.equals(".") || part.equals("..") || part.isEmpty()) { + throw new IllegalArgumentException("Unexpected non-normalized path found in zip content: '" + name + "'"); + } + } + return name; + } + + /** Options for reading entries of an archive */ + public static class Options { + + private long sizeLimit = 8 * (long) Math.pow(1024, 3); // 8 GB + private long entrySizeLimit = Long.MAX_VALUE; + private boolean truncateEntry = false; + private Predicate<String> pathPredicate = (path) -> true; + + private Options() {} + + /** Returns the standard set of read options */ + public static Options standard() { + return new Options(); + } + + /** Set the total size limit when decompressing entries. Default is 8 GB */ + public Options sizeLimit(long limit) { + this.sizeLimit = limit; + return this; + } + + /** Set the size limit of a decompressed entry. Default is no limit */ + public Options entrySizeLimit(long limit) { + this.entrySizeLimit = limit; + return this; + } + + /** + * Set whether to truncate the content of an entry exceeding the configured size limit, instead of throwing. + * Default is to throw. + */ + public Options truncateEntry(boolean truncate) { + this.truncateEntry = truncate; + return this; + } + + /** Set a predicate that an archive file path must match in order to be extracted. Default is to extract all files */ + public Options pathPredicate(Predicate<String> predicate) { + this.pathPredicate = predicate; + return this; + } + + } + +} diff --git a/vespajlib/src/test/java/com/yahoo/compress/ArchiveStreamReaderTest.java b/vespajlib/src/test/java/com/yahoo/compress/ArchiveStreamReaderTest.java new file mode 100644 index 00000000000..9a292e8b3ea --- /dev/null +++ b/vespajlib/src/test/java/com/yahoo/compress/ArchiveStreamReaderTest.java @@ -0,0 +1,62 @@ +package com.yahoo.compress; + +import com.yahoo.compress.ArchiveStreamReader.Options; +import com.yahoo.yolean.Exceptions; +import org.apache.commons.compress.archivers.zip.ZipArchiveEntry; +import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream; +import org.junit.jupiter.api.Test; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.UncheckedIOException; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +/** + * @author mpolden + */ +class ArchiveStreamReaderTest { + + @Test + void reading() { + Map<String, String> zipContents = Map.of("foo", "contents of foo", + "bar", "contents of bar", + "baz", "0".repeat(2049)); + ArchiveStreamReader reader = ArchiveStreamReader.ofZip(zip(zipContents), Options.standard()); + ArchiveStreamReader.ArchiveFile file; + Map<String, String> extracted = new HashMap<>(); + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + while ((file = reader.readNextTo(baos)) != null) { + extracted.put(file.path().toString(), baos.toString(StandardCharsets.UTF_8)); + baos.reset(); + } + assertEquals(zipContents, extracted); + } + + private static InputStream zip(Map<String, String> entries) { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ZipArchiveOutputStream archiveOutputStream = null; + try { + archiveOutputStream = new ZipArchiveOutputStream(baos); + for (var kv : entries.entrySet()) { + String entryName = kv.getKey(); + String contents = kv.getValue(); + ZipArchiveEntry entry = new ZipArchiveEntry(entryName); + archiveOutputStream.putArchiveEntry(entry); + archiveOutputStream.write(contents.getBytes(StandardCharsets.UTF_8)); + archiveOutputStream.closeArchiveEntry(); + } + } catch (IOException e) { + throw new UncheckedIOException(e); + } finally { + if (archiveOutputStream != null) Exceptions.uncheck(archiveOutputStream::close); + } + return new ByteArrayInputStream(baos.toByteArray()); + } + +} |