Move ArchiveStreamReader and its large dependency from vespajlib to application-model.

This avoids it being pulled into config-model-fat, which makes the install larger than necessary.
author: Henning Baldersheim <balder@yahoo-inc.com> 2022-11-26 16:04:20 +0100
committer: Henning Baldersheim <balder@yahoo-inc.com> 2022-11-26 17:26:52 +0100
commit: 9b11ce7f3b9080c6c3e640e694b253a1122e8bfa (patch)
tree: ff388da577a44e69966fd9c0d855d7acccc9bbba /application-model
parent: 46a1ed45abb3f7635069ff07d9e046406fe1062f (diff)
3 files changed, 357 insertions, 0 deletions
diff --git a/application-model/pom.xml b/application-model/pom.xml
index 2143f3a5ffd..f81c4ea4b62 100644
--- a/application-model/pom.xml
+++ b/application-model/pom.xml
@@ -22,6 +22,11 @@
             <scope>provided</scope>
         </dependency>
         <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-compress</artifactId>
+            <scope>provided</scope>
+        </dependency>
+        <dependency>
             <groupId>com.yahoo.vespa</groupId>
             <artifactId>vespajlib</artifactId>
             <version>${project.version}</version>
@@ -39,6 +44,11 @@
             <version>${project.version}</version>
             <scope>provided</scope>
         </dependency>
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter</artifactId>
+            <scope>test</scope>
+        </dependency>
     </dependencies>
     <build>
         <plugins>
diff --git a/application-model/src/main/java/com/yahoo/vespa/archive/ArchiveStreamReader.java b/application-model/src/main/java/com/yahoo/vespa/archive/ArchiveStreamReader.java
new file mode 100644
index 00000000000..87665efc1ef
--- /dev/null
+++ b/application-model/src/main/java/com/yahoo/vespa/archive/ArchiveStreamReader.java
@@ -0,0 +1,216 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.vespa.archive;
+
+import com.yahoo.path.Path;
+import com.yahoo.yolean.Exceptions;
+import org.apache.commons.compress.archivers.ArchiveEntry;
+import org.apache.commons.compress.archivers.ArchiveInputStream;
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.UncheckedIOException;
+import java.util.Objects;
+import java.util.OptionalLong;
+import java.util.function.Predicate;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * Helper class for safely reading files from a compressed archive.
+ * Enforces the limits configured in {@link Options} and rejects non-normalized entry paths while reading.
+ * @author mpolden
+ */
+public class ArchiveStreamReader implements AutoCloseable {
+
+    private final ArchiveInputStream archiveInputStream;
+    private final Options options;
+
+    private long totalRead = 0;
+    private long entriesRead = 0;
+
+    private ArchiveStreamReader(ArchiveInputStream archiveInputStream, Options options) {
+        this.archiveInputStream = Objects.requireNonNull(archiveInputStream);
+        this.options = Objects.requireNonNull(options);
+    }
+
+    /** Create reader for an inputStream containing a tar.gz file */
+    public static ArchiveStreamReader ofTarGzip(InputStream inputStream, Options options) {
+        return new ArchiveStreamReader(new TarArchiveInputStream(Exceptions.uncheck(() -> new GZIPInputStream(inputStream))), options);
+    }
+
+    /** Create reader for an inputStream containing a ZIP file */
+    public static ArchiveStreamReader ofZip(InputStream inputStream, Options options) {
+        return new ArchiveStreamReader(new ZipArchiveInputStream(inputStream), options);
+    }
+
+    /**
+     * Read the next file accepted by the path predicate and write it to given outputStream. Returns information about the
+     * read file, or null if there are no more files. Throws IllegalArgumentException on an invalid entry or exceeded limit.
+     */
+    public ArchiveFile readNextTo(OutputStream outputStream) {
+        ArchiveEntry entry;
+        try {
+            while ((entry = archiveInputStream.getNextEntry()) != null) {
+                Path path = Path.fromString(requireNormalized(entry.getName(), options.allowDotSegment));
+                if (isSymlink(entry)) throw new IllegalArgumentException("Archive entry " + path + " is a symbolic link, which is unsupported");
+                if (entry.isDirectory()) continue;
+                if (!options.pathPredicate.test(path.toString())) continue;
+                if (++entriesRead > options.maxEntries) throw new IllegalArgumentException("Attempted to read more entries than entry limit of " + options.maxEntries);
+
+                long size = 0;
+                byte[] buffer = new byte[2048];
+                int read;
+                while ((read = archiveInputStream.read(buffer)) != -1) {
+                    totalRead += read;
+                    size += read;
+                    if (totalRead > options.maxSize) throw new IllegalArgumentException("Total size of archive exceeds size limit of " + options.maxSize + " bytes");
+                    if (size > options.maxEntrySize) { // Check the accumulated entry size, a single read never exceeds the buffer
+                        if (!options.truncateEntry) throw new IllegalArgumentException("Size of entry " + path + " exceeded entry size limit of " + options.maxEntrySize + " bytes");
+                    } else {
+                        outputStream.write(buffer, 0, read);
+                    }
+                }
+                return new ArchiveFile(path, crc32(entry), size);
+            }
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+        return null;
+    }
+
+    @Override
+    public void close() {
+        Exceptions.uncheck(archiveInputStream::close);
+    }
+
+    /** Information about a file extracted from a compressed archive */
+    public static class ArchiveFile {
+
+        private final Path path;
+        private final OptionalLong crc32;
+        private final long size;
+
+        public ArchiveFile(Path name, OptionalLong crc32, long size) {
+            this.path = Objects.requireNonNull(name);
+            this.crc32 = Objects.requireNonNull(crc32);
+            if (crc32.isPresent()) {
+                requireNonNegative("crc32", crc32.getAsLong());
+            }
+            this.size = requireNonNegative("size", size);
+        }
+
+        /** The path of this file inside its containing archive */
+        public Path path() {
+            return path;
+        }
+
+        /** The CRC-32 checksum of this file, if any */
+        public OptionalLong crc32() {
+            return crc32;
+        }
+
+        /** The decompressed size of this file */
+        public long size() {
+            return size;
+        }
+
+    }
+
+    /** Get the CRC-32 checksum of given archive entry, if any */
+    private static OptionalLong crc32(ArchiveEntry entry) {
+        long crc32 = -1;
+        if (entry instanceof ZipArchiveEntry zipEntry) {
+            crc32 = zipEntry.getCrc();
+        }
+        return crc32 > -1 ? OptionalLong.of(crc32) : OptionalLong.empty();
+    }
+
+    private static boolean isSymlink(ArchiveEntry entry) {
+        // Symlinks inside ZIP files are not part of the ZIP spec and are only supported by some implementations, such
+        // as Info-ZIP.
+        //
+        // Commons Compress only has limited support for symlinks as they are only detected when the ZIP file is read
+        // through org.apache.commons.compress.archivers.zip.ZipFile. This is not the case in this class, because it must
+        // support reading ZIP files from generic input streams. The ZIP entry check below thus always returns false.
+        if (entry instanceof ZipArchiveEntry zipEntry) return zipEntry.isUnixSymlink();
+        if (entry instanceof TarArchiveEntry tarEntry) return tarEntry.isSymbolicLink();
+        throw new IllegalArgumentException("Unsupported archive entry " + entry.getClass().getSimpleName() + ", cannot check for symbolic link");
+    }
+
+    private static String requireNormalized(String name, boolean allowDotSegment) {
+        for (var part : name.split("/")) {
+            if (part.isEmpty() || (!allowDotSegment && part.equals(".")) || part.equals("..")) {
+                throw new IllegalArgumentException("Unexpected non-normalized path found in zip content: '" + name + "'");
+            }
+        }
+        return name;
+    }
+
+    private static long requireNonNegative(String field, long n) {
+        if (n < 0) throw new IllegalArgumentException(field + " cannot be negative, got " + n);
+        return n;
+    }
+
+    /** Options for reading entries of an archive */
+    public static class Options {
+
+        private long maxSize = 8 * (long) Math.pow(1024, 3); // 8 GB
+        private long maxEntrySize = Long.MAX_VALUE;
+        private long maxEntries = Long.MAX_VALUE;
+        private boolean truncateEntry = false;
+        private boolean allowDotSegment = false;
+        private Predicate<String> pathPredicate = (path) -> true;
+
+        private Options() {}
+
+        /** Returns the standard set of read options */
+        public static Options standard() {
+            return new Options();
+        }
+
+        /** Set the maximum total size of decompressed entries. Default is 8 GB */
+        public Options maxSize(long size) {
+            this.maxSize = requireNonNegative("size", size);
+            return this;
+        }
+
+        /** Set the maximum size of a decompressed entry. Default is no limit */
+        public Options maxEntrySize(long size) {
+            this.maxEntrySize = requireNonNegative("size", size);
+            return this;
+        }
+
+        /** Set the maximum number of entries to decompress. Default is no limit */
+        public Options maxEntries(long count) {
+            this.maxEntries = requireNonNegative("count", count);
+            return this;
+        }
+
+        /**
+         * Set whether to truncate the content of an entry exceeding the configured size limit, instead of throwing.
+         * Content read after the limit is exceeded is then not written to the output. Default is to throw.
+         */
+        public Options truncateEntry(boolean truncate) {
+            this.truncateEntry = truncate;
+            return this;
+        }
+
+        /** Set a predicate that an entry path must match in order to be extracted. Default is to extract all entries */
+        public Options pathPredicate(Predicate<String> predicate) {
+            this.pathPredicate = Objects.requireNonNull(predicate);
+            return this;
+        }
+
+        /** Set whether to allow single-dot segments in entry paths. Default is false */
+        public Options allowDotSegment(boolean allow) {
+            this.allowDotSegment = allow;
+            return this;
+        }
+
+    }
+
+}
diff --git a/application-model/src/test/java/com/yahoo/vespa/archive/ArchiveStreamReaderTest.java b/application-model/src/test/java/com/yahoo/vespa/archive/ArchiveStreamReaderTest.java
new file mode 100644
index 00000000000..78ff2a805e5
--- /dev/null
+++ b/application-model/src/test/java/com/yahoo/vespa/archive/ArchiveStreamReaderTest.java
@@ -0,0 +1,131 @@
+package com.yahoo.vespa.archive;
+
+import com.yahoo.vespa.archive.ArchiveStreamReader.Options;
+import com.yahoo.yolean.Exceptions;
+import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
+import org.apache.commons.compress.archivers.zip.ZipArchiveOutputStream;
+import org.junit.jupiter.api.Test;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UncheckedIOException;
+import java.nio.charset.StandardCharsets;
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertTrue;
+import static org.junit.jupiter.api.Assertions.fail;
+
+/**
+ * @author mpolden
+ */
+class ArchiveStreamReaderTest {
+
+    @Test
+    void reading() {
+        Map<String, String> zipFiles = Map.of("foo", "contents of foo",
+                                              "bar", "contents of bar",
+                                              "baz", "0".repeat(2049));
+        Map<String, String> zipContents = new HashMap<>(zipFiles);
+        zipContents.put("dir/", ""); // Directories are always ignored
+        Map<String, String> extracted = readAll(zip(zipContents), Options.standard());
+        assertEquals(zipFiles, extracted);
+    }
+
+    @Test
+    void entry_size_limit() {
+        Map<String, String> entries = Map.of("foo.xml", "foobar");
+        Options options = Options.standard().pathPredicate("foo.xml"::equals).maxEntrySize(1);
+        try {
+            readAll(zip(entries), options);
+            fail("Expected exception");
+        } catch (IllegalArgumentException ignored) {}
+
+        entries = Map.of("foo.xml", "foobar",
+                         "foo.jar", "0".repeat(100) // File not extracted and thus not subject to size limit
+        );
+        Map<String, String> extracted = readAll(zip(entries), options.maxEntrySize(10));
+        assertEquals(Map.of("foo.xml", "foobar"), extracted);
+    }
+
+    @Test
+    void size_limit() {
+        // The two entries hold 6 bytes in total, which exceeds the total size limit of 4
+        Map<String, String> entries = Map.of("foo.xml", "foo", "bar.xml", "bar");
+        try {
+            readAll(zip(entries), Options.standard().maxSize(4));
+            fail("Expected exception");
+        } catch (IllegalArgumentException ignored) {}
+    }
+
+    @Test
+    void entry_limit() {
+        // Reading the second entry exceeds the entry limit of 1
+        Map<String, String> entries = Map.of("foo.xml", "foo", "bar.xml", "bar");
+        try {
+            readAll(zip(entries), Options.standard().maxEntries(1));
+            fail("Expected exception");
+        } catch (IllegalArgumentException ignored) {}
+    }
+
+    @Test
+    void paths() {
+        Map<String, Boolean> tests = Map.of(
+                "../../services.xml", true,
+                "/../.././services.xml", true,
+                "./application/././services.xml", true,
+                "application//services.xml", true,
+                "artifacts/", false, // empty dir
+                "services..xml", false,
+                "application/services.xml", false,
+                "components/foo-bar-deploy.jar", false,
+                "services.xml", false
+        );
+
+        Options options = Options.standard().maxEntrySize(1024);
+        tests.forEach((name, expectException) -> {
+            try {
+                readAll(zip(Map.of(name, "foo")), options.pathPredicate(name::equals));
+                assertFalse(expectException, "Expected exception for '" + name + "'");
+            } catch (IllegalArgumentException ignored) {
+                assertTrue(expectException, "Unexpected exception for '" + name + "'");
+            }
+        });
+    }
+
+    /** Reads all entries of the given ZIP stream into a map from entry path to UTF-8 decoded content, closing the reader */
+    private static Map<String, String> readAll(InputStream inputStream, Options options) {
+        Map<String, String> entries = new HashMap<>();
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (ArchiveStreamReader reader = ArchiveStreamReader.ofZip(inputStream, options)) {
+            for (var file = reader.readNextTo(baos); file != null; file = reader.readNextTo(baos)) {
+                entries.put(file.path().toString(), baos.toString(StandardCharsets.UTF_8));
+                baos.reset();
+            }
+        }
+        return entries;
+    }
+
+    /** Creates an in-memory ZIP archive of the given entries, mapping entry name to UTF-8 encoded content */
+    private static InputStream zip(Map<String, String> entries) {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        try (ZipArchiveOutputStream archiveOutputStream = new ZipArchiveOutputStream(baos)) {
+            for (var kv : entries.entrySet()) {
+                String entryName = kv.getKey();
+                String contents = kv.getValue();
+                ZipArchiveEntry entry = new ZipArchiveEntry(entryName);
+                archiveOutputStream.putArchiveEntry(entry);
+                archiveOutputStream.write(contents.getBytes(StandardCharsets.UTF_8));
+                archiveOutputStream.closeArchiveEntry();
+            }
+        } catch (IOException e) {
+            throw new UncheckedIOException(e);
+        }
+        return new ByteArrayInputStream(baos.toByteArray());
+    }
+
+}
author	Henning Baldersheim <balder@yahoo-inc.com>	2022-11-26 16:04:20 +0100
committer	Henning Baldersheim <balder@yahoo-inc.com>	2022-11-26 17:26:52 +0100
commit	9b11ce7f3b9080c6c3e640e694b253a1122e8bfa (patch)
tree	ff388da577a44e69966fd9c0d855d7acccc9bbba /application-model
parent	46a1ed45abb3f7635069ff07d9e046406fe1062f (diff)