Add a codec that enables conversion to and from a base N representation

Adds a codec that enables easy conversion from an array of bytes to any numeric base in [2, 256) and back again, using a supplied custom alphabet. Implemented by treating the input byte sequence to encode verbatim as a big-endian `BigInteger` and iteratively doing a `divmod` operation until the quotient is zero, emitting the modulus mapped onto the alphabet for each iteration. Decoding reverses this process, ending up with the same `BigInteger` as in the initial encoding step.
author: Tor Brede Vekterli <vekterli@yahooinc.com> 2022-11-08 13:10:23 +0100
committer: Tor Brede Vekterli <vekterli@yahooinc.com> 2022-11-08 13:42:00 +0100
commit: fc1c56f2e080a35bbc34a27d2cabc53f8ea171b0 (patch)
tree: 21d96d4c17ef656c9af5df48a279abc1faf61312 /security-utils
parent: 247657a8e00c05b5ff3fe9d15ca258f41cedd597 (diff)
4 files changed, 316 insertions, 0 deletions
diff --git a/security-utils/src/main/java/com/yahoo/security/Base58.java b/security-utils/src/main/java/com/yahoo/security/Base58.java
new file mode 100644
index 00000000000..3010bc878a8
--- /dev/null
+++ b/security-utils/src/main/java/com/yahoo/security/Base58.java
@@ -0,0 +1,22 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.security;
+
+/**
+ * Base58 encoding using the alphabet standardized by Bitcoin et al., which avoids
+ * the use of characters [0OIl] to avoid visual ambiguity. It does not feature any
+ * potential word/line-breaking characters, which means encoded strings can usually
+ * be selected in one go on web pages or in the terminal.
+ *
+ * @see <a href="https://en.wikipedia.org/wiki/Base58">Base58 on Wiki</a>
+ *
+ * @author vekterli
+ */
+public class Base58 {
+
+    private static final BaseNCodec INSTANCE = BaseNCodec.of("123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz");
+
+    public static BaseNCodec codec() {
+        return INSTANCE;
+    }
+
+}
diff --git a/security-utils/src/main/java/com/yahoo/security/Base62.java b/security-utils/src/main/java/com/yahoo/security/Base62.java
new file mode 100644
index 00000000000..86c60a1bb1d
--- /dev/null
+++ b/security-utils/src/main/java/com/yahoo/security/Base62.java
@@ -0,0 +1,21 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.security;
+
+/**
+ * Base62 encoding which has the nice property that it does not feature any
+ * potential word/line-breaking characters, which means encoded strings can
+ * usually be selected in one go on web pages or in the terminal.
+ *
+ * @see <a href="https://en.wikipedia.org/wiki/Base62">Base62 on Wiki</a>
+ *
+ * @author vekterli
+ */
+public class Base62 {
+
+    private static final BaseNCodec INSTANCE = BaseNCodec.of("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+
+    public static BaseNCodec codec() {
+        return INSTANCE;
+    }
+
+}
diff --git a/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java b/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java
new file mode 100644
index 00000000000..0921f238460
--- /dev/null
+++ b/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java
@@ -0,0 +1,151 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.security;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+/**
+ * <p>
+ * Codec that enables easy conversion from an array of bytes to any numeric base in [2, 256)
+ * and back again, using a supplied custom alphabet.
+ * </p>
+ * <p>
+ * Implemented by treating the input byte sequence to encode verbatim as a big-endian
+ * <code>BigInteger</code> and iteratively doing a <code>divmod</code> operation until
+ * the quotient is zero, emitting the modulus mapped onto the alphabet for each iteration.
+ * </p>
+ * <p>
+ * Decoding reverses this process, ending up with the same <code>BigInteger</code> as in
+ * the initial encoding step.
+ * </p>
+ * <p>
+ * Note that <code>BigInteger</code>s represent the <em>canonical</em> form of any given
+ * integer, which means that leading zero bytes are implicitly ignored. We therefore
+ * special-case this by unary-coding the number of leading zeroes in the encoded form,
+ * where a leading zero byte is mapped to the <em>first</em> character of the alphabet.
+ * </p>
+ * <p>Example for Base58, which starts its alphabet with 1 (0 is not present):</p>
+ * <pre>
+ *   "Hello World!"     = "2NEpo7TZRRrLZSi2U"
+ *   "\0\0Hello World!" = "112NEpo7TZRRrLZSi2U" (note leading 1s)
+ * </pre>
+ * <p>Example for Base62, which starts its alphabet with 0:</p>
+ * <pre>
+ *   "Hello World!"     = "T8dgcjRGkZ3aysdN"
+ *   "\0\0Hello World!" = "00T8dgcjRGkZ3aysdN" (node leading 0s)
+ * </pre>
+ * <p>
+ * <strong>Important:</strong> runtime complexity is <em>O(n<sup>2</sup>)</em> for both
+ * encoding and decoding, so this should only be used to encode/decode relatively short
+ * byte sequences. This is <em>not</em> a replacement for Base64 etc. encoding that runs
+ * in linear time! In addition, a <code>BaseNCodec</code> with a Base64 alphabet encodes
+ * to a completely different output than a regular Base64 encoder when the input is not
+ * evenly divisible by three. This is due to regular Base64 explicitly handling padding,
+ * while this codec does not.
+ * </p>
+ *
+ * @author vekterli
+ */
+public class BaseNCodec {
+
+    public static final int MAX_BASE = 255; /** Inclusive */
+
+    private static class Alphabet {
+        final char[] alphabetChars;
+        final int[]  reverseLut;
+
+        Alphabet(String alphabetIn) {
+            if (alphabetIn.length() < 2) { // We don't do unary...
+                throw new IllegalArgumentException("Alphabet requires at least two symbols");
+            }
+            if (alphabetIn.length() > MAX_BASE) {
+                throw new IllegalArgumentException("Alphabet size too large");
+            }
+            alphabetChars = alphabetIn.toCharArray();
+            int highestChar = Integer.MIN_VALUE;
+            for (char ch : alphabetChars) {
+                highestChar = Math.max(highestChar, ch);
+            }
+            reverseLut = new int[highestChar + 1];
+            Arrays.fill(reverseLut, -1); // -1 => invalid mapping
+            for (int i = 0; i < alphabetChars.length; ++i) {
+                if (reverseLut[alphabetChars[i]] != -1) {
+                    throw new IllegalArgumentException("Alphabet character '%c' occurs more than once"
+                                                       .formatted(alphabetChars[i]));
+                }
+                reverseLut[alphabetChars[i]] = i;
+            }
+        }
+    }
+
+    private static final BigInteger BN_ZERO = BigInteger.valueOf(0);
+
+    private final Alphabet   alphabet;
+    private final BigInteger alphabetLenBN;
+
+    private BaseNCodec(String alphabet) {
+        this.alphabet = new Alphabet(alphabet);
+        this.alphabetLenBN = BigInteger.valueOf(this.alphabet.alphabetChars.length);
+    }
+
+    public static BaseNCodec of(String alphabet) {
+        return new BaseNCodec(alphabet);
+    }
+
+    public int base() { return this.alphabet.alphabetChars.length; }
+
+    public String encode(byte[] input) {
+        var sb  = new StringBuilder(input.length * 2); // Not at all exact, but builder can resize anyway
+        var num = new BigInteger(1, input); // Treat as _positive_ big endian bigint (explicit signum=1)
+        // Standard base N digit conversion loop. Note: emits in reverse order since we
+        // append the least significant digit first. We reverse this later on.
+        while (!num.equals(BN_ZERO)) {
+            BigInteger[] quotRem = num.divideAndRemainder(alphabetLenBN);
+            num = quotRem[0];
+            sb.append(alphabet.alphabetChars[quotRem[1].intValue()]);
+        }
+        for (byte leadingByte : input) {
+            if (leadingByte != 0x00) {
+                break;
+            }
+            sb.append(alphabet.alphabetChars[0]);
+        }
+        return sb.reverse().toString();
+    }
+
+    public byte[] decode(String input) {
+        char[] inputChars = input.toCharArray();
+        int prefixNulls = 0;
+        for (char leadingChar : inputChars) {
+            if (leadingChar != alphabet.alphabetChars[0]) {
+                break;
+            }
+            ++prefixNulls;
+        }
+        // Restore the BigInteger representation by reversing the base conversion done during encoding.
+        var accu = BN_ZERO;
+        for (char c : inputChars) {
+            int idx = (c < alphabet.reverseLut.length) ? alphabet.reverseLut[c] : -1;
+            if (idx == -1) {
+                throw new IllegalArgumentException("Input character not part of codec alphabet");
+            }
+            accu = accu.multiply(alphabetLenBN).add(BigInteger.valueOf(idx));
+        }
+        byte[] bnBytes = accu.toByteArray();
+        // If the most significant bigint byte is zero, it means the most significant bit of the
+        // next byte is 1 (or the bnBytes length is 1, in which case prefixNulls == 1) and the bigint
+        // representation uses 1 extra byte to be positive in 2's complement. If so, prune it away
+        // to avoid prefixing with a spurious null-byte.
+        boolean msbZero = (bnBytes[0] == 0x0);
+        if (prefixNulls == 0 && !msbZero) {
+            return bnBytes;
+        } else {
+            int realLen = (msbZero ? bnBytes.length - 1 : bnBytes.length);
+            byte[] result = new byte[prefixNulls + realLen];
+            // #prefixNulls prefix bytes are implicitly zero
+            System.arraycopy(bnBytes, (msbZero ? 1 : 0), result, prefixNulls, realLen);
+            return result;
+        }
+    }
+
+}
diff --git a/security-utils/src/test/java/com/yahoo/security/BaseNCodecTest.java b/security-utils/src/test/java/com/yahoo/security/BaseNCodecTest.java
new file mode 100644
index 00000000000..da67ea2dff3
--- /dev/null
+++ b/security-utils/src/test/java/com/yahoo/security/BaseNCodecTest.java
@@ -0,0 +1,122 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.security;
+
+import org.junit.jupiter.api.Test;
+
+import java.math.BigInteger;
+
+import static com.yahoo.security.ArrayUtils.hex;
+import static com.yahoo.security.ArrayUtils.toUtf8Bytes;
+import static com.yahoo.security.ArrayUtils.unhex;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+/**
+ * @author vekterli
+ */
+public class BaseNCodecTest {
+
+    private static void verifyRoundtrip(BaseNCodec codec, byte[] bytes, String expectedEncoded) {
+        String enc = codec.encode(bytes);
+        assertEquals(expectedEncoded, enc);
+        byte[] dec = codec.decode(enc);
+        assertEquals(hex(bytes), hex(dec));
+    }
+
+    private static void verifyRoundtrip(BaseNCodec codec, String str, String expectedEncoded) {
+        verifyRoundtrip(codec, toUtf8Bytes(str), expectedEncoded);
+    }
+
+    @Test
+    void decoding_chars_not_in_alphabet_throws() {
+        var b58 = Base58.codec();
+        // [0OIl] are not in Base58 alphabet, but within the alphabet LUT range
+        assertThrows(IllegalArgumentException.class, () -> b58.decode("233QC0"));
+        // '{' is one beyond 'z', which is the highest char in the LUT range
+        assertThrows(IllegalArgumentException.class, () -> b58.decode("233QC{"));
+    }
+
+    @Test
+    void alphabet_char_duplication_during_codec_setup_throws() {
+        assertThrows(IllegalArgumentException.class, () -> BaseNCodec.of("abcda"));
+    }
+
+    @Test
+    void base58_codec_test_cases_pass() {
+        var b58 = Base58.codec();
+        assertEquals(58, b58.base());
+        // https://datatracker.ietf.org/doc/html/draft-msporny-base58-03 test vectors:
+        verifyRoundtrip(b58, "Hello World!", "2NEpo7TZRRrLZSi2U");
+        verifyRoundtrip(b58, "The quick brown fox jumps over the lazy dog.",
+                "USm3fpXnKG5EUBx2ndxBDMPVciP5hGey2Jh4NDv6gmeo1LkMeiKrLJUUBk6Z");
+        verifyRoundtrip(b58, unhex("0000287fb4cd"), "11233QC4");
+
+        // Values that have been cross-referenced with other encoder implementations:
+        verifyRoundtrip(b58, "", "");
+        verifyRoundtrip(b58, unhex("00"), "1");
+        verifyRoundtrip(b58, unhex("0000"), "11");
+        verifyRoundtrip(b58, unhex("ff"), "5Q");
+        verifyRoundtrip(b58, unhex("00ff"), "15Q");
+        verifyRoundtrip(b58, unhex("ff00"), "LQX");
+        verifyRoundtrip(b58, unhex("ffffff"), "2UzHL");
+        verifyRoundtrip(b58, unhex("287fb4cd"), "233QC4");
+    }
+
+    @Test
+    void base62_codec_test_cases_pass() {
+        var b62 = Base62.codec();
+        assertEquals(62, b62.base());
+        verifyRoundtrip(b62, "Hello World!", "T8dgcjRGkZ3aysdN");
+        verifyRoundtrip(b62, "\0\0Hello World!", "00T8dgcjRGkZ3aysdN");
+        verifyRoundtrip(b62, "", "");
+        verifyRoundtrip(b62, unhex("00"), "0");
+        verifyRoundtrip(b62, unhex("0000"), "00");
+        verifyRoundtrip(b62, unhex("00000000ffffffff"), "00004gfFC3");
+        verifyRoundtrip(b62, unhex("ffffffff00000000"), "LygHZwPV2MC");
+    }
+
+    // Test with some common bases that are easier to reason about:
+
+    @Test
+    void codec_generalizes_down_to_base_10() {
+        var b10 = BaseNCodec.of("0123456789");
+        verifyRoundtrip(b10, unhex("00"), "0");
+        verifyRoundtrip(b10, unhex("000f"), "015");
+        verifyRoundtrip(b10, unhex("ffff"), "65535");
+
+        // A large prime number: 2^252 + 27742317777372353535851937790883648493 (Curve25519 order)
+        var numStr = "7237005577332262213973186563042994240857116359379907606001950938285454250989";
+        var numBN = new BigInteger(numStr);
+        verifyRoundtrip(b10, numBN.toByteArray(), numStr);
+    }
+
+    // Possibly world's most inefficient hex conversion?
+    @Test
+    void codec_generalizes_down_to_base_16() {
+        var b2 = BaseNCodec.of("0123456789ABCDEF");
+        assertEquals(16, b2.base());
+        verifyRoundtrip(b2, unhex(""), "");
+        verifyRoundtrip(b2, unhex("00"), "0");
+        verifyRoundtrip(b2, unhex("80"), "80");
+        verifyRoundtrip(b2, unhex("01"), "1");
+        verifyRoundtrip(b2, unhex("F0"), "F0");
+        verifyRoundtrip(b2, unhex("0F"), "F");
+        verifyRoundtrip(b2, unhex("F00F"), "F00F");
+        verifyRoundtrip(b2, unhex("5FAF"), "5FAF");
+    }
+
+    // Very likely genuinely the world's most inefficient binary conversion.
+    @Test
+    void codec_generalizes_down_to_base_2() {
+        var b2 = BaseNCodec.of("01");
+        assertEquals(2, b2.base());
+        verifyRoundtrip(b2, unhex(""), "");
+        verifyRoundtrip(b2, unhex("00"), "0");
+        verifyRoundtrip(b2, unhex("000000"), "000"); // note: prefix zero byte sentinels!
+        verifyRoundtrip(b2, unhex("80"), "10000000");
+        verifyRoundtrip(b2, unhex("01"), "1");
+        verifyRoundtrip(b2, unhex("F0"), "11110000");
+        verifyRoundtrip(b2, unhex("0F"), "1111");
+    }
+
+}
author	Tor Brede Vekterli <vekterli@yahooinc.com>	2022-11-08 13:10:23 +0100
committer	Tor Brede Vekterli <vekterli@yahooinc.com>	2022-11-08 13:42:00 +0100
commit	fc1c56f2e080a35bbc34a27d2cabc53f8ea171b0 (patch)
tree	21d96d4c17ef656c9af5df48a279abc1faf61312 /security-utils
parent	247657a8e00c05b5ff3fe9d15ca258f41cedd597 (diff)