Add a codec that enables conversion to and from a base N representation

Adds a codec that enables easy conversion from an array of bytes to any numeric base in [2, 256) and back again, using a supplied custom alphabet. It is implemented by treating the input byte sequence verbatim as a big-endian `BigInteger` and iteratively performing a `divmod` operation until the quotient is zero, emitting the remainder mapped onto the alphabet at each iteration. Decoding reverses this process, ending up with the same `BigInteger` as in the initial encoding step.
author: Tor Brede Vekterli <vekterli@yahooinc.com> 2022-11-08 13:10:23 +0100
committer: Tor Brede Vekterli <vekterli@yahooinc.com> 2022-11-08 13:42:00 +0100
commit: fc1c56f2e080a35bbc34a27d2cabc53f8ea171b0 (patch)
tree: 21d96d4c17ef656c9af5df48a279abc1faf61312 /security-utils/src/main/java
parent: 247657a8e00c05b5ff3fe9d15ca258f41cedd597 (diff)
3 files changed, 194 insertions, 0 deletions
diff --git a/security-utils/src/main/java/com/yahoo/security/Base58.java b/security-utils/src/main/java/com/yahoo/security/Base58.java
new file mode 100644
index 00000000000..3010bc878a8
--- /dev/null
+++ b/security-utils/src/main/java/com/yahoo/security/Base58.java
@@ -0,0 +1,37 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.security;
+
+/**
+ * Base58 encoding using the alphabet standardized by Bitcoin et al., which avoids
+ * the use of characters [0OIl] to avoid visual ambiguity. It does not feature any
+ * potential word/line-breaking characters, which means encoded strings can usually
+ * be selected in one go on web pages or in the terminal.
+ * <p>
+ * This is a non-instantiable holder for a single shared {@link BaseNCodec}; use
+ * {@link #codec()} to obtain it.
+ * </p>
+ *
+ * @see <a href="https://en.wikipedia.org/wiki/Base58">Base58 on Wiki</a>
+ *
+ * @author vekterli
+ */
+public final class Base58 {
+
+    /** The 58 symbols in digit-value order; '1' represents digit zero. */
+    private static final String ALPHABET = "123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz";
+
+    /** Created once at class initialization and handed out by {@link #codec()}. */
+    private static final BaseNCodec INSTANCE = BaseNCodec.of(ALPHABET);
+
+    private Base58() {} // Static holder only; never instantiated
+
+    /**
+     * Returns the shared Base58 codec.
+     *
+     * @return the same codec instance on every call
+     */
+    public static BaseNCodec codec() {
+        return INSTANCE;
+    }
+
+}
diff --git a/security-utils/src/main/java/com/yahoo/security/Base62.java b/security-utils/src/main/java/com/yahoo/security/Base62.java
new file mode 100644
index 00000000000..86c60a1bb1d
--- /dev/null
+++ b/security-utils/src/main/java/com/yahoo/security/Base62.java
@@ -0,0 +1,36 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.security;
+
+/**
+ * Base62 encoding which has the nice property that it does not feature any
+ * potential word/line-breaking characters, which means encoded strings can
+ * usually be selected in one go on web pages or in the terminal.
+ * <p>
+ * This is a non-instantiable holder for a single shared {@link BaseNCodec}; use
+ * {@link #codec()} to obtain it.
+ * </p>
+ *
+ * @see <a href="https://en.wikipedia.org/wiki/Base62">Base62 on Wiki</a>
+ *
+ * @author vekterli
+ */
+public final class Base62 {
+
+    /** The 62 symbols in digit-value order; '0' represents digit zero. */
+    private static final String ALPHABET = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
+
+    /** Created once at class initialization and handed out by {@link #codec()}. */
+    private static final BaseNCodec INSTANCE = BaseNCodec.of(ALPHABET);
+
+    private Base62() {} // Static holder only; never instantiated
+
+    /**
+     * Returns the shared Base62 codec.
+     *
+     * @return the same codec instance on every call
+     */
+    public static BaseNCodec codec() {
+        return INSTANCE;
+    }
+
+}
diff --git a/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java b/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java
new file mode 100644
index 00000000000..0921f238460
--- /dev/null
+++ b/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java
@@ -0,0 +1,179 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.security;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+/**
+ * <p>
+ * Codec for converting byte arrays to and from a textual representation in any numeric
+ * base in [2, 256), using a caller-supplied alphabet.
+ * </p>
+ * <p>
+ * Encoding interprets the input bytes verbatim as an unsigned, big-endian
+ * <code>BigInteger</code> and repeatedly divides it by the base until the quotient is
+ * zero; every remainder is emitted as the alphabet symbol at that index. Decoding runs
+ * the conversion in reverse and arrives at the same <code>BigInteger</code>.
+ * </p>
+ * <p>
+ * A <code>BigInteger</code> only holds the <em>canonical</em> form of an integer, so
+ * leading zero bytes of the input would be lost. They are therefore unary-coded in the
+ * encoded form: each leading zero byte becomes one occurrence of the <em>first</em>
+ * character of the alphabet.
+ * </p>
+ * <p>Example for Base58, which starts its alphabet with 1 (0 is not present):</p>
+ * <pre>
+ *   "Hello World!"     = "2NEpo7TZRRrLZSi2U"
+ *   "\0\0Hello World!" = "112NEpo7TZRRrLZSi2U" (note leading 1s)
+ * </pre>
+ * <p>Example for Base62, which starts its alphabet with 0:</p>
+ * <pre>
+ *   "Hello World!"     = "T8dgcjRGkZ3aysdN"
+ *   "\0\0Hello World!" = "00T8dgcjRGkZ3aysdN" (note leading 0s)
+ * </pre>
+ * <p>
+ * <strong>Important:</strong> runtime complexity is <em>O(n<sup>2</sup>)</em> for both
+ * encoding and decoding, so this should only be used for relatively short byte
+ * sequences. It is <em>not</em> a replacement for Base64 and similar encodings that run
+ * in linear time! Also note that a <code>BaseNCodec</code> built on a Base64 alphabet
+ * produces completely different output than a regular Base64 encoder when the input
+ * length is not evenly divisible by three, since regular Base64 handles padding
+ * explicitly and this codec does not.
+ * </p>
+ *
+ * @author vekterli
+ */
+public class BaseNCodec {
+
+    /** Largest supported base (inclusive). */
+    public static final int MAX_BASE = 255;
+
+    /** Symbol table of a codec: digit value to symbol, and symbol back to digit value. */
+    private static class Alphabet {
+        final char[] symbols;     // Indexed by digit value
+        final int[]  digitByChar; // Indexed by char code; -1 => not part of the alphabet
+
+        Alphabet(String alphabetIn) {
+            int symbolCount = alphabetIn.length();
+            if (symbolCount < 2) { // Unary is not supported
+                throw new IllegalArgumentException("Alphabet requires at least two symbols");
+            }
+            if (symbolCount > MAX_BASE) {
+                throw new IllegalArgumentException("Alphabet size too large");
+            }
+            symbols = alphabetIn.toCharArray();
+            // The reverse table only needs to reach the highest char code in use.
+            int highestCode = 0;
+            for (char symbol : symbols) {
+                if (symbol > highestCode) {
+                    highestCode = symbol;
+                }
+            }
+            digitByChar = new int[highestCode + 1];
+            Arrays.fill(digitByChar, -1);
+            int digit = 0;
+            for (char symbol : symbols) {
+                if (digitByChar[symbol] >= 0) { // Slot already claimed by an earlier symbol
+                    throw new IllegalArgumentException("Alphabet character '%c' occurs more than once"
+                                                       .formatted(symbol));
+                }
+                digitByChar[symbol] = digit++;
+            }
+        }
+
+        /** Returns the digit value of <code>c</code>, or -1 if it is not in the alphabet. */
+        int digitOf(char c) {
+            return (c < digitByChar.length) ? digitByChar[c] : -1;
+        }
+    }
+
+    private final Alphabet   alphabet;
+    private final BigInteger alphabetLenBN;
+
+    private BaseNCodec(String alphabet) {
+        this.alphabet = new Alphabet(alphabet);
+        this.alphabetLenBN = BigInteger.valueOf(this.alphabet.symbols.length);
+    }
+
+    /**
+     * Creates a codec whose base equals the number of characters in <code>alphabet</code>.
+     *
+     * @param alphabet the symbols to use, ordered by digit value; the first character
+     *                 represents digit zero
+     * @return a new codec for the given alphabet
+     * @throws IllegalArgumentException if the alphabet has fewer than 2 or more than
+     *         {@link #MAX_BASE} characters, or contains the same character twice
+     */
+    public static BaseNCodec of(String alphabet) {
+        return new BaseNCodec(alphabet);
+    }
+
+    /** Returns the numeric base of this codec, i.e. the size of its alphabet. */
+    public int base() {
+        return alphabet.symbols.length;
+    }
+
+    /**
+     * Encodes <code>input</code> using this codec's alphabet.
+     *
+     * @param input the bytes to encode; may be empty
+     * @return the encoded string, most significant digit first
+     */
+    public String encode(byte[] input) {
+        var reversed  = new StringBuilder(input.length * 2); // Only a rough capacity guess
+        var remaining = new BigInteger(1, input); // signum=1 forces a non-negative interpretation
+        // Plain base conversion. Digits come out least significant first, hence the
+        // reversal at the very end.
+        while (remaining.signum() != 0) {
+            BigInteger[] quotientAndRemainder = remaining.divideAndRemainder(alphabetLenBN);
+            reversed.append(alphabet.symbols[quotientAndRemainder[1].intValue()]);
+            remaining = quotientAndRemainder[0];
+        }
+        // Unary-code the leading zero bytes; appended last so that they end up first.
+        char zeroSymbol = alphabet.symbols[0];
+        for (int i = 0; i < input.length && input[i] == 0x00; ++i) {
+            reversed.append(zeroSymbol);
+        }
+        return reversed.reverse().toString();
+    }
+
+    /**
+     * Decodes a string produced by {@link #encode(byte[])} with the same alphabet.
+     *
+     * @param input the encoded string; may be empty
+     * @return the decoded bytes, including any unary-coded leading zero bytes
+     * @throws IllegalArgumentException if <code>input</code> contains a character that is
+     *         not part of this codec's alphabet
+     */
+    public byte[] decode(String input) {
+        char[] encoded = input.toCharArray();
+        char zeroSymbol = alphabet.symbols[0];
+        int prefixNulls = 0;
+        while (prefixNulls < encoded.length && encoded[prefixNulls] == zeroSymbol) {
+            ++prefixNulls;
+        }
+        // Rebuild the BigInteger by undoing the base conversion, most significant digit first.
+        BigInteger value = BigInteger.ZERO;
+        for (char c : encoded) {
+            int digit = alphabet.digitOf(c);
+            if (digit < 0) {
+                throw new IllegalArgumentException("Input character not part of codec alphabet");
+            }
+            value = value.multiply(alphabetLenBN).add(BigInteger.valueOf(digit));
+        }
+        byte[] valueBytes = value.toByteArray();
+        // toByteArray() is two's complement, so a non-negative value starts with a 0x00
+        // byte when it is zero (the array is then just that byte) or when the top bit of
+        // its first magnitude byte is set. That byte carries no payload; leading zero
+        // bytes of the payload are accounted for by prefixNulls alone.
+        int signByteLen = (valueBytes[0] == 0x00) ? 1 : 0;
+        if (prefixNulls == 0 && signByteLen == 0) {
+            return valueBytes; // Already exactly the payload
+        }
+        int payloadLen = valueBytes.length - signByteLen;
+        byte[] result = new byte[prefixNulls + payloadLen]; // Prefix bytes are zero by default
+        System.arraycopy(valueBytes, signByteLen, result, prefixNulls, payloadLen);
+        return result;
+    }
+
+}
author	Tor Brede Vekterli <vekterli@yahooinc.com>	2022-11-08 13:10:23 +0100
committer	Tor Brede Vekterli <vekterli@yahooinc.com>	2022-11-08 13:42:00 +0100
commit	fc1c56f2e080a35bbc34a27d2cabc53f8ea171b0 (patch)
tree	21d96d4c17ef656c9af5df48a279abc1faf61312 /security-utils/src/main/java
parent	247657a8e00c05b5ff3fe9d15ca258f41cedd597 (diff)