diff options
author | Tor Brede Vekterli <vekterli@yahooinc.com> | 2022-11-08 13:10:23 +0100 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@yahooinc.com> | 2022-11-08 13:42:00 +0100 |
commit | fc1c56f2e080a35bbc34a27d2cabc53f8ea171b0 (patch) | |
tree | 21d96d4c17ef656c9af5df48a279abc1faf61312 /security-utils/src/main/java | |
parent | 247657a8e00c05b5ff3fe9d15ca258f41cedd597 (diff) |
Add a codec that enables conversion to and from a base N representation
Adds a codec that enables easy conversion from an array of bytes to any
numeric base in [2, 256) and back again, using a supplied custom alphabet.
Implemented by treating the input byte sequence to encode verbatim as a
big-endian `BigInteger` and iteratively doing a `divmod` operation until
the quotient is zero, emitting the modulus mapped onto the alphabet for
each iteration.
Decoding reverses this process, ending up with the same `BigInteger` as
in the initial encoding step.
Diffstat (limited to 'security-utils/src/main/java')
3 files changed, 194 insertions, 0 deletions
diff --git a/security-utils/src/main/java/com/yahoo/security/Base58.java b/security-utils/src/main/java/com/yahoo/security/Base58.java new file mode 100644 index 00000000000..3010bc878a8 --- /dev/null +++ b/security-utils/src/main/java/com/yahoo/security/Base58.java @@ -0,0 +1,22 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.security; + +/** + * Base58 encoding using the alphabet standardized by Bitcoin et al., which avoids + * the use of characters [0OIl] to avoid visual ambiguity. It does not feature any + * potential word/line-breaking characters, which means encoded strings can usually + * be selected in one go on web pages or in the terminal. + * + * @see <a href="https://en.wikipedia.org/wiki/Base58">Base58 on Wiki</a> + * + * @author vekterli + */ +public class Base58 { + + private static final BaseNCodec INSTANCE = BaseNCodec.of("123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"); + + public static BaseNCodec codec() { + return INSTANCE; + } + +} diff --git a/security-utils/src/main/java/com/yahoo/security/Base62.java b/security-utils/src/main/java/com/yahoo/security/Base62.java new file mode 100644 index 00000000000..86c60a1bb1d --- /dev/null +++ b/security-utils/src/main/java/com/yahoo/security/Base62.java @@ -0,0 +1,21 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.security; + +/** + * Base62 encoding which has the nice property that it does not feature any + * potential word/line-breaking characters, which means encoded strings can + * usually be selected in one go on web pages or in the terminal. + * + * @see <a href="https://en.wikipedia.org/wiki/Base62">Base62 on Wiki</a> + * + * @author vekterli + */ +public class Base62 { + + private static final BaseNCodec INSTANCE = BaseNCodec.of("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + public static BaseNCodec codec() { + return INSTANCE; + } + +} diff --git a/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java b/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java new file mode 100644 index 00000000000..0921f238460 --- /dev/null +++ b/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java @@ -0,0 +1,151 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.security; + +import java.math.BigInteger; +import java.util.Arrays; + +/** + * <p> + * Codec that enables easy conversion from an array of bytes to any numeric base in [2, 256) + * and back again, using a supplied custom alphabet. + * </p> + * <p> + * Implemented by treating the input byte sequence to encode verbatim as a big-endian + * <code>BigInteger</code> and iteratively doing a <code>divmod</code> operation until + * the quotient is zero, emitting the modulus mapped onto the alphabet for each iteration. + * </p> + * <p> + * Decoding reverses this process, ending up with the same <code>BigInteger</code> as in + * the initial encoding step. + * </p> + * <p> + * Note that <code>BigInteger</code>s represent the <em>canonical</em> form of any given + * integer, which means that leading zero bytes are implicitly ignored. We therefore + * special-case this by unary-coding the number of leading zeroes in the encoded form, + * where a leading zero byte is mapped to the <em>first</em> character of the alphabet. + * </p> + * <p>Example for Base58, which starts its alphabet with 1 (0 is not present):</p> + * <pre> + * "Hello World!" = "2NEpo7TZRRrLZSi2U" + * "\0\0Hello World!" = "112NEpo7TZRRrLZSi2U" (note leading 1s) + * </pre> + * <p>Example for Base62, which starts its alphabet with 0:</p> + * <pre> + * "Hello World!" = "T8dgcjRGkZ3aysdN" + * "\0\0Hello World!" = "00T8dgcjRGkZ3aysdN" (node leading 0s) + * </pre> + * <p> + * <strong>Important:</strong> runtime complexity is <em>O(n<sup>2</sup>)</em> for both + * encoding and decoding, so this should only be used to encode/decode relatively short + * byte sequences. This is <em>not</em> a replacement for Base64 etc. encoding that runs + * in linear time! In addition, a <code>BaseNCodec</code> with a Base64 alphabet encodes + * to a completely different output than a regular Base64 encoder when the input is not + * evenly divisible by three. This is due to regular Base64 explicitly handling padding, + * while this codec does not. + * </p> + * + * @author vekterli + */ +public class BaseNCodec { + + public static final int MAX_BASE = 255; /** Inclusive */ + + private static class Alphabet { + final char[] alphabetChars; + final int[] reverseLut; + + Alphabet(String alphabetIn) { + if (alphabetIn.length() < 2) { // We don't do unary... + throw new IllegalArgumentException("Alphabet requires at least two symbols"); + } + if (alphabetIn.length() > MAX_BASE) { + throw new IllegalArgumentException("Alphabet size too large"); + } + alphabetChars = alphabetIn.toCharArray(); + int highestChar = Integer.MIN_VALUE; + for (char ch : alphabetChars) { + highestChar = Math.max(highestChar, ch); + } + reverseLut = new int[highestChar + 1]; + Arrays.fill(reverseLut, -1); // -1 => invalid mapping + for (int i = 0; i < alphabetChars.length; ++i) { + if (reverseLut[alphabetChars[i]] != -1) { + throw new IllegalArgumentException("Alphabet character '%c' occurs more than once" + .formatted(alphabetChars[i])); + } + reverseLut[alphabetChars[i]] = i; + } + } + } + + private static final BigInteger BN_ZERO = BigInteger.valueOf(0); + + private final Alphabet alphabet; + private final BigInteger alphabetLenBN; + + private BaseNCodec(String alphabet) { + this.alphabet = new Alphabet(alphabet); + this.alphabetLenBN = BigInteger.valueOf(this.alphabet.alphabetChars.length); + } + + public static BaseNCodec of(String alphabet) { + return new BaseNCodec(alphabet); + } + + public int base() { return this.alphabet.alphabetChars.length; } + + public String encode(byte[] input) { + var sb = new StringBuilder(input.length * 2); // Not at all exact, but builder can resize anyway + var num = new BigInteger(1, input); // Treat as _positive_ big endian bigint (explicit signum=1) + // Standard base N digit conversion loop. Note: emits in reverse order since we + // append the least significant digit first. We reverse this later on. + while (!num.equals(BN_ZERO)) { + BigInteger[] quotRem = num.divideAndRemainder(alphabetLenBN); + num = quotRem[0]; + sb.append(alphabet.alphabetChars[quotRem[1].intValue()]); + } + for (byte leadingByte : input) { + if (leadingByte != 0x00) { + break; + } + sb.append(alphabet.alphabetChars[0]); + } + return sb.reverse().toString(); + } + + public byte[] decode(String input) { + char[] inputChars = input.toCharArray(); + int prefixNulls = 0; + for (char leadingChar : inputChars) { + if (leadingChar != alphabet.alphabetChars[0]) { + break; + } + ++prefixNulls; + } + // Restore the BigInteger representation by reversing the base conversion done during encoding. + var accu = BN_ZERO; + for (char c : inputChars) { + int idx = (c < alphabet.reverseLut.length) ? alphabet.reverseLut[c] : -1; + if (idx == -1) { + throw new IllegalArgumentException("Input character not part of codec alphabet"); + } + accu = accu.multiply(alphabetLenBN).add(BigInteger.valueOf(idx)); + } + byte[] bnBytes = accu.toByteArray(); + // If the most significant bigint byte is zero, it means the most significant bit of the + // next byte is 1 (or the bnBytes length is 1, in which case prefixNulls == 1) and the bigint + // representation uses 1 extra byte to be positive in 2's complement. If so, prune it away + // to avoid prefixing with a spurious null-byte. + boolean msbZero = (bnBytes[0] == 0x0); + if (prefixNulls == 0 && !msbZero) { + return bnBytes; + } else { + int realLen = (msbZero ? bnBytes.length - 1 : bnBytes.length); + byte[] result = new byte[prefixNulls + realLen]; + // #prefixNulls prefix bytes are implicitly zero + System.arraycopy(bnBytes, (msbZero ? 1 : 0), result, prefixNulls, realLen); + return result; + } + } + +} |