diff options
author | Tor Brede Vekterli <vekterli@yahooinc.com> | 2022-11-08 13:10:23 +0100 |
---|---|---|
committer | Tor Brede Vekterli <vekterli@yahooinc.com> | 2022-11-08 13:42:00 +0100 |
commit | fc1c56f2e080a35bbc34a27d2cabc53f8ea171b0 (patch) | |
tree | 21d96d4c17ef656c9af5df48a279abc1faf61312 /security-utils | |
parent | 247657a8e00c05b5ff3fe9d15ca258f41cedd597 (diff) |
Add a codec that enables conversion to and from a base N representation
Adds a codec that enables easy conversion from an array of bytes to any
numeric base in [2, 256) and back again, using a supplied custom alphabet.
Implemented by treating the input byte sequence to encode verbatim as a
big-endian `BigInteger` and iteratively doing a `divmod` operation until
the quotient is zero, emitting the modulus mapped onto the alphabet for
each iteration.
Decoding reverses this process, ending up with the same `BigInteger` as
in the initial encoding step.
Diffstat (limited to 'security-utils')
4 files changed, 316 insertions, 0 deletions
diff --git a/security-utils/src/main/java/com/yahoo/security/Base58.java b/security-utils/src/main/java/com/yahoo/security/Base58.java new file mode 100644 index 00000000000..3010bc878a8 --- /dev/null +++ b/security-utils/src/main/java/com/yahoo/security/Base58.java @@ -0,0 +1,22 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.security; + +/** + * Base58 encoding using the alphabet standardized by Bitcoin et al., which avoids + * the use of characters [0OIl] to avoid visual ambiguity. It does not feature any + * potential word/line-breaking characters, which means encoded strings can usually + * be selected in one go on web pages or in the terminal. + * + * @see <a href="https://en.wikipedia.org/wiki/Base58">Base58 on Wiki</a> + * + * @author vekterli + */ +public class Base58 { + + private static final BaseNCodec INSTANCE = BaseNCodec.of("123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz"); + + public static BaseNCodec codec() { + return INSTANCE; + } + +} diff --git a/security-utils/src/main/java/com/yahoo/security/Base62.java b/security-utils/src/main/java/com/yahoo/security/Base62.java new file mode 100644 index 00000000000..86c60a1bb1d --- /dev/null +++ b/security-utils/src/main/java/com/yahoo/security/Base62.java @@ -0,0 +1,21 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.security; + +/** + * Base62 encoding which has the nice property that it does not feature any + * potential word/line-breaking characters, which means encoded strings can + * usually be selected in one go on web pages or in the terminal. + * + * @see <a href="https://en.wikipedia.org/wiki/Base62">Base62 on Wiki</a> + * + * @author vekterli + */ +public class Base62 { + + private static final BaseNCodec INSTANCE = BaseNCodec.of("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); + + public static BaseNCodec codec() { + return INSTANCE; + } + +} diff --git a/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java b/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java new file mode 100644 index 00000000000..0921f238460 --- /dev/null +++ b/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java @@ -0,0 +1,151 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.security; + +import java.math.BigInteger; +import java.util.Arrays; + +/** + * <p> + * Codec that enables easy conversion from an array of bytes to any numeric base in [2, 256) + * and back again, using a supplied custom alphabet. + * </p> + * <p> + * Implemented by treating the input byte sequence to encode verbatim as a big-endian + * <code>BigInteger</code> and iteratively doing a <code>divmod</code> operation until + * the quotient is zero, emitting the modulus mapped onto the alphabet for each iteration. + * </p> + * <p> + * Decoding reverses this process, ending up with the same <code>BigInteger</code> as in + * the initial encoding step. + * </p> + * <p> + * Note that <code>BigInteger</code>s represent the <em>canonical</em> form of any given + * integer, which means that leading zero bytes are implicitly ignored. We therefore + * special-case this by unary-coding the number of leading zeroes in the encoded form, + * where a leading zero byte is mapped to the <em>first</em> character of the alphabet. + * </p> + * <p>Example for Base58, which starts its alphabet with 1 (0 is not present):</p> + * <pre> + * "Hello World!" = "2NEpo7TZRRrLZSi2U" + * "\0\0Hello World!" = "112NEpo7TZRRrLZSi2U" (note leading 1s) + * </pre> + * <p>Example for Base62, which starts its alphabet with 0:</p> + * <pre> + * "Hello World!" = "T8dgcjRGkZ3aysdN" + * "\0\0Hello World!" = "00T8dgcjRGkZ3aysdN" (node leading 0s) + * </pre> + * <p> + * <strong>Important:</strong> runtime complexity is <em>O(n<sup>2</sup>)</em> for both + * encoding and decoding, so this should only be used to encode/decode relatively short + * byte sequences. This is <em>not</em> a replacement for Base64 etc. encoding that runs + * in linear time! In addition, a <code>BaseNCodec</code> with a Base64 alphabet encodes + * to a completely different output than a regular Base64 encoder when the input is not + * evenly divisible by three. This is due to regular Base64 explicitly handling padding, + * while this codec does not. + * </p> + * + * @author vekterli + */ +public class BaseNCodec { + + public static final int MAX_BASE = 255; /** Inclusive */ + + private static class Alphabet { + final char[] alphabetChars; + final int[] reverseLut; + + Alphabet(String alphabetIn) { + if (alphabetIn.length() < 2) { // We don't do unary... + throw new IllegalArgumentException("Alphabet requires at least two symbols"); + } + if (alphabetIn.length() > MAX_BASE) { + throw new IllegalArgumentException("Alphabet size too large"); + } + alphabetChars = alphabetIn.toCharArray(); + int highestChar = Integer.MIN_VALUE; + for (char ch : alphabetChars) { + highestChar = Math.max(highestChar, ch); + } + reverseLut = new int[highestChar + 1]; + Arrays.fill(reverseLut, -1); // -1 => invalid mapping + for (int i = 0; i < alphabetChars.length; ++i) { + if (reverseLut[alphabetChars[i]] != -1) { + throw new IllegalArgumentException("Alphabet character '%c' occurs more than once" + .formatted(alphabetChars[i])); + } + reverseLut[alphabetChars[i]] = i; + } + } + } + + private static final BigInteger BN_ZERO = BigInteger.valueOf(0); + + private final Alphabet alphabet; + private final BigInteger alphabetLenBN; + + private BaseNCodec(String alphabet) { + this.alphabet = new Alphabet(alphabet); + this.alphabetLenBN = BigInteger.valueOf(this.alphabet.alphabetChars.length); + } + + public static BaseNCodec of(String alphabet) { + return new BaseNCodec(alphabet); + } + + public int base() { return this.alphabet.alphabetChars.length; } + + public String encode(byte[] input) { + var sb = new StringBuilder(input.length * 2); // Not at all exact, but builder can resize anyway + var num = new BigInteger(1, input); // Treat as _positive_ big endian bigint (explicit signum=1) + // Standard base N digit conversion loop. Note: emits in reverse order since we + // append the least significant digit first. We reverse this later on. + while (!num.equals(BN_ZERO)) { + BigInteger[] quotRem = num.divideAndRemainder(alphabetLenBN); + num = quotRem[0]; + sb.append(alphabet.alphabetChars[quotRem[1].intValue()]); + } + for (byte leadingByte : input) { + if (leadingByte != 0x00) { + break; + } + sb.append(alphabet.alphabetChars[0]); + } + return sb.reverse().toString(); + } + + public byte[] decode(String input) { + char[] inputChars = input.toCharArray(); + int prefixNulls = 0; + for (char leadingChar : inputChars) { + if (leadingChar != alphabet.alphabetChars[0]) { + break; + } + ++prefixNulls; + } + // Restore the BigInteger representation by reversing the base conversion done during encoding. + var accu = BN_ZERO; + for (char c : inputChars) { + int idx = (c < alphabet.reverseLut.length) ? alphabet.reverseLut[c] : -1; + if (idx == -1) { + throw new IllegalArgumentException("Input character not part of codec alphabet"); + } + accu = accu.multiply(alphabetLenBN).add(BigInteger.valueOf(idx)); + } + byte[] bnBytes = accu.toByteArray(); + // If the most significant bigint byte is zero, it means the most significant bit of the + // next byte is 1 (or the bnBytes length is 1, in which case prefixNulls == 1) and the bigint + // representation uses 1 extra byte to be positive in 2's complement. If so, prune it away + // to avoid prefixing with a spurious null-byte. + boolean msbZero = (bnBytes[0] == 0x0); + if (prefixNulls == 0 && !msbZero) { + return bnBytes; + } else { + int realLen = (msbZero ? bnBytes.length - 1 : bnBytes.length); + byte[] result = new byte[prefixNulls + realLen]; + // #prefixNulls prefix bytes are implicitly zero + System.arraycopy(bnBytes, (msbZero ? 1 : 0), result, prefixNulls, realLen); + return result; + } + } + +} diff --git a/security-utils/src/test/java/com/yahoo/security/BaseNCodecTest.java b/security-utils/src/test/java/com/yahoo/security/BaseNCodecTest.java new file mode 100644 index 00000000000..da67ea2dff3 --- /dev/null +++ b/security-utils/src/test/java/com/yahoo/security/BaseNCodecTest.java @@ -0,0 +1,122 @@ +// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root. +package com.yahoo.security; + +import org.junit.jupiter.api.Test; + +import java.math.BigInteger; + +import static com.yahoo.security.ArrayUtils.hex; +import static com.yahoo.security.ArrayUtils.toUtf8Bytes; +import static com.yahoo.security.ArrayUtils.unhex; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; + +/** + * @author vekterli + */ +public class BaseNCodecTest { + + private static void verifyRoundtrip(BaseNCodec codec, byte[] bytes, String expectedEncoded) { + String enc = codec.encode(bytes); + assertEquals(expectedEncoded, enc); + byte[] dec = codec.decode(enc); + assertEquals(hex(bytes), hex(dec)); + } + + private static void verifyRoundtrip(BaseNCodec codec, String str, String expectedEncoded) { + verifyRoundtrip(codec, toUtf8Bytes(str), expectedEncoded); + } + + @Test + void decoding_chars_not_in_alphabet_throws() { + var b58 = Base58.codec(); + // [0OIl] are not in Base58 alphabet, but within the alphabet LUT range + assertThrows(IllegalArgumentException.class, () -> b58.decode("233QC0")); + // '{' is one beyond 'z', which is the highest char in the LUT range + assertThrows(IllegalArgumentException.class, () -> b58.decode("233QC{")); + } + + @Test + void alphabet_char_duplication_during_codec_setup_throws() { + assertThrows(IllegalArgumentException.class, () -> BaseNCodec.of("abcda")); + } + + @Test + void base58_codec_test_cases_pass() { + var b58 = Base58.codec(); + assertEquals(58, b58.base()); + // https://datatracker.ietf.org/doc/html/draft-msporny-base58-03 test vectors: + verifyRoundtrip(b58, "Hello World!", "2NEpo7TZRRrLZSi2U"); + verifyRoundtrip(b58, "The quick brown fox jumps over the lazy dog.", + "USm3fpXnKG5EUBx2ndxBDMPVciP5hGey2Jh4NDv6gmeo1LkMeiKrLJUUBk6Z"); + verifyRoundtrip(b58, unhex("0000287fb4cd"), "11233QC4"); + + // Values that have been cross-referenced with other encoder implementations: + verifyRoundtrip(b58, "", ""); + verifyRoundtrip(b58, unhex("00"), "1"); + verifyRoundtrip(b58, unhex("0000"), "11"); + verifyRoundtrip(b58, unhex("ff"), "5Q"); + verifyRoundtrip(b58, unhex("00ff"), "15Q"); + verifyRoundtrip(b58, unhex("ff00"), "LQX"); + verifyRoundtrip(b58, unhex("ffffff"), "2UzHL"); + verifyRoundtrip(b58, unhex("287fb4cd"), "233QC4"); + } + + @Test + void base62_codec_test_cases_pass() { + var b62 = Base62.codec(); + assertEquals(62, b62.base()); + verifyRoundtrip(b62, "Hello World!", "T8dgcjRGkZ3aysdN"); + verifyRoundtrip(b62, "\0\0Hello World!", "00T8dgcjRGkZ3aysdN"); + verifyRoundtrip(b62, "", ""); + verifyRoundtrip(b62, unhex("00"), "0"); + verifyRoundtrip(b62, unhex("0000"), "00"); + verifyRoundtrip(b62, unhex("00000000ffffffff"), "00004gfFC3"); + verifyRoundtrip(b62, unhex("ffffffff00000000"), "LygHZwPV2MC"); + } + + // Test with some common bases that are easier to reason about: + + @Test + void codec_generalizes_down_to_base_10() { + var b10 = BaseNCodec.of("0123456789"); + verifyRoundtrip(b10, unhex("00"), "0"); + verifyRoundtrip(b10, unhex("000f"), "015"); + verifyRoundtrip(b10, unhex("ffff"), "65535"); + + // A large prime number: 2^252 + 27742317777372353535851937790883648493 (Curve25519 order) + var numStr = "7237005577332262213973186563042994240857116359379907606001950938285454250989"; + var numBN = new BigInteger(numStr); + verifyRoundtrip(b10, numBN.toByteArray(), numStr); + } + + // Possibly world's most inefficient hex conversion? + @Test + void codec_generalizes_down_to_base_16() { + var b2 = BaseNCodec.of("0123456789ABCDEF"); + assertEquals(16, b2.base()); + verifyRoundtrip(b2, unhex(""), ""); + verifyRoundtrip(b2, unhex("00"), "0"); + verifyRoundtrip(b2, unhex("80"), "80"); + verifyRoundtrip(b2, unhex("01"), "1"); + verifyRoundtrip(b2, unhex("F0"), "F0"); + verifyRoundtrip(b2, unhex("0F"), "F"); + verifyRoundtrip(b2, unhex("F00F"), "F00F"); + verifyRoundtrip(b2, unhex("5FAF"), "5FAF"); + } + + // Very likely genuinely the world's most inefficient binary conversion. + @Test + void codec_generalizes_down_to_base_2() { + var b2 = BaseNCodec.of("01"); + assertEquals(2, b2.base()); + verifyRoundtrip(b2, unhex(""), ""); + verifyRoundtrip(b2, unhex("00"), "0"); + verifyRoundtrip(b2, unhex("000000"), "000"); // note: prefix zero byte sentinels! + verifyRoundtrip(b2, unhex("80"), "10000000"); + verifyRoundtrip(b2, unhex("01"), "1"); + verifyRoundtrip(b2, unhex("F0"), "11110000"); + verifyRoundtrip(b2, unhex("0F"), "1111"); + } + +} |