aboutsummaryrefslogtreecommitdiffstats
path: root/security-utils
diff options
context:
space:
mode:
authorTor Brede Vekterli <vekterli@yahooinc.com>2022-11-08 13:10:23 +0100
committerTor Brede Vekterli <vekterli@yahooinc.com>2022-11-08 13:42:00 +0100
commitfc1c56f2e080a35bbc34a27d2cabc53f8ea171b0 (patch)
tree21d96d4c17ef656c9af5df48a279abc1faf61312 /security-utils
parent247657a8e00c05b5ff3fe9d15ca258f41cedd597 (diff)
Add a codec that enables conversion to and from a base N representation
Adds a codec that enables easy conversion from an array of bytes to any numeric base in [2, 256) and back again, using a supplied custom alphabet. Implemented by treating the input byte sequence to encode verbatim as a big-endian `BigInteger` and iteratively doing a `divmod` operation until the quotient is zero, emitting the modulus mapped onto the alphabet for each iteration. Decoding reverses this process, ending up with the same `BigInteger` as in the initial encoding step.
Diffstat (limited to 'security-utils')
-rw-r--r--security-utils/src/main/java/com/yahoo/security/Base58.java22
-rw-r--r--security-utils/src/main/java/com/yahoo/security/Base62.java21
-rw-r--r--security-utils/src/main/java/com/yahoo/security/BaseNCodec.java151
-rw-r--r--security-utils/src/test/java/com/yahoo/security/BaseNCodecTest.java122
4 files changed, 316 insertions, 0 deletions
diff --git a/security-utils/src/main/java/com/yahoo/security/Base58.java b/security-utils/src/main/java/com/yahoo/security/Base58.java
new file mode 100644
index 00000000000..3010bc878a8
--- /dev/null
+++ b/security-utils/src/main/java/com/yahoo/security/Base58.java
@@ -0,0 +1,22 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.security;
+
+/**
+ * Base58 encoding using the alphabet standardized by Bitcoin et al., which avoids
+ * the use of characters [0OIl] to avoid visual ambiguity. It does not feature any
+ * potential word/line-breaking characters, which means encoded strings can usually
+ * be selected in one go on web pages or in the terminal.
+ *
+ * @see <a href="https://en.wikipedia.org/wiki/Base58">Base58 on Wiki</a>
+ *
+ * @author vekterli
+ */
+public class Base58 {
+
+ private static final BaseNCodec INSTANCE = BaseNCodec.of("123456789ABCDEFGHJKLMNPQRSTUVWXYZabcdefghijkmnopqrstuvwxyz");
+
+ public static BaseNCodec codec() {
+ return INSTANCE;
+ }
+
+}
diff --git a/security-utils/src/main/java/com/yahoo/security/Base62.java b/security-utils/src/main/java/com/yahoo/security/Base62.java
new file mode 100644
index 00000000000..86c60a1bb1d
--- /dev/null
+++ b/security-utils/src/main/java/com/yahoo/security/Base62.java
@@ -0,0 +1,21 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.security;
+
+/**
+ * Base62 encoding which has the nice property that it does not feature any
+ * potential word/line-breaking characters, which means encoded strings can
+ * usually be selected in one go on web pages or in the terminal.
+ *
+ * @see <a href="https://en.wikipedia.org/wiki/Base62">Base62 on Wiki</a>
+ *
+ * @author vekterli
+ */
+public class Base62 {
+
+ private static final BaseNCodec INSTANCE = BaseNCodec.of("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz");
+
+ public static BaseNCodec codec() {
+ return INSTANCE;
+ }
+
+}
diff --git a/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java b/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java
new file mode 100644
index 00000000000..0921f238460
--- /dev/null
+++ b/security-utils/src/main/java/com/yahoo/security/BaseNCodec.java
@@ -0,0 +1,151 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.security;
+
+import java.math.BigInteger;
+import java.util.Arrays;
+
+/**
+ * <p>
+ * Codec that enables easy conversion from an array of bytes to any numeric base in [2, 256)
+ * and back again, using a supplied custom alphabet.
+ * </p>
+ * <p>
+ * Implemented by treating the input byte sequence to encode verbatim as a big-endian
+ * <code>BigInteger</code> and iteratively doing a <code>divmod</code> operation until
+ * the quotient is zero, emitting the modulus mapped onto the alphabet for each iteration.
+ * </p>
+ * <p>
+ * Decoding reverses this process, ending up with the same <code>BigInteger</code> as in
+ * the initial encoding step.
+ * </p>
+ * <p>
+ * Note that <code>BigInteger</code>s represent the <em>canonical</em> form of any given
+ * integer, which means that leading zero bytes are implicitly ignored. We therefore
+ * special-case this by unary-coding the number of leading zeroes in the encoded form,
+ * where a leading zero byte is mapped to the <em>first</em> character of the alphabet.
+ * </p>
+ * <p>Example for Base58, which starts its alphabet with 1 (0 is not present):</p>
+ * <pre>
+ * "Hello World!" = "2NEpo7TZRRrLZSi2U"
+ * "\0\0Hello World!" = "112NEpo7TZRRrLZSi2U" (note leading 1s)
+ * </pre>
+ * <p>Example for Base62, which starts its alphabet with 0:</p>
+ * <pre>
+ * "Hello World!" = "T8dgcjRGkZ3aysdN"
+ * "\0\0Hello World!" = "00T8dgcjRGkZ3aysdN" (node leading 0s)
+ * </pre>
+ * <p>
+ * <strong>Important:</strong> runtime complexity is <em>O(n<sup>2</sup>)</em> for both
+ * encoding and decoding, so this should only be used to encode/decode relatively short
+ * byte sequences. This is <em>not</em> a replacement for Base64 etc. encoding that runs
+ * in linear time! In addition, a <code>BaseNCodec</code> with a Base64 alphabet encodes
+ * to a completely different output than a regular Base64 encoder when the input is not
+ * evenly divisible by three. This is due to regular Base64 explicitly handling padding,
+ * while this codec does not.
+ * </p>
+ *
+ * @author vekterli
+ */
+public class BaseNCodec {
+
+ public static final int MAX_BASE = 255; /** Inclusive */
+
+ private static class Alphabet {
+ final char[] alphabetChars;
+ final int[] reverseLut;
+
+ Alphabet(String alphabetIn) {
+ if (alphabetIn.length() < 2) { // We don't do unary...
+ throw new IllegalArgumentException("Alphabet requires at least two symbols");
+ }
+ if (alphabetIn.length() > MAX_BASE) {
+ throw new IllegalArgumentException("Alphabet size too large");
+ }
+ alphabetChars = alphabetIn.toCharArray();
+ int highestChar = Integer.MIN_VALUE;
+ for (char ch : alphabetChars) {
+ highestChar = Math.max(highestChar, ch);
+ }
+ reverseLut = new int[highestChar + 1];
+ Arrays.fill(reverseLut, -1); // -1 => invalid mapping
+ for (int i = 0; i < alphabetChars.length; ++i) {
+ if (reverseLut[alphabetChars[i]] != -1) {
+ throw new IllegalArgumentException("Alphabet character '%c' occurs more than once"
+ .formatted(alphabetChars[i]));
+ }
+ reverseLut[alphabetChars[i]] = i;
+ }
+ }
+ }
+
+ private static final BigInteger BN_ZERO = BigInteger.valueOf(0);
+
+ private final Alphabet alphabet;
+ private final BigInteger alphabetLenBN;
+
+ private BaseNCodec(String alphabet) {
+ this.alphabet = new Alphabet(alphabet);
+ this.alphabetLenBN = BigInteger.valueOf(this.alphabet.alphabetChars.length);
+ }
+
+ public static BaseNCodec of(String alphabet) {
+ return new BaseNCodec(alphabet);
+ }
+
+ public int base() { return this.alphabet.alphabetChars.length; }
+
+ public String encode(byte[] input) {
+ var sb = new StringBuilder(input.length * 2); // Not at all exact, but builder can resize anyway
+ var num = new BigInteger(1, input); // Treat as _positive_ big endian bigint (explicit signum=1)
+ // Standard base N digit conversion loop. Note: emits in reverse order since we
+ // append the least significant digit first. We reverse this later on.
+ while (!num.equals(BN_ZERO)) {
+ BigInteger[] quotRem = num.divideAndRemainder(alphabetLenBN);
+ num = quotRem[0];
+ sb.append(alphabet.alphabetChars[quotRem[1].intValue()]);
+ }
+ for (byte leadingByte : input) {
+ if (leadingByte != 0x00) {
+ break;
+ }
+ sb.append(alphabet.alphabetChars[0]);
+ }
+ return sb.reverse().toString();
+ }
+
+ public byte[] decode(String input) {
+ char[] inputChars = input.toCharArray();
+ int prefixNulls = 0;
+ for (char leadingChar : inputChars) {
+ if (leadingChar != alphabet.alphabetChars[0]) {
+ break;
+ }
+ ++prefixNulls;
+ }
+ // Restore the BigInteger representation by reversing the base conversion done during encoding.
+ var accu = BN_ZERO;
+ for (char c : inputChars) {
+ int idx = (c < alphabet.reverseLut.length) ? alphabet.reverseLut[c] : -1;
+ if (idx == -1) {
+ throw new IllegalArgumentException("Input character not part of codec alphabet");
+ }
+ accu = accu.multiply(alphabetLenBN).add(BigInteger.valueOf(idx));
+ }
+ byte[] bnBytes = accu.toByteArray();
+ // If the most significant bigint byte is zero, it means the most significant bit of the
+ // next byte is 1 (or the bnBytes length is 1, in which case prefixNulls == 1) and the bigint
+ // representation uses 1 extra byte to be positive in 2's complement. If so, prune it away
+ // to avoid prefixing with a spurious null-byte.
+ boolean msbZero = (bnBytes[0] == 0x0);
+ if (prefixNulls == 0 && !msbZero) {
+ return bnBytes;
+ } else {
+ int realLen = (msbZero ? bnBytes.length - 1 : bnBytes.length);
+ byte[] result = new byte[prefixNulls + realLen];
+ // #prefixNulls prefix bytes are implicitly zero
+ System.arraycopy(bnBytes, (msbZero ? 1 : 0), result, prefixNulls, realLen);
+ return result;
+ }
+ }
+
+}
diff --git a/security-utils/src/test/java/com/yahoo/security/BaseNCodecTest.java b/security-utils/src/test/java/com/yahoo/security/BaseNCodecTest.java
new file mode 100644
index 00000000000..da67ea2dff3
--- /dev/null
+++ b/security-utils/src/test/java/com/yahoo/security/BaseNCodecTest.java
@@ -0,0 +1,122 @@
+// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
+package com.yahoo.security;
+
+import org.junit.jupiter.api.Test;
+
+import java.math.BigInteger;
+
+import static com.yahoo.security.ArrayUtils.hex;
+import static com.yahoo.security.ArrayUtils.toUtf8Bytes;
+import static com.yahoo.security.ArrayUtils.unhex;
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+/**
+ * @author vekterli
+ */
+public class BaseNCodecTest {
+
+ private static void verifyRoundtrip(BaseNCodec codec, byte[] bytes, String expectedEncoded) {
+ String enc = codec.encode(bytes);
+ assertEquals(expectedEncoded, enc);
+ byte[] dec = codec.decode(enc);
+ assertEquals(hex(bytes), hex(dec));
+ }
+
+ private static void verifyRoundtrip(BaseNCodec codec, String str, String expectedEncoded) {
+ verifyRoundtrip(codec, toUtf8Bytes(str), expectedEncoded);
+ }
+
+ @Test
+ void decoding_chars_not_in_alphabet_throws() {
+ var b58 = Base58.codec();
+ // [0OIl] are not in Base58 alphabet, but within the alphabet LUT range
+ assertThrows(IllegalArgumentException.class, () -> b58.decode("233QC0"));
+ // '{' is one beyond 'z', which is the highest char in the LUT range
+ assertThrows(IllegalArgumentException.class, () -> b58.decode("233QC{"));
+ }
+
+ @Test
+ void alphabet_char_duplication_during_codec_setup_throws() {
+ assertThrows(IllegalArgumentException.class, () -> BaseNCodec.of("abcda"));
+ }
+
+ @Test
+ void base58_codec_test_cases_pass() {
+ var b58 = Base58.codec();
+ assertEquals(58, b58.base());
+ // https://datatracker.ietf.org/doc/html/draft-msporny-base58-03 test vectors:
+ verifyRoundtrip(b58, "Hello World!", "2NEpo7TZRRrLZSi2U");
+ verifyRoundtrip(b58, "The quick brown fox jumps over the lazy dog.",
+ "USm3fpXnKG5EUBx2ndxBDMPVciP5hGey2Jh4NDv6gmeo1LkMeiKrLJUUBk6Z");
+ verifyRoundtrip(b58, unhex("0000287fb4cd"), "11233QC4");
+
+ // Values that have been cross-referenced with other encoder implementations:
+ verifyRoundtrip(b58, "", "");
+ verifyRoundtrip(b58, unhex("00"), "1");
+ verifyRoundtrip(b58, unhex("0000"), "11");
+ verifyRoundtrip(b58, unhex("ff"), "5Q");
+ verifyRoundtrip(b58, unhex("00ff"), "15Q");
+ verifyRoundtrip(b58, unhex("ff00"), "LQX");
+ verifyRoundtrip(b58, unhex("ffffff"), "2UzHL");
+ verifyRoundtrip(b58, unhex("287fb4cd"), "233QC4");
+ }
+
+ @Test
+ void base62_codec_test_cases_pass() {
+ var b62 = Base62.codec();
+ assertEquals(62, b62.base());
+ verifyRoundtrip(b62, "Hello World!", "T8dgcjRGkZ3aysdN");
+ verifyRoundtrip(b62, "\0\0Hello World!", "00T8dgcjRGkZ3aysdN");
+ verifyRoundtrip(b62, "", "");
+ verifyRoundtrip(b62, unhex("00"), "0");
+ verifyRoundtrip(b62, unhex("0000"), "00");
+ verifyRoundtrip(b62, unhex("00000000ffffffff"), "00004gfFC3");
+ verifyRoundtrip(b62, unhex("ffffffff00000000"), "LygHZwPV2MC");
+ }
+
+ // Test with some common bases that are easier to reason about:
+
+ @Test
+ void codec_generalizes_down_to_base_10() {
+ var b10 = BaseNCodec.of("0123456789");
+ verifyRoundtrip(b10, unhex("00"), "0");
+ verifyRoundtrip(b10, unhex("000f"), "015");
+ verifyRoundtrip(b10, unhex("ffff"), "65535");
+
+ // A large prime number: 2^252 + 27742317777372353535851937790883648493 (Curve25519 order)
+ var numStr = "7237005577332262213973186563042994240857116359379907606001950938285454250989";
+ var numBN = new BigInteger(numStr);
+ verifyRoundtrip(b10, numBN.toByteArray(), numStr);
+ }
+
+ // Possibly world's most inefficient hex conversion?
+ @Test
+ void codec_generalizes_down_to_base_16() {
+ var b2 = BaseNCodec.of("0123456789ABCDEF");
+ assertEquals(16, b2.base());
+ verifyRoundtrip(b2, unhex(""), "");
+ verifyRoundtrip(b2, unhex("00"), "0");
+ verifyRoundtrip(b2, unhex("80"), "80");
+ verifyRoundtrip(b2, unhex("01"), "1");
+ verifyRoundtrip(b2, unhex("F0"), "F0");
+ verifyRoundtrip(b2, unhex("0F"), "F");
+ verifyRoundtrip(b2, unhex("F00F"), "F00F");
+ verifyRoundtrip(b2, unhex("5FAF"), "5FAF");
+ }
+
+ // Very likely genuinely the world's most inefficient binary conversion.
+ @Test
+ void codec_generalizes_down_to_base_2() {
+ var b2 = BaseNCodec.of("01");
+ assertEquals(2, b2.base());
+ verifyRoundtrip(b2, unhex(""), "");
+ verifyRoundtrip(b2, unhex("00"), "0");
+ verifyRoundtrip(b2, unhex("000000"), "000"); // note: prefix zero byte sentinels!
+ verifyRoundtrip(b2, unhex("80"), "10000000");
+ verifyRoundtrip(b2, unhex("01"), "1");
+ verifyRoundtrip(b2, unhex("F0"), "11110000");
+ verifyRoundtrip(b2, unhex("0F"), "1111");
+ }
+
+}