// Copyright Yahoo. Licensed under the terms of the Apache 2.0 license. See LICENSE in the project root.
package com.yahoo.security;
import java.math.BigInteger;
import java.util.Arrays;
/**
*
* Codec that enables easy conversion from an array of bytes to any numeric base in [2, 256)
* and back again, using a supplied custom alphabet.
*
*
* Implemented by treating the input byte sequence to encode verbatim as a big-endian
* BigInteger
and iteratively doing a divmod
operation until
* the quotient is zero, emitting the modulus mapped onto the alphabet for each iteration.
*
*
* Decoding reverses this process, ending up with the same BigInteger
as in
* the initial encoding step.
*
*
* Note that BigInteger
s represent the canonical form of any given
* integer, which means that leading zero bytes are implicitly ignored. We therefore
* special-case this by unary-coding the number of leading zeroes in the encoded form,
* where a leading zero byte is mapped to the first character of the alphabet.
*
* Example for Base58, which starts its alphabet with 1 (0 is not present):
*
* "Hello World!" = "2NEpo7TZRRrLZSi2U"
* "\0\0Hello World!" = "112NEpo7TZRRrLZSi2U" (note leading 1s)
*
* Example for Base62, which starts its alphabet with 0:
*
* "Hello World!" = "T8dgcjRGkZ3aysdN"
* "\0\0Hello World!" = "00T8dgcjRGkZ3aysdN" (node leading 0s)
*
*
* Important: runtime complexity is O(n2) for both
* encoding and decoding, so this should only be used to encode/decode relatively short
* byte sequences. This is not a replacement for Base64 etc. encoding that runs
* in linear time! In addition, a BaseNCodec
with a Base64 alphabet encodes
* to a completely different output than a regular Base64 encoder when the input is not
* evenly divisible by three. This is due to regular Base64 explicitly handling padding,
* while this codec does not.
*
*
* @author vekterli
*/
public class BaseNCodec {
public static final int MAX_BASE = 255; /** Inclusive */
private static class Alphabet {
final char[] alphabetChars;
final int[] reverseLut;
Alphabet(String alphabetIn) {
if (alphabetIn.length() < 2) { // We don't do unary...
throw new IllegalArgumentException("Alphabet requires at least two symbols");
}
if (alphabetIn.length() > MAX_BASE) {
throw new IllegalArgumentException("Alphabet size too large");
}
alphabetChars = alphabetIn.toCharArray();
int highestChar = Integer.MIN_VALUE;
for (char ch : alphabetChars) {
highestChar = Math.max(highestChar, ch);
}
reverseLut = new int[highestChar + 1];
Arrays.fill(reverseLut, -1); // -1 => invalid mapping
for (int i = 0; i < alphabetChars.length; ++i) {
if (reverseLut[alphabetChars[i]] != -1) {
throw new IllegalArgumentException("Alphabet character '%c' occurs more than once"
.formatted(alphabetChars[i]));
}
reverseLut[alphabetChars[i]] = i;
}
}
}
private static final BigInteger BN_ZERO = BigInteger.valueOf(0);
private final Alphabet alphabet;
private final BigInteger alphabetLenBN;
private BaseNCodec(String alphabet) {
this.alphabet = new Alphabet(alphabet);
this.alphabetLenBN = BigInteger.valueOf(this.alphabet.alphabetChars.length);
}
public static BaseNCodec of(String alphabet) {
return new BaseNCodec(alphabet);
}
public int base() { return this.alphabet.alphabetChars.length; }
public String encode(byte[] input) {
var sb = new StringBuilder(input.length * 2); // Not at all exact, but builder can resize anyway
var num = new BigInteger(1, input); // Treat as _positive_ big endian bigint (explicit signum=1)
// Standard base N digit conversion loop. Note: emits in reverse order since we
// append the least significant digit first. We reverse this later on.
while (!num.equals(BN_ZERO)) {
BigInteger[] quotRem = num.divideAndRemainder(alphabetLenBN);
num = quotRem[0];
sb.append(alphabet.alphabetChars[quotRem[1].intValue()]);
}
for (byte leadingByte : input) {
if (leadingByte != 0x00) {
break;
}
sb.append(alphabet.alphabetChars[0]);
}
return sb.reverse().toString();
}
public byte[] decode(String input) {
char[] inputChars = input.toCharArray();
int prefixNulls = 0;
for (char leadingChar : inputChars) {
if (leadingChar != alphabet.alphabetChars[0]) {
break;
}
++prefixNulls;
}
// Restore the BigInteger representation by reversing the base conversion done during encoding.
var accu = BN_ZERO;
for (char c : inputChars) {
int idx = (c < alphabet.reverseLut.length) ? alphabet.reverseLut[c] : -1;
if (idx == -1) {
throw new IllegalArgumentException("Input character not part of codec alphabet");
}
accu = accu.multiply(alphabetLenBN).add(BigInteger.valueOf(idx));
}
byte[] bnBytes = accu.toByteArray();
// If the most significant bigint byte is zero, it means the most significant bit of the
// next byte is 1 (or the bnBytes length is 1, in which case prefixNulls == 1) and the bigint
// representation uses 1 extra byte to be positive in 2's complement. If so, prune it away
// to avoid prefixing with a spurious null-byte.
boolean msbZero = (bnBytes[0] == 0x0);
if (prefixNulls == 0 && !msbZero) {
return bnBytes;
} else {
int realLen = (msbZero ? bnBytes.length - 1 : bnBytes.length);
byte[] result = new byte[prefixNulls + realLen];
// #prefixNulls prefix bytes are implicitly zero
System.arraycopy(bnBytes, (msbZero ? 1 : 0), result, prefixNulls, realLen);
return result;
}
}
}