[Rust 每日一库] base64

又一个功能相当基础的小而美库

核心功能

base64 库把核心功能放在 engine::Engine 这个 trait 里

/// Low-level encode/decode interface, shown here as an abbreviated excerpt.
///
/// NOTE(review): the signatures reference `Self::DecodeEstimate`, but no such
/// associated type is declared in this excerpt — presumably it was elided from
/// the quoted trait; confirm against the crate source.
pub trait Engine {
    /// Encodes `input` into `output` and returns a `usize`.
    ///
    /// In the `internal_encode` implementation quoted later in this article the
    /// returned value is the number of bytes written to `output`.
    fn internal_encode(&self, input: &[u8], output: &mut [u8]) -> usize;
    /// Produces a `Self::DecodeEstimate` from the length of the encoded input.
    ///
    /// The value is later passed to `internal_decode` as `decode_estimate`.
    fn internal_decoded_len_estimate(&self, input_len: usize) -> Self::DecodeEstimate;
    /// Decodes `input` into `output`, using a previously computed
    /// `decode_estimate`.
    ///
    /// Returns `DecodeMetadata` on success and `DecodeSliceError` on failure;
    /// the contents of both types are not visible in this excerpt.
    fn internal_decode(
        &self,
        input: &[u8],
        output: &mut [u8],
        decode_estimate: Self::DecodeEstimate,
    ) -> Result<DecodeMetadata, DecodeSliceError>;

}

然后它提供了一个 GeneralPurpose 类型来实现这个 trait

值得注意的是它的实现用到了循环展开

/// Interprets the first eight bytes of `s` as a big-endian `u64`.
///
/// Any bytes after the eighth are ignored.
///
/// # Panics
///
/// Panics if `s` holds fewer than eight bytes.
fn read_u64(s: &[u8]) -> u64 {
    let mut word = [0_u8; 8];
    word.copy_from_slice(&s[..8]);
    u64::from_be_bytes(word)
}
/// Encodes `input` into `output` by looking up 6-bit groups in
/// `self.encode_table`, and returns the number of bytes written to `output`.
///
/// The work is split into three phases:
/// 1. a hand-unrolled fast loop that turns 24 input bytes into 32 output bytes
///    per iteration,
/// 2. a simple loop that turns each remaining complete 3-byte group into 4
///    output bytes,
/// 3. a tail step for the last 1 or 2 input bytes, which emits 2 or 3 output
///    bytes respectively.
///
/// No padding bytes are written here.
///
/// # Panics
///
/// Slice indexing panics if `output` is too short for the encoded data.
fn internal_encode(&self, input: &[u8], output: &mut [u8]) -> usize {
    let mut input_index: usize = 0;

    // Each fast-loop iteration handles four 6-byte blocks.
    const BLOCKS_PER_FAST_LOOP: usize = 4;
    const LOW_SIX_BITS: u64 = 0x3F;

    // we read 8 bytes at a time (u64) but only actually consume 6 of those bytes. Thus, we need
    // 2 trailing bytes to be available to read.
    // `saturating_sub` yields 0 for inputs of 26 bytes or fewer, which makes the guard below
    // skip the fast loop entirely.
    let last_fast_index = input.len().saturating_sub(BLOCKS_PER_FAST_LOOP * 6 + 2);
    let mut output_index = 0;

    if last_fast_index > 0 {
        while input_index <= last_fast_index {
            // Major performance wins from letting the optimizer do the bounds check once, mostly
            // on the output side
            let input_chunk =
                &input[input_index..(input_index + (BLOCKS_PER_FAST_LOOP * 6 + 2))];
            let output_chunk =
                &mut output[output_index..(output_index + BLOCKS_PER_FAST_LOOP * 8)];

            // Hand-unrolling for 32 vs 16 or 8 bytes yields performance about equivalent
            // to unsafe pointer code on a Xeon E5-1650v3. 64 byte unrolling was slightly better for
            // large inputs but significantly worse for 50-byte input, unsurprisingly. I suspect
            // that it's a not uncommon use case to encode smallish chunks of data (e.g. a 64-byte
            // SHA-512 digest), so it would be nice if that fit in the unrolled loop at least once.
            // Plus, single-digit percentage performance differences might well be quite different
            // on different hardware.

            // Block 1: input bytes 0..6. `read_u64` loads 8 bytes big-endian, so the first six
            // bytes occupy the top 48 bits; shifts 58, 52, ..., 16 peel off eight 6-bit groups
            // and the low 16 bits (the two extra bytes) are ignored.
            let input_u64 = read_u64(&input_chunk[0..]);

            output_chunk[0] = self.encode_table[((input_u64 >> 58) & LOW_SIX_BITS) as usize];
            output_chunk[1] = self.encode_table[((input_u64 >> 52) & LOW_SIX_BITS) as usize];
            output_chunk[2] = self.encode_table[((input_u64 >> 46) & LOW_SIX_BITS) as usize];
            output_chunk[3] = self.encode_table[((input_u64 >> 40) & LOW_SIX_BITS) as usize];
            output_chunk[4] = self.encode_table[((input_u64 >> 34) & LOW_SIX_BITS) as usize];
            output_chunk[5] = self.encode_table[((input_u64 >> 28) & LOW_SIX_BITS) as usize];
            output_chunk[6] = self.encode_table[((input_u64 >> 22) & LOW_SIX_BITS) as usize];
            output_chunk[7] = self.encode_table[((input_u64 >> 16) & LOW_SIX_BITS) as usize];

            // Block 2: input bytes 6..12.
            let input_u64 = read_u64(&input_chunk[6..]);

            output_chunk[8] = self.encode_table[((input_u64 >> 58) & LOW_SIX_BITS) as usize];
            output_chunk[9] = self.encode_table[((input_u64 >> 52) & LOW_SIX_BITS) as usize];
            output_chunk[10] = self.encode_table[((input_u64 >> 46) & LOW_SIX_BITS) as usize];
            output_chunk[11] = self.encode_table[((input_u64 >> 40) & LOW_SIX_BITS) as usize];
            output_chunk[12] = self.encode_table[((input_u64 >> 34) & LOW_SIX_BITS) as usize];
            output_chunk[13] = self.encode_table[((input_u64 >> 28) & LOW_SIX_BITS) as usize];
            output_chunk[14] = self.encode_table[((input_u64 >> 22) & LOW_SIX_BITS) as usize];
            output_chunk[15] = self.encode_table[((input_u64 >> 16) & LOW_SIX_BITS) as usize];

            // Block 3: input bytes 12..18.
            let input_u64 = read_u64(&input_chunk[12..]);

            output_chunk[16] = self.encode_table[((input_u64 >> 58) & LOW_SIX_BITS) as usize];
            output_chunk[17] = self.encode_table[((input_u64 >> 52) & LOW_SIX_BITS) as usize];
            output_chunk[18] = self.encode_table[((input_u64 >> 46) & LOW_SIX_BITS) as usize];
            output_chunk[19] = self.encode_table[((input_u64 >> 40) & LOW_SIX_BITS) as usize];
            output_chunk[20] = self.encode_table[((input_u64 >> 34) & LOW_SIX_BITS) as usize];
            output_chunk[21] = self.encode_table[((input_u64 >> 28) & LOW_SIX_BITS) as usize];
            output_chunk[22] = self.encode_table[((input_u64 >> 22) & LOW_SIX_BITS) as usize];
            output_chunk[23] = self.encode_table[((input_u64 >> 16) & LOW_SIX_BITS) as usize];

            // Block 4: input bytes 18..24; this read also touches the two trailing bytes 24..26.
            let input_u64 = read_u64(&input_chunk[18..]);

            output_chunk[24] = self.encode_table[((input_u64 >> 58) & LOW_SIX_BITS) as usize];
            output_chunk[25] = self.encode_table[((input_u64 >> 52) & LOW_SIX_BITS) as usize];
            output_chunk[26] = self.encode_table[((input_u64 >> 46) & LOW_SIX_BITS) as usize];
            output_chunk[27] = self.encode_table[((input_u64 >> 40) & LOW_SIX_BITS) as usize];
            output_chunk[28] = self.encode_table[((input_u64 >> 34) & LOW_SIX_BITS) as usize];
            output_chunk[29] = self.encode_table[((input_u64 >> 28) & LOW_SIX_BITS) as usize];
            output_chunk[30] = self.encode_table[((input_u64 >> 22) & LOW_SIX_BITS) as usize];
            output_chunk[31] = self.encode_table[((input_u64 >> 16) & LOW_SIX_BITS) as usize];

            // 24 input bytes consumed, 32 output bytes produced per iteration.
            output_index += BLOCKS_PER_FAST_LOOP * 8;
            input_index += BLOCKS_PER_FAST_LOOP * 6;
        }
    }

    // Encode what's left after the fast loop.

    const LOW_SIX_BITS_U8: u8 = 0x3F;

    // `rem` is the number of trailing bytes that do not form a complete 3-byte group.
    let rem = input.len() % 3;
    let start_of_rem = input.len() - rem;

    // start at the first index not handled by fast loop, which may be 0.

    while input_index < start_of_rem {
        let input_chunk = &input[input_index..(input_index + 3)];
        let output_chunk = &mut output[output_index..(output_index + 4)];

        // Three bytes (24 bits) become four 6-bit table indices. A left shift on `u8` drops the
        // bits shifted out of the byte, and the mask then keeps only the low six bits.
        output_chunk[0] = self.encode_table[(input_chunk[0] >> 2) as usize];
        output_chunk[1] = self.encode_table
            [((input_chunk[0] << 4 | input_chunk[1] >> 4) & LOW_SIX_BITS_U8) as usize];
        output_chunk[2] = self.encode_table
            [((input_chunk[1] << 2 | input_chunk[2] >> 6) & LOW_SIX_BITS_U8) as usize];
        output_chunk[3] = self.encode_table[(input_chunk[2] & LOW_SIX_BITS_U8) as usize];

        input_index += 3;
        output_index += 4;
    }

    if rem == 2 {
        // Two leftover bytes (16 bits) yield three symbols; the last index is zero-filled in its
        // low two bits.
        output[output_index] = self.encode_table[(input[start_of_rem] >> 2) as usize];
        output[output_index + 1] =
            self.encode_table[((input[start_of_rem] << 4 | input[start_of_rem + 1] >> 4)
                & LOW_SIX_BITS_U8) as usize];
        output[output_index + 2] =
            self.encode_table[((input[start_of_rem + 1] << 2) & LOW_SIX_BITS_U8) as usize];
        output_index += 3;
    } else if rem == 1 {
        // One leftover byte (8 bits) yields two symbols; the last index is zero-filled in its
        // low four bits.
        output[output_index] = self.encode_table[(input[start_of_rem] >> 2) as usize];
        output[output_index + 1] =
            self.encode_table[((input[start_of_rem] << 4) & LOW_SIX_BITS_U8) as usize];
        output_index += 2;
    }

    // Total number of bytes written to `output`.
    output_index
}

O3 优化下，x86_64 的汇编代码就是朴素的一堆 mov, and 和 shr；arm 汇编倒是做了乱序处理；然而实测在 x86_64 (ryzen 6800h) 上展开与否并不会显著影响性能（提升平均 5% 左右且不稳定）。个人不是很喜欢这个做法

而且有一个有趣的现象：当我把 internal_encode 提出来集成到代码里，提升能稳定达到 10%，比展开效果还好…我想可能是过度的各种 trait 包装阻碍了编译器进行少量优化

除此之外，它还实现了一个 Naive Engine，配合 opt-level=z 能实现最小的生成代码

Alphabet

base64::alphabet::Alphabet 是编码表类型

#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Alphabet { unsafe

```rust
    pub(crate) symbols: [u8; ALPHABET_SIZE],
}

它支持 const 构造

/// Builds an alphabet from the first `ALPHABET_SIZE` bytes of `alphabet`
/// without validating the symbols themselves.
///
/// Usable in const contexts.
///
/// # Panics
///
/// Indexing panics if `alphabet` is shorter than `ALPHABET_SIZE` bytes.
const fn from_str_unchecked(alphabet: &str) -> Self {
    let bytes = alphabet.as_bytes();
    let mut table = [0_u8; ALPHABET_SIZE];

    // `for` loops are not available in `const fn`, so copy byte by byte with a
    // manually advanced counter.
    let mut i = 0;
    while i < ALPHABET_SIZE {
        table[i] = bytes[i];
        i += 1;
    }

    Self { symbols: table }
}

可以看到，为了规避 const fn 内不能用 for 的限制（这是因为 Iterator::next 方法不是 const），它用 while 循环取而代之。这是 const fn 内遍历 slice 的常见技巧。如果不是 const fn，就可以用 alphabet.as_bytes().try_into().unwrap() 一键解决；不过我选 unsafe：

/// Copies a 64-byte alphabet string into a fixed-size table.
///
/// Usable in const contexts.
///
/// # Panics
///
/// Panics with "Alphabet must be 64-lengthed" if `alphabet` is not exactly
/// 64 bytes long.
#[no_mangle]
const fn from_str_unchecked(alphabet: &str) -> [u8; 64] {
    match alphabet.len() {
        // SAFETY: the string is exactly 64 bytes long, so the pointer is valid
        // for reading 64 initialized bytes; `[u8; 64]` has alignment 1, so any
        // byte pointer is suitably aligned; `u8` is `Copy`, so the read does
        // not duplicate ownership of anything.
        64 => unsafe { alphabet.as_ptr().cast::<[u8; 64]>().read() },
        _ => panic!("Alphabet must be 64-lengthed"),
    }
}

其他

base64 还实现了 Display trait，以方便 Debug 和用 fmt 宏做 String；实现了 Read 和 Write 做 Reader/Writer 嵌套。大概就这些

总结

总的来说，这并不算一个很高质量的库，唯一可圈可点之处就是零依赖。但其过于常用，性能够用，使用方便，功能够完善，开发较早，因此列于常用榜前列

然而，它的性能既没有达到极限（对比 Turbo Base64 / simdutf），又自以为是地做循环展开，用起来还有点麻烦（要写 base64::prelude::BASE64_STANDARD）；这对于一个开发了近十年的库来说有点难以接受

实话说，这是现在很多 Rust 库的通病，即自以为是地做实际提升百分之几的优化，做很 fancy 的包装，搞得代码一大堆，一点优化的效果被包装全吃完了。这值得我们警惕