又功能一相当基础的小而美库
核心功能
base64 库把核心功能在 engine::Engine
pub trait Engine {
fn internal_encode(&self, input: &[u8], output: &mut [u8]) -> usize;
fn internal_decoded_len_estimate(&self, input_len: usize) -> Self::DecodeEstimate;
fn internal_decode(
&self,
input: &[u8],
output: &mut [u8],
decode_estimate: Self::DecodeEstimate,
) -> Result<DecodeMetadata, DecodeSliceError>;
}
然后它实现了一个 GeneralPurpose
实现了这个 Trait
值得注意的是它的实现用到了循环展开
fn read_u64(s: &[u8]) -> u64 {
u64::from_be_bytes(s[..8].try_into().unwrap())
}
fn internal_encode(&self, input: &[u8], output: &mut [u8]) -> usize {
let mut input_index: usize = 0;
const BLOCKS_PER_FAST_LOOP: usize = 4;
const LOW_SIX_BITS: u64 = 0x3F;
// we read 8 bytes at a time (u64) but only actually consume 6 of those bytes. Thus, we need
// 2 trailing bytes to be available to read..
let last_fast_index = input.len().saturating_sub(BLOCKS_PER_FAST_LOOP * 6 + 2);
let mut output_index = 0;
if last_fast_index > 0 {
while input_index <= last_fast_index {
// Major performance wins from letting the optimizer do the bounds check once, mostly
// on the output side
let input_chunk =
&input[input_index..(input_index + (BLOCKS_PER_FAST_LOOP * 6 + 2))];
let output_chunk =
&mut output[output_index..(output_index + BLOCKS_PER_FAST_LOOP * 8)];
// Hand-unrolling for 32 vs 16 or 8 bytes produces yields performance about equivalent
// to unsafe pointer code on a Xeon E5-1650v3. 64 byte unrolling was slightly better for
// large inputs but significantly worse for 50-byte input, unsurprisingly. I suspect
// that it's a not uncommon use case to encode smallish chunks of data (e.g. a 64-byte
// SHA-512 digest), so it would be nice if that fit in the unrolled loop at least once.
// Plus, single-digit percentage performance differences might well be quite different
// on different hardware.
let input_u64 = read_u64(&input_chunk[0..]);
output_chunk[0] = self.encode_table[((input_u64 >> 58) & LOW_SIX_BITS) as usize];
output_chunk[1] = self.encode_table[((input_u64 >> 52) & LOW_SIX_BITS) as usize];
output_chunk[2] = self.encode_table[((input_u64 >> 46) & LOW_SIX_BITS) as usize];
output_chunk[3] = self.encode_table[((input_u64 >> 40) & LOW_SIX_BITS) as usize];
output_chunk[4] = self.encode_table[((input_u64 >> 34) & LOW_SIX_BITS) as usize];
output_chunk[5] = self.encode_table[((input_u64 >> 28) & LOW_SIX_BITS) as usize];
output_chunk[6] = self.encode_table[((input_u64 >> 22) & LOW_SIX_BITS) as usize];
output_chunk[7] = self.encode_table[((input_u64 >> 16) & LOW_SIX_BITS) as usize];
let input_u64 = read_u64(&input_chunk[6..]);
output_chunk[8] = self.encode_table[((input_u64 >> 58) & LOW_SIX_BITS) as usize];
output_chunk[9] = self.encode_table[((input_u64 >> 52) & LOW_SIX_BITS) as usize];
output_chunk[10] = self.encode_table[((input_u64 >> 46) & LOW_SIX_BITS) as usize];
output_chunk[11] = self.encode_table[((input_u64 >> 40) & LOW_SIX_BITS) as usize];
output_chunk[12] = self.encode_table[((input_u64 >> 34) & LOW_SIX_BITS) as usize];
output_chunk[13] = self.encode_table[((input_u64 >> 28) & LOW_SIX_BITS) as usize];
output_chunk[14] = self.encode_table[((input_u64 >> 22) & LOW_SIX_BITS) as usize];
output_chunk[15] = self.encode_table[((input_u64 >> 16) & LOW_SIX_BITS) as usize];
let input_u64 = read_u64(&input_chunk[12..]);
output_chunk[16] = self.encode_table[((input_u64 >> 58) & LOW_SIX_BITS) as usize];
output_chunk[17] = self.encode_table[((input_u64 >> 52) & LOW_SIX_BITS) as usize];
output_chunk[18] = self.encode_table[((input_u64 >> 46) & LOW_SIX_BITS) as usize];
output_chunk[19] = self.encode_table[((input_u64 >> 40) & LOW_SIX_BITS) as usize];
output_chunk[20] = self.encode_table[((input_u64 >> 34) & LOW_SIX_BITS) as usize];
output_chunk[21] = self.encode_table[((input_u64 >> 28) & LOW_SIX_BITS) as usize];
output_chunk[22] = self.encode_table[((input_u64 >> 22) & LOW_SIX_BITS) as usize];
output_chunk[23] = self.encode_table[((input_u64 >> 16) & LOW_SIX_BITS) as usize];
let input_u64 = read_u64(&input_chunk[18..]);
output_chunk[24] = self.encode_table[((input_u64 >> 58) & LOW_SIX_BITS) as usize];
output_chunk[25] = self.encode_table[((input_u64 >> 52) & LOW_SIX_BITS) as usize];
output_chunk[26] = self.encode_table[((input_u64 >> 46) & LOW_SIX_BITS) as usize];
output_chunk[27] = self.encode_table[((input_u64 >> 40) & LOW_SIX_BITS) as usize];
output_chunk[28] = self.encode_table[((input_u64 >> 34) & LOW_SIX_BITS) as usize];
output_chunk[29] = self.encode_table[((input_u64 >> 28) & LOW_SIX_BITS) as usize];
output_chunk[30] = self.encode_table[((input_u64 >> 22) & LOW_SIX_BITS) as usize];
output_chunk[31] = self.encode_table[((input_u64 >> 16) & LOW_SIX_BITS) as usize];
output_index += BLOCKS_PER_FAST_LOOP * 8;
input_index += BLOCKS_PER_FAST_LOOP * 6;
}
}
// Encode what's left after the fast loop.
const LOW_SIX_BITS_U8: u8 = 0x3F;
let rem = input.len() % 3;
let start_of_rem = input.len() - rem;
// start at the first index not handled by fast loop, which may be 0.
while input_index < start_of_rem {
let input_chunk = &input[input_index..(input_index + 3)];
let output_chunk = &mut output[output_index..(output_index + 4)];
output_chunk[0] = self.encode_table[(input_chunk[0] >> 2) as usize];
output_chunk[1] = self.encode_table
[((input_chunk[0] << 4 | input_chunk[1] >> 4) & LOW_SIX_BITS_U8) as usize];
output_chunk[2] = self.encode_table
[((input_chunk[1] << 2 | input_chunk[2] >> 6) & LOW_SIX_BITS_U8) as usize];
output_chunk[3] = self.encode_table[(input_chunk[2] & LOW_SIX_BITS_U8) as usize];
input_index += 3;
output_index += 4;
}
if rem == 2 {
output[output_index] = self.encode_table[(input[start_of_rem] >> 2) as usize];
output[output_index + 1] =
self.encode_table[((input[start_of_rem] << 4 | input[start_of_rem + 1] >> 4)
& LOW_SIX_BITS_U8) as usize];
output[output_index + 2] =
self.encode_table[((input[start_of_rem + 1] << 2) & LOW_SIX_BITS_U8) as usize];
output_index += 3;
} else if rem == 1 {
output[output_index] = self.encode_table[(input[start_of_rem] >> 2) as usize];
output[output_index + 1] =
self.encode_table[((input[start_of_rem] << 4) & LOW_SIX_BITS_U8) as usize];
output_index += 2;
}
output_index
}
O3 优化下,x86_64 的汇编代码就是朴素的一堆 mov
, and
和 shr
;arm 汇编倒是做了乱序处理;然而实测在 x86_64 (ryzen 6800h)
上展开与否并不会显著影响性能(提升平均 5% 左右且不稳定)。个人不是很喜欢这个做法
而且有一个有趣的现象:当我把 internal_encode
提出来集成到代码里,提升能稳定达到 10%,比展开效果还好…我想可能是过度的各种 trait 包装阻碍了编译器进行少量优化
除此之外,它还实现了一个 Naive
Engine,配合 opt-level=z
能实现最小的生成代码
Alphabet
base64::alphabet
是编码表类型
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Alphabet { unsafe
```rust
pub(crate) symbols: [u8; ALPHABET_SIZE],
}
它支持 const 构造
const fn from_str_unchecked(alphabet: &str) -> Self {
let mut symbols = [0_u8; ALPHABET_SIZE];
let source_bytes = alphabet.as_bytes();
// a way to copy that's allowed in const fn
let mut index = 0;
while index < ALPHABET_SIZE {
symbols[index] = source_bytes[index];
index += 1;
}
Self { symbols }
}
可以看到,为了规避 const fn 内不能用 for (这是因为 Iterator::next
方法不是 const
),它 while 取而代之。这是常见的 const fn
内 slice 循环的技巧。要不是 const fn 就可以用 alphabet.as_bytes().try_into().unwrap()
一键解决;不过我选 unsafe
#[no_mangle]
const fn from_str_unchecked(alphabet: &str) -> [u8;64] {
if alphabet.len()!=64 {
panic!("Alphabet must be 64-lengthed");
}
unsafe{(alphabet.as_ptr() as *const [u8;64]).read()}
}
其他
base64 还实现了 Display trait,以方便 Debug 和用 fmt
宏做 String;实现了 Read 和 Write 做 Reader/Writer 嵌套。大概就这些
总结
总的来说,这并不算一个很高质量的库,唯一可圈可点之处就是零依赖。但其过于常用,性能够用,使用方便,功能够完善,开发较早,因此列于常用榜前列
然而,它既没有性能达到极限(Turbo Base64 / simdutf),又自以为是地循环展开,用起来还有点麻烦(base64::prelude::BASE64_STANDARD
);这对于一个近十年的库来说有点难以接受
实话说,这是现在很多 Rust 库的通病,即自以为是地做实际提升百分之几的优化,做很 fancy 的包装,搞得代码一大堆,一点优化的效果被包装全吃完了。这值得我们警惕