diff --git a/articles/20230617-riscv-klibc-opt-summary.md b/articles/20230617-riscv-klibc-opt-summary.md new file mode 100644 index 0000000000000000000000000000000000000000..340704d105ced1dd5e0496607c3eda643bdd1899 --- /dev/null +++ b/articles/20230617-riscv-klibc-opt-summary.md @@ -0,0 +1,618 @@ +> Corrector: [TinyCorrect](https://gitee.com/tinylab/tinycorrect) v0.1 - [urls pangu autocorrect]
+> Author: Jingqing 2351290287@qq.com
+> Date: 2023/6/17
+> Revisor: Falcon
+> Project: [RISC-V Linux 内核剖析](https://gitee.com/tinylab/riscv-linux)
+> Proposal: [【老师提案】RISC-V Generic library routines and assembly 技术调研、分析与优化 · Issue #I64R6O · 泰晓科技/RISCV-Linux - Gitee.com](https://gitee.com/tinylab/riscv-linux/issues/I64R6O)
+> Sponsor: PLCT Lab, ISCAS + +# 近半年 RISC-V 内核库中 str 和 mem 函数的优化内容总结 + +## 简介 + +本文结合近半年社区邮件列表中的相关 patchset,简要梳理了一下 RISC-V Linux 内核库函数的优化演进情况,主要涉及 Memory, String 操作两大部分。 + +## Memory + +### riscv: optimized mem* functions + +[riscv: optimized mem* functions][002] + +该组 patchset 对各种 mem 相关操作函数进行了优化,以下逐个分析。 + +#### memcpy + +主要是由“直接逐字节复制”转变为“先对齐再按字复制”。 + +1. 如果未启用高效非对齐访问(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS),则先在不改变 dest 和 src 相对距离的情况下,逐字节复制直到 dest 对齐在字边界上。 +2. 如果 `distance == 0` 说明 src 和 dest 两者已经对齐,直接进行(32 or 64 bits)字长复制。 +3. 如果 `distance != 0` 说明 dest 对齐后 src 相对字边界仍有 distance 字节的偏移:先将 src 回退到字边界,按字读取相邻的两个字,再通过移位拼接出目标字,从而仍然可以按字写入 dest。 + +```c ++void *__memcpy(void *dest, const void *src, size_t count) ++{ ++ union const_types s = { .as_u8 = src }; ++ union types d = { .as_u8 = dest }; ++ int distance = 0; ++ ++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { ++ if (count < MIN_THRESHOLD) ++ goto copy_remainder; ++ ++ /* Copy a byte at time until destination is aligned. */ ++ for (; d.as_uptr & WORD_MASK; count--) ++ *d.as_u8++ = *s.as_u8++; ++ ++ distance = s.as_uptr & WORD_MASK; ++ } ++ ++ if (distance) { ++ unsigned long last, next; ++ ++ /* ++ * s is distance bytes ahead of d, and d just reached ++ * the alignment boundary. Move s backward to word align it ++ * and shift data to compensate for distance, in order to do ++ * word-by-word copy. ++ */ ++ s.as_u8 -= distance; ++ ++ next = s.as_ulong[0]; ++ for (; count >= BYTES_LONG; count -= BYTES_LONG) { ++ last = next; ++ next = s.as_ulong[1]; ++ ++ d.as_ulong[0] = last >> (distance * 8) | ++ next << ((BYTES_LONG - distance) * 8); ++ ++ d.as_ulong++; ++ s.as_ulong++; ++ } ++ ++ /* Restore s with the original offset. */ ++ s.as_u8 += distance; ++ } else { ++ /* ++ * If the source and dest lower bits are the same, do a simple ++ * 32/64 bit wide copy. 
++ */ ++ for (; count >= BYTES_LONG; count -= BYTES_LONG) ++ *d.as_ulong++ = *s.as_ulong++; ++ } ++ ++copy_remainder: ++ while (count--) ++ *d.as_u8++ = *s.as_u8++; ++ ++ return dest; ++} ++EXPORT_SYMBOL(__memcpy); ++ ++void *memcpy(void *dest, const void *src, size_t count) __weak __alias(__memcpy); ++EXPORT_SYMBOL(memcpy); +``` + +#### memmove + +如果 dest 和 src 不重叠或者 `dest < src`,则直接调用 `__memcpy` 从前向后复制;否则(`dest > src` 且两者重叠)从尾部开始向前逐字节复制,避免尚未复制的源数据被覆盖。 + +```c ++void *__memmove(void *dest, const void *src, size_t count) ++{ ++ if (dest < src || src + count <= dest) ++ return __memcpy(dest, src, count); ++ ++ if (dest > src) { ++ const char *s = src + count; ++ char *tmp = dest + count; ++ ++ while (count--) ++ *--tmp = *--s; ++ } ++ return dest; ++} ++EXPORT_SYMBOL(__memmove); ++ ++void *memmove(void *dest, const void *src, size_t count) __weak __alias(__memmove); ++EXPORT_SYMBOL(memmove); +``` + +#### memset + +旧 memset:永远一次一个字节地填充。安全但是效率低。 + +修改后:也是采用对齐机制,先按字节填充,等到和最大填充单位的倍数对齐时按最大填充单位填入。 + +```c ++void *__memset(void *s, int c, size_t count) ++{ ++ union types dest = { .as_u8 = s }; ++ ++ if (count >= MIN_THRESHOLD) { ++ unsigned long cu = (unsigned long)c; ++ ++ /* Compose an ulong with 'c' repeated 4/8 times */ ++#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER ++ cu *= 0x0101010101010101UL; ++#else ++ cu |= cu << 8; ++ cu |= cu << 16; ++ /* Suppress warning on 32 bit machines */ ++ cu |= (cu << 16) << 16; ++#endif ++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { ++ /* ++ * Fill the buffer one byte at time until ++ * the destination is word aligned. 
++ */ ++ for (; count && dest.as_uptr & WORD_MASK; count--) ++ *dest.as_u8++ = c; ++ } ++ ++ /* Copy using the largest size allowed */ ++ for (; count >= BYTES_LONG; count -= BYTES_LONG) ++ *dest.as_ulong++ = cu; ++ } ++ ++ /* copy the remainder */ ++ while (count--) ++ *dest.as_u8++ = c; ++ ++ return s; ++} ++EXPORT_SYMBOL(__memset); ++ ++void *memset(void *s, int c, size_t count) __weak __alias(__memset); ++EXPORT_SYMBOL(memset); +``` + +### riscv: lib: optimize memcmp with ld insn + +[riscv: lib: optimize memcmp with ld insn][003] + +这笔优化发到了 v3,但是 Maintainer 反馈了一些编译问题,没有看到作者提交新的版本。 + +这笔优化的核心代码和解读如下(注:下面列出的 `sb` 存储序列是 memset 头尾字节填充的逻辑,看起来对应的是后文 [006] 的 memset 补丁,而不是 memcmp 的比较逻辑,请读者对照原补丁核对): + +旧代码: + +``` +sb a1, 0(t0) +addi t0, t0, 1 +bltu t0, a3, 5b +``` + +新代码: + +``` +/* fill head and tail with minimal branching */ +sb a1, 0(t0) +sb a1, -1(a3) +li a4, 2 +bgeu a4, a2, 6f + +sb a1, 1(t0) +sb a1, 2(t0) +sb a1, -2(a3) +sb a1, -3(a3) +li a4, 6 +bgeu a4, a2, 6f + +/* + * Adding additional detection to avoid + * redundant stores can lead + * to better performance + */ +sb a1, 3(t0) +sb a1, -4(a3) +li a4, 8 +bgeu a4, a2, 6f + +sb a1, 4(t0) +sb a1, -5(a3) +li a4, 10 +bgeu a4, a2, 6f + +sb a1, 5(t0) +sb a1, 6(t0) +sb a1, -6(a3) +sb a1, -7(a3) +li a4, 14 +bgeu a4, a2, 6f + +/* store the last byte */ +sb a1, 7(t0) +``` + +主要的改动如下: + +1. 将旧代码中逐字节存储的循环(`sb` / `addi t0, t0, 1` / `bltu`)替换为一系列展开的存储指令,用于同时从头部(基于 t0)和尾部(基于 a3)向中间填充。 +2. 添加了额外的条件检测和分支,以避免重复存储,这可能会提高性能。 +3. 添加了 `li a4, 2` 等加载常数的指令,用于和长度 a2 做条件比较。 +4. 使用 `6f`(向前引用标签 6)在长度已被覆盖时直接跳转到代码的结尾。 + +它的核心优化思路是用许多分支结构填充头尾,这样虽然可能有一部分存储冗余,但是因为存储指令之间没有依赖、可以并行执行,并且减少了跳转次数,所以提高了效率。 + +### RISC-V: Apply Zicboz to clear_page and memset + +[RISC-V: Apply Zicboz to clear_page and memset][004] + +引入 Zicboz 扩展后,可以用 `cbo.zero` 指令更快地清零按 Zicboz 块大小自然对齐的内存块。而 memset() 接收的是任意地址和大小的内存块,因此需要对用它来清空内存的路径进行专门的优化。 + +分析发现当输入的地址未对齐或者太小时,Zicboz 中的 memset 会显得效率低一些(多了几十条指令)。 + +1. 首先检查是否启用了 CONFIG_RISCV_ISA_ZICBOZ 来判断是否使用 Zicboz 扩展。如果不使用 Zicboz 扩展或者传入的参数不适合使用 Zicboz 扩展,则代码会跳转到 `.Ldo_memset` 标签处执行内存清零的逻辑。 +2. 如果使用 Zicboz 扩展进行内存清零,代码会将地址和长度进行对齐,并使用 Zicboz 扩展的指令进行内存清零操作。 +3. 
在进行 Zicboz 扩展内存清零时,如果还有一些字节无法使用 Zicboz 扩展一次性清零,则会使用 Duff's 设备来处理剩余的字节。 + +```c ++#ifdef CONFIG_RISCV_ISA_ZICBOZ ++ ALT_ZICBOZ("j .Ldo_memset", "nop") ++ /* ++ * t1 will be the Zicboz block size. ++ * Zero means we're not using Zicboz, and we don't when a1 != 0 ++ */ ++ li t1, 0 ++ bnez a1, .Ldo_memset ++ la a3, riscv_cboz_block_size ++ lw t1, 0(a3) ++ ++ /* ++ * Round to nearest Zicboz block-aligned address ++ * greater than or equal to the start address. ++ */ ++ addi a3, t1, -1 ++ not t2, a3 /* t2 is Zicboz block size mask */ ++ add a3, t0, a3 ++ and t3, a3, t2 /* t3 is Zicboz block aligned start */ ++ ++ /* Did we go too far or not have at least one block? */ ++ add a3, a0, a2 ++ and a3, a3, t2 ++ bgtu a3, t3, .Ldo_zero ++ li t1, 0 ++ j .Ldo_memset ++ ++.Ldo_zero: ++ /* Use Duff for initial bytes if there are any */ ++ bne t3, t0, .Ldo_memset ++ ++.Ldo_zero2: ++ /* Calculate end address */ ++ and a3, a2, t2 ++ add a3, t0, a3 ++ sub a4, a3, t0 ++ ++.Lzero_loop: ++ CBO_ZERO(t0) ++ add t0, t0, t1 ++ bltu t0, a3, .Lzero_loop ++ li t1, 0 /* We're done with Zicboz */ ++ ++ sub a2, a2, a4 /* Update count */ ++ sltiu a3, a2, 16 ++ bnez a3, .Lfinish ++ ++ /* t0 is Zicboz block size aligned, so it must be SZREG aligned */ ++ j .Ldo_duff3 ++#endif ++ +``` + +### RISC-V: Optimize memset for data sizes less than 16 bytes + +[RISC-V: Optimize memset for data sizes less than 16 bytes][006] ... 
+ +在上述 memset 优化的基础上继续进行。 + +大于等于 16 字节先对齐后按 16byte 倍数存储。对于尾部数据或小于 16 字节的数据,memset 使用字节存储,效率相对低。改进方案决定用许多分支结构填充头尾,这样虽然可能有一部分存储冗余,但是因为并行存储,减少跳转次数,提高了效率。 + +```c ++void *__memset(void *s, int c, size_t count) ++{ ++ union types dest = { .as_u8 = s }; ++ ++ if (count >= MIN_THRESHOLD) { ++ unsigned long cu = (unsigned long)c; ++ ++ /* Compose an ulong with 'c' repeated 4/8 times */ ++#ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER ++ cu *= 0x0101010101010101UL; ++#else ++ cu |= cu << 8; ++ cu |= cu << 16; ++ /* Suppress warning on 32 bit machines */ ++ cu |= (cu << 16) << 16;//8bits 的 c 复制 4 次来构造 unsigned long 的 cu ++#endif ++ if (!IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)) { ++ /* ++ * Fill the buffer one byte at time until ++ * the destination is word aligned. ++ */ ++ for (; count && dest.as_uptr & WORD_MASK; count--) ++ *dest.as_u8++ = c;//逐字节填充对应地址中的值=c ++ } ++ ++ /* Copy using the largest size allowed */ ++ for (; count >= BYTES_LONG; count -= BYTES_LONG) ++ *dest.as_ulong++ = cu;//BYTES_LONG 的整数倍部分复制为 cu ++ } ++ ++ /* copy the remainder */ ++ while (count--) ++ *dest.as_u8++ = c;//剩余值全部设置为 c ++ ++ return s; ++} ++EXPORT_SYMBOL(__memset); ++ ++void *memset(void *s, int c, size_t count) __weak __alias(__memset); ++EXPORT_SYMBOL(memset); +``` + +## String + +### Zbb string optimizations + +[Zbb string optimizations][001] + +主要是为 zbb 提供了通用的一些字符串支持,后续特定用法优化拓展需要单独实现。 + +- 为 Zbb 系统添加了允许未对齐访问的 strcmp,strncmp,strlen 以及生成相应 makefile 文件。 + +- 用位域而不是数字代替 CPU 的补丁拓展 errata-id 的宏定义,简化。 + + ```c + -#define CPUFEATURE_SVPBMT 0 + -#define CPUFEATURE_ZICBOM 1 + -#define CPUFEATURE_ZBB 2 + +#define CPUFEATURE_SVPBMT (1 << 0) + +#define CPUFEATURE_ZICBOM (1 << 1) + +#define CPUFEATURE_ZBB (1 << 2) + ``` + +### Zbb+ fast-unaligned string optimization + +[Zbb + fast-unaligned string optimization][005] ... 
+ +添加多个 strcmp 变体用于快速比较非对齐访问。优先使用效率高的优化变体,在无法生效的情况下退回到通用情况。 + +```c ++static bool __init_or_module cpufeature_probe_fast_unaligned(unsigned int stage) ++{ ++ int cpu; ++ ++ if (stage == RISCV_ALTERNATIVES_EARLY_BOOT) ++ return false; ++ ++ for_each_possible_cpu(cpu) { ++ long perf = per_cpu(misaligned_access_speed, cpu); ++ ++ if (perf != RISCV_HWPROBE_MISALIGNED_FAST) ++ return false; ++ } ++ ++ return true; ++} ++ +``` + +#### strcmp_zbb + +检查两个字符串是否对齐到 SZREG 的边界。如果是,则以 SZREG 为单位比较两个字符串中的内容。如果不是,则按字节读取。 + +```c ++/* ++ * Variant of strcmp using the ZBB extension if available ++ */ ++#ifdef CONFIG_RISCV_ISA_ZBB ++strcmp_zbb: ++ ++.option push ++.option arch,+zbb ++ ++ /* ++ * Returns ++ * a0 - comparison result, value like strcmp ++ * ++ * Parameters ++ * a0 - string1 ++ * a1 - string2 ++ * ++ * Clobbers ++ * t0, t1, t2, t3, t4, t5 ++ */ ++ ++ or t2, a0, a1 ++ li t4, -1 ++ and t2, t2, SZREG-1 ++ bnez t2, 3f ++ ++ /* Main loop for aligned string. */ ++ .p2align 3 ++1: ++ REG_L t0, 0(a0) ++ REG_L t1, 0(a1) ++ orc.b t3, t0 ++ bne t3, t4, 2f ++ addi a0, a0, SZREG ++ addi a1, a1, SZREG ++ beq t0, t1, 1b ++ ++ /* ++ * Words don't match, and no null byte in the first ++ * word. Get bytes in big-endian order and compare. ++ */ ++#ifndef CONFIG_CPU_BIG_ENDIAN ++ rev8 t0, t0 ++ rev8 t1, t1 ++#endif ++ ++ /* Synthesize (t0 >= t1) ? 1 : -1 in a branchless sequence. */ ++ sltu a0, t0, t1 ++ neg a0, a0 ++ ori a0, a0, 1 ++ ret ++ ++2: ++ /* ++ * Found a null byte. ++ * If words don't match, fall back to simple loop. ++ */ ++ bne t0, t1, 3f ++ ++ /* Otherwise, strings are equal. */ ++ li a0, 0 ++ ret ++ ++ /* Simple loop for misaligned strings. 
*/ ++ .p2align 3 ++3: ++ lbu t0, 0(a0) ++ lbu t1, 0(a1) ++ addi a0, a0, 1 ++ addi a1, a1, 1 ++ bne t0, t1, 4f ++ bnez t0, 3b ++ ++4: ++ sub a0, t0, t1 ++ ret ++ ++.option pop ++#endif +``` + +#### strlen_zbb + +启用 CONFIG_RISCV_ISA_ZBB 的前提下,移位对齐字符后从头开始以 SZREG 为单位读取,并剔除第一个和最后一个机器字头尾的空字符。最后计算结果求和。 + +```c ++#ifdef CONFIG_RISCV_ISA_ZBB ++strlen_zbb: ++ ++#ifdef CONFIG_CPU_BIG_ENDIAN ++# define CZ clz ++# define SHIFT sll ++#else ++# define CZ ctz ++# define SHIFT srl ++#endif ++ ++.option push ++.option arch,+zbb ++ ++ /* ++ * Returns ++ * a0 - string length ++ * ++ * Parameters ++ * a0 - String to measure ++ * ++ * Clobbers ++ * t0, t1, t2, t3 ++ */ ++ ++ /* Number of irrelevant bytes in the first word. */ ++ andi t2, a0, SZREG-1 ++ ++ /* Align pointer. */ ++ andi t0, a0, -SZREG ++ ++ li t3, SZREG ++ sub t3, t3, t2 ++ slli t2, t2, 3 ++ ++ /* Get the first word. */ ++ REG_L t1, 0(t0) ++ ++ /* ++ * Shift away the partial data we loaded to remove the irrelevant bytes ++ * preceding the string with the effect of adding NUL bytes at the ++ * end of the string's first word. ++ */ ++ SHIFT t1, t1, t2 ++ ++ /* Convert non-NUL into 0xff and NUL into 0x00. */ ++ orc.b t1, t1 ++ ++ /* Convert non-NUL into 0x00 and NUL into 0xff. */ ++ not t1, t1 ++ ++ /* ++ * Search for the first set bit (corresponding to a NUL byte in the ++ * original chunk). ++ */ ++ CZ t1, t1 ++ ++ /* ++ * The first chunk is special: compare against the number ++ * of valid bytes in this chunk. ++ */ ++ srli a0, t1, 3 ++ bgtu t3, a0, 3f ++ ++ /* Prepare for the word comparison loop. */ ++ addi t2, t0, SZREG ++ li t3, -1 ++ ++ /* ++ * Our critical loop is 4 instructions and processes data in ++ * 4 byte or 8 byte chunks. ++ */ ++ .p2align 3 ++1: ++ REG_L t1, SZREG(t0) ++ addi t0, t0, SZREG ++ orc.b t1, t1 ++ beq t1, t3, 1b ++2: ++ not t1, t1 ++ CZ t1, t1 ++ ++ /* Get number of processed words. */ ++ sub t2, t0, t2 ++ ++ /* Add number of characters in the first word. 
*/ ++ add a0, a0, t2 ++ srli t1, t1, 3 ++ ++ /* Add number of characters in the last word. */ ++ add a0, a0, t1 ++3: ++ ret ++ ++.option pop ++#endif +``` + +## 总结 + +以上梳理了 memory 和 strcmp 相关优化代码,可以发现: + +memory 相关优化方法主要有两点:通过连续存储减少条件分支及其跳转次数,减少判断上的时间;以及通过对齐机制把内存操作函数拆为单位块的对齐部分和单独处理的非对齐部分,批量操作一定程度上提高效率。 + +string 对于 zbb 支持部分的函数优化,主要是先提供通用支持未对齐方式的字符串函数以及方便后续添加优化函数的框架,之后又提出了优化对齐方式下按 SZREG 块单位执行函数的优化方案。当优化方案不适用时再使用通用函数,以此优化部分情况下的 zbb 中 str 相关函数的使用效率。 + +接下来将按照 Memory, String, 数据运算,其他库函数等几个方面系统地展开对 RISC-V Linux 内核库函数的解读,敬请期待。 + +## 参考资料 + +- [Zbb string optimizations][001] +- [riscv: optimized mem* functions][002] +- [riscv: lib: optimize memcmp with ld insn][003] +- [RISC-V: Apply Zicboz to clear_page and memset][004] +- [Zbb+ fast-unaligned string optimization][005] +- [RISC-V: Optimize memset for data sizes less than 16 bytes][006] + +[001]: https://lore.kernel.org/all/20230113212301.3534711-1-heiko@sntech.de/ +[002]: https://lore.kernel.org/linux-riscv/20210929172234.31620-1-mcroce@linux.microsoft.com/ +[003]: https://lore.kernel.org/linux-riscv/20220906115359.173660-1-zouyipeng@huawei.com/ +[004]: https://lore.kernel.org/linux-riscv/20221027130247.31634-1-ajones@ventanamicro.com/ +[005]: https://lore.kernel.org/linux-riscv/20230113212351.3534769-1-heiko@sntech.de/ +[006]: https://lore.kernel.org/linux-riscv/20230511012604.3222-1-zhang_fei_0403@163.com/