From fc1fb6f318ed8490200b2582cd1af075f34c9476 Mon Sep 17 00:00:00 2001 From: Mauri de Souza Meneguzzo Date: Wed, 27 May 2026 11:50:11 -0300 Subject: [PATCH] internal/bytealg: unroll compare_arm64.s chunk16_loop to 32 bytes/iter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit goos: darwin goarch: arm64 pkg: bytes cpu: Apple M3 Pro │ old.txt │ new.txt │ │ sec/op │ sec/op vs base │ CompareBytesBigUnaligned/offset=1-11 26.44µ ± 0% 22.91µ ± 1% -13.36% (p=0.000 n=10) CompareBytesBigUnaligned/offset=2-11 26.43µ ± 0% 22.90µ ± 0% -13.33% (p=0.000 n=10) CompareBytesBigUnaligned/offset=3-11 26.42µ ± 1% 22.90µ ± 1% -13.32% (p=0.000 n=7+10) CompareBytesBigUnaligned/offset=4-11 22.92µ ± ∞ ¹ geomean 26.43µ 22.91µ -13.34% ¹ need >= 6 samples for confidence interval at level 0.95 │ old.txt │ new.txt │ │ B/s │ B/s vs base │ CompareBytesBigUnaligned/offset=1-11 36.94Gi ± 0% 42.63Gi ± 1% +15.41% (p=0.000 n=10) CompareBytesBigUnaligned/offset=2-11 36.96Gi ± 0% 42.64Gi ± 0% +15.39% (p=0.000 n=10) CompareBytesBigUnaligned/offset=3-11 36.97Gi ± 1% 42.65Gi ± 1% +15.37% (p=0.000 n=7+10) CompareBytesBigUnaligned/offset=4-11 42.60Gi ± ∞ ¹ geomean 36.95Gi 42.63Gi +15.39% --- src/internal/bytealg/compare_arm64.s | 31 ++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/src/internal/bytealg/compare_arm64.s b/src/internal/bytealg/compare_arm64.s index cc02c464e8b274..852a535779b985 100644 --- a/src/internal/bytealg/compare_arm64.s +++ b/src/internal/bytealg/compare_arm64.s @@ -39,11 +39,17 @@ TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0 CSEL LT, R3, R1, R6 // R6 is min(R1, R3) CBZ R6, samebytes - BIC $0xf, R6, R10 - CBZ R10, small // length < 16 - ADD R0, R10 // end of chunk16 - // length >= 16 -chunk16_loop: + BIC $0x1f, R6, R10 + CBZ R10, chunk16 // length < 32, try single chunk16 + ADD R0, R10 // end of chunk32 + // length >= 32 +chunk32_loop: + LDP.P 16(R0), (R4, R8) + LDP.P 16(R2), (R5, R9) + CMP R4, R5 + BNE cmp + CMP R8, R9 + BNE cmpnext LDP.P 16(R0), (R4, R8) LDP.P 16(R2), (R5, R9) CMP R4, R5 @@ -51,7 +57,20 @@ chunk16_loop: CMP R8, R9 BNE cmpnext CMP R10, R0 - BNE chunk16_loop + BNE chunk32_loop + AND $0x1f, R6, R6 // remaining 0-31 bytes + CBZ R6, samebytes + // handle remaining 0-31 bytes: one possible 16-byte chunk then tail + TBZ $4, R6, small_direct +chunk16: + TBZ $4, R6, small // length < 16 + LDP.P 16(R0), (R4, R8) + LDP.P 16(R2), (R5, R9) + CMP R4, R5 + BNE cmp + CMP R8, R9 + BNE cmpnext +small_direct: AND $0xf, R6, R6 CBZ R6, samebytes SUBS $8, R6