From dcf9dfd91398d35efd67f872638d6a46da1858cb Mon Sep 17 00:00:00 2001 From: Brendan Fletcher Date: Wed, 16 Apr 2025 00:08:28 -0400 Subject: [PATCH 1/4] Improve average speed of count-leading-zero libcalls --- src/crt/bctlz.src | 18 ------ src/crt/ctlz.src | 101 ++++++++++++++++++++++++++++++++ src/crt/i48ctlz.src | 31 ---------- src/crt/ictlz.src | 17 ------ src/crt/lctlz.src | 20 ------- src/crt/llctlz.src | 26 -------- src/crt/sctlz.src | 17 ------ src/libc/include/ez80_builtin.h | 10 +++- 8 files changed, 109 insertions(+), 131 deletions(-) delete mode 100644 src/crt/bctlz.src create mode 100644 src/crt/ctlz.src delete mode 100644 src/crt/i48ctlz.src delete mode 100644 src/crt/ictlz.src delete mode 100644 src/crt/lctlz.src delete mode 100644 src/crt/llctlz.src delete mode 100644 src/crt/sctlz.src diff --git a/src/crt/bctlz.src b/src/crt/bctlz.src deleted file mode 100644 index 8a9579b69..000000000 --- a/src/crt/bctlz.src +++ /dev/null @@ -1,18 +0,0 @@ - assume adl=1 - - section .text - public __bctlz -__bctlz: - push hl - - scf - sbc hl, hl - -.loop: - rla - inc hl - jr nc, .loop - - ld a, l - pop hl - ret diff --git a/src/crt/ctlz.src b/src/crt/ctlz.src new file mode 100644 index 000000000..e9b41285e --- /dev/null +++ b/src/crt/ctlz.src @@ -0,0 +1,101 @@ + assume adl=1 + + section .text + public __bctlz +__bctlz: + cp a, 010h + jr c, .low4 + cp a, 040h + jr c, .low6 + add a, a + sbc a, a + inc a + ret +.low6: + add a, -020h + sbc a, a + add a, 3 + ret +.low4: + cp a, 4 + jr c, .low2 + add a, -8 + sbc a, a + add a, 5 + ret +.low2: + add a, -3 + cpl + adc a, 6 + ret + + section .text + public __sctlz +__sctlz: + ld a, h + or a, a + jr nz, __bctlz + ld a, l + require __sctlz.hijack + + section .text + private __sctlz.hijack +__sctlz.hijack: + call __bctlz + add a, 8 + ret + + section .text + public __ictlz +__ictlz: + dec sp + push hl + inc sp + pop af + or a, a + jr nz, __bctlz + or a, h + jr nz, __sctlz.hijack + ld a, l + call __bctlz + add a, 16 + ret + + section .text + public __lctlz +__lctlz: + ld a, e + or a, a + jr nz, __bctlz + call __ictlz + add a, 8 + ret + + section .text + public __llctlz +__llctlz: + ld a, b + or a, a + jr nz, __bctlz + or a, c + jr nz, __sctlz.hijack + call __i48ctlz + add a, 16 + ret + + section .text + public __i48ctlz +__i48ctlz: + ex de, hl + add hl, de + or a, a + sbc hl, de + jr z, .low + call __ictlz + ex de, hl + ret +.low: + ex de, hl + call __ictlz + add a, 24 + ret diff --git a/src/crt/i48ctlz.src b/src/crt/i48ctlz.src deleted file mode 100644 index c9a333220..000000000 --- a/src/crt/i48ctlz.src +++ /dev/null @@ -1,31 +0,0 @@ -; Performs 48-bit count leading zeros -; -; Returns: -; a = number of leading 0's in ude:uhl - - assume adl=1 - - section .text - public __i48ctlz -__i48ctlz: - push de - ex de, hl - - ; Check if upper half is non-zero - add hl, de - xor a, a - sbc hl, de - jr nz, .nonzero - - ; Skip to lower half - add hl, de - sub a, -23 ; Set carry to prevent infinite loop -.loop: - inc a -.nonzero: - adc hl, hl - jr nc, .loop - - ex de, hl - pop de - ret \ No newline at end of file diff --git a/src/crt/ictlz.src b/src/crt/ictlz.src deleted file mode 100644 index 34dcb3ed1..000000000 --- a/src/crt/ictlz.src +++ /dev/null @@ -1,17 +0,0 @@ - assume adl=1 - - section .text - public __ictlz -__ictlz: - push hl - - scf - sbc a, a - -.loop: - adc hl, hl - inc a - jr nc, .loop - - pop hl - ret diff --git a/src/crt/lctlz.src b/src/crt/lctlz.src deleted file mode 100644 index 9943bad91..000000000 --- a/src/crt/lctlz.src +++ /dev/null @@ -1,20 +0,0 @@ - assume adl=1 - - section .text - public __lctlz -__lctlz: - push de - push hl - - scf - sbc a, a - -.loop: - adc hl, hl - rl e - inc a - jr nc, .loop - - pop hl - pop de - ret diff --git a/src/crt/llctlz.src b/src/crt/llctlz.src deleted file mode 100644 index ad43d18ae..000000000 --- a/src/crt/llctlz.src +++ /dev/null @@ -1,26 +0,0 @@ - assume adl=1 - - section .text - public __llctlz -__llctlz: - push bc - push de - push hl - - scf - sbc a, a - -.loop: - adc hl, hl - ex de, hl - adc hl, hl - ex de, hl - rl c - rl b - inc a - jr nc, .loop - - pop hl - pop de - pop bc - ret diff --git a/src/crt/sctlz.src b/src/crt/sctlz.src deleted file mode 100644 index 96815e2c9..000000000 --- a/src/crt/sctlz.src +++ /dev/null @@ -1,17 +0,0 @@ - assume adl=1 - - section .text - public __sctlz -__sctlz: - push hl - - scf - sbc a, a - -.loop: - adc.s hl, hl - inc a - jr nc, .loop - - pop hl - ret diff --git a/src/libc/include/ez80_builtin.h b/src/libc/include/ez80_builtin.h index 031032268..3b43dd9ed 100644 --- a/src/libc/include/ez80_builtin.h +++ b/src/libc/include/ez80_builtin.h @@ -11,13 +11,19 @@ extern "C" { #if __has_builtin(__builtin_clzc) #define __ez80_clzc __builtin_clzc #else -unsigned char __ez80_clzc(unsigned char) __NOEXCEPT_CONST; +static inline __attribute__((__always_inline__)) +int __ez80_clzc(unsigned char __x) __NOEXCEPT_CONST { + return (unsigned char)(__builtin_clzs(__x) - 8); +} #endif #if __has_builtin(__builtin_clzi48) #define __ez80_clzi48 __builtin_clzi48 #else -unsigned char __ez80_clzi48(unsigned __int48) __NOEXCEPT_CONST; +static inline __attribute__((__always_inline__)) +int __ez80_clzi48(unsigned __int48 __x) __NOEXCEPT_CONST { + return (unsigned char)(__builtin_clzll(__x) - 16); +} #endif #if __has_builtin(__builtin_ctzc) From 4666037d654dca3289e81f27d7ec3658fd67972c Mon Sep 17 00:00:00 2001 From: Brendan Fletcher Date: Sat, 19 Apr 2025 19:03:20 -0400 Subject: [PATCH 2/4] Fix return types of ez80_builtin functions, optimize some implementations --- src/libc/ez80_builtin.src | 219 +++++++++++------------- src/libc/include/ez80_builtin.h | 20 ++- test/standalone/ez80_builtin/src/main.c | 62 +++---- 3 files changed, 149 insertions(+), 152 deletions(-) diff --git a/src/libc/ez80_builtin.src b/src/libc/ez80_builtin.src index 209ca9eca..004e724c3 100644 --- a/src/libc/ez80_builtin.src +++ b/src/libc/ez80_builtin.src @@ -1,31 +1,5 @@ assume adl=1 -;------------------------------------------------------------------------------- - - section .text - - public ___ez80_clzc -___ez80_clzc: - ld hl, 3 - add hl, sp - ld a, (hl) - jp __bctlz - -;------------------------------------------------------------------------------- - - section .text - - public ___ez80_clzi48 -___ez80_clzi48: - ld hl, 6 - add hl, sp - ld de, (hl) - dec hl - dec hl - dec hl - ld hl, (hl) - jp __i48ctlz - ;------------------------------------------------------------------------------- section .text @@ -35,13 +9,15 @@ ___ez80_ctzc: ; unoptimized ld hl, 3 add hl, sp - ld a, (hl) - ld l, a - cpl - dec l + ld l, (hl) + xor a, a + sub a, l and a, l - ld l, a - jp __bpopcnt + call __bctlz + bit 3, a + ret nz + xor a, 7 + ret ;------------------------------------------------------------------------------- @@ -50,19 +26,23 @@ ___ez80_ctzc: public ___ez80_ctzi48 ___ez80_ctzi48: ; unoptimized + ld hl, 3 + add hl, sp + ld bc, (hl) + inc hl + inc hl + inc hl + ld iy, (hl) + sbc hl, hl + add hl, bc + lea de, iy + call __i48neg + call __i48and + call __i48ctlz + cpl + add a, 48 + ret p ld a, 48 - ld iy, 0 - add iy, sp - ld hl, (iy + 3) - ld de, (iy + 6) - call __i48cmpzero - ret z - ld bc, (iy + 3) - ld iy, (iy + 6) - call __i48sub_1 - call __i48xor - call __i48popcnt - dec a ret ;------------------------------------------------------------------------------- @@ -74,13 +54,14 @@ ___ez80_ffsc: ; unoptimized ld hl, 3 add hl, sp - ld a, (hl) - or a, a - ret z - ld l, a - dec l - xor a, l - jp __bpopcnt + ld l, (hl) + xor a, a + sub a, l + and a, l + call __bctlz + cpl + add a, 9 + ret ;------------------------------------------------------------------------------- @@ -92,21 +73,21 @@ ___ez80_ffss: ld hl, 3 add hl, sp ld hl, (hl) - xor a, a - or a, h - or a, l - ret z + ; HL & -HL ld b, h ld c, l - dec bc - ; HL ^ (HL - 1) + sbc hl, hl + sbc hl, bc ld a, h - xor a, b + and a, b ld h, a ld a, l - xor a, c + and a, c ld l, a - jp __spopcnt + call __sctlz + cpl + add a, 17 + ret ;------------------------------------------------------------------------------- @@ -115,18 +96,22 @@ ___ez80_ffss: public ___ez80_ffsi48 ___ez80_ffsi48: ; unoptimized - xor a, a - ld iy, 0 - add iy, sp - ld hl, (iy + 3) - ld de, (iy + 6) - call __i48cmpzero - ret z - ld bc, (iy + 3) - ld iy, (iy + 6) - call __i48sub_1 - call __i48xor - jp __i48popcnt + ld hl, 3 + add hl, sp + ld bc, (hl) + inc hl + inc hl + inc hl + ld iy, (hl) + sbc hl, hl + add hl, bc + lea de, iy + call __i48neg + call __i48and + call __i48ctlz + cpl + add a, 49 + ret ;------------------------------------------------------------------------------- @@ -134,17 +119,15 @@ ___ez80_ffsi48: public ___ez80_clrsbc ___ez80_clrsbc: - ; unoptimized ld hl, 3 add hl, sp ld a, (hl) - rlca - sbc hl, hl + add a, a + ld l, a + sbc a, a xor a, l - rrca - call __bctlz - dec a - ret + set 0, a + jp __bctlz ;------------------------------------------------------------------------------- @@ -156,12 +139,10 @@ ___ez80_clrsbs: ld hl, 3 add hl, sp ld hl, (hl) - ld a, h - rla + add.s hl, hl call c, __snot - call __sctlz - dec a - ret + set 0, l + jp __sctlz ;------------------------------------------------------------------------------- @@ -169,22 +150,20 @@ ___ez80_clrsbs: public ___ez80_clrsbi48 ___ez80_clrsbi48: - ; unoptimized - ld hl, 8 + ld hl, 6 add hl, sp - ld a, (hl) - dec hl - dec hl ld de, (hl) dec hl dec hl dec hl ld hl, (hl) - rla + add hl, hl + ex de, hl + adc hl, hl + ex de, hl call c, __i48not - call __i48ctlz - dec a - ret + set 0, l + jp __i48ctlz ;------------------------------------------------------------------------------- @@ -207,13 +186,14 @@ ___ez80_parityc: public ___ez80_paritys ___ez80_paritys: - ; unoptimized ld hl, 3 add hl, sp - ld hl, (hl) - call __spopcnt - rrca - sbc a, a + ld a, (hl) + inc hl + xor a, (hl) + ld a, 0 + ret pe + dec a ret ;------------------------------------------------------------------------------- @@ -222,17 +202,17 @@ ___ez80_paritys: public ___ez80_parityi48 ___ez80_parityi48: - ; unoptimized - ld hl, 6 + ld hl, 3 add hl, sp - ld de, (hl) - dec hl - dec hl - dec hl - ld hl, (hl) - call __i48popcnt - rrca - sbc a, a + ld a, (hl) + ld b, 5 +.loop: + inc hl + xor a, (hl) + djnz .loop + ld a, b + ret pe + dec a ret ;------------------------------------------------------------------------------- @@ -278,14 +258,16 @@ ___ez80_popcounti48: public ___ez80_bswap24 ___ez80_bswap24: - ; unoptimized - ld iy, 0 - add iy, sp - ld h, (iy + 3) - ld l, (iy + 5) - ld (iy + 3), l - ld (iy + 5), h - ld hl, (iy + 3) + ld hl, 1 + add hl, sp + ld de, (hl) + inc hl + inc hl + inc hl + ld d, (hl) + inc hl + ld e, (hl) + ex de, hl ret ;------------------------------------------------------------------------------- @@ -424,9 +406,8 @@ ___ez80_rotateright48: extern __snot extern __i48not - extern __i48xor - extern __i48cmpzero - extern __i48sub_1 + extern __i48and + extern __i48neg extern __bctlz extern __sctlz diff --git a/src/libc/include/ez80_builtin.h b/src/libc/include/ez80_builtin.h index 3b43dd9ed..b43019997 100644 --- a/src/libc/include/ez80_builtin.h +++ b/src/libc/include/ez80_builtin.h @@ -12,102 +12,118 @@ extern "C" { #define __ez80_clzc __builtin_clzc #else static inline __attribute__((__always_inline__)) -int __ez80_clzc(unsigned char __x) __NOEXCEPT_CONST { +unsigned char __ez80_clzc(unsigned char __x) __NOEXCEPT_CONST { return (unsigned char)(__builtin_clzs(__x) - 8); } +#define __ez80_clzc(x) ((int)__ez80_clzc(x)) #endif #if __has_builtin(__builtin_clzi48) #define __ez80_clzi48 __builtin_clzi48 #else static inline __attribute__((__always_inline__)) -int __ez80_clzi48(unsigned __int48 __x) __NOEXCEPT_CONST { +unsigned char __ez80_clzi48(unsigned __int48 __x) __NOEXCEPT_CONST { return (unsigned char)(__builtin_clzll(__x) - 16); } +#define __ez80_clzi48(x) ((int)__ez80_clzi48(x)) #endif #if __has_builtin(__builtin_ctzc) #define __ez80_ctzc __builtin_ctzc #else unsigned char __ez80_ctzc(unsigned char) __NOEXCEPT_CONST; +#define __ez80_ctzc(x) ((int)__ez80_ctzc(x)) #endif #if __has_builtin(__builtin_ctzi48) #define __ez80_ctzi48 __builtin_ctzi48 #else unsigned char __ez80_ctzi48(unsigned __int48) __NOEXCEPT_CONST; +#define __ez80_ctzi48(x) ((int)__ez80_ctzi48(x)) #endif #if __has_builtin(__builtin_ffsc) #define __ez80_ffsc __builtin_ffsc #else unsigned char __ez80_ffsc(unsigned char) __NOEXCEPT_CONST; +#define __ez80_ffsc(x) ((int)__ez80_ffsc(x)) #endif #if __has_builtin(__builtin_ffss) #define __ez80_ffss __builtin_ffss #else unsigned char __ez80_ffss(unsigned short) __NOEXCEPT_CONST; +#define __ez80_ffss(x) ((int)__ez80_ffss(x)) #endif #if __has_builtin(__builtin_ffsi48) #define __ez80_ffsi48 __builtin_ffsi48 #else unsigned char __ez80_ffsi48(unsigned __int48) __NOEXCEPT_CONST; +#define __ez80_ffsi48(x) ((int)__ez80_ffsi48(x)) #endif #if __has_builtin(__builtin_clrsbc) #define __ez80_clrsbc __builtin_clrsbc #else unsigned char __ez80_clrsbc(unsigned char) __NOEXCEPT_CONST; +#define __ez80_clrsbc(x) ((int)__ez80_clrsbc(x)) #endif #if __has_builtin(__builtin_clrsbs) #define __ez80_clrsbs __builtin_clrsbs #else unsigned char __ez80_clrsbs(unsigned short) __NOEXCEPT_CONST; +#define __ez80_clrsbs(x) ((int)__ez80_clrsbs(x)) #endif #if __has_builtin(__builtin_clrsbi48) #define __ez80_clrsbi48 __builtin_clrsbi48 #else unsigned char __ez80_clrsbi48(unsigned __int48) __NOEXCEPT_CONST; +#define __ez80_clrsbi48(x) ((int)__ez80_clrsbi48(x)) #endif #if __has_builtin(__builtin_parityc) #define __ez80_parityc __builtin_parityc #else bool __ez80_parityc(unsigned char) __NOEXCEPT_CONST; +#define __ez80_parityc(x) ((int)__ez80_parityc(x)) #endif #if __has_builtin(__builtin_paritys) #define __ez80_paritys __builtin_paritys #else bool __ez80_paritys(unsigned short) __NOEXCEPT_CONST; +#define __ez80_paritys(x) ((int)__ez80_paritys(x)) #endif #if __has_builtin(__builtin_parityi48) #define __ez80_parityi48 __builtin_parityi48 #else bool __ez80_parityi48(unsigned __int48) __NOEXCEPT_CONST; +#define __ez80_parityi48(x) ((int)__ez80_parityi48(x)) #endif #if __has_builtin(__builtin_popcountc) #define __ez80_popcountc __builtin_popcountc #else unsigned char __ez80_popcountc(unsigned char) __NOEXCEPT_CONST; +#define __ez80_popcountc(x) ((int)__ez80_popcountc(x)) #endif #if __has_builtin(__builtin_popcounts) #define __ez80_popcounts __builtin_popcounts #else unsigned char __ez80_popcounts(unsigned short) __NOEXCEPT_CONST; +#define __ez80_popcounts(x) ((int)__ez80_popcounts(x)) #endif #if __has_builtin(__builtin_popcounti48) #define __ez80_popcounti48 __builtin_popcounti48 #else unsigned char __ez80_popcounti48(unsigned __int48) __NOEXCEPT_CONST; +#define __ez80_popcounti48(x) ((int)__ez80_popcounti48(x)) #endif #if __has_builtin(__builtin_bswap24) diff --git a/test/standalone/ez80_builtin/src/main.c b/test/standalone/ez80_builtin/src/main.c index db5101213..30e97e53f 100644 --- a/test/standalone/ez80_builtin/src/main.c +++ b/test/standalone/ez80_builtin/src/main.c @@ -105,11 +105,11 @@ __attribute__((__unused__)) static uint64_t rand64(void) { } static int test_clzc(void) { - unsigned char truth, guess; + int truth, guess; unsigned char input; for (int i = 0; i < 256; i ++) { input = (unsigned char)i; - truth = (input == 0) ? 8 : (unsigned char)(__builtin_clz((unsigned int)input) - 16); + truth = (input == 0) ? 8 : __builtin_clz((unsigned int)input) - 16; guess = __ez80_clzc(input); CMP("%d", input, truth, guess); } @@ -117,11 +117,11 @@ static int test_clzc(void) { } static int test_ctzc(void) { - unsigned char truth, guess; + int truth, guess; unsigned char input; for (int i = 0; i < 256; i++) { input = (unsigned char)i; - truth = (input == 0) ? 8 : (unsigned char)__builtin_ctz((unsigned int)input); + truth = (input == 0) ? 8 : __builtin_ctz((unsigned int)input); guess = __ez80_ctzc(input); CMP("%d", input, truth, guess); } @@ -129,11 +129,11 @@ static int test_ctzc(void) { } static int test_ffsc(void) { - unsigned char truth, guess; + int truth, guess; unsigned char input; for (int i = 0; i < 256; i++) { input = (unsigned char)i; - truth = (unsigned char)__builtin_ffs(input); + truth = __builtin_ffs(input); guess = __ez80_ffsc(input); CMP("%d", input, truth, guess); } @@ -141,11 +141,11 @@ static int test_ffsc(void) { } static int test_clrsbc(void) { - unsigned char truth, guess; + int truth, guess; unsigned char input; for (int i = 0; i < 256; i++) { input = (unsigned char)i; - truth = (unsigned char)__builtin_clrsb((signed int)((signed char)input)) - 16; + truth = __builtin_clrsb((signed int)((signed char)input)) - 16; guess = __ez80_clrsbc(input); CMP("%d", input, truth, guess); } @@ -153,11 +153,11 @@ static int test_clrsbc(void) { } static int test_parityc(void) { - unsigned char truth, guess; + int truth, guess; unsigned char input; for (int i = 0; i < 256; i++) { input = (unsigned char)i; - truth = (unsigned char)__builtin_parity((unsigned int)input); + truth = __builtin_parity((unsigned int)input); guess = __ez80_parityc(input); CMP("%d", input, truth, guess); } @@ -165,11 +165,11 @@ static int test_parityc(void) { } static int test_popcountc(void) { - unsigned char truth, guess; + int truth, guess; unsigned char input; for (int i = 0; i < 256; i++) { input = (unsigned char)i; - truth = (unsigned char)__builtin_popcount((unsigned int)input); + truth = __builtin_popcount((unsigned int)input); guess = __ez80_popcountc(input); CMP("%d", input, truth, guess); } @@ -177,7 +177,7 @@ static int test_popcountc(void) { } static int test_ffss(void) { - unsigned char truth, guess; + int truth, guess; unsigned short input; CMP("%d", 0, 0, __ez80_ffss(0)); CMP("%d", 1, 1, __ez80_ffss(1)); @@ -186,7 +186,7 @@ static int test_ffss(void) { CMP("%d", USHRT_MAX, 1, __ez80_ffss(USHRT_MAX)); for (int i = 0; i < RANDOM_TEST_COUNT; i++) { input = rand16(); - truth = (unsigned char)__builtin_ffs((int)input); + truth = __builtin_ffs((int)input); guess = __ez80_ffss(input); CMP("%d", input, truth, guess); } @@ -194,7 +194,7 @@ static int test_ffss(void) { } static int test_clrsbs(void) { - unsigned char truth, guess; + int truth, guess; unsigned short input; CMP("%d", 0, 15, __ez80_clrsbs(0)); CMP("%d", 1, 14, __ez80_clrsbs(1)); @@ -203,7 +203,7 @@ static int test_clrsbs(void) { CMP("%d", USHRT_MAX, 15, __ez80_clrsbs(USHRT_MAX)); for (int i = 0; i < RANDOM_TEST_COUNT; i++) { input = rand16(); - truth = (unsigned char)__builtin_clrsb((signed int)((signed short)input)) - 8; + truth = __builtin_clrsb((signed int)((signed short)input)) - 8; guess = __ez80_clrsbs(input); CMP("%d", input, truth, guess); } @@ -211,7 +211,7 @@ static int test_clrsbs(void) { } static int test_popcounts(void) { - unsigned char truth, guess; + int truth, guess; unsigned short input; CMP("%d", 0, 0, __ez80_popcounts(0)); CMP("%d", 1, 1, __ez80_popcounts(1)); @@ -220,7 +220,7 @@ static int test_popcounts(void) { CMP("%d", USHRT_MAX, 16, __ez80_popcounts(USHRT_MAX)); for (int i = 0; i < RANDOM_TEST_COUNT; i++) { input = rand16(); - truth = (unsigned char)__builtin_popcount((unsigned int)input); + truth = __builtin_popcount((unsigned int)input); guess = __ez80_popcounts(input); CMP("%d", input, truth, guess); } @@ -228,7 +228,7 @@ static int test_popcounts(void) { } static int test_paritys(void) { - unsigned char truth, guess; + int truth, guess; unsigned short input; CMP("%d", 0, false, __ez80_paritys(0)); CMP("%d", 1, true, __ez80_paritys(1)); @@ -237,7 +237,7 @@ static int test_paritys(void) { CMP("%d", USHRT_MAX, false, __ez80_paritys(USHRT_MAX)); for (int i = 0; i < RANDOM_TEST_COUNT; i++) { input = rand16(); - truth = __ez80_paritys(input) % 2; + truth = __ez80_popcounts(input) % 2; guess = __ez80_paritys(input); CMP("%d", input, (int)truth, (int)guess); } @@ -245,7 +245,7 @@ static int test_paritys(void) { } static int test_clzi48(void) { - unsigned char truth, guess; + int truth, guess; uint48_t input; CMP("%012llX", (uint64_t)0, 48, __ez80_clzi48((uint48_t)0)); CMP("%012llX", (uint64_t)1, 47, __ez80_clzi48((uint48_t)1)); @@ -254,7 +254,7 @@ static int test_clzi48(void) { CMP("%012llX", (uint64_t)UINT48_MAX, 0, __ez80_clzi48(UINT48_MAX)); for (int i = 0; i < RANDOM_TEST_COUNT; i++) { input = rand48(); - truth = (input == 0) ? 48 : (unsigned char)(__builtin_clzll((unsigned long long)input) - 16); + truth = (input == 0) ? 48 : __builtin_clzll((unsigned long long)input) - 16; guess = __ez80_clzi48(input); CMP("%012llX", (uint64_t)input, truth, guess); } @@ -262,7 +262,7 @@ static int test_clzi48(void) { } static int test_ctzi48(void) { - unsigned char truth, guess; + int truth, guess; uint48_t input; CMP("%012llX", (uint64_t)0, 48, __ez80_ctzi48((uint48_t)0)); CMP("%012llX", (uint64_t)1, 0, __ez80_ctzi48((uint48_t)1)); @@ -271,7 +271,7 @@ static int test_ctzi48(void) { CMP("%012llX", (uint64_t)UINT48_MAX, 0, __ez80_ctzi48(UINT48_MAX)); for (int i = 0; i < RANDOM_TEST_COUNT; i++) { input = rand48(); - truth = (input == 0) ? 48 : (unsigned char)(__builtin_ctzll((unsigned long long)input)); + truth = (input == 0) ? 48 : __builtin_ctzll((unsigned long long)input); guess = __ez80_ctzi48(input); CMP("%012llX", (uint64_t)input, truth, guess); } @@ -279,7 +279,7 @@ static int test_ctzi48(void) { } static int test_ffsi48(void) { - unsigned char truth, guess; + int truth, guess; uint48_t input; CMP("%012llX", (uint64_t)0, 0, __ez80_ffsi48((uint48_t)0)); CMP("%012llX", (uint64_t)1, 1, __ez80_ffsi48((uint48_t)1)); @@ -288,7 +288,7 @@ static int test_ffsi48(void) { CMP("%012llX", (uint64_t)UINT48_MAX, 1, __ez80_ffsi48(UINT48_MAX)); for (int i = 0; i < RANDOM_TEST_COUNT; i++) { input = rand48(); - truth = (unsigned char)__builtin_ffsll((unsigned long long)input); + truth = __builtin_ffsll((unsigned long long)input); guess = __ez80_ffsi48(input); CMP("%012llX", (uint64_t)input, truth, guess); } @@ -296,7 +296,7 @@ static int test_ffsi48(void) { } static int test_clrsbi48(void) { - unsigned char truth, guess; + int truth, guess; uint48_t input; CMP("%012llX", (uint64_t)0, 47, __ez80_clrsbi48((uint48_t)0)); CMP("%012llX", (uint64_t)1, 46, __ez80_clrsbi48((uint48_t)1)); @@ -305,7 +305,7 @@ static int test_clrsbi48(void) { CMP("%012llX", (uint64_t)UINT48_MAX, 47, __ez80_clrsbi48(UINT48_MAX)); for (int i = 0; i < RANDOM_TEST_COUNT; i++) { input = rand48(); - truth = (unsigned char)__builtin_clrsbll((signed long long)((signed __int48)input)) - 16; + truth = __builtin_clrsbll((signed long long)((signed __int48)input)) - 16; guess = __ez80_clrsbi48(input); CMP("%012llX", (uint64_t)input, truth, guess); } @@ -313,7 +313,7 @@ static int test_clrsbi48(void) { } static int test_popcounti48(void) { - unsigned char truth, guess; + int truth, guess; uint48_t input; CMP("%012llX", (uint64_t)0, 0, __ez80_popcounti48((uint48_t)0)); CMP("%012llX", (uint64_t)1, 1, __ez80_popcounti48((uint48_t)1)); @@ -322,7 +322,7 @@ static int test_popcounti48(void) { CMP("%012llX", (uint64_t)UINT48_MAX, 48, __ez80_popcounti48(UINT48_MAX)); for (int i = 0; i < RANDOM_TEST_COUNT; i++) { input = rand48(); - truth = (unsigned char)__builtin_popcountll((unsigned long long)input); + truth = __builtin_popcountll((unsigned long long)input); guess = __ez80_popcounti48(input); CMP("%012llX", (uint64_t)input, truth, guess); } @@ -330,7 +330,7 @@ static int test_popcounti48(void) { } static int test_parityi48(void) { - bool truth, guess; + int truth, guess; uint48_t input; CMP("%012llX", (uint64_t)0, false, __ez80_parityi48((uint48_t)0)); CMP("%012llX", (uint64_t)1, true, __ez80_parityi48((uint48_t)1)); From 1c404eeb88c6468739bde50845b07e31c09f2983 Mon Sep 17 00:00:00 2001 From: Brendan Fletcher Date: Sat, 19 Apr 2025 19:29:24 -0400 Subject: [PATCH 3/4] what happens when you assume --- src/libc/ilogbf.src | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/libc/ilogbf.src b/src/libc/ilogbf.src index 1ccf51c7d..97d48bb1f 100644 --- a/src/libc/ilogbf.src +++ b/src/libc/ilogbf.src @@ -36,11 +36,11 @@ _ilogbf: sbc hl, de jr z, .ret_zero call __ictlz - ; Carry flag will be set at the end of __ictlz -; scf - sbc hl, hl - neg - add a, 129 + ex de, hl + ; DE was zero, so HL is now zero + dec hl + cpl + add a, 130 ld l, a ret From fd242c44e99cce8c57c7fe99491f341ae0ea315b Mon Sep 17 00:00:00 2001 From: ZERICO2005 <71151164+ZERICO2005@users.noreply.github.com> Date: Sat, 19 Apr 2025 17:54:03 -0600 Subject: [PATCH 4/4] optimized neg to cpl in frexpf and ilogbl --- src/libc/frexpf.src | 4 ++-- src/libc/ilogbl.src | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/libc/frexpf.src b/src/libc/frexpf.src index 76809c5fd..9b5e2682a 100644 --- a/src/libc/frexpf.src +++ b/src/libc/frexpf.src @@ -58,8 +58,8 @@ _frexpf: ld (iy + 3), hl scf sbc hl, hl - neg - add a, 130 ; 127 + 3? idk where this magic number comes from + cpl + add a, 131 ; 127 + 3 + 1? idk where this magic number comes from ld l, a ld a, b ; exponent xor a, $3F diff --git a/src/libc/ilogbl.src b/src/libc/ilogbl.src index 678b81a14..9e824d67a 100644 --- a/src/libc/ilogbl.src +++ b/src/libc/ilogbl.src @@ -41,11 +41,11 @@ _ilogbl: cp a, 64 sbc hl, hl jr z, .ret_zero ; A was 64 - neg + cpl ld l, a - ; -1023 + 11 + 1 + ; -1023 + 11 + 1 + 1 ; Float64_ilogb_subnorm_max + Float64_exponent_bits + Float64_sign_bits - ld de, -1011 + ld de, -1010 add hl, de ret