Skip to content

Commit 97b7040

Browse files
committed
ARCH/X86: Use UCS function to count leading zeros
1 parent 4a0328f commit 97b7040

File tree

5 files changed

+77
-51
lines changed

5 files changed

+77
-51
lines changed

contrib/test_jenkins.sh

+3-6
Original file line numberDiff line numberDiff line change
@@ -1128,12 +1128,9 @@ run_release_mode_tests() {
11281128
# Run nt_buffer_transfer tests
11291129
#
11301130
run_nt_buffer_transfer_tests() {
1131-
if lscpu | grep -q 'AuthenticAMD'
1132-
then
1133-
build release --enable-gtest --enable-optimizations
1134-
echo "==== Running nt_buffer_transfer tests ===="
1135-
./test/gtest/gtest --gtest_filter="test_arch.nt_buffer_transfer_*"
1136-
fi
1131+
build release --enable-gtest --enable-optimizations
1132+
echo "==== Running test_arch tests with optimizations ===="
1133+
./test/gtest/gtest --gtest_filter="test_arch.*"
11371134
}
11381135

11391136
set_ucx_common_test_env() {

src/ucs/arch/bitops.h

+3-2
Original file line numberDiff line numberDiff line change
@@ -121,10 +121,11 @@ BEGIN_C_DECLS
121121
((sizeof(_n) <= 4) ? __builtin_ctz((uint32_t)(_n)) : __builtin_ctzl(_n))
122122

123123
/* Returns the number of leading 0-bits in _n.
124-
* If _n is 0, the result is undefined
125124
*/
126125
#define ucs_count_leading_zero_bits(_n) \
127-
((sizeof(_n) <= 4) ? __builtin_clz((uint32_t)(_n)) : __builtin_clzl(_n))
126+
((_n) ? ((sizeof(_n) <= 4) ? __builtin_clz((uint32_t)(_n)) : \
127+
__builtin_clzl(_n)) : \
128+
(int)(sizeof(_n) * 8))
128129

129130
/* Returns the number of bits lower than 'bit_index' that are set in 'mask'
130131
* For example: ucs_bitmap2idx(mask=0xF0, idx=6) returns 2

src/ucs/arch/x86_64/cpu.c

+3-7
Original file line numberDiff line numberDiff line change
@@ -1060,13 +1060,12 @@ size_t ucs_x86_nt_src_buffer_transfer(void *dst, const void *src, size_t len)
10601060
return len;
10611061
}
10621062

1063-
static UCS_F_ALWAYS_INLINE
1064-
void ucs_x86_copy_bytes_le_128(void *dst, const void *src, size_t len)
1063+
static UCS_F_ALWAYS_INLINE void
1064+
ucs_x86_copy_bytes_le_128(void *dst, const void *src, uint32_t len)
10651065
{
1066-
#if defined (__LZCNT__)
10671066
__m256i y0, y1, y2, y3;
10681067
/* Handle lengths that fall usually within eager short range */
1069-
switch (_lzcnt_u32(len)) {
1068+
switch (ucs_count_leading_zero_bits(len)) {
10701069
/* 0 */
10711070
case 32:
10721071
break;
@@ -1121,9 +1120,6 @@ void ucs_x86_copy_bytes_le_128(void *dst, const void *src, size_t len)
11211120
_mm256_storeu_si256(UCS_PTR_BYTE_OFFSET(dst, len - 32), y3);
11221121
break;
11231122
}
1124-
#else
1125-
memcpy(dst, src, len);
1126-
#endif
11271123
}
11281124

11291125
/* This is an adaptation of the memcpy code from https://github.com/amd/aocl-libmem

test/gtest/ucs/arch/test_x86_64.cc

+32-36
Original file line numberDiff line numberDiff line change
@@ -64,63 +64,70 @@ class test_arch : public ucs::test {
6464
return result;
6565
}
6666

67-
void nt_buffer_transfer_test(ucs_arch_memcpy_hint_t hint) {
67+
void nt_buffer_transfer_test(ucs_arch_memcpy_hint_t hint)
68+
{
6869
#ifndef __AVX__
6970
UCS_TEST_SKIP_R("Built without AVX support");
7071
#else
71-
int i, j, result, ret = 0;
72-
char *test_window_src, *test_window_dst, *src, *dst, *dup;
72+
int i, j;
73+
char *src, *dst;
7374
size_t len, total_size, test_window_size, hole_size, align;
7475

7576
align = 64;
7677
test_window_size = 8 * 1024;
7778
hole_size = 2 * align;
7879

80+
auto msg = [&]() {
81+
std::stringstream ss;
82+
ss << "using length=" << len << " src_align=" << i
83+
<< " dst_align=" << j;
84+
return ss.str();
85+
};
86+
7987
/*
8088
* Allocate a hole above and below the test_window_size
8189
* to check for writes beyond the designated area.
8290
*/
8391
total_size = test_window_size + (2 * hole_size);
8492

85-
ret = posix_memalign((void**)&test_window_src, align, total_size);
86-
if (ret) {
87-
goto src_fail;
88-
}
93+
auto alloc_aligned = [&align, &total_size]() {
94+
void *ptr;
95+
return std::unique_ptr<char>(reinterpret_cast<char*>(
96+
!posix_memalign(&ptr, align, total_size) ? ptr : nullptr));
97+
};
8998

90-
ret = posix_memalign((void**)&test_window_dst, align, total_size);
91-
if (ret) {
92-
goto dst_fail;
93-
}
99+
auto test_window_src = alloc_aligned();
100+
auto test_window_dst = alloc_aligned();
101+
auto dup = alloc_aligned();
94102

95-
ret = posix_memalign((void**)&dup, align, total_size);
96-
if (ret) {
97-
goto dup_fail;
98-
}
103+
ASSERT_TRUE(test_window_src);
104+
ASSERT_TRUE(test_window_dst);
105+
ASSERT_TRUE(dup);
99106

100-
src = test_window_src + hole_size;
101-
dst = test_window_dst + hole_size;
107+
src = test_window_src.get() + hole_size;
108+
dst = test_window_dst.get() + hole_size;
102109

103110
/* Initialize the regions with known patterns */
104-
memset(dup, 0x0, total_size);
105-
memset(test_window_src, 0xdeaddead, total_size);
106-
memset(test_window_dst, 0x0, total_size);
111+
memset(dup.get(), 0x0, total_size);
112+
memset(test_window_src.get(), 0xdeaddead, total_size);
113+
memset(test_window_dst.get(), 0x0, total_size);
107114

108115
len = 0;
109116

110117
while (len < test_window_size) {
111118
for (i = 0; i < align; i++) {
112119
for (j = 0; j < align; j++) {
113120
/* Perform the transfer */
114-
ucs_x86_nt_buffer_transfer(dst + i, src + j, len, hint, len);
115-
result = memcmp(src + j, dst + i, len);
116-
EXPECT_EQ(0, result);
121+
ucs_x86_nt_buffer_transfer(dst + i, src + j, len, hint,
122+
len);
123+
ASSERT_EQ(0, memcmp(src + j, dst + i, len)) << msg();
117124

118125
/* reset the copied region back to zero */
119126
memset(dst + i, 0x0, len);
120127

121128
/* check for any modifications in the holes */
122-
result = memcmp(test_window_dst, dup, total_size);
123-
EXPECT_EQ(0, result);
129+
ASSERT_EQ(0, memcmp(test_window_dst.get(), dup.get(),
130+
total_size));
124131
}
125132
}
126133
/* Check for each len for less than 1k sizes
@@ -132,17 +139,6 @@ class test_arch : public ucs::test {
132139
len += 53;
133140
}
134141
}
135-
136-
free(dup);
137-
138-
dup_fail:
139-
free(test_window_dst);
140-
dst_fail:
141-
free(test_window_src);
142-
src_fail:
143-
if (ret) {
144-
UCS_TEST_ABORT("Failed to allocate memory: " << strerror(ret));
145-
}
146142
#endif
147143
}
148144
};

test/gtest/ucs/test_bitops.cc

+36
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,42 @@ UCS_TEST_F(test_bitops, is_equal) {
138138
test_bitops::check_bitwise_equality(buffer1, buffer2, indices, 0);
139139
}
140140

141+
template<typename T> void test_clz()
142+
{
143+
constexpr int bits = sizeof(T) * 8;
144+
T v = 1;
145+
146+
for (int i = bits - 1; v != 0; v <<= 1, --i) {
147+
ASSERT_EQ(i, ucs_count_leading_zero_bits(v));
148+
}
149+
150+
ASSERT_EQ(bits, ucs_count_leading_zero_bits(v));
151+
}
152+
153+
UCS_TEST_F(test_bitops, clz) {
154+
test_clz<uint32_t>();
155+
test_clz<uint64_t>();
156+
test_clz<int32_t>();
157+
test_clz<int64_t>();
158+
test_clz<size_t>();
159+
test_clz<ssize_t>();
160+
}
161+
162+
UCS_TEST_F(test_bitops, clz_type)
163+
{
164+
EXPECT_GT(0, ucs_count_leading_zero_bits(~0LLU) - 1);
165+
166+
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(~0LLU) - 1);
167+
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(0LLU) - 65);
168+
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(~0U) - 1);
169+
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(0U) - 33);
170+
171+
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(~0LL) - 1);
172+
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(0LL) - 65);
173+
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(~0) - 1);
174+
EXPECT_EQ(UINT32_MAX, ucs_count_leading_zero_bits(0) - 33);
175+
}
176+
141177
template<typename Type> void test_mask()
142178
{
143179
Type expected = 0;

0 commit comments

Comments
 (0)