@@ -355,9 +355,13 @@ __XETLA_API xetla_vector<T, N> xetla_load_global(
355355 __ESIMD_NS::cache_hint_L1<gpu::xetla::detail::get_cache_hint (L1H)>,
356356 __ESIMD_NS::cache_hint_L2<gpu::xetla::detail::get_cache_hint (L2H)>,
357357 __ESIMD_NS::alignment<alignment>};
358- if constexpr (sizeof (T) * N < sizeof (uint32_t )) {
359- xetla_vector<uint32_t , N> offsets (byte_offset, sizeof (T));
360- return __ESIMD_NS::gather<T, N, uint32_t >(ptr, offsets);
358+ if constexpr (sizeof (T) * N < sizeof (uint32_t ) || N == 1 ) {
359+ xetla_vector<T, N> ret;
360+ #pragma unroll
361+ for (uint32_t i = 0 ; i < N; i++) {
362+ ret[i] = ptr[i + byte_offset / sizeof (T)];
363+ }
364+ return ret;
361365 } else {
362366 return __ESIMD_NS::block_load<T, N>(ptr, byte_offset, props);
363367 }
@@ -501,9 +505,11 @@ __XETLA_API void xetla_store_global(
501505 __ESIMD_NS::cache_hint_L2<gpu::xetla::detail::get_cache_hint (L2H)>,
502506 __ESIMD_NS::alignment<alignment>};
503507
504- if constexpr (sizeof (T) * N < sizeof (uint32_t )) {
505- xetla_vector<uint32_t , N> offsets (byte_offset, sizeof (T));
506- return __ESIMD_NS::scatter<T, N, uint32_t >(ptr, offsets, vals);
508+ if constexpr (sizeof (T) * N < sizeof (uint32_t ) || N == 1 ) {
509+ #pragma unroll
510+ for (uint32_t i = 0 ; i < N; i++) {
511+ ptr[i + byte_offset / sizeof (T)] = vals[i];
512+ }
507513 } else {
508514 __ESIMD_NS::block_store<T, N>(ptr, byte_offset, vals, props);
509515 }
0 commit comments