@@ -256,7 +256,7 @@ constexpr __ESIMD_NS::atomic_op get_atomic_op(gpu::xetla::atomic_op ao) {
256256// /
257257template <
258258 typename Ty,
259- uint8_t NElts = 1 ,
259+ int NElts = 1 ,
260260 data_size DS = data_size::default_size,
261261 cache_hint L1H = cache_hint::cached,
262262 cache_hint L2H = cache_hint::cached,
@@ -293,7 +293,7 @@ __XETLA_API void xetla_prefetch_global(
293293// /
294294template <
295295 typename Ty,
296- uint8_t NElts = 1 ,
296+ int NElts = 1 ,
297297 data_size DS = data_size::default_size,
298298 cache_hint L1H = cache_hint::cached,
299299 cache_hint L2H = cache_hint::cached>
@@ -385,7 +385,7 @@ __XETLA_API xetla_vector<T, N> xetla_load_global(
385385// /
386386template <
387387 typename Ty,
388- uint8_t NElts = 1 ,
388+ int NElts = 1 ,
389389 data_size DS = data_size::default_size,
390390 cache_hint L1H = cache_hint::none,
391391 cache_hint L2H = cache_hint::none,
@@ -431,7 +431,7 @@ __XETLA_API xetla_vector<Ty, N * NElts> xetla_load_global(
431431// /
432432template <
433433 typename Ty,
434- uint8_t NElts = 1 ,
434+ int NElts = 1 ,
435435 data_size DS = data_size::default_size,
436436 cache_hint L1H = cache_hint::none,
437437 cache_hint L2H = cache_hint::none,
@@ -653,7 +653,7 @@ __XETLA_API void xetla_local_init() {
653653// /
654654template <
655655 typename Ty,
656- uint8_t NElts = 1 ,
656+ int NElts = 1 ,
657657 data_size DS = data_size::default_size,
658658 int N>
659659__XETLA_API xetla_vector<Ty, N * NElts> xetla_load_local (
@@ -670,35 +670,31 @@ __XETLA_API xetla_vector<Ty, N * NElts> xetla_load_local(
670670 xetla_cvt<uint64_t , uint32_t >(offsets), pred);
671671}
672672
673- // / @brief SLM block load. (transposed gather with 1 channel).
674- // / Collects elements located at slm and returns them as a single \ref
675- // / xetla_vector object.
676- // /
677- // / Supported platforms: DG2, PVC
678- // /
679- // / VISA instruction: lsc_load.slm
680- // /
681- // / @tparam Ty is element type.
682- // / @tparam NElts is the number of elements to load per address (i.e.
683- // / vector_size per SIMD channel).
684- // / @tparam DS is the data size.
685- // / @param offset [in] is the zero-based offset for SLM buffer in bytes.
686- // / @return is a xetla_vector of type T and size NElts.
687- // /
688- template <
689- typename Ty,
690- uint8_t NElts = 1 ,
691- data_size DS = data_size::default_size>
673+ // / Loads a contiguous block of SLM memory referenced by the given byte-offset
674+ // / \p offset, then returns the loaded data as a simd object.
675+ // / The generated code depends on the combination {T, N, Flags}.
676+ // / Providing flags specifying the alignment of 16-bytes or more produces more
677+ // / efficient code. If the alignment is smaller than 16-bytes, then less
678+ // / efficient gather is generated. If the loaded vector is too long
679+ // / for 1 flat-load GPU instruction, then a series of flat-loads and/or gathers
680+ // / may be generated.
681+ // / @tparam T Element type.
682+ // / @tparam N Number of elements to load.
683+ // / @tparam Flags The alignment specifier type tag.
684+ // / @param byte_offset The byte-offset to load from.
685+ // / @param Flags Specifies the alignment.
686+ // / @return A vector of loaded elements.
687+ // /
688+ template <typename Ty, int NElts = 1 , data_size DS = data_size::default_size>
692689__XETLA_API xetla_vector<Ty, NElts> xetla_load_local (uint32_t offset) {
693690 using T = native_type_t <Ty>;
694- DEBUG_INVOKE (
695- dbg_level::core,
696- core::general_1d<gpu_arch::XeHpc, Ty>::template check_restriction<NElts>(
697- (uint64_t )offset));
691+ // DEBUG_INVOKE(
692+ // dbg_level::core,
693+ // core::general_1d<gpu_arch::XeHpc, Ty>::template
694+ // check_restriction<NElts>(
695+ // (uint64_t)offset));
698696
699- return __ESIMD_ENS::
700- lsc_slm_block_load<T, NElts, gpu::xetla::detail::get_data_size (DS)>(
701- offset);
697+ return __ESIMD_NS::slm_block_load<T, NElts>(offset);
702698}
703699
704700// / @brief SLM scattered store.
@@ -719,7 +715,7 @@ __XETLA_API xetla_vector<Ty, NElts> xetla_load_local(uint32_t offset) {
719715// /
720716template <
721717 typename Ty,
722- uint8_t NElts = 1 ,
718+ int NElts = 1 ,
723719 data_size DS = data_size::default_size,
724720 int N>
725721__XETLA_API void xetla_store_local (
@@ -737,36 +733,38 @@ __XETLA_API void xetla_store_local(
737733 offsets, vals, pred);
738734}
739735
740- // / @brief SLM block store (transposed SLM scatter with 1 channel).
741- // / Scatters elements located to slm.
742- // /
743- // / Supported platforms: DG2, PVC
744- // /
745- // / VISA instruction: lsc_store.slm
746- // /
747- // / @tparam Ty is element type.
748- // / @tparam NElts is the number of elements to store per address (i.e.
749- // / vector_size per SIMD channel).
750- // / @tparam DS is the data size.
751- // / @param offset [in] is the zero-based offset for SLM buffer in bytes.
752- // / @param vals [in] is values to store.
753- // /
754- template <
755- typename Ty,
756- uint8_t NElts = 1 ,
757- data_size DS = data_size::default_size>
736+ // / Stores elements of the vector \p vals to a contiguous block of SLM memory
737+ // / at the given byte-offset \p offset.
738+ // / The generated code depends on the combination {T, N, Flags}.
739+ // / Providing flags specifying the alignment of 16-bytes or more produces more
740+ // / efficient code. If the alignment is smaller than 16-bytes, then less
741+ // / efficient scatter is generated. If the stored vector is too long
742+ // / for 1 flat-store GPU instruction, then a series of flat-store and/or
743+ // / scatters may be generated.
744+ // / @tparam T Element type.
745+ // / @tparam N Number of elements to store.
746+ // / @tparam Flags The alignment specifier type tag.
747+ // / @param offset The byte-offset to store at.
748+ // / @param vals The vector to store.
749+ // / @param Flags Specifies the alignment.
750+ // /
751+ template <typename Ty, int NElts = 1 , data_size DS = data_size::default_size>
758752__XETLA_API void xetla_store_local (
759753 uint32_t offset,
760754 xetla_vector<Ty, NElts> vals) {
761- using T = native_type_t <Ty>;
762- DEBUG_INVOKE (
763- dbg_level::core,
764- core::general_1d<gpu_arch::XeHpc, Ty>::template check_restriction<NElts>(
765- offset));
766-
767- __ESIMD_ENS::
768- lsc_slm_block_store<T, NElts, gpu::xetla::detail::get_data_size (DS)>(
769- offset, vals);
755+ // using T = native_type_t<Ty>;
756+ // DEBUG_INVOKE(
757+ // dbg_level::core,
758+ // core::general_1d<gpu_arch::XeHpc, Ty>::template
759+ // check_restriction<NElts>(
760+ // offset));
761+
762+ // __ESIMD_ENS::
763+ // lsc_slm_block_store<T, NElts, gpu::xetla::detail::get_data_size(DS)>(
764+ // offset, vals);
765+ // __ESIMD_NS::properties props{};
766+
767+ __ESIMD_NS::slm_block_store<Ty, NElts>(offset, vals);
770768}
771769
772770// / @brief SLM scattered atomic (0 src).
0 commit comments