8#if CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
28 return __builtin_amdgcn_readfirstlane(
static_cast<uint32_t>(v));
33 return __builtin_amdgcn_readfirstlane(
static_cast<uint32_t>(v));
38 return __builtin_amdgcn_readfirstlane(
value);
43 return __builtin_amdgcn_readfirstlane(
value);
46template <
typename Object, std::enable_if_t<std::is_trivially_copyable_v<Object>,
int> = 0>
49 constexpr size_t ObjectSize =
sizeof(Object);
50 constexpr size_t SGPR_size = 4;
51 constexpr size_t NumFull = ObjectSize / SGPR_size;
52 constexpr size_t Tail = ObjectSize % SGPR_size;
54 const unsigned char* src =
reinterpret_cast<const unsigned char*
>(&obj);
55 alignas(Object)
unsigned char dst[ObjectSize];
58 constexpr size_t offset = Ic * SGPR_size;
60 __builtin_memcpy(&read_src, src +
offset, SGPR_size);
61 read_src = __builtin_amdgcn_readfirstlane(read_src);
62 __builtin_memcpy(dst +
offset, &read_src, SGPR_size);
65 if constexpr(Tail != 0)
67 constexpr size_t offset = NumFull * SGPR_size;
69 __builtin_memcpy(&tail_loc, src +
offset, Tail);
70 tail_loc = __builtin_amdgcn_readfirstlane(tail_loc);
71 __builtin_memcpy(dst +
offset, &tail_loc, Tail);
74 __builtin_memcpy(&out, dst, ObjectSize);
87template <
typename ForceSGPR = std::false_type>
94 if constexpr(std::is_same_v<ForceSGPR, std::true_type>)
108template<
typename T>
struct buffer_load_trait<4 , T> {
using payload_t = float; };
109template<
typename T>
struct buffer_load_trait<2 , T> {
using payload_t = float; };
110template<
typename T>
struct buffer_load_trait<1 , T> {
using payload_t = float; };
112#if CK_TILE_BUFFER_LOAD_RAW_BF16_WA
121template <index_t
bytes,
bool pre_nop = false>
123#pragma clang diagnostic push
124#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
127template <
bool pre_nop>
130 template <
typename T>
139 static_assert(
sizeof(T) == 16);
141 if constexpr(pre_nop)
142 asm volatile(
"s_nop 4\n"
143 "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
144 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
145 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
148 asm volatile(
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
149 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
150 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
155template <
bool pre_nop>
158 template <
typename T>
167 static_assert(
sizeof(T) == 8);
169 if constexpr(pre_nop)
170 asm volatile(
"s_nop 4\n"
171 "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
172 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
173 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
176 asm volatile(
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
177 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
178 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
183template <
bool pre_nop>
186 template <
typename T>
195 static_assert(
sizeof(T) == 4);
197 if constexpr(pre_nop)
198 asm volatile(
"s_nop 4\n"
199 "buffer_load_dword %0, %1, %2, 0 offen offset:%3"
200 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
201 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
204 asm volatile(
"buffer_load_dword %0, %1, %2, 0 offen offset:%3"
205 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
206 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
211template <
bool pre_nop>
214 template <
typename T>
223 static_assert(
sizeof(T) == 4);
225 if constexpr(pre_nop)
226 asm volatile(
"s_nop 4\n"
227 "buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
228 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
229 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
232 asm volatile(
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
233 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
234 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
239template <
bool pre_nop>
242 template <
typename T>
251 static_assert(
sizeof(T) == 4);
253 if constexpr(pre_nop)
254 asm volatile(
"s_nop 4\n"
255 "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
256 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
257 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
260 asm volatile(
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
261 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
262 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
267template <index_t
bytes,
bool pre_nop = false>
270template <
bool pre_nop>
273 template <
typename T>
282 static_assert(
sizeof(T) == 16);
283 auto saved_exec = __builtin_amdgcn_read_exec();
285 static_assert(
sizeof(mbuf_t) ==
sizeof(T));
286 if constexpr(pre_nop)
287 asm volatile(
"s_nop 4\n"
288 "v_cmpx_le_u32 exec, 1, %4\n"
289 "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
291 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
292 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
295 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
296 "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
298 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
299 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
304template <
bool pre_nop>
307 template <
typename T>
316 static_assert(
sizeof(T) == 8);
317 auto saved_exec = __builtin_amdgcn_read_exec();
319 if constexpr(pre_nop)
320 asm volatile(
"s_nop 4\n"
321 "v_cmpx_le_u32 exec, 1, %4\n"
322 "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
324 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
325 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
328 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
329 "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
331 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
332 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
337template <
bool pre_nop>
340 template <
typename T>
349 static_assert(
sizeof(T) == 4);
350 auto saved_exec = __builtin_amdgcn_read_exec();
352 if constexpr(pre_nop)
353 asm volatile(
"s_nop 4\n"
354 "v_cmpx_le_u32 exec, 1, %4\n"
355 "buffer_load_dword %0, %1, %2, 0 offen offset:%3\n"
357 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
358 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
361 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
362 "buffer_load_dword %0, %1, %2, 0 offen offset:%3\n"
364 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
365 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
370template <
bool pre_nop>
373 template <
typename T>
382 static_assert(
sizeof(T) == 4);
383 auto saved_exec = __builtin_amdgcn_read_exec();
385 if constexpr(pre_nop)
386 asm volatile(
"s_nop 4\n"
387 "v_cmpx_le_u32 exec, 1, %4\n"
388 "buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n"
390 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
391 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
394 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
395 "buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n"
397 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
398 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
403template <
bool pre_nop>
406 template <
typename T>
415 static_assert(
sizeof(T) == 4);
416 auto saved_exec = __builtin_amdgcn_read_exec();
418 if constexpr(pre_nop)
419 asm volatile(
"s_nop 4\n"
420 "v_cmpx_le_u32 exec, 1, %4\n"
421 "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n"
423 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
424 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
427 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
428 "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n"
430 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
431 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
435#pragma clang diagnostic pop
436template <index_t
bytes>
442 template <
typename T>
450 static_assert(
sizeof(T) == 16);
452 asm volatile(
"buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3"
462 template <
typename T>
470 static_assert(
sizeof(T) == 8);
472 asm volatile(
"buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3"
482 template <
typename T>
490 static_assert(
sizeof(T) == 4);
491 using mbuf_t = float;
492 asm volatile(
"buffer_store_dword %0, %1, %2, 0 offen offset:%3"
502 template <
typename T>
510 static_assert(
sizeof(T) == 2);
511 using mbuf_t = short;
512 asm volatile(
"buffer_store_short %0, %1, %2, 0 offen offset:%3"
522 template <
typename T>
530 static_assert(
sizeof(T) == 4);
531 using mbuf_t = float;
532 asm volatile(
"buffer_store_byte %0, %1, %2, 0 offen offset:%3"
539template <index_t
bytes>
545 template <
typename T>
553 static_assert(
sizeof(T) == 16);
554 auto save_exec = __builtin_amdgcn_read_exec();
556 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
557 "buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
573 template <
typename T>
581 static_assert(
sizeof(T) == 8);
582 auto save_exec = __builtin_amdgcn_read_exec();
584 using mbuf_t =
ext_vector_t<
typename T::value_type, T::size()>;
585 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
586 "buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
602 template <
typename T>
610 static_assert(
sizeof(T) == 4);
611 auto save_exec = __builtin_amdgcn_read_exec();
612 using mbuf_t = float;
613 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
614 "buffer_store_dword %0, %1, %2, 0 offen offset:%3\n"
630 template <
typename T>
638 static_assert(
sizeof(T) == 2);
639 auto save_exec = __builtin_amdgcn_read_exec();
640 using mbuf_t = short;
641 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
642 "buffer_store_short %0, %1, %2, 0 offen offset:%3\n"
658 template <
typename T>
666 static_assert(
sizeof(T) == 4);
667 auto save_exec = __builtin_amdgcn_read_exec();
668 using mbuf_t = float;
669 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
670 "buffer_store_byte %0, %1, %2, 0 offen offset:%3\n"
685 asm volatile(
"s_waitcnt vmcnt(%0)" : :
"n"(cnt) :
"memory");
690 asm volatile(
"s_waitcnt lgkmcnt(%0)" : :
"n"(cnt) :
"memory");
693template <
typename scalar_type, index_t N,
bool pre_nop = false>
696template <
bool pre_nop>
699 template <
typename T>
707 static_assert(
sizeof(T) == 4);
708 auto save_exec = __builtin_amdgcn_read_exec();
709 using mbuf_t = float;
710 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
711 "global_atomic_pk_add_bf16 %0, %1, %2 offset:%3\n"
724template <
typename scalar_type, index_t N,
bool pre_nop = false>
727template <
bool pre_nop>
730 template <
typename T>
738 static_assert(
sizeof(T) == 4);
739 using mbuf_t = float;
740 asm volatile(
"global_atomic_pk_add_bf16 %0, %1, %2 offset:%3"
754template<
typename T>
struct smem_load_trait<4 , T> {
using payload_t = float; };
755template<
typename T>
struct smem_load_trait<2 , T> {
using payload_t = float; };
756template<
typename T>
struct smem_load_trait<1 , T> {
using payload_t = float; };
768 template <
typename T>
771 static_assert(
sizeof(T) == 16);
773 asm volatile(
"ds_read_b128 %0, %1 offset:%2"
774 :
"=v"(
reinterpret_cast<mbuf_t&
>(
value))
775 :
"v"(v_offset),
"n"(i_offset)
783 template <
typename T>
786 static_assert(
sizeof(T) == 8);
788 asm volatile(
"ds_read_b64 %0, %1 offset:%2"
789 :
"=v"(
reinterpret_cast<mbuf_t&
>(
value))
790 :
"v"(v_offset),
"n"(i_offset)
798 template <
typename T>
801 static_assert(
sizeof(T) == 4);
803 asm volatile(
"ds_read_b32 %0, %1 offset:%2"
804 :
"=v"(
reinterpret_cast<mbuf_t&
>(
value))
805 :
"v"(v_offset),
"n"(i_offset)
813 template <
typename T>
816 static_assert(
sizeof(T) == 4);
818 asm volatile(
"ds_read_u16 %0, %1 offset:%2"
819 :
"=v"(
reinterpret_cast<mbuf_t&
>(
value))
820 :
"v"(v_offset),
"n"(i_offset)
828 template <
typename T>
831 static_assert(
sizeof(T) == 4);
833 asm volatile(
"ds_read_u8 %0, %1 offset:%2"
834 :
"=v"(
reinterpret_cast<mbuf_t&
>(
value))
835 :
"v"(v_offset),
"n"(i_offset)
852 static_for<0, kSize, 1>{}([&](
auto i){
853 asm volatile(
" " : :
"v"(b.get(
number<i>{})) :
"memory");
911 using da_type = array<float, (
sizeof(T) + 3) / 4>;
912 auto & dummy =
reinterpret_cast<da_type&
>(buffer);
916template<
typename Tx,
typename... Ty>
924template <
typename... T>
927 asm volatile(
"s_waitcnt vmcnt(%0)" : :
"n"(cnt) :
"memory");
933 asm volatile(
"s_waitcnt vmcnt(%0)" : :
"n"(cnt) :
"memory");
938 asm volatile(
"s_waitcnt vmcnt(%0)" : :
"n"(cnt) :
"memory");
1174 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");
1183 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16");
1199 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.f32");
1207 int glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fmax.f64");
1217 index_t aux) __asm(
"llvm.amdgcn.raw.buffer.load.lds");
1219template <
unsigned num_dwords,
bool pre_nop = false>
1228#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr) \
1229 if constexpr(pre_nop) \
1230 asm volatile("s_nop 4\n" instr " %1, %2, 0 offen offset:%3 lds" \
1232 : "v"(voffset), "s"(rsrc), "n"(ioffset) \
1235 asm volatile(instr " %1, %2, 0 offen offset:%3 lds" \
1237 : "v"(voffset), "s"(rsrc), "n"(ioffset) \
1240 if constexpr(num_dwords == 1)
1244#if defined(__gfx950__)
1245 else if constexpr(num_dwords == 3)
1249 else if constexpr(num_dwords == 4)
1256 static_assert(
false,
"wrong! not implemented data width");
1258#undef CK_TILE_ASYNC_LOAD_WITH_INSTR
1263 asm volatile(
"s_waitcnt vmcnt(%0)" : :
"n"(cnt) :
"memory");
1294 index_t src_thread_addr_offset,
1297 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
1298 "wrong! not implemented");
1302 if constexpr(N == 1)
1305 src_thread_addr_offset,
1306 src_wave_addr_offset,
1307 static_cast<index_t>(coherence)));
1309 else if constexpr(N == 2)
1313 src_thread_addr_offset,
1314 src_wave_addr_offset,
1315 static_cast<index_t>(coherence));
1319 else if constexpr(N == 4)
1322 src_thread_addr_offset,
1323 src_wave_addr_offset,
1324 static_cast<index_t>(coherence));
1328 else if constexpr(N == 8)
1331 src_thread_addr_offset,
1332 src_wave_addr_offset,
1333 static_cast<index_t>(coherence));
1337 else if constexpr(N == 16)
1340 src_thread_addr_offset,
1341 src_wave_addr_offset,
1342 static_cast<index_t>(coherence));
1345 else if constexpr(N == 32)
1348 src_thread_addr_offset,
1349 src_wave_addr_offset,
1350 static_cast<index_t>(coherence));
1353 src_thread_addr_offset,
1354 src_wave_addr_offset + 4 *
sizeof(
int32_t),
1355 static_cast<index_t>(coherence));
1358 tmp.template get_as<int32x4_t>()(
number<0>{}) = tmp0;
1359 tmp.template get_as<int32x4_t>()(
number<1>{}) = tmp1;
1363 else if constexpr(N == 64)
1366 src_thread_addr_offset,
1367 src_wave_addr_offset,
1368 static_cast<index_t>(coherence));
1371 src_thread_addr_offset,
1372 src_wave_addr_offset + 4 *
sizeof(
int32_t),
1373 static_cast<index_t>(coherence));
1376 src_thread_addr_offset,
1377 src_wave_addr_offset + 8 *
sizeof(
int32_t),
1378 static_cast<index_t>(coherence));
1381 src_thread_addr_offset,
1382 src_wave_addr_offset + 12 *
sizeof(
int32_t),
1383 static_cast<index_t>(coherence));
1387 tmp.template get_as<int32x4_t>()(
number<0>{}) = tmp0;
1388 tmp.template get_as<int32x4_t>()(
number<1>{}) = tmp1;
1389 tmp.template get_as<int32x4_t>()(
number<2>{}) = tmp2;
1390 tmp.template get_as<int32x4_t>()(
number<3>{}) = tmp3;
1396#ifndef BUFFER_LOAD_USE_INLINEASM
1397#define BUFFER_LOAD_USE_INLINEASM 0
1400template <
typename T,
1404 index_t src_thread_addr_offset,
1408 (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
1409 (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1410 (std::is_same<T, fp16_t>::value &&
1411 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
1412 (std::is_same<T, bf16_t>::value &&
1413 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
1414 (std::is_same<T, int32_t>::value &&
1415 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1416 (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1417 (std::is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1418 (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1419 (std::is_same<T, e8m0_bexp_t>::value &&
1420 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1421 (std::is_same<T, pk_fp4_raw_t>::value &&
1422 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1423 (std::is_same<T, pk_int4_t>::value &&
1424 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32) ||
1425 (std::is_same<T, pk_fp4_t>::value &&
1426 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16))),
1427 "wrong! not implemented");
1431 if constexpr(std::is_same<T, float>::value)
1433 if constexpr(N == 1)
1437 src_thread_addr_offset,
1438 src_wave_addr_offset,
1439 static_cast<index_t>(coherence)));
1441 else if constexpr(N == 2)
1445 src_thread_addr_offset,
1446 src_wave_addr_offset,
1447 static_cast<index_t>(coherence)));
1449 else if constexpr(N == 4)
1453 src_thread_addr_offset,
1454 src_wave_addr_offset,
1455 static_cast<index_t>(coherence)));
1457 else if constexpr(N == 8)
1461 tmp.template get_as<fp32x4_t>()(
number<0>{}) =
1463 src_thread_addr_offset,
1464 src_wave_addr_offset,
1465 static_cast<index_t>(coherence));
1467 tmp.template get_as<fp32x4_t>()(
number<1>{}) =
1469 src_thread_addr_offset,
1470 src_wave_addr_offset + 4 *
sizeof(
float),
1471 static_cast<index_t>(coherence));
1475 else if constexpr(N == 16)
1479 tmp.template get_as<fp32x4_t>()(
number<0>{}) =
1481 src_thread_addr_offset,
1482 src_wave_addr_offset,
1483 static_cast<index_t>(coherence));
1485 tmp.template get_as<fp32x4_t>()(
number<1>{}) =
1487 src_thread_addr_offset,
1488 src_wave_addr_offset + 4 *
sizeof(
float),
1489 static_cast<index_t>(coherence));
1491 tmp.template get_as<fp32x4_t>()(
number<2>{}) =
1493 src_thread_addr_offset,
1494 src_wave_addr_offset + 8 *
sizeof(
float),
1495 static_cast<index_t>(coherence));
1497 tmp.template get_as<fp32x4_t>()(
number<3>{}) =
1499 src_thread_addr_offset,
1500 src_wave_addr_offset + 12 *
sizeof(
float),
1501 static_cast<index_t>(coherence));
1506 else if constexpr(std::is_same<T, fp16_t>::value)
1508 if constexpr(N == 1)
1512 src_thread_addr_offset,
1513 src_wave_addr_offset,
1514 static_cast<index_t>(coherence)));
1516 else if constexpr(N == 2)
1520 src_thread_addr_offset,
1521 src_wave_addr_offset,
1522 static_cast<index_t>(coherence)));
1524 else if constexpr(N == 4)
1528 src_thread_addr_offset,
1529 src_wave_addr_offset,
1530 static_cast<index_t>(coherence)));
1540 src_wave_buffer_resource,
1541 src_thread_addr_offset,
1542 src_wave_addr_offset + (chunk * 4) *
sizeof(
float),
1543 static_cast<index_t>(coherence));
1548 else if constexpr(std::is_same<T, bf16_t>::value)
1550 if constexpr(N == 1)
1554 src_thread_addr_offset,
1555 src_wave_addr_offset,
1556 static_cast<index_t>(coherence)));
1558 else if constexpr(N == 2)
1562 src_thread_addr_offset,
1563 src_wave_addr_offset,
1564 static_cast<index_t>(coherence)));
1566 else if constexpr(N == 4)
1570 src_thread_addr_offset,
1571 src_wave_addr_offset,
1572 static_cast<index_t>(coherence)));
1582 src_wave_buffer_resource,
1583 src_thread_addr_offset,
1584 src_wave_addr_offset + (chunk * 4) *
sizeof(
float),
1585 static_cast<index_t>(coherence));
1593 src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
1599template <
typename T,
1602 bool oob_conditional_check =
true,
1603 bool pre_nop =
false>
1606 index_t src_thread_addr_offset,
1608 index_t src_linear_addr_offset,
1612 constexpr index_t bytes =
sizeof(T) * N;
1613 static_assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8 || bytes == 16,
1614 "wrong! not supported by buffer_load instruction");
1617 if constexpr(oob_conditional_check)
1620 src_wave_buffer_resource,
1621 src_thread_addr_offset,
1622 src_wave_addr_offset,
1623 src_linear_addr_offset,
1630 src_wave_buffer_resource,
1631 src_thread_addr_offset,
1632 src_wave_addr_offset,
1633 src_linear_addr_offset,
1639template <
typename T,
1642 bool pre_nop =
false>
1645 index_t src_thread_addr_offset,
1647 index_t src_immediate_addr_offset = 0,
1650 constexpr index_t num_bytes =
sizeof(T) * N;
1651 constexpr index_t num_words = num_bytes / 4;
1652 static_assert(num_bytes % 4 == 0 && (num_words == 1 || num_words == 3 || num_words == 4),
1653 "wrong! only support in dword, dwordx3, dwordx4");
1656 src_wave_buffer_resource,
1657 src_thread_addr_offset,
1658 src_wave_addr_offset,
1659 src_immediate_addr_offset,
1664template <
typename T,
1667 bool oob_conditional_check =
true>
1670 index_t src_thread_addr_offset,
1672 index_t src_immediate_addr_offset = 0,
1676 constexpr index_t bytes =
sizeof(T) * N;
1680 assert(src_immediate_addr_offset == 0 &&
1681 "wrong! not implemented src_immediate_addr_offset size, only 0 supported");
1682 ignore = src_immediate_addr_offset;
1684#if defined(__gfx950__)
1685 static_assert(bytes == 4 || bytes == 12 || bytes == 16,
1686 "wrong! only support in dword, dwordx3, dwordx4");
1687 src_wave_addr_offset = 0;
1689 static_assert(bytes == 4,
"wrong! not implemented vector size");
1693 index_t v_offset = src_thread_addr_offset;
1694 if constexpr(oob_conditional_check)
1695 v_offset = flag ? v_offset : src_wave_buffer_resource[2];
1697#pragma clang diagnostic push
1698#pragma clang diagnostic ignored "-Wold-style-cast"
1704 src_wave_addr_offset,
1706 static_cast<index_t>(coherence));
1707#pragma clang diagnostic pop
1714 index_t dst_thread_addr_offset,
1717 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
1718 "wrong! not implemented");
1720 if constexpr(N == 1)
1723 dst_wave_buffer_resource,
1724 dst_thread_addr_offset,
1725 dst_wave_addr_offset,
1726 static_cast<index_t>(coherence));
1728 else if constexpr(N == 2)
1732 dst_wave_buffer_resource,
1733 dst_thread_addr_offset,
1734 dst_wave_addr_offset,
1735 static_cast<index_t>(coherence));
1737 else if constexpr(N == 4)
1740 dst_wave_buffer_resource,
1741 dst_thread_addr_offset,
1742 dst_wave_addr_offset,
1743 static_cast<index_t>(coherence));
1745 else if constexpr(N == 8)
1748 dst_wave_buffer_resource,
1749 dst_thread_addr_offset,
1750 dst_wave_addr_offset,
1751 static_cast<index_t>(coherence));
1753 else if constexpr(N == 16)
1756 dst_wave_buffer_resource,
1757 dst_thread_addr_offset,
1758 dst_wave_addr_offset,
1759 static_cast<index_t>(coherence));
1761 else if constexpr(N == 32)
1764 src_thread_data.template get_as<int32x4_t>()[
number<0>{}],
1765 dst_wave_buffer_resource,
1766 dst_thread_addr_offset,
1767 dst_wave_addr_offset,
1768 static_cast<index_t>(coherence));
1771 src_thread_data.template get_as<int32x4_t>()[
number<1>{}],
1772 dst_wave_buffer_resource,
1773 dst_thread_addr_offset,
1774 dst_wave_addr_offset +
sizeof(
int32_t) * 4,
1775 static_cast<index_t>(coherence));
1777 else if constexpr(N == 64)
1780 src_thread_data.template get_as<int32x4_t>()[
number<0>{}],
1781 dst_wave_buffer_resource,
1782 dst_thread_addr_offset,
1783 dst_wave_addr_offset,
1784 static_cast<index_t>(coherence));
1787 src_thread_data.template get_as<int32x4_t>()[
number<1>{}],
1788 dst_wave_buffer_resource,
1789 dst_thread_addr_offset,
1790 dst_wave_addr_offset +
sizeof(
int32_t) * 4,
1791 static_cast<index_t>(coherence));
1794 src_thread_data.template get_as<int32x4_t>()[
number<2>{}],
1795 dst_wave_buffer_resource,
1796 dst_thread_addr_offset,
1797 dst_wave_addr_offset +
sizeof(
int32_t) * 8,
1798 static_cast<index_t>(coherence));
1801 src_thread_data.template get_as<int32x4_t>()[
number<3>{}],
1802 dst_wave_buffer_resource,
1803 dst_thread_addr_offset,
1804 dst_wave_addr_offset +
sizeof(
int32_t) * 12,
1805 static_cast<index_t>(coherence));
1809template <
typename T,
1814 index_t dst_thread_addr_offset,
1818 (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
1819 (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1820 (std::is_same<T, fp16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1821 (std::is_same<T, bf16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1822 (std::is_same<T, int32_t>::value &&
1823 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1824 (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1825 (std::is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1826 (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1827 (std::is_same<T, uint16_t>::value &&
1828 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1829 (std::is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
1830 "wrong! not implemented");
1832 if constexpr(std::is_same<T, float>::value)
1834 if constexpr(N == 1)
1837 dst_wave_buffer_resource,
1838 dst_thread_addr_offset,
1839 dst_wave_addr_offset,
1840 static_cast<index_t>(coherence));
1842 else if constexpr(N == 2)
1845 dst_wave_buffer_resource,
1846 dst_thread_addr_offset,
1847 dst_wave_addr_offset,
1848 static_cast<index_t>(coherence));
1850 else if constexpr(N == 4)
1853 dst_wave_buffer_resource,
1854 dst_thread_addr_offset,
1855 dst_wave_addr_offset,
1856 static_cast<index_t>(coherence));
1858 else if constexpr(N == 8)
1861 src_thread_data.template get_as<fp32x4_t>()[
number<0>{}],
1862 dst_wave_buffer_resource,
1863 dst_thread_addr_offset,
1864 dst_wave_addr_offset,
1865 static_cast<index_t>(coherence));
1867 src_thread_data.template get_as<fp32x4_t>()[
number<1>{}],
1868 dst_wave_buffer_resource,
1869 dst_thread_addr_offset,
1870 dst_wave_addr_offset + 4 *
sizeof(
float),
1871 static_cast<index_t>(coherence));
1874 else if constexpr(std::is_same<T, fp16_t>::value)
1876 if constexpr(N == 1)
1879 dst_wave_buffer_resource,
1880 dst_thread_addr_offset,
1881 dst_wave_addr_offset,
1882 static_cast<index_t>(coherence));
1884 else if constexpr(N == 2)
1887 dst_wave_buffer_resource,
1888 dst_thread_addr_offset,
1889 dst_wave_addr_offset,
1890 static_cast<index_t>(coherence));
1892 else if constexpr(N == 4)
1895 dst_wave_buffer_resource,
1896 dst_thread_addr_offset,
1897 dst_wave_addr_offset,
1898 static_cast<index_t>(coherence));
1900 else if constexpr(N == 8)
1906 dst_wave_buffer_resource,
1907 dst_thread_addr_offset,
1908 dst_wave_addr_offset,
1909 static_cast<index_t>(coherence));
1912 dst_wave_buffer_resource,
1913 dst_thread_addr_offset,
1914 dst_wave_addr_offset + 4 *
sizeof(
fp16_t),
1915 static_cast<index_t>(coherence));
1918 dst_wave_buffer_resource,
1919 dst_thread_addr_offset,
1920 dst_wave_addr_offset,
1921 static_cast<index_t>(coherence));
1925 else if constexpr(std::is_same<T, bf16_t>::value)
1927 if constexpr(N == 1)
1930 dst_wave_buffer_resource,
1931 dst_thread_addr_offset,
1932 dst_wave_addr_offset,
1933 static_cast<index_t>(coherence));
1935 else if constexpr(N == 2)
1938 dst_wave_buffer_resource,
1939 dst_thread_addr_offset,
1940 dst_wave_addr_offset,
1941 static_cast<index_t>(coherence));
1943 else if constexpr(N == 4)
1946 dst_wave_buffer_resource,
1947 dst_thread_addr_offset,
1948 dst_wave_addr_offset,
1949 static_cast<index_t>(coherence));
1951 else if constexpr(N == 8)
1954 src_thread_data.template get_as<int16x4_t>()[
number<0>{}],
1955 dst_wave_buffer_resource,
1956 dst_thread_addr_offset,
1957 dst_wave_addr_offset,
1958 static_cast<index_t>(coherence));
1961 src_thread_data.template get_as<int16x4_t>()[
number<1>{}],
1962 dst_wave_buffer_resource,
1963 dst_thread_addr_offset,
1964 dst_wave_addr_offset + 4 *
sizeof(
bf16_t),
1965 static_cast<index_t>(coherence));
1968 else if constexpr(std::is_same<T, uint16_t>::value)
1970 if constexpr(N == 1)
1973 dst_wave_buffer_resource,
1974 dst_thread_addr_offset,
1975 dst_wave_addr_offset,
1976 static_cast<index_t>(coherence));
1978 else if constexpr(N == 2)
1981 dst_wave_buffer_resource,
1982 dst_thread_addr_offset,
1983 dst_wave_addr_offset,
1984 static_cast<index_t>(coherence));
1986 else if constexpr(N == 4)
1989 dst_wave_buffer_resource,
1990 dst_thread_addr_offset,
1991 dst_wave_addr_offset,
1992 static_cast<index_t>(coherence));
1994 else if constexpr(N == 8)
1997 src_thread_data.template get_as<uint16x4_t>()[
number<0>{}],
1998 dst_wave_buffer_resource,
1999 dst_thread_addr_offset,
2000 dst_wave_addr_offset,
2001 static_cast<index_t>(coherence));
2004 src_thread_data.template get_as<uint16x4_t>()[
number<1>{}],
2005 dst_wave_buffer_resource,
2006 dst_thread_addr_offset,
2007 dst_wave_addr_offset + 4 *
sizeof(
uint16_t),
2008 static_cast<index_t>(coherence));
2016 dst_wave_buffer_resource,
2017 dst_thread_addr_offset,
2018 dst_wave_addr_offset);
2022template <
typename T,
2025 bool oob_conditional_check =
true>
2028 index_t dst_thread_addr_offset,
2030 index_t dst_linear_addr_offset,
2033 constexpr index_t bytes =
sizeof(T) * N;
2034 static_assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8 || bytes == 16,
2035 "wrong! not supported by buffer_store instruction");
2038 if constexpr(oob_conditional_check)
2041 dst_wave_buffer_resource,
2042 dst_thread_addr_offset,
2043 dst_wave_addr_offset,
2044 dst_linear_addr_offset,
2050 dst_wave_buffer_resource,
2051 dst_thread_addr_offset,
2052 dst_wave_addr_offset,
2053 dst_linear_addr_offset);
2057template <
typename T, index_t N>
2060 index_t dst_thread_addr_offset,
2063 static_assert((std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
2064 (std::is_same<T, fp16_t>::value && (N == 2 || N == 4 || N == 8)) ||
2065 (std::is_same<T, bf16_t>::value && (N == 2 || N == 4 || N == 8)) ||
2066 (std::is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
2067 "wrong! not implemented");
2069 if constexpr(std::is_same<T, float>::value)
2071 if constexpr(N == 1)
2074 dst_wave_buffer_resource,
2075 dst_thread_addr_offset,
2076 dst_wave_addr_offset,
2079 else if constexpr(N == 2)
2082 src_thread_data.template get_as<float>()[
number<0>{}],
2083 dst_wave_buffer_resource,
2084 dst_thread_addr_offset,
2085 dst_wave_addr_offset,
2089 src_thread_data.template get_as<float>()[
number<1>{}],
2090 dst_wave_buffer_resource,
2091 dst_thread_addr_offset,
2092 dst_wave_addr_offset +
sizeof(
float),
2095 else if constexpr(N == 4)
2098 src_thread_data.template get_as<float>()[
number<0>{}],
2099 dst_wave_buffer_resource,
2100 dst_thread_addr_offset,
2101 dst_wave_addr_offset,
2105 src_thread_data.template get_as<float>()[
number<1>{}],
2106 dst_wave_buffer_resource,
2107 dst_thread_addr_offset,
2108 dst_wave_addr_offset +
sizeof(
float),
2112 src_thread_data.template get_as<float>()[
number<2>{}],
2113 dst_wave_buffer_resource,
2114 dst_thread_addr_offset,
2115 dst_wave_addr_offset + 2 *
sizeof(
float),
2119 src_thread_data.template get_as<float>()[
number<3>{}],
2120 dst_wave_buffer_resource,
2121 dst_thread_addr_offset,
2122 dst_wave_addr_offset + 3 *
sizeof(
float),
2126 else if constexpr(std::is_same<T, fp16_t>::value)
2128 if constexpr(N == 2)
2131 dst_wave_buffer_resource,
2132 dst_thread_addr_offset,
2133 dst_wave_addr_offset,
2136 else if constexpr(N == 4)
2140 src_thread_data.template get_as<fp16x2_t>()[i],
2141 dst_wave_buffer_resource,
2142 dst_thread_addr_offset,
2143 dst_wave_addr_offset + i *
sizeof(
fp16x2_t),
2147 else if constexpr(N == 8)
2151 src_thread_data.template get_as<fp16x2_t>()[i],
2152 dst_wave_buffer_resource,
2153 dst_thread_addr_offset,
2154 dst_wave_addr_offset + i *
sizeof(
fp16x2_t),
2159 else if constexpr(std::is_same<T, bf16_t>::value)
2161 if constexpr(N == 2)
2164 dst_wave_buffer_resource,
2165 dst_thread_addr_offset,
2166 dst_wave_addr_offset,
2169 else if constexpr(N == 4)
2173 src_thread_data.template get_as<bf16x2_t>()[i],
2174 dst_wave_buffer_resource,
2175 dst_thread_addr_offset,
2176 dst_wave_addr_offset + i *
sizeof(
bf16x2_t),
2180 else if constexpr(N == 8)
2184 src_thread_data.template get_as<bf16x2_t>()[i],
2185 dst_wave_buffer_resource,
2186 dst_thread_addr_offset,
2187 dst_wave_addr_offset + i *
sizeof(
bf16x2_t),
2192 else if constexpr(std::is_same<T, int32_t>::value)
2194 if constexpr(N == 1)
2197 dst_wave_buffer_resource,
2198 dst_thread_addr_offset,
2199 dst_wave_addr_offset,
2202 else if constexpr(N == 2)
2205 src_thread_data.template get_as<int32_t>()[
number<0>{}],
2206 dst_wave_buffer_resource,
2207 dst_thread_addr_offset,
2208 dst_wave_addr_offset,
2212 src_thread_data.template get_as<int32_t>()[
number<1>{}],
2213 dst_wave_buffer_resource,
2214 dst_thread_addr_offset,
2215 dst_wave_addr_offset +
sizeof(
int32_t),
2218 else if constexpr(N == 4)
2221 src_thread_data.template get_as<int32_t>()[
number<0>{}],
2222 dst_wave_buffer_resource,
2223 dst_thread_addr_offset,
2224 dst_wave_addr_offset,
2228 src_thread_data.template get_as<int32_t>()[
number<1>{}],
2229 dst_wave_buffer_resource,
2230 dst_thread_addr_offset,
2231 dst_wave_addr_offset +
sizeof(
int32_t),
2235 src_thread_data.template get_as<int32_t>()[
number<2>{}],
2236 dst_wave_buffer_resource,
2237 dst_thread_addr_offset,
2238 dst_wave_addr_offset + 2 *
sizeof(
int32_t),
2242 src_thread_data.template get_as<int32_t>()[
number<3>{}],
2243 dst_wave_buffer_resource,
2244 dst_thread_addr_offset,
2245 dst_wave_addr_offset + 3 *
sizeof(
int32_t),
2251template <
typename T, index_t N>
2254 index_t dst_thread_addr_offset,
2257 static_assert((std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4)),
2258 "wrong! not implemented");
2259 if constexpr(std::is_same<T, double>::value)
2261 if constexpr(N == 1)
2264 dst_wave_buffer_resource,
2265 dst_thread_addr_offset,
2266 dst_wave_addr_offset,
2269 else if constexpr(N == 2)
2272 src_thread_data.template get_as<double>()[
number<0>{}],
2273 dst_wave_buffer_resource,
2274 dst_thread_addr_offset,
2275 dst_wave_addr_offset,
2279 src_thread_data.template get_as<double>()[
number<1>{}],
2280 dst_wave_buffer_resource,
2281 dst_thread_addr_offset,
2282 dst_wave_addr_offset +
sizeof(
double),
2285 else if constexpr(N == 4)
2288 src_thread_data.template get_as<double>()[
number<0>{}],
2289 dst_wave_buffer_resource,
2290 dst_thread_addr_offset,
2291 dst_wave_addr_offset,
2295 src_thread_data.template get_as<double>()[
number<1>{}],
2296 dst_wave_buffer_resource,
2297 dst_thread_addr_offset,
2298 dst_wave_addr_offset +
sizeof(
double),
2302 src_thread_data.template get_as<double>()[
number<2>{}],
2303 dst_wave_buffer_resource,
2304 dst_thread_addr_offset,
2305 dst_wave_addr_offset + 2 *
sizeof(
double),
2309 src_thread_data.template get_as<double>()[
number<3>{}],
2310 dst_wave_buffer_resource,
2311 dst_thread_addr_offset,
2312 dst_wave_addr_offset + 3 *
sizeof(
double),
2323template <
typename T,
2326 bool oob_conditional_check =
true>
2329 index_t src_thread_element_offset,
2330 bool src_thread_element_valid,
2331 index_t src_element_space_size)
2333 const int32x4_t src_wave_buffer_resource =
2336 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2338#if CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
2340 if constexpr(oob_conditional_check)
2341 return src_thread_element_valid ? 0 : 0x80000000;
2346 src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
2350 if constexpr(oob_conditional_check)
2361template <
typename T,
2364 bool oob_conditional_check =
true>
2367 index_t src_thread_element_offset,
2368 bool src_thread_element_valid,
2369 index_t src_element_space_size,
2372 const int32x4_t src_wave_buffer_resource =
2375 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2380 if constexpr(oob_conditional_check)
2381 return src_thread_element_valid ? tmp :
thread_buffer<T, N>{customized_value};
2386template <
typename T,
2389 bool oob_conditional_check =
true,
2390 bool pre_nop =
false>
2392 const T* p_src_wave,
2393 index_t src_thread_element_offset,
2394 index_t src_linear_element_offset,
2395 index_t src_element_space_size,
2399 const int32x4_t src_wave_buffer_resource =
2402 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2403 index_t src_linear_addr_offset = src_linear_element_offset *
sizeof(T);
2407 src_wave_buffer_resource,
2408 src_thread_addr_offset,
2410 src_linear_addr_offset,
2416template <
typename T,
2419 bool oob_conditional_check =
true,
2420 bool pre_nop =
false>
2422 const int32x4_t src_wave_buffer_resource,
2423 index_t src_thread_element_offset,
2424 index_t src_linear_element_offset,
2428 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2429 index_t src_linear_addr_offset = src_linear_element_offset *
sizeof(T);
2433 src_wave_buffer_resource,
2434 src_thread_addr_offset,
2436 src_linear_addr_offset,
2445template <
typename T,
2448 bool pre_nop =
false>
2450 const T* p_src_wave,
2451 index_t src_thread_element_offset,
2452 index_t src_linear_element_offset,
2453 index_t src_element_space_size,
2456 const int32x4_t src_wave_buffer_resource =
2459 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2460 index_t src_linear_addr_offset = src_linear_element_offset *
sizeof(T);
2463 src_wave_buffer_resource,
2464 src_thread_addr_offset,
2466 src_linear_addr_offset,
2471template <
typename T,
2474 bool pre_nop =
false>
2476 const int32x4_t src_wave_buffer_resource,
2477 index_t src_thread_element_offset,
2478 index_t src_linear_element_offset,
2481 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2482 index_t src_linear_addr_offset = src_linear_element_offset *
sizeof(T);
2485 src_wave_buffer_resource,
2486 src_thread_addr_offset,
2488 src_linear_addr_offset,
2493template <
typename T,
2496 bool oob_conditional_check =
false>
2498 const int32x4_t src_wave_buffer_resource,
2499 index_t src_thread_element_offset,
2500 index_t src_linear_element_offset,
2501 bool is_valid_element,
2504 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2505 index_t src_linear_addr_offset = src_linear_element_offset *
sizeof(T);
2508 src_wave_buffer_resource,
2509 src_thread_addr_offset,
2511 src_linear_addr_offset,
2520template <
typename T,
2523 bool oob_conditional_check =
true>
2526 const index_t dst_thread_element_offset,
2527 const bool dst_thread_element_valid,
2528 const index_t dst_element_space_size)
2530 const int32x4_t dst_wave_buffer_resource =
2533 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
2535#if CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
2537 if constexpr(oob_conditional_check)
2538 return dst_thread_element_valid ? 0 : 0x80000000;
2543 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
2545 if constexpr(oob_conditional_check)
2547 if(dst_thread_element_valid)
2550 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
2556 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
2561template <
typename T,
2564 bool oob_conditional_check =
true>
2567 const index_t dst_thread_element_offset,
2568 const index_t dst_linear_element_offset,
2569 const bool dst_thread_element_valid,
2570 const index_t dst_element_space_size)
2572 const int32x4_t dst_wave_buffer_resource =
2575 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
2576 index_t dst_linear_addr_offset = dst_linear_element_offset *
sizeof(T);
2579 dst_wave_buffer_resource,
2580 dst_thread_addr_offset,
2582 dst_linear_addr_offset,
2583 dst_thread_element_valid);
2590template <
typename T, index_t N>
2593 const index_t dst_thread_element_offset,
2594 const bool dst_thread_element_valid,
2595 const index_t dst_element_space_size)
2597 const int32x4_t dst_wave_buffer_resource =
2600 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
2602#if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
2603 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
2606 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
2608 if(dst_thread_element_valid)
2611 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
2616template <
typename T,
2619 bool oob_conditional_check =
true,
2620 bool pre_nop =
false>
2623 const index_t dst_thread_element_offset,
2624 const index_t dst_linear_element_offset,
2625 const bool dst_thread_element_valid,
2626 const index_t dst_element_space_size,
2629 const int32x4_t dst_wave_buffer_resource =
2632 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
2633 index_t dst_linear_addr_offset = dst_linear_element_offset *
sizeof(T);
2635 if constexpr(oob_conditional_check)
2638 dst_wave_buffer_resource,
2639 dst_thread_addr_offset,
2641 dst_linear_addr_offset,
2642 dst_thread_element_valid);
2647 dst_wave_buffer_resource,
2648 dst_thread_addr_offset,
2650 dst_linear_addr_offset,
2659template <
typename T, index_t N>
2662 const index_t dst_thread_element_offset,
2663 const bool dst_thread_element_valid,
2664 const index_t dst_element_space_size)
2666 const int32x4_t dst_wave_buffer_resource =
2669 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
2671#if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
2672 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
2675 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
2677 if(dst_thread_element_valid)
2680 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
2685template <
typename T, index_t NumElemsPerThread>
2690 const bool is_valid,
2691 const index_t src_element_space_size)
2697 const index_t global_offset_bytes = is_valid ? global_offset *
sizeof(T) : 0x80000000;
2699#if CK_TILE_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
2700 T* lds_ptr = lds_base_ptr + lds_offset;
2702 asm volatile(
"s_mov_b32 m0, %0; \n\t"
2703 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::
"s"(lds_ptr_sgpr),
2704 "v"(global_offset_bytes),
2709#if defined(__gfx9__)
2710 constexpr auto bytes_per_thread =
sizeof(T) * NumElemsPerThread;
2715#if defined(__gfx950__)
2716 constexpr auto dword_bytes = 4;
2717 static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
2718 bytes_per_thread == dword_bytes * 4);
2719#elif defined(__gfx9__)
2720 constexpr auto dword_bytes = 4;
2721 static_assert(bytes_per_thread == dword_bytes);
2728 src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
2732#if defined(__gfx950__)
2733template <
typename T, index_t N>
2734__device__
auto amd_transpose_load_to_vgpr(
const T* __restrict__ in_ptr)
2736#define __LDS_ADDR __attribute__((address_space(3)))
2738 static_assert(__has_builtin(__builtin_amdgcn_raw_buffer_load_b32),
2739 "We need to have the compatible compiler version to build this instruction");
2741#pragma clang diagnostic push
2742#pragma clang diagnostic ignored "-Wold-style-cast"
2744 const auto in_ptr_ = (__LDS_ADDR T*)(
const_cast<T*
>(in_ptr));
2745#pragma clang diagnostic pop
2748 typedef __attribute__((__vector_size__(4 *
sizeof(__fp16)))) __fp16 llvm_fp16x4_t;
2749 auto lds_ptr =
reinterpret_cast<__LDS_ADDR llvm_fp16x4_t*
>(in_ptr_);
2754 typedef __attribute__((__vector_size__(4 *
sizeof(__bf16)))) __bf16 llvm_bf16x4_t;
2755 auto lds_ptr =
reinterpret_cast<__LDS_ADDR llvm_bf16x4_t*
>(in_ptr_);
2758 else if constexpr(std::is_same_v<remove_cvref_t<T>,
ck_tile::fp8_t> ||
2762 typedef __attribute__((__vector_size__(2 *
sizeof(
index_t))))
index_t llvm_i32x2_t;
2763 auto lds_ptr =
reinterpret_cast<__LDS_ADDR llvm_i32x2_t*
>(in_ptr_);
2768 static_assert(
false,
"not implemented");
#define CK_TILE_DEVICE
Definition config.hpp:41
#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD
Definition config.hpp:210
#define CK_TILE_DEVICE_EXTERN
Definition config.hpp:43
#define CK_TILE_LDS_ADDR
Definition config.hpp:58
Definition tile/core/arch/amd_buffer_addressing.hpp:110
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 3 >(array< float, 3 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:997
CK_TILE_DEVICE void insert_dummy_dep()
Definition tile/core/arch/amd_buffer_addressing.hpp:1037
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 32 >(array< float, 32 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:1025
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 8 >(array< float, 8 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:1009
CK_TILE_DEVICE void insert_dummy_dep_per_dword(array< float, N > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:981
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 2 >(array< float, 2 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:991
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 4 >(array< float, 4 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:1003
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 16 >(array< float, 16 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:1016
Definition tile/core/algorithm/cluster_descriptor.hpp:13
CK_TILE_DEVICE_EXTERN int8x2_t llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
_Float16 fp16x2_t
Definition half.hpp:385
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:1535
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition type_traits.hpp:21
int8_t int8x2_t
Definition pk_int4.hpp:103
CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2874
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16(_Float16 vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
__device__ uint32_t amd_wave_read_first_lane(uint16_t v)
Definition tile/core/arch/amd_buffer_addressing.hpp:35
_Float16 half_t
Definition half.hpp:111
CK_TILE_DEVICE_EXTERN fp16x4_t llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
uint16_t uint16x2_t
Definition vector_type.hpp:181
int16_t int16x4_t
Definition vector_type.hpp:173
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
CK_TILE_DEVICE_EXTERN int8_t llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
constant< b > bool_constant
Definition tile/core/numeric/integral_constant.hpp:43
int8_t int8_t
Definition int8.hpp:20
CK_TILE_DEVICE void amd_buffer_store(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2738
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x2(int16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
bfloat16_t bf16_t
Definition bfloat16.hpp:113
CK_TILE_DEVICE auto async_load_fence_raw(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:1068
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x4(int16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
_Float16 fp16_t
Definition half.hpp:110
amd_buffer_coherence_enum
Definition tile/core/arch/amd_buffer_addressing.hpp:1404
@ glc_slc
Definition tile/core/arch/amd_buffer_addressing.hpp:1408
@ SYSTEM_NT1
Definition tile/core/arch/amd_buffer_addressing.hpp:1419
@ coherence_default
Definition tile/core/arch/amd_buffer_addressing.hpp:1405
@ WAVE_NT0
Definition tile/core/arch/amd_buffer_addressing.hpp:1412
@ slc
Definition tile/core/arch/amd_buffer_addressing.hpp:1407
@ DEVICE_NT1
Definition tile/core/arch/amd_buffer_addressing.hpp:1417
@ SYSTEM_NT0
Definition tile/core/arch/amd_buffer_addressing.hpp:1418
@ glc
Definition tile/core/arch/amd_buffer_addressing.hpp:1406
@ GROUP_NT1
Definition tile/core/arch/amd_buffer_addressing.hpp:1415
@ DEVICE_NT0
Definition tile/core/arch/amd_buffer_addressing.hpp:1416
@ GROUP_NT0
Definition tile/core/arch/amd_buffer_addressing.hpp:1414
@ WAVE_NT1
Definition tile/core/arch/amd_buffer_addressing.hpp:1413
CK_TILE_DEVICE_EXTERN double llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
_BitInt(8) fp8_t
Definition float8.hpp:204
CK_TILE_DEVICE_EXTERN fp32x2_t llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
CK_TILE_DEVICE void amd_async_buffer_load_with_oob(CK_TILE_LDS_ADDR T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool is_valid_element, bool_constant< oob_conditional_check >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:2711
tuple_array< T, N > thread_buffer
Definition thread_buffer.hpp:14
int32_t int32x4_t
Definition vector_type.hpp:155
CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T *smem, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:2663
bfloat16_t bf16x2_t
Definition pk_fp4.hpp:24
CK_TILE_HOST_DEVICE constexpr Y bit_cast(const X &x)
Definition bit_cast.hpp:11
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x2(fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x4(fp32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x4(fp16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
CK_TILE_DEVICE_EXTERN int32x4_t llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
CK_TILE_DEVICE void lds_load_fence(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:820
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer< int8_t, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:1926
CK_TILE_DEVICE_EXTERN bf16x2_t llvm_amdgcn_raw_buffer_atomic_add_bf16x2(bf16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16")
CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void *smem, int32x4_t rsrc, index_t voffset, index_t, index_t ioffset, index_t=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:1352
_Float16 fp16x4_t
Definition vector_type.hpp:137
CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
CK_TILE_DEVICE void amd_buffer_atomic_max_impl(const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:2466
CK_TILE_DEVICE void amd_buffer_atomic_add_raw(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:2835
CK_TILE_DEVICE_EXTERN int32x2_t llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
constant< v > number
Definition tile/core/numeric/integral_constant.hpp:37
CK_TILE_DEVICE void amd_buffer_atomic_add_impl(const thread_buffer< T, N > &src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:2272
constexpr detail::ignore_t ignore
Definition tile/core/utility/ignore.hpp:20
CK_TILE_DEVICE void buffer_store_fence(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:1063
CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
bfloat16_t bf16x4_t
Definition vector_type.hpp:146
int32_t int32_t
Definition integer.hpp:10
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer< T, N > &dst, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, index_t is_valid_element=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:2605
bfloat16_t bf16x8_t
Definition vector_type.hpp:147
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_zero(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2542
CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:1393
CK_TILE_DEVICE void amd_buffer_store_raw(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2779
CK_TILE_DEVICE_EXTERN fp32x4_t llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
CK_TILE_DEVICE void buffer_load_fence(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:815
typename impl::ext_vector< T, N >::type ext_vector_t
Definition vector_type.hpp:84
unsigned _BitInt(8) bf8_t
Definition float8.hpp:206
CK_TILE_DEVICE_EXTERN int16_t llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
CK_TILE_DEVICE_EXTERN int16x4_t llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:2026
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x2(fp32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
float fp32x4_t
Definition vector_type.hpp:128
CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void *ptr, uint32_t size=0xffffffff, ForceSGPR={})
Definition tile/core/arch/amd_buffer_addressing.hpp:97
CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2805
uint16_t uint16x4_t
Definition vector_type.hpp:182
float fp32x2_t
Definition pk_fp4.hpp:22
int8_t int8x4_t
Definition vector_type.hpp:191
CK_TILE_DEVICE_EXTERN int16x2_t llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, index_t flag=0, bool_constant< oob_conditional_check >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:1882
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x2(uint16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
int32_t index_t
Definition integer.hpp:9
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition tile/core/arch/amd_buffer_addressing.hpp:2580
int32_t int32x2_t
Definition vector_type.hpp:154
CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
CK_TILE_DEVICE_EXTERN _Float16 llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x4(uint16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
CK_TILE_DEVICE thread_buffer< int8_t, N > amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:1425
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, as3_uint32_ptr lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
CK_TILE_DEVICE void amd_async_buffer_load_impl(CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:1857
CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16(int16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
int16_t int16x2_t
Definition vector_type.hpp:172
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
CK_TILE_DEVICE void amd_buffer_store_raw_impl(const thread_buffer< T, N > &dst_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset, index_t dst_linear_addr_offset, index_t is_valid_element=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:2240
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16(uint16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
CK_TILE_DEVICE_EXTERN int8x4_t llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
CK_TILE_DEVICE void amd_buffer_load_raw_impl(thread_buffer< T, N > &dst, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_linear_addr_offset, index_t flag=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:1818
__device__ void amd_direct_load_global_to_lds(const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)
Definition utility/amd_buffer_addressing.hpp:1015
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
signed short int16_t
Definition stdint.h:122
unsigned short uint16_t
Definition stdint.h:125
_W64 unsigned int uintptr_t
Definition stdint.h:164
unsigned int uint32_t
Definition stdint.h:126
unsigned char uint8_t
Definition stdint.h:124
Definition tile/core/arch/amd_buffer_addressing.hpp:826
Definition tile/core/arch/amd_buffer_addressing.hpp:857
Definition tile/core/arch/amd_buffer_addressing.hpp:134
Definition tile/core/arch/amd_buffer_addressing.hpp:131
Definition tile/core/arch/amd_buffer_addressing.hpp:90
Definition tile/core/arch/amd_buffer_addressing.hpp:140
Definition tile/core/arch/amd_buffer_addressing.hpp:137
fp32x4_t payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:115
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:119
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:118
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:117
fp32x2_t payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:116
Definition tile/core/arch/amd_buffer_addressing.hpp:113
fp32x4_t payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:884
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:888
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:886
fp32x2_t payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:885
Definition tile/core/arch/amd_buffer_addressing.hpp:882
static CK_TILE_HOST_DEVICE constexpr T zero()
Definition tile/core/numeric/numeric.hpp:58
Definition coordinate_transform.hpp:1392
Definition tile/core/arch/amd_buffer_addressing.hpp:895
Definition tile/core/utility/functional.hpp:43
Definition tile/core/utility/debug.hpp:67
uint32_t * as3_uint32_ptr
Definition tile/core/arch/amd_buffer_addressing.hpp:29
#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr)