8#if !CK_TILE_USE_BUFFER_ADDRESSING_BUILTIN
23#if __cplusplus >= 202002L
24#define LIKELY(x) (x) [[likely]]
26#define LIKELY(x) (__builtin_expect(!!(x), 1))
37 return __builtin_amdgcn_readfirstlane(
static_cast<uint32_t>(v));
42 return __builtin_amdgcn_readfirstlane(
static_cast<uint32_t>(v));
47 return __builtin_amdgcn_readfirstlane(
value);
52 return __builtin_amdgcn_readfirstlane(
value);
55template <
typename Object, std::enable_if_t<std::is_trivially_copyable_v<Object>,
int> = 0>
58 constexpr size_t ObjectSize =
sizeof(Object);
59 constexpr size_t SGPR_size = 4;
60 constexpr size_t NumFull = ObjectSize / SGPR_size;
61 constexpr size_t Tail = ObjectSize % SGPR_size;
63 const unsigned char* src =
reinterpret_cast<const unsigned char*
>(&obj);
64 alignas(Object)
unsigned char dst[ObjectSize];
67 constexpr size_t offset = Ic * SGPR_size;
69 __builtin_memcpy(&read_src, src +
offset, SGPR_size);
70 read_src = __builtin_amdgcn_readfirstlane(read_src);
71 __builtin_memcpy(dst +
offset, &read_src, SGPR_size);
74 if constexpr(Tail != 0)
76 constexpr size_t offset = NumFull * SGPR_size;
78 __builtin_memcpy(&tail_loc, src +
offset, Tail);
79 tail_loc = __builtin_amdgcn_readfirstlane(tail_loc);
80 __builtin_memcpy(dst +
offset, &tail_loc, Tail);
83 __builtin_memcpy(&out, dst, ObjectSize);
96template <
typename ForceSGPR = std::false_type>
103 if constexpr(std::is_same_v<ForceSGPR, std::true_type>)
121#if CK_TILE_BUFFER_LOAD_RAW_BF16_WA
130template <index_t
bytes,
bool pre_nop = false>
133template <index_t
bytes,
bool pre_nop = false>
136template <index_t
bytes>
139template <index_t
bytes>
142#pragma clang diagnostic push
143#pragma clang diagnostic ignored "-Wundefined-reinterpret-cast"
147#define HAS_RAW_BUFFER_BUILTINS \
148 __has_builtin(__builtin_amdgcn_raw_buffer_load_b32) && \
149 __has_builtin(__builtin_amdgcn_make_buffer_rsrc) && \
150 __has_builtin(__builtin_amdgcn_raw_buffer_store_b32)
152#if HAS_RAW_BUFFER_BUILTINS
155 __amdgpu_buffer_rsrc_t as_rsrc;
156 static_assert(
sizeof(res) ==
sizeof(as_rsrc) &&
"Size of buffer resource should match");
157 memcpy(&as_rsrc, &res,
sizeof(res));
162template <
bool pre_nop>
165 template <
typename T>
174 static_assert(
sizeof(T) == 16);
176#if HAS_RAW_BUFFER_BUILTINS
178 reinterpret_cast<mbuf_t&
>(
value) = __builtin_amdgcn_raw_buffer_load_b128(
179 cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
181 if constexpr(pre_nop)
182 asm volatile(
"s_nop 4\n"
183 "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
184 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
185 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
188 asm volatile(
"buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3"
189 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
190 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
196template <
bool pre_nop>
199 template <
typename T>
208 static_assert(
sizeof(T) == 8);
210#if HAS_RAW_BUFFER_BUILTINS
212 reinterpret_cast<mbuf_t&
>(
value) = __builtin_amdgcn_raw_buffer_load_b64(
213 cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
215 if constexpr(pre_nop)
216 asm volatile(
"s_nop 4\n"
217 "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
218 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
219 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
222 asm volatile(
"buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3"
223 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
224 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
230template <
bool pre_nop>
233 template <
typename T>
242 static_assert(
sizeof(T) == 4);
245#if HAS_RAW_BUFFER_BUILTINS
247 reinterpret_cast<mbuf_t&
>(
value) = __builtin_amdgcn_raw_buffer_load_b32(
248 cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
250 if constexpr(pre_nop)
251 asm volatile(
"s_nop 4\n"
252 "buffer_load_dword %0, %1, %2, 0 offen offset:%3"
253 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
254 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
257 asm volatile(
"buffer_load_dword %0, %1, %2, 0 offen offset:%3"
258 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
259 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
265template <
bool pre_nop>
268 template <
typename T>
277 static_assert(
sizeof(T) == 4);
280#if HAS_RAW_BUFFER_BUILTINS
282 reinterpret_cast<mbuf_t&
>(
value) = __builtin_amdgcn_raw_buffer_load_b16(
283 cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
285 if constexpr(pre_nop)
286 asm volatile(
"s_nop 4\n"
287 "buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
288 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
289 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
292 asm volatile(
"buffer_load_ushort %0, %1, %2, 0 offen offset:%3"
293 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
294 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
300template <
bool pre_nop>
303 template <
typename T>
312 static_assert(
sizeof(T) == 4);
314#if HAS_RAW_BUFFER_BUILTINS
316 reinterpret_cast<mbuf_t&
>(
value) = __builtin_amdgcn_raw_buffer_load_b16(
317 cast_to_amdgpu_buffer_rsrc_t(res), v_offset, s_offset, 0);
319 if constexpr(pre_nop)
320 asm volatile(
"s_nop 4\n"
321 "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
322 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
323 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
326 asm volatile(
"buffer_load_ubyte %0, %1, %2, 0 offen offset:%3"
327 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
328 :
"v"(v_offset),
"s"(res),
"n"(i_offset)
334#if HAS_RAW_BUFFER_BUILTINS
335template <index_t
bytes,
bool pre_nop>
338 template <
typename T>
349 buffer_load<bytes, pre_nop>{}(
355template <
bool pre_nop>
358 template <
typename T>
367 static_assert(
sizeof(T) == 16);
368 auto saved_exec = __builtin_amdgcn_read_exec();
370 static_assert(
sizeof(mbuf_t) ==
sizeof(T));
371 if constexpr(pre_nop)
372 asm volatile(
"s_nop 4\n"
373 "v_cmpx_le_u32 exec, 1, %4\n"
374 "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
376 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
377 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
380 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
381 "buffer_load_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
383 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
384 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
389template <
bool pre_nop>
392 template <
typename T>
401 static_assert(
sizeof(T) == 8);
402 auto saved_exec = __builtin_amdgcn_read_exec();
404 if constexpr(pre_nop)
405 asm volatile(
"s_nop 4\n"
406 "v_cmpx_le_u32 exec, 1, %4\n"
407 "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
409 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
410 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
413 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
414 "buffer_load_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
416 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
417 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
422template <
bool pre_nop>
425 template <
typename T>
434 static_assert(
sizeof(T) == 4);
435 auto saved_exec = __builtin_amdgcn_read_exec();
437 if constexpr(pre_nop)
438 asm volatile(
"s_nop 4\n"
439 "v_cmpx_le_u32 exec, 1, %4\n"
440 "buffer_load_dword %0, %1, %2, 0 offen offset:%3\n"
442 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
443 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
446 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
447 "buffer_load_dword %0, %1, %2, 0 offen offset:%3\n"
449 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
450 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
455template <
bool pre_nop>
458 template <
typename T>
467 static_assert(
sizeof(T) == 4);
468 auto saved_exec = __builtin_amdgcn_read_exec();
470 if constexpr(pre_nop)
471 asm volatile(
"s_nop 4\n"
472 "v_cmpx_le_u32 exec, 1, %4\n"
473 "buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n"
475 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
476 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
479 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
480 "buffer_load_ushort %0, %1, %2, 0 offen offset:%3\n"
482 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
483 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
488template <
bool pre_nop>
491 template <
typename T>
500 static_assert(
sizeof(T) == 4);
501 auto saved_exec = __builtin_amdgcn_read_exec();
503 if constexpr(pre_nop)
504 asm volatile(
"s_nop 4\n"
505 "v_cmpx_le_u32 exec, 1, %4\n"
506 "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n"
508 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
509 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
512 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
513 "buffer_load_ubyte %0, %1, %2, 0 offen offset:%3\n"
515 :
"+v"(
reinterpret_cast<mbuf_t&
>(
value))
516 :
"v"(v_offset),
"s"(res),
"n"(i_offset),
"v"(flag),
"s"(saved_exec)
522#pragma clang diagnostic pop
527 template <
typename T>
535 static_assert(
sizeof(T) == 16);
537#if HAS_RAW_BUFFER_BUILTINS
539 __builtin_amdgcn_raw_buffer_store_b128(
542 asm volatile(
"buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3"
553 template <
typename T>
561 static_assert(
sizeof(T) == 8);
563#if HAS_RAW_BUFFER_BUILTINS
565 __builtin_amdgcn_raw_buffer_store_b64(
568 asm volatile(
"buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3"
579 template <
typename T>
587 static_assert(
sizeof(T) == 4);
589#if HAS_RAW_BUFFER_BUILTINS
591 __builtin_amdgcn_raw_buffer_store_b32(
594 asm volatile(
"buffer_store_dword %0, %1, %2, 0 offen offset:%3"
605 template <
typename T>
613 static_assert(
sizeof(T) == 2);
615#if HAS_RAW_BUFFER_BUILTINS
617 __builtin_amdgcn_raw_buffer_store_b16(
620 asm volatile(
"buffer_store_short %0, %1, %2, 0 offen offset:%3"
631 template <
typename T>
639 static_assert(
sizeof(T) == 1);
641#if HAS_RAW_BUFFER_BUILTINS
643 __builtin_amdgcn_raw_buffer_store_b8(
646 asm volatile(
"buffer_store_byte %0, %1, %2, 0 offen offset:%3"
654#if HAS_RAW_BUFFER_BUILTINS
655template <index_t
bytes>
656struct buffer_store_if
658 template <
typename T>
668 buffer_store<bytes>{}(
value, res, v_offset, s_offset, i_offset);
676 template <
typename T>
684 static_assert(
sizeof(T) == 16);
685 auto save_exec = __builtin_amdgcn_read_exec();
687 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
688 "buffer_store_dwordx4 %0, %1, %2, 0 offen offset:%3\n"
704 template <
typename T>
712 static_assert(
sizeof(T) == 8);
713 auto save_exec = __builtin_amdgcn_read_exec();
715 using mbuf_t =
ext_vector_t<
typename T::value_type, T::size()>;
716 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
717 "buffer_store_dwordx2 %0, %1, %2, 0 offen offset:%3\n"
733 template <
typename T>
741 static_assert(
sizeof(T) == 4);
742 auto save_exec = __builtin_amdgcn_read_exec();
743 using mbuf_t = float;
744 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
745 "buffer_store_dword %0, %1, %2, 0 offen offset:%3\n"
761 template <
typename T>
769 static_assert(
sizeof(T) == 2);
770 auto save_exec = __builtin_amdgcn_read_exec();
771 using mbuf_t = short;
772 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
773 "buffer_store_short %0, %1, %2, 0 offen offset:%3\n"
789 template <
typename T>
797 static_assert(
sizeof(T) == 4);
798 auto save_exec = __builtin_amdgcn_read_exec();
799 using mbuf_t = float;
800 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
801 "buffer_store_byte %0, %1, %2, 0 offen offset:%3\n"
817 asm volatile(
"s_waitcnt vmcnt(%0)" : :
"n"(cnt) :
"memory");
822 asm volatile(
"s_waitcnt lgkmcnt(%0)" : :
"n"(cnt) :
"memory");
825template <
typename scalar_type, index_t N,
bool pre_nop = false>
828template <
bool pre_nop>
831 template <
typename T>
839 static_assert(
sizeof(T) == 4);
840 auto save_exec = __builtin_amdgcn_read_exec();
841 using mbuf_t = float;
842 asm volatile(
"v_cmpx_le_u32 exec, 1, %4\n"
843 "global_atomic_pk_add_bf16 %0, %1, %2 offset:%3\n"
856template <
typename scalar_type, index_t N,
bool pre_nop = false>
859template <
bool pre_nop>
862 template <
typename T>
870 static_assert(
sizeof(T) == 4);
871 using mbuf_t = float;
872 asm volatile(
"global_atomic_pk_add_bf16 %0, %1, %2 offset:%3"
900 template <
typename T>
903 static_assert(
sizeof(T) == 16);
905 asm volatile(
"ds_read_b128 %0, %1 offset:%2"
906 :
"=v"(
reinterpret_cast<mbuf_t&
>(
value))
907 :
"v"(v_offset),
"n"(i_offset)
915 template <
typename T>
918 static_assert(
sizeof(T) == 8);
920 asm volatile(
"ds_read_b64 %0, %1 offset:%2"
921 :
"=v"(
reinterpret_cast<mbuf_t&
>(
value))
922 :
"v"(v_offset),
"n"(i_offset)
930 template <
typename T>
933 static_assert(
sizeof(T) == 4);
935 asm volatile(
"ds_read_b32 %0, %1 offset:%2"
936 :
"=v"(
reinterpret_cast<mbuf_t&
>(
value))
937 :
"v"(v_offset),
"n"(i_offset)
945 template <
typename T>
948 static_assert(
sizeof(T) == 4);
950 asm volatile(
"ds_read_u16 %0, %1 offset:%2"
951 :
"=v"(
reinterpret_cast<mbuf_t&
>(
value))
952 :
"v"(v_offset),
"n"(i_offset)
960 template <
typename T>
963 static_assert(
sizeof(T) == 4);
965 asm volatile(
"ds_read_u8 %0, %1 offset:%2"
966 :
"=v"(
reinterpret_cast<mbuf_t&
>(
value))
967 :
"v"(v_offset),
"n"(i_offset)
985 asm volatile(
" " : :
"v"(b.
get(
number<i>{})) :
"memory");
1043 using da_type =
array<float, (
sizeof(T) + 3) / 4>;
1044 auto & dummy =
reinterpret_cast<da_type&
>(buffer);
1048template<
typename Tx,
typename... Ty>
1056template <
typename... T>
1059 asm volatile(
"s_waitcnt vmcnt(%0)" : :
"n"(cnt) :
"memory");
1065 asm volatile(
"s_waitcnt vmcnt(%0)" : :
"n"(cnt) :
"memory");
1070 asm volatile(
"s_waitcnt vmcnt(%0)" : :
"n"(cnt) :
"memory");
1306 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");
1315 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16");
1331 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.f32");
1339 int glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fmax.f64");
1349 index_t aux) __asm(
"llvm.amdgcn.raw.buffer.load.lds");
1351template <
unsigned num_dwords,
bool pre_nop = false>
1360#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr) \
1361 if constexpr(pre_nop) \
1362 asm volatile("s_nop 4\n" instr " %1, %2, 0 offen offset:%3 lds" \
1364 : "v"(voffset), "s"(rsrc), "n"(ioffset) \
1367 asm volatile(instr " %1, %2, 0 offen offset:%3 lds" \
1369 : "v"(voffset), "s"(rsrc), "n"(ioffset) \
1372 if constexpr(num_dwords == 1)
1376#if defined(__gfx950__)
1377 else if constexpr(num_dwords == 3)
1381 else if constexpr(num_dwords == 4)
1388 static_assert(
false,
"wrong! not implemented data width");
1390#undef CK_TILE_ASYNC_LOAD_WITH_INSTR
1395 asm volatile(
"s_waitcnt vmcnt(%0)" : :
"n"(cnt) :
"memory");
1426 index_t src_thread_addr_offset,
1429 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
1430 "wrong! not implemented");
1434 if constexpr(N == 1)
1437 src_thread_addr_offset,
1438 src_wave_addr_offset,
1439 static_cast<index_t>(coherence)));
1441 else if constexpr(N == 2)
1445 src_thread_addr_offset,
1446 src_wave_addr_offset,
1447 static_cast<index_t>(coherence));
1451 else if constexpr(N == 4)
1454 src_thread_addr_offset,
1455 src_wave_addr_offset,
1456 static_cast<index_t>(coherence));
1460 else if constexpr(N == 8)
1463 src_thread_addr_offset,
1464 src_wave_addr_offset,
1465 static_cast<index_t>(coherence));
1469 else if constexpr(N == 16)
1472 src_thread_addr_offset,
1473 src_wave_addr_offset,
1474 static_cast<index_t>(coherence));
1477 else if constexpr(N == 32)
1480 src_thread_addr_offset,
1481 src_wave_addr_offset,
1482 static_cast<index_t>(coherence));
1485 src_thread_addr_offset,
1486 src_wave_addr_offset + 4 *
sizeof(
int32_t),
1487 static_cast<index_t>(coherence));
1490 tmp.template get_as<int32x4_t>()(
number<0>{}) = tmp0;
1491 tmp.template get_as<int32x4_t>()(
number<1>{}) = tmp1;
1495 else if constexpr(N == 64)
1498 src_thread_addr_offset,
1499 src_wave_addr_offset,
1500 static_cast<index_t>(coherence));
1503 src_thread_addr_offset,
1504 src_wave_addr_offset + 4 *
sizeof(
int32_t),
1505 static_cast<index_t>(coherence));
1508 src_thread_addr_offset,
1509 src_wave_addr_offset + 8 *
sizeof(
int32_t),
1510 static_cast<index_t>(coherence));
1513 src_thread_addr_offset,
1514 src_wave_addr_offset + 12 *
sizeof(
int32_t),
1515 static_cast<index_t>(coherence));
1519 tmp.template get_as<int32x4_t>()(
number<0>{}) = tmp0;
1520 tmp.template get_as<int32x4_t>()(
number<1>{}) = tmp1;
1521 tmp.template get_as<int32x4_t>()(
number<2>{}) = tmp2;
1522 tmp.template get_as<int32x4_t>()(
number<3>{}) = tmp3;
1528#ifndef BUFFER_LOAD_USE_INLINEASM
1529#define BUFFER_LOAD_USE_INLINEASM 0
1532template <
typename T,
1536 index_t src_thread_addr_offset,
1540 (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
1541 (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1542 (std::is_same<T, fp16_t>::value &&
1543 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
1544 (std::is_same<T, bf16_t>::value &&
1545 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32)) ||
1546 (std::is_same<T, int32_t>::value &&
1547 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1548 (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1549 (std::is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1550 (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1551 (std::is_same<T, e8m0_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
1552 (std::is_same<T, pk_int4_t>::value &&
1553 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32) ||
1554 (std::is_same<T, pk_fp4_t>::value &&
1555 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16))),
1556 "wrong! not implemented");
1560 if constexpr(std::is_same<T, float>::value)
1562 if constexpr(N == 1)
1566 src_thread_addr_offset,
1567 src_wave_addr_offset,
1568 static_cast<index_t>(coherence)));
1570 else if constexpr(N == 2)
1574 src_thread_addr_offset,
1575 src_wave_addr_offset,
1576 static_cast<index_t>(coherence)));
1578 else if constexpr(N == 4)
1582 src_thread_addr_offset,
1583 src_wave_addr_offset,
1584 static_cast<index_t>(coherence)));
1586 else if constexpr(N == 8)
1590 tmp.template get_as<fp32x4_t>()(
number<0>{}) =
1592 src_thread_addr_offset,
1593 src_wave_addr_offset,
1594 static_cast<index_t>(coherence));
1596 tmp.template get_as<fp32x4_t>()(
number<1>{}) =
1598 src_thread_addr_offset,
1599 src_wave_addr_offset + 4 *
sizeof(
float),
1600 static_cast<index_t>(coherence));
1604 else if constexpr(N == 16)
1608 tmp.template get_as<fp32x4_t>()(
number<0>{}) =
1610 src_thread_addr_offset,
1611 src_wave_addr_offset,
1612 static_cast<index_t>(coherence));
1614 tmp.template get_as<fp32x4_t>()(
number<1>{}) =
1616 src_thread_addr_offset,
1617 src_wave_addr_offset + 4 *
sizeof(
float),
1618 static_cast<index_t>(coherence));
1620 tmp.template get_as<fp32x4_t>()(
number<2>{}) =
1622 src_thread_addr_offset,
1623 src_wave_addr_offset + 8 *
sizeof(
float),
1624 static_cast<index_t>(coherence));
1626 tmp.template get_as<fp32x4_t>()(
number<3>{}) =
1628 src_thread_addr_offset,
1629 src_wave_addr_offset + 12 *
sizeof(
float),
1630 static_cast<index_t>(coherence));
1635 else if constexpr(std::is_same<T, fp16_t>::value)
1637 if constexpr(N == 1)
1641 src_thread_addr_offset,
1642 src_wave_addr_offset,
1643 static_cast<index_t>(coherence)));
1645 else if constexpr(N == 2)
1649 src_thread_addr_offset,
1650 src_wave_addr_offset,
1651 static_cast<index_t>(coherence)));
1653 else if constexpr(N == 4)
1657 src_thread_addr_offset,
1658 src_wave_addr_offset,
1659 static_cast<index_t>(coherence)));
1661 else if constexpr(N == 8)
1665 src_thread_addr_offset,
1666 src_wave_addr_offset,
1667 static_cast<index_t>(coherence));
1671 else if constexpr(N == 16)
1675 tmp.template get_as<fp32x4_t>()(
number<0>{}) =
1677 src_thread_addr_offset,
1678 src_wave_addr_offset,
1679 static_cast<index_t>(coherence));
1681 tmp.template get_as<fp32x4_t>()(
number<1>{}) =
1683 src_thread_addr_offset,
1684 src_wave_addr_offset + 4 *
sizeof(
float),
1685 static_cast<index_t>(coherence));
1689 else if constexpr(N == 32)
1693 tmp.template get_as<fp32x4_t>()(
number<0>{}) =
1695 src_thread_addr_offset,
1696 src_wave_addr_offset,
1697 static_cast<index_t>(coherence));
1699 tmp.template get_as<fp32x4_t>()(
number<1>{}) =
1701 src_thread_addr_offset,
1702 src_wave_addr_offset + 4 *
sizeof(
float),
1703 static_cast<index_t>(coherence));
1705 tmp.template get_as<fp32x4_t>()(
number<2>{}) =
1707 src_thread_addr_offset,
1708 src_wave_addr_offset + 8 *
sizeof(
float),
1709 static_cast<index_t>(coherence));
1711 tmp.template get_as<fp32x4_t>()(
number<3>{}) =
1713 src_thread_addr_offset,
1714 src_wave_addr_offset + 12 *
sizeof(
float),
1715 static_cast<index_t>(coherence));
1720 else if constexpr(std::is_same<T, bf16_t>::value)
1722 if constexpr(N == 1)
1726 src_thread_addr_offset,
1727 src_wave_addr_offset,
1728 static_cast<index_t>(coherence)));
1730 else if constexpr(N == 2)
1734 src_thread_addr_offset,
1735 src_wave_addr_offset,
1736 static_cast<index_t>(coherence)));
1738 else if constexpr(N == 4)
1742 src_thread_addr_offset,
1743 src_wave_addr_offset,
1744 static_cast<index_t>(coherence)));
1746 else if constexpr(N == 8)
1749 src_thread_addr_offset,
1750 src_wave_addr_offset,
1751 static_cast<index_t>(coherence));
1755 else if constexpr(N == 16)
1759 tmp.template get_as<fp32x4_t>()(
number<0>{}) =
1761 src_thread_addr_offset,
1762 src_wave_addr_offset,
1763 static_cast<index_t>(coherence));
1765 tmp.template get_as<fp32x4_t>()(
number<1>{}) =
1767 src_thread_addr_offset,
1768 src_wave_addr_offset + 4 *
sizeof(
float),
1769 static_cast<index_t>(coherence));
1773 else if constexpr(N == 32)
1777 tmp.template get_as<fp32x4_t>()(
number<0>{}) =
1779 src_thread_addr_offset,
1780 src_wave_addr_offset,
1781 static_cast<index_t>(coherence));
1783 tmp.template get_as<fp32x4_t>()(
number<1>{}) =
1785 src_thread_addr_offset,
1786 src_wave_addr_offset + 4 *
sizeof(
float),
1787 static_cast<index_t>(coherence));
1789 tmp.template get_as<fp32x4_t>()(
number<2>{}) =
1791 src_thread_addr_offset,
1792 src_wave_addr_offset + 8 *
sizeof(
float),
1793 static_cast<index_t>(coherence));
1795 tmp.template get_as<fp32x4_t>()(
number<3>{}) =
1797 src_thread_addr_offset,
1798 src_wave_addr_offset + 12 *
sizeof(
float),
1799 static_cast<index_t>(coherence));
1807 src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
1813template <
typename T,
1816 bool oob_conditional_check =
true,
1817 bool pre_nop =
false>
1820 index_t src_thread_addr_offset,
1822 index_t src_linear_addr_offset,
1826 constexpr index_t bytes =
sizeof(T) * N;
1827 static_assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8 || bytes == 16,
1828 "wrong! not supported by buffer_load instruction");
1831 if constexpr(oob_conditional_check)
1833 buffer_load_if<
sizeof(type), pre_nop>{}(dst,
1834 src_wave_buffer_resource,
1835 src_thread_addr_offset,
1836 src_wave_addr_offset,
1837 src_linear_addr_offset,
1844 src_wave_buffer_resource,
1845 src_thread_addr_offset,
1846 src_wave_addr_offset,
1847 src_linear_addr_offset,
1853template <
typename T,
1856 bool pre_nop =
false>
1859 index_t src_thread_addr_offset,
1861 index_t src_immediate_addr_offset = 0,
1864 constexpr index_t num_bytes =
sizeof(T) * N;
1865 constexpr index_t num_words = num_bytes / 4;
1866 static_assert(num_bytes % 4 == 0 && (num_words == 1 || num_words == 3 || num_words == 4),
1867 "wrong! only support in dword, dwordx3, dwordx4");
1870 src_wave_buffer_resource,
1871 src_thread_addr_offset,
1872 src_wave_addr_offset,
1873 src_immediate_addr_offset,
1878template <
typename T,
1881 bool oob_conditional_check =
true>
1884 index_t src_thread_addr_offset,
1886 index_t src_immediate_addr_offset = 0,
1890 constexpr index_t bytes =
sizeof(T) * N;
1894 assert(src_immediate_addr_offset == 0 &&
1895 "wrong! not implemented src_immediate_addr_offset size, only 0 supported");
1896 ignore = src_immediate_addr_offset;
1898#if defined(__gfx950__)
1899 static_assert(bytes == 4 || bytes == 12 || bytes == 16,
1900 "wrong! only support in dword, dwordx3, dwordx4");
1901 src_wave_addr_offset = 0;
1903 static_assert(bytes == 4,
"wrong! not implemented vector size");
1907 index_t v_offset = src_thread_addr_offset;
1908 if constexpr(oob_conditional_check)
1909 v_offset = flag ? v_offset : src_wave_buffer_resource[2];
1911#pragma clang diagnostic push
1912#pragma clang diagnostic ignored "-Wold-style-cast"
1918 src_wave_addr_offset,
1920 static_cast<index_t>(coherence));
1921#pragma clang diagnostic pop
1928 index_t dst_thread_addr_offset,
1931 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
1932 "wrong! not implemented");
1934 if constexpr(N == 1)
1937 dst_wave_buffer_resource,
1938 dst_thread_addr_offset,
1939 dst_wave_addr_offset,
1940 static_cast<index_t>(coherence));
1942 else if constexpr(N == 2)
1946 dst_wave_buffer_resource,
1947 dst_thread_addr_offset,
1948 dst_wave_addr_offset,
1949 static_cast<index_t>(coherence));
1951 else if constexpr(N == 4)
1954 dst_wave_buffer_resource,
1955 dst_thread_addr_offset,
1956 dst_wave_addr_offset,
1957 static_cast<index_t>(coherence));
1959 else if constexpr(N == 8)
1962 dst_wave_buffer_resource,
1963 dst_thread_addr_offset,
1964 dst_wave_addr_offset,
1965 static_cast<index_t>(coherence));
1967 else if constexpr(N == 16)
1970 dst_wave_buffer_resource,
1971 dst_thread_addr_offset,
1972 dst_wave_addr_offset,
1973 static_cast<index_t>(coherence));
1975 else if constexpr(N == 32)
1978 src_thread_data.template get_as<int32x4_t>()[
number<0>{}],
1979 dst_wave_buffer_resource,
1980 dst_thread_addr_offset,
1981 dst_wave_addr_offset,
1982 static_cast<index_t>(coherence));
1985 src_thread_data.template get_as<int32x4_t>()[
number<1>{}],
1986 dst_wave_buffer_resource,
1987 dst_thread_addr_offset,
1988 dst_wave_addr_offset +
sizeof(
int32_t) * 4,
1989 static_cast<index_t>(coherence));
1991 else if constexpr(N == 64)
1994 src_thread_data.template get_as<int32x4_t>()[
number<0>{}],
1995 dst_wave_buffer_resource,
1996 dst_thread_addr_offset,
1997 dst_wave_addr_offset,
1998 static_cast<index_t>(coherence));
2001 src_thread_data.template get_as<int32x4_t>()[
number<1>{}],
2002 dst_wave_buffer_resource,
2003 dst_thread_addr_offset,
2004 dst_wave_addr_offset +
sizeof(
int32_t) * 4,
2005 static_cast<index_t>(coherence));
2008 src_thread_data.template get_as<int32x4_t>()[
number<2>{}],
2009 dst_wave_buffer_resource,
2010 dst_thread_addr_offset,
2011 dst_wave_addr_offset +
sizeof(
int32_t) * 8,
2012 static_cast<index_t>(coherence));
2015 src_thread_data.template get_as<int32x4_t>()[
number<3>{}],
2016 dst_wave_buffer_resource,
2017 dst_thread_addr_offset,
2018 dst_wave_addr_offset +
sizeof(
int32_t) * 12,
2019 static_cast<index_t>(coherence));
2023template <
typename T,
2028 index_t dst_thread_addr_offset,
2032 (std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4 || N == 8)) ||
2033 (std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
2034 (std::is_same<T, fp16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
2035 (std::is_same<T, bf16_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
2036 (std::is_same<T, int32_t>::value &&
2037 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
2038 (std::is_same<T, fp8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
2039 (std::is_same<T, bf8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
2040 (std::is_same<T, int8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
2041 (std::is_same<T, uint16_t>::value &&
2042 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
2043 (std::is_same<T, uint8_t>::value && (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)),
2044 "wrong! not implemented");
2046 if constexpr(std::is_same<T, float>::value)
2048 if constexpr(N == 1)
2051 dst_wave_buffer_resource,
2052 dst_thread_addr_offset,
2053 dst_wave_addr_offset,
2054 static_cast<index_t>(coherence));
2056 else if constexpr(N == 2)
2059 dst_wave_buffer_resource,
2060 dst_thread_addr_offset,
2061 dst_wave_addr_offset,
2062 static_cast<index_t>(coherence));
2064 else if constexpr(N == 4)
2067 dst_wave_buffer_resource,
2068 dst_thread_addr_offset,
2069 dst_wave_addr_offset,
2070 static_cast<index_t>(coherence));
2072 else if constexpr(N == 8)
2075 src_thread_data.template get_as<fp32x4_t>()[
number<0>{}],
2076 dst_wave_buffer_resource,
2077 dst_thread_addr_offset,
2078 dst_wave_addr_offset,
2079 static_cast<index_t>(coherence));
2081 src_thread_data.template get_as<fp32x4_t>()[
number<1>{}],
2082 dst_wave_buffer_resource,
2083 dst_thread_addr_offset,
2084 dst_wave_addr_offset + 4 *
sizeof(
float),
2085 static_cast<index_t>(coherence));
2088 else if constexpr(std::is_same<T, fp16_t>::value)
2090 if constexpr(N == 1)
2093 dst_wave_buffer_resource,
2094 dst_thread_addr_offset,
2095 dst_wave_addr_offset,
2096 static_cast<index_t>(coherence));
2098 else if constexpr(N == 2)
2101 dst_wave_buffer_resource,
2102 dst_thread_addr_offset,
2103 dst_wave_addr_offset,
2104 static_cast<index_t>(coherence));
2106 else if constexpr(N == 4)
2109 dst_wave_buffer_resource,
2110 dst_thread_addr_offset,
2111 dst_wave_addr_offset,
2112 static_cast<index_t>(coherence));
2114 else if constexpr(N == 8)
2120 dst_wave_buffer_resource,
2121 dst_thread_addr_offset,
2122 dst_wave_addr_offset,
2123 static_cast<index_t>(coherence));
2126 dst_wave_buffer_resource,
2127 dst_thread_addr_offset,
2128 dst_wave_addr_offset + 4 *
sizeof(
fp16_t),
2129 static_cast<index_t>(coherence));
2132 dst_wave_buffer_resource,
2133 dst_thread_addr_offset,
2134 dst_wave_addr_offset,
2135 static_cast<index_t>(coherence));
2139 else if constexpr(std::is_same<T, bf16_t>::value)
2141 if constexpr(N == 1)
2144 dst_wave_buffer_resource,
2145 dst_thread_addr_offset,
2146 dst_wave_addr_offset,
2147 static_cast<index_t>(coherence));
2149 else if constexpr(N == 2)
2152 dst_wave_buffer_resource,
2153 dst_thread_addr_offset,
2154 dst_wave_addr_offset,
2155 static_cast<index_t>(coherence));
2157 else if constexpr(N == 4)
2160 dst_wave_buffer_resource,
2161 dst_thread_addr_offset,
2162 dst_wave_addr_offset,
2163 static_cast<index_t>(coherence));
2165 else if constexpr(N == 8)
2168 src_thread_data.template get_as<int16x4_t>()[
number<0>{}],
2169 dst_wave_buffer_resource,
2170 dst_thread_addr_offset,
2171 dst_wave_addr_offset,
2172 static_cast<index_t>(coherence));
2175 src_thread_data.template get_as<int16x4_t>()[
number<1>{}],
2176 dst_wave_buffer_resource,
2177 dst_thread_addr_offset,
2178 dst_wave_addr_offset + 4 *
sizeof(
bf16_t),
2179 static_cast<index_t>(coherence));
2182 else if constexpr(std::is_same<T, uint16_t>::value)
2184 if constexpr(N == 1)
2187 dst_wave_buffer_resource,
2188 dst_thread_addr_offset,
2189 dst_wave_addr_offset,
2190 static_cast<index_t>(coherence));
2192 else if constexpr(N == 2)
2195 dst_wave_buffer_resource,
2196 dst_thread_addr_offset,
2197 dst_wave_addr_offset,
2198 static_cast<index_t>(coherence));
2200 else if constexpr(N == 4)
2203 dst_wave_buffer_resource,
2204 dst_thread_addr_offset,
2205 dst_wave_addr_offset,
2206 static_cast<index_t>(coherence));
2208 else if constexpr(N == 8)
2211 src_thread_data.template get_as<uint16x4_t>()[
number<0>{}],
2212 dst_wave_buffer_resource,
2213 dst_thread_addr_offset,
2214 dst_wave_addr_offset,
2215 static_cast<index_t>(coherence));
2218 src_thread_data.template get_as<uint16x4_t>()[
number<1>{}],
2219 dst_wave_buffer_resource,
2220 dst_thread_addr_offset,
2221 dst_wave_addr_offset + 4 *
sizeof(
uint16_t),
2222 static_cast<index_t>(coherence));
2230 dst_wave_buffer_resource,
2231 dst_thread_addr_offset,
2232 dst_wave_addr_offset);
2236template <
typename T,
2239 bool oob_conditional_check =
true>
2242 index_t dst_thread_addr_offset,
2244 index_t dst_linear_addr_offset,
2247 constexpr index_t bytes =
sizeof(T) * N;
2248 static_assert(bytes == 1 || bytes == 2 || bytes == 4 || bytes == 8 || bytes == 16,
2249 "wrong! not supported by buffer_store instruction");
2252 if constexpr(oob_conditional_check)
2255 dst_wave_buffer_resource,
2256 dst_thread_addr_offset,
2257 dst_wave_addr_offset,
2258 dst_linear_addr_offset,
2264 dst_wave_buffer_resource,
2265 dst_thread_addr_offset,
2266 dst_wave_addr_offset,
2267 dst_linear_addr_offset);
2271template <
typename T, index_t N>
2274 index_t dst_thread_addr_offset,
2277 static_assert((std::is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
2278 (std::is_same<T, fp16_t>::value && (N == 2 || N == 4 || N == 8)) ||
2279 (std::is_same<T, bf16_t>::value && (N == 2 || N == 4 || N == 8)) ||
2280 (std::is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
2281 "wrong! not implemented");
2283 if constexpr(std::is_same<T, float>::value)
2285 if constexpr(N == 1)
2288 dst_wave_buffer_resource,
2289 dst_thread_addr_offset,
2290 dst_wave_addr_offset,
2293 else if constexpr(N == 2)
2296 src_thread_data.template get_as<float>()[
number<0>{}],
2297 dst_wave_buffer_resource,
2298 dst_thread_addr_offset,
2299 dst_wave_addr_offset,
2303 src_thread_data.template get_as<float>()[
number<1>{}],
2304 dst_wave_buffer_resource,
2305 dst_thread_addr_offset,
2306 dst_wave_addr_offset +
sizeof(
float),
2309 else if constexpr(N == 4)
2312 src_thread_data.template get_as<float>()[
number<0>{}],
2313 dst_wave_buffer_resource,
2314 dst_thread_addr_offset,
2315 dst_wave_addr_offset,
2319 src_thread_data.template get_as<float>()[
number<1>{}],
2320 dst_wave_buffer_resource,
2321 dst_thread_addr_offset,
2322 dst_wave_addr_offset +
sizeof(
float),
2326 src_thread_data.template get_as<float>()[
number<2>{}],
2327 dst_wave_buffer_resource,
2328 dst_thread_addr_offset,
2329 dst_wave_addr_offset + 2 *
sizeof(
float),
2333 src_thread_data.template get_as<float>()[
number<3>{}],
2334 dst_wave_buffer_resource,
2335 dst_thread_addr_offset,
2336 dst_wave_addr_offset + 3 *
sizeof(
float),
2340 else if constexpr(std::is_same<T, fp16_t>::value)
2342 if constexpr(N == 2)
2345 dst_wave_buffer_resource,
2346 dst_thread_addr_offset,
2347 dst_wave_addr_offset,
2350 else if constexpr(N == 4)
2354 src_thread_data.template get_as<fp16x2_t>()[i],
2355 dst_wave_buffer_resource,
2356 dst_thread_addr_offset,
2357 dst_wave_addr_offset + i *
sizeof(
fp16x2_t),
2361 else if constexpr(N == 8)
2365 src_thread_data.template get_as<fp16x2_t>()[i],
2366 dst_wave_buffer_resource,
2367 dst_thread_addr_offset,
2368 dst_wave_addr_offset + i *
sizeof(
fp16x2_t),
2373 else if constexpr(std::is_same<T, bf16_t>::value)
2375 if constexpr(N == 2)
2378 dst_wave_buffer_resource,
2379 dst_thread_addr_offset,
2380 dst_wave_addr_offset,
2383 else if constexpr(N == 4)
2387 src_thread_data.template get_as<bf16x2_t>()[i],
2388 dst_wave_buffer_resource,
2389 dst_thread_addr_offset,
2390 dst_wave_addr_offset + i *
sizeof(
bf16x2_t),
2394 else if constexpr(N == 8)
2398 src_thread_data.template get_as<bf16x2_t>()[i],
2399 dst_wave_buffer_resource,
2400 dst_thread_addr_offset,
2401 dst_wave_addr_offset + i *
sizeof(
bf16x2_t),
2406 else if constexpr(std::is_same<T, int32_t>::value)
2408 if constexpr(N == 1)
2411 dst_wave_buffer_resource,
2412 dst_thread_addr_offset,
2413 dst_wave_addr_offset,
2416 else if constexpr(N == 2)
2419 src_thread_data.template get_as<int32_t>()[
number<0>{}],
2420 dst_wave_buffer_resource,
2421 dst_thread_addr_offset,
2422 dst_wave_addr_offset,
2426 src_thread_data.template get_as<int32_t>()[
number<1>{}],
2427 dst_wave_buffer_resource,
2428 dst_thread_addr_offset,
2429 dst_wave_addr_offset +
sizeof(
int32_t),
2432 else if constexpr(N == 4)
2435 src_thread_data.template get_as<int32_t>()[
number<0>{}],
2436 dst_wave_buffer_resource,
2437 dst_thread_addr_offset,
2438 dst_wave_addr_offset,
2442 src_thread_data.template get_as<int32_t>()[
number<1>{}],
2443 dst_wave_buffer_resource,
2444 dst_thread_addr_offset,
2445 dst_wave_addr_offset +
sizeof(
int32_t),
2449 src_thread_data.template get_as<int32_t>()[
number<2>{}],
2450 dst_wave_buffer_resource,
2451 dst_thread_addr_offset,
2452 dst_wave_addr_offset + 2 *
sizeof(
int32_t),
2456 src_thread_data.template get_as<int32_t>()[
number<3>{}],
2457 dst_wave_buffer_resource,
2458 dst_thread_addr_offset,
2459 dst_wave_addr_offset + 3 *
sizeof(
int32_t),
2465template <
typename T, index_t N>
2468 index_t dst_thread_addr_offset,
2471 static_assert((std::is_same<T, double>::value && (N == 1 || N == 2 || N == 4)),
2472 "wrong! not implemented");
2473 if constexpr(std::is_same<T, double>::value)
2475 if constexpr(N == 1)
2478 dst_wave_buffer_resource,
2479 dst_thread_addr_offset,
2480 dst_wave_addr_offset,
2483 else if constexpr(N == 2)
2486 src_thread_data.template get_as<double>()[
number<0>{}],
2487 dst_wave_buffer_resource,
2488 dst_thread_addr_offset,
2489 dst_wave_addr_offset,
2493 src_thread_data.template get_as<double>()[
number<1>{}],
2494 dst_wave_buffer_resource,
2495 dst_thread_addr_offset,
2496 dst_wave_addr_offset +
sizeof(
double),
2499 else if constexpr(N == 4)
2502 src_thread_data.template get_as<double>()[
number<0>{}],
2503 dst_wave_buffer_resource,
2504 dst_thread_addr_offset,
2505 dst_wave_addr_offset,
2509 src_thread_data.template get_as<double>()[
number<1>{}],
2510 dst_wave_buffer_resource,
2511 dst_thread_addr_offset,
2512 dst_wave_addr_offset +
sizeof(
double),
2516 src_thread_data.template get_as<double>()[
number<2>{}],
2517 dst_wave_buffer_resource,
2518 dst_thread_addr_offset,
2519 dst_wave_addr_offset + 2 *
sizeof(
double),
2523 src_thread_data.template get_as<double>()[
number<3>{}],
2524 dst_wave_buffer_resource,
2525 dst_thread_addr_offset,
2526 dst_wave_addr_offset + 3 *
sizeof(
double),
2537template <
typename T,
2540 bool oob_conditional_check =
true>
2543 index_t src_thread_element_offset,
2544 bool src_thread_element_valid,
2545 index_t src_element_space_size)
2547 const int32x4_t src_wave_buffer_resource =
2550 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2552#if CK_TILE_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
2554 if constexpr(oob_conditional_check)
2555 return src_thread_element_valid ? 0 : 0x80000000;
2560 src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
2564 if constexpr(oob_conditional_check)
2575template <
typename T,
2578 bool oob_conditional_check =
true>
2581 index_t src_thread_element_offset,
2582 bool src_thread_element_valid,
2583 index_t src_element_space_size,
2586 const int32x4_t src_wave_buffer_resource =
2589 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2594 if constexpr(oob_conditional_check)
2600template <
typename T,
2603 bool oob_conditional_check =
true,
2604 bool pre_nop =
false>
2606 const T* p_src_wave,
2607 index_t src_thread_element_offset,
2608 index_t src_linear_element_offset,
2609 index_t src_element_space_size,
2613 const int32x4_t src_wave_buffer_resource =
2616 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2617 index_t src_linear_addr_offset = src_linear_element_offset *
sizeof(T);
2621 src_wave_buffer_resource,
2622 src_thread_addr_offset,
2624 src_linear_addr_offset,
2630template <
typename T,
2633 bool oob_conditional_check =
true,
2634 bool pre_nop =
false>
2636 const int32x4_t src_wave_buffer_resource,
2637 index_t src_thread_element_offset,
2638 index_t src_linear_element_offset,
2642 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2643 index_t src_linear_addr_offset = src_linear_element_offset *
sizeof(T);
2647 src_wave_buffer_resource,
2648 src_thread_addr_offset,
2650 src_linear_addr_offset,
2659template <
typename T,
2662 bool pre_nop =
false>
2664 const T* p_src_wave,
2665 index_t src_thread_element_offset,
2666 index_t src_linear_element_offset,
2667 index_t src_element_space_size,
2670 const int32x4_t src_wave_buffer_resource =
2673 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2674 index_t src_linear_addr_offset = src_linear_element_offset *
sizeof(T);
2677 src_wave_buffer_resource,
2678 src_thread_addr_offset,
2680 src_linear_addr_offset,
2685template <
typename T,
2688 bool pre_nop =
false>
2690 const int32x4_t src_wave_buffer_resource,
2691 index_t src_thread_element_offset,
2692 index_t src_linear_element_offset,
2695 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2696 index_t src_linear_addr_offset = src_linear_element_offset *
sizeof(T);
2699 src_wave_buffer_resource,
2700 src_thread_addr_offset,
2702 src_linear_addr_offset,
2707template <
typename T,
2710 bool oob_conditional_check =
false>
2712 const int32x4_t src_wave_buffer_resource,
2713 index_t src_thread_element_offset,
2714 index_t src_linear_element_offset,
2715 bool is_valid_element,
2718 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
2719 index_t src_linear_addr_offset = src_linear_element_offset *
sizeof(T);
2722 src_wave_buffer_resource,
2723 src_thread_addr_offset,
2725 src_linear_addr_offset,
2734template <
typename T,
2737 bool oob_conditional_check =
true>
2740 const index_t dst_thread_element_offset,
2741 const bool dst_thread_element_valid,
2742 const index_t dst_element_space_size)
2744 const int32x4_t dst_wave_buffer_resource =
2747 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
2749#if CK_TILE_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
2751 if constexpr(oob_conditional_check)
2752 return dst_thread_element_valid ? 0 : 0x80000000;
2757 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
2759 if constexpr(oob_conditional_check)
2761 if(dst_thread_element_valid)
2764 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
2770 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
2775template <
typename T,
2778 bool oob_conditional_check =
true>
2781 const index_t dst_thread_element_offset,
2782 const index_t dst_linear_element_offset,
2783 const bool dst_thread_element_valid,
2784 const index_t dst_element_space_size)
2786 const int32x4_t dst_wave_buffer_resource =
2789 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
2790 index_t dst_linear_addr_offset = dst_linear_element_offset *
sizeof(T);
2793 dst_wave_buffer_resource,
2794 dst_thread_addr_offset,
2796 dst_linear_addr_offset,
2797 dst_thread_element_valid);
2804template <
typename T, index_t N>
2807 const index_t dst_thread_element_offset,
2808 const bool dst_thread_element_valid,
2809 const index_t dst_element_space_size)
2811 const int32x4_t dst_wave_buffer_resource =
2814 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
2816#if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
2817 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
2820 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
2822 if(dst_thread_element_valid)
2825 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
2830template <
typename T,
2833 bool oob_conditional_check =
true,
2834 bool pre_nop =
false>
2837 const index_t dst_thread_element_offset,
2838 const index_t dst_linear_element_offset,
2839 const bool dst_thread_element_valid,
2840 const index_t dst_element_space_size,
2843 const int32x4_t dst_wave_buffer_resource =
2846 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
2847 index_t dst_linear_addr_offset = dst_linear_element_offset *
sizeof(T);
2849 if constexpr(oob_conditional_check)
2851 buffer_atomic_add_if<T, N, pre_nop>{}(src_thread_data,
2852 dst_wave_buffer_resource,
2853 dst_thread_addr_offset,
2855 dst_linear_addr_offset,
2856 dst_thread_element_valid);
2861 dst_wave_buffer_resource,
2862 dst_thread_addr_offset,
2864 dst_linear_addr_offset,
2873template <
typename T, index_t N>
2876 const index_t dst_thread_element_offset,
2877 const bool dst_thread_element_valid,
2878 const index_t dst_element_space_size)
2880 const int32x4_t dst_wave_buffer_resource =
2883 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
2885#if CK_TILE_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
2886 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
2889 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
2891 if(dst_thread_element_valid)
2894 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
2899#if defined(__gfx950__)
2900template <
typename T, index_t N>
2901__device__
auto amd_transpose_load_to_vgpr(
const T* __restrict__ in_ptr)
2903#define __LDS_ADDR __attribute__((address_space(3)))
2905 static_assert(__has_builtin(__builtin_amdgcn_raw_buffer_load_b32),
2906 "We need to have the compatible compiler version to build this instruction");
2908#pragma clang diagnostic push
2909#pragma clang diagnostic ignored "-Wold-style-cast"
2911 const auto in_ptr_ = (__LDS_ADDR T*)(
const_cast<T*
>(in_ptr));
2912#pragma clang diagnostic pop
2915 typedef __attribute__((__vector_size__(4 *
sizeof(__fp16)))) __fp16 llvm_fp16x4_t;
2916 auto lds_ptr =
reinterpret_cast<__LDS_ADDR llvm_fp16x4_t*
>(in_ptr_);
2921 typedef __attribute__((__vector_size__(4 *
sizeof(__bf16)))) __bf16 llvm_bf16x4_t;
2922 auto lds_ptr =
reinterpret_cast<__LDS_ADDR llvm_bf16x4_t*
>(in_ptr_);
2925 else if constexpr(std::is_same_v<remove_cvref_t<T>,
ck_tile::fp8_t> ||
2929 typedef __attribute__((__vector_size__(2 *
sizeof(
index_t))))
index_t llvm_i32x2_t;
2930 auto lds_ptr =
reinterpret_cast<__LDS_ADDR llvm_i32x2_t*
>(in_ptr_);
2935 static_assert(
false,
"not implemented");
#define CK_TILE_DEVICE
Definition config.hpp:41
#define CK_TILE_BUFFER_RESOURCE_3RD_DWORD
Definition config.hpp:210
#define CK_TILE_DEVICE_EXTERN
Definition config.hpp:43
#define CK_TILE_LDS_ADDR
Definition config.hpp:58
Definition tile/core/arch/amd_buffer_addressing.hpp:110
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 3 >(array< float, 3 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:997
CK_TILE_DEVICE void insert_dummy_dep()
Definition tile/core/arch/amd_buffer_addressing.hpp:1037
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 32 >(array< float, 32 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:1025
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 8 >(array< float, 8 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:1009
CK_TILE_DEVICE void insert_dummy_dep_per_dword(array< float, N > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:981
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 2 >(array< float, 2 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:991
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 4 >(array< float, 4 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:1003
CK_TILE_DEVICE void insert_dummy_dep_per_dword< 16 >(array< float, 16 > &b)
Definition tile/core/arch/amd_buffer_addressing.hpp:1016
Definition tile/core/algorithm/cluster_descriptor.hpp:13
CK_TILE_DEVICE_EXTERN int8x2_t llvm_amdgcn_raw_buffer_load_i8x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i8")
_Float16 fp16x2_t
Definition half.hpp:385
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:1535
remove_cv_t< std::remove_reference_t< T > > remove_cvref_t
Definition type_traits.hpp:21
int8_t int8x2_t
Definition pk_int4.hpp:103
CK_TILE_DEVICE void amd_buffer_atomic_max(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2874
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16(_Float16 vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f16")
__device__ uint32_t amd_wave_read_first_lane(uint16_t v)
Definition tile/core/arch/amd_buffer_addressing.hpp:35
_Float16 half_t
Definition half.hpp:111
CK_TILE_DEVICE_EXTERN fp16x4_t llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16")
uint16_t uint16x2_t
Definition vector_type.hpp:181
int16_t int16x4_t
Definition vector_type.hpp:173
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8")
CK_TILE_DEVICE_EXTERN int8_t llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i8")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8(int8_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i8")
constant< b > bool_constant
Definition tile/core/numeric/integral_constant.hpp:43
int8_t int8_t
Definition int8.hpp:20
CK_TILE_DEVICE void amd_buffer_store(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2738
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x2(int16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
bfloat16_t bf16_t
Definition bfloat16.hpp:113
CK_TILE_DEVICE auto async_load_fence_raw(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:1068
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16x4(int16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
_Float16 fp16_t
Definition half.hpp:110
amd_buffer_coherence_enum
Definition tile/core/arch/amd_buffer_addressing.hpp:1404
@ glc_slc
Definition tile/core/arch/amd_buffer_addressing.hpp:1408
@ SYSTEM_NT1
Definition tile/core/arch/amd_buffer_addressing.hpp:1419
@ coherence_default
Definition tile/core/arch/amd_buffer_addressing.hpp:1405
@ WAVE_NT0
Definition tile/core/arch/amd_buffer_addressing.hpp:1412
@ slc
Definition tile/core/arch/amd_buffer_addressing.hpp:1407
@ DEVICE_NT1
Definition tile/core/arch/amd_buffer_addressing.hpp:1417
@ SYSTEM_NT0
Definition tile/core/arch/amd_buffer_addressing.hpp:1418
@ glc
Definition tile/core/arch/amd_buffer_addressing.hpp:1406
@ GROUP_NT1
Definition tile/core/arch/amd_buffer_addressing.hpp:1415
@ DEVICE_NT0
Definition tile/core/arch/amd_buffer_addressing.hpp:1416
@ GROUP_NT0
Definition tile/core/arch/amd_buffer_addressing.hpp:1414
@ WAVE_NT1
Definition tile/core/arch/amd_buffer_addressing.hpp:1413
CK_TILE_DEVICE_EXTERN double llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_atomic_add_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
_BitInt(8) fp8_t
Definition float8.hpp:204
CK_TILE_DEVICE_EXTERN fp32x2_t llvm_amdgcn_raw_buffer_load_fp32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f32")
CK_TILE_DEVICE void amd_async_buffer_load_with_oob(CK_TILE_LDS_ADDR T *smem, const int32x4_t src_wave_buffer_resource, index_t src_thread_element_offset, index_t src_linear_element_offset, bool is_valid_element, bool_constant< oob_conditional_check >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:2711
tuple_array< T, N > thread_buffer
Definition thread_buffer.hpp:14
int32_t int32x4_t
Definition vector_type.hpp:155
CK_TILE_DEVICE void amd_async_buffer_load_with_oob_raw(T *smem, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:2663
bfloat16_t bf16x2_t
Definition pk_fp4.hpp:24
CK_TILE_HOST_DEVICE constexpr Y bit_cast(const X &x)
Definition bit_cast.hpp:11
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x2(fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x4(fp32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp16x4(fp16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16")
CK_TILE_DEVICE_EXTERN int32x4_t llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32")
CK_TILE_DEVICE void lds_load_fence(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:820
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i8x2(int8x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i8")
uint32_t uint32x4_t
Definition vector_type.hpp:164
CK_TILE_DEVICE void amd_buffer_store_impl_with_bytes(const thread_buffer< int8_t, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:1926
CK_TILE_DEVICE_EXTERN bf16x2_t llvm_amdgcn_raw_buffer_atomic_add_bf16x2(bf16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2bf16")
CK_TILE_DEVICE void async_buffer_load_dwordxn_v(void *smem, int32x4_t rsrc, index_t voffset, index_t, index_t ioffset, index_t=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:1352
_Float16 fp16x4_t
Definition vector_type.hpp:137
CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i32")
CK_TILE_DEVICE void amd_buffer_atomic_max_impl(const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:2466
CK_TILE_DEVICE void amd_buffer_atomic_add_raw(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:2835
CK_TILE_DEVICE_EXTERN int32x2_t llvm_amdgcn_raw_buffer_load_i32x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i32")
constant< v > number
Definition tile/core/numeric/integral_constant.hpp:37
CK_TILE_DEVICE void amd_buffer_atomic_add_impl(const thread_buffer< T, N > &src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:2272
constexpr detail::ignore_t ignore
Definition tile/core/utility/ignore.hpp:20
CK_TILE_DEVICE void buffer_store_fence(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:1063
CK_TILE_DEVICE_EXTERN float llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f32")
CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(fp16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
bfloat16_t bf16x4_t
Definition vector_type.hpp:146
int32_t int32_t
Definition integer.hpp:10
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32")
CK_TILE_DEVICE void amd_buffer_load_raw(thread_buffer< T, N > &dst, const T *p_src_wave, index_t src_thread_element_offset, index_t src_linear_element_offset, index_t src_element_space_size, index_t is_valid_element=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:2605
bfloat16_t bf16x8_t
Definition vector_type.hpp:147
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_zero(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2542
CK_TILE_DEVICE void async_buffer_load_fence(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:1393
CK_TILE_DEVICE void amd_buffer_store_raw(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const index_t dst_linear_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2779
CK_TILE_DEVICE_EXTERN fp32x4_t llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32")
CK_TILE_DEVICE void buffer_load_fence(index_t cnt=0)
Definition tile/core/arch/amd_buffer_addressing.hpp:815
typename impl::ext_vector< T, N >::type ext_vector_t
Definition vector_type.hpp:84
unsigned _BitInt(8) bf8_t
Definition float8.hpp:206
CK_TILE_DEVICE_EXTERN int16_t llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.i16")
CK_TILE_DEVICE_EXTERN int16x4_t llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16")
CK_TILE_DEVICE void amd_buffer_store_impl(const thread_buffer< T, N > src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:2026
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32x2(fp32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2f32")
float fp32x4_t
Definition vector_type.hpp:128
CK_TILE_DEVICE int32x4_t make_wave_buffer_resource(const void *ptr, uint32_t size=0xffffffff, ForceSGPR={})
Definition tile/core/arch/amd_buffer_addressing.hpp:97
CK_TILE_DEVICE void amd_buffer_atomic_add(const thread_buffer< T, N > &src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition tile/core/arch/amd_buffer_addressing.hpp:2805
uint16_t uint16x4_t
Definition vector_type.hpp:182
float fp32x2_t
Definition pk_fp4.hpp:22
int8_t int8x4_t
Definition vector_type.hpp:191
CK_TILE_DEVICE_EXTERN int16x2_t llvm_amdgcn_raw_buffer_load_i16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2i16")
CK_TILE_DEVICE void amd_async_buffer_load(CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, index_t flag=0, bool_constant< oob_conditional_check >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:1882
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x2(uint16x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i16")
int32_t index_t
Definition integer.hpp:9
CK_TILE_DEVICE thread_buffer< T, N > amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition tile/core/arch/amd_buffer_addressing.hpp:2580
int32_t int32x2_t
Definition vector_type.hpp:154
CK_TILE_DEVICE_EXTERN fp16x2_t llvm_amdgcn_raw_buffer_load_fp16x2(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v2f16")
CK_TILE_DEVICE_EXTERN _Float16 llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.f16")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16x4(uint16x4_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16")
CK_TILE_DEVICE thread_buffer< int8_t, N > amd_buffer_load_impl_with_bytes(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:1425
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, as3_uint32_ptr lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
CK_TILE_DEVICE void amd_async_buffer_load_impl(CK_TILE_LDS_ADDR T *smem, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_immediate_addr_offset=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:1857
CK_TILE_DEVICE_EXTERN int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i16(int16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
int16_t int16x2_t
Definition vector_type.hpp:172
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i32")
CK_TILE_DEVICE void amd_buffer_store_raw_impl(const thread_buffer< T, N > &dst_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset, index_t dst_linear_addr_offset, index_t is_valid_element=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:2240
uint32_t uint32x2_t
Definition vector_type.hpp:163
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_i32x2(int32x2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v2i32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.f32")
CK_TILE_DEVICE_EXTERN void llvm_amdgcn_raw_buffer_store_ui16(uint16_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.i16")
CK_TILE_DEVICE_EXTERN int8x4_t llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8")
CK_TILE_DEVICE void amd_buffer_load_raw_impl(thread_buffer< T, N > &dst, int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset, index_t src_linear_addr_offset, index_t flag=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:1818
const GenericPointer< typename T::ValueType > T2 value
Definition pointer.h:1697
signed short int16_t
Definition stdint.h:122
unsigned short uint16_t
Definition stdint.h:125
unsigned int uint32_t
Definition stdint.h:126
unsigned char uint8_t
Definition stdint.h:124
signed char int8_t
Definition stdint.h:121
A fixed-size array container similar to std::array with additional utilities.
Definition tile/core/container/array.hpp:43
CK_TILE_HOST_DEVICE constexpr auto & get()
Definition tile/core/container/array.hpp:101
CK_TILE_DEVICE void operator()(const T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t)
Definition tile/core/arch/amd_buffer_addressing.hpp:863
CK_TILE_DEVICE void operator()(const T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t flag=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:832
Definition tile/core/arch/amd_buffer_addressing.hpp:826
Definition tile/core/arch/amd_buffer_addressing.hpp:857
CK_TILE_DEVICE void operator()(T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:166
CK_TILE_DEVICE void operator()(T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:304
CK_TILE_DEVICE void operator()(T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:269
CK_TILE_DEVICE void operator()(T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:234
CK_TILE_DEVICE void operator()(T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:200
CK_TILE_DEVICE void operator()(T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t flag=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:359
CK_TILE_DEVICE void operator()(T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t flag=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:492
CK_TILE_DEVICE void operator()(T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t flag=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:459
CK_TILE_DEVICE void operator()(T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t flag=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:426
CK_TILE_DEVICE void operator()(T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t flag=0, bool_constant< pre_nop >={})
Definition tile/core/arch/amd_buffer_addressing.hpp:393
Definition tile/core/arch/amd_buffer_addressing.hpp:134
Definition tile/core/arch/amd_buffer_addressing.hpp:131
Definition tile/core/arch/amd_buffer_addressing.hpp:90
const void * ptr
Definition tile/core/arch/amd_buffer_addressing.hpp:91
uint32_t range
Definition tile/core/arch/amd_buffer_addressing.hpp:92
uint32_t config
Definition tile/core/arch/amd_buffer_addressing.hpp:93
CK_TILE_DEVICE void operator()(const T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:528
CK_TILE_DEVICE void operator()(const T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:632
CK_TILE_DEVICE void operator()(const T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:606
CK_TILE_DEVICE void operator()(const T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:580
CK_TILE_DEVICE void operator()(const T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:554
CK_TILE_DEVICE void operator()(const T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t flag=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:677
CK_TILE_DEVICE void operator()(const T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t flag=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:790
CK_TILE_DEVICE void operator()(const T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t flag=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:762
CK_TILE_DEVICE void operator()(const T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t flag=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:734
CK_TILE_DEVICE void operator()(const T &value, int32x4_t res, index_t v_offset, index_t, index_t i_offset, index_t flag=1)
Definition tile/core/arch/amd_buffer_addressing.hpp:705
Definition tile/core/arch/amd_buffer_addressing.hpp:140
Definition tile/core/arch/amd_buffer_addressing.hpp:137
fp32x4_t payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:115
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:119
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:118
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:117
fp32x2_t payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:116
Definition tile/core/arch/amd_buffer_addressing.hpp:113
fp32x4_t payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:884
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:888
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:887
float payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:886
fp32x2_t payload_t
Definition tile/core/arch/amd_buffer_addressing.hpp:885
Definition tile/core/arch/amd_buffer_addressing.hpp:882
static CK_TILE_HOST_DEVICE constexpr T zero()
Definition tile/core/numeric/numeric.hpp:58
Definition coordinate_transform.hpp:1392
CK_TILE_DEVICE void operator()(T &value, index_t v_offset, index_t i_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:901
CK_TILE_DEVICE void operator()(T &value, index_t v_offset, index_t i_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:961
CK_TILE_DEVICE void operator()(T &value, index_t v_offset, index_t i_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:946
CK_TILE_DEVICE void operator()(T &value, index_t v_offset, index_t i_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:931
CK_TILE_DEVICE void operator()(T &value, index_t v_offset, index_t i_offset)
Definition tile/core/arch/amd_buffer_addressing.hpp:916
Definition tile/core/arch/amd_buffer_addressing.hpp:895
Definition tile/core/utility/functional.hpp:43
Definition tile/core/utility/debug.hpp:67
uint32_t * as3_uint32_ptr
Definition tile/core/arch/amd_buffer_addressing.hpp:29
#define CK_TILE_ASYNC_LOAD_WITH_INSTR(instr)
#define LIKELY(x)
Definition tile/core/arch/amd_buffer_addressing.hpp:26