Skip to content

Commit 829afc4

Browse files
authored Feb 19, 2026
[AMDGPU] Add WMMA and SWMMAC instructions for gfx1170 (#180731)
Introduce two new subtarget features: WMMA256bInsts for GFX11 WMMA instructions, and WMMA128bInsts for GFX1170 and GFX12 WMMA and SWMMAC instructions. Some WMMA instructions have changed from GFX 11.0 to GFX 11.7, so new Real versions were added with a "_gfx1170" suffix. For consistency, all WMMA and SWMMAC GFX11.7 instructions use this suffix. To resolve decoding issues between different formats for some WMMA instructions between GFX 11 and GFX 11.7, new decoding tables were added.
1 parent df1eec7 commit 829afc4

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+10512
-2861
lines changed
 

‎clang/include/clang/Basic/BuiltinsAMDGPU.td‎

Lines changed: 76 additions & 72 deletions
Large diffs are not rendered by default.

‎clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl‎

Lines changed: 71 additions & 70 deletions
Large diffs are not rendered by default.

‎clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl‎

Lines changed: 71 additions & 70 deletions
Large diffs are not rendered by default.

‎clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl‎

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,14 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b
2121
global v16s* out16s, v2i a2i, v2i b2i, v16s c16s,
2222
global v8i* out8i, v4i a4i, v4i b4i, v8i c8i)
2323
{
24-
*out8f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a16h, b16h, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w32' needs target feature gfx11-insts,wavefrontsize32}}
25-
*out8f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a16s, b16s, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32' needs target feature gfx11-insts,wavefrontsize32}}
26-
*out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w32' needs target feature gfx11-insts,wavefrontsize32}}
27-
*out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32' needs target feature gfx11-insts,wavefrontsize32}}
28-
*out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32' needs target feature gfx11-insts,wavefrontsize32}}
29-
*out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32' needs target feature gfx11-insts,wavefrontsize32}}
30-
*out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a4i, true, b4i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32' needs target feature gfx11-insts,wavefrontsize32}}
31-
*out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32(true, a2i, true, b2i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32' needs target feature gfx11-insts,wavefrontsize32}}
24+
*out8f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32(a16h, b16h, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
25+
*out8f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32(a16s, b16s, c8f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
26+
*out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
27+
*out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
28+
*out16h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32(a16h, b16h, c16h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
29+
*out16s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(a16s, b16s, c16s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
30+
*out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32(true, a4i, true, b4i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
31+
*out8i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32(true, a2i, true, b2i, c8i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32' needs target feature wmma-256b-insts,wavefrontsize32}}
3232
}
3333

3434
#endif

‎clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64-gfx10-err.cl‎

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -21,14 +21,14 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out4f, v16h a16h, v16h b
2121
global v8s* out8s, v4i a4i, v4i b4i, v8s c8s,
2222
global v4i* out4i, v2i a2i, v2i b2i, v4i c4i)
2323
{
24-
*out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target feature gfx11-insts,wavefrontsize64}}
25-
*out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target feature gfx11-insts,wavefrontsize64}}
26-
*out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target feature gfx11-insts,wavefrontsize64}}
27-
*out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target feature gfx11-insts,wavefrontsize64}}
28-
*out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' needs target feature gfx11-insts,wavefrontsize64}}
29-
*out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' needs target feature gfx11-insts,wavefrontsize64}}
30-
*out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' needs target feature gfx11-insts,wavefrontsize64}}
31-
*out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' needs target feature gfx11-insts,wavefrontsize64}}
24+
*out4f = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64(a16h, b16h, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_f16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
25+
*out4f = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64(a16s, b16s, c4f); // expected-error{{'__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
26+
*out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
27+
*out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
28+
*out8h = __builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64(a16h, b16h, c8h, true); // expected-error{{'__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
29+
*out8s = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(a16s, b16s, c8s, true); // expected-error{{'__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
30+
*out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64(true, a4i, true, b4i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
31+
*out4i = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64(true, a2i, true, b2i, c4i, false); // expected-error{{'__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64' needs target feature wmma-256b-insts,wavefrontsize64}}
3232
}
3333

3434
#endif

‎llvm/lib/Target/AMDGPU/AMDGPU.td‎

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -775,6 +775,14 @@ defm CvtFP8VOP1Bug : AMDGPUSubtargetFeature<"cvt-fp8-vop1-bug",
775775
[FeatureFP8ConversionInsts]
776776
>;
777777

778+
defm WMMA256bInsts : AMDGPUSubtargetFeature<"wmma-256b-insts",
779+
"Has WMMA instructions where A and B matrices have duplicated data"
780+
>;
781+
782+
defm WMMA128bInsts : AMDGPUSubtargetFeature<"wmma-128b-insts",
783+
"Has WMMA instructions where A and B matrices do not have duplicated data"
784+
>;
785+
778786
defm PkFmacF16Inst : AMDGPUSubtargetFeature<"pk-fmac-f16-inst",
779787
"Has v_pk_fmac_f16 instruction"
780788
>;
@@ -1820,9 +1828,9 @@ def FeatureISAVersion11_Common : FeatureSet<
18201828
FeatureD16Writes32BitVgpr,
18211829
]>;
18221830

1823-
// There are few workarounds that need to be
1824-
// added to all targets. This pessimizes codegen
1825-
// a bit on the generic GFX11 target.
1831+
// There are a few workarounds that need to be added to all targets. This
1832+
// pessimizes codegen a bit on the generic GFX11 target. This generic target
1833+
// does not include GFX1170 due to incompatible changes.
18261834
def FeatureISAVersion11_Generic: FeatureSet<
18271835
!listconcat(FeatureISAVersion11_Common.Features,
18281836
[FeatureMSAALoadDstSelBug,
@@ -1831,14 +1839,16 @@ def FeatureISAVersion11_Generic: FeatureSet<
18311839
FeatureMADIntraFwdBug,
18321840
FeaturePrivEnabledTrap2NopBug,
18331841
FeatureRequiresCOV6,
1834-
FeatureRequiredExportPriority])>;
1842+
FeatureRequiredExportPriority,
1843+
FeatureWMMA256bInsts])>;
18351844

18361845
def FeatureISAVersion11_0_Common : FeatureSet<
18371846
!listconcat(FeatureISAVersion11_Common.Features,
18381847
[FeatureMSAALoadDstSelBug,
18391848
FeatureVALUTransUseHazard,
18401849
FeatureMADIntraFwdBug,
1841-
FeaturePrivEnabledTrap2NopBug])>;
1850+
FeaturePrivEnabledTrap2NopBug,
1851+
FeatureWMMA256bInsts])>;
18421852

18431853
def FeatureISAVersion11_0_0 : FeatureSet<
18441854
!listconcat(FeatureISAVersion11_0_Common.Features,
@@ -1861,7 +1871,8 @@ def FeatureISAVersion11_5_Common : FeatureSet<
18611871
!listconcat(FeatureISAVersion11_Common.Features,
18621872
[FeatureSALUFloatInsts,
18631873
FeatureDPPSrc1SGPR,
1864-
FeatureRequiredExportPriority])>;
1874+
FeatureRequiredExportPriority,
1875+
FeatureWMMA256bInsts])>;
18651876

18661877
def FeatureISAVersion11_5_0 : FeatureSet<
18671878
!listconcat(FeatureISAVersion11_5_Common.Features,
@@ -1885,7 +1896,8 @@ def FeatureISAVersion11_7_0 : FeatureSet<
18851896
[FeatureSALUFloatInsts,
18861897
FeatureDPPSrc1SGPR,
18871898
FeatureFP8ConversionInsts,
1888-
FeatureDot11Insts])>;
1899+
FeatureDot11Insts,
1900+
FeatureWMMA128bInsts])>;
18891901

18901902
def FeatureISAVersion12 : FeatureSet<
18911903
[FeatureGFX12,
@@ -1915,6 +1927,7 @@ def FeatureISAVersion12 : FeatureSet<
19151927
FeatureImageInsts,
19161928
FeatureExtendedImageInsts,
19171929
FeatureFP8ConversionInsts,
1930+
FeatureWMMA128bInsts,
19181931
FeatureIEEEMinimumMaximumInsts,
19191932
FeaturePackedTID,
19201933
FeatureVcmpxPermlaneHazard,

‎llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp‎

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1556,6 +1556,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser {
15561556
return AMDGPU::isGFX11Plus(getSTI());
15571557
}
15581558

1559+
bool isGFX1170() const { return AMDGPU::isGFX1170(getSTI()); }
1560+
15591561
bool isGFX12() const { return AMDGPU::isGFX12(getSTI()); }
15601562

15611563
bool isGFX12Plus() const { return AMDGPU::isGFX12Plus(getSTI()); }

‎llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp‎

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -686,11 +686,19 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
686686
Address, CS))
687687
break;
688688

689+
if (isGFX1170() &&
690+
tryDecodeInst(DecoderTableGFX117064, MI, QW, Address, CS))
691+
break;
692+
689693
if (isGFX11() &&
690694
tryDecodeInst(DecoderTableGFX1164, DecoderTableGFX11_FAKE1664, MI, QW,
691695
Address, CS))
692696
break;
693697

698+
if (isGFX1170() &&
699+
tryDecodeInst(DecoderTableGFX1170W6464, MI, QW, Address, CS))
700+
break;
701+
694702
if (isGFX11() &&
695703
tryDecodeInst(DecoderTableGFX11W6464, MI, QW, Address, CS))
696704
break;
@@ -2247,6 +2255,8 @@ bool AMDGPUDisassembler::isGFX11Plus() const {
22472255
return AMDGPU::isGFX11Plus(STI);
22482256
}
22492257

2258+
bool AMDGPUDisassembler::isGFX1170() const { return AMDGPU::isGFX1170(STI); }
2259+
22502260
bool AMDGPUDisassembler::isGFX12() const {
22512261
return STI.hasFeature(AMDGPU::FeatureGFX12);
22522262
}

‎llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,7 @@ class AMDGPUDisassembler : public MCDisassembler {
178178
bool isGFX10() const;
179179
bool isGFX10Plus() const;
180180
bool isGFX11() const;
181+
bool isGFX1170() const;
181182
bool isGFX11Plus() const;
182183
bool isGFX12() const;
183184
bool isGFX12Plus() const;

0 commit comments

Comments
 (0)
Please sign in to comment.