@@ -48,7 +48,7 @@ using device_grouped_conv_fwd_wmma_cshufflev3_scaleadd_ab_bf16_instances =
4848 // ########################################| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerWmma| _NWaveNPerWmma| | |
4949 // ########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
5050 // generic instance
51- DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<NDimSpatial, ALayout, BLayout, ck::Tuple<>, ELayout, ck::Tuple<BF16, BF16>, ck::Tuple<BF16, BF16>, F32, BF16, ck::Tuple<>, BF16, ScaleAdd, ScaleAdd, PassThrough, ConvSpec, GemmMNKPadding, 256 , 128 , 256 , 32 , 8 , 8 , 16 , 16 , 4 , 4 , S<4 , 64 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 8 , 8 , 0 , S<4 , 64 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 8 , 8 , 0 , 1 , 1 , S<1 , 32 , 1 , 8 >, 8 , BlockGemmPipelineScheduler::Interwave , BlockGemmPipelineVersion::v1>
51+ DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<NDimSpatial, ALayout, BLayout, ck::Tuple<>, ELayout, ck::Tuple<BF16, BF16>, ck::Tuple<BF16, BF16>, F32, BF16, ck::Tuple<>, BF16, ScaleAdd, ScaleAdd, PassThrough, ConvSpec, GemmMNKPadding, 256 , 128 , 256 , 32 , 8 , 8 , 16 , 16 , 4 , 4 , S<4 , 64 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 8 , 8 , 0 , S<4 , 64 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 8 , 8 , 0 , 1 , 1 , S<1 , 32 , 1 , 8 >, 8 , BlockGemmPipelineScheduler::Intrawave , BlockGemmPipelineVersion::v1>
5252#ifndef ONE_INSTANCE_PER_LIST
5353 ,
5454 DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<NDimSpatial, ALayout, BLayout, ck::Tuple<>, ELayout, ck::Tuple<BF16, BF16>, ck::Tuple<BF16, BF16>, F32, BF16, ck::Tuple<>, BF16, ScaleAdd, ScaleAdd, PassThrough, ConvSpec, GemmMNKPadding, 256 , 128 , 128 , 64 , 8 , 8 , 16 , 16 , 4 , 2 , S<8 , 32 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 8 , 8 , 1 , S<8 , 32 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 8 , 8 , 1 , 1 , 1 , S<1 , 32 , 1 , 8 >, 1 , BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
@@ -100,7 +100,7 @@ using device_grouped_conv_fwd_wmma_cshufflev3_scaleadd_ab_f16_instances =
100100 // ########################################| | | | | | | | | | | | Operation| Operation| Operation| | | | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerWmma| _NWaveNPerWmma| | |
101101 // ########################################| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
102102 // generic instance
103- DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<NDimSpatial, ALayout, BLayout, ck::Tuple<>, ELayout, ck::Tuple<F16, F16>, ck::Tuple<F16, F16>, F32, F16, ck::Tuple<>, F16, ScaleAdd, ScaleAdd, PassThrough, ConvSpec, GemmMNKPadding, 256 , 128 , 256 , 32 , 8 , 8 , 16 , 16 , 4 , 4 , S<4 , 64 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 8 , 8 , 0 , S<4 , 64 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 8 , 8 , 0 , 1 , 1 , S<1 , 32 , 1 , 8 >, 8 , BlockGemmPipelineScheduler::Interwave , BlockGemmPipelineVersion::v1>
103+ DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<NDimSpatial, ALayout, BLayout, ck::Tuple<>, ELayout, ck::Tuple<F16, F16>, ck::Tuple<F16, F16>, F32, F16, ck::Tuple<>, F16, ScaleAdd, ScaleAdd, PassThrough, ConvSpec, GemmMNKPadding, 256 , 128 , 256 , 32 , 8 , 8 , 16 , 16 , 4 , 4 , S<4 , 64 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 8 , 8 , 0 , S<4 , 64 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 8 , 8 , 0 , 1 , 1 , S<1 , 32 , 1 , 8 >, 8 , BlockGemmPipelineScheduler::Intrawave , BlockGemmPipelineVersion::v1>
104104#ifndef ONE_INSTANCE_PER_LIST
105105 ,
106106 DeviceGroupedConvFwdMultipleABD_Wmma_CShuffle_V3<NDimSpatial, ALayout, BLayout, ck::Tuple<>, ELayout, ck::Tuple<F16, F16>, ck::Tuple<F16, F16>, F32, F16, ck::Tuple<>, F16, ScaleAdd, ScaleAdd, PassThrough, ConvSpec, GemmMNKPadding, 256 , 128 , 128 , 64 , 8 , 8 , 16 , 16 , 4 , 2 , S<8 , 32 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 8 , 8 , 1 , S<8 , 32 , 1 >, S<1 , 0 , 2 >, S<1 , 0 , 2 >, 2 , 8 , 8 , 1 , 1 , 1 , S<1 , 32 , 1 , 8 >, 1 , BlockGemmPipelineScheduler::Intrawave, BlockGemmPipelineVersion::v1>,
0 commit comments