SM90 vs SM100 rowwise CUTLASS GEMM

SM90 file: 189 lines, 27 removals. SM100 file: 179 lines, 13 additions.
-// Cutlass rowwise kernel for sm90
+// Cutlass rowwise kernel for SM100
 template <
     typename TileShape,
     typename ClusterShape,
     typename Transposed,
     typename FastAccum,
     typename DtypeA,
     typename DtypeB,
     typename DtypeBias>
-void f8f8bf16_rowwise_impl(
+void f8f8bf16_rowwise_impl_sm100(
     at::Tensor XQ, // FP8
     at::Tensor WQ, // FP8
     at::Tensor x_scale,
     at::Tensor w_scale,
     std::optional<at::Tensor> bias,
     at::Tensor out,
     const int swizzle) {
   int M = XQ.size(0);
   int N = WQ.size(1);
   int K = XQ.size(1);
 
   // Workaround for https://github.com/pytorch/pytorch/issues/133334.
   if (M % 256 > 0) {
     int padded_M = ((M - 1) / 256 + 1) * 256;
     at::Tensor padded_x_scale = x_scale.new_empty({padded_M, 1});
     padded_x_scale.slice(/*dim=*/0, /*start=*/0, /*end=*/M)
         .copy_(std::move(x_scale));
     x_scale = std::move(padded_x_scale);
   }
 
   using LayoutInputA = cutlass::layout::RowMajor;
   constexpr int AlignmentInputA = 16 / sizeof(DtypeA);
 
   using LayoutInputB = cutlass::layout::ColumnMajor;
   constexpr int AlignmentInputB = 16 / sizeof(DtypeB);
 
   using LayoutOutput = std::conditional_t<
       Transposed::value,
       cutlass::layout::ColumnMajor,
       cutlass::layout::RowMajor>;
   constexpr int AlignmentOutput = 16 / sizeof(DtypeOutput);
 
   // Tag indicating the minimum SM that supports the intended feature
-  using ArchTag = cutlass::arch::Sm90;
+  using ArchTag = cutlass::arch::Sm100;
   using OperatorClass = cutlass::arch::OpClassTensorOp;
 
   // Implement rowwise scaling epilogue.
   constexpr int ColBroadcastStages = 0;
   constexpr int RowBroadcastStages = 0;
 
   using XScale = cutlass::epilogue::fusion::
       Sm90ColBroadcast<ColBroadcastStages, TileShape, DtypeScale>;
 
   using WScale = cutlass::epilogue::fusion::
       Sm90RowBroadcast<RowBroadcastStages, TileShape, DtypeScale>;
 
   using Bias = std::conditional_t<
       Transposed::value,
       cutlass::epilogue::fusion::
           Sm90ColBroadcast<ColBroadcastStages, TileShape, DtypeBias>,
       cutlass::epilogue::fusion::
           Sm90RowBroadcast<RowBroadcastStages, TileShape, DtypeBias>>;
 
   using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
   using AccumScale = cutlass::epilogue::fusion::Sm90EVT<
       Multiply,
       WScale,
       cutlass::epilogue::fusion::Sm90EVT<Multiply, XScale, Accum>>;
 
   using EpilogueEVT = cutlass::epilogue::fusion::Sm90EVT<
       Cast,
       cutlass::epilogue::fusion::Sm90EVT<
           Add,
           Bias,
           AccumScale>>;
 
-  constexpr bool large_tile = std::is_same_v<
-      TileShape, cute::Shape<cute::_128, cute::_128, cute::_128>>;
+  using EpilogueScheduleType =
+      cutlass::epilogue::collective::EpilogueScheduleAuto;
 
   using CollectiveEpilogue =
       typename cutlass::epilogue::collective::CollectiveBuilder<
-          ArchTag,
-          OperatorClass,
-          TileShape,
-          ClusterShape,
+          cutlass::arch::Sm100, OperatorClass,
+          TileShape, ClusterShape,
           cutlass::epilogue::collective::EpilogueTileAuto,
-          DtypeAccum,
-          DtypeEpilogue,
-          DtypeOutput,
-          LayoutOutput,
-          AlignmentOutput,
-          DtypeOutput,
-          LayoutOutput,
-          AlignmentOutput,
-          typename Schedule<large_tile, FastAccum::value>::epilogue_type,
+          DtypeAccum, DtypeEpilogue,
+          DtypeOutput, LayoutOutput, AlignmentOutput,
+          DtypeOutput, LayoutOutput, AlignmentOutput,
+          EpilogueScheduleType,
           EpilogueEVT>::CollectiveOp;
 
+  using MainloopScheduleType = cutlass::gemm::collective::KernelScheduleAuto;
   using CollectiveMainloop =
       typename cutlass::gemm::collective::CollectiveBuilder<
           ArchTag,
           OperatorClass,
           DtypeA,
           LayoutInputA,
           AlignmentInputA,
           DtypeB,
           LayoutInputB,
           AlignmentInputB,
           DtypeAccum,
           TileShape,
           ClusterShape,
           cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
               sizeof(typename CollectiveEpilogue::SharedStorage))>,
-          typename Schedule<large_tile, FastAccum::value>::type>::
-          CollectiveOp;
+          MainloopScheduleType>::CollectiveOp;
 
   using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
       cute::Shape<int, int, int>,
       CollectiveMainloop,
       CollectiveEpilogue>;
 
   using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;
 
   using StrideInputA = typename Gemm::GemmKernel::StrideA;
   using StrideInputB = typename Gemm::GemmKernel::StrideB;
   using StrideOutput = typename Gemm::GemmKernel::StrideC;
 
   StrideInputA stride_a = cutlass::make_cute_packed_stride(
       StrideInputA{}, cute::make_shape(M, static_cast<int>(XQ.stride(0)), 1));
   StrideInputB stride_b = cutlass::make_cute_packed_stride(
       StrideInputB{}, cute::make_shape(N, static_cast<int>(WQ.stride(1)), 1));
   StrideOutput stride_output = cutlass::make_cute_packed_stride(
       StrideOutput{}, cute::make_shape(M, static_cast<int>(out.stride(0)), 1));
 
   typename Gemm::Arguments arguments{
       cutlass::gemm::GemmUniversalMode::kGemm,
       {M, N, K},
       {reinterpret_cast<DtypeA*>(XQ.data_ptr()),
        stride_a,
        reinterpret_cast<DtypeB*>(WQ.data_ptr()),
        stride_b},
       {{{{bias.has_value() ? reinterpret_cast<DtypeBias*>(bias->data_ptr())
                            : nullptr},
          {{reinterpret_cast<DtypeScale*>(w_scale.data_ptr())},
           {{reinterpret_cast<DtypeScale*>(x_scale.data_ptr())}}}}},
        reinterpret_cast<DtypeOutput*>(out.data_ptr()),
        stride_output,
        reinterpret_cast<DtypeOutput*>(out.data_ptr()),
        stride_output}};
 
   Gemm gemm;
 
   // Using the arguments, query for extra workspace required for matrix
   // multiplication computation
   size_t workspace_size = Gemm::get_workspace_size(arguments);
 
   // Ensure persistent kernels leave enough free SMs for NCCL background ops.
   if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) {
     arguments.hw_info.sm_count =
         at::cuda::getDeviceProperties(out.device().index())->multiProcessorCount -
         at::globalContext()._SMCarveout_EXPERIMENTAL().value();
   }
 
   // Set the swizzle size
   arguments.scheduler.max_swizzle_size = swizzle;
 
   // Allocate workspace memory
   auto workspace = XQ.new_empty(
       {static_cast<int64_t>(workspace_size)},
       at::TensorOptions().dtype(at::kByte));
 
   // Check the problem size is supported or not
   cutlass::Status status = gemm.can_implement(arguments);
   if (status != cutlass::Status::kSuccess) {
     throw std::runtime_error("cutlass cannot implement");
   }
 
   // Initialize CUTLASS kernel with arguments and workspace pointer
   status = gemm.initialize(arguments, workspace.data_ptr());
   if (status != cutlass::Status::kSuccess) {
     throw std::runtime_error("cutlass cannot initialize");
   }
 
   status = gemm(at::cuda::getCurrentCUDAStream());
   if (status != cutlass::Status::kSuccess) {
     throw std::runtime_error(
         std::string("cutlass cannot run") +
         cutlass::cutlassGetStatusString(status));
   }
   C10_CUDA_KERNEL_LAUNCH_CHECK();
 }
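
Note on the SM90 path: it selects its mainloop and epilogue schedules through a Schedule<large_tile, FastAccum::value> trait defined elsewhere in the file (as are DtypeScale, DtypeAccum, DtypeEpilogue, DtypeOutput and the Multiply/Add/Cast EVT nodes). A minimal sketch of what such a trait could look like, assuming the warp-specialized schedule tags from CUTLASS 3.x; the exact mapping in the real file may differ:

// Hypothetical sketch, not the file's actual code: picks SM90 kernel and
// epilogue schedules from the tile size and the fast-accumulation flag.
template <bool LargeTile, bool FastAccum>
struct Schedule;

template <>
struct Schedule<true, true> {
  // Large tile + FP8 fast accumulation: cooperative schedule.
  using type = cutlass::gemm::KernelTmaWarpSpecializedCooperativeFP8FastAccum;
  using epilogue_type = cutlass::epilogue::TmaWarpSpecializedCooperative;
};

template <>
struct Schedule<true, false> {
  using type = cutlass::gemm::KernelTmaWarpSpecializedCooperative;
  using epilogue_type = cutlass::epilogue::TmaWarpSpecializedCooperative;
};

template <>
struct Schedule<false, true> {
  // Smaller tiles: ping-pong scheduling usually overlaps better.
  using type = cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
  using epilogue_type = cutlass::epilogue::TmaWarpSpecialized;
};

template <>
struct Schedule<false, false> {
  using type = cutlass::gemm::KernelTmaWarpSpecializedPingpong;
  using epilogue_type = cutlass::epilogue::TmaWarpSpecialized;
};

The SM100 path needs no such trait: it hands KernelScheduleAuto and EpilogueScheduleAuto to the CollectiveBuilder and lets CUTLASS choose schedules appropriate for the architecture.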
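
For completeness, a hedged sketch of how a caller might dispatch between the two builds at runtime. The two impl functions come from the diff above; the dispatcher itself, the tile/cluster shapes, and the dtype choices are illustrative assumptions, not the surrounding file's actual dispatch logic:

// Hypothetical dispatcher: compute capability 10.x (Blackwell) takes the
// SM100 build, 9.x (Hopper) the SM90 build. Shapes below are placeholders,
// not tuned configurations.
void dispatch_f8f8bf16_rowwise(
    at::Tensor XQ,
    at::Tensor WQ,
    at::Tensor x_scale,
    at::Tensor w_scale,
    std::optional<at::Tensor> bias,
    at::Tensor out,
    int swizzle) {
  using TileShape = cute::Shape<cute::_128, cute::_128, cute::_128>;
  using ClusterShape = cute::Shape<cute::_2, cute::_1, cute::_1>;
  using DtypeFp8 = cutlass::float_e4m3_t;
  using DtypeBias = cutlass::bfloat16_t;

  const auto* props = at::cuda::getDeviceProperties(out.device().index());
  if (props->major >= 10) {
    f8f8bf16_rowwise_impl_sm100<
        TileShape, ClusterShape,
        /*Transposed=*/std::false_type, /*FastAccum=*/std::true_type,
        DtypeFp8, DtypeFp8, DtypeBias>(
        XQ, WQ, x_scale, w_scale, bias, out, swizzle);
  } else {
    f8f8bf16_rowwise_impl<
        TileShape, ClusterShape,
        /*Transposed=*/std::false_type, /*FastAccum=*/std::true_type,
        DtypeFp8, DtypeFp8, DtypeBias>(
        XQ, WQ, x_scale, w_scale, bias, out, swizzle);
  }
}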