比較文本

尋找兩個文字檔案之間的差異

即時編輯器

摺疊未變更行

關閉換行

檢視

比對精度

語法突出顯示

Diffchecker Desktop：執行 Diffchecker 最安全的方式。取得 Diffchecker 桌面應用程式：您的差異永遠不會離開您的電腦！取得桌面版

sm90 vs sm100 rowwise cutlass gemm

建立於去年 · 差異永不過期

33 刪除

行
總計
刪除

字符
總計
刪除

要繼續使用此功能，請升級到 Diffchecker Pro 查看價格

189 行

10 新增

行
總計
新增

字符
總計
新增

要繼續使用此功能，請升級到 Diffchecker Pro 查看價格

179 行

// Cutlass rowwise kernel for sm90

// Cutlass rowwise kernel for SM100

template <

typename TileShape,

typename ClusterShape,

typename Transposed,

typename FastAccum,

typename DtypeA,

typename DtypeB,

typename DtypeBias>

void f8f8bf16_rowwise_impl(

void f8f8bf16_rowwise_impl_sm100(

at::Tensor XQ, // FP8

at::Tensor WQ, // FP8

at::Tensor x_scale,

at::Tensor w_scale,

std::optional<at::Tensor> bias,

at::Tensor out,

const int swizzle) {

int M = XQ.size(0);

int N = WQ.size(1);

int K = XQ.size(1);

// Workaround for https://github.com/pytorch/pytorch/issues/133334.

if (M % 256 > 0) {

int padded_M = ((M - 1) / 256 + 1) * 256;

at::Tensor padded_x_scale = x_scale.new_empty({padded_M, 1});

padded_x_scale.slice(/*dim=*/0, /*start=*/0, /*end=*/M)

.copy_(std::move(x_scale));

x_scale = std::move(padded_x_scale);

}

using LayoutInputA = cutlass::layout::RowMajor;

constexpr int AlignmentInputA = 16 / sizeof(DtypeA);

using LayoutInputB = cutlass::layout::ColumnMajor;

constexpr int AlignmentInputB = 16 / sizeof(DtypeB);

using LayoutOutput = std::conditional_t<

Transposed::value,

cutlass::layout::ColumnMajor,

cutlass::layout::RowMajor>;

constexpr int AlignmentOutput = 16 / sizeof(DtypeOutput);

// Tag indicating the minimum SM that supports the intended feature

using ArchTag = cutlass::arch::Sm90;

using ArchTag = cutlass::arch::Sm100;

using OperatorClass = cutlass::arch::OpClassTensorOp;

// Implement rowwise scaling epilogue.

constexpr int ColBroadcastStages = 0;

constexpr int RowBroadcastStages = 0;

using XScale = cutlass::epilogue::fusion::

Sm90ColBroadcast<ColBroadcastStages, TileShape, DtypeScale>;

using WScale = cutlass::epilogue::fusion::

Sm90RowBroadcast<RowBroadcastStages, TileShape, DtypeScale>;

using Bias = std::conditional_t<

Transposed::value,

cutlass::epilogue::fusion::

Sm90ColBroadcast<ColBroadcastStages, TileShape, DtypeBias>,

cutlass::epilogue::fusion::

Sm90RowBroadcast<RowBroadcastStages, TileShape, DtypeBias>>;

using Accum = cutlass::epilogue::fusion::Sm90AccFetch;

using AccumScale = cutlass::epilogue::fusion::Sm90EVT<

Multiply,

WScale,

cutlass::epilogue::fusion::Sm90EVT<Multiply, XScale, Accum>>;

using EpilogueEVT = cutlass::epilogue::fusion::Sm90EVT<

Cast,

cutlass::epilogue::fusion::Sm90EVT<

Add,

Bias,

AccumScale>>;

constexpr bool large_tile = std::is_same_v<TileShape, cute::Shape<cute::_128, cute::_128, cute::_128>>;

using EpilogueScheduleType = cutlass::epilogue::collective::EpilogueScheduleAuto;

using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<

using CollectiveEpilogue =

cutlass::arch::Sm100, OperatorClass,

typename cutlass::epilogue::collective::CollectiveBuilder<

TileShape, ClusterShape,

ArchTag,

cutlass::epilogue::collective::EpilogueTileAuto,

OperatorClass,

DtypeAccum, DtypeEpilogue,

TileShape,

DtypeOutput, LayoutOutput, AlignmentOutput,

ClusterShape,

DtypeOutput, LayoutOutput, AlignmentOutput,

cutlass::epilogue::collective::EpilogueTileAuto,

EpilogueScheduleType,

DtypeAccum,

EpilogueEVT>::CollectiveOp;

DtypeEpilogue,

DtypeOutput,

LayoutOutput,

AlignmentOutput,

DtypeOutput,

LayoutOutput,

AlignmentOutput,

typename Schedule<large_tile, FastAccum::value>::epilogue_type,

EpilogueEVT>::CollectiveOp;

using MainloopScheduleType = cutlass::gemm::collective::KernelScheduleAuto;

using CollectiveMainloop =

typename cutlass::gemm::collective::CollectiveBuilder<

ArchTag,

OperatorClass,

DtypeA,

LayoutInputA,

AlignmentInputA,

DtypeB,

LayoutInputB,

AlignmentInputB,

DtypeAccum,

TileShape,

ClusterShape,

cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(

cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,

sizeof(typename CollectiveEpilogue::SharedStorage))>,

MainloopScheduleType>::CollectiveOp;

typename Schedule<large_tile, FastAccum::value>::type>::

CollectiveOp;

using GemmKernel = cutlass::gemm::kernel::GemmUniversal<

cute::Shape<int, int, int>,

CollectiveMainloop,

CollectiveEpilogue>;

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

using StrideInputA = typename Gemm::GemmKernel::StrideA;

using StrideInputB = typename Gemm::GemmKernel::StrideB;

using StrideOutput = typename Gemm::GemmKernel::StrideC;

StrideInputA stride_a = cutlass::make_cute_packed_stride(

StrideInputA{}, cute::make_shape(M, static_cast<int>(XQ.stride(0)), 1));

StrideInputB stride_b = cutlass::make_cute_packed_stride(

StrideInputB{}, cute::make_shape(N, static_cast<int>(WQ.stride(1)), 1));

StrideOutput stride_output = cutlass::make_cute_packed_stride(

StrideOutput{}, cute::make_shape(M, static_cast<int>(out.stride(0)), 1));

typename Gemm::Arguments arguments{

cutlass::gemm::GemmUniversalMode::kGemm,

{M, N, K},

{reinterpret_cast<DtypeA*>(XQ.data_ptr()),

stride_a,

reinterpret_cast<DtypeB*>(WQ.data_ptr()),

stride_b},

{{{{bias.has_value() ? reinterpret_cast<DtypeBias*>(bias->data_ptr())

: nullptr},

{{reinterpret_cast<DtypeScale*>(w_scale.data_ptr())},

{{reinterpret_cast<DtypeScale*>(x_scale.data_ptr())}}}}},

reinterpret_cast<DtypeOutput*>(out.data_ptr()),

stride_output,

reinterpret_cast<DtypeOutput*>(out.data_ptr()),

stride_output}};

Gemm gemm;

// Using the arguments, query for extra workspace required for matrix

// multiplication computation

size_t workspace_size = Gemm::get_workspace_size(arguments);

// Ensure persistent kernels leave enough free SMs for NCCL background ops.

if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) {

arguments.hw_info.sm_count =

at::cuda::getDeviceProperties(out.device().index())->multiProcessorCount -

at::globalContext()._SMCarveout_EXPERIMENTAL().value();

}

// Set the swizzle size

arguments.scheduler.max_swizzle_size = swizzle;

// Allocate workspace memory

auto workspace = XQ.new_empty(

{static_cast<int64_t>(workspace_size)},

at::TensorOptions().dtype(at::kByte));

// Check the problem size is supported or not

cutlass::Status status = gemm.can_implement(arguments);

if (status != cutlass::Status::kSuccess) {

throw std::runtime_error("cutlass cannot implement");

}

// Initialize CUTLASS kernel with arguments and workspace pointer

status = gemm.initialize(arguments, workspace.data_ptr());

if (status != cutlass::Status::kSuccess) {

throw std::runtime_error("cutlass cannot initialize");

}

status = gemm(at::cuda::getCurrentCUDAStream());

if (status != cutlass::Status::kSuccess) {

throw std::runtime_error(

std::string("cutlass cannot run") +

cutlass::cutlassGetStatusString(status));

}

C10_CUDA_KERNEL_LAUNCH_CHECK();

}

已保存差異

原始文本

開啟檔案

// Cutlass rowwise kernel for sm90
//
// Assembles a CUTLASS GemmUniversal kernel (ArchTag = Sm90, tensor-op class)
// for XQ * WQ with a fused epilogue, then launches it on the current CUDA
// stream. The epilogue visitor tree built below has the shape
//
//   Cast(Add(Bias, Multiply(WScale, Multiply(XScale, Accum))))
//
// where XScale is a column broadcast and WScale is a row broadcast.
// NOTE(review): Multiply, Add, Cast, Schedule, DtypeScale, DtypeAccum,
// DtypeEpilogue and DtypeOutput are declared outside this function; their
// exact definitions are not visible here.
//
// Template parameters:
//   TileShape, ClusterShape - forwarded to both CUTLASS collective builders.
//   Transposed   - Transposed::value selects a column-major output layout and
//                  a column-broadcast bias; otherwise row-major / row
//                  broadcast.
//   FastAccum    - FastAccum::value, together with `large_tile`, selects the
//                  mainloop and epilogue schedule through Schedule<...>.
//   DtypeA, DtypeB - element types the XQ / WQ data pointers are cast to.
//   DtypeBias    - element type the bias data pointer is cast to.
//
// Parameters:
//   XQ       - FP8 operand A; size(0) is used as M, size(1) as K, and
//              stride(0) as its leading dimension (row-major).
//   WQ       - FP8 operand B; size(1) is used as N and stride(1) as its
//              leading dimension (column-major).
//   x_scale  - scale tensor fed to the column broadcast. When M is not a
//              multiple of 256 it is replaced by a padded copy (see below).
//   w_scale  - scale tensor fed to the row broadcast.
//   bias     - optional bias; a null pointer is handed to the epilogue when
//              it is absent.
//   out      - output tensor; also passed as the epilogue's source operand.
//   swizzle  - written to arguments.scheduler.max_swizzle_size.
//
// Throws std::runtime_error if CUTLASS rejects the problem, fails to
// initialize, or fails to launch. Each message includes the CUTLASS status
// string.
template <
    typename TileShape,
    typename ClusterShape,
    typename Transposed,
    typename FastAccum,
    typename DtypeA,
    typename DtypeB,
    typename DtypeBias>
void f8f8bf16_rowwise_impl(
    at::Tensor XQ, // FP8
    at::Tensor WQ, // FP8
    at::Tensor x_scale,
    at::Tensor w_scale,
    std::optional<at::Tensor> bias,
    at::Tensor out,
    const int swizzle) {
  int M = XQ.size(0);
  int N = WQ.size(1);
  int K = XQ.size(1);

  // Workaround for https://github.com/pytorch/pytorch/issues/133334.
  // Round the x_scale row count up to a multiple of 256. Only the first M
  // rows are copied from the caller's tensor; the tail comes from new_empty()
  // and is left uninitialized.
  if (M % 256 > 0) {
    int padded_M = ((M - 1) / 256 + 1) * 256;
    at::Tensor padded_x_scale = x_scale.new_empty({padded_M, 1});
    padded_x_scale.slice(/*dim=*/0, /*start=*/0, /*end=*/M)
        .copy_(std::move(x_scale));
    x_scale = std::move(padded_x_scale);
  }

  // Operand layouts and alignments. Alignment is expressed in elements and
  // corresponds to 16 bytes per access.
  using LayoutInputA = cutlass::layout::RowMajor;
  constexpr int AlignmentInputA = 16 / sizeof(DtypeA);

  using LayoutInputB = cutlass::layout::ColumnMajor;
  constexpr int AlignmentInputB = 16 / sizeof(DtypeB);

  using LayoutOutput = std::conditional_t<
      Transposed::value,
      cutlass::layout::ColumnMajor,
      cutlass::layout::RowMajor>;
  constexpr int AlignmentOutput = 16 / sizeof(DtypeOutput);

  // Tag indicating the minimum SM that supports the intended feature
  using ArchTag = cutlass::arch::Sm90;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  // Implement rowwise scaling epilogue.
  constexpr int ColBroadcastStages = 0;
  constexpr int RowBroadcastStages = 0;

  using XScale = cutlass::epilogue::fusion::
      Sm90ColBroadcast<ColBroadcastStages, TileShape, DtypeScale>;

  using WScale = cutlass::epilogue::fusion::
      Sm90RowBroadcast<RowBroadcastStages, TileShape, DtypeScale>;

  // The bias broadcast direction follows the output layout choice.
  using Bias = std::conditional_t<
      Transposed::value,
      cutlass::epilogue::fusion::
          Sm90ColBroadcast<ColBroadcastStages, TileShape, DtypeBias>,
      cutlass::epilogue::fusion::
          Sm90RowBroadcast<RowBroadcastStages, TileShape, DtypeBias>>;

  using Accum = cutlass::epilogue::fusion::Sm90AccFetch;

  // Multiply(WScale, Multiply(XScale, Accum))
  using AccumScale = cutlass::epilogue::fusion::Sm90EVT<
      Multiply,
      WScale,
      cutlass::epilogue::fusion::Sm90EVT<Multiply, XScale, Accum>>;

  // Cast(Add(Bias, AccumScale))
  using EpilogueEVT = cutlass::epilogue::fusion::Sm90EVT<
      Cast,
      cutlass::epilogue::fusion::Sm90EVT<
          Add,
          Bias,
          AccumScale>>;

  // Only the exact 128x128x128 tile is treated as "large" for schedule
  // selection.
  constexpr bool large_tile = std::is_same_v<
      TileShape,
      cute::Shape<cute::_128, cute::_128, cute::_128>>;

  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
          ArchTag,
          OperatorClass,
          TileShape,
          ClusterShape,
          cutlass::epilogue::collective::EpilogueTileAuto,
          DtypeAccum,
          DtypeEpilogue,
          DtypeOutput,
          LayoutOutput,
          AlignmentOutput,
          DtypeOutput,
          LayoutOutput,
          AlignmentOutput,
          typename Schedule<large_tile, FastAccum::value>::epilogue_type,
          EpilogueEVT>::CollectiveOp;

  // The mainloop stage count is chosen automatically after carving out the
  // epilogue's shared-storage footprint.
  using CollectiveMainloop =
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag,
          OperatorClass,
          DtypeA,
          LayoutInputA,
          AlignmentInputA,
          DtypeB,
          LayoutInputB,
          AlignmentInputB,
          DtypeAccum,
          TileShape,
          ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          typename Schedule<large_tile, FastAccum::value>::type>::
          CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cute::Shape<int, int, int>,
      CollectiveMainloop,
      CollectiveEpilogue>;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  using StrideInputA = typename Gemm::GemmKernel::StrideA;
  using StrideInputB = typename Gemm::GemmKernel::StrideB;
  using StrideOutput = typename Gemm::GemmKernel::StrideC;

  // Strides are derived from the tensors' actual leading strides rather than
  // from K / N.
  StrideInputA stride_a = cutlass::make_cute_packed_stride(
      StrideInputA{}, cute::make_shape(M, static_cast<int>(XQ.stride(0)), 1));
  StrideInputB stride_b = cutlass::make_cute_packed_stride(
      StrideInputB{}, cute::make_shape(N, static_cast<int>(WQ.stride(1)), 1));
  StrideOutput stride_output = cutlass::make_cute_packed_stride(
      StrideOutput{}, cute::make_shape(M, static_cast<int>(out.stride(0)), 1));

  // The nested initializer for the epilogue mirrors the EpilogueEVT tree:
  // bias first, then w_scale, then x_scale. `out` is supplied for both
  // pointer/stride pairs that follow the tree arguments.
  typename Gemm::Arguments arguments{
      cutlass::gemm::GemmUniversalMode::kGemm,
      {M, N, K},
      {reinterpret_cast<DtypeA*>(XQ.data_ptr()),
       stride_a,
       reinterpret_cast<DtypeB*>(WQ.data_ptr()),
       stride_b},
      {{{{bias.has_value() ? reinterpret_cast<DtypeBias*>(bias->data_ptr())
                           : nullptr},
         {{reinterpret_cast<DtypeScale*>(w_scale.data_ptr())},
          {{reinterpret_cast<DtypeScale*>(x_scale.data_ptr())}}}}},
       reinterpret_cast<DtypeOutput*>(out.data_ptr()),
       stride_output,
       reinterpret_cast<DtypeOutput*>(out.data_ptr()),
       stride_output}};

  Gemm gemm;

  // Using the arguments, query for extra workspace required for matrix
  // multiplication computation
  size_t workspace_size = Gemm::get_workspace_size(arguments);

  // Ensure persistent kernels leave enough free SMs for NCCL background ops.
  if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) {
    arguments.hw_info.sm_count =
        at::cuda::getDeviceProperties(out.device().index())->multiProcessorCount -
        at::globalContext()._SMCarveout_EXPERIMENTAL().value();
  }

  // Set the swizzle size
  arguments.scheduler.max_swizzle_size = swizzle;

  // Allocate workspace memory
  auto workspace = XQ.new_empty(
      {static_cast<int64_t>(workspace_size)},
      at::TensorOptions().dtype(at::kByte));

  // Check the problem size is supported or not
  cutlass::Status status = gemm.can_implement(arguments);
  if (status != cutlass::Status::kSuccess) {
    throw std::runtime_error(
        std::string("cutlass cannot implement: ") +
        cutlass::cutlassGetStatusString(status));
  }

  // Initialize CUTLASS kernel with arguments and workspace pointer
  status = gemm.initialize(arguments, workspace.data_ptr());
  if (status != cutlass::Status::kSuccess) {
    throw std::runtime_error(
        std::string("cutlass cannot initialize: ") +
        cutlass::cutlassGetStatusString(status));
  }

  // Launch on the current CUDA stream.
  status = gemm(at::cuda::getCurrentCUDAStream());
  if (status != cutlass::Status::kSuccess) {
    // Separator added: the message used to run straight into the status text.
    throw std::runtime_error(
        std::string("cutlass cannot run: ") +
        cutlass::cutlassGetStatusString(status));
  }
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}

更改後文本

開啟檔案

// Cutlass rowwise kernel for SM100
template <
    typename TileShape,
    typename ClusterShape,
    typename Transposed,
    typename FastAccum,
    typename DtypeA,
    typename DtypeB,
    typename DtypeBias>
void f8f8bf16_rowwise_impl_sm100(
    at::Tensor XQ, // FP8
    at::Tensor WQ, // FP8
    at::Tensor x_scale,
    at::Tensor w_scale,
    std::optional<at::Tensor> bias,
    at::Tensor out,
    const int swizzle) {
  int M = XQ.size(0);
  int N = WQ.size(1);
  int K = XQ.size(1);

using LayoutInputA = cutlass::layout::RowMajor;
  constexpr int AlignmentInputA = 16 / sizeof(DtypeA);

using LayoutInputB = cutlass::layout::ColumnMajor;
  constexpr int AlignmentInputB = 16 / sizeof(DtypeB);

using LayoutOutput = std::conditional_t<
      Transposed::value,
      cutlass::layout::ColumnMajor,
      cutlass::layout::RowMajor>;
  constexpr int AlignmentOutput = 16 / sizeof(DtypeOutput);

// Tag indicating the minimum SM that supports the intended feature
  using ArchTag = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

// Implement rowwise scaling epilogue.
  constexpr int ColBroadcastStages = 0;
  constexpr int RowBroadcastStages = 0;

using XScale = cutlass::epilogue::fusion::
      Sm90ColBroadcast<ColBroadcastStages, TileShape, DtypeScale>;

using WScale = cutlass::epilogue::fusion::
      Sm90RowBroadcast<RowBroadcastStages, TileShape, DtypeScale>;

using EpilogueEVT = cutlass::epilogue::fusion::Sm90EVT<
      Cast,
      cutlass::epilogue::fusion::Sm90EVT<
          Add,
          Bias,
          AccumScale>>;

using EpilogueScheduleType = cutlass::epilogue::collective::EpilogueScheduleAuto;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, OperatorClass,
      TileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      DtypeAccum, DtypeEpilogue,
      DtypeOutput, LayoutOutput, AlignmentOutput,
      DtypeOutput, LayoutOutput, AlignmentOutput,
      EpilogueScheduleType,
      EpilogueEVT>::CollectiveOp;

using MainloopScheduleType = cutlass::gemm::collective::KernelScheduleAuto;
  using CollectiveMainloop =
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag,
          OperatorClass,
          DtypeA,
          LayoutInputA,
          AlignmentInputA,
          DtypeB,
          LayoutInputB,
          AlignmentInputB,
          DtypeAccum,
          TileShape,
          ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
          MainloopScheduleType>::CollectiveOp;

using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cute::Shape<int, int, int>,
      CollectiveMainloop,
      CollectiveEpilogue>;

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

using StrideInputA = typename Gemm::GemmKernel::StrideA;
  using StrideInputB = typename Gemm::GemmKernel::StrideB;
  using StrideOutput = typename Gemm::GemmKernel::StrideC;

Gemm gemm;

// Using the arguments, query for extra workspace required for matrix
  // multiplication computation
  size_t workspace_size = Gemm::get_workspace_size(arguments);

// Set the swizzle size
  arguments.scheduler.max_swizzle_size = swizzle;

// Allocate workspace memory
  auto workspace = XQ.new_empty(
      {static_cast<int64_t>(workspace_size)},
      at::TensorOptions().dtype(at::kByte));