टेक्स्ट की तुलना करें

दो टेक्स्ट फ़ाइलों के बीच अंतर ढूंढें

रियल-टाइम एडिटर

अपरिवर्तित संक्षिप्त करें

लाइन रैप बंद

लेआउट

परिवर्तन हाइलाइट करें

सिंटैक्स हाइलाइटिंग

Diffchecker Desktop: Diffchecker चलाने का सबसे सुरक्षित तरीका। Diffchecker Desktop ऐप पाएं: आपके diffs कभी आपके कंप्यूटर से बाहर नहीं जाते! Desktop पाएं

sm90 vs sm100 rowwise cutlass gemm

पिछले वर्ष बनाया गया। Diff कभी समाप्त नहीं होता

33 हटाए गए

लाइनें
कुल
हटाया गया

अक्षर
कुल
हटाया गया

इस सुविधा का उपयोग जारी रखने के लिए Diffchecker Pro में अपग्रेड करें। मूल्य देखें

189 लाइनें

10 जोड़े गए

लाइनें
कुल
जोड़ा गया

अक्षर
कुल
जोड़ा गया

इस सुविधा का उपयोग जारी रखने के लिए Diffchecker Pro में अपग्रेड करें। मूल्य देखें

179 लाइनें

// Cutlass rowwise kernel for sm90

// Cutlass rowwise kernel for SM100

template <

typename TileShape,

typename ClusterShape,

typename Transposed,

typename FastAccum,

typename DtypeA,

typename DtypeB,

typename DtypeBias>

void f8f8bf16_rowwise_impl(

void f8f8bf16_rowwise_impl_sm100(

at::Tensor XQ, // FP8

at::Tensor WQ, // FP8

at::Tensor x_scale,

at::Tensor w_scale,

std::optional<at::Tensor> bias,

at::Tensor out,

const int swizzle) {

int M = XQ.size(0);

int N = WQ.size(1);

int K = XQ.size(1);

// Workaround for https://github.com/pytorch/pytorch/issues/133334.

if (M % 256 > 0) {

int padded_M = ((M - 1) / 256 + 1) * 256;

at::Tensor padded_x_scale = x_scale.new_empty({padded_M, 1});

padded_x_scale.slice(/*dim=*/0, /*start=*/0, /*end=*/M)

.copy_(std::move(x_scale));

x_scale = std::move(padded_x_scale);

}

using LayoutInputA = cutlass::layout::RowMajor;

constexpr int AlignmentInputA = 16 / sizeof(DtypeA);

using LayoutInputB = cutlass::layout::ColumnMajor;

constexpr int AlignmentInputB = 16 / sizeof(DtypeB);

using LayoutOutput = std::conditional_t<

Transposed::value,

cutlass::layout::ColumnMajor,

cutlass::layout::RowMajor>;

constexpr int AlignmentOutput = 16 / sizeof(DtypeOutput);

// Tag indicating the minimum SM that supports the intended feature

using ArchTag = cutlass::arch::Sm90;

using ArchTag = cutlass::arch::Sm100;

using OperatorClass = cutlass::arch::OpClassTensorOp;

// Implement rowwise scaling epilogue.

constexpr int ColBroadcastStages = 0;

constexpr int RowBroadcastStages = 0;

using XScale = cutlass::epilogue::fusion::

Sm90ColBroadcast<ColBroadcastStages, TileShape, DtypeScale>;

using WScale = cutlass::epilogue::fusion::

Sm90RowBroadcast<RowBroadcastStages, TileShape, DtypeScale>;

using Bias = std::conditional_t<

Transposed::value,

cutlass::epilogue::fusion::

Sm90ColBroadcast<ColBroadcastStages, TileShape, DtypeBias>,

cutlass::epilogue::fusion::

Sm90RowBroadcast<RowBroadcastStages, TileShape, DtypeBias>>;

using Accum = cutlass::epilogue::fusion::Sm90AccFetch;

using AccumScale = cutlass::epilogue::fusion::Sm90EVT<

Multiply,

WScale,

cutlass::epilogue::fusion::Sm90EVT<Multiply, XScale, Accum>>;

using EpilogueEVT = cutlass::epilogue::fusion::Sm90EVT<

Cast,

cutlass::epilogue::fusion::Sm90EVT<

Add,

Bias,

AccumScale>>;

constexpr bool large_tile = std::is_same_v<TileShape, cute::Shape<cute::_128, cute::_128, cute::_128>>;

using EpilogueScheduleType = cutlass::epilogue::collective::EpilogueScheduleAuto;

using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<

using CollectiveEpilogue =

cutlass::arch::Sm100, OperatorClass,

typename cutlass::epilogue::collective::CollectiveBuilder<

TileShape, ClusterShape,

ArchTag,

cutlass::epilogue::collective::EpilogueTileAuto,

OperatorClass,

DtypeAccum, DtypeEpilogue,

TileShape,

DtypeOutput, LayoutOutput, AlignmentOutput,

ClusterShape,

DtypeOutput, LayoutOutput, AlignmentOutput,

cutlass::epilogue::collective::EpilogueTileAuto,

EpilogueScheduleType,

DtypeAccum,

EpilogueEVT>::CollectiveOp;

DtypeEpilogue,

DtypeOutput,

LayoutOutput,

AlignmentOutput,

DtypeOutput,

LayoutOutput,

AlignmentOutput,

typename Schedule<large_tile, FastAccum::value>::epilogue_type,

EpilogueEVT>::CollectiveOp;

using MainloopScheduleType = cutlass::gemm::collective::KernelScheduleAuto;

using CollectiveMainloop =

typename cutlass::gemm::collective::CollectiveBuilder<

ArchTag,

OperatorClass,

DtypeA,

LayoutInputA,

AlignmentInputA,

DtypeB,

LayoutInputB,

AlignmentInputB,

DtypeAccum,

TileShape,

ClusterShape,

cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(

cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,

sizeof(typename CollectiveEpilogue::SharedStorage))>,

MainloopScheduleType>::CollectiveOp;

typename Schedule<large_tile, FastAccum::value>::type>::

CollectiveOp;

using GemmKernel = cutlass::gemm::kernel::GemmUniversal<

cute::Shape<int, int, int>,

CollectiveMainloop,

CollectiveEpilogue>;

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

using StrideInputA = typename Gemm::GemmKernel::StrideA;

using StrideInputB = typename Gemm::GemmKernel::StrideB;

using StrideOutput = typename Gemm::GemmKernel::StrideC;

StrideInputA stride_a = cutlass::make_cute_packed_stride(

StrideInputA{}, cute::make_shape(M, static_cast<int>(XQ.stride(0)), 1));

StrideInputB stride_b = cutlass::make_cute_packed_stride(

StrideInputB{}, cute::make_shape(N, static_cast<int>(WQ.stride(1)), 1));

StrideOutput stride_output = cutlass::make_cute_packed_stride(

StrideOutput{}, cute::make_shape(M, static_cast<int>(out.stride(0)), 1));

typename Gemm::Arguments arguments{

cutlass::gemm::GemmUniversalMode::kGemm,

{M, N, K},

{reinterpret_cast<DtypeA*>(XQ.data_ptr()),

stride_a,

reinterpret_cast<DtypeB*>(WQ.data_ptr()),

stride_b},

{{{{bias.has_value() ? reinterpret_cast<DtypeBias*>(bias->data_ptr())

: nullptr},

{{reinterpret_cast<DtypeScale*>(w_scale.data_ptr())},

{{reinterpret_cast<DtypeScale*>(x_scale.data_ptr())}}}}},

reinterpret_cast<DtypeOutput*>(out.data_ptr()),

stride_output,

reinterpret_cast<DtypeOutput*>(out.data_ptr()),

stride_output}};

Gemm gemm;

// Using the arguments, query for extra workspace required for matrix

// multiplication computation

size_t workspace_size = Gemm::get_workspace_size(arguments);

// Ensure persistent kernels leave enough free SMs for NCCL background ops.

if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) {

arguments.hw_info.sm_count =

at::cuda::getDeviceProperties(out.device().index())->multiProcessorCount -

at::globalContext()._SMCarveout_EXPERIMENTAL().value();

}

// Set the swizzle size

arguments.scheduler.max_swizzle_size = swizzle;

// Allocate workspace memory

auto workspace = XQ.new_empty(

{static_cast<int64_t>(workspace_size)},

at::TensorOptions().dtype(at::kByte));

// Check the problem size is supported or not

cutlass::Status status = gemm.can_implement(arguments);

if (status != cutlass::Status::kSuccess) {

throw std::runtime_error("cutlass cannot implement");

}

// Initialize CUTLASS kernel with arguments and workspace pointer

status = gemm.initialize(arguments, workspace.data_ptr());

if (status != cutlass::Status::kSuccess) {

throw std::runtime_error("cutlass cannot initialize");

}

status = gemm(at::cuda::getCurrentCUDAStream());

if (status != cutlass::Status::kSuccess) {

throw std::runtime_error(

std::string("cutlass cannot run") +

cutlass::cutlassGetStatusString(status));

}

C10_CUDA_KERNEL_LAUNCH_CHECK();

}

सेव किए गए Diffs

ऑरिजनल टेक्स्ट

फ़ाइल खोलें

// Cutlass rowwise kernel for sm90.
//
// Instantiates a CUTLASS GemmUniversal kernel tagged with cutlass::arch::Sm90
// and runs it on the current CUDA stream, writing the result into `out`.
// The fused epilogue evaluates
//     Cast(Add(Bias, Multiply(WScale, Multiply(XScale, Accum))))
// where XScale is a column broadcast of `x_scale` and WScale is a row
// broadcast of `w_scale`.
//
// Template parameters:
//   TileShape, ClusterShape - forwarded to both CUTLASS CollectiveBuilders.
//   Transposed - Transposed::value selects a ColumnMajor output layout and a
//                column-broadcast bias; otherwise RowMajor output and a
//                row-broadcast bias.
//   FastAccum  - FastAccum::value is forwarded to Schedule<> to pick the
//                mainloop and epilogue schedules.
//   DtypeA, DtypeB - element types used to reinterpret the XQ / WQ buffers.
//   DtypeBias  - element type used to reinterpret the bias buffer.
//
// DtypeOutput, DtypeScale, DtypeAccum, DtypeEpilogue, Multiply, Add, Cast and
// Schedule are not declared in this function; they are expected to be
// provided by the enclosing file.
//
// Throws std::runtime_error if CUTLASS reports a non-success status from
// can_implement(), initialize(), or the kernel launch.
template <
    typename TileShape,
    typename ClusterShape,
    typename Transposed,
    typename FastAccum,
    typename DtypeA,
    typename DtypeB,
    typename DtypeBias>
void f8f8bf16_rowwise_impl(
    at::Tensor XQ, // FP8
    at::Tensor WQ, // FP8
    at::Tensor x_scale,
    at::Tensor w_scale,
    std::optional<at::Tensor> bias,
    at::Tensor out,
    const int swizzle) {
  // Tensor::size() yields a 64-bit value; the CUTLASS problem shape below is
  // built from plain ints, so make the narrowing explicit.
  int M = static_cast<int>(XQ.size(0));
  int N = static_cast<int>(WQ.size(1));
  int K = static_cast<int>(XQ.size(1));

  // Workaround for https://github.com/pytorch/pytorch/issues/133334.
  // Copy x_scale into a buffer whose row count is rounded up to the next
  // multiple of 256. Only the first M rows are written; the remaining rows
  // come from new_empty() and are left uninitialized.
  if (M % 256 > 0) {
    int padded_M = ((M - 1) / 256 + 1) * 256;
    at::Tensor padded_x_scale = x_scale.new_empty({padded_M, 1});
    padded_x_scale.slice(/*dim=*/0, /*start=*/0, /*end=*/M)
        .copy_(std::move(x_scale));
    x_scale = std::move(padded_x_scale);
  }

  // Alignments are expressed in elements: 16 bytes divided by element size.
  using LayoutInputA = cutlass::layout::RowMajor;
  constexpr int AlignmentInputA = 16 / sizeof(DtypeA);

  using LayoutInputB = cutlass::layout::ColumnMajor;
  constexpr int AlignmentInputB = 16 / sizeof(DtypeB);

  using LayoutOutput = std::conditional_t<
      Transposed::value,
      cutlass::layout::ColumnMajor,
      cutlass::layout::RowMajor>;
  constexpr int AlignmentOutput = 16 / sizeof(DtypeOutput);

  // Tag indicating the minimum SM that supports the intended feature
  using ArchTag = cutlass::arch::Sm90;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

  // Implement rowwise scaling epilogue.
  constexpr int ColBroadcastStages = 0;
  constexpr int RowBroadcastStages = 0;

  using XScale = cutlass::epilogue::fusion::
      Sm90ColBroadcast<ColBroadcastStages, TileShape, DtypeScale>;

  using WScale = cutlass::epilogue::fusion::
      Sm90RowBroadcast<RowBroadcastStages, TileShape, DtypeScale>;

  // The bias broadcast direction follows the output layout selection above.
  using Bias = std::conditional_t<
      Transposed::value,
      cutlass::epilogue::fusion::
          Sm90ColBroadcast<ColBroadcastStages, TileShape, DtypeBias>,
      cutlass::epilogue::fusion::
          Sm90RowBroadcast<RowBroadcastStages, TileShape, DtypeBias>>;

  using Accum = cutlass::epilogue::fusion::Sm90AccFetch;

  // AccumScale = Multiply(WScale, Multiply(XScale, Accum))
  using AccumScale = cutlass::epilogue::fusion::Sm90EVT<
      Multiply,
      WScale,
      cutlass::epilogue::fusion::Sm90EVT<Multiply, XScale, Accum>>;

  // EpilogueEVT = Cast(Add(Bias, AccumScale))
  using EpilogueEVT = cutlass::epilogue::fusion::Sm90EVT<
      Cast,
      cutlass::epilogue::fusion::Sm90EVT<
          Add,
          Bias,
          AccumScale>>;

  // Only the 128x128x128 tile counts as "large" for schedule selection.
  constexpr bool large_tile = std::is_same_v<
      TileShape,
      cute::Shape<cute::_128, cute::_128, cute::_128>>;

  using CollectiveEpilogue =
      typename cutlass::epilogue::collective::CollectiveBuilder<
          ArchTag,
          OperatorClass,
          TileShape,
          ClusterShape,
          cutlass::epilogue::collective::EpilogueTileAuto,
          DtypeAccum,
          DtypeEpilogue,
          DtypeOutput,
          LayoutOutput,
          AlignmentOutput,
          DtypeOutput,
          LayoutOutput,
          AlignmentOutput,
          typename Schedule<large_tile, FastAccum::value>::epilogue_type,
          EpilogueEVT>::CollectiveOp;

  // The mainloop stage count is chosen automatically after carving out the
  // size of the epilogue's SharedStorage.
  using CollectiveMainloop =
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag,
          OperatorClass,
          DtypeA,
          LayoutInputA,
          AlignmentInputA,
          DtypeB,
          LayoutInputB,
          AlignmentInputB,
          DtypeAccum,
          TileShape,
          ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(
              sizeof(typename CollectiveEpilogue::SharedStorage))>,
          typename Schedule<large_tile, FastAccum::value>::type>::
          CollectiveOp;

  using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cute::Shape<int, int, int>,
      CollectiveMainloop,
      CollectiveEpilogue>;

  using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

  using StrideInputA = typename Gemm::GemmKernel::StrideA;
  using StrideInputB = typename Gemm::GemmKernel::StrideB;
  using StrideOutput = typename Gemm::GemmKernel::StrideC;

  // Strides are derived from the tensors' actual leading strides rather than
  // from K / N.
  StrideInputA stride_a = cutlass::make_cute_packed_stride(
      StrideInputA{}, cute::make_shape(M, static_cast<int>(XQ.stride(0)), 1));
  StrideInputB stride_b = cutlass::make_cute_packed_stride(
      StrideInputB{}, cute::make_shape(N, static_cast<int>(WQ.stride(1)), 1));
  StrideOutput stride_output = cutlass::make_cute_packed_stride(
      StrideOutput{}, cute::make_shape(M, static_cast<int>(out.stride(0)), 1));

  // Epilogue pointers are listed in the order the operands appear in
  // EpilogueEVT: bias, then w_scale, then x_scale. A missing bias is passed
  // as nullptr. `out` is supplied for both output pointer slots.
  typename Gemm::Arguments arguments{
      cutlass::gemm::GemmUniversalMode::kGemm,
      {M, N, K},
      {reinterpret_cast<DtypeA*>(XQ.data_ptr()),
       stride_a,
       reinterpret_cast<DtypeB*>(WQ.data_ptr()),
       stride_b},
      {{{{bias.has_value() ? reinterpret_cast<DtypeBias*>(bias->data_ptr())
                           : nullptr},
         {{reinterpret_cast<DtypeScale*>(w_scale.data_ptr())},
          {{reinterpret_cast<DtypeScale*>(x_scale.data_ptr())}}}}},
       reinterpret_cast<DtypeOutput*>(out.data_ptr()),
       stride_output,
       reinterpret_cast<DtypeOutput*>(out.data_ptr()),
       stride_output}};

  Gemm gemm;

  // Using the arguments, query for extra workspace required for matrix
  // multiplication computation
  size_t workspace_size = Gemm::get_workspace_size(arguments);

  // Ensure persistent kernels leave enough free SMs for NCCL background ops.
  if (at::globalContext()._SMCarveout_EXPERIMENTAL().has_value()) {
    arguments.hw_info.sm_count =
        at::cuda::getDeviceProperties(out.device().index())->multiProcessorCount -
        at::globalContext()._SMCarveout_EXPERIMENTAL().value();
  }

  // Set the swizzle size
  arguments.scheduler.max_swizzle_size = swizzle;

  // Allocate workspace memory
  auto workspace = XQ.new_empty(
      {static_cast<int64_t>(workspace_size)},
      at::TensorOptions().dtype(at::kByte));

  // Check the problem size is supported or not
  cutlass::Status status = gemm.can_implement(arguments);
  if (status != cutlass::Status::kSuccess) {
    throw std::runtime_error("cutlass cannot implement");
  }

  // Initialize CUTLASS kernel with arguments and workspace pointer
  status = gemm.initialize(arguments, workspace.data_ptr());
  if (status != cutlass::Status::kSuccess) {
    throw std::runtime_error("cutlass cannot initialize");
  }

  // Launch on the current CUDA stream.
  status = gemm(at::cuda::getCurrentCUDAStream());
  if (status != cutlass::Status::kSuccess) {
    // Separate the fixed prefix from the CUTLASS status text so the message
    // does not read as one run-together word.
    throw std::runtime_error(
        std::string("cutlass cannot run: ") +
        cutlass::cutlassGetStatusString(status));
  }
  C10_CUDA_KERNEL_LAUNCH_CHECK();
}

परिवर्तित टेक्स्ट

फ़ाइल खोलें

// Cutlass rowwise kernel for SM100
template <
    typename TileShape,
    typename ClusterShape,
    typename Transposed,
    typename FastAccum,
    typename DtypeA,
    typename DtypeB,
    typename DtypeBias>
void f8f8bf16_rowwise_impl_sm100(
    at::Tensor XQ, // FP8
    at::Tensor WQ, // FP8
    at::Tensor x_scale,
    at::Tensor w_scale,
    std::optional<at::Tensor> bias,
    at::Tensor out,
    const int swizzle) {
  int M = XQ.size(0);
  int N = WQ.size(1);
  int K = XQ.size(1);

using LayoutInputA = cutlass::layout::RowMajor;
  constexpr int AlignmentInputA = 16 / sizeof(DtypeA);

using LayoutInputB = cutlass::layout::ColumnMajor;
  constexpr int AlignmentInputB = 16 / sizeof(DtypeB);

using LayoutOutput = std::conditional_t<
      Transposed::value,
      cutlass::layout::ColumnMajor,
      cutlass::layout::RowMajor>;
  constexpr int AlignmentOutput = 16 / sizeof(DtypeOutput);

// Tag indicating the minimum SM that supports the intended feature
  using ArchTag = cutlass::arch::Sm100;
  using OperatorClass = cutlass::arch::OpClassTensorOp;

// Implement rowwise scaling epilogue.
  constexpr int ColBroadcastStages = 0;
  constexpr int RowBroadcastStages = 0;

using XScale = cutlass::epilogue::fusion::
      Sm90ColBroadcast<ColBroadcastStages, TileShape, DtypeScale>;

using WScale = cutlass::epilogue::fusion::
      Sm90RowBroadcast<RowBroadcastStages, TileShape, DtypeScale>;

using EpilogueEVT = cutlass::epilogue::fusion::Sm90EVT<
      Cast,
      cutlass::epilogue::fusion::Sm90EVT<
          Add,
          Bias,
          AccumScale>>;

using EpilogueScheduleType = cutlass::epilogue::collective::EpilogueScheduleAuto;
  using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder<
      cutlass::arch::Sm100, OperatorClass,
      TileShape, ClusterShape,
      cutlass::epilogue::collective::EpilogueTileAuto,
      DtypeAccum, DtypeEpilogue,
      DtypeOutput, LayoutOutput, AlignmentOutput,
      DtypeOutput, LayoutOutput, AlignmentOutput,
      EpilogueScheduleType,
      EpilogueEVT>::CollectiveOp;

using MainloopScheduleType = cutlass::gemm::collective::KernelScheduleAuto;
  using CollectiveMainloop =
      typename cutlass::gemm::collective::CollectiveBuilder<
          ArchTag,
          OperatorClass,
          DtypeA,
          LayoutInputA,
          AlignmentInputA,
          DtypeB,
          LayoutInputB,
          AlignmentInputB,
          DtypeAccum,
          TileShape,
          ClusterShape,
          cutlass::gemm::collective::StageCountAutoCarveout<static_cast<int>(sizeof(typename CollectiveEpilogue::SharedStorage))>,
          MainloopScheduleType>::CollectiveOp;

using GemmKernel = cutlass::gemm::kernel::GemmUniversal<
      cute::Shape<int, int, int>,
      CollectiveMainloop,
      CollectiveEpilogue>;

using Gemm = cutlass::gemm::device::GemmUniversalAdapter<GemmKernel>;

using StrideInputA = typename Gemm::GemmKernel::StrideA;
  using StrideInputB = typename Gemm::GemmKernel::StrideB;
  using StrideOutput = typename Gemm::GemmKernel::StrideC;

Gemm gemm;

// Using the arguments, query for extra workspace required for matrix
  // multiplication computation
  size_t workspace_size = Gemm::get_workspace_size(arguments);

// Set the swizzle size
  arguments.scheduler.max_swizzle_size = swizzle;

// Allocate workspace memory
  auto workspace = XQ.new_empty(
      {static_cast<int64_t>(workspace_size)},
      at::TensorOptions().dtype(at::kByte));