Diff
checker
텍스트
텍스트
이미지
문서
Excel
폴더
Legal
Enterprise
데스크톱
요금제
로그인
데스크톱 앱 다운로드
텍스트 비교
두 텍스트 파일의 차이점을 찾아보세요
도구
기록
실시간 편집
변경 없는 행 숨기기
줄바꿈 비활성화
레이아웃
나란히 보기
합쳐 보기
비교 단위
스마트
단어
글자
구문 강조
언어 선택
제외
텍스트 변환
첫 변경으로
수정
Diffchecker Desktop
가장 안전하게 Diffchecker를 사용하는 방법. 데스크톱 앱을 사용하면 비교 데이터가 외부로 전송되지 않습니다!
데스크톱 앱 받기
Bad left good right
생성일
작년
비교 결과 만료 없음
초기화
내보내기
공유
설명
370 삭제
행
총
삭제
글자
총
삭제
이 기능을 계속 사용하려면 업그레이드해 주세요
Diff
checker
Pro
요금제 보기
559 행
복사
322 추가
행
총
추가
글자
총
추가
이 기능을 계속 사용하려면 업그레이드해 주세요
Diff
checker
Pro
요금제 보기
508 행
복사
//
//
// Generated by LLVM NVPTX Back-End
// Generated by LLVM NVPTX Back-End
//
//
.version 8.4
.version 8.4
.target sm_90a
.target sm_90a
.address_size 64
.address_size 64
// .globl triton_f4_to_bf16_kernel // -- Begin function triton_f4_to_bf16_kernel
// .globl triton_f4_to_bf16_kernel // -- Begin function triton_f4_to_bf16_kernel
// @triton_f4_to_bf16_kernel
// @triton_f4_to_bf16_kernel
.visible .entry triton_f4_to_bf16_kernel(
.visible .entry triton_f4_to_bf16_kernel(
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_0,
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_0,
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_1,
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_1,
복사
복사됨
복사
복사됨
.param .u32 triton_f4_to_bf16_kernel_param_2
,
.param .u32 triton_f4_to_bf16_kernel_param_2
.param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_3
)
)
.reqntid 128, 1, 1
.reqntid 128, 1, 1
{
{
.reg .pred %p<25>;
.reg .pred %p<25>;
복사
복사됨
복사
복사됨
.reg .b16 %rs<
61
>;
.reg .b16 %rs<
65
>;
.reg .b32 %r<
132>;
.reg .b32 %r<
111
>;
.reg .f32 %f<9
>;
.reg .b64 %rd<13>;
.reg .b64 %rd<13>;
.loc 1 178 0 // custom_cast.py:178:0
.loc 1 178 0 // custom_cast.py:178:0
$L__func_begin0:
$L__func_begin0:
.loc 1 178 0 // custom_cast.py:178:0
.loc 1 178 0 // custom_cast.py:178:0
// %bb.0:
// %bb.0:
ld.param.u64 %rd9, [triton_f4_to_bf16_kernel_param_0];
ld.param.u64 %rd9, [triton_f4_to_bf16_kernel_param_0];
ld.param.u64 %rd10, [triton_f4_to_bf16_kernel_param_1];
ld.param.u64 %rd10, [triton_f4_to_bf16_kernel_param_1];
$L__tmp0:
$L__tmp0:
.loc 1 194 24 // custom_cast.py:194:24
.loc 1 194 24 // custom_cast.py:194:24
복사
복사됨
복사
복사됨
mov.u32
%r5
, %ctaid.x;
// begin inline asm
ld.param.u32 %
r6
, [triton_f4_to_bf16_kernel_param_2];
mov.u32
%r1
, %ctaid.x;
// end inline asm
ld.param.u32 %
r14
, [triton_f4_to_bf16_kernel_param_2];
.loc 1 195 37 // custom_cast.py:195:37
.loc 1 195 37 // custom_cast.py:195:37
복사
복사됨
복사
복사됨
shl.b32 %
r7
, %
r6
, 1;
shl.b32 %
r15
, %
r14
, 1;
.loc 1 198 27 // custom_cast.py:198:27
.loc 1 198 27 // custom_cast.py:198:27
복사
복사됨
복사
복사됨
shl.b32 %
r8
, %
r5
, 9;
shl.b32 %
r16
, %
r1
, 9;
.loc 1 199 47 // custom_cast.py:199:47
.loc 1 199 47 // custom_cast.py:199:47
복사
복사됨
복사
복사됨
mov.u32 %
r9
, %tid.x;
mov.u32 %
r17
, %tid.x;
shl.b32 %r1
0
, %
r9
, 2;
shl.b32 %r1
8
, %
r17
, 2;
and.b32 %r1
1
, %r1
0
, 508;
and.b32 %r1
9
, %r1
8
, 508;
.loc 1 199 34 // custom_cast.py:199:34
.loc 1 199 34 // custom_cast.py:199:34
복사
복사됨
복사
복사됨
or.b32 %r
12
, %r1
1
, %
r8
;
or.b32 %r
20
, %r1
6
, %
r19
;
or.b32 %r
13
, %r
12
, 1;
or.b32 %r
21
, %r
20
, 1;
or.b32 %
r14
, %r
12
, 2;
or.b32 %
r22
, %r
20
, 2;
or.b32 %
r15
, %r
12
, 3;
or.b32 %
r23
, %r
20
, 3;
.loc 1 201 27 // custom_cast.py:201:27
.loc 1 201 27 // custom_cast.py:201:27
복사
복사됨
복사
복사됨
setp.lt.s32 %p1, %r
12
, %
r6
;
setp.lt.s32 %p1, %r
20
, %
r14
;
setp.lt.s32 %p2, %r
13
, %
r6
;
setp.lt.s32 %p2, %r
21
, %
r14
;
setp.lt.s32 %p3, %
r14
, %
r6
;
setp.lt.s32 %p3, %
r22
, %
r14
;
setp.lt.s32 %p4, %
r15
, %
r6
;
setp.lt.s32 %p4, %
r23
, %
r14
;
.loc 1 204 31 // custom_cast.py:204:31
.loc 1 204 31 // custom_cast.py:204:31
복사
복사됨
복사
복사됨
cvt.s64.s32 %rd11, %r
12
;
cvt.s64.s32 %rd11, %r
20
;
add.s64 %rd1, %rd9, %rd11;
add.s64 %rd1, %rd9, %rd11;
add.s64 %rd2, %rd1, 1;
add.s64 %rd2, %rd1, 1;
add.s64 %rd3, %rd1, 2;
add.s64 %rd3, %rd1, 2;
add.s64 %rd4, %rd1, 3;
add.s64 %rd4, %rd1, 3;
.loc 1 204 23 // custom_cast.py:204:23
.loc 1 204 23 // custom_cast.py:204:23
// begin inline asm
// begin inline asm
mov.u16 %rs1, 0x0;
mov.u16 %rs1, 0x0;
@%p1 ld.global.b8 { %rs1 }, [ %rd1 + 0 ];
@%p1 ld.global.b8 { %rs1 }, [ %rd1 + 0 ];
// end inline asm
// end inline asm
// begin inline asm
// begin inline asm
mov.u16 %rs2, 0x0;
mov.u16 %rs2, 0x0;
@%p2 ld.global.b8 { %rs2 }, [ %rd2 + 0 ];
@%p2 ld.global.b8 { %rs2 }, [ %rd2 + 0 ];
// end inline asm
// end inline asm
// begin inline asm
// begin inline asm
mov.u16 %rs3, 0x0;
mov.u16 %rs3, 0x0;
@%p3 ld.global.b8 { %rs3 }, [ %rd3 + 0 ];
@%p3 ld.global.b8 { %rs3 }, [ %rd3 + 0 ];
// end inline asm
// end inline asm
// begin inline asm
// begin inline asm
mov.u16 %rs4, 0x0;
mov.u16 %rs4, 0x0;
@%p4 ld.global.b8 { %rs4 }, [ %rd4 + 0 ];
@%p4 ld.global.b8 { %rs4 }, [ %rd4 + 0 ];
// end inline asm
// end inline asm
$L__tmp1:
$L__tmp1:
복사
복사됨
복사
복사됨
.loc 1 151 26 // custom_cast.py:151:26
and.b16 %rs5, %rs1, 1;
and.b16 %rs6, %rs2, 1;
and.b16 %rs7, %rs3, 1;
and.b16 %rs8, %rs4, 1;
.loc 1 123 29 // custom_cast.py:123:29
.loc 1 123 29 // custom_cast.py:123:29
복사
복사됨
복사
복사됨
shr.u16 %
rs9
, %rs1, 4;
shr.u16 %
rs13
, %rs1, 4;
and.b
16
%rs1
0
, %rs
9, 15
;
shr.u
16
%rs14, %rs2, 4;
shr.u16
%rs1
5
, %rs
3, 4;
shr.u16 %rs16, %rs4, 4
;
.loc 1 131 18 // custom_cast.py:131:18
.loc 1 131 18 // custom_cast.py:131:18
복사
복사됨
복사
복사됨
mov.b32 %r16, {
%rs1
0
, %rs1
1}
;
and.b16
%rs1
7
, %rs1
3, 8
;
mov.b32
%
r17, {
%rs1
0
, %rs1
}
;
and.b16
%
rs18,
%rs1
4, 8;
and.
b32
%
r18
, %
r17, 524296
;
and.b16 %rs19
, %rs1
5, 8
;
and.
b16
%
rs20
, %
rs16, 8
;
.loc 1 134 16 // custom_cast.py:134:16
.loc 1 134 16 // custom_cast.py:134:16
복사
복사됨
복사
복사됨
and.
b32 %r19
, %
r17, 458759
;
and.
b16 %rs21, %rs13, 7;
and.b16 %rs22, %rs1, 6;
and.b16 %rs23, %rs1, 7;
and.b16 %rs24, %rs14, 7;
and.b16 %rs25, %rs2, 6;
and.b16 %rs26, %rs2, 7;
and.b16 %rs27, %rs15, 7;
and.b16 %rs28, %rs3, 6;
and.b16 %rs29
, %
rs3, 7;
and.b16 %rs30, %rs16, 7;
and.b16 %rs31, %rs4, 6;
and.b16 %rs32, %rs4, 7
;
.loc 1 137 25 // custom_cast.py:137:25
.loc 1 137 25 // custom_cast.py:137:25
복사
복사됨
복사
복사됨
mov.b32 {%rs12, %rs13}, %r19;
setp.eq.s16 %p9, %rs
21
, 0;
setp.eq.s16 %p9, %rs
12
, 0;
setp.eq.s16 %p10, %rs
2
3, 0;
setp.eq.s16 %p10, %rs
1
3, 0;
setp.eq.s16 %p11, %rs24, 0;
setp.eq.s16 %p12, %rs26, 0;
setp.eq.s16 %p13, %rs27, 0;
setp.eq.s16 %p14, %rs29, 0;
setp.eq.s16 %p15, %rs30, 0;
setp.eq.s16 %p16, %rs32, 0;
.loc 1 143 29 // custom_cast.py:143:29
.loc 1 143 29 // custom_cast.py:143:29
복사
복사됨
복사
복사됨
setp.eq.s16 %p1
1
, %
rs13
, 1;
setp.eq.s16 %p1
7
, %
rs21, 1;
setp.eq.s16 %
p12, %rs12, 1;
setp.eq.s16 %p18, %rs23
, 1;
setp.eq.s16 %
p19, %rs24, 1;
setp.eq.s16 %p20, %rs26, 1;
setp.eq.s16 %p21, %rs27, 1;
setp.eq.s16 %p22, %rs29, 1;
setp.eq.s16 %p23, %rs30, 1;
setp.eq.s16 %p24, %rs32, 1;
.loc 1 146 29 // custom_cast.py:146:29
.loc 1 146 29 // custom_cast.py:146:29
복사
복사됨
복사
복사됨
shr.u16 %
rs14
, %
rs12
, 1;
shr.u16 %
rs33, %rs1, 5;
shr.u16 %
rs15, %rs13, 1;
and.b16 %rs34
, %
rs33, 3;
shr.u16 %rs35, %rs22
, 1;
shr.u16 %
rs36, %rs2, 5;
and.b16 %rs37, %rs36, 3;
shr.u16 %rs38, %rs25, 1;
shr.u16 %rs39, %rs3, 5;
and.b16 %rs40, %rs39, 3;
shr.u16 %rs41, %rs28, 1;
shr.u16 %rs42, %rs4, 5;
and.b16 %rs43, %rs42, 3;
shr.u16 %rs44, %rs31, 1;
.loc 1 147 56 // custom_cast.py:147:56
.loc 1 147 56 // custom_cast.py:147:56
복사
복사됨
복사
복사됨
add.s16 %
rs16
, %rs
1
4, 126;
add.s16 %
rs45
, %rs
3
4, 126;
.loc 1 151 26 // custom_cast.py:151:26
add.s16 %rs46
, %rs
35, 126
;
mov.b32 %r20, {%rs17
, %rs
15}
;
a
dd.s16
%
rs47
, %
rs37, 126
;
a
nd.b32
%
r21
, %
r16, 1;
add.s16
%
rs48
, %
rs38, 126
;
mov.b32 %r22, 8257536
;
add.s
16
%rs49
, %
rs40, 126;
add.s16
x2
%
r23
, %
r20, %r22
;
add.s
16 %
rs50
, %rs
41, 126
;
.loc 1 152 34 // custom_cast.py:152:34
add.s
16
%rs
51
, %
rs43, 126;
{ .reg .b
16
tmp; mov.b32 {tmp
, %
rs18}, %r23; }
add.s
16 %
rs52
, %
rs44, 126
;
cvt.u32.u
16 %
r24
, %rs
18
;
{ .reg .b
16
tmp; mov.b32 {
%rs
19, tmp}
, %
r21; }
cvt.u32.u
16 %
r25
, %
rs19
;
.loc 1 148 39 // custom_cast.py:148:39
.loc 1 148 39 // custom_cast.py:148:39
복사
복사됨
복사
복사됨
cvt.u32.u16 %r26, %
rs16
;
cvt.u32.u16 %r24, %rs45;
cvt.u32.u16 %r27, %
rs5
;
cvt.u32.u16 %r25, %rs46;
.loc 1 152 48 // custom_cast.py:152:48
cvt.u32.u16 %r26, %
rs47
;
shl.b32
%r28, %
r24, 23
;
cvt.u32.u16 %r27, %
rs48
;
shl.b32
%r29, %
r25, 22
;
cvt.u32.u16
%r28, %
rs49
;
cvt.u32.u16
%r29, %
rs50;
cvt.u32.u16 %r30, %rs51;
cvt.u32.u16 %r31, %rs52
;
.loc 1 148 52 // custom_cast.py:148:52
.loc 1 148 52 // custom_cast.py:148:52
복사
복사됨
복사
복사됨
shl.b32 %r3
0
, %r2
6
, 23;
shl.b32 %r3
2
, %r2
4
, 23;
shl.b32 %
r31, %r27, 22;
shl.b32 %
r33, %r25, 23;
.loc 1 156 30 // custom_cast.py:156:30
shl.b32 %r34, %r26, 23;
or.b32 %r32, %r29, %r30;
shl.b32 %r35, %r27, 23;
or.b32 %r33, %r28, %r31;
shl.b32 %r36, %r28, 23;
.loc 1 158 48 // custom_cast.py:158:48
shl.b32 %r37, %r29, 23;
selp.b32 %r34, 0, %r33, %p10;
shl.b32
%r38, %r30, 23;
selp.b32 %r35, 0, %r32, %p9;
shl.b32 %r39, %r31, 23;
.loc 1 160 63 // custom_cast.py:160:63
selp.b32 %r36, 1056964608, %r35, %p12;
selp.b32 %r37, 1056964608, %r34, %p11;
.loc 1 163 26 // custom_cast.py:163:26
mov.b32 {%rs20, %rs21}, %r18;
cvt.u32.u16 %r38, %rs21;
cvt.u32.u16 %r39, %rs20;
.loc 1 164 8 // custom_cast.py:164:8
shl.b32 %r40, %r39, 28;
shl.b32
%r41, %r38, 28;
.loc 1 166 22 // custom_cast.py:166:22
or.b32 %r42, %r37, %r41;
or.b32 %r43, %r36, %r40;
.loc 1 173 23 // custom_cast.py:173:23
mov.b32 %f1, %r43;
mov.b32 %f2, %r42;
.loc 1 174 23 // custom_cast.py:174:23
cvt.rn.bf16x2.f32 %r1, %f2, %f1;
.loc 1 123 29 // custom_cast.py:123:29
shr.u16 %rs22, %rs2, 4;
and.b16 %rs23, %rs22, 15;
.loc 1 131 18 // custom_cast.py:131:18
mov.b32 %r44, {%rs23, %rs24};
mov.b32 %r45, {%rs23, %rs2};
and.b32 %r46, %r45, 524296;
.loc 1 134 16 // custom_cast.py:134:16
and.b32 %r47, %r45, 458759;
.loc 1 137 25 // custom_cast.py:137:25
mov.b32 {%rs25, %rs26}, %r47;
setp.eq.s16 %p13, %rs25, 0;
setp.eq.s16 %p14, %rs26, 0;
.loc 1 143 29 // custom_cast.py:143:29
setp.eq.s16 %p15, %rs26, 1;
setp.eq.s16 %p16, %rs25, 1;
.loc 1 146 29 // custom_cast.py:146:29
shr.u16 %rs27, %rs25, 1;
shr.u16 %rs28, %rs26, 1;
.loc 1 147 56 // custom_cast.py:147:56
add.s16 %rs29, %rs27, 126;
.loc 1 151 26 // custom_cast.py:151:26
.loc 1 151 26 // custom_cast.py:151:26
복사
복사됨
복사
복사됨
mov.b32 %r48, {
%rs
30
, %
rs28}
;
and.b16
%rs
53
, %
rs13, 1
;
and.
b32
%
r49
, %
r44
, 1;
and.
b16
%
rs54
, %
rs14
, 1;
a
dd.s16x2
%
r50
, %
r48
, %
r22
;
a
nd.b16
%
rs55
, %
rs15, 1;
and.b16 %rs56
, %
rs16, 1
;
.loc 1 152 34 // custom_cast.py:152:34
.loc 1 152 34 // custom_cast.py:152:34
복사
복사됨
복사
복사됨
{ .reg .b16 tmp; mov.b32 {tmp, %rs31}, %r50; }
cvt.u32.u16 %r40, %rs53;
cvt.u32.u16 %r51, %rs31;
$L__tmp2:
{ .reg .b16 tmp; mov.b32 {%rs32, tmp}, %r49; }
.loc 1 204 23 // custom_cast.py:204:23
cvt.u32.u16 %r52, %rs32;
cvt.
u32.u16 %r41, %rs1;
.loc 1 148 39 // custom_cast.py:148:39
$L__tmp3:
cvt.u32.u16 %r53, %rs29;
cvt.u32.u16 %r54, %rs6;
.loc 1 152 48 // custom_cast.py:152:48
shl.b32 %r55, %r51, 23;
shl.b32 %r56, %r52, 22;
.loc 1 148 52 // custom_cast.py:148:52
shl.b32 %r57, %r53, 23;
shl.b32 %r58, %r54, 22;
.loc 1 156 30 // custom_cast.py:156:30
or.b32 %r59, %r56, %r57;
or.b32 %r60, %r55, %r58;
.loc 1 158 48 // custom_cast.py:158:48
selp.b32 %r61, 0, %r60, %p14;
selp.b32 %r62, 0, %r59, %p13;
.loc 1 160 63 // custom_cast.py:160:63
selp.b32 %r63, 1056964608, %r62, %p16;
selp.b32 %r64, 1056964608, %r61, %p15;
.loc 1 163 26 // custom_cast.py:163:26
mov.b32 {%rs33, %rs34}, %r46;
cvt.u32.u16 %r65, %rs34;
cvt.u32.u16 %r66, %rs33;
.loc 1 164 8 // custom_cast.py:164:8
shl.b32 %r67, %r66, 28;
shl.b32 %r68, %r65, 28;
.loc 1 166 22 // custom_cast.py:166:22
or.b32 %r69, %r64, %r68;
or.b32 %r70, %r63, %r67;
.loc 1 173 23 // custom_cast.py:173:23
mov.b32 %f3, %r70;
mov.b32 %f4, %r69;
.loc 1 174 23 // custom_cast.py:174:23
cvt.
rn.bf16x2.f32 %r2, %f4, %f3;
.loc 1 123 29 // custom_cast.py:123:29
shr.u16 %rs35, %rs3, 4;
and.b16 %rs36, %rs35, 15;
.loc 1 131 18 // custom_cast.py:131:18
mov.b32 %r71, {%rs36, %rs37};
mov.b32 %r72, {%rs36, %rs3};
and.b32 %r73, %r72, 524296;
.loc 1 134 16 // custom_cast.py:134:16
and.b32 %r74, %r72, 458759;
.loc 1 137 25 // custom_cast.py:137:25
mov.b32 {%rs38, %rs39}, %r74;
setp.eq.s16 %p17, %rs38, 0;
setp.eq.s16 %p18, %rs39, 0;
.loc 1 143 29 // custom_cast.py:143:29
setp.eq.s16 %p19, %rs39, 1;
setp.eq.s16 %p20, %rs38, 1;
.loc 1 146 29 // custom_cast.py:146:29
shr.u16 %rs40, %rs38, 1;
shr.u16 %rs41, %rs39, 1;
.loc 1 147 56 // custom_cast.py:147:56
add.s16 %rs42, %rs40, 126;
.loc 1 151 26 // custom_cast.py:151:26
mov.b32 %r75, {%rs43, %rs41};
and.b32 %r76, %r71, 1;
add.s16x2 %r77, %r75, %r22;
.loc 1 152 34 // custom_cast.py:152:34
.loc 1 152 34 // custom_cast.py:152:34
복사
복사됨
복사
복사됨
{ .reg .b16 tmp; mov.b32 {tmp, %rs44}, %r77; }
and.b32 %r42, %r41, 1;
cvt.u32.u16 %r78, %rs44;
cvt.u32.u16 %r43, %rs54;
{ .reg .b16 tmp; mov.b32 {%rs45, tmp}, %r76; }
$L__tmp4:
cvt.u32.u16 %r79, %rs45;
.loc 1 204
23 // custom_cast.py:
204:23
.loc 1 148 39 // custom_cast.py:148:39
cvt.
u32.u16 %r44, %rs2;
cvt.u32.u16 %r80, %rs42;
$L__tmp5:
cvt.u32.u16 %r81, %rs7;
.loc 1 152 48 // custom_cast.py:152:48
shl.b32 %r82, %r78, 23;
shl.b32 %r83, %r79, 22;
.loc 1 148 52 // custom_cast.py:148:52
shl.b32 %r84, %r80, 23;
shl.b32 %r85, %r81, 22;
.loc 1 156 30 // custom_cast.py:156:30
or.b32 %r86, %r83, %r84;
or.b32 %r87, %r82, %r85;
.loc 1 158 48 // custom_cast.py:158:48
selp.b32 %r88, 0, %r87, %p18;
selp.b32 %r89, 0, %r86, %p17;
.loc 1 160 63 // custom_cast.py:160:63
selp.b32 %r90, 1056964608, %r89, %p20;
selp.b32 %r91, 1056964608, %r88, %p19;
.loc 1 163 26 // custom_cast.py:163:26
mov.b32 {%rs46, %rs47}, %r73;
cvt.u32.u16 %r92, %rs47;
cvt.u32.u16 %r93, %rs46;
.loc 1 164 8 // custom_cast.py:164:8
shl.b32 %r94, %r93, 28;
shl.b32 %r95, %r92, 28;
.loc 1 166 22 // custom_cast.py:166:22
or.b32 %r96, %r91, %r95;
or.b32 %r97, %r90, %r94;
.loc 1 173
23 // custom_cast.py:
173:23
mov.b32 %f5, %r97;
mov.b32 %f6, %r96;
.loc 1 174 23 // custom_cast.py:174:23
cvt.
rn.bf16x2.f32 %r3, %f6, %f5;
.loc 1 123 29 // custom_cast.py:123:29
shr.u16 %rs48, %rs4, 4;
and.b16 %rs49, %rs48, 15;
.loc 1 131 18 // custom_cast.py:131:18
mov.b32 %r98, {%rs49, %rs50};
mov.b32 %r99, {%rs49, %rs4};
and.b32 %r100, %r99, 524296;
.loc 1 134 16 // custom_cast.py:134:16
and.b32 %r101, %r99, 458759;
.loc 1 137 25 // custom_cast.py:137:25
mov.b32 {%rs51, %rs52}, %r101;
setp.eq.s16 %p21, %rs51, 0;
setp.eq.s16 %p22, %rs52, 0;
.loc 1 143 29 // custom_cast.py:143:29
setp.eq.s16 %p23, %rs52, 1;
setp.eq.s16 %p24, %rs51, 1;
.loc 1 146 29 // custom_cast.py:146:29
shr.u16 %rs53, %rs51, 1;
shr.u16 %rs54, %rs52, 1;
.loc 1 147 56 // custom_cast.py:147:56
add.s16 %rs55, %rs53, 126;
.loc 1 151 26 // custom_cast.py:151:26
mov.b32 %r102, {%rs56, %rs54};
and.b32 %r103, %r98, 1;
add.s16x2 %r104, %r102, %r22;
.loc 1 152 34 // custom_cast.py:152:34
.loc 1 152 34 // custom_cast.py:152:34
복사
복사됨
복사
복사됨
{ .reg .b16 tmp; mov
.b32
{tmp, %rs57}
, %
r104; }
and
.b32
%r45
, %
r44, 1;
cvt.u32.u16 %
r105
, %rs5
7;
cvt.u32.u16 %
r46
, %rs5
5;
{ .reg .b16
tmp
; mov.b32 {%rs58, tmp}, %r103; }
$L__
tmp
6:
cvt.u32.u16 %
r106
, %
rs58;
.loc 1 204 23 // custom_cast.py:204:23
.loc 1
148 39
// custom_cast.py:
148:39
cvt.u32.u16 %
r47
, %
rs3;
cvt.u32.u16 %
r107
, %rs5
5;
$L__tmp7:
cvt.u32.u16 %
r108
, %rs
8;
.loc 1
152 34
// custom_cast.py:
152:34
and.b32 %r48, %r47, 1;
cvt.u32.u16 %
r49
, %rs5
6;
$L__tmp8:
.loc 1 204 23 // custom_cast.py:204:23
cvt.u32.u16 %
r50
, %rs
4;
$L__tmp9:
.loc 1 152 34 // custom_cast.py:152:34
and.b32 %r51, %r50, 1;
.loc 1 152 48 // custom_cast.py:152:48
.loc 1 152 48 // custom_cast.py:152:48
복사
복사됨
복사
복사됨
shl.b32 %
r109
, %
r105, 23
;
shl.b32 %
r52
, %
r40, 22;
shl.b32 %
r110
, %
r106
, 22;
shl.b32 %r53, %r42, 22;
.loc 1 148 52 // custom_cast.py:148:52
shl.b32 %r54, %r43, 22;
shl.b32 %
r111
, %
r107, 23
;
shl.b32 %r55, %r45, 22
;
shl.b32 %
r112
, %
r108
, 22;
shl.b32 %
r56
, %
r46
, 22;
shl.b32 %r57, %r48, 22;
shl.b32 %
r58
, %
r49, 22
;
shl.b32 %
r59
, %
r51
, 22;
.loc 1 156 30 // custom_cast.py:156:30
.loc 1 156 30 // custom_cast.py:156:30
복사
복사됨
복사
복사됨
or.b32 %
r113
, %
r110
, %
r111
;
or.b32 %
r60, %r32, %r52;
or.b32 %
r114
, %
r109
, %
r112
;
or.b32 %r61, %r33
, %
r53;
or.b32 %r62
, %
r34, %r54
;
or.b32 %
r63, %r35, %r55;
or.b32 %r64
, %
r36, %r56;
or.b32 %r65, %r37, %r57;
or.b32 %r66, %r38, %r58;
or.b32 %r67, %r39
, %
r59
;
.loc 1 158 48 // custom_cast.py:158:48
.loc 1 158 48 // custom_cast.py:158:48
복사
복사됨
복사
복사됨
selp.b32 %
r115, 0, %r114, %p2
2;
selp.b32 %
r68, 0, %r60, %p9;
selp.b32 %
r116
, 0, %
r113, %p21;
selp.b32 %r69, 0, %r61, %p10;
selp.b32 %r70, 0, %r62, %p11;
selp.b32 %r71, 0, %r63, %p1
2;
selp.b32 %
r72
, 0, %
r64, %p13;
selp.b32 %r73, 0, %r65, %p14;
selp.b32 %r74, 0, %r66, %p15;
selp.b32 %r75, 0, %r67, %p16;
.loc 1 160 63 // custom_cast.py:160:63
.loc 1 160 63 // custom_cast.py:160:63
복사
복사됨
복사
복사됨
selp.b32 %
r117
, 1056964608, %
r116
, %
p24
;
selp.b32 %
r76, 1056964608, %r68, %p17;
selp.b32 %
r118
, 1056964608, %
r115, %p23;
selp.b32 %r77
, 1056964608, %
r69
, %
p18
;
selp.b32 %
r78
, 1056964608, %
r70, %p19;
selp.b32 %r79, 1056964608, %r71, %p20;
selp.b32 %r80, 1056964608, %r72, %p21;
selp.b32 %r81, 1056964608, %r73, %p22;
selp.b32 %r82, 1056964608, %r74, %p23;
selp.b32 %r83, 1056964608, %r75, %p24;
.loc 1 163 26 // custom_cast.py:163:26
.loc 1 163 26 // custom_cast.py:163:26
복사
복사됨
복사
복사됨
mov
.b32
{%rs59
, %
rs60}
, %
r100
;
cvt.u32.u16 %r84, %rs17;
cvt.u32.u16 %
r119
, %
rs60
;
and
.b32
%r85, %r41, 8;
cvt.u32.u16 %
r120
, %
rs59
;
cvt.u32.u16 %r86
, %
rs18;
and.b32 %r87
, %
r44, 8
;
cvt.u32.u16 %
r88
, %
rs19;
and.b32 %r89, %r47, 8
;
cvt.u32.u16 %
r90
, %
rs20;
and.b32 %r91, %r50, 8
;
.loc 1 164 8 // custom_cast.py:164:8
.loc 1 164 8 // custom_cast.py:164:8
복사
복사됨
복사
복사됨
shl.b32 %
r121, %r120
, 28;
shl.b32 %
r92, %r84, 28;
shl.b32 %
r122
, %
r119
, 28;
shl.b32 %r93, %r85, 28;
shl.b32 %r94, %r86, 28;
shl.b32 %r95, %r87, 28;
shl.b32 %r96, %r88, 28;
shl.b32 %r97, %r89, 28;
shl.b32 %r98, %r90
, 28;
shl.b32 %
r99
, %
r91
, 28;
.loc 1 166 22 // custom_cast.py:166:22
.loc 1 166 22 // custom_cast.py:166:22
복사
복사됨
복사
복사됨
or.b32 %r
123
, %
r118
, %
r122
;
or.b32 %r
2
, %
r76
, %
r92
;
or.b32 %
r124, %r117, %r121;
or.b32 %
r3, %r77, %r93;
.loc 1 173 23 // custom_cast.py:173:23
or.b32 %r4, %r78, %r94;
mov.b32 %f7, %r124;
or.b32 %r5, %r79, %r95;
mov.b32 %f8, %r123;
or.b32 %r6, %r80, %r96;
or.b32 %r7, %r81, %r97;
or.b32 %r8, %r82, %r98;
or.b32 %r9, %r83, %r99;
.loc 1 174 23 // custom_cast.py:174:23
.loc 1 174 23 // custom_cast.py:174:23
복사
복사됨
복사
복사됨
cvt.rn.bf16x2.f32 %r4, %f8, %f7;
// begin inline asm
$L__tmp2:
cvt.rn.bf16.f32 %rs5, %r2;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs6, %r3;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs7, %r4;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs8, %r5;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs9, %r6;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs10, %r7;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs11, %r8;
// end inline asm
// begin inline asm
cvt.rn.bf16.f32 %rs12, %r9;
// end inline asm
$L__tmp10:
.loc 1 220 28 // custom_cast.py:220:28
.loc 1 220 28 // custom_cast.py:220:28
복사
복사됨
복사
복사됨
shl.b32 %
r125
, %
r5
, 10;
shl.b32 %
r100
, %
r1
, 10;
.loc 1 221 49 // custom_cast.py:221:49
.loc 1 221 49 // custom_cast.py:221:49
복사
복사됨
복사
복사됨
shl.b32 %
r126
, %
r9
, 3;
shl.b32 %
r101
, %
r17
, 3;
and.b32 %r1
27
, %
r126
, 1016;
and.b32 %r1
02
, %
r101
, 1016;
.loc 1 221 36 // custom_cast.py:221:36
.loc 1 221 36 // custom_cast.py:221:36
복사
복사됨
복사
복사됨
or.b32 %
r128
, %
r127
, %r1
25
;
or.b32 %
r103
, %
r100
, %r1
02
;
or.b32 %
r129
, %
r128
, 2;
or.b32 %
r104
, %
r103
, 2;
or.b32 %r1
30
, %
r128
, 4;
or.b32 %r1
05
, %
r103
, 4;
or.b32 %
r131
, %
r128
, 6;
or.b32 %
r106
, %
r103
, 6;
.loc 1 222 29 // custom_cast.py:222:29
.loc 1 222 29 // custom_cast.py:222:29
복사
복사됨
복사
복사됨
setp.lt.s32 %p5, %
r128
, %
r7
;
setp.lt.s32 %p5, %
r103
, %
r15
;
setp.lt.s32 %p6, %
r129
, %
r7
;
setp.lt.s32 %p6, %
r104
, %
r15
;
setp.lt.s32 %p7, %r1
30
, %
r7
;
setp.lt.s32 %p7, %r1
05
, %
r15
;
setp.lt.s32 %p8, %
r131
, %
r7
;
setp.lt.s32 %p8, %
r106
, %
r15
;
.loc 1 224 26 // custom_cast.py:224:26
.loc 1 224 26 // custom_cast.py:224:26
복사
복사됨
복사
복사됨
mul.wide.s32 %rd12, %
r128
, 2;
mul.wide.s32 %rd12, %
r103
, 2;
add.s64 %rd5, %rd10, %rd12;
add.s64 %rd5, %rd10, %rd12;
add.s64 %rd6, %rd5, 4;
add.s64 %rd6, %rd5, 4;
add.s64 %rd7, %rd5, 8;
add.s64 %rd7, %rd5, 8;
add.s64 %rd8, %rd5, 12;
add.s64 %rd8, %rd5, 12;
.loc 1 224 39 // custom_cast.py:224:39
.loc 1 224 39 // custom_cast.py:224:39
복사
복사됨
복사
복사됨
mov.b32 %r107, {%rs5, %rs6};
// begin inline asm
// begin inline asm
복사
복사됨
복사
복사됨
@%p5 st.global.b32 [ %rd5 + 0 ], { %r1
};
@%p5 st.global.b32 [ %rd5 + 0 ], { %r1
07
};
// end inline asm
// end inline asm
복사
복사됨
복사
복사됨
mov.b32 %r108, {%rs7, %rs8};
// begin inline asm
// begin inline asm
복사
복사됨
복사
복사됨
@%p6 st.global.b32 [ %rd6 + 0 ], { %
r2
};
@%p6 st.global.b32 [ %rd6 + 0 ], { %
r108
};
// end inline asm
// end inline asm
복사
복사됨
복사
복사됨
mov.b32 %r109, {%rs9, %rs10};
// begin inline asm
// begin inline asm
복사
복사됨
복사
복사됨
@%p7 st.global.b32 [ %rd7 + 0 ], { %
r3
};
@%p7 st.global.b32 [ %rd7 + 0 ], { %
r109
};
// end inline asm
// end inline asm
복사
복사됨
복사
복사됨
mov.b32 %r110, {%rs11, %rs12};
// begin inline asm
// begin inline asm
복사
복사됨
복사
복사됨
@%p8 st.global.b32 [ %rd8 + 0 ], { %
r4
};
@%p8 st.global.b32 [ %rd8 + 0 ], { %
r110
};
// end inline asm
// end inline asm
.loc 1 224 4 // custom_cast.py:224:4
.loc 1 224 4 // custom_cast.py:224:4
ret;
ret;
복사
복사됨
복사
복사됨
$L__tmp
3
:
$L__tmp
11
:
$L__func_end0:
$L__func_end0:
// -- End function
// -- End function
}
}
.file 1 "/home/drisspg/meta/ao/torchao/prototype/mx_formats/custom_cast.py"
.file 1 "/home/drisspg/meta/ao/torchao/prototype/mx_formats/custom_cast.py"
.section .debug_abbrev
.section .debug_abbrev
{
{
.b8 1 // Abbreviation Code
.b8 1 // Abbreviation Code
.b8 17 // DW_TAG_compile_unit
.b8 17 // DW_TAG_compile_unit
.b8 1 // DW_CHILDREN_yes
.b8 1 // DW_CHILDREN_yes
.b8 37 // DW_AT_producer
.b8 37 // DW_AT_producer
.b8 8 // DW_FORM_string
.b8 8 // DW_FORM_string
.b8 19 // DW_AT_language
.b8 19 // DW_AT_language
.b8 5 // DW_FORM_data2
.b8 5 // DW_FORM_data2
.b8 3 // DW_AT_name
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 8 // DW_FORM_string
.b8 16 // DW_AT_stmt_list
.b8 16 // DW_AT_stmt_list
.b8 6 // DW_FORM_data4
.b8 6 // DW_FORM_data4
.b8 27 // DW_AT_comp_dir
.b8 27 // DW_AT_comp_dir
.b8 8 // DW_FORM_string
.b8 8 // DW_FORM_string
.b8 0 // EOM(1)
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(2)
.b8 2 // Abbreviation Code
.b8 2 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 46 // DW_TAG_subprogram
.b8 0 // DW_CHILDREN_no
.b8 0 // DW_CHILDREN_no
.b8 3 // DW_AT_name
.b8 3 // DW_AT_name
.b8 8 // DW_FORM_string
.b8 8 // DW_FORM_string
.b8 32 // DW_AT_inline
.b8 32 // DW_AT_inline
.b8 11 // DW_FORM_data1
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(2)
.b8 3 // Abbreviation Code
.b8 3 // Abbreviation Code
.b8 46 // DW_TAG_subprogram
.b8 46 // DW_TAG_subprogram
.b8 1 // DW_CHILDREN_yes
.b8 1 // DW_CHILDREN_yes
.b8 17 // DW_AT_low_pc
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 1 // DW_FORM_addr
.b8 49 // DW_AT_abstract_origin
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 19 // DW_FORM_ref4
.b8 0 // EOM(1)
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(2)
.b8 4 // Abbreviation Code
.b8 4 // Abbreviation Code
.b8 29 // DW_TAG_inlined_subroutine
.b8 29 // DW_TAG_inlined_subroutine
.b8 0 // DW_CHILDREN_no
.b8 0 // DW_CHILDREN_no
.b8 49 // DW_AT_abstract_origin
.b8 49 // DW_AT_abstract_origin
.b8 19 // DW_FORM_ref4
.b8 19 // DW_FORM_ref4
.b8 17 // DW_AT_low_pc
.b8 17 // DW_AT_low_pc
.b8 1 // DW_FORM_addr
.b8 1 // DW_FORM_addr
.b8 18 // DW_AT_high_pc
.b8 18 // DW_AT_high_pc
.b8 1 // DW_FORM_addr
.b8 1 // DW_FORM_addr
.b8 88 // DW_AT_call_file
.b8 88 // DW_AT_call_file
.b8 11 // DW_FORM_data1
.b8 11 // DW_FORM_data1
.b8 89 // DW_AT_call_line
.b8 89 // DW_AT_call_line
.b8 11 // DW_FORM_data1
.b8 11 // DW_FORM_data1
.b8 87 // DW_AT_call_column
.b8 87 // DW_AT_call_column
.b8 11 // DW_FORM_data1
.b8 11 // DW_FORM_data1
.b8 0 // EOM(1)
.b8 0 // EOM(1)
.b8 0 // EOM(2)
.b8 0 // EOM(2)
.b8 0 // EOM(3)
.b8 0 // EOM(3)
}
}
.section .debug_info
.section .debug_info
{
{
.b32 161 // Length of Unit
.b32 161 // Length of Unit
.b8 2 // DWARF version number
.b8 2 // DWARF version number
.b8 0
.b8 0
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b32 .debug_abbrev // Offset Into Abbrev. Section
.b8 8 // Address Size (in bytes)
.b8 8 // Address Size (in bytes)
.b8 1 // Abbrev [1] 0xb:0x9a DW_TAG_compile_unit
.b8 1 // Abbrev [1] 0xb:0x9a DW_TAG_compile_unit
.b8 116 // DW_AT_producer
.b8 116 // DW_AT_producer
.b8 114
.b8 114
.b8 105
.b8 105
.b8 116
.b8 116
.b8 111
.b8 111
.b8 110
.b8 110
.b8 0
.b8 0
.b8 2 // DW_AT_language
.b8 2 // DW_AT_language
.b8 0
.b8 0
.b8 99 // DW_AT_name
.b8 99 // DW_AT_name
.b8 117
.b8 117
.b8 115
.b8 115
.b8 116
.b8 116
.b8 111
.b8 111
.b8 109
.b8 109
.b8 95
.b8 95
.b8 99
.b8 99
.b8 97
.b8 97
.b8 115
.b8 115
.b8 116
.b8 116
.b8 46
.b8 46
.b8 112
.b8 112
.b8 121
.b8 121
.b8 0
.b8 0
.b32 .debug_line // DW_AT_stmt_list
.b32 .debug_line // DW_AT_stmt_list
.b8 47 // DW_AT_comp_dir
.b8 47 // DW_AT_comp_dir
.b8 104
.b8 104
.b8 111
.b8 111
.b8 109
.b8 109
.b8 101
.b8 101
.b8 47
.b8 47
.b8 100
.b8 100
.b8 114
.b8 114
.b8 105
.b8 105
.b8 115
.b8 115
.b8 115
.b8 115
.b8 112
.b8 112
.b8 103
.b8 103
.b8 47
.b8 47
.b8 109
.b8 109
.b8 101
.b8 101
.b8 116
.b8 116
.b8 97
.b8 97
.b8 47
.b8 47
.b8 97
.b8 97
.b8 111
.b8 111
.b8 47
.b8 47
.b8 116
.b8 116
.b8 111
.b8 111
.b8 114
.b8 114
.b8 99
.b8 99
.b8 104
.b8 104
.b8 97
.b8 97
.b8 111
.b8 111
.b8 47
.b8 47
.b8 112
.b8 112
.b8 114
.b8 114
.b8 111
.b8 111
.b8 116
.b8 116
.b8 111
.b8 111
.b8 116
.b8 116
.b8 121
.b8 121
.b8 112
.b8 112
.b8 101
.b8 101
.b8 47
.b8 47
.b8 109
.b8 109
.b8 120
.b8 120
.b8 95
.b8 95
.b8 102
.b8 102
.b8 111
.b8 111
.b8 114
.b8 114
.b8 109
.b8 109
.b8 97
.b8 97
.b8 116
.b8 116
.b8 115
.b8 115
.b8 0
.b8 0
.b8 2 // Abbrev [2] 0x5b:0x1b DW_TAG_subprogram
.b8 2 // Abbrev [2] 0x5b:0x1b DW_TAG_subprogram
.b8 116 // DW_AT_name
.b8 116 // DW_AT_name
.b8 114
.b8 114
.b8 105
.b8 105
.b8 116
.b8 116
.b8 111
.b8 111
.b8 110
.b8 110
.b8 95
.b8 95
.b8 102
.b8 102
.b8 52
.b8 52
.b8 95
.b8 95
.b8 116
.b8 116
.b8 111
.b8 111
.b8 95
.b8 95
.b8 98
.b8 98
.b8 102
.b8 102
.b8 49
.b8 49
.b8 54
.b8 54
.b8 95
.b8 95
.b8 107
.b8 107
.b8 101
.b8 101
.b8 114
.b8 114
.b8 110
.b8 110
.b8 101
.b8 101
.b8 108
.b8 108
.b8 0
.b8 0
.b8 1 // DW_AT_inline
.b8 1 // DW_AT_inline
.b8 3 // Abbrev [3] 0x76:0x2e DW_TAG_subprogram
.b8 3 // Abbrev [3] 0x76:0x2e DW_TAG_subprogram
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_begin0 // DW_AT_low_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b64 $L__func_end0 // DW_AT_high_pc
.b32 91 // DW_AT_abstract_origin
.b32 91 // DW_AT_abstract_origin
.b8 4 // Abbrev [4] 0x8b:0x18 DW_TAG_inlined_subroutine
.b8 4 // Abbrev [4] 0x8b:0x18 DW_TAG_inlined_subroutine
.b32 91 // DW_AT_abstract_origin
.b32 91 // DW_AT_abstract_origin
.b64 $L__tmp1 // DW_AT_low_pc
.b64 $L__tmp1 // DW_AT_low_pc
복사
복사됨
복사
복사됨
.b64 $L__tmp
2
// DW_AT_high_pc
.b64 $L__tmp
10
// DW_AT_high_pc
.b8 1 // DW_AT_call_file
.b8 1 // DW_AT_call_file
.b8 216 // DW_AT_call_line
.b8 216 // DW_AT_call_line
.b8 8 // DW_AT_call_column
.b8 8 // DW_AT_call_column
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
.b8 0 // End Of Children Mark
}
}
.section .debug_macinfo { }
.section .debug_macinfo { }
저장된 비교 결과
원본
파일 열기
// // Generated by LLVM NVPTX Back-End // .version 8.4 .target sm_90a .address_size 64 // .globl triton_f4_to_bf16_kernel // -- Begin function triton_f4_to_bf16_kernel // @triton_f4_to_bf16_kernel .visible .entry triton_f4_to_bf16_kernel( .param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_0, .param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_1, .param .u32 triton_f4_to_bf16_kernel_param_2, .param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_3 ) .reqntid 128, 1, 1 { .reg .pred %p<25>; .reg .b16 %rs<61>; .reg .b32 %r<132>; .reg .f32 %f<9>; .reg .b64 %rd<13>; .loc 1 178 0 // custom_cast.py:178:0 $L__func_begin0: .loc 1 178 0 // custom_cast.py:178:0 // %bb.0: ld.param.u64 %rd9, [triton_f4_to_bf16_kernel_param_0]; ld.param.u64 %rd10, [triton_f4_to_bf16_kernel_param_1]; $L__tmp0: .loc 1 194 24 // custom_cast.py:194:24 mov.u32 %r5, %ctaid.x; ld.param.u32 %r6, [triton_f4_to_bf16_kernel_param_2]; .loc 1 195 37 // custom_cast.py:195:37 shl.b32 %r7, %r6, 1; .loc 1 198 27 // custom_cast.py:198:27 shl.b32 %r8, %r5, 9; .loc 1 199 47 // custom_cast.py:199:47 mov.u32 %r9, %tid.x; shl.b32 %r10, %r9, 2; and.b32 %r11, %r10, 508; .loc 1 199 34 // custom_cast.py:199:34 or.b32 %r12, %r11, %r8; or.b32 %r13, %r12, 1; or.b32 %r14, %r12, 2; or.b32 %r15, %r12, 3; .loc 1 201 27 // custom_cast.py:201:27 setp.lt.s32 %p1, %r12, %r6; setp.lt.s32 %p2, %r13, %r6; setp.lt.s32 %p3, %r14, %r6; setp.lt.s32 %p4, %r15, %r6; .loc 1 204 31 // custom_cast.py:204:31 cvt.s64.s32 %rd11, %r12; add.s64 %rd1, %rd9, %rd11; add.s64 %rd2, %rd1, 1; add.s64 %rd3, %rd1, 2; add.s64 %rd4, %rd1, 3; .loc 1 204 23 // custom_cast.py:204:23 // begin inline asm mov.u16 %rs1, 0x0; @%p1 ld.global.b8 { %rs1 }, [ %rd1 + 0 ]; // end inline asm // begin inline asm mov.u16 %rs2, 0x0; @%p2 ld.global.b8 { %rs2 }, [ %rd2 + 0 ]; // end inline asm // begin inline asm mov.u16 %rs3, 0x0; @%p3 ld.global.b8 { %rs3 }, [ %rd3 + 0 ]; // end inline asm // begin inline asm mov.u16 %rs4, 0x0; @%p4 ld.global.b8 { %rs4 }, [ %rd4 + 0 ]; // end inline asm $L__tmp1: .loc 1 151 26 // custom_cast.py:151:26 and.b16 %rs5, %rs1, 1; and.b16 %rs6, %rs2, 1; and.b16 %rs7, %rs3, 1; and.b16 %rs8, %rs4, 1; .loc 1 123 29 // custom_cast.py:123:29 shr.u16 %rs9, %rs1, 4; and.b16 %rs10, %rs9, 15; .loc 1 131 18 // custom_cast.py:131:18 mov.b32 %r16, {%rs10, %rs11}; mov.b32 %r17, {%rs10, %rs1}; and.b32 %r18, %r17, 524296; .loc 1 134 16 // custom_cast.py:134:16 and.b32 %r19, %r17, 458759; .loc 1 137 25 // custom_cast.py:137:25 mov.b32 {%rs12, %rs13}, %r19; setp.eq.s16 %p9, %rs12, 0; setp.eq.s16 %p10, %rs13, 0; .loc 1 143 29 // custom_cast.py:143:29 setp.eq.s16 %p11, %rs13, 1; setp.eq.s16 %p12, %rs12, 1; .loc 1 146 29 // custom_cast.py:146:29 shr.u16 %rs14, %rs12, 1; shr.u16 %rs15, %rs13, 1; .loc 1 147 56 // custom_cast.py:147:56 add.s16 %rs16, %rs14, 126; .loc 1 151 26 // custom_cast.py:151:26 mov.b32 %r20, {%rs17, %rs15}; and.b32 %r21, %r16, 1; mov.b32 %r22, 8257536; add.s16x2 %r23, %r20, %r22; .loc 1 152 34 // custom_cast.py:152:34 { .reg .b16 tmp; mov.b32 {tmp, %rs18}, %r23; } cvt.u32.u16 %r24, %rs18; { .reg .b16 tmp; mov.b32 {%rs19, tmp}, %r21; } cvt.u32.u16 %r25, %rs19; .loc 1 148 39 // custom_cast.py:148:39 cvt.u32.u16 %r26, %rs16; cvt.u32.u16 %r27, %rs5; .loc 1 152 48 // custom_cast.py:152:48 shl.b32 %r28, %r24, 23; shl.b32 %r29, %r25, 22; .loc 1 148 52 // custom_cast.py:148:52 shl.b32 %r30, %r26, 23; shl.b32 %r31, %r27, 22; .loc 1 156 30 // custom_cast.py:156:30 or.b32 %r32, %r29, %r30; or.b32 %r33, %r28, %r31; .loc 1 158 48 // custom_cast.py:158:48 selp.b32 %r34, 0, %r33, %p10; selp.b32 %r35, 0, %r32, %p9; .loc 1 160 63 // custom_cast.py:160:63 selp.b32 %r36, 1056964608, %r35, %p12; selp.b32 %r37, 1056964608, %r34, %p11; .loc 1 163 26 // custom_cast.py:163:26 mov.b32 {%rs20, %rs21}, %r18; cvt.u32.u16 %r38, %rs21; cvt.u32.u16 %r39, %rs20; .loc 1 164 8 // custom_cast.py:164:8 shl.b32 %r40, %r39, 28; shl.b32 %r41, %r38, 28; .loc 1 166 22 // custom_cast.py:166:22 or.b32 %r42, %r37, %r41; or.b32 %r43, %r36, %r40; .loc 1 173 23 // custom_cast.py:173:23 mov.b32 %f1, %r43; mov.b32 %f2, %r42; .loc 1 174 23 // custom_cast.py:174:23 cvt.rn.bf16x2.f32 %r1, %f2, %f1; .loc 1 123 29 // custom_cast.py:123:29 shr.u16 %rs22, %rs2, 4; and.b16 %rs23, %rs22, 15; .loc 1 131 18 // custom_cast.py:131:18 mov.b32 %r44, {%rs23, %rs24}; mov.b32 %r45, {%rs23, %rs2}; and.b32 %r46, %r45, 524296; .loc 1 134 16 // custom_cast.py:134:16 and.b32 %r47, %r45, 458759; .loc 1 137 25 // custom_cast.py:137:25 mov.b32 {%rs25, %rs26}, %r47; setp.eq.s16 %p13, %rs25, 0; setp.eq.s16 %p14, %rs26, 0; .loc 1 143 29 // custom_cast.py:143:29 setp.eq.s16 %p15, %rs26, 1; setp.eq.s16 %p16, %rs25, 1; .loc 1 146 29 // custom_cast.py:146:29 shr.u16 %rs27, %rs25, 1; shr.u16 %rs28, %rs26, 1; .loc 1 147 56 // custom_cast.py:147:56 add.s16 %rs29, %rs27, 126; .loc 1 151 26 // custom_cast.py:151:26 mov.b32 %r48, {%rs30, %rs28}; and.b32 %r49, %r44, 1; add.s16x2 %r50, %r48, %r22; .loc 1 152 34 // custom_cast.py:152:34 { .reg .b16 tmp; mov.b32 {tmp, %rs31}, %r50; } cvt.u32.u16 %r51, %rs31; { .reg .b16 tmp; mov.b32 {%rs32, tmp}, %r49; } cvt.u32.u16 %r52, %rs32; .loc 1 148 39 // custom_cast.py:148:39 cvt.u32.u16 %r53, %rs29; cvt.u32.u16 %r54, %rs6; .loc 1 152 48 // custom_cast.py:152:48 shl.b32 %r55, %r51, 23; shl.b32 %r56, %r52, 22; .loc 1 148 52 // custom_cast.py:148:52 shl.b32 %r57, %r53, 23; shl.b32 %r58, %r54, 22; .loc 1 156 30 // custom_cast.py:156:30 or.b32 %r59, %r56, %r57; or.b32 %r60, %r55, %r58; .loc 1 158 48 // custom_cast.py:158:48 selp.b32 %r61, 0, %r60, %p14; selp.b32 %r62, 0, %r59, %p13; .loc 1 160 63 // custom_cast.py:160:63 selp.b32 %r63, 1056964608, %r62, %p16; selp.b32 %r64, 1056964608, %r61, %p15; .loc 1 163 26 // custom_cast.py:163:26 mov.b32 {%rs33, %rs34}, %r46; cvt.u32.u16 %r65, %rs34; cvt.u32.u16 %r66, %rs33; .loc 1 164 8 // custom_cast.py:164:8 shl.b32 %r67, %r66, 28; shl.b32 %r68, %r65, 28; .loc 1 166 22 // custom_cast.py:166:22 or.b32 %r69, %r64, %r68; or.b32 %r70, %r63, %r67; .loc 1 173 23 // custom_cast.py:173:23 mov.b32 %f3, %r70; mov.b32 %f4, %r69; .loc 1 174 23 // custom_cast.py:174:23 cvt.rn.bf16x2.f32 %r2, %f4, %f3; .loc 1 123 29 // custom_cast.py:123:29 shr.u16 %rs35, %rs3, 4; and.b16 %rs36, %rs35, 15; .loc 1 131 18 // custom_cast.py:131:18 mov.b32 %r71, {%rs36, %rs37}; mov.b32 %r72, {%rs36, %rs3}; and.b32 %r73, %r72, 524296; .loc 1 134 16 // custom_cast.py:134:16 and.b32 %r74, %r72, 458759; .loc 1 137 25 // custom_cast.py:137:25 mov.b32 {%rs38, %rs39}, %r74; setp.eq.s16 %p17, %rs38, 0; setp.eq.s16 %p18, %rs39, 0; .loc 1 143 29 // custom_cast.py:143:29 setp.eq.s16 %p19, %rs39, 1; setp.eq.s16 %p20, %rs38, 1; .loc 1 146 29 // custom_cast.py:146:29 shr.u16 %rs40, %rs38, 1; shr.u16 %rs41, %rs39, 1; .loc 1 147 56 // custom_cast.py:147:56 add.s16 %rs42, %rs40, 126; .loc 1 151 26 // custom_cast.py:151:26 mov.b32 %r75, {%rs43, %rs41}; and.b32 %r76, %r71, 1; add.s16x2 %r77, %r75, %r22; .loc 1 152 34 // custom_cast.py:152:34 { .reg .b16 tmp; mov.b32 {tmp, %rs44}, %r77; } cvt.u32.u16 %r78, %rs44; { .reg .b16 tmp; mov.b32 {%rs45, tmp}, %r76; } cvt.u32.u16 %r79, %rs45; .loc 1 148 39 // custom_cast.py:148:39 cvt.u32.u16 %r80, %rs42; cvt.u32.u16 %r81, %rs7; .loc 1 152 48 // custom_cast.py:152:48 shl.b32 %r82, %r78, 23; shl.b32 %r83, %r79, 22; .loc 1 148 52 // custom_cast.py:148:52 shl.b32 %r84, %r80, 23; shl.b32 %r85, %r81, 22; .loc 1 156 30 // custom_cast.py:156:30 or.b32 %r86, %r83, %r84; or.b32 %r87, %r82, %r85; .loc 1 158 48 // custom_cast.py:158:48 selp.b32 %r88, 0, %r87, %p18; selp.b32 %r89, 0, %r86, %p17; .loc 1 160 63 // custom_cast.py:160:63 selp.b32 %r90, 1056964608, %r89, %p20; selp.b32 %r91, 1056964608, %r88, %p19; .loc 1 163 26 // custom_cast.py:163:26 mov.b32 {%rs46, %rs47}, %r73; cvt.u32.u16 %r92, %rs47; cvt.u32.u16 %r93, %rs46; .loc 1 164 8 // custom_cast.py:164:8 shl.b32 %r94, %r93, 28; shl.b32 %r95, %r92, 28; .loc 1 166 22 // custom_cast.py:166:22 or.b32 %r96, %r91, %r95; or.b32 %r97, %r90, %r94; .loc 1 173 23 // custom_cast.py:173:23 mov.b32 %f5, %r97; mov.b32 %f6, %r96; .loc 1 174 23 // custom_cast.py:174:23 cvt.rn.bf16x2.f32 %r3, %f6, %f5; .loc 1 123 29 // custom_cast.py:123:29 shr.u16 %rs48, %rs4, 4; and.b16 %rs49, %rs48, 15; .loc 1 131 18 // custom_cast.py:131:18 mov.b32 %r98, {%rs49, %rs50}; mov.b32 %r99, {%rs49, %rs4}; and.b32 %r100, %r99, 524296; .loc 1 134 16 // custom_cast.py:134:16 and.b32 %r101, %r99, 458759; .loc 1 137 25 // custom_cast.py:137:25 mov.b32 {%rs51, %rs52}, %r101; setp.eq.s16 %p21, %rs51, 0; setp.eq.s16 %p22, %rs52, 0; .loc 1 143 29 // custom_cast.py:143:29 setp.eq.s16 %p23, %rs52, 1; setp.eq.s16 %p24, %rs51, 1; .loc 1 146 29 // custom_cast.py:146:29 shr.u16 %rs53, %rs51, 1; shr.u16 %rs54, %rs52, 1; .loc 1 147 56 // custom_cast.py:147:56 add.s16 %rs55, %rs53, 126; .loc 1 151 26 // custom_cast.py:151:26 mov.b32 %r102, {%rs56, %rs54}; and.b32 %r103, %r98, 1; add.s16x2 %r104, %r102, %r22; .loc 1 152 34 // custom_cast.py:152:34 { .reg .b16 tmp; mov.b32 {tmp, %rs57}, %r104; } cvt.u32.u16 %r105, %rs57; { .reg .b16 tmp; mov.b32 {%rs58, tmp}, %r103; } cvt.u32.u16 %r106, %rs58; .loc 1 148 39 // custom_cast.py:148:39 cvt.u32.u16 %r107, %rs55; cvt.u32.u16 %r108, %rs8; .loc 1 152 48 // custom_cast.py:152:48 shl.b32 %r109, %r105, 23; shl.b32 %r110, %r106, 22; .loc 1 148 52 // custom_cast.py:148:52 shl.b32 %r111, %r107, 23; shl.b32 %r112, %r108, 22; .loc 1 156 30 // custom_cast.py:156:30 or.b32 %r113, %r110, %r111; or.b32 %r114, %r109, %r112; .loc 1 158 48 // custom_cast.py:158:48 selp.b32 %r115, 0, %r114, %p22; selp.b32 %r116, 0, %r113, %p21; .loc 1 160 63 // custom_cast.py:160:63 selp.b32 %r117, 1056964608, %r116, %p24; selp.b32 %r118, 1056964608, %r115, %p23; .loc 1 163 26 // custom_cast.py:163:26 mov.b32 {%rs59, %rs60}, %r100; cvt.u32.u16 %r119, %rs60; cvt.u32.u16 %r120, %rs59; .loc 1 164 8 // custom_cast.py:164:8 shl.b32 %r121, %r120, 28; shl.b32 %r122, %r119, 28; .loc 1 166 22 // custom_cast.py:166:22 or.b32 %r123, %r118, %r122; or.b32 %r124, %r117, %r121; .loc 1 173 23 // custom_cast.py:173:23 mov.b32 %f7, %r124; mov.b32 %f8, %r123; .loc 1 174 23 // custom_cast.py:174:23 cvt.rn.bf16x2.f32 %r4, %f8, %f7; $L__tmp2: .loc 1 220 28 // custom_cast.py:220:28 shl.b32 %r125, %r5, 10; .loc 1 221 49 // custom_cast.py:221:49 shl.b32 %r126, %r9, 3; and.b32 %r127, %r126, 1016; .loc 1 221 36 // custom_cast.py:221:36 or.b32 %r128, %r127, %r125; or.b32 %r129, %r128, 2; or.b32 %r130, %r128, 4; or.b32 %r131, %r128, 6; .loc 1 222 29 // custom_cast.py:222:29 setp.lt.s32 %p5, %r128, %r7; setp.lt.s32 %p6, %r129, %r7; setp.lt.s32 %p7, %r130, %r7; setp.lt.s32 %p8, %r131, %r7; .loc 1 224 26 // custom_cast.py:224:26 mul.wide.s32 %rd12, %r128, 2; add.s64 %rd5, %rd10, %rd12; add.s64 %rd6, %rd5, 4; add.s64 %rd7, %rd5, 8; add.s64 %rd8, %rd5, 12; .loc 1 224 39 // custom_cast.py:224:39 // begin inline asm @%p5 st.global.b32 [ %rd5 + 0 ], { %r1 }; // end inline asm // begin inline asm @%p6 st.global.b32 [ %rd6 + 0 ], { %r2 }; // end inline asm // begin inline asm @%p7 st.global.b32 [ %rd7 + 0 ], { %r3 }; // end inline asm // begin inline asm @%p8 st.global.b32 [ %rd8 + 0 ], { %r4 }; // end inline asm .loc 1 224 4 // custom_cast.py:224:4 ret; $L__tmp3: $L__func_end0: // -- End function } .file 1 "/home/drisspg/meta/ao/torchao/prototype/mx_formats/custom_cast.py" .section .debug_abbrev { .b8 1 // Abbreviation Code .b8 17 // DW_TAG_compile_unit .b8 1 // DW_CHILDREN_yes .b8 37 // DW_AT_producer .b8 8 // DW_FORM_string .b8 19 // DW_AT_language .b8 5 // DW_FORM_data2 .b8 3 // DW_AT_name .b8 8 // DW_FORM_string .b8 16 // DW_AT_stmt_list .b8 6 // DW_FORM_data4 .b8 27 // DW_AT_comp_dir .b8 8 // DW_FORM_string .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 2 // Abbreviation Code .b8 46 // DW_TAG_subprogram .b8 0 // DW_CHILDREN_no .b8 3 // DW_AT_name .b8 8 // DW_FORM_string .b8 32 // DW_AT_inline .b8 11 // DW_FORM_data1 .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 3 // Abbreviation Code .b8 46 // DW_TAG_subprogram .b8 1 // DW_CHILDREN_yes .b8 17 // DW_AT_low_pc .b8 1 // DW_FORM_addr .b8 18 // DW_AT_high_pc .b8 1 // DW_FORM_addr .b8 49 // DW_AT_abstract_origin .b8 19 // DW_FORM_ref4 .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 4 // Abbreviation Code .b8 29 // DW_TAG_inlined_subroutine .b8 0 // DW_CHILDREN_no .b8 49 // DW_AT_abstract_origin .b8 19 // DW_FORM_ref4 .b8 17 // DW_AT_low_pc .b8 1 // DW_FORM_addr .b8 18 // DW_AT_high_pc .b8 1 // DW_FORM_addr .b8 88 // DW_AT_call_file .b8 11 // DW_FORM_data1 .b8 89 // DW_AT_call_line .b8 11 // DW_FORM_data1 .b8 87 // DW_AT_call_column .b8 11 // DW_FORM_data1 .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 0 // EOM(3) } .section .debug_info { .b32 161 // Length of Unit .b8 2 // DWARF version number .b8 0 .b32 .debug_abbrev // Offset Into Abbrev. Section .b8 8 // Address Size (in bytes) .b8 1 // Abbrev [1] 0xb:0x9a DW_TAG_compile_unit .b8 116 // DW_AT_producer .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 0 .b8 2 // DW_AT_language .b8 0 .b8 99 // DW_AT_name .b8 117 .b8 115 .b8 116 .b8 111 .b8 109 .b8 95 .b8 99 .b8 97 .b8 115 .b8 116 .b8 46 .b8 112 .b8 121 .b8 0 .b32 .debug_line // DW_AT_stmt_list .b8 47 // DW_AT_comp_dir .b8 104 .b8 111 .b8 109 .b8 101 .b8 47 .b8 100 .b8 114 .b8 105 .b8 115 .b8 115 .b8 112 .b8 103 .b8 47 .b8 109 .b8 101 .b8 116 .b8 97 .b8 47 .b8 97 .b8 111 .b8 47 .b8 116 .b8 111 .b8 114 .b8 99 .b8 104 .b8 97 .b8 111 .b8 47 .b8 112 .b8 114 .b8 111 .b8 116 .b8 111 .b8 116 .b8 121 .b8 112 .b8 101 .b8 47 .b8 109 .b8 120 .b8 95 .b8 102 .b8 111 .b8 114 .b8 109 .b8 97 .b8 116 .b8 115 .b8 0 .b8 2 // Abbrev [2] 0x5b:0x1b DW_TAG_subprogram .b8 116 // DW_AT_name .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 102 .b8 52 .b8 95 .b8 116 .b8 111 .b8 95 .b8 98 .b8 102 .b8 49 .b8 54 .b8 95 .b8 107 .b8 101 .b8 114 .b8 110 .b8 101 .b8 108 .b8 0 .b8 1 // DW_AT_inline .b8 3 // Abbrev [3] 0x76:0x2e DW_TAG_subprogram .b64 $L__func_begin0 // DW_AT_low_pc .b64 $L__func_end0 // DW_AT_high_pc .b32 91 // DW_AT_abstract_origin .b8 4 // Abbrev [4] 0x8b:0x18 DW_TAG_inlined_subroutine .b32 91 // DW_AT_abstract_origin .b64 $L__tmp1 // DW_AT_low_pc .b64 $L__tmp2 // DW_AT_high_pc .b8 1 // DW_AT_call_file .b8 216 // DW_AT_call_line .b8 8 // DW_AT_call_column .b8 0 // End Of Children Mark .b8 0 // End Of Children Mark } .section .debug_macinfo { }
수정본
파일 열기
// // Generated by LLVM NVPTX Back-End // .version 8.4 .target sm_90a .address_size 64 // .globl triton_f4_to_bf16_kernel // -- Begin function triton_f4_to_bf16_kernel // @triton_f4_to_bf16_kernel .visible .entry triton_f4_to_bf16_kernel( .param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_0, .param .u64 .ptr .global .align 1 triton_f4_to_bf16_kernel_param_1, .param .u32 triton_f4_to_bf16_kernel_param_2 ) .reqntid 128, 1, 1 { .reg .pred %p<25>; .reg .b16 %rs<65>; .reg .b32 %r<111>; .reg .b64 %rd<13>; .loc 1 178 0 // custom_cast.py:178:0 $L__func_begin0: .loc 1 178 0 // custom_cast.py:178:0 // %bb.0: ld.param.u64 %rd9, [triton_f4_to_bf16_kernel_param_0]; ld.param.u64 %rd10, [triton_f4_to_bf16_kernel_param_1]; $L__tmp0: .loc 1 194 24 // custom_cast.py:194:24 // begin inline asm mov.u32 %r1, %ctaid.x; // end inline asm ld.param.u32 %r14, [triton_f4_to_bf16_kernel_param_2]; .loc 1 195 37 // custom_cast.py:195:37 shl.b32 %r15, %r14, 1; .loc 1 198 27 // custom_cast.py:198:27 shl.b32 %r16, %r1, 9; .loc 1 199 47 // custom_cast.py:199:47 mov.u32 %r17, %tid.x; shl.b32 %r18, %r17, 2; and.b32 %r19, %r18, 508; .loc 1 199 34 // custom_cast.py:199:34 or.b32 %r20, %r16, %r19; or.b32 %r21, %r20, 1; or.b32 %r22, %r20, 2; or.b32 %r23, %r20, 3; .loc 1 201 27 // custom_cast.py:201:27 setp.lt.s32 %p1, %r20, %r14; setp.lt.s32 %p2, %r21, %r14; setp.lt.s32 %p3, %r22, %r14; setp.lt.s32 %p4, %r23, %r14; .loc 1 204 31 // custom_cast.py:204:31 cvt.s64.s32 %rd11, %r20; add.s64 %rd1, %rd9, %rd11; add.s64 %rd2, %rd1, 1; add.s64 %rd3, %rd1, 2; add.s64 %rd4, %rd1, 3; .loc 1 204 23 // custom_cast.py:204:23 // begin inline asm mov.u16 %rs1, 0x0; @%p1 ld.global.b8 { %rs1 }, [ %rd1 + 0 ]; // end inline asm // begin inline asm mov.u16 %rs2, 0x0; @%p2 ld.global.b8 { %rs2 }, [ %rd2 + 0 ]; // end inline asm // begin inline asm mov.u16 %rs3, 0x0; @%p3 ld.global.b8 { %rs3 }, [ %rd3 + 0 ]; // end inline asm // begin inline asm mov.u16 %rs4, 0x0; @%p4 ld.global.b8 { %rs4 }, [ %rd4 + 0 ]; // end inline asm $L__tmp1: .loc 1 123 29 // custom_cast.py:123:29 shr.u16 %rs13, %rs1, 4; shr.u16 %rs14, %rs2, 4; shr.u16 %rs15, %rs3, 4; shr.u16 %rs16, %rs4, 4; .loc 1 131 18 // custom_cast.py:131:18 and.b16 %rs17, %rs13, 8; and.b16 %rs18, %rs14, 8; and.b16 %rs19, %rs15, 8; and.b16 %rs20, %rs16, 8; .loc 1 134 16 // custom_cast.py:134:16 and.b16 %rs21, %rs13, 7; and.b16 %rs22, %rs1, 6; and.b16 %rs23, %rs1, 7; and.b16 %rs24, %rs14, 7; and.b16 %rs25, %rs2, 6; and.b16 %rs26, %rs2, 7; and.b16 %rs27, %rs15, 7; and.b16 %rs28, %rs3, 6; and.b16 %rs29, %rs3, 7; and.b16 %rs30, %rs16, 7; and.b16 %rs31, %rs4, 6; and.b16 %rs32, %rs4, 7; .loc 1 137 25 // custom_cast.py:137:25 setp.eq.s16 %p9, %rs21, 0; setp.eq.s16 %p10, %rs23, 0; setp.eq.s16 %p11, %rs24, 0; setp.eq.s16 %p12, %rs26, 0; setp.eq.s16 %p13, %rs27, 0; setp.eq.s16 %p14, %rs29, 0; setp.eq.s16 %p15, %rs30, 0; setp.eq.s16 %p16, %rs32, 0; .loc 1 143 29 // custom_cast.py:143:29 setp.eq.s16 %p17, %rs21, 1; setp.eq.s16 %p18, %rs23, 1; setp.eq.s16 %p19, %rs24, 1; setp.eq.s16 %p20, %rs26, 1; setp.eq.s16 %p21, %rs27, 1; setp.eq.s16 %p22, %rs29, 1; setp.eq.s16 %p23, %rs30, 1; setp.eq.s16 %p24, %rs32, 1; .loc 1 146 29 // custom_cast.py:146:29 shr.u16 %rs33, %rs1, 5; and.b16 %rs34, %rs33, 3; shr.u16 %rs35, %rs22, 1; shr.u16 %rs36, %rs2, 5; and.b16 %rs37, %rs36, 3; shr.u16 %rs38, %rs25, 1; shr.u16 %rs39, %rs3, 5; and.b16 %rs40, %rs39, 3; shr.u16 %rs41, %rs28, 1; shr.u16 %rs42, %rs4, 5; and.b16 %rs43, %rs42, 3; shr.u16 %rs44, %rs31, 1; .loc 1 147 56 // custom_cast.py:147:56 add.s16 %rs45, %rs34, 126; add.s16 %rs46, %rs35, 126; add.s16 %rs47, %rs37, 126; add.s16 %rs48, %rs38, 126; add.s16 %rs49, %rs40, 126; add.s16 %rs50, %rs41, 126; add.s16 %rs51, %rs43, 126; add.s16 %rs52, %rs44, 126; .loc 1 148 39 // custom_cast.py:148:39 cvt.u32.u16 %r24, %rs45; cvt.u32.u16 %r25, %rs46; cvt.u32.u16 %r26, %rs47; cvt.u32.u16 %r27, %rs48; cvt.u32.u16 %r28, %rs49; cvt.u32.u16 %r29, %rs50; cvt.u32.u16 %r30, %rs51; cvt.u32.u16 %r31, %rs52; .loc 1 148 52 // custom_cast.py:148:52 shl.b32 %r32, %r24, 23; shl.b32 %r33, %r25, 23; shl.b32 %r34, %r26, 23; shl.b32 %r35, %r27, 23; shl.b32 %r36, %r28, 23; shl.b32 %r37, %r29, 23; shl.b32 %r38, %r30, 23; shl.b32 %r39, %r31, 23; .loc 1 151 26 // custom_cast.py:151:26 and.b16 %rs53, %rs13, 1; and.b16 %rs54, %rs14, 1; and.b16 %rs55, %rs15, 1; and.b16 %rs56, %rs16, 1; .loc 1 152 34 // custom_cast.py:152:34 cvt.u32.u16 %r40, %rs53; $L__tmp2: .loc 1 204 23 // custom_cast.py:204:23 cvt.u32.u16 %r41, %rs1; $L__tmp3: .loc 1 152 34 // custom_cast.py:152:34 and.b32 %r42, %r41, 1; cvt.u32.u16 %r43, %rs54; $L__tmp4: .loc 1 204 23 // custom_cast.py:204:23 cvt.u32.u16 %r44, %rs2; $L__tmp5: .loc 1 152 34 // custom_cast.py:152:34 and.b32 %r45, %r44, 1; cvt.u32.u16 %r46, %rs55; $L__tmp6: .loc 1 204 23 // custom_cast.py:204:23 cvt.u32.u16 %r47, %rs3; $L__tmp7: .loc 1 152 34 // custom_cast.py:152:34 and.b32 %r48, %r47, 1; cvt.u32.u16 %r49, %rs56; $L__tmp8: .loc 1 204 23 // custom_cast.py:204:23 cvt.u32.u16 %r50, %rs4; $L__tmp9: .loc 1 152 34 // custom_cast.py:152:34 and.b32 %r51, %r50, 1; .loc 1 152 48 // custom_cast.py:152:48 shl.b32 %r52, %r40, 22; shl.b32 %r53, %r42, 22; shl.b32 %r54, %r43, 22; shl.b32 %r55, %r45, 22; shl.b32 %r56, %r46, 22; shl.b32 %r57, %r48, 22; shl.b32 %r58, %r49, 22; shl.b32 %r59, %r51, 22; .loc 1 156 30 // custom_cast.py:156:30 or.b32 %r60, %r32, %r52; or.b32 %r61, %r33, %r53; or.b32 %r62, %r34, %r54; or.b32 %r63, %r35, %r55; or.b32 %r64, %r36, %r56; or.b32 %r65, %r37, %r57; or.b32 %r66, %r38, %r58; or.b32 %r67, %r39, %r59; .loc 1 158 48 // custom_cast.py:158:48 selp.b32 %r68, 0, %r60, %p9; selp.b32 %r69, 0, %r61, %p10; selp.b32 %r70, 0, %r62, %p11; selp.b32 %r71, 0, %r63, %p12; selp.b32 %r72, 0, %r64, %p13; selp.b32 %r73, 0, %r65, %p14; selp.b32 %r74, 0, %r66, %p15; selp.b32 %r75, 0, %r67, %p16; .loc 1 160 63 // custom_cast.py:160:63 selp.b32 %r76, 1056964608, %r68, %p17; selp.b32 %r77, 1056964608, %r69, %p18; selp.b32 %r78, 1056964608, %r70, %p19; selp.b32 %r79, 1056964608, %r71, %p20; selp.b32 %r80, 1056964608, %r72, %p21; selp.b32 %r81, 1056964608, %r73, %p22; selp.b32 %r82, 1056964608, %r74, %p23; selp.b32 %r83, 1056964608, %r75, %p24; .loc 1 163 26 // custom_cast.py:163:26 cvt.u32.u16 %r84, %rs17; and.b32 %r85, %r41, 8; cvt.u32.u16 %r86, %rs18; and.b32 %r87, %r44, 8; cvt.u32.u16 %r88, %rs19; and.b32 %r89, %r47, 8; cvt.u32.u16 %r90, %rs20; and.b32 %r91, %r50, 8; .loc 1 164 8 // custom_cast.py:164:8 shl.b32 %r92, %r84, 28; shl.b32 %r93, %r85, 28; shl.b32 %r94, %r86, 28; shl.b32 %r95, %r87, 28; shl.b32 %r96, %r88, 28; shl.b32 %r97, %r89, 28; shl.b32 %r98, %r90, 28; shl.b32 %r99, %r91, 28; .loc 1 166 22 // custom_cast.py:166:22 or.b32 %r2, %r76, %r92; or.b32 %r3, %r77, %r93; or.b32 %r4, %r78, %r94; or.b32 %r5, %r79, %r95; or.b32 %r6, %r80, %r96; or.b32 %r7, %r81, %r97; or.b32 %r8, %r82, %r98; or.b32 %r9, %r83, %r99; .loc 1 174 23 // custom_cast.py:174:23 // begin inline asm cvt.rn.bf16.f32 %rs5, %r2; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs6, %r3; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs7, %r4; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs8, %r5; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs9, %r6; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs10, %r7; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs11, %r8; // end inline asm // begin inline asm cvt.rn.bf16.f32 %rs12, %r9; // end inline asm $L__tmp10: .loc 1 220 28 // custom_cast.py:220:28 shl.b32 %r100, %r1, 10; .loc 1 221 49 // custom_cast.py:221:49 shl.b32 %r101, %r17, 3; and.b32 %r102, %r101, 1016; .loc 1 221 36 // custom_cast.py:221:36 or.b32 %r103, %r100, %r102; or.b32 %r104, %r103, 2; or.b32 %r105, %r103, 4; or.b32 %r106, %r103, 6; .loc 1 222 29 // custom_cast.py:222:29 setp.lt.s32 %p5, %r103, %r15; setp.lt.s32 %p6, %r104, %r15; setp.lt.s32 %p7, %r105, %r15; setp.lt.s32 %p8, %r106, %r15; .loc 1 224 26 // custom_cast.py:224:26 mul.wide.s32 %rd12, %r103, 2; add.s64 %rd5, %rd10, %rd12; add.s64 %rd6, %rd5, 4; add.s64 %rd7, %rd5, 8; add.s64 %rd8, %rd5, 12; .loc 1 224 39 // custom_cast.py:224:39 mov.b32 %r107, {%rs5, %rs6}; // begin inline asm @%p5 st.global.b32 [ %rd5 + 0 ], { %r107 }; // end inline asm mov.b32 %r108, {%rs7, %rs8}; // begin inline asm @%p6 st.global.b32 [ %rd6 + 0 ], { %r108 }; // end inline asm mov.b32 %r109, {%rs9, %rs10}; // begin inline asm @%p7 st.global.b32 [ %rd7 + 0 ], { %r109 }; // end inline asm mov.b32 %r110, {%rs11, %rs12}; // begin inline asm @%p8 st.global.b32 [ %rd8 + 0 ], { %r110 }; // end inline asm .loc 1 224 4 // custom_cast.py:224:4 ret; $L__tmp11: $L__func_end0: // -- End function } .file 1 "/home/drisspg/meta/ao/torchao/prototype/mx_formats/custom_cast.py" .section .debug_abbrev { .b8 1 // Abbreviation Code .b8 17 // DW_TAG_compile_unit .b8 1 // DW_CHILDREN_yes .b8 37 // DW_AT_producer .b8 8 // DW_FORM_string .b8 19 // DW_AT_language .b8 5 // DW_FORM_data2 .b8 3 // DW_AT_name .b8 8 // DW_FORM_string .b8 16 // DW_AT_stmt_list .b8 6 // DW_FORM_data4 .b8 27 // DW_AT_comp_dir .b8 8 // DW_FORM_string .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 2 // Abbreviation Code .b8 46 // DW_TAG_subprogram .b8 0 // DW_CHILDREN_no .b8 3 // DW_AT_name .b8 8 // DW_FORM_string .b8 32 // DW_AT_inline .b8 11 // DW_FORM_data1 .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 3 // Abbreviation Code .b8 46 // DW_TAG_subprogram .b8 1 // DW_CHILDREN_yes .b8 17 // DW_AT_low_pc .b8 1 // DW_FORM_addr .b8 18 // DW_AT_high_pc .b8 1 // DW_FORM_addr .b8 49 // DW_AT_abstract_origin .b8 19 // DW_FORM_ref4 .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 4 // Abbreviation Code .b8 29 // DW_TAG_inlined_subroutine .b8 0 // DW_CHILDREN_no .b8 49 // DW_AT_abstract_origin .b8 19 // DW_FORM_ref4 .b8 17 // DW_AT_low_pc .b8 1 // DW_FORM_addr .b8 18 // DW_AT_high_pc .b8 1 // DW_FORM_addr .b8 88 // DW_AT_call_file .b8 11 // DW_FORM_data1 .b8 89 // DW_AT_call_line .b8 11 // DW_FORM_data1 .b8 87 // DW_AT_call_column .b8 11 // DW_FORM_data1 .b8 0 // EOM(1) .b8 0 // EOM(2) .b8 0 // EOM(3) } .section .debug_info { .b32 161 // Length of Unit .b8 2 // DWARF version number .b8 0 .b32 .debug_abbrev // Offset Into Abbrev. Section .b8 8 // Address Size (in bytes) .b8 1 // Abbrev [1] 0xb:0x9a DW_TAG_compile_unit .b8 116 // DW_AT_producer .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 0 .b8 2 // DW_AT_language .b8 0 .b8 99 // DW_AT_name .b8 117 .b8 115 .b8 116 .b8 111 .b8 109 .b8 95 .b8 99 .b8 97 .b8 115 .b8 116 .b8 46 .b8 112 .b8 121 .b8 0 .b32 .debug_line // DW_AT_stmt_list .b8 47 // DW_AT_comp_dir .b8 104 .b8 111 .b8 109 .b8 101 .b8 47 .b8 100 .b8 114 .b8 105 .b8 115 .b8 115 .b8 112 .b8 103 .b8 47 .b8 109 .b8 101 .b8 116 .b8 97 .b8 47 .b8 97 .b8 111 .b8 47 .b8 116 .b8 111 .b8 114 .b8 99 .b8 104 .b8 97 .b8 111 .b8 47 .b8 112 .b8 114 .b8 111 .b8 116 .b8 111 .b8 116 .b8 121 .b8 112 .b8 101 .b8 47 .b8 109 .b8 120 .b8 95 .b8 102 .b8 111 .b8 114 .b8 109 .b8 97 .b8 116 .b8 115 .b8 0 .b8 2 // Abbrev [2] 0x5b:0x1b DW_TAG_subprogram .b8 116 // DW_AT_name .b8 114 .b8 105 .b8 116 .b8 111 .b8 110 .b8 95 .b8 102 .b8 52 .b8 95 .b8 116 .b8 111 .b8 95 .b8 98 .b8 102 .b8 49 .b8 54 .b8 95 .b8 107 .b8 101 .b8 114 .b8 110 .b8 101 .b8 108 .b8 0 .b8 1 // DW_AT_inline .b8 3 // Abbrev [3] 0x76:0x2e DW_TAG_subprogram .b64 $L__func_begin0 // DW_AT_low_pc .b64 $L__func_end0 // DW_AT_high_pc .b32 91 // DW_AT_abstract_origin .b8 4 // Abbrev [4] 0x8b:0x18 DW_TAG_inlined_subroutine .b32 91 // DW_AT_abstract_origin .b64 $L__tmp1 // DW_AT_low_pc .b64 $L__tmp10 // DW_AT_high_pc .b8 1 // DW_AT_call_file .b8 216 // DW_AT_call_line .b8 8 // DW_AT_call_column .b8 0 // End Of Children Mark .b8 0 // End Of Children Mark } .section .debug_macinfo { }
비교하기