matmul-diff-2275

Created with Diffchecker; the diff never expires.

Old version: 542 lines (523 lines removed).
New version: 480 lines (448 lines added).
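The text below is the flattened side-by-side export of the two generated kernels: lines that are identical in both versions appear twice in a row, and where the versions diverge the old and new lines are interleaved. Both versions address their shared-memory operand tiles through the same XOR swizzle, visible in terms such as (i1889 ^ i1893) * 8 and 8 * (i476 ^ (i481 % 8)): within each row, the 8-element __half vectors are permuted by the row index modulo 8 so that a warp's ldmatrix accesses land in distinct banks. A minimal standalone sketch of that addressing follows; the helper and parameter names are illustrative, not nvFuser code.

// Element offset into a 128-wide __half tile row, equivalent to the generated
// (128 * row) + (64 * (chunk / 8)) + 8 * ((row % 8) ^ (chunk % 8)) + lane form
// with chunk = col / 8; the 256-wide T6 tile uses the same scheme with row * 256.
__host__ __device__ inline unsigned swizzledSmemOffset(unsigned row, unsigned col) {
  unsigned vec  = col / 8;            // which 8 x __half vector inside the row
  unsigned lane = col % 8;            // element inside that vector
  unsigned perm = (row % 8) ^ vec;    // XOR with a 3-bit value keeps vec's high bit
  return row * 128 + perm * 8 + lane; // element offset; scale by sizeof(__half) for bytes
}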
__global__ void kernel4(Tensor<__half, 2> T0, Tensor<__half, 2> T1, Tensor<float, 2> T4) {
__global__ void kernel4(Tensor<__half, 2> T0, Tensor<__half, 2> T1, Tensor<float, 2> T4) {
alignas(16) extern __shared__ char array[];
alignas(16) extern __shared__ char array[];
unsigned smem_offset = 0;
unsigned smem_offset = 0;
NVFUSER_DEFINE_MAGIC_ZERO
NVFUSER_DEFINE_MAGIC_ZERO
int i414;
int i1864;
i414 = ((((((nvfuser_index_t)threadIdx.z) * 2) + ((nvfuser_index_t)threadIdx.y)) * 32) + ((nvfuser_index_t)threadIdx.x)) * 8;
i1864 = (((nvfuser_index_t)threadIdx.z) * 4) + (2 * ((nvfuser_index_t)threadIdx.y));
int i644;
int i1865;
i644 = ((nvfuser_index_t)blockIdx.y) * 128;
i1865 = i1864 + (((nvfuser_index_t)threadIdx.x) / 16);
int i1361;
int i1889;
i1361 = ((nvfuser_index_t)blockIdx.x) * 256;
i1889 = i1865 % 8;
int i3196;
int i1229;
i3196 = ((nvfuser_index_t)threadIdx.x) / 8;
i1229 = ((nvfuser_index_t)threadIdx.x) % 16;
int i1866;
i1866 = i1865 / 8;
int i1875;
i1875 = (i1229 / 8) * 64;
int i1893;
i1893 = i1229 % 8;
int i1896;
i1896 = (((1024 * i1866) + (128 * i1889)) + i1875) + ((i1889 ^ i1893) * 8);
int i3166;
i3166 = T1.size[1] * 8;
int i3192;
i3192 = i3166 * i1866;
int i3193;
i3193 = T1.size[1] * i1889;
int i3180;
i3180 = ((nvfuser_index_t)blockIdx.y) * 128;
int i3197;
int i3197;
i3197 = ((nvfuser_index_t)threadIdx.x) % 8;
i3197 = i1893 * 8;
int i3302;
int i3198;
i3302 = (i3196 / 2) * 8;
i3198 = (((i3192 + i3193) + i3180) + i1875) + i3197;
int i3303;
int i3186;
i3303 = i3302 + i3197;
i3186 = T1.size[1] * 32;
int i3330;
int i3189;
i3330 = i3303 % 8;
i3189 = T1.size[1] * 16;
int i3311;
int i6198;
i3311 = ((nvfuser_index_t)threadIdx.z) * 64;
i6198 = (((nvfuser_index_t)threadIdx.z) * 2) + ((nvfuser_index_t)threadIdx.y);
int i3333;
int i6199;
i3333 = ((((i3303 / 8) * 8) + i3330) * 256) + i3311;
i6199 = i6198 + (((nvfuser_index_t)threadIdx.x) / 32);
int i3318;
int i6221;
i3318 = (i3196 % 2) * 8;
i6221 = i6199 % 8;
int i3520;
int i5800;
i3520 = i3318 + i3197;
i5800 = ((nvfuser_index_t)threadIdx.x) % 32;
int i3523;
int i6200;
i3523 = i3520 % 8;
i6200 = i6199 / 8;
int i3501;
int i6208;
i3501 = ((nvfuser_index_t)threadIdx.y) * 64;
i6208 = (i5800 / 8) * 64;
int i3526;
int i6225;
i3526 = ((((i3520 / 8) * 8) + i3523) * 128) + i3501;
i6225 = i5800 % 8;
int i4604;
int i6228;
i4604 = (i644 + i3501) + ((((nvfuser_index_t)threadIdx.x) % 4) * 2);
i6228 = (((2048 * i6200) + (256 * i6221)) + i6208) + ((i6221 ^ i6225) * 8);
int i4608;
int i7323;
i4608 = (i1361 + i3311) + (((nvfuser_index_t)threadIdx.x) / 4);
i7323 = T0.size[1] * 8;
int i5077;
int i7347;
i5077 = i414 + 7;
i7347 = i7323 * i6200;
int i7348;
i7348 = T0.size[1] * i6221;
int i7335;
i7335 = ((nvfuser_index_t)blockIdx.x) * 256;
int i7352;
i7352 = i6225 * 8;
int i7353;
i7353 = (((i7347 + i7348) + i7335) + i6208) + i7352;
int i7341;
i7341 = T0.size[1] * 32;
int i11742;
i11742 = (((((T1.size[1] * 96) + i3192) + i3193) + i3180) + i1875) + i3197;
int i16008;
i16008 = (((((T0.size[1] * 96) + i7347) + i7348) + i7335) + i6208) + i7352;
int i17482;
i17482 = ((nvfuser_index_t)threadIdx.x) % 8;
int i17724;
i17724 = i17482 % 8;
int i17481;
i17481 = ((nvfuser_index_t)threadIdx.x) / 8;
int i17715;
i17715 = i17482 / 8;
int i17550;
i17550 = i17481 / 2;
int i17829;
i17829 = 2048 * i17550;
int i17830;
i17830 = 2048 * i17715;
int i17832;
i17832 = 256 * i17724;
int i17834;
i17834 = ((nvfuser_index_t)threadIdx.z) * 64;
int i17835;
i17835 = ((i17829 + i17830) + i17832) + i17834;
int i17570;
i17570 = i17481 % 2;
int i18255;
i18255 = 1024 * i17570;
int i18256;
i18256 = 1024 * i17715;
int i18258;
i18258 = 128 * i17724;
int i18246;
i18246 = ((nvfuser_index_t)threadIdx.y) * 64;
int i18260;
i18260 = ((i18255 + i18256) + i18258) + i18246;
int i18918;
i18918 = (((4096 + i17829) + i17830) + i17832) + i17834;
int i19677;
i19677 = (((2048 + i18255) + i18256) + i18258) + i18246;
int i20235;
i20235 = ((nvfuser_index_t)threadIdx.x) / 4;
int i20286;
i20286 = (((nvfuser_index_t)threadIdx.x) % 4) * 2;
int i20301;
i20301 = ((((((T1.size[1] * ((nvfuser_index_t)blockIdx.x)) * 256) + ((T1.size[1] * ((nvfuser_index_t)threadIdx.z)) * 64)) + (T1.size[1] * i20235)) + i3180) + i18246) + i20286;
int i22576;
i22576 = (8 * ((nvfuser_index_t)threadIdx.x)) + 7;
int i22577;
i22577 = i22576 % 128;
int i22578;
i22578 = i22577 / 8;
bool b22625;
b22625 = (((i3180 + ((i22578 / 8) * 64)) + ((i22578 % 8) * 8)) + (i22577 % 8)) < T1.size[1];
int i22627;
i22627 = i1864 + (i22576 / 128);
int i22629;
i22629 = 8 * (i22627 / 8);
int i22630;
i22630 = i22627 % 8;
int i22631;
i22631 = i22629 + i22630;
int i27301;
i27301 = i22576 % 256;
int i27302;
i27302 = i27301 / 8;
bool b27311;
b27311 = (((i7335 + ((i27302 / 8) * 64)) + ((i27302 % 8) * 8)) + (i27301 % 8)) < T0.size[1];
int i27313;
i27313 = i6198 + (i22576 / 256);
int i27315;
i27315 = 8 * (i27313 / 8);
int i27316;
i27316 = i27313 % 8;
int i27317;
i27317 = i27315 + i27316;
int i31979;
i31979 = (96 + i22629) + i22630;
int i36682;
i36682 = (96 + i27315) + i27316;
int i39223;
i39223 = (i7335 + i17834) + i20235;
int i39228;
i39228 = (i3180 + i18246) + i20286;
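// Dynamic shared memory is carved into two circularly buffered operand tiles:
// T7 holds 4 stages of a 128x32 __half tile staged from T1, and T6 holds
// 4 stages of a 256x32 __half tile staged from T0.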
smem_offset = alignBufferSize(smem_offset, 16);
smem_offset = alignBufferSize(smem_offset, 16);
__half* T7 = reinterpret_cast<__half*>(array + smem_offset);
__half* T7 = reinterpret_cast<__half*>(array + smem_offset);
smem_offset += (((((((ceilDiv(((((ceilDiv(32, 8)) * (ceilDiv(8, 1))) * 1) * (((ceilDiv(128, 64)) * (ceilDiv(64, 8))) * 8)), 2048)) * (ceilDiv((ceilDiv((ceilDiv(2048, 8)), 32)), 2))) * 2) * 32) * 8) * 4) * sizeof(__half));
smem_offset += (((((((ceilDiv(((((ceilDiv(32, 8)) * (ceilDiv(8, 1))) * 1) * (((ceilDiv(128, 64)) * (ceilDiv(64, 8))) * 8)), 2048)) * (ceilDiv((ceilDiv((ceilDiv(2048, 8)), 32)), 2))) * 2) * 32) * 8) * 4) * sizeof(__half));
smem_offset = alignBufferSize(smem_offset, 16);
smem_offset = alignBufferSize(smem_offset, 16);
__half* T6 = reinterpret_cast<__half*>(array + smem_offset);
__half* T6 = reinterpret_cast<__half*>(array + smem_offset);
smem_offset += (((((((ceilDiv(((((ceilDiv(32, 8)) * (ceilDiv(8, 1))) * 1) * (((ceilDiv(256, 64)) * (ceilDiv(64, 8))) * 8)), 2048)) * (ceilDiv((ceilDiv((ceilDiv(2048, 8)), 32)), 2))) * 2) * 32) * 8) * 4) * sizeof(__half));
smem_offset += (((((((ceilDiv(((((ceilDiv(32, 8)) * (ceilDiv(8, 1))) * 1) * (((ceilDiv(256, 64)) * (ceilDiv(64, 8))) * 8)), 2048)) * (ceilDiv((ceilDiv((ceilDiv(2048, 8)), 32)), 2))) * 2) * 32) * 8) * 4) * sizeof(__half));
float T5[(((((ceilDiv(64, 16)) * (ceilDiv(64, 16))) * (ceilDiv(16, 8))) * (ceilDiv(16, 8))) * 2)];
float T5[(((((ceilDiv(64, 16)) * (ceilDiv(64, 16))) * (ceilDiv(16, 8))) * (ceilDiv(16, 8))) * 2)];
#pragma unroll
#pragma unroll
for(nvfuser_index_t i166 = 0; i166 < 4; ++i166) {
for(nvfuser_index_t i166 = 0; i166 < 4; ++i166) {
int i249;
int i251;
i249 = 32 * i166;
i251 = 32 * i166;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i167 = 0; i167 < 4; ++i167) {
for(nvfuser_index_t i167 = 0; i167 < 4; ++i167) {
Ampere::initM16N16K16TN<16>(reinterpret_cast<Array<float,8,8>*>(&T5[(i249 + (4 * i167))]));
Ampere::initM16N16K16TN<16>(reinterpret_cast<Array<float,8,8>*>(&T5[(i251 + (4 * i167))]));
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
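// Prologue: asynchronously prefetch the first three K-tiles into T7 (from T1)
// and T6 (from T0), committing one cp.async group per stage; out-of-bounds
// elements are zero-filled.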
#pragma unroll
#pragma unroll
for(nvfuser_index_t i160 = 0; i160 < 3; ++i160) {
for(nvfuser_index_t i160 = 0; i160 < 3; ++i160) {
int i409;
int i1897;
i409 = 4096 * i160;
i1897 = i1896 + (4096 * i160);
int i679;
int i3199;
i679 = 32 * i160;
i3199 = i3198 + (i3186 * i160);
int i1122;
int i6229;
i1122 = 8192 * i160;
i6229 = i6228 + (8192 * i160);
int i7354;
i7354 = i7353 + (i7341 * i160);
int i22613;
i22613 = 32 * i160;
int i22632;
i22632 = i22631 + i22613;
int i27318;
i27318 = i27317 + i22613;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i154 = 0; i154 < 2; ++i154) {
for(nvfuser_index_t i154 = 0; i154 < 2; ++i154) {
int i416;
bool b22635;
i416 = i414 + (2048 * i154);
b22635 = b22625 && ((i22632 + (16 * (i154 + nvfuser_zero))) < T0.size[0]);
int i417;
if ((!b22635)) {
i417 = i416 / 128;
arraySet<__half, 8>(&T7[(i1897 + (2048 * i154))], (__half)0);
int i476;
i476 = i417 % 8;
int i480;
i480 = i416 % 128;
int i481;
i481 = i480 / 8;
int i5078;
i5078 = i5077 + (2048 * (i154 + nvfuser_zero));
int i5079;
i5079 = i5078 % 128;
int i5080;
i5080 = i5079 / 8;
int i5090;
i5090 = i5078 / 128;
bool b5097;
b5097 = ((((i644 + (64 * (i5080 / 8))) + (8 * (i5080 % 8))) + (i5079 % 8)) < T1.size[1]) && (((i679 + (8 * (i5090 / 8))) + (i5090 % 8)) < T0.size[0]);
if ((!b5097)) {
arraySet<__half, 8>(&T7[((((i409 + (128 * ((8 * (i417 / 8)) + i476))) + (64 * (i481 / 8))) + (8 * (i476 ^ (i481 % 8)))) + (i480 % 8))], (__half)0);
}
}
}
}
#pragma unroll
#pragma unroll
for(nvfuser_index_t i154 = 0; i154 < 2; ++i154) {
for(nvfuser_index_t i154 = 0; i154 < 2; ++i154) {
int i650;
int i1912;
i650 = 2048 * (i154 + nvfuser_zero);
i1912 = i154 + nvfuser_zero;
int i703;
Ampere::cpAsync(reinterpret_cast<Array<__half,8,8>*>(&T7[(i1897 + (2048 * i154))]),reinterpret_cast<Array<__half,8,8>*>(&T1[(i3199 + (i3189 * i1912))]),(b22625 && ((i22632 + (16 * i1912)) < T0.size[0])));
i703 = i414 + i650;
int i704;
i704 = i703 % 128;
int i705;
i705 = i704 / 8;
int i714;
i714 = i703 / 128;
int i946;
i946 = i414 + (2048 * i154);
int i947;
i947 = i946 / 128;
int i950;
i950 = i947 % 8;
int i954;
i954 = i946 % 128;
int i955;
i955 = i954 / 8;
int i5695;
i5695 = i5077 + i650;
int i5696;
i5696 = i5695 % 128;
int i5697;
i5697 = i5696 / 8;
int i5707;
i5707 = i5695 / 128;
Ampere::cpAsync(reinterpret_cast<Array<__half,8,8>*>(&T7[((((i409 + (128 * ((8 * (i947 / 8)) + i950))) + (64 * (i955 / 8))) + (8 * (i950 ^ (i955 % 8)))) + (i954 % 8))]),reinterpret_cast<Array<__half,8,8>*>(&T1[((((i644 + (64 * (i705 / 8))) + (8 * (i705 % 8))) + (i704 % 8)) + (T1.size[1] * ((i679 + (8 * (i714 / 8))) + (i714 % 8))))]),(((((i644 + (64 * (i5697 / 8))) + (8 * (i5697 % 8))) + (i5696 % 8)) < T1.size[1]) && (((i679 + (8 * (i5707 / 8))) + (i5707 % 8)) < T0.size[0])));
}
}
#pragma unroll
#pragma unroll
for(nvfuser_index_t i143 = 0; i143 < 4; ++i143) {
for(nvfuser_index_t i143 = 0; i143 < 4; ++i143) {
int i1189;
bool b27321;
i1189 = i414 + (2048 * i143);
b27321 = b27311 && ((i27318 + (8 * (i143 + nvfuser_zero))) < T0.size[0]);
int i1190;
if ((!b27321)) {
i1190 = i1189 / 256;
arraySet<__half, 8>(&T6[(i6229 + (2048 * i143))], (__half)0);
int i1193;
i1193 = i1190 % 8;
int i1197;
i1197 = i1189 % 256;
int i1198;
i1198 = i1197 / 8;
int i6178;
i6178 = i5077 + (2048 * (i143 + nvfuser_zero));
int i6179;
i6179 = i6178 % 256;
int i6180;
i6180 = i6179 / 8;
int i6190;
i6190 = i6178 / 256;
bool b6197;
b6197 = ((((i1361 + (64 * (i6180 / 8))) + (8 * (i6180 % 8))) + (i6179 % 8)) < T0.size[1]) && (((i679 + (8 * (i6190 / 8))) + (i6190 % 8)) < T0.size[0]);
if ((!b6197)) {
arraySet<__half, 8>(&T6[((((i1122 + (256 * ((8 * (i1190 / 8)) + i1193))) + (64 * (i1198 / 8))) + (8 * (i1193 ^ (i1198 % 8)))) + (i1197 % 8))], (__half)0);
}
}
}
}
#pragma unroll
#pragma unroll
for(nvfuser_index_t i143 = 0; i143 < 4; ++i143) {
for(nvfuser_index_t i143 = 0; i143 < 4; ++i143) {
int i1367;
int i6244;
i1367 = 2048 * (i143 + nvfuser_zero);
i6244 = i143 + nvfuser_zero;
int i1420;
Ampere::cpAsync(reinterpret_cast<Array<__half,8,8>*>(&T6[(i6229 + (2048 * i143))]),reinterpret_cast<Array<__half,8,8>*>(&T0[(i7354 + (i7323 * i6244))]),(b27311 && ((i27318 + (8 * i6244)) < T0.size[0])));
i1420 = i414 + i1367;
int i1421;
i1421 = i1420 % 256;
int i1422;
i1422 = i1421 / 8;
int i1431;
i1431 = i1420 / 256;
int i1663;
i1663 = i414 + (2048 * i143);
int i1664;
i1664 = i1663 / 256;
int i1667;
i1667 = i1664 % 8;
int i1671;
i1671 = i1663 % 256;
int i1672;
i1672 = i1671 / 8;
int i6795;
i6795 = i5077 + i1367;
int i6796;
i6796 = i6795 % 256;
int i6797;
i6797 = i6796 / 8;
int i6807;
i6807 = i6795 / 256;
Ampere::cpAsync(reinterpret_cast<Array<__half,8,8>*>(&T6[((((i1122 + (256 * ((8 * (i1664 / 8)) + i1667))) + (64 * (i1672 / 8))) + (8 * (i1667 ^ (i1672 % 8)))) + (i1671 % 8))]),reinterpret_cast<Array<__half,8,8>*>(&T0[((((i1361 + (64 * (i1422 / 8))) + (8 * (i1422 % 8))) + (i1421 % 8)) + (T0.size[1] * ((i679 + (8 * (i1431 / 8))) + (i1431 % 8))))]),(((((i1361 + (64 * (i6797 / 8))) + (8 * (i6797 % 8))) + (i6796 % 8)) < T0.size[1]) && (((i679 + (8 * (i6807 / 8))) + (i6807 % 8)) < T0.size[0])));
}
}
Ampere::cpAsyncCommit();
Ampere::cpAsyncCommit();
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
Ampere::cpAsyncPartialBarrier<2>();
Ampere::cpAsyncPartialBarrier<2>();
__barrier_sync(0);
__barrier_sync(0);
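// Main K-loop: each iteration prefetches the K-tile three stages ahead into the
// 4-deep circular buffers (stage (3 + i161) % 4), loads the current stage
// (i161 % 4) from shared memory with ldmatrix, and accumulates into the T5
// register tile with the Ampere::M16N16K16TN mma calls.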
#pragma unroll 1
#pragma unroll 1
for(nvfuser_index_t i161 = 0; i161 < (ceilDiv(T0.size[0], 32)); ++i161) {
for(nvfuser_index_t i161 = 0; i161 < (ceilDiv(T0.size[0], 32)); ++i161) {
int i1846;
int i10426;
i1846 = 3 + i161;
i10426 = (3 + i161) % 4;
int i1847;
int i10431;
i1847 = i1846 % 4;
i10431 = i1896 + (4096 * i10426);
int i1848;
int i11744;
i1848 = 4096 * i1847;
i11744 = i11742 + (i3186 * i161);
int i2162;
int i14873;
i2162 = 32 * i1846;
i14873 = i6228 + (8192 * i10426);
int i2656;
int i16010;
i2656 = 8192 * i1847;
i16010 = i16008 + (i7341 * i161);
int i3259;
int i17544;
i3259 = i161 % 4;
i17544 = i161 % 4;
int i3313;
int i17836;
i3313 = 8192 * i3259;
i17836 = 8192 * i17544;
int i3334;
int i17837;
i3334 = i3333 + i3313;
i17837 = i17835 + i17836;
int i3527;
int i18261;
i3527 = 4096 * i3259;
i18261 = 4096 * i17544;
int i3528;
int i18262;
i3528 = i3526 + i3527;
i18262 = i18260 + i18261;
int i3743;
int i18919;
i3743 = i3311 + i3313;
i18919 = i18918 + i17836;
int i4054;
int i19678;
i4054 = i3501 + i3527;
i19678 = i19677 + i18261;
int i31972;
i31972 = 32 * i161;
int i31980;
i31980 = i31979 + i31972;
int i36683;
i36683 = i36682 + i31972;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i154 = 0; i154 < 2; ++i154) {
for(nvfuser_index_t i154 = 0; i154 < 2; ++i154) {
int i1915;
bool b31983;
i1915 = i414 + (2048 * i154);
b31983 = b22625 && ((i31980 + (16 * (i154 + nvfuser_zero))) < T0.size[0]);
int i1916;
if ((!b31983)) {
i1916 = i1915 / 128;
arraySet<__half, 8>(&T7[(i10431 + (2048 * i154))], (__half)0);
int i1919;
i1919 = i1916 % 8;
int i1923;
i1923 = i1915 % 128;
int i1924;
i1924 = i1923 / 8;
int i7288;
i7288 = i5077 + (2048 * (i154 + nvfuser_zero));
int i7289;
i7289 = i7288 % 128;
int i7290;
i7290 = i7289 / 8;
int i7300;
i7300 = i7288 / 128;
bool b7307;
b7307 = ((((i644 + (64 * (i7290 / 8))) + (8 * (i7290 % 8))) + (i7289 % 8)) < T1.size[1]) && (((i2162 + (8 * (i7300 / 8))) + (i7300 % 8)) < T0.size[0]);
if ((!b7307)) {
arraySet<__half, 8>(&T7[((((i1848 + (128 * ((8 * (i1916 / 8)) + i1919))) + (64 * (i1924 / 8))) + (8 * (i1919 ^ (i1924 % 8)))) + (i1923 % 8))], (__half)0);
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
#pragma unroll
for(nvfuser_index_t i154 = 0; i154 < 2; ++i154) {
for(nvfuser_index_t i154 = 0; i154 < 2; ++i154) {
int i2097;
int i10446;
i2097 = 2048 * (i154 + nvfuser_zero);
i10446 = i154 + nvfuser_zero;
int i2151;
Ampere::cpAsync(reinterpret_cast<Array<__half,8,8>*>(&T7[(i10431 + (2048 * i154))]),reinterpret_cast<Array<__half,8,8>*>(&T1[(i11744 + (i3189 * i10446))]),(b22625 && ((i31980 + (16 * i10446)) < T0.size[0])));
i2151 = i414 + i2097;
int i2152;
i2152 = i2151 % 128;
int i2153;
i2153 = i2152 / 8;
int i2163;
i2163 = i2151 / 128;
int i2404;
i2404 = i414 + (2048 * i154);
int i2405;
i2405 = i2404 / 128;
int i2408;
i2408 = i2405 % 8;
int i2412;
i2412 = i2404 % 128;
int i2413;
i2413 = i2412 / 8;
int i7917;
i7917 = i5077 + i2097;
int i7918;
i7918 = i7917 % 128;
int i7919;
i7919 = i7918 / 8;
int i7929;
i7929 = i7917 / 128;
Ampere::cpAsync(reinterpret_cast<Array<__half,8,8>*>(&T7[((((i1848 + (128 * ((8 * (i2405 / 8)) + i2408))) + (64 * (i2413 / 8))) + (8 * (i2408 ^ (i2413 % 8)))) + (i2412 % 8))]),reinterpret_cast<Array<__half,8,8>*>(&T1[((((i644 + (64 * (i2153 / 8))) + (8 * (i2153 % 8))) + (i2152 % 8)) + (T1.size[1] * ((i2162 + (8 * (i2163 / 8))) + (i2163 % 8))))]),(((((i644 + (64 * (i7919 / 8))) + (8 * (i7919 % 8))) + (i7918 % 8)) < T1.size[1]) && (((i2162 + (8 * (i7929 / 8))) + (i7929 % 8)) < T0.size[0])));
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
#pragma unroll
for(nvfuser_index_t i143 = 0; i143 < 4; ++i143) {
for(nvfuser_index_t i143 = 0; i143 < 4; ++i143) {
int i2657;
bool b36686;
i2657 = i414 + (2048 * i143);
b36686 = b27311 && ((i36683 + (8 * (i143 + nvfuser_zero))) < T0.size[0]);
int i2658;
if ((!b36686)) {
i2658 = i2657 / 256;
arraySet<__half, 8>(&T6[(i14873 + (2048 * i143))], (__half)0);
int i2661;
i2661 = i2658 % 8;
int i2665;
i2665 = i2657 % 256;
int i2666;
i2666 = i2665 / 8;
int i8410;
i8410 = i5077 + (2048 * (i143 + nvfuser_zero));
int i8411;
i8411 = i8410 % 256;
int i8412;
i8412 = i8411 / 8;
int i8422;
i8422 = i8410 / 256;
bool b8429;
b8429 = ((((i1361 + (64 * (i8412 / 8))) + (8 * (i8412 % 8))) + (i8411 % 8)) < T0.size[1]) && (((i2162 + (8 * (i8422 / 8))) + (i8422 % 8)) < T0.size[0]);
if ((!b8429)) {
arraySet<__half, 8>(&T6[((((i2656 + (256 * ((8 * (i2658 / 8)) + i2661))) + (64 * (i2666 / 8))) + (8 * (i2661 ^ (i2666 % 8)))) + (i2665 % 8))], (__half)0);
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
#pragma unroll
for(nvfuser_index_t i143 = 0; i143 < 4; ++i143) {
for(nvfuser_index_t i143 = 0; i143 < 4; ++i143) {
int i2839;
int i14888;
i2839 = 2048 * (i143 + nvfuser_zero);
i14888 = i143 + nvfuser_zero;
int i2893;
Ampere::cpAsync(reinterpret_cast<Array<__half,8,8>*>(&T6[(i14873 + (2048 * i143))]),reinterpret_cast<Array<__half,8,8>*>(&T0[(i16010 + (i7323 * i14888))]),(b27311 && ((i36683 + (8 * i14888)) < T0.size[0])));
i2893 = i414 + i2839;
int i2894;
i2894 = i2893 % 256;
int i2895;
i2895 = i2894 / 8;
int i2904;
i2904 = i2893 / 256;
int i3145;
i3145 = i414 + (2048 * i143);
int i3146;
i3146 = i3145 / 256;
int i3149;
i3149 = i3146 % 8;
int i3153;
i3153 = i3145 % 256;
int i3154;
i3154 = i3153 / 8;
int i9039;
i9039 = i5077 + i2839;
int i9040;
i9040 = i9039 % 256;
int i9041;
i9041 = i9040 / 8;
int i9051;
i9051 = i9039 / 256;
Ampere::cpAsync(reinterpret_cast<Array<__half,8,8>*>(&T6[((((i2656 + (256 * ((8 * (i3146 / 8)) + i3149))) + (64 * (i3154 / 8))) + (8 * (i3149 ^ (i3154 % 8)))) + (i3153 % 8))]),reinterpret_cast<Array<__half,8,8>*>(&T0[((((i1361 + (64 * (i2895 / 8))) + (8 * (i2895 % 8))) + (i2894 % 8)) + (T0.size[1] * ((i2162 + (8 * (i2904 / 8))) + (i2904 % 8))))]),(((((i1361 + (64 * (i9041 / 8))) + (8 * (i9041 % 8))) + (i9040 % 8)) < T0.size[1]) && (((i2162 + (8 * (i9051 / 8))) + (i9051 % 8)) < T0.size[0])));
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
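// T8 and T9 are the register fragments filled from the T6 and T7 shared-memory
// tiles by ldmatrix and fed, via the T2/T3 operand buffers, to the mma calls.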
Array<__half, (((ceilDiv(64, 16)) * 8) * 2), 8> T8;
Array<__half, (((ceilDiv(64, 16)) * 8) * 2), 8> T8;
Array<__half, (((ceilDiv(64, 16)) * 8) * 2), 8> T9;
Array<__half, (((ceilDiv(64, 16)) * 8) * 2), 8> T9;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i145 = 0; i145 < 4; ++i145) {
for(nvfuser_index_t i145 = 0; i145 < 4; ++i145) {
int i3320;
Turing::ldMatrixT (*reinterpret_cast<Array<__half,8,8>*>(&T8[(8 * i145)]),&T6[(i17837 + (8 * (i17724 ^ (i17570 + (2 * i145)))))]);
i3320 = i3318 + (16 * i145);
Turing::ldMatrixT (*reinterpret_cast<Array<__half,8,8>*>(&T8[(8 * i145)]),&T6[((i3334 + (8 * (i3330 ^ (i3320 / 8)))) + (i3320 % 8))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
#pragma unroll
for(nvfuser_index_t i156 = 0; i156 < 4; ++i156) {
for(nvfuser_index_t i156 = 0; i156 < 4; ++i156) {
int i3529;
Turing::ldMatrixT (*reinterpret_cast<Array<__half,8,8>*>(&T9[(8 * i156)]),&T7[(i18262 + (8 * (i17724 ^ (i17550 + (2 * i156)))))]);
i3529 = i3302 + (16 * i156);
Turing::ldMatrixT (*reinterpret_cast<Array<__half,8,8>*>(&T9[(8 * i156)]),&T7[((i3528 + (8 * (i3523 ^ (i3529 / 8)))) + (i3529 % 8))]);
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
#pragma unroll
#pragma unroll
for(nvfuser_index_t i164 = 0; i164 < 1; ++i164) {
for(nvfuser_index_t i164 = 0; i164 < 1; ++i164) {
int i3711;
int i18920;
i3711 = 1 + i164;
i18920 = i18919 + (4096 * i164);
int i3712;
int i18960;
i3712 = 16 * i3711;
i18960 = 32 * ((1 + i164) % 2);
int i3744;
int i19035;
i3744 = i3303 + i3712;
i19035 = 32 * (i164 % 2);
int i3747;
int i19679;
i3747 = i3744 % 8;
i19679 = i19678 + (2048 * i164);
int i3750;
i3750 = i3743 + (256 * ((8 * (i3744 / 8)) + i3747));
int i3789;
i3789 = 32 * (i3711 % 2);
int i3855;
i3855 = 32 * (i164 % 2);
int i4055;
i4055 = i3520 + i3712;
int i4058;
i4058 = i4055 % 8;
int i4061;
i4061 = i4054 + (128 * ((8 * (i4055 / 8)) + i4058));
#pragma unroll
#pragma unroll
for(nvfuser_index_t i145 = 0; i145 < 4; ++i145) {
for(nvfuser_index_t i145 = 0; i145 < 4; ++i145) {
int i3751;
Turing::ldMatrixT (*reinterpret_cast<Array<__half,8,8>*>(&T8[(i18960 + (8 * i145))]),&T6[(i18920 + (8 * (i17724 ^ (i17570 + (2 * i145)))))]);
i3751 = i3318 + (16 * i145);
Turing::ldMatrixT (*reinterpret_cast<Array<__half,8,8>*>(&T8[(i3789 + (8 * i145))]),&T6[((i3750 + (i3751 % 8)) + (8 * (i3747 ^ (i3751 / 8))))]);
}
}
__half T2[((ceilDiv(64, 16)) * 8)];
__half T2[((ceilDiv(64, 16)) * 8)];
#pragma unroll
#pragma unroll
for(nvfuser_index_t i147 = 0; i147 < 4; ++i147) {
for(nvfuser_index_t i147 = 0; i147 < 4; ++i147) {
int i3808;
int i18982;
i3808 = 8 * i147;
i18982 = 8 * i147;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i149 = 0; i149 < 8; ++i149) {
for(nvfuser_index_t i149 = 0; i149 < 8; ++i149) {
T2[(i3808 + i149)] = 0.00000000000000000e+00;
T2[(i18982 + i149)] = 0.00000000000000000e+00;
}
}
}
}
#pragma unroll
#pragma unroll
for(nvfuser_index_t i147 = 0; i147 < 4; ++i147) {
for(nvfuser_index_t i147 = 0; i147 < 4; ++i147) {
int i3827;
int i19003;
i3827 = 8 * i147;
i19003 = 8 * i147;
int i3859;
int i19039;
i3859 = i3855 + i3827;
i19039 = i19035 + i19003;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i149 = 0; i149 < 8; ++i149) {
for(nvfuser_index_t i149 = 0; i149 < 8; ++i149) {
T2[(i3827 + i149)]
T2[(i19003 + i149)]
= T8[(i3859 + i149)];
= T8[(i19039 + i149)];
}
}
}
}
#pragma unroll
#pragma unroll
for(nvfuser_index_t i156 = 0; i156 < 4; ++i156) {
for(nvfuser_index_t i156 = 0; i156 < 4; ++i156) {
int i4062;
Turing::ldMatrixT (*reinterpret_cast<Array<__half,8,8>*>(&T9[(i18960 + (8 * i156))]),&T7[(i19679 + (8 * (i17724 ^ (i17550 + (2 * i156)))))]);
i4062 = i3302 + (16 * i156);
Turing::ldMatrixT (*reinterpret_cast<Array<__half,8,8>*>(&T9[(i3789 + (8 * i156))]),&T7[((i4061 + (i4062 % 8)) + (8 * (i4058 ^ (i4062 / 8))))]);
}
}
__half T3[((ceilDiv(64, 16)) * 8)];
__half T3[((ceilDiv(64, 16)) * 8)];
#pragma unroll
#pragma unroll
for(nvfuser_index_t i158 = 0; i158 < 4; ++i158) {
for(nvfuser_index_t i158 = 0; i158 < 4; ++i158) {
int i4117;
int i19742;
i4117 = 8 * i158;
i19742 = 8 * i158;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i159 = 0; i159 < 8; ++i159) {
for(nvfuser_index_t i159 = 0; i159 < 8; ++i159) {
T3[(i4117 + i159)] = 0.00000000000000000e+00;
T3[(i19742 + i159)] = 0.00000000000000000e+00;
}
}
}
}
#pragma unroll
#pragma unroll
for(nvfuser_index_t i158 = 0; i158 < 4; ++i158) {
for(nvfuser_index_t i158 = 0; i158 < 4; ++i158) {
int i4136;
int i19763;
i4136 = 8 * i158;
i19763 = 8 * i158;
int i4168;
int i19799;
i4168 = i3855 + i4136;
i19799 = i19035 + i19763;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i159 = 0; i159 < 8; ++i159) {
for(nvfuser_index_t i159 = 0; i159 < 8; ++i159) {
T3[(i4136 + i159)]
T3[(i19763 + i159)]
= T9[(i4168 + i159)];
= T9[(i19799 + i159)];
}
}
}
}
#pragma unroll
#pragma unroll
for(nvfuser_index_t i166 = 0; i166 < 4; ++i166) {
for(nvfuser_index_t i166 = 0; i166 < 4; ++i166) {
int i4226;
int i19866;
i4226 = 32 * i166;
i19866 = 32 * i166;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i167 = 0; i167 < 4; ++i167) {
for(nvfuser_index_t i167 = 0; i167 < 4; ++i167) {
Ampere::M16N16K16TN<16>(
Ampere::M16N16K16TN<16>(
reinterpret_cast<Array<float,8,8>*>(&T5[(i4226 + (4 * i167))]),
reinterpret_cast<Array<float,8,8>*>(&T5[(i19866 + (4 * i167))]),
&(reinterpret_cast<Array<__half,8,8>*>(&T2)[i166]),
&(reinterpret_cast<Array<__half,8,8>*>(&T2)[i166]),
&(reinterpret_cast<Array<__half,8,8>*>(&T3)[i167]));
&(reinterpret_cast<Array<__half,8,8>*>(&T3)[i167]));
}
}
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
__half T2[((ceilDiv(64, 16)) * 8)];
__half T2[((ceilDiv(64, 16)) * 8)];
#pragma unroll
#pragma unroll
for(nvfuser_index_t i147 = 0; i147 < 4; ++i147) {
for(nvfuser_index_t i147 = 0; i147 < 4; ++i147) {
int i4246;
int i19887;
i4246 = 8 * i147;
i19887 = 8 * i147;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i149 = 0; i149 < 8; ++i149) {
for(nvfuser_index_t i149 = 0; i149 < 8; ++i149) {
T2[(i4246 + i149)] = 0.00000000000000000e+00;
T2[(i19887 + i149)] = 0.00000000000000000e+00;
}
}
}
}
#pragma unroll
#pragma unroll
for(nvfuser_index_t i147 = 0; i147 < 4; ++i147) {
for(nvfuser_index_t i147 = 0; i147 < 4; ++i147) {
int i4265;
int i19907;
i4265 = 8 * i147;
i19907 = 8 * i147;
int i4295;
int i19938;
i4295 = 32 + i4265;
i19938 = 32 + i19907;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i149 = 0; i149 < 8; ++i149) {
for(nvfuser_index_t i149 = 0; i149 < 8; ++i149) {
T2[(i4265 + i149)]
T2[(i19907 + i149)]
= T8[(i4295 + i149)];
= T8[(i19938 + i149)];
}
}
}
}
__half T3[((ceilDiv(64, 16)) * 8)];
__half T3[((ceilDiv(64, 16)) * 8)];
#pragma unroll
#pragma unroll
for(nvfuser_index_t i158 = 0; i158 < 4; ++i158) {
for(nvfuser_index_t i158 = 0; i158 < 4; ++i158) {
int i4314;
int i19958;
i4314 = 8 * i158;
i19958 = 8 * i158;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i159 = 0; i159 < 8; ++i159) {
for(nvfuser_index_t i159 = 0; i159 < 8; ++i159) {
T3[(i4314 + i159)] = 0.00000000000000000e+00;
T3[(i19958 + i159)] = 0.00000000000000000e+00;
}
}
}
}
#pragma unroll
#pragma unroll
for(nvfuser_index_t i158 = 0; i158 < 4; ++i158) {
for(nvfuser_index_t i158 = 0; i158 < 4; ++i158) {
int i4333;
int i19978;
i4333 = 8 * i158;
i19978 = 8 * i158;
int i4363;
int i20009;
i4363 = 32 + i4333;
i20009 = 32 + i19978;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i159 = 0; i159 < 8; ++i159) {
for(nvfuser_index_t i159 = 0; i159 < 8; ++i159) {
T3[(i4333 + i159)]
T3[(i19978 + i159)]
= T9[(i4363 + i159)];
= T9[(i20009 + i159)];
}
}
}
}
#pragma unroll
#pragma unroll
for(nvfuser_index_t i166 = 0; i166 < 4; ++i166) {
for(nvfuser_index_t i166 = 0; i166 < 4; ++i166) {
int i4421;
int i20073;
i4421 = 32 * i166;
i20073 = 32 * i166;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i167 = 0; i167 < 4; ++i167) {
for(nvfuser_index_t i167 = 0; i167 < 4; ++i167) {
Ampere::M16N16K16TN<16>(
Ampere::M16N16K16TN<16>(
reinterpret_cast<Array<float,8,8>*>(&T5[(i4421 + (4 * i167))]),
reinterpret_cast<Array<float,8,8>*>(&T5[(i20073 + (4 * i167))]),
&(reinterpret_cast<Array<__half,8,8>*>(&T2)[i166]),
&(reinterpret_cast<Array<__half,8,8>*>(&T2)[i166]),
&(reinterpret_cast<Array<__half,8,8>*>(&T3)[i167]));
&(reinterpret_cast<Array<__half,8,8>*>(&T3)[i167]));
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
Ampere::cpAsyncPartialBarrier<2>();
Ampere::cpAsyncPartialBarrier<2>();
__barrier_sync(0);
__barrier_sync(0);
Ampere::cpAsyncCommit();
Ampere::cpAsyncCommit();
}
}
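// Epilogue: write the accumulated T5 fragments to the global output T4,
// element by element, with bounds predicates on both output dimensions.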
#pragma unroll
#pragma unroll
for(nvfuser_index_t i172 = 0; i172 < 4; ++i172) {
for(nvfuser_index_t i172 = 0; i172 < 4; ++i172) {
int i4475;
int i20139;
i4475 = 2 * i172;
i20139 = 32 * i172;
int i4609;
int i20303;
i4609 = i4608 + (16 * i172);
i20303 = i20301 + (i3189 * i172);
int i39224;
i39224 = i39223 + (16 * i172);
#pragma unroll
#pragma unroll
for(nvfuser_index_t i173 = 0; i173 < 4; ++i173) {
for(nvfuser_index_t i173 = 0; i173 < 4; ++i173) {
int i4472;
int i20141;
i4472 = 4 * i173;
i20141 = i20139 + (4 * i173);
int i4605;
int i20291;
i4605 = i4604 + (16 * i173);
i20291 = 16 * i173;
int i20304;
i20304 = i20303 + i20291;
int i39229;
i39229 = i39228 + i20291;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i174 = 0; i174 < 2; ++i174) {
for(nvfuser_index_t i174 = 0; i174 < 2; ++i174) {
int i4474;
int i20143;
i4474 = i4472 + (2 * i174);
i20143 = i20141 + (2 * i174);
int i4606;
int i20293;
i4606 = i4605 + (8 * i174);
i20293 = 8 * i174;
int i20305;
i20305 = i20304 + i20293;
int i39230;
i39230 = i39229 + i20293;
#pragma unroll
#pragma unroll
for(nvfuser_index_t i175 = 0; i175 < 2; ++i175) {
for(nvfuser_index_t i175 = 0; i175 < 2; ++i175) {
int i4478;
int i20145;
i4478 = i4474 + (16 * (i4475 + i175));
i20145 = i20143 + (16 * i175);
int i4612;
int i20307;
i4612 = i4606 + (T1.size[1] * (i4609 + (8 * i175)));
i20307 = i20305 + (i3166 * i175);
bool b9214;
bool b39226;
b9214 = (i4609 + (8 * (i175 + nvfuser_zero))) < T0.size[1];
b39226 = (i39224 + (8 * (i175 + nvfuser_zero))) < T0.size[1];
#pragma unroll
#pragma unroll
for(nvfuser_index_t i176 = 0; i176 < 2; ++i176) {
for(nvfuser_index_t i176 = 0; i176 < 2; ++i176) {
int i4480;
int i20147;
i4480 = i176 + nvfuser_zero;
i20147 = i176 + nvfuser_zero;
if ((b9214 && ((i4606 + i4480) < T1.size[1]))) {
if ((b39226 && ((i39230 + i20147) < T1.size[1]))) {
T4[(i4612 + i4480)]
T4[(i20307 + i20147)]
= T5[(i4478 + i176)];
= T5[(i20145 + i176)];
}
}
}
}
}
}
}
}
}
}
}
}
NVFUSER_UPDATE_MAGIC_ZERO
NVFUSER_UPDATE_MAGIC_ZERO
}
}
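For reference, both versions drive the same asynchronous-copy pipeline through the nvFuser runtime helpers Ampere::cpAsync, Ampere::cpAsyncCommit and Ampere::cpAsyncPartialBarrier<2>: three stages are prefetched in the prologue, and the main loop cycles through a 4-deep circular buffer via (3 + i161) % 4. The sketch below reproduces only that staging pattern using the standard CUDA pipeline primitives; the kernel name, tile size, single-operand load loop and the compute placeholder are illustrative assumptions, not the generated kernel.

#include <cuda_fp16.h>
#include <cuda_pipeline.h>

constexpr int kStages    = 4;        // circular shared-memory buffers per operand
constexpr int kTileElems = 256 * 32; // one K-tile of one operand, in __half elements

__global__ void cp_async_pipeline_sketch(const __half* gOperand, int kTiles) {
  extern __shared__ __half smem[];   // kStages * kTileElems halves

  // Prologue: asynchronously prefetch the first kStages - 1 tiles, one commit
  // group per stage (the generated "i160 < 3" loop).
  for (int s = 0; s < kStages - 1; ++s) {
    for (int e = threadIdx.x * 8; e < kTileElems; e += blockDim.x * 8) {
      // 16-byte (8 x __half) global -> shared asynchronous copy, i.e. cp.async.
      __pipeline_memcpy_async(&smem[s * kTileElems + e],
                              &gOperand[s * kTileElems + e], 16);
    }
    __pipeline_commit();
  }
  __pipeline_wait_prior(kStages - 2); // stage 0 has landed
  __syncthreads();

  for (int k = 0; k < kTiles; ++k) {
    // Prefetch the tile kStages - 1 iterations ahead into the circular buffer.
    int ahead = k + kStages - 1;
    if (ahead < kTiles) {
      int next = ahead % kStages;
      for (int e = threadIdx.x * 8; e < kTileElems; e += blockDim.x * 8) {
        __pipeline_memcpy_async(&smem[next * kTileElems + e],
                                &gOperand[ahead * kTileElems + e], 16);
      }
    }
    __pipeline_commit(); // one group per iteration, even if nothing was issued

    // ... ldmatrix + mma on smem[(k % kStages) * kTileElems + ...] goes here ...

    // Analogue of cpAsyncPartialBarrier<2> followed by __barrier_sync(0): block
    // until all but the two most recent groups are complete, so the next
    // iteration's stage is resident and the stage just consumed can be reused.
    __pipeline_wait_prior(kStages - 2);
    __syncthreads();
  }
}

With kStages = 4 and a prefetch distance of 3, __pipeline_wait_prior(kStages - 2) plays the role of cpAsyncPartialBarrier<2>: by the time it returns, the commit group that filled the stage the next iteration will read has completed, while the two most recent prefetches may still be in flight.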