Fwd 02/09 vs. 02/14

Diffchecker result: 383 lines compared, 0 additions, 0 removals (the 02/09 and 02/14 files are identical).
# AOT ID: ['0_forward']
from ctypes import c_void_p, c_long, c_int
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from cmath import nanj
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align
from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall
import triton
import triton.language as tl
from torch._inductor.runtime.triton_heuristics import (
    grid,
    split_scan_grid,
    grid_combo_kernels,
    start_graph,
    end_graph,
    cooperative_reduction_grid,
)
from torch._C import _cuda_getCurrentRawStream as get_raw_stream
import torch._inductor.kernel.flex_attention


aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
empty_strided_xpu = torch._C._dynamo.guards._empty_strided_xpu
reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
alloc_from_pool = torch.ops.inductor._alloc_from_pool
async_compile = AsyncCompile()
empty_strided_p2p = torch._C._distributed_c10d._SymmetricMemory.empty_strided_p2p




# kernel path: /tmp/torchinductor_root/py/cpy464bp6ukc5g2wp7hq2jbqpvoi3yp2gkjew2p7up4q2t7wzimf.py
# Topologically Sorted Source Nodes: [embedding_3, x, mul, mul_1, x_1], Original ATen: [aten.embedding, aten._to_copy, aten.pow, aten.mean, aten.add, aten.rsqrt, aten.mul]
# Source node to ATen node mapping:
#   embedding_3 => embedding_3
#   mul => mul_1
#   mul_1 => mul_2
#   x => add, convert_element_type_10, convert_element_type_11, mean, mul, pow_1, rsqrt
#   x_1 => add_1
# Graph fragment:
#   %embedding_3 : [num_users=2] = call_function[target=torch.ops.aten.embedding.default](args = (%primals_6, %primals_1), kwargs = {})
#   %convert_element_type_10 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%unsqueeze_42, torch.float32), kwargs = {})
#   %pow_1 : [num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type_10, 2), kwargs = {})
#   %mean : [num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_1, [2], True), kwargs = {})
#   %add : [num_users=1] = call_function[target=torch.ops.aten.add.Scalar](args = (%mean, 1.1920928955078125e-07), kwargs = {})
#   %rsqrt : [num_users=2] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add,), kwargs = {})
#   %mul : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_10, %rsqrt), kwargs = {})
#   %convert_element_type_11 : [num_users=17] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul, torch.bfloat16), kwargs = {})
#   %mul_1 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%select_4, %convert_element_type_11), kwargs = {})
#   %mul_2 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%select_5, %convert_element_type_11), kwargs = {})
#   %add_1 : [num_users=2] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_1, %mul_2), kwargs = {})
triton_per_fused__to_copy_add_embedding_mean_mul_pow_rsqrt_0 = async_compile.triton('triton_per_fused__to_copy_add_embedding_mean_mul_pow_rsqrt_0', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()

@triton_heuristics.persistent_reduction(
    size_hints={'x': 65536, 'r0_': 1024},
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*i32', 'in_ptr1': '*bf16', 'in_ptr2': '*fp32', 'out_ptr0': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6, 7), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
    inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_per_fused__to_copy_add_embedding_mean_mul_pow_rsqrt_0', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': False, 'no_x_dim': True, 'num_load': 3, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
)
@triton.jit
def triton_per_fused__to_copy_add_embedding_mean_mul_pow_rsqrt_0(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, out_ptr0, out_ptr1, xnumel, r0_numel):
    xnumel = 65536
    XBLOCK: tl.constexpr = 1
    r0_numel = 1024
    R0_BLOCK: tl.constexpr = 1024
    rnumel = r0_numel
    RBLOCK: tl.constexpr = R0_BLOCK
    xoffset = tl.program_id(0) * XBLOCK
    xindex = tl.full([1], xoffset, tl.int32)
    xmask = tl.full([R0_BLOCK], True, tl.int1)
    r0_index = tl.arange(0, R0_BLOCK)[:]
    r0_offset = 0
    r0_mask = tl.full([R0_BLOCK], True, tl.int1)
    roffset = r0_offset
    rindex = r0_index
    x0 = xindex
    r0_1 = r0_index
    tmp0 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last')
    tmp17 = tl.load(in_ptr2 + (16))
    tmp18 = tl.broadcast_to(tmp17, [R0_BLOCK])
    tmp23 = tl.load(in_ptr2 + (17))
    tmp24 = tl.broadcast_to(tmp23, [R0_BLOCK])
    tmp1 = tl.full([R0_BLOCK], 50257, tl.int32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp0 < 0
    tmp4 = tl.where(tmp3, tmp2, tmp0)
    tl.device_assert((0 <= tmp4) & (tmp4 < 50257), "index out of bounds: 0 <= tmp4 < 50257")
    tmp6 = tl.load(in_ptr1 + (r0_1 + 1024*tmp4), None).to(tl.float32)
    tmp7 = tmp6.to(tl.float32)
    tmp8 = tmp7 * tmp7
    tmp9 = tl.broadcast_to(tmp8, [R0_BLOCK])
    tmp11 = triton_helpers.promote_to_tensor(tl.sum(tmp9, 0))
    tmp12 = 1024.0
    tmp13 = tmp11 / tmp12
    tmp14 = 1.1920928955078125e-07
    tmp15 = tmp13 + tmp14
    tmp16 = libdevice.rsqrt(tmp15)
    tmp19 = tmp18.to(tl.float32)
    tmp20 = tmp7 * tmp16
    tmp21 = tmp20.to(tl.float32)
    tmp22 = tmp19 * tmp21
    tmp25 = tmp24.to(tl.float32)
    tmp26 = tmp25 * tmp21
    tmp27 = tmp22 + tmp26
    tl.store(out_ptr0 + (r0_1 + 1024*x0), tmp6, None)
    tl.debug_barrier()
    tl.store(in_out_ptr0 + (x0), tmp16, None)
    tl.store(out_ptr1 + (r0_1 + 1024*x0), tmp27, None)
''', device_str='cuda')
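
# --- Illustrative reference (editor-added sketch; not part of the generated Inductor output) ---
# The fused kernel above performs an embedding lookup followed by an RMSNorm-style
# rescale: square-mean over the 1024-wide channel dim, add eps, rsqrt, then scale the
# normalized bf16 activations by two learned gains read from a small parameter vector
# (indices 16 and 17 of in_ptr2). A hedged eager-mode approximation; the argument
# names below are hypothetical stand-ins for the wrapper's primals.
def _reference_embed_rmsnorm_sketch(idx, emb_weight, gains, eps=1.1920928955078125e-07):
    x = emb_weight[idx].float()                                # aten.embedding + _to_copy
    rms = torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + eps)   # mean -> add -> rsqrt
    xn = (x * rms).to(torch.bfloat16)                          # normalized activations (x)
    return gains[16] * xn + gains[17] * xn                     # mul + mul_1 -> x_1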




# kernel path: /tmp/torchinductor_root/ur/curpusl7dnnsyt2hqus4f27u67wivplzuyflwhcwb2vy5hcbp3bi.py
# Topologically Sorted Source Nodes: [embedding, v_1, mul_10, mul_11, v_2], Original ATen: [aten.embedding, aten._to_copy, aten.pow, aten.mean, aten.add, aten.rsqrt, aten.mul]
# Source node to ATen node mapping:
#   embedding => embedding
#   mul_10 => mul_14
#   mul_11 => mul_15
#   v_1 => add_8, convert_element_type_22, convert_element_type_23, mean_3, mul_13, pow_4, rsqrt_3
#   v_2 => add_9
# Graph fragment:
#   %embedding : [num_users=2] = call_function[target=torch.ops.aten.embedding.default](args = (%primals_2, %primals_1), kwargs = {})
#   %convert_element_type_22 : [num_users=2] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%getitem_14, torch.float32), kwargs = {})
#   %pow_4 : [num_users=1] = call_function[target=torch.ops.aten.pow.Tensor_Scalar](args = (%convert_element_type_22, 2), kwargs = {})
#   %mean_3 : [num_users=1] = call_function[target=torch.ops.aten.mean.dim](args = (%pow_4, [3], True), kwargs = {})
#   %add_8 : [num_users=1] = call_function[target=torch.ops.aten.add.Scalar](args = (%mean_3, 1.1920928955078125e-07), kwargs = {})
#   %rsqrt_3 : [num_users=2] = call_function[target=torch.ops.aten.rsqrt.default](args = (%add_8,), kwargs = {})
#   %mul_13 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%convert_element_type_22, %rsqrt_3), kwargs = {})
#   %convert_element_type_23 : [num_users=1] = call_function[target=torch.ops.prims.convert_element_type.default](args = (%mul_13, torch.bfloat16), kwargs = {})
#   %mul_14 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%select_6, %convert_element_type_23), kwargs = {})
#   %mul_15 : [num_users=1] = call_function[target=torch.ops.aten.mul.Tensor](args = (%select_7, %view_12), kwargs = {})
#   %add_9 : [num_users=1] = call_function[target=torch.ops.aten.add.Tensor](args = (%mul_14, %mul_15), kwargs = {})
triton_red_fused__to_copy_add_embedding_mean_mul_pow_rsqrt_1 = async_compile.triton('triton_red_fused__to_copy_add_embedding_mean_mul_pow_rsqrt_1', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()

@triton_heuristics.reduction(
    size_hints={'x': 524288, 'r0_': 128},
    reduction_hint=ReductionHint.DEFAULT,
    filename=__file__,
    triton_meta={'signature': {'in_out_ptr0': '*fp32', 'in_ptr0': '*bf16', 'in_ptr1': '*i32', 'in_ptr2': '*bf16', 'in_ptr3': '*fp32', 'out_ptr0': '*bf16', 'out_ptr1': '*bf16', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6, 7, 8), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
    inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused__to_copy_add_embedding_mean_mul_pow_rsqrt_1', 'mutated_arg_names': ['in_out_ptr0'], 'optimize_mem': False, 'no_x_dim': False, 'num_load': 5, 'num_reduction': 1, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
)
@triton.jit
def triton_red_fused__to_copy_add_embedding_mean_mul_pow_rsqrt_1(in_out_ptr0, in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, out_ptr1, xnumel, r0_numel, XBLOCK : tl.constexpr, R0_BLOCK : tl.constexpr):
    xnumel = 524288
    r0_numel = 128
    rnumel = r0_numel
    RBLOCK: tl.constexpr = R0_BLOCK
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = tl.full([XBLOCK, R0_BLOCK], True, tl.int1)
    r0_base = tl.arange(0, R0_BLOCK)[None, :]
    rbase = r0_base
    x0 = (xindex % 8)
    x1 = xindex // 8
    _tmp4 = tl.full([XBLOCK, R0_BLOCK], 0, tl.float32)
    x3 = xindex
    for r0_offset in range(0, r0_numel, R0_BLOCK):
        r0_index = r0_offset + r0_base
        r0_mask = r0_index < r0_numel
        roffset = r0_offset
        rindex = r0_index
        r0_2 = r0_index
        tmp0 = tl.load(in_ptr0 + (2048 + r0_2 + 128*x0 + 3072*x1), r0_mask, eviction_policy='evict_last', other=0.0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp2 = tmp1 * tmp1
        tmp3 = tl.broadcast_to(tmp2, [XBLOCK, R0_BLOCK])
        tmp5 = _tmp4 + tmp3
        _tmp4 = tl.where(r0_mask, tmp5, _tmp4)
    tmp4 = tl.sum(_tmp4, 1)[:, None]
    tmp6 = 128.0
    tmp7 = tmp4 / tmp6
    tmp8 = 1.1920928955078125e-07
    tmp9 = tmp7 + tmp8
    tmp10 = libdevice.rsqrt(tmp9)
    tl.debug_barrier()
    tl.store(in_out_ptr0 + (x3), tmp10, None)
    tmp11 = tl.load(in_ptr1 + (x1), None, eviction_policy='evict_last')
    tmp18 = tl.load(in_ptr3 + (48))
    tmp19 = tl.broadcast_to(tmp18, [XBLOCK, R0_BLOCK])
    tmp26 = tl.load(in_ptr3 + (49))
    tmp27 = tl.broadcast_to(tmp26, [XBLOCK, R0_BLOCK])
    for r0_offset in range(0, r0_numel, R0_BLOCK):
        r0_index = r0_offset + r0_base
        r0_mask = r0_index < r0_numel
        roffset = r0_offset
        rindex = r0_index
        r0_2 = r0_index
        tmp21 = tl.load(in_ptr0 + (2048 + r0_2 + 128*x0 + 3072*x1), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
        tmp12 = tl.full([XBLOCK, R0_BLOCK], 50257, tl.int32)
        tmp13 = tmp11 + tmp12
        tmp14 = tmp11 < 0
        tmp15 = tl.where(tmp14, tmp13, tmp11)
        tl.device_assert((0 <= tmp15) & (tmp15 < 50257), "index out of bounds: 0 <= tmp15 < 50257")
        tmp17 = tl.load(in_ptr2 + (r0_2 + 128*x0 + 1024*tmp15), r0_mask, eviction_policy='evict_first', other=0.0).to(tl.float32)
        tmp20 = tmp19.to(tl.float32)
        tmp22 = tmp21.to(tl.float32)
        tmp23 = tmp22 * tmp10
        tmp24 = tmp23.to(tl.float32)
        tmp25 = tmp20 * tmp24
        tmp28 = tmp27.to(tl.float32)
        tmp29 = tmp28 * tmp17
        tmp30 = tmp25 + tmp29
        tl.store(out_ptr0 + (r0_2 + 128*x3), tmp17, r0_mask)
        tl.store(out_ptr1 + (r0_2 + 128*x3), tmp30, r0_mask)
''', device_str='cuda')
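
# --- Illustrative reference (editor-added sketch; not part of the generated Inductor output) ---
# The reduction kernel above RMS-normalizes the value slice of a fused QKV projection
# per 128-wide head (the `2048 + ...` offset selects the last third of a 3072-wide
# buffer) and then blends it with the raw token embedding using two learned gains
# (indices 48 and 49 of in_ptr3). A hedged eager-mode approximation; the argument
# names are hypothetical stand-ins for the wrapper's primals.
def _reference_value_rmsnorm_sketch(qkv, idx, emb_weight, gains, head_dim=128,
                                    eps=1.1920928955078125e-07):
    v = qkv[..., 2048:].reshape(*qkv.shape[:-1], -1, head_dim).float()  # value slice, split per head
    rms = torch.rsqrt(v.pow(2).mean(-1, keepdim=True) + eps)            # mean -> add -> rsqrt
    vn = (v * rms).to(torch.bfloat16)                                   # normalized values (v_1)
    emb = emb_weight[idx].reshape(vn.shape).to(torch.bfloat16)          # token embedding, per head
    return gains[48] * vn + gains[49] * emb                             # mul_10 + mul_11 -> v_2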




# kernel path: /tmp/torchinductor_root/6m/c6mw7nbczjgpvhrjjk6pggm7fvu5xptqqgclx5n5k4tqobkftzww.py
# Topologically Sorted Source Nodes: [eq, cumsum], Original ATen: [aten.eq, aten.cumsum]
# Source node to ATen node mapping:
#   cumsum => cumsum
#   eq => eq
# Graph fragment:
#   %eq : [num_users=1] = call_function[target=torch.ops.aten.eq.Scalar](args = (%primals_1, 50256), kwargs = {})
#   %cumsum : [num_users=17] = call_function[target=torch.ops.aten.cumsum.default](args = (%eq, 0), kwargs = {})
triton_spl_fused_cumsum_eq_2 = async_compile.triton('triton_spl_fused_cumsum_eq_2', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()

@triton.jit
def _triton_helper_fn_add0(arg0_0, arg1_0):
    tmp0 = arg0_0 + arg1_0
    return tmp0

@triton_heuristics.split_scan(
    size_hints={'x': 1, 'r0_': 65536},
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    triton_meta={'signature': {'in_ptr0': '*i32', 'out_ptr0': '*i64', 'ws_ptr': '*u8', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {'xnumel': 1}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 4), 'tt.equal_to': (3,)}, 'cls': 'AttrsDescriptor'})]},
    inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_spl_fused_cumsum_eq_2', 'mutated_arg_names': ['ws_ptr'], 'optimize_mem': False, 'no_x_dim': True, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'CE326918D8FE98067798167ABD26A2E4EDFD110D9ECD4380441C64512F6D164E', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False, 'coordinate_descent_tuning': True, 'coordinate_descent_search_radius': 1, 'coordinate_descent_check_all_directions': False}
)
@triton.jit
def triton_spl_fused_cumsum_eq_2(in_ptr0, out_ptr0, ws_ptr, xnumel, r0_numel, R0_BLOCK : tl.constexpr):
    xnumel = 1
    XBLOCK: tl.constexpr = 1
    r0_numel = 65536
    rnumel = r0_numel
    RBLOCK: tl.constexpr = R0_BLOCK
    xoffset = tl.program_id(1) * XBLOCK
    xindex = tl.full([1], xoffset, tl.int32)
    xmask = tl.full([R0_BLOCK], True, tl.int1)
    r0_offset = tl.program_id(0) * R0_BLOCK
    r0_index = r0_offset + tl.arange(0, R0_BLOCK)[:]
    r0_mask = tl.full([R0_BLOCK], True, tl.int1)
    roffset = r0_offset
    rindex = r0_index
    r0_0 = r0_index
    tmp0 = tl.load(in_ptr0 + (r0_0), None, eviction_policy='evict_last')
    tmp4 = tl.num_programs(0)
    tmp5 = ws_ptr.to(tl.pointer_type(tl.uint64)) + xoffset * 3 * tmp4
    tmp1 = tl.full([1], 50256, tl.int32)
    tmp2 = tmp0 == tmp1
    tmp3 = tmp2.to(tl.int64)
    tmp6 = tmp3.to(tl.int64)
    tmp7 = tl.broadcast_to(tmp6, [R0_BLOCK])
    tmp8 = tl.reduce(tmp7, 0, _triton_helper_fn_add0)
    tmp9 = triton_helpers.exclusive_scan_decoupled_lookback_64(
        tmp5,
        tmp8,
        tl.program_id(0),
        _triton_helper_fn_add0,
    )
    tmp10 = tl.associative_scan(tmp7, 0, _triton_helper_fn_add0)
    tmp11 = _triton_helper_fn_add0(tmp9, tmp10)
    tmp12 = tl.where(roffset == 0, tmp10, tmp11)
    tl.store(out_ptr0 + (tl.broadcast_to(r0_0, [R0_BLOCK])), tmp12, None)
''', device_str='cuda')
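
# --- Illustrative reference (editor-added sketch; not part of the generated Inductor output) ---
# The split-scan kernel above computes a running document id over the 65536-token
# sequence: positions equal to the end-of-document token (50256) are flagged and
# cumulatively summed (aten.eq -> aten.cumsum). A hedged eager-mode approximation:
def _reference_doc_ids_sketch(token_ids, eod_token=50256):
    return (token_ids == eod_token).cumsum(0)   # document id per position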




# kernel path: /tmp/torchinductor_root/bk/cbk7z3iym6kwqv7oruouptcabnpwb6ym23vicolbtktxjwhgmno4.py
# Topologically Sorted Source Nodes: [block_idx, causal_blockmask_any, causal_blockmask_all, docs_low, docs_high, le, ge_1, document_blockmask_any, eq_1, eq_2, document_blockmask_all, blockmask_any, blockmask_all, invert, and__4, num_blocks, argsort, num_blocks_1, argsort_1, sub, clamp_min, child, sub_1, child_4, floordiv, sub_2, clamp_min_1, child_8, sub_3, child_11], Original ATen: [aten.arange, aten.ge, aten.gt, aten.clone, aten.le, aten.bitwise_and, aten.eq, aten.bitwise_not, aten.sum, aten.sort, aten.sub, aten.clamp_min, aten.clamp_max, aten.floor_divide]
# Source node to ATen node mapping:
#   and__4 => bitwise_and_4
#   argsort => sort
#   argsort_1 => sort_1
#   block_idx => iota
#   blockmask_all => bitwise_and_3
#   blockmask_any => bitwise_and_2
#   causal_blockmask_all => gt
#   causal_blockmask_any => ge
#   child => clamp_max
#   child_11 => clamp_max_3
#   child_4 => clamp_max_1
#   child_8 => clamp_max_2
#   clamp_min => clamp_min
#   clamp_min_1 => clamp_min_1
#   docs_high => clone_1
#   docs_low => clone
#   document_blockmask_all => bitwise_and_1
#   document_blockmask_any => bitwise_and
#   eq_1 => eq_1
#   eq_2 => eq_2
#   floordiv => div
#   ge_1 => ge_1
#   invert => bitwise_not
#   le => le
#   num_blocks => sum_1
#   num_blocks_1 => sum_2
#   sub => sub
#   sub_1 => sub_1
#   sub_2 => sub_2
#   sub_3 => sub_3
# Graph fragment:
#   %iota : [num_users=3] = call_function[target=torch.ops.prims.iota.default](args = (512,), kwargs = {start: 0, step: 1, dtype: torch.int32, device: cuda, requires_grad: False})
#   %ge : [num_users=1] = call_function[target=torch.ops.aten.ge.Tensor](args = (%unsqueeze, %iota), kwargs = {})
#   %gt : [num_users=1] = call_function[target=torch.ops.aten.gt.Tensor](args = (%unsqueeze, %iota), kwargs = {})
#   %clone : [num_users=3] = call_function[target=torch.ops.aten.clone.default](args = (%select,), kwargs = {memory_format: torch.contiguous_format})
#   %clone_1 : [num_users=3] = call_function[target=torch.ops.aten.clone.default](args = (%select_1,), kwargs = {memory_format: torch.contiguous_format})
#   %le : [num_users=1] = call_function[target=torch.ops.aten.le.Tensor](args = (%unsqueeze_2, %clone_1), kwargs = {})
#   %ge_1 : [num_users=1] = call_function[target=torch.ops.aten.ge.Tensor](args = (%unsqueeze_3, %clone), kwargs = {})
#   %bitwise_and : [num_users=1] = call_function[target=torch.ops.aten.bitwise_and.Tensor](args = (%le, %ge_1), kwargs = {})
#   %eq_1 : [num_users=1] = call_function[target=torch.ops.aten.eq.Tensor](args = (%unsqueeze_2, %clone_1), kwargs = {})
#   %eq_2 : [num_users=1] = call_function[target=torch.ops.aten.eq.Tensor](args = (%unsqueeze_3, %clone), kwargs = {})
#   %bitwise_and_1 : [num_users=1] = call_function[target=torch.ops.aten.bitwise_and.Tensor](args = (%eq_1, %eq_2), kwargs = {})
#   %bitwise_and_2 : [num_users=1] = call_function[target=torch.ops.aten.bitwise_and.Tensor](args = (%ge, %bitwise_and), kwargs = {})
#   %bitwise_and_3 : [num_users=3] = call_function[target=torch.ops.aten.bitwise_and.Tensor](args = (%gt, %bitwise_and_1), kwargs = {})
#   %bitwise_not : [num_users=1] = call_function[target=torch.ops.aten.bitwise_not.default](args = (%bitwise_and_3,), kwargs = {})
#   %bitwise_and_4 : [num_users=2] = call_function[target=torch.ops.aten.bitwise_and.Tensor](args = (%bitwise_and_2, %bitwise_not), kwargs = {})
#   %sum_1 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%bitwise_and_4, [-1]), kwargs = {dtype: torch.int32})
#   %sort : [num_users=1] = call_function[target=torch.ops.aten.sort.stable](args = (%bitwise_and_4,), kwargs = {stable: True})
#   %sum_2 : [num_users=1] = call_function[target=torch.ops.aten.sum.dim_IntList](args = (%bitwise_and_3, [-1]), kwargs = {dtype: torch.int32})
#   %sort_1 : [num_users=1] = call_function[target=torch.ops.aten.sort.stable](args = (%bitwise_and_3,), kwargs = {stable: True})
#   %sub : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%primals_5, %unsqueeze_11), kwargs = {})
#   %clamp_min : [num_users=1] = call_function[target=torch.ops.aten.clamp_min.default](args = (%sub, 1), kwargs = {})
#   %clamp_max : [num_users=6] = call_function[target=torch.ops.aten.clamp_max.Tensor](args = (%unsqueeze_7, %clamp_min), kwargs = {})
#   %sub_1 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%primals_5, 1), kwargs = {})
#   %clamp_max_1 : [num_users=6] = call_function[target=torch.ops.aten.clamp_max.Tensor](args = (%unsqueeze_11, %sub_1), kwargs = {})
#   %div : [num_users=2] = call_function[target=torch.ops.aten.div.Tensor_mode](args = (%primals_5, 2), kwargs = {rounding_mode: floor})
#   %sub_2 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%div, %unsqueeze_11), kwargs = {})
#   %clamp_min_1 : [num_users=1] = call_function[target=torch.ops.aten.clamp_min.default](args = (%sub_2, 1), kwargs = {})
#   %clamp_max_2 : [num_users=13] = call_function[target=torch.ops.aten.clamp_max.Tensor](args = (%unsqueeze_7, %clamp_min_1), kwargs = {})
#   %sub_3 : [num_users=1] = call_function[target=torch.ops.aten.sub.Tensor](args = (%div, 1), kwargs = {})
#   %clamp_max_3 : [num_users=13] = call_function[target=torch.ops.aten.clamp_max.Tensor](args = (%unsqueeze_11, %sub_3), kwargs = {})
triton_per_fused_arange_bitwise_and_bitwise_not_clamp_max_clamp_min_clone_eq_floor_divide_ge_gt_le_sort_sub_sum_3 = async_compile.triton('triton_per_fused_arange_bitwise_and_bitwise_not_clamp_max_clamp_min_clone_eq_floor_divide_ge_gt_le_sort_sub_sum_3', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, DeviceProperties
triton_helpers.set_driver_to_gpu()

@triton_heuristics.persistent_reduction(
    size_hints={'x': 512, 'r0_': 512},
    reduction_hint=ReductionHint.DEFAULT,
    filename=__file__,
    triton_meta={'signature': {'in_ptr0': '*i64', 'in_ptr1': '*i32', 'out_ptr3': '*i16', 'out_ptr5': '*i16', 'out_ptr6': '*i32', 'out_ptr7': '*i32', 'out_ptr8': '*i32', 'out_ptr9': '*i32', 'xnumel': 'i32', 'r0_numel': 'i32'}, 'device': DeviceProperties(type='cuda', index=0, multi_processor_count=132, cc=90, major=9, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, warp_size=32), 'constants': {}, 'configs': [AttrsDescriptor.from_dict({'arg_properties': {'tt.divisibility': (0, 1, 2, 3, 4, 5, 6, 7, 8, 9), 'tt.equal_to': ()}, 'cls': 'AttrsDescriptor'})]},
    inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_per_fused_arange_bitwise_and_bitwise_not_clamp_max_clamp_min_clone_eq_floor_divide_ge_gt_le_s
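
# --- Illustrative reference (editor-added sketch; not part of the generated Inductor output) ---
# The kernel above (its metadata line is truncated in this capture) builds the
# flex-attention block-mask inputs described in the graph fragment: a causal block
# mask combined with a per-document block mask derived from the cumulative document
# ids, plus per-row block counts. A hedged eager-mode approximation of the mask logic
# only; the stable argsort/compaction and the clamp-based sliding-window bookkeeping
# are omitted, and the argument names are hypothetical.
def _reference_blockmask_sketch(docs_low, docs_high):
    # docs_low / docs_high: document id of the first / last token in each of the
    # 512 sequence blocks.
    block_idx = torch.arange(docs_low.numel(), dtype=torch.int32, device=docs_low.device)
    causal_any = block_idx[:, None] >= block_idx                 # aten.ge
    causal_all = block_idx[:, None] > block_idx                  # aten.gt
    doc_any = (docs_low[:, None] <= docs_high) & (docs_high[:, None] >= docs_low)
    doc_all = (docs_low[:, None] == docs_high) & (docs_high[:, None] == docs_low)
    blockmask_any = causal_any & doc_any
    blockmask_all = causal_all & doc_all
    partial = blockmask_any & ~blockmask_all                     # invert + and__4
    num_partial = partial.sum(-1, dtype=torch.int32)             # num_blocks
    num_full = blockmask_all.sum(-1, dtype=torch.int32)          # num_blocks_1
    return partial, blockmask_all, num_partial, num_full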