output_code_before_after

Diff stats: 73 removals, 66 additions.
Before: 469 lines, 2,382 words (66 lines removed, -14.1%; 273 words removed, -11.5%).
After: 463 lines, 2,327 words (60 lines added, +13.0%; 218 words added, +9.4%).


# AOT ID: ['0_inference']
from ctypes import c_void_p, c_long
import torch
import math
import random
import os
import tempfile
from math import inf, nan
from torch._inductor.hooks import run_intermediate_hooks
from torch._inductor.utils import maybe_profile
from torch._inductor.codegen.memory_planning import _align as align

from torch import device, empty_strided
from torch._inductor.async_compile import AsyncCompile
from torch._inductor.select_algorithm import extern_kernels
from torch._inductor.codegen.multi_kernel import MultiKernelCall

aten = torch.ops.aten
inductor_ops = torch.ops.inductor
_quantized = torch.ops._quantized
assert_size_stride = torch._C._dynamo.guards.assert_size_stride
empty_strided_cpu = torch._C._dynamo.guards._empty_strided_cpu
empty_strided_cuda = torch._C._dynamo.guards._empty_strided_cuda
reinterpret_tensor = torch._C._dynamo.guards._reinterpret_tensor
alloc_from_pool = torch.ops.inductor._alloc_from_pool
async_compile = AsyncCompile()


# kernel path: /tmp/torchinductor_eellison/wj/cwjmdogaso56hzkb4pdlo4k625tmwevhq5krfbpmpx5szmhdjbsf.py
# Source Nodes: [embeddings, embeddings_1, embeddings_2, inputs_embeds, position_embeddings, token_type_embeddings], Original ATen: [aten.add, aten.embedding, aten.native_layer_norm]
# embeddings => add
# embeddings_1 => add_1
- # embeddings_2 => add_2, add_3, convert_element_type_1, convert_element_type_2, mul_1, mul_2, rsqrt, sub_1, var_mean
+ # embeddings_2 => add_2, add_3, convert_element_type, convert_element_type_1, mul, mul_1, rsqrt, sub, var_mean
# inputs_embeds => embedding
# position_embeddings => embedding_2
# token_type_embeddings => embedding_1
triton_per_fused_add_embedding_native_layer_norm_0 = async_compile.triton('triton_', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties

@triton_heuristics.persistent_reduction(
    size_hints=[8192, 1024],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    triton_meta={'signature': {0: '*i64', 1: '*bf16', 2: '*i64', 3: '*bf16', 4: '*i64', 5: '*bf16', 6: '*bf16', 7: '*bf16', 8: '*bf16', 9: '*bf16', 10: 'i32', 11: 'i32'}, 'device': DeviceProperties(type='cuda', index=0, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=108), 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11), equal_to_1=())]},
    inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_per_fused_add_embedding_native_layer_norm_0', 'mutated_arg_names': [], 'no_x_dim': True, 'num_load': 5, 'num_reduction': 4, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
)
@triton.jit
def triton_(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, in_ptr5, in_ptr6, in_ptr7, out_ptr0, out_ptr3, xnumel, rnumel):
    xnumel = 8192
    XBLOCK: tl.constexpr = 1
    rnumel = 768
    RBLOCK: tl.constexpr = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = tl.full([1], xoffset, tl.int32)
    xmask = tl.full([RBLOCK], True, tl.int1)
    rindex = tl.arange(0, RBLOCK)[:]
    roffset = 0
    rmask = rindex < rnumel
    x3 = xindex
    r2 = rindex
    x0 = xindex % 512
    tmp0 = tl.load(in_ptr0 + (x3), None, eviction_policy='evict_last')
    tmp7 = tl.load(in_ptr2 + (x0), None, eviction_policy='evict_last')
    tmp15 = tl.load(in_ptr4 + (x0), None, eviction_policy='evict_last')
    tmp47 = tl.load(in_ptr6 + (r2), rmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
    tmp50 = tl.load(in_ptr7 + (r2), rmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
    tmp1 = tl.full([RBLOCK], 30522, tl.int32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp0 < 0
    tmp4 = tl.where(tmp3, tmp2, tmp0)
    tl.device_assert((0 <= tmp4) & (tmp4 < 30522), "index out of bounds: 0 <= tmp4 < 30522")
    tmp6 = tl.load(in_ptr1 + (r2 + (768*tmp4)), rmask, other=0.0).to(tl.float32)
    tmp8 = tl.full([RBLOCK], 2, tl.int32)
    tmp9 = tmp7 + tmp8
    tmp10 = tmp7 < 0
    tmp11 = tl.where(tmp10, tmp9, tmp7)
    tl.device_assert((0 <= tmp11) & (tmp11 < 2), "index out of bounds: 0 <= tmp11 < 2")
    tmp13 = tl.load(in_ptr3 + (r2 + (768*tmp11)), rmask, other=0.0).to(tl.float32)
    tmp14 = tmp6 + tmp13
    tmp16 = tl.full([RBLOCK], 512, tl.int32)
    tmp17 = tmp15 + tmp16
    tmp18 = tmp15 < 0
    tmp19 = tl.where(tmp18, tmp17, tmp15)
    tl.device_assert((0 <= tmp19) & (tmp19 < 512), "index out of bounds: 0 <= tmp19 < 512")
    tmp21 = tl.load(in_ptr5 + (r2 + (768*tmp19)), rmask, other=0.0).to(tl.float32)
    tmp22 = tmp14 + tmp21
    tmp23 = tmp22.to(tl.float32)
    tmp24 = tl.broadcast_to(tmp23, [RBLOCK])
    tmp26 = tl.where(rmask, tmp24, 0)
    tmp27 = tl.broadcast_to(tmp24, [RBLOCK])
    tmp29 = tl.where(rmask, tmp27, 0)
    tmp30 = triton_helpers.promote_to_tensor(tl.sum(tmp29, 0))
    tmp31 = tl.full([1], 768, tl.int32)
    tmp32 = tmp31.to(tl.float32)
    tmp33 = tmp30 / tmp32
    tmp34 = tmp24 - tmp33
    tmp35 = tmp34 * tmp34
    tmp36 = tl.broadcast_to(tmp35, [RBLOCK])
    tmp38 = tl.where(rmask, tmp36, 0)
    tmp39 = triton_helpers.promote_to_tensor(tl.sum(tmp38, 0))
    tmp40 = tmp23 - tmp33
    tmp41 = 768.0
    tmp42 = tmp39 / tmp41
    tmp43 = 1e-12
    tmp44 = tmp42 + tmp43
    tmp45 = libdevice.rsqrt(tmp44)
    tmp46 = tmp40 * tmp45
    tmp48 = tmp47.to(tl.float32)
    tmp49 = tmp46 * tmp48
    tmp51 = tmp50.to(tl.float32)
    tmp52 = tmp49 + tmp51
    tmp53 = tmp52.to(tl.float32)
    tl.store(out_ptr0 + (r2 + (768*x3)), tmp22, rmask)
    tl.store(out_ptr3 + (r2 + (768*x3)), tmp53, rmask)
''', device_str='cuda')
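
# A minimal eager-mode sketch of what the fused kernel above computes, assuming
# BERT-style embeddings. This sketch is not part of the generated output and the
# tensor names are illustrative assumptions:
import torch.nn.functional as F

def embedding_layer_norm_sketch(input_ids, token_type_ids, position_ids,
                                word_emb, token_type_emb, position_emb, gamma, beta):
    # inputs_embeds / token_type_embeddings / position_embeddings => aten.embedding
    # (indices are range-checked, mirroring the tl.device_assert calls)
    x = (F.embedding(input_ids, word_emb)
         + F.embedding(token_type_ids, token_type_emb)   # embeddings => add
         + F.embedding(position_ids, position_emb))      # embeddings_1 => add_1
    # embeddings_2 => aten.native_layer_norm: stats in fp32, eps=1e-12, result cast back
    y = F.layer_norm(x.float(), (768,), gamma.float(), beta.float(), eps=1e-12)
    return y.to(x.dtype)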


import triton
import triton.language as tl
from torch._inductor.runtime.triton_heuristics import grid, split_scan_grid, grid_combo_kernels, start_graph, end_graph
from torch._C import _cuda_getCurrentRawStream as get_raw_stream


- # kernel path: /tmp/torchinductor_eellison/ze/czezsvjgwtdzru43t5gig4zb6t4auharyypwotkirkhmzu46fkzp.py
- # Source Nodes: [add_2, hidden_states_2], Original ATen: [aten.add, aten.native_layer_norm]
- # add_2 => add_5
- # hidden_states_2 => add_6, add_7, convert_element_type_21, convert_element_type_22, mul_3, mul_4, rsqrt_1, sub_3, var_mean_1
- triton_per_fused_add_native_layer_norm_1 = async_compile.triton('triton_', '''
+ # kernel path: /tmp/torchinductor_eellison/6q/c6qq6qfsawjfikfoeruueuup5cnzvmpzkusjkel6l6wcw43mgauj.py
+ # Source Nodes: [attn_output], Original ATen: [aten._scaled_dot_product_efficient_attention]
+ # attn_output => _scaled_dot_product_efficient_attention
+ triton_poi_fused__scaled_dot_product_efficient_attention_1 = async_compile.triton('triton_', '''
+ import triton
+ import triton.language as tl
+ from triton.compiler.compiler import AttrsDescriptor
+
+ from torch._inductor.runtime import triton_helpers, triton_heuristics
+ from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
+ from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties
+
+ @triton_heuristics.pointwise(
+     size_hints=[67108864],
+     filename=__file__,
+     triton_meta={'signature': {0: '*bf16', 1: 'i32'}, 'device': DeviceProperties(type='cuda', index=0, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=108), 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1), equal_to_1=())]},
+     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused__scaled_dot_product_efficient_attention_1', 'mutated_arg_names': [], 'no_x_dim': False, 'num_load': 0, 'num_reduction': 0, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+     min_elem_per_thread=0
+ )
+ @triton.jit
+ def triton_(out_ptr0, xnumel, XBLOCK : tl.constexpr):
+     xnumel = 50331648
+     xoffset = tl.program_id(0) * XBLOCK
+     xindex = xoffset + tl.arange(0, XBLOCK)[:]
+     xmask = tl.full([XBLOCK], True, tl.int1)
+     x0 = xindex
+     tmp0 = tl.full([1], False, tl.int1)
+     tmp1 = -3.3895313892515355e+38
+     tmp2 = 0.0
+     tmp3 = tl.where(tmp0, tmp1, tmp2)
+     tl.store(out_ptr0 + (x0), tmp3, None)
+ ''', device_str='cuda')
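
# The kernel above is new in the "after" version: it materializes the attention-bias
# buffer consumed by aten._scaled_dot_product_efficient_attention. The predicate is
# the constant False, so tl.where(False, -3.39e38, 0.0) always writes 0.0, i.e. an
# all-zero bias (no masked positions); -3.3895313892515355e+38 would be the fill
# value for masked-out positions. A rough eager-mode equivalent, with the shape
# being one plausible factorization of xnumel = 50331648 (e.g. batch 16, 12 heads,
# 512 x 512; an assumption, not stated in the source):
attn_bias = torch.zeros((16, 12, 512, 512), dtype=torch.bfloat16, device='cuda')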


+ # kernel path: /tmp/torchinductor_eellison/at/catipo3niy6sjxmx7lban4oumf6lplhx7qd3qxdhlcdrzt4xmv65.py
+ # Source Nodes: [add_1, hidden_states_2], Original ATen: [aten.add, aten.native_layer_norm]
+ # add_1 => add_4
+ # hidden_states_2 => add_5, add_6, convert_element_type_16, convert_element_type_17, mul_2, mul_3, rsqrt_1, sub_2, var_mean_1
+ triton_per_fused_add_native_layer_norm_2 = async_compile.triton('triton_', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties

@triton_heuristics.persistent_reduction(
    size_hints=[8192, 1024],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    triton_meta={'signature': {0: '*bf16', 1: '*bf16', 2: '*bf16', 3: '*bf16', 4: '*bf16', 5: '*bf16', 6: 'i32', 7: 'i32'}, 'device': DeviceProperties(type='cuda', index=0, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=108), 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6, 7), equal_to_1=())]},
-     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_per_fused_add_native_layer_norm_1', 'mutated_arg_names': [], 'no_x_dim': True, 'num_load': 5, 'num_reduction': 4, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_per_fused_add_native_layer_norm_2', 'mutated_arg_names': [], 'no_x_dim': True, 'num_load': 5, 'num_reduction': 4, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
)
@triton.jit
def triton_(in_ptr0, in_ptr1, in_ptr2, in_ptr3, in_ptr4, out_ptr2, xnumel, rnumel):
    xnumel = 8192
    XBLOCK: tl.constexpr = 1
    rnumel = 768
    RBLOCK: tl.constexpr = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = tl.full([1], xoffset, tl.int32)
    xmask = tl.full([RBLOCK], True, tl.int1)
    rindex = tl.arange(0, RBLOCK)[:]
    roffset = 0
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (r1 + (768*x0)), rmask, other=0.0).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (r1), rmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
    tmp3 = tl.load(in_ptr2 + (r1 + (768*x0)), rmask, other=0.0).to(tl.float32)
    tmp29 = tl.load(in_ptr3 + (r1), rmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
    tmp32 = tl.load(in_ptr4 + (r1), rmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp4 = tmp2 + tmp3
    tmp5 = tmp4.to(tl.float32)
    tmp6 = tl.broadcast_to(tmp5, [RBLOCK])
    tmp8 = tl.where(rmask, tmp6, 0)
    tmp9 = tl.broadcast_to(tmp6, [RBLOCK])
    tmp11 = tl.where(rmask, tmp9, 0)
    tmp12 = triton_helpers.promote_to_tensor(tl.sum(tmp11, 0))
    tmp13 = tl.full([1], 768, tl.int32)
    tmp14 = tmp13.to(tl.float32)
    tmp15 = tmp12 / tmp14
    tmp16 = tmp6 - tmp15
    tmp17 = tmp16 * tmp16
    tmp18 = tl.broadcast_to(tmp17, [RBLOCK])
    tmp20 = tl.where(rmask, tmp18, 0)
    tmp21 = triton_helpers.promote_to_tensor(tl.sum(tmp20, 0))
    tmp22 = tmp5 - tmp15
    tmp23 = 768.0
    tmp24 = tmp21 / tmp23
    tmp25 = 1e-12
    tmp26 = tmp24 + tmp25
    tmp27 = libdevice.rsqrt(tmp26)
    tmp28 = tmp22 * tmp27
    tmp30 = tmp29.to(tl.float32)
    tmp31 = tmp28 * tmp30
    tmp33 = tmp32.to(tl.float32)
    tmp34 = tmp31 + tmp33
    tmp35 = tmp34.to(tl.float32)
    tl.store(out_ptr2 + (r1 + (768*x0)), tmp35, rmask)
''', device_str='cuda')
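
# Eager-mode sketch of the fused pattern above: a residual add followed by
# LayerNorm, as in a BERT layer's output block. Illustrative only; the argument
# names are assumptions, not taken from the source:
def add_layer_norm_sketch(dense_out, dense_bias, residual, gamma, beta):
    x = dense_out + dense_bias + residual            # add_1/add_2 => aten.add
    x32 = x.float()                                  # bf16 -> fp32 for the reduction
    mu = x32.mean(-1, keepdim=True)
    var = x32.var(-1, unbiased=False, keepdim=True)  # var_mean_1
    y = (x32 - mu) * torch.rsqrt(var + 1e-12)        # rsqrt_1; eps matches the kernel
    return (y * gamma.float() + beta.float()).to(x.dtype)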




- # kernel path: /tmp/torchinductor_eellison/h7/ch7fp7gaeqqseqnc45vn7aus3pytqrkcm2gt6tswtsp4robtviff.py
+ # kernel path: /tmp/torchinductor_eellison/lp/clpuyqd2qh624e2ye7osnyieorymdus3kflfhssozom7b33frjei.py
# Source Nodes: [hidden_states_4], Original ATen: [aten.gelu]
- # hidden_states_4 => add_8, convert_element_type_26, convert_element_type_27, erf, mul_5, mul_6, mul_7
+ # hidden_states_4 => add_7, convert_element_type_21, convert_element_type_22, erf, mul_4, mul_5, mul_6
- triton_poi_fused_gelu_2 = async_compile.triton('triton_', '''
+ triton_poi_fused_gelu_3 = async_compile.triton('triton_', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties

@triton_heuristics.pointwise(
    size_hints=[33554432],
    filename=__file__,
    triton_meta={'signature': {0: '*bf16', 1: '*bf16', 2: 'i32'}, 'device': DeviceProperties(type='cuda', index=0, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=108), 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]},
-     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_gelu_2', 'mutated_arg_names': ['in_out_ptr0'], 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_gelu_3', 'mutated_arg_names': ['in_out_ptr0'], 'no_x_dim': False, 'num_load': 2, 'num_reduction': 0, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
    min_elem_per_thread=0
)
@triton.jit
def triton_(in_out_ptr0, in_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 25165824
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = tl.full([XBLOCK], True, tl.int1)
    x2 = xindex
    x0 = xindex % 3072
    tmp0 = tl.load(in_out_ptr0 + (x2), None).to(tl.float32)
    tmp1 = tl.load(in_ptr0 + (x0), None, eviction_policy='evict_last').to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = 0.5
    tmp5 = tmp3 * tmp4
    tmp6 = 0.7071067811865476
    tmp7 = tmp3 * tmp6
    tmp8 = libdevice.erf(tmp7)
    tmp9 = 1.0
    tmp10 = tmp8 + tmp9
    tmp11 = tmp5 * tmp10
    tmp12 = tmp11.to(tl.float32)
    tl.store(in_out_ptr0 + (x2), tmp12, None)
''', device_str='cuda')
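
# The constants above implement exact (erf-based) GELU fused with the preceding
# bias add: tmp5 is 0.5*x, tmp7 is x/sqrt(2) (0.7071067811865476 = 1/sqrt(2)), so
# tmp11 = 0.5 * x * (1 + erf(x / sqrt(2))). A one-line eager check (illustrative):
x = torch.randn(8)
assert torch.allclose(torch.nn.functional.gelu(x), 0.5 * x * (1 + torch.erf(x / 2**0.5)))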




- # kernel path: /tmp/torchinductor_eellison/kv/ckv6n5lz6ocro4pxsewmvlwwdchxn7ycxtd2mm4zi5mmbccvo7j4.py
+ # kernel path: /tmp/torchinductor_eellison/ai/caiiwspixmf6ozjdgwruqmkld4lyd2nnoyrhxzmgg3mafv6a6yka.py
# Source Nodes: [hidden_states_97, hidden_states_98], Original ATen: [aten.gelu, aten.native_layer_norm]
- # hidden_states_97 => add_100, convert_element_type_366, convert_element_type_367, erf_12, mul_87, mul_88, mul_89
+ # hidden_states_97 => add_88, convert_element_type_295, convert_element_type_296, erf_12, mul_86, mul_87, mul_88
- # hidden_states_98 => add_101, add_102, convert_element_type_368, convert_element_type_369, mul_90, mul_91, rsqrt_25, sub_38, var_mean_25
+ # hidden_states_98 => add_89, add_90, convert_element_type_297, convert_element_type_298, mul_89, mul_90, rsqrt_25, sub_26, var_mean_25
- triton_per_fused_gelu_native_layer_norm_3 = async_compile.triton('triton_', '''
+ triton_per_fused_gelu_native_layer_norm_4 = async_compile.triton('triton_', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties

@triton_heuristics.persistent_reduction(
    size_hints=[8192, 1024],
    reduction_hint=ReductionHint.INNER,
    filename=__file__,
    triton_meta={'signature': {0: '*bf16', 1: '*bf16', 2: '*bf16', 3: '*bf16', 4: '*bf16', 5: 'i32', 6: 'i32'}, 'device': DeviceProperties(type='cuda', index=0, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=108), 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2, 3, 4, 5, 6), equal_to_1=())]},
-     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_per_fused_gelu_native_layer_norm_3', 'mutated_arg_names': [], 'no_x_dim': True, 'num_load': 4, 'num_reduction': 4, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_per_fused_gelu_native_layer_norm_4', 'mutated_arg_names': [], 'no_x_dim': True, 'num_load': 4, 'num_reduction': 4, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
)
@triton.jit
def triton_(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr3, xnumel, rnumel):
    xnumel = 8192
    XBLOCK: tl.constexpr = 1
    rnumel = 768
    RBLOCK: tl.constexpr = 1024
    xoffset = tl.program_id(0) * XBLOCK
    xindex = tl.full([1], xoffset, tl.int32)
    xmask = tl.full([RBLOCK], True, tl.int1)
    rindex = tl.arange(0, RBLOCK)[:]
    roffset = 0
    rmask = rindex < rnumel
    r1 = rindex
    x0 = xindex
    tmp0 = tl.load(in_ptr0 + (r1 + (768*x0)), rmask, other=0.0).to(tl.float32)
    tmp1 = tl.load(in_ptr1 + (r1), rmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
    tmp37 = tl.load(in_ptr2 + (r1), rmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
    tmp40 = tl.load(in_ptr3 + (r1), rmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
    tmp2 = tmp0 + tmp1
    tmp3 = tmp2.to(tl.float32)
    tmp4 = 0.5
    tmp5 = tmp3 * tmp4
    tmp6 = 0.7071067811865476
    tmp7 = tmp3 * tmp6
    tmp8 = libdevice.erf(tmp7)
    tmp9 = 1.0
    tmp10 = tmp8 + tmp9
    tmp11 = tmp5 * tmp10
    tmp12 = tmp11.to(tl.float32)
    tmp13 = tmp12.to(tl.float32)
    tmp14 = tl.broadcast_to(tmp13, [RBLOCK])
    tmp16 = tl.where(rmask, tmp14, 0)
    tmp17 = tl.broadcast_to(tmp14, [RBLOCK])
    tmp19 = tl.where(rmask, tmp17, 0)
    tmp20 = triton_helpers.promote_to_tensor(tl.sum(tmp19, 0))
    tmp21 = tl.full([1], 768, tl.int32)
    tmp22 = tmp21.to(tl.float32)
    tmp23 = tmp20 / tmp22
    tmp24 = tmp14 - tmp23
    tmp25 = tmp24 * tmp24
    tmp26 = tl.broadcast_to(tmp25, [RBLOCK])
    tmp28 = tl.where(rmask, tmp26, 0)
    tmp29 = triton_helpers.promote_to_tensor(tl.sum(tmp28, 0))
    tmp30 = tmp13 - tmp23
    tmp31 = 768.0
    tmp32 = tmp29 / tmp31
    tmp33 = 1e-12
    tmp34 = tmp32 + tmp33
    tmp35 = libdevice.rsqrt(tmp34)
    tmp36 = tmp30 * tmp35
    tmp38 = tmp37.to(tl.float32)
    tmp39 = tmp36 * tmp38
    tmp41 = tmp40.to(tl.float32)
    tmp42 = tmp39 + tmp41
    tmp43 = tmp42.to(tl.float32)
    tl.store(out_ptr3 + (r1 + (768*x0)), tmp43, rmask)
''', device_str='cuda')
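
# Reduction structure of the LayerNorm kernels above, mapped to the math. These are
# persistent reductions: RBLOCK = 1024 covers the whole rnumel = 768 row in one pass,
# so each program normalizes one row without a reduction loop:
#   mean = sum(x) / 768                      (tmp20 / tmp23 here)
#   var  = sum((x - mean)^2) / 768           (tmp29 / tmp32; biased variance, as in aten.native_layer_norm)
#   y    = (x - mean) * rsqrt(var + 1e-12) * gamma + beta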




- # kernel path: /tmp/torchinductor_eellison/ry/cryghuddhmky5nywmwltcxvq5qstdsr5rotzufalizyup7kvjzmb.py
+ # kernel path: /tmp/torchinductor_eellison/fl/cfllfdzx4v3f3zccah7zu5u634j2vrlvbkru74wmaerhgadlizat.py
# Source Nodes: [], Original ATen: []

- triton_poi_fused_4 = async_compile.triton('triton_', '''
+ triton_poi_fused_5 = async_compile.triton('triton_', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties

@triton_heuristics.pointwise(
    size_hints=[33554432],
    filename=__file__,
    triton_meta={'signature': {0: '*bf16', 1: '*bf16', 2: 'i32'}, 'device': DeviceProperties(type='cuda', index=0, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=108), 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]},
-     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_4', 'mutated_arg_names': [], 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_5', 'mutated_arg_names': [], 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
    min_elem_per_thread=0
)
@triton.jit
def triton_(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 23445504
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = tl.full([XBLOCK], True, tl.int1)
    x0 = xindex % 30528
    x1 = (xindex // 30528)
    x2 = xindex
    tmp0 = x0
    tmp1 = tl.full([1], 0, tl.int64)
    tmp2 = tmp0 >= tmp1
    tmp3 = tl.full([1], 30522, tl.int64)
    tmp4 = tmp0 < tmp3
    tmp5 = tl.load(in_ptr0 + (x1 + (768*x0)), tmp4, eviction_policy='evict_last', other=0.0).to(tl.float32)
    tmp6 = tmp0 >= tmp3
    tmp7 = tl.full([1], 30528, tl.int64)
    tmp8 = tmp0 < tmp7
    tmp9 = 0.0
    tmp10 = tl.full(tmp9.shape, 0.0, tmp9.dtype)
    tmp11 = tl.where(tmp6, tmp9, tmp10)
    tmp12 = tl.where(tmp4, tmp5, tmp11)
    tl.store(out_ptr0 + (x2), tmp12, None)
''', device_str='cuda')
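
# This kernel has no Source Nodes: it was introduced by the compiler. Reading
# in_ptr0 + (x1 + 768*x0) while writing contiguously means it transposes a
# (30522, 768) weight, presumably the LM-head decoder weight, and zero-pads the
# vocab dimension from 30522 to 30528 (the next multiple of 64) so the logits
# matmul gets aligned strides: 23445504 = 768 * 30528, and the tmp4/tmp6
# predicates select "copy" vs "pad". A hedged eager-mode equivalent (the weight
# here is a stand-in, not from the source):
decoder_weight = torch.randn(30522, 768, dtype=torch.bfloat16)
padded_weight = torch.nn.functional.pad(decoder_weight.t(), (0, 30528 - 30522))  # -> (768, 30528)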




- # kernel path: /tmp/torchinductor_eellison/iq/ciq3xiils5u6jz63putfodpdyv6oshlb3gbss7hjki6u3ogkhwqq.py
+ # kernel path: /tmp/torchinductor_eellison/si/csikk4r46efsgklpubt6iamwjm3jsevq2h2pkkvjbgcc7sj4p24c.py
# Source Nodes: [], Original ATen: []

- triton_poi_fused_5 = async_compile.triton('triton_', '''
+ triton_poi_fused_6 = async_compile.triton('triton_', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties

@triton_heuristics.pointwise(
    size_hints=[32768],
    filename=__file__,
    triton_meta={'signature': {0: '*bf16', 1: '*bf16', 2: 'i32'}, 'device': DeviceProperties(type='cuda', index=0, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=108), 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2), equal_to_1=())]},
-     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_5', 'mutated_arg_names': [], 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
+     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_poi_fused_6', 'mutated_arg_names': [], 'no_x_dim': False, 'num_load': 1, 'num_reduction': 0, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False},
    min_elem_per_thread=0
)
@triton.jit
def triton_(in_ptr0, out_ptr0, xnumel, XBLOCK : tl.constexpr):
    xnumel = 30528
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:]
    xmask = xindex < xnumel
    x0 = xindex
    tmp0 = x0
    tmp1 = tl.full([1], 0, tl.int64)
    tmp2 = tmp0 >= tmp1
    tmp3 = tl.full([1], 30522, tl.int64)
    tmp4 = tmp0 < tmp3
    tmp5 = tl.load(in_ptr0 + (x0), tmp4 & xmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
    tmp6 = tmp0 >= tmp3
    tmp7 = tl.full([1], 30528, tl.int64)
    tmp8 = tmp0 < tmp7
    tmp9 = 0.0
    tmp10 = tl.full(tmp9.shape, 0.0, tmp9.dtype)
    tmp11 = tl.where(tmp6, tmp9, tmp10)
    tmp12 = tl.where(tmp4, tmp5, tmp11)
    tl.store(out_ptr0 + (x0), tmp12, xmask)
''', device_str='cuda')
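
# The 1-D counterpart of the previous kernel: the same zero-pad (30522 -> 30528)
# applied to what is presumably the LM-head bias, roughly
# torch.nn.functional.pad(decoder_bias, (0, 6)) in eager terms.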




- # kernel path: /tmp/torchinductor_eellison/vf/cvfw2hcsiy2zivm66wjz4xfhs2ysdjbvqk2rsig4hp62bpxn2vvd.py
+ # kernel path: /tmp/torchinductor_eellison/z7/cz72sjk2qwjjxgfdcyk6de22sryhbhr6pptrvx4aq2hq2soai2p5.py
# Source Nodes: [masked_lm_loss], Original ATen: [aten._log_softmax]
- # masked_lm_loss => amax_12, convert_element_type_373, exp_12, sub_39, sum_13
+ # masked_lm_loss => amax, convert_element_type_302, exp, sub_27, sum_1
- triton_red_fused__log_softmax_6 = async_compile.triton('triton_', '''
+ triton_red_fused__log_softmax_7 = async_compile.triton('triton_', '''
import triton
import triton.language as tl
from triton.compiler.compiler import AttrsDescriptor

from torch._inductor.runtime import triton_helpers, triton_heuristics
from torch._inductor.runtime.triton_helpers import libdevice, math as tl_math
from torch._inductor.runtime.hints import AutotuneHint, ReductionHint, TileHint, instance_descriptor, DeviceProperties

@triton_heuristics.reduction(
    size_hints=[8192, 32768],
    reduction_hint=ReductionHint.DEFAULT,
    filename=__file__,
    triton_meta={'signature': {0: '*bf16', 1: '*fp32', 2: '*fp32', 3: 'i32', 4: 'i32'}, 'device': DeviceProperties(type='cuda', index=0, cc=80, major=8, regs_per_multiprocessor=65536, max_threads_per_multi_processor=2048, multi_processor_count=108), 'constants': {}, 'configs': [AttrsDescriptor(divisible_by_16=(0, 1, 2, 3), equal_to_1=())]},
-     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused__log_softmax_6', 'mutated_arg_names': [], 'no_x_dim': False, 'num_load': 2, 'num_reduction': 2, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
+     inductor_meta={'autotune_hints': set(), 'kernel_name': 'triton_red_fused__log_softmax_7', 'mutated_arg_names': [], 'no_x_dim': False, 'num_load': 2, 'num_reduction': 2, 'backend_hash': 'A4DB0438D2F455BC2DA87EC6800D76C325D555A608FFCEAD009A7F894B0D3C83', 'are_deterministic_algorithms_enabled': False, 'assert_indirect_indexing': True, 'autotune_local_cache': True, 'autotune_pointwise': True, 'autotune_remote_cache': None, 'force_disable_caches': False, 'dynamic_scale_rblock': True, 'max_autotune': False, 'max_autotune_pointwise': False, 'min_split_scan_rblock': 256, 'spill_threshold': 16, 'store_cubin': False}
)
@triton.jit
def triton_(in_ptr0, out_ptr0, out_ptr1, xnumel, rnumel, XBLOCK : tl.constexpr, RBLOCK : tl.constexpr):
    xnumel = 8192
    rnumel = 30522
    xoffset = tl.program_id(0) * XBLOCK
    xindex = xoffset + tl.arange(0, XBLOCK)[:, None]
    xmask = tl.full([XBLOCK, RBLOCK], True, tl.int1)
    rbase = tl.arange(0, RBLOCK)[None, :]
    x0 = xindex
    _tmp3 = tl.full([XBLOCK, RBLOCK], float("-inf"), tl.float32)
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex
        tmp0 = tl.load(in_ptr0 + (r1 + (30528*x0)), rmask, eviction_policy='evict_last', other=0.0).to(tl.float32)
        tmp1 = tmp0.to(tl.float32)
        tmp2 = tl.broadcast_to(tmp1, [XBLOCK, RBLOCK])
        tmp4 = triton_helpers.maximum(_tmp3, tmp2)
        _tmp3 = tl.where(rmask, tmp4, _tmp3)
    tmp3 = triton_helpers.max2(_tmp3, 1)[:, None]
    tl.store(out_ptr0 + (x0), tmp3, None)
    _tmp10 = tl.full([XBLOCK, RBLOCK], 0, tl.float32)
    for roffset in range(0, rnumel, RBLOCK):
        rindex = roffset + rbase
        rmask = rindex < rnumel
        r1 = rindex
        tmp5 = tl.load(in_ptr0 + (r1 + (30528*x0)), rmask, eviction_policy='evict_first', other=0.0).to(tl.float32)
        tmp6 = tmp5.to(tl.float32)
        tmp7 = tmp6 - tmp3
        tmp8 = tl_math.exp(tmp7)
        tmp9 = tl.broadcast_to(tmp8, [XBLOCK, RBLOCK])
        tmp11 = _tmp10 + tmp9
        _tmp10 = tl.where(rmask, tmp11, _tmp10)
    tmp10 = tl.sum(_tmp10, 1)[:, None]
    tl.store(out_ptr1 + (x0), tmp10, None)
''', device_str='cuda')
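
# The reduction above is the standard two-pass, numerically stable log-softmax
# prologue over the padded vocab (rows of 30522 valid entries read with stride
# 30528): pass 1 computes m = max(x) into out_ptr0, pass 2 computes
# s = sum(exp(x - m)) into out_ptr1; log_softmax(x) = x - m - log(s) is then
# presumably consumed by the (truncated, not shown) masked_lm_loss kernel.
# An eager check of the identity (illustrative):
logits = torch.randn(4, 30522)
m = logits.amax(-1, keepdim=True)                 # out_ptr0 (amax)
s = (logits - m).exp().sum(-1, keepdim=True)      # out_ptr1 (sum of exps)
assert torch.allclose(logits.log_softmax(-1), logits - m - s.log(), atol=1e-6)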


# kernel path: /tmp/torchinductor_eellison/bt/cbtmzvri2vmmhtlgzxhr7zowbeqsafuqeui5gh3j2bsgiibilujp